qwen3-4b-full / trainer_state.json
davidoneil's picture
Upload inicial de modelo
cc43340 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.0998851320980872,
"eval_steps": 500,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 9.98851320980872e-05,
"grad_norm": 9.875,
"learning_rate": 0.0,
"loss": 1.4779,
"memory/device_reserved (GiB)": 86.98,
"memory/max_active (GiB)": 76.6,
"memory/max_allocated (GiB)": 76.6,
"step": 1,
"tokens_per_second_per_gpu": 8243.37,
"total_tokens": 40962
},
{
"epoch": 0.0001997702641961744,
"grad_norm": 8.25,
"learning_rate": 2.0000000000000002e-07,
"loss": 1.3415,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 2,
"tokens_per_second_per_gpu": 19760.57,
"total_tokens": 88833
},
{
"epoch": 0.0002996553962942616,
"grad_norm": 8.8125,
"learning_rate": 4.0000000000000003e-07,
"loss": 1.3753,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 3,
"tokens_per_second_per_gpu": 17636.24,
"total_tokens": 131779
},
{
"epoch": 0.0003995405283923488,
"grad_norm": 8.4375,
"learning_rate": 6.000000000000001e-07,
"loss": 1.4345,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 4,
"tokens_per_second_per_gpu": 20544.74,
"total_tokens": 180993
},
{
"epoch": 0.000499425660490436,
"grad_norm": 9.9375,
"learning_rate": 8.000000000000001e-07,
"loss": 1.486,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 5,
"tokens_per_second_per_gpu": 16871.06,
"total_tokens": 221420
},
{
"epoch": 0.0005993107925885232,
"grad_norm": 8.8125,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.3823,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 6,
"tokens_per_second_per_gpu": 19640.03,
"total_tokens": 268993
},
{
"epoch": 0.0006991959246866104,
"grad_norm": 8.0625,
"learning_rate": 1.2000000000000002e-06,
"loss": 1.3434,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 7,
"tokens_per_second_per_gpu": 17908.21,
"total_tokens": 313714
},
{
"epoch": 0.0007990810567846976,
"grad_norm": 7.875,
"learning_rate": 1.4000000000000001e-06,
"loss": 1.3487,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 8,
"tokens_per_second_per_gpu": 18417.25,
"total_tokens": 359437
},
{
"epoch": 0.0008989661888827848,
"grad_norm": 8.75,
"learning_rate": 1.6000000000000001e-06,
"loss": 1.4139,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 9,
"tokens_per_second_per_gpu": 19609.76,
"total_tokens": 406269
},
{
"epoch": 0.000998851320980872,
"grad_norm": 8.0,
"learning_rate": 1.8000000000000001e-06,
"loss": 1.4235,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 10,
"tokens_per_second_per_gpu": 20842.7,
"total_tokens": 456114
},
{
"epoch": 0.0010987364530789592,
"grad_norm": 8.25,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.3933,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 11,
"tokens_per_second_per_gpu": 18011.03,
"total_tokens": 499704
},
{
"epoch": 0.0011986215851770463,
"grad_norm": 8.1875,
"learning_rate": 2.2e-06,
"loss": 1.4046,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 12,
"tokens_per_second_per_gpu": 18778.61,
"total_tokens": 545509
},
{
"epoch": 0.0012985067172751337,
"grad_norm": 6.96875,
"learning_rate": 2.4000000000000003e-06,
"loss": 1.2975,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 13,
"tokens_per_second_per_gpu": 18930.63,
"total_tokens": 593148
},
{
"epoch": 0.0013983918493732208,
"grad_norm": 7.09375,
"learning_rate": 2.6e-06,
"loss": 1.3302,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 14,
"tokens_per_second_per_gpu": 18373.51,
"total_tokens": 639166
},
{
"epoch": 0.001498276981471308,
"grad_norm": 7.0625,
"learning_rate": 2.8000000000000003e-06,
"loss": 1.3545,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 15,
"tokens_per_second_per_gpu": 19694.91,
"total_tokens": 687960
},
{
"epoch": 0.0015981621135693952,
"grad_norm": 7.6875,
"learning_rate": 3e-06,
"loss": 1.4662,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 16,
"tokens_per_second_per_gpu": 19953.67,
"total_tokens": 734740
},
{
"epoch": 0.0016980472456674823,
"grad_norm": 7.0,
"learning_rate": 3.2000000000000003e-06,
"loss": 1.3218,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 17,
"tokens_per_second_per_gpu": 18332.53,
"total_tokens": 778737
},
{
"epoch": 0.0017979323777655696,
"grad_norm": 6.71875,
"learning_rate": 3.4000000000000005e-06,
"loss": 1.3578,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 18,
"tokens_per_second_per_gpu": 18145.79,
"total_tokens": 822714
},
{
"epoch": 0.0018978175098636567,
"grad_norm": 6.3125,
"learning_rate": 3.6000000000000003e-06,
"loss": 1.3095,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 19,
"tokens_per_second_per_gpu": 16064.69,
"total_tokens": 862422
},
{
"epoch": 0.001997702641961744,
"grad_norm": 8.0,
"learning_rate": 3.8000000000000005e-06,
"loss": 1.3111,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 20,
"tokens_per_second_per_gpu": 19133.11,
"total_tokens": 908386
},
{
"epoch": 0.002097587774059831,
"grad_norm": 5.8125,
"learning_rate": 4.000000000000001e-06,
"loss": 1.3143,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 21,
"tokens_per_second_per_gpu": 21112.41,
"total_tokens": 958799
},
{
"epoch": 0.0021974729061579185,
"grad_norm": 5.15625,
"learning_rate": 4.2000000000000004e-06,
"loss": 1.1851,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 22,
"tokens_per_second_per_gpu": 16875.94,
"total_tokens": 1001224
},
{
"epoch": 0.0022973580382560058,
"grad_norm": 5.0,
"learning_rate": 4.4e-06,
"loss": 1.1666,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 23,
"tokens_per_second_per_gpu": 16288.84,
"total_tokens": 1041090
},
{
"epoch": 0.0023972431703540927,
"grad_norm": 5.3125,
"learning_rate": 4.600000000000001e-06,
"loss": 1.2269,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 24,
"tokens_per_second_per_gpu": 15587.26,
"total_tokens": 1078798
},
{
"epoch": 0.00249712830245218,
"grad_norm": 4.21875,
"learning_rate": 4.800000000000001e-06,
"loss": 1.1578,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 25,
"tokens_per_second_per_gpu": 17930.32,
"total_tokens": 1123045
},
{
"epoch": 0.0025970134345502673,
"grad_norm": 4.28125,
"learning_rate": 5e-06,
"loss": 1.2388,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 26,
"tokens_per_second_per_gpu": 20577.02,
"total_tokens": 1172554
},
{
"epoch": 0.002696898566648354,
"grad_norm": 3.453125,
"learning_rate": 5.2e-06,
"loss": 1.1585,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 27,
"tokens_per_second_per_gpu": 19405.27,
"total_tokens": 1219906
},
{
"epoch": 0.0027967836987464415,
"grad_norm": 3.375,
"learning_rate": 5.400000000000001e-06,
"loss": 1.1144,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 28,
"tokens_per_second_per_gpu": 18917.28,
"total_tokens": 1266344
},
{
"epoch": 0.002896668830844529,
"grad_norm": 3.203125,
"learning_rate": 5.600000000000001e-06,
"loss": 1.0446,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 29,
"tokens_per_second_per_gpu": 16874.54,
"total_tokens": 1308444
},
{
"epoch": 0.002996553962942616,
"grad_norm": 3.34375,
"learning_rate": 5.8e-06,
"loss": 1.1263,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 30,
"tokens_per_second_per_gpu": 18492.77,
"total_tokens": 1353101
},
{
"epoch": 0.003096439095040703,
"grad_norm": 2.71875,
"learning_rate": 6e-06,
"loss": 1.0484,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 31,
"tokens_per_second_per_gpu": 17416.39,
"total_tokens": 1396719
},
{
"epoch": 0.0031963242271387904,
"grad_norm": 2.671875,
"learning_rate": 6.200000000000001e-06,
"loss": 1.0232,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 32,
"tokens_per_second_per_gpu": 17462.08,
"total_tokens": 1439123
},
{
"epoch": 0.0032962093592368777,
"grad_norm": 2.28125,
"learning_rate": 6.4000000000000006e-06,
"loss": 1.0868,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 33,
"tokens_per_second_per_gpu": 19708.23,
"total_tokens": 1486970
},
{
"epoch": 0.0033960944913349646,
"grad_norm": 2.140625,
"learning_rate": 6.600000000000001e-06,
"loss": 1.0017,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 34,
"tokens_per_second_per_gpu": 18253.1,
"total_tokens": 1531013
},
{
"epoch": 0.003495979623433052,
"grad_norm": 2.078125,
"learning_rate": 6.800000000000001e-06,
"loss": 0.9939,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 35,
"tokens_per_second_per_gpu": 18028.16,
"total_tokens": 1574525
},
{
"epoch": 0.0035958647555311392,
"grad_norm": 1.90625,
"learning_rate": 7e-06,
"loss": 1.0111,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 36,
"tokens_per_second_per_gpu": 16813.31,
"total_tokens": 1615560
},
{
"epoch": 0.0036957498876292265,
"grad_norm": 1.7109375,
"learning_rate": 7.2000000000000005e-06,
"loss": 1.0056,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 37,
"tokens_per_second_per_gpu": 18608.55,
"total_tokens": 1660541
},
{
"epoch": 0.0037956350197273134,
"grad_norm": 1.65625,
"learning_rate": 7.4e-06,
"loss": 0.9551,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 38,
"tokens_per_second_per_gpu": 16008.72,
"total_tokens": 1700965
},
{
"epoch": 0.0038955201518254007,
"grad_norm": 1.5625,
"learning_rate": 7.600000000000001e-06,
"loss": 0.9944,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 39,
"tokens_per_second_per_gpu": 18581.27,
"total_tokens": 1746774
},
{
"epoch": 0.003995405283923488,
"grad_norm": 1.484375,
"learning_rate": 7.800000000000002e-06,
"loss": 0.9869,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 40,
"tokens_per_second_per_gpu": 20093.69,
"total_tokens": 1794883
},
{
"epoch": 0.004095290416021575,
"grad_norm": 1.359375,
"learning_rate": 8.000000000000001e-06,
"loss": 0.9399,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 41,
"tokens_per_second_per_gpu": 18810.89,
"total_tokens": 1841261
},
{
"epoch": 0.004195175548119662,
"grad_norm": 1.40625,
"learning_rate": 8.2e-06,
"loss": 0.9138,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 42,
"tokens_per_second_per_gpu": 17770.49,
"total_tokens": 1884703
},
{
"epoch": 0.004295060680217749,
"grad_norm": 1.40625,
"learning_rate": 8.400000000000001e-06,
"loss": 0.8892,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 43,
"tokens_per_second_per_gpu": 16237.8,
"total_tokens": 1924914
},
{
"epoch": 0.004394945812315837,
"grad_norm": 1.2734375,
"learning_rate": 8.6e-06,
"loss": 0.8984,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 44,
"tokens_per_second_per_gpu": 19662.27,
"total_tokens": 1973886
},
{
"epoch": 0.004494830944413924,
"grad_norm": 1.3046875,
"learning_rate": 8.8e-06,
"loss": 0.9596,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 45,
"tokens_per_second_per_gpu": 18586.65,
"total_tokens": 2018483
},
{
"epoch": 0.0045947160765120116,
"grad_norm": 1.2421875,
"learning_rate": 9e-06,
"loss": 0.913,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 46,
"tokens_per_second_per_gpu": 18524.22,
"total_tokens": 2064097
},
{
"epoch": 0.0046946012086100984,
"grad_norm": 1.2265625,
"learning_rate": 9.200000000000002e-06,
"loss": 0.9494,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 47,
"tokens_per_second_per_gpu": 18999.6,
"total_tokens": 2109868
},
{
"epoch": 0.004794486340708185,
"grad_norm": 1.203125,
"learning_rate": 9.4e-06,
"loss": 0.8813,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 48,
"tokens_per_second_per_gpu": 18857.28,
"total_tokens": 2156067
},
{
"epoch": 0.004894371472806273,
"grad_norm": 1.171875,
"learning_rate": 9.600000000000001e-06,
"loss": 0.8018,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 49,
"tokens_per_second_per_gpu": 16664.35,
"total_tokens": 2196879
},
{
"epoch": 0.00499425660490436,
"grad_norm": 1.1796875,
"learning_rate": 9.800000000000001e-06,
"loss": 0.9073,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 50,
"tokens_per_second_per_gpu": 20783.23,
"total_tokens": 2246949
},
{
"epoch": 0.005094141737002447,
"grad_norm": 1.1640625,
"learning_rate": 1e-05,
"loss": 0.8948,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 51,
"tokens_per_second_per_gpu": 19415.04,
"total_tokens": 2294931
},
{
"epoch": 0.005194026869100535,
"grad_norm": 1.1171875,
"learning_rate": 1.02e-05,
"loss": 0.9246,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 52,
"tokens_per_second_per_gpu": 20802.16,
"total_tokens": 2343942
},
{
"epoch": 0.0052939120011986215,
"grad_norm": 1.5078125,
"learning_rate": 1.04e-05,
"loss": 0.864,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 53,
"tokens_per_second_per_gpu": 18225.11,
"total_tokens": 2388139
},
{
"epoch": 0.005393797133296708,
"grad_norm": 1.0703125,
"learning_rate": 1.0600000000000002e-05,
"loss": 0.8455,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 54,
"tokens_per_second_per_gpu": 19175.62,
"total_tokens": 2435894
},
{
"epoch": 0.005493682265394796,
"grad_norm": 1.1171875,
"learning_rate": 1.0800000000000002e-05,
"loss": 0.7576,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 55,
"tokens_per_second_per_gpu": 16339.01,
"total_tokens": 2476460
},
{
"epoch": 0.005593567397492883,
"grad_norm": 1.125,
"learning_rate": 1.1000000000000001e-05,
"loss": 0.7952,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 56,
"tokens_per_second_per_gpu": 18003.43,
"total_tokens": 2521341
},
{
"epoch": 0.005693452529590971,
"grad_norm": 1.1796875,
"learning_rate": 1.1200000000000001e-05,
"loss": 0.7279,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 57,
"tokens_per_second_per_gpu": 16071.69,
"total_tokens": 2561139
},
{
"epoch": 0.005793337661689058,
"grad_norm": 1.171875,
"learning_rate": 1.14e-05,
"loss": 0.841,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 58,
"tokens_per_second_per_gpu": 17495.95,
"total_tokens": 2603702
},
{
"epoch": 0.0058932227937871446,
"grad_norm": 1.0546875,
"learning_rate": 1.16e-05,
"loss": 0.8575,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 59,
"tokens_per_second_per_gpu": 19617.38,
"total_tokens": 2651665
},
{
"epoch": 0.005993107925885232,
"grad_norm": 1.046875,
"learning_rate": 1.18e-05,
"loss": 0.8329,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 60,
"tokens_per_second_per_gpu": 18947.4,
"total_tokens": 2698698
},
{
"epoch": 0.006092993057983319,
"grad_norm": 1.0390625,
"learning_rate": 1.2e-05,
"loss": 0.7797,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 61,
"tokens_per_second_per_gpu": 18057.29,
"total_tokens": 2743457
},
{
"epoch": 0.006192878190081406,
"grad_norm": 1.0390625,
"learning_rate": 1.22e-05,
"loss": 0.7424,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 62,
"tokens_per_second_per_gpu": 17142.61,
"total_tokens": 2786649
},
{
"epoch": 0.006292763322179494,
"grad_norm": 1.1171875,
"learning_rate": 1.2400000000000002e-05,
"loss": 0.8005,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 63,
"tokens_per_second_per_gpu": 17120.79,
"total_tokens": 2828570
},
{
"epoch": 0.006392648454277581,
"grad_norm": 2.046875,
"learning_rate": 1.2600000000000001e-05,
"loss": 0.7657,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 64,
"tokens_per_second_per_gpu": 18101.39,
"total_tokens": 2872421
},
{
"epoch": 0.006492533586375668,
"grad_norm": 1.0859375,
"learning_rate": 1.2800000000000001e-05,
"loss": 0.7696,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 65,
"tokens_per_second_per_gpu": 17904.68,
"total_tokens": 2916417
},
{
"epoch": 0.006592418718473755,
"grad_norm": 1.0390625,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.7335,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 66,
"tokens_per_second_per_gpu": 16890.55,
"total_tokens": 2958573
},
{
"epoch": 0.006692303850571842,
"grad_norm": 1.0625,
"learning_rate": 1.3200000000000002e-05,
"loss": 0.758,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 67,
"tokens_per_second_per_gpu": 19215.29,
"total_tokens": 3006135
},
{
"epoch": 0.006792188982669929,
"grad_norm": 1.09375,
"learning_rate": 1.3400000000000002e-05,
"loss": 0.7379,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 68,
"tokens_per_second_per_gpu": 16960.42,
"total_tokens": 3048323
},
{
"epoch": 0.006892074114768017,
"grad_norm": 1.015625,
"learning_rate": 1.3600000000000002e-05,
"loss": 0.7149,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 69,
"tokens_per_second_per_gpu": 18080.8,
"total_tokens": 3092850
},
{
"epoch": 0.006991959246866104,
"grad_norm": 1.0234375,
"learning_rate": 1.38e-05,
"loss": 0.7738,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 70,
"tokens_per_second_per_gpu": 20170.16,
"total_tokens": 3141049
},
{
"epoch": 0.0070918443789641916,
"grad_norm": 1.0859375,
"learning_rate": 1.4e-05,
"loss": 0.7906,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 71,
"tokens_per_second_per_gpu": 18126.92,
"total_tokens": 3184495
},
{
"epoch": 0.0071917295110622784,
"grad_norm": 1.0234375,
"learning_rate": 1.4200000000000001e-05,
"loss": 0.8002,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 72,
"tokens_per_second_per_gpu": 20161.62,
"total_tokens": 3232249
},
{
"epoch": 0.007291614643160365,
"grad_norm": 1.0390625,
"learning_rate": 1.4400000000000001e-05,
"loss": 0.817,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 73,
"tokens_per_second_per_gpu": 20645.27,
"total_tokens": 3281435
},
{
"epoch": 0.007391499775258453,
"grad_norm": 1.1171875,
"learning_rate": 1.46e-05,
"loss": 0.7147,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 74,
"tokens_per_second_per_gpu": 16148.65,
"total_tokens": 3321456
},
{
"epoch": 0.00749138490735654,
"grad_norm": 1.03125,
"learning_rate": 1.48e-05,
"loss": 0.7153,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 75,
"tokens_per_second_per_gpu": 17587.93,
"total_tokens": 3364463
},
{
"epoch": 0.007591270039454627,
"grad_norm": 1.046875,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.7806,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 76,
"tokens_per_second_per_gpu": 19119.21,
"total_tokens": 3411507
},
{
"epoch": 0.007691155171552715,
"grad_norm": 1.015625,
"learning_rate": 1.5200000000000002e-05,
"loss": 0.7713,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 77,
"tokens_per_second_per_gpu": 19492.79,
"total_tokens": 3458778
},
{
"epoch": 0.0077910403036508015,
"grad_norm": 1.03125,
"learning_rate": 1.54e-05,
"loss": 0.7522,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 78,
"tokens_per_second_per_gpu": 20537.8,
"total_tokens": 3507274
},
{
"epoch": 0.007890925435748888,
"grad_norm": 1.015625,
"learning_rate": 1.5600000000000003e-05,
"loss": 0.6706,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 79,
"tokens_per_second_per_gpu": 16985.35,
"total_tokens": 3549783
},
{
"epoch": 0.007990810567846975,
"grad_norm": 1.0390625,
"learning_rate": 1.58e-05,
"loss": 0.7132,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 80,
"tokens_per_second_per_gpu": 18124.3,
"total_tokens": 3594794
},
{
"epoch": 0.008090695699945064,
"grad_norm": 1.015625,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.7931,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 81,
"tokens_per_second_per_gpu": 21164.76,
"total_tokens": 3644056
},
{
"epoch": 0.00819058083204315,
"grad_norm": 1.046875,
"learning_rate": 1.62e-05,
"loss": 0.7121,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 82,
"tokens_per_second_per_gpu": 17748.32,
"total_tokens": 3687994
},
{
"epoch": 0.008290465964141238,
"grad_norm": 0.98046875,
"learning_rate": 1.64e-05,
"loss": 0.6742,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 83,
"tokens_per_second_per_gpu": 18630.57,
"total_tokens": 3734033
},
{
"epoch": 0.008390351096239325,
"grad_norm": 0.99609375,
"learning_rate": 1.66e-05,
"loss": 0.6896,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 84,
"tokens_per_second_per_gpu": 17730.4,
"total_tokens": 3777101
},
{
"epoch": 0.008490236228337411,
"grad_norm": 1.0703125,
"learning_rate": 1.6800000000000002e-05,
"loss": 0.6859,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 85,
"tokens_per_second_per_gpu": 18543.13,
"total_tokens": 3822199
},
{
"epoch": 0.008590121360435498,
"grad_norm": 1.015625,
"learning_rate": 1.7e-05,
"loss": 0.6824,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 86,
"tokens_per_second_per_gpu": 18246.7,
"total_tokens": 3866806
},
{
"epoch": 0.008690006492533587,
"grad_norm": 1.0234375,
"learning_rate": 1.72e-05,
"loss": 0.7252,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 87,
"tokens_per_second_per_gpu": 19550.06,
"total_tokens": 3914452
},
{
"epoch": 0.008789891624631674,
"grad_norm": 1.0390625,
"learning_rate": 1.7400000000000003e-05,
"loss": 0.642,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 88,
"tokens_per_second_per_gpu": 17625.61,
"total_tokens": 3957260
},
{
"epoch": 0.00888977675672976,
"grad_norm": 0.97265625,
"learning_rate": 1.76e-05,
"loss": 0.7135,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 89,
"tokens_per_second_per_gpu": 20639.71,
"total_tokens": 4007282
},
{
"epoch": 0.008989661888827848,
"grad_norm": 0.98828125,
"learning_rate": 1.7800000000000002e-05,
"loss": 0.7338,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 90,
"tokens_per_second_per_gpu": 20737.91,
"total_tokens": 4056792
},
{
"epoch": 0.009089547020925935,
"grad_norm": 0.9609375,
"learning_rate": 1.8e-05,
"loss": 0.6705,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 91,
"tokens_per_second_per_gpu": 18263.18,
"total_tokens": 4101429
},
{
"epoch": 0.009189432153024023,
"grad_norm": 1.125,
"learning_rate": 1.8200000000000002e-05,
"loss": 0.6818,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 92,
"tokens_per_second_per_gpu": 16507.6,
"total_tokens": 4142165
},
{
"epoch": 0.00928931728512211,
"grad_norm": 1.046875,
"learning_rate": 1.8400000000000003e-05,
"loss": 0.7067,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 93,
"tokens_per_second_per_gpu": 19972.82,
"total_tokens": 4189518
},
{
"epoch": 0.009389202417220197,
"grad_norm": 1.046875,
"learning_rate": 1.86e-05,
"loss": 0.7137,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 94,
"tokens_per_second_per_gpu": 18967.35,
"total_tokens": 4236108
},
{
"epoch": 0.009489087549318284,
"grad_norm": 1.0546875,
"learning_rate": 1.88e-05,
"loss": 0.6867,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 95,
"tokens_per_second_per_gpu": 18271.78,
"total_tokens": 4280134
},
{
"epoch": 0.00958897268141637,
"grad_norm": 1.03125,
"learning_rate": 1.9e-05,
"loss": 0.6651,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 96,
"tokens_per_second_per_gpu": 17804.55,
"total_tokens": 4323994
},
{
"epoch": 0.009688857813514458,
"grad_norm": 1.0390625,
"learning_rate": 1.9200000000000003e-05,
"loss": 0.6832,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 97,
"tokens_per_second_per_gpu": 17164.59,
"total_tokens": 4365898
},
{
"epoch": 0.009788742945612546,
"grad_norm": 1.0546875,
"learning_rate": 1.94e-05,
"loss": 0.7163,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 98,
"tokens_per_second_per_gpu": 19376.04,
"total_tokens": 4412792
},
{
"epoch": 0.009888628077710633,
"grad_norm": 1.0546875,
"learning_rate": 1.9600000000000002e-05,
"loss": 0.6589,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 99,
"tokens_per_second_per_gpu": 16998.89,
"total_tokens": 4454365
},
{
"epoch": 0.00998851320980872,
"grad_norm": 0.98828125,
"learning_rate": 1.98e-05,
"loss": 0.6456,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 100,
"tokens_per_second_per_gpu": 19673.35,
"total_tokens": 4501083
},
{
"epoch": 0.010088398341906807,
"grad_norm": 0.99609375,
"learning_rate": 2e-05,
"loss": 0.648,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 101,
"tokens_per_second_per_gpu": 18584.02,
"total_tokens": 4547371
},
{
"epoch": 0.010188283474004894,
"grad_norm": 1.03125,
"learning_rate": 1.9999939076577906e-05,
"loss": 0.6653,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 102,
"tokens_per_second_per_gpu": 18727.67,
"total_tokens": 4593306
},
{
"epoch": 0.010288168606102982,
"grad_norm": 0.98046875,
"learning_rate": 1.9999756307053947e-05,
"loss": 0.6515,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 103,
"tokens_per_second_per_gpu": 18131.83,
"total_tokens": 4639433
},
{
"epoch": 0.01038805373820107,
"grad_norm": 1.0625,
"learning_rate": 1.9999451693655125e-05,
"loss": 0.6943,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 104,
"tokens_per_second_per_gpu": 19484.13,
"total_tokens": 4687235
},
{
"epoch": 0.010487938870299156,
"grad_norm": 1.1015625,
"learning_rate": 1.9999025240093045e-05,
"loss": 0.6511,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 105,
"tokens_per_second_per_gpu": 17318.02,
"total_tokens": 4729861
},
{
"epoch": 0.010587824002397243,
"grad_norm": 1.0234375,
"learning_rate": 1.9998476951563914e-05,
"loss": 0.691,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 106,
"tokens_per_second_per_gpu": 18638.54,
"total_tokens": 4775601
},
{
"epoch": 0.01068770913449533,
"grad_norm": 0.9921875,
"learning_rate": 1.9997806834748455e-05,
"loss": 0.6842,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 107,
"tokens_per_second_per_gpu": 20067.07,
"total_tokens": 4824337
},
{
"epoch": 0.010787594266593417,
"grad_norm": 1.0,
"learning_rate": 1.9997014897811834e-05,
"loss": 0.6205,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 108,
"tokens_per_second_per_gpu": 18881.75,
"total_tokens": 4869394
},
{
"epoch": 0.010887479398691505,
"grad_norm": 1.015625,
"learning_rate": 1.9996101150403543e-05,
"loss": 0.6933,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 109,
"tokens_per_second_per_gpu": 20694.39,
"total_tokens": 4920754
},
{
"epoch": 0.010987364530789592,
"grad_norm": 1.0234375,
"learning_rate": 1.9995065603657317e-05,
"loss": 0.6624,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 110,
"tokens_per_second_per_gpu": 19216.67,
"total_tokens": 4967591
},
{
"epoch": 0.01108724966288768,
"grad_norm": 0.99609375,
"learning_rate": 1.999390827019096e-05,
"loss": 0.666,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 111,
"tokens_per_second_per_gpu": 19515.06,
"total_tokens": 5015301
},
{
"epoch": 0.011187134794985766,
"grad_norm": 0.99609375,
"learning_rate": 1.999262916410621e-05,
"loss": 0.604,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 112,
"tokens_per_second_per_gpu": 18859.71,
"total_tokens": 5061747
},
{
"epoch": 0.011287019927083853,
"grad_norm": 1.078125,
"learning_rate": 1.9991228300988586e-05,
"loss": 0.6128,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 113,
"tokens_per_second_per_gpu": 16901.47,
"total_tokens": 5102619
},
{
"epoch": 0.011386905059181942,
"grad_norm": 1.046875,
"learning_rate": 1.998970569790715e-05,
"loss": 0.6028,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 114,
"tokens_per_second_per_gpu": 18441.42,
"total_tokens": 5149611
},
{
"epoch": 0.011486790191280028,
"grad_norm": 1.0859375,
"learning_rate": 1.9988061373414342e-05,
"loss": 0.6268,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 115,
"tokens_per_second_per_gpu": 18813.74,
"total_tokens": 5195136
},
{
"epoch": 0.011586675323378115,
"grad_norm": 1.1328125,
"learning_rate": 1.9986295347545738e-05,
"loss": 0.5847,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 116,
"tokens_per_second_per_gpu": 17771.59,
"total_tokens": 5238688
},
{
"epoch": 0.011686560455476202,
"grad_norm": 1.0,
"learning_rate": 1.9984407641819812e-05,
"loss": 0.6286,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 117,
"tokens_per_second_per_gpu": 18788.16,
"total_tokens": 5284773
},
{
"epoch": 0.011786445587574289,
"grad_norm": 1.046875,
"learning_rate": 1.9982398279237657e-05,
"loss": 0.6314,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 118,
"tokens_per_second_per_gpu": 18806.58,
"total_tokens": 5330605
},
{
"epoch": 0.011886330719672376,
"grad_norm": 0.9921875,
"learning_rate": 1.9980267284282718e-05,
"loss": 0.6555,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 119,
"tokens_per_second_per_gpu": 20889.02,
"total_tokens": 5381886
},
{
"epoch": 0.011986215851770465,
"grad_norm": 1.0625,
"learning_rate": 1.9978014682920503e-05,
"loss": 0.6192,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 120,
"tokens_per_second_per_gpu": 18547.29,
"total_tokens": 5427929
},
{
"epoch": 0.012086100983868552,
"grad_norm": 0.99609375,
"learning_rate": 1.9975640502598243e-05,
"loss": 0.6199,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 121,
"tokens_per_second_per_gpu": 19617.73,
"total_tokens": 5477120
},
{
"epoch": 0.012185986115966638,
"grad_norm": 1.0,
"learning_rate": 1.997314477224458e-05,
"loss": 0.5961,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 122,
"tokens_per_second_per_gpu": 18775.7,
"total_tokens": 5523093
},
{
"epoch": 0.012285871248064725,
"grad_norm": 1.0703125,
"learning_rate": 1.9970527522269204e-05,
"loss": 0.6216,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 123,
"tokens_per_second_per_gpu": 16705.54,
"total_tokens": 5564130
},
{
"epoch": 0.012385756380162812,
"grad_norm": 1.0078125,
"learning_rate": 1.9967788784562474e-05,
"loss": 0.5953,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 124,
"tokens_per_second_per_gpu": 17016.96,
"total_tokens": 5606507
},
{
"epoch": 0.0124856415122609,
"grad_norm": 0.99609375,
"learning_rate": 1.9964928592495046e-05,
"loss": 0.6204,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 125,
"tokens_per_second_per_gpu": 18766.62,
"total_tokens": 5652803
},
{
"epoch": 0.012585526644358988,
"grad_norm": 0.99609375,
"learning_rate": 1.9961946980917457e-05,
"loss": 0.6115,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 126,
"tokens_per_second_per_gpu": 19247.22,
"total_tokens": 5699726
},
{
"epoch": 0.012685411776457075,
"grad_norm": 1.078125,
"learning_rate": 1.9958843986159705e-05,
"loss": 0.565,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 127,
"tokens_per_second_per_gpu": 15244.68,
"total_tokens": 5737177
},
{
"epoch": 0.012785296908555161,
"grad_norm": 1.046875,
"learning_rate": 1.99556196460308e-05,
"loss": 0.5976,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 128,
"tokens_per_second_per_gpu": 18070.5,
"total_tokens": 5780971
},
{
"epoch": 0.012885182040653248,
"grad_norm": 1.3046875,
"learning_rate": 1.9952273999818312e-05,
"loss": 0.6006,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 129,
"tokens_per_second_per_gpu": 16004.02,
"total_tokens": 5820791
},
{
"epoch": 0.012985067172751335,
"grad_norm": 1.0078125,
"learning_rate": 1.9948807088287884e-05,
"loss": 0.5546,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 130,
"tokens_per_second_per_gpu": 17616.14,
"total_tokens": 5864343
},
{
"epoch": 0.013084952304849424,
"grad_norm": 1.015625,
"learning_rate": 1.9945218953682736e-05,
"loss": 0.5823,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 131,
"tokens_per_second_per_gpu": 16860.07,
"total_tokens": 5906747
},
{
"epoch": 0.01318483743694751,
"grad_norm": 0.99609375,
"learning_rate": 1.9941509639723155e-05,
"loss": 0.5803,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 132,
"tokens_per_second_per_gpu": 17772.91,
"total_tokens": 5950850
},
{
"epoch": 0.013284722569045598,
"grad_norm": 1.0234375,
"learning_rate": 1.9937679191605964e-05,
"loss": 0.6474,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 133,
"tokens_per_second_per_gpu": 20154.63,
"total_tokens": 6000386
},
{
"epoch": 0.013384607701143685,
"grad_norm": 1.1015625,
"learning_rate": 1.9933727656003964e-05,
"loss": 0.5555,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 134,
"tokens_per_second_per_gpu": 14310.4,
"total_tokens": 6036301
},
{
"epoch": 0.013484492833241771,
"grad_norm": 0.96875,
"learning_rate": 1.992965508106537e-05,
"loss": 0.5518,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 135,
"tokens_per_second_per_gpu": 18142.79,
"total_tokens": 6080833
},
{
"epoch": 0.013584377965339858,
"grad_norm": 1.03125,
"learning_rate": 1.9925461516413224e-05,
"loss": 0.6084,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 136,
"tokens_per_second_per_gpu": 18133.64,
"total_tokens": 6126732
},
{
"epoch": 0.013684263097437947,
"grad_norm": 1.015625,
"learning_rate": 1.9921147013144782e-05,
"loss": 0.5763,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 137,
"tokens_per_second_per_gpu": 18002.0,
"total_tokens": 6170450
},
{
"epoch": 0.013784148229536034,
"grad_norm": 1.0078125,
"learning_rate": 1.9916711623830904e-05,
"loss": 0.5355,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 138,
"tokens_per_second_per_gpu": 16095.63,
"total_tokens": 6210299
},
{
"epoch": 0.01388403336163412,
"grad_norm": 1.0625,
"learning_rate": 1.991215540251542e-05,
"loss": 0.5808,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 139,
"tokens_per_second_per_gpu": 18650.9,
"total_tokens": 6254948
},
{
"epoch": 0.013983918493732208,
"grad_norm": 0.98046875,
"learning_rate": 1.9907478404714438e-05,
"loss": 0.5979,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 140,
"tokens_per_second_per_gpu": 20297.6,
"total_tokens": 6305011
},
{
"epoch": 0.014083803625830294,
"grad_norm": 1.015625,
"learning_rate": 1.9902680687415704e-05,
"loss": 0.582,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 141,
"tokens_per_second_per_gpu": 17056.17,
"total_tokens": 6347632
},
{
"epoch": 0.014183688757928383,
"grad_norm": 1.09375,
"learning_rate": 1.989776230907789e-05,
"loss": 0.6453,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 142,
"tokens_per_second_per_gpu": 18054.21,
"total_tokens": 6392193
},
{
"epoch": 0.01428357389002647,
"grad_norm": 1.0703125,
"learning_rate": 1.9892723329629885e-05,
"loss": 0.5983,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 143,
"tokens_per_second_per_gpu": 17722.52,
"total_tokens": 6436920
},
{
"epoch": 0.014383459022124557,
"grad_norm": 1.078125,
"learning_rate": 1.988756381047006e-05,
"loss": 0.5753,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 144,
"tokens_per_second_per_gpu": 16254.16,
"total_tokens": 6478117
},
{
"epoch": 0.014483344154222644,
"grad_norm": 0.9765625,
"learning_rate": 1.988228381446553e-05,
"loss": 0.5995,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 145,
"tokens_per_second_per_gpu": 19667.83,
"total_tokens": 6525699
},
{
"epoch": 0.01458322928632073,
"grad_norm": 1.0,
"learning_rate": 1.9876883405951378e-05,
"loss": 0.5977,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 146,
"tokens_per_second_per_gpu": 19651.42,
"total_tokens": 6573453
},
{
"epoch": 0.014683114418418818,
"grad_norm": 1.0546875,
"learning_rate": 1.987136265072988e-05,
"loss": 0.634,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 147,
"tokens_per_second_per_gpu": 21150.33,
"total_tokens": 6624754
},
{
"epoch": 0.014782999550516906,
"grad_norm": 1.0078125,
"learning_rate": 1.9865721616069695e-05,
"loss": 0.582,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 148,
"tokens_per_second_per_gpu": 17994.67,
"total_tokens": 6669304
},
{
"epoch": 0.014882884682614993,
"grad_norm": 1.0390625,
"learning_rate": 1.985996037070505e-05,
"loss": 0.574,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 149,
"tokens_per_second_per_gpu": 17365.56,
"total_tokens": 6711661
},
{
"epoch": 0.01498276981471308,
"grad_norm": 1.03125,
"learning_rate": 1.9854078984834904e-05,
"loss": 0.591,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 150,
"tokens_per_second_per_gpu": 19119.45,
"total_tokens": 6758301
},
{
"epoch": 0.015082654946811167,
"grad_norm": 0.98046875,
"learning_rate": 1.9848077530122083e-05,
"loss": 0.541,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 151,
"tokens_per_second_per_gpu": 17707.92,
"total_tokens": 6801621
},
{
"epoch": 0.015182540078909254,
"grad_norm": 1.0390625,
"learning_rate": 1.984195607969242e-05,
"loss": 0.6135,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 152,
"tokens_per_second_per_gpu": 17502.16,
"total_tokens": 6845361
},
{
"epoch": 0.015282425211007342,
"grad_norm": 1.0078125,
"learning_rate": 1.983571470813386e-05,
"loss": 0.6196,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 153,
"tokens_per_second_per_gpu": 19755.5,
"total_tokens": 6894342
},
{
"epoch": 0.01538231034310543,
"grad_norm": 1.078125,
"learning_rate": 1.9829353491495545e-05,
"loss": 0.544,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 154,
"tokens_per_second_per_gpu": 18517.83,
"total_tokens": 6939316
},
{
"epoch": 0.015482195475203516,
"grad_norm": 1.0546875,
"learning_rate": 1.982287250728689e-05,
"loss": 0.6107,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 155,
"tokens_per_second_per_gpu": 19691.98,
"total_tokens": 6987772
},
{
"epoch": 0.015582080607301603,
"grad_norm": 1.03125,
"learning_rate": 1.9816271834476642e-05,
"loss": 0.6226,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 156,
"tokens_per_second_per_gpu": 21383.14,
"total_tokens": 7039667
},
{
"epoch": 0.01568196573939969,
"grad_norm": 1.0703125,
"learning_rate": 1.9809551553491918e-05,
"loss": 0.5607,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 157,
"tokens_per_second_per_gpu": 17060.37,
"total_tokens": 7080343
},
{
"epoch": 0.015781850871497777,
"grad_norm": 1.0390625,
"learning_rate": 1.9802711746217222e-05,
"loss": 0.6185,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 158,
"tokens_per_second_per_gpu": 20126.33,
"total_tokens": 7130105
},
{
"epoch": 0.015881736003595864,
"grad_norm": 1.0859375,
"learning_rate": 1.979575249599344e-05,
"loss": 0.6005,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 159,
"tokens_per_second_per_gpu": 16630.21,
"total_tokens": 7171657
},
{
"epoch": 0.01598162113569395,
"grad_norm": 1.0390625,
"learning_rate": 1.9788673887616852e-05,
"loss": 0.5958,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 160,
"tokens_per_second_per_gpu": 21247.67,
"total_tokens": 7221634
},
{
"epoch": 0.016081506267792037,
"grad_norm": 0.98046875,
"learning_rate": 1.9781476007338058e-05,
"loss": 0.5536,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 161,
"tokens_per_second_per_gpu": 18179.95,
"total_tokens": 7266949
},
{
"epoch": 0.016181391399890128,
"grad_norm": 1.0234375,
"learning_rate": 1.9774158942860962e-05,
"loss": 0.5735,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 162,
"tokens_per_second_per_gpu": 19294.01,
"total_tokens": 7314475
},
{
"epoch": 0.016281276531988215,
"grad_norm": 1.109375,
"learning_rate": 1.9766722783341682e-05,
"loss": 0.5661,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 163,
"tokens_per_second_per_gpu": 17160.88,
"total_tokens": 7356122
},
{
"epoch": 0.0163811616640863,
"grad_norm": 0.94921875,
"learning_rate": 1.9759167619387474e-05,
"loss": 0.5724,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 164,
"tokens_per_second_per_gpu": 20031.67,
"total_tokens": 7404915
},
{
"epoch": 0.01648104679618439,
"grad_norm": 0.9921875,
"learning_rate": 1.9751493543055634e-05,
"loss": 0.5188,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 165,
"tokens_per_second_per_gpu": 18645.96,
"total_tokens": 7450909
},
{
"epoch": 0.016580931928282475,
"grad_norm": 0.96875,
"learning_rate": 1.9743700647852356e-05,
"loss": 0.5581,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 166,
"tokens_per_second_per_gpu": 20450.52,
"total_tokens": 7500716
},
{
"epoch": 0.016680817060380562,
"grad_norm": 1.0078125,
"learning_rate": 1.9735789028731603e-05,
"loss": 0.6003,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 167,
"tokens_per_second_per_gpu": 18029.38,
"total_tokens": 7545625
},
{
"epoch": 0.01678070219247865,
"grad_norm": 0.96484375,
"learning_rate": 1.972775878209397e-05,
"loss": 0.5534,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 168,
"tokens_per_second_per_gpu": 18306.53,
"total_tokens": 7591257
},
{
"epoch": 0.016880587324576736,
"grad_norm": 1.0625,
"learning_rate": 1.9719610005785466e-05,
"loss": 0.5268,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 169,
"tokens_per_second_per_gpu": 16971.41,
"total_tokens": 7633328
},
{
"epoch": 0.016980472456674823,
"grad_norm": 1.15625,
"learning_rate": 1.971134279909636e-05,
"loss": 0.5476,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 170,
"tokens_per_second_per_gpu": 15585.99,
"total_tokens": 7671631
},
{
"epoch": 0.01708035758877291,
"grad_norm": 1.0234375,
"learning_rate": 1.9702957262759964e-05,
"loss": 0.6494,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 171,
"tokens_per_second_per_gpu": 20532.2,
"total_tokens": 7721385
},
{
"epoch": 0.017180242720870997,
"grad_norm": 1.0078125,
"learning_rate": 1.9694453498951392e-05,
"loss": 0.5652,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 172,
"tokens_per_second_per_gpu": 18492.11,
"total_tokens": 7766388
},
{
"epoch": 0.017280127852969087,
"grad_norm": 1.0234375,
"learning_rate": 1.9685831611286312e-05,
"loss": 0.5508,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 173,
"tokens_per_second_per_gpu": 19519.67,
"total_tokens": 7814109
},
{
"epoch": 0.017380012985067174,
"grad_norm": 1.0234375,
"learning_rate": 1.9677091704819714e-05,
"loss": 0.5424,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 174,
"tokens_per_second_per_gpu": 18019.17,
"total_tokens": 7858610
},
{
"epoch": 0.01747989811716526,
"grad_norm": 1.015625,
"learning_rate": 1.9668233886044597e-05,
"loss": 0.6046,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 175,
"tokens_per_second_per_gpu": 20685.55,
"total_tokens": 7907100
},
{
"epoch": 0.017579783249263348,
"grad_norm": 0.98046875,
"learning_rate": 1.9659258262890683e-05,
"loss": 0.5475,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 176,
"tokens_per_second_per_gpu": 19322.29,
"total_tokens": 7953990
},
{
"epoch": 0.017679668381361435,
"grad_norm": 1.046875,
"learning_rate": 1.9650164944723116e-05,
"loss": 0.5525,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 177,
"tokens_per_second_per_gpu": 17699.63,
"total_tokens": 7998810
},
{
"epoch": 0.01777955351345952,
"grad_norm": 1.0703125,
"learning_rate": 1.96409540423411e-05,
"loss": 0.5338,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 178,
"tokens_per_second_per_gpu": 17720.92,
"total_tokens": 8041953
},
{
"epoch": 0.01787943864555761,
"grad_norm": 1.046875,
"learning_rate": 1.9631625667976584e-05,
"loss": 0.6071,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 179,
"tokens_per_second_per_gpu": 19699.73,
"total_tokens": 8089553
},
{
"epoch": 0.017979323777655695,
"grad_norm": 1.125,
"learning_rate": 1.9622179935292855e-05,
"loss": 0.5819,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 180,
"tokens_per_second_per_gpu": 17729.02,
"total_tokens": 8133623
},
{
"epoch": 0.018079208909753782,
"grad_norm": 1.0234375,
"learning_rate": 1.961261695938319e-05,
"loss": 0.573,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 181,
"tokens_per_second_per_gpu": 20525.09,
"total_tokens": 8182577
},
{
"epoch": 0.01817909404185187,
"grad_norm": 1.0078125,
"learning_rate": 1.9602936856769432e-05,
"loss": 0.5255,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 182,
"tokens_per_second_per_gpu": 17597.52,
"total_tokens": 8225616
},
{
"epoch": 0.018278979173949956,
"grad_norm": 1.0390625,
"learning_rate": 1.9593139745400575e-05,
"loss": 0.587,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 183,
"tokens_per_second_per_gpu": 19864.17,
"total_tokens": 8272813
},
{
"epoch": 0.018378864306048046,
"grad_norm": 1.0,
"learning_rate": 1.9583225744651334e-05,
"loss": 0.5602,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 184,
"tokens_per_second_per_gpu": 18250.92,
"total_tokens": 8317907
},
{
"epoch": 0.018478749438146133,
"grad_norm": 1.0703125,
"learning_rate": 1.9573194975320672e-05,
"loss": 0.4711,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 185,
"tokens_per_second_per_gpu": 14209.64,
"total_tokens": 8353779
},
{
"epoch": 0.01857863457024422,
"grad_norm": 1.046875,
"learning_rate": 1.9563047559630356e-05,
"loss": 0.6015,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 186,
"tokens_per_second_per_gpu": 19199.47,
"total_tokens": 8400755
},
{
"epoch": 0.018678519702342307,
"grad_norm": 1.046875,
"learning_rate": 1.9552783621223437e-05,
"loss": 0.5105,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 187,
"tokens_per_second_per_gpu": 19506.67,
"total_tokens": 8447073
},
{
"epoch": 0.018778404834440394,
"grad_norm": 0.98046875,
"learning_rate": 1.954240328516277e-05,
"loss": 0.5857,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 188,
"tokens_per_second_per_gpu": 21690.49,
"total_tokens": 8498995
},
{
"epoch": 0.01887828996653848,
"grad_norm": 1.09375,
"learning_rate": 1.9531906677929472e-05,
"loss": 0.5433,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 189,
"tokens_per_second_per_gpu": 18889.76,
"total_tokens": 8544811
},
{
"epoch": 0.018978175098636568,
"grad_norm": 1.015625,
"learning_rate": 1.9521293927421388e-05,
"loss": 0.5275,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 190,
"tokens_per_second_per_gpu": 17928.4,
"total_tokens": 8588703
},
{
"epoch": 0.019078060230734654,
"grad_norm": 1.046875,
"learning_rate": 1.9510565162951538e-05,
"loss": 0.5463,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 191,
"tokens_per_second_per_gpu": 17265.13,
"total_tokens": 8630893
},
{
"epoch": 0.01917794536283274,
"grad_norm": 1.0625,
"learning_rate": 1.9499720515246524e-05,
"loss": 0.5368,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 192,
"tokens_per_second_per_gpu": 16986.38,
"total_tokens": 8672973
},
{
"epoch": 0.019277830494930828,
"grad_norm": 1.078125,
"learning_rate": 1.9488760116444966e-05,
"loss": 0.5524,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 193,
"tokens_per_second_per_gpu": 17589.87,
"total_tokens": 8715648
},
{
"epoch": 0.019377715627028915,
"grad_norm": 1.046875,
"learning_rate": 1.947768410009586e-05,
"loss": 0.5248,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 194,
"tokens_per_second_per_gpu": 18909.14,
"total_tokens": 8760894
},
{
"epoch": 0.019477600759127005,
"grad_norm": 1.015625,
"learning_rate": 1.9466492601156964e-05,
"loss": 0.5234,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 195,
"tokens_per_second_per_gpu": 18256.57,
"total_tokens": 8806917
},
{
"epoch": 0.019577485891225092,
"grad_norm": 1.015625,
"learning_rate": 1.945518575599317e-05,
"loss": 0.4815,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 196,
"tokens_per_second_per_gpu": 16892.03,
"total_tokens": 8848209
},
{
"epoch": 0.01967737102332318,
"grad_norm": 1.046875,
"learning_rate": 1.944376370237481e-05,
"loss": 0.5487,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 197,
"tokens_per_second_per_gpu": 18435.99,
"total_tokens": 8893596
},
{
"epoch": 0.019777256155421266,
"grad_norm": 1.1640625,
"learning_rate": 1.943222657947601e-05,
"loss": 0.5513,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 198,
"tokens_per_second_per_gpu": 17293.09,
"total_tokens": 8935716
},
{
"epoch": 0.019877141287519353,
"grad_norm": 1.03125,
"learning_rate": 1.942057452787297e-05,
"loss": 0.5301,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 199,
"tokens_per_second_per_gpu": 17071.93,
"total_tokens": 8978524
},
{
"epoch": 0.01997702641961744,
"grad_norm": 1.203125,
"learning_rate": 1.9408807689542257e-05,
"loss": 0.6015,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 200,
"tokens_per_second_per_gpu": 17983.04,
"total_tokens": 9022769
},
{
"epoch": 0.020076911551715527,
"grad_norm": 0.99609375,
"learning_rate": 1.9396926207859085e-05,
"loss": 0.5086,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 201,
"tokens_per_second_per_gpu": 18544.64,
"total_tokens": 9068586
},
{
"epoch": 0.020176796683813614,
"grad_norm": 1.140625,
"learning_rate": 1.938493022759556e-05,
"loss": 0.5595,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 202,
"tokens_per_second_per_gpu": 19571.41,
"total_tokens": 9117162
},
{
"epoch": 0.0202766818159117,
"grad_norm": 1.0703125,
"learning_rate": 1.937281989491892e-05,
"loss": 0.5898,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 203,
"tokens_per_second_per_gpu": 19050.82,
"total_tokens": 9164422
},
{
"epoch": 0.020376566948009787,
"grad_norm": 1.0625,
"learning_rate": 1.9360595357389735e-05,
"loss": 0.5467,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 204,
"tokens_per_second_per_gpu": 18601.68,
"total_tokens": 9209783
},
{
"epoch": 0.020476452080107874,
"grad_norm": 1.0546875,
"learning_rate": 1.9348256763960146e-05,
"loss": 0.551,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 205,
"tokens_per_second_per_gpu": 18742.55,
"total_tokens": 9254835
},
{
"epoch": 0.020576337212205965,
"grad_norm": 1.0703125,
"learning_rate": 1.9335804264972018e-05,
"loss": 0.4828,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 206,
"tokens_per_second_per_gpu": 16024.4,
"total_tokens": 9294875
},
{
"epoch": 0.02067622234430405,
"grad_norm": 1.0234375,
"learning_rate": 1.9323238012155125e-05,
"loss": 0.5877,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 207,
"tokens_per_second_per_gpu": 19774.55,
"total_tokens": 9343127
},
{
"epoch": 0.02077610747640214,
"grad_norm": 1.0859375,
"learning_rate": 1.9310558158625286e-05,
"loss": 0.593,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 208,
"tokens_per_second_per_gpu": 18799.85,
"total_tokens": 9388693
},
{
"epoch": 0.020875992608500225,
"grad_norm": 1.0703125,
"learning_rate": 1.9297764858882516e-05,
"loss": 0.4974,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 209,
"tokens_per_second_per_gpu": 19057.68,
"total_tokens": 9435170
},
{
"epoch": 0.020975877740598312,
"grad_norm": 4.03125,
"learning_rate": 1.9284858268809135e-05,
"loss": 0.5196,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 210,
"tokens_per_second_per_gpu": 18224.29,
"total_tokens": 9479728
},
{
"epoch": 0.0210757628726964,
"grad_norm": 0.99609375,
"learning_rate": 1.9271838545667876e-05,
"loss": 0.5296,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 211,
"tokens_per_second_per_gpu": 21928.55,
"total_tokens": 9532152
},
{
"epoch": 0.021175648004794486,
"grad_norm": 1.03125,
"learning_rate": 1.925870584809995e-05,
"loss": 0.5262,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 212,
"tokens_per_second_per_gpu": 17696.69,
"total_tokens": 9575808
},
{
"epoch": 0.021275533136892573,
"grad_norm": 1.1484375,
"learning_rate": 1.9245460336123136e-05,
"loss": 0.5571,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 213,
"tokens_per_second_per_gpu": 16806.72,
"total_tokens": 9618295
},
{
"epoch": 0.02137541826899066,
"grad_norm": 1.046875,
"learning_rate": 1.923210217112981e-05,
"loss": 0.5041,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 214,
"tokens_per_second_per_gpu": 18831.48,
"total_tokens": 9662301
},
{
"epoch": 0.021475303401088747,
"grad_norm": 1.046875,
"learning_rate": 1.9218631515885007e-05,
"loss": 0.5231,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 215,
"tokens_per_second_per_gpu": 16461.58,
"total_tokens": 9703190
},
{
"epoch": 0.021575188533186834,
"grad_norm": 1.015625,
"learning_rate": 1.9205048534524405e-05,
"loss": 0.6079,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 216,
"tokens_per_second_per_gpu": 21287.11,
"total_tokens": 9754531
},
{
"epoch": 0.021675073665284924,
"grad_norm": 1.046875,
"learning_rate": 1.9191353392552346e-05,
"loss": 0.5376,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 217,
"tokens_per_second_per_gpu": 18028.11,
"total_tokens": 9797416
},
{
"epoch": 0.02177495879738301,
"grad_norm": 1.015625,
"learning_rate": 1.9177546256839814e-05,
"loss": 0.4622,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 218,
"tokens_per_second_per_gpu": 15611.94,
"total_tokens": 9836019
},
{
"epoch": 0.021874843929481098,
"grad_norm": 1.0390625,
"learning_rate": 1.9163627295622397e-05,
"loss": 0.5094,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 219,
"tokens_per_second_per_gpu": 17357.06,
"total_tokens": 9878932
},
{
"epoch": 0.021974729061579185,
"grad_norm": 1.078125,
"learning_rate": 1.914959667849825e-05,
"loss": 0.4949,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 220,
"tokens_per_second_per_gpu": 17750.15,
"total_tokens": 9922022
},
{
"epoch": 0.02207461419367727,
"grad_norm": 1.0703125,
"learning_rate": 1.913545457642601e-05,
"loss": 0.5137,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 221,
"tokens_per_second_per_gpu": 16924.18,
"total_tokens": 9964115
},
{
"epoch": 0.02217449932577536,
"grad_norm": 1.0625,
"learning_rate": 1.9121201161722732e-05,
"loss": 0.5185,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 222,
"tokens_per_second_per_gpu": 17421.71,
"total_tokens": 10006500
},
{
"epoch": 0.022274384457873445,
"grad_norm": 1.0546875,
"learning_rate": 1.910683660806177e-05,
"loss": 0.5358,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 223,
"tokens_per_second_per_gpu": 18662.16,
"total_tokens": 10051372
},
{
"epoch": 0.022374269589971532,
"grad_norm": 1.0,
"learning_rate": 1.9092361090470688e-05,
"loss": 0.5609,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 224,
"tokens_per_second_per_gpu": 20715.95,
"total_tokens": 10101597
},
{
"epoch": 0.02247415472206962,
"grad_norm": 1.0625,
"learning_rate": 1.907777478532909e-05,
"loss": 0.4757,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 225,
"tokens_per_second_per_gpu": 16981.74,
"total_tokens": 10142918
},
{
"epoch": 0.022574039854167706,
"grad_norm": 1.015625,
"learning_rate": 1.9063077870366504e-05,
"loss": 0.5302,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 226,
"tokens_per_second_per_gpu": 19896.85,
"total_tokens": 10191595
},
{
"epoch": 0.022673924986265793,
"grad_norm": 5.0625,
"learning_rate": 1.9048270524660197e-05,
"loss": 0.502,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 227,
"tokens_per_second_per_gpu": 19528.4,
"total_tokens": 10239443
},
{
"epoch": 0.022773810118363883,
"grad_norm": 1.0625,
"learning_rate": 1.903335292863301e-05,
"loss": 0.5454,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 228,
"tokens_per_second_per_gpu": 18761.49,
"total_tokens": 10286847
},
{
"epoch": 0.02287369525046197,
"grad_norm": 1.03125,
"learning_rate": 1.901832526405114e-05,
"loss": 0.5269,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 229,
"tokens_per_second_per_gpu": 22014.25,
"total_tokens": 10338804
},
{
"epoch": 0.022973580382560057,
"grad_norm": 1.0390625,
"learning_rate": 1.9003187714021936e-05,
"loss": 0.5163,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 230,
"tokens_per_second_per_gpu": 19617.38,
"total_tokens": 10385941
},
{
"epoch": 0.023073465514658144,
"grad_norm": 1.0,
"learning_rate": 1.8987940462991673e-05,
"loss": 0.4907,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 231,
"tokens_per_second_per_gpu": 16952.42,
"total_tokens": 10428295
},
{
"epoch": 0.02317335064675623,
"grad_norm": 0.9921875,
"learning_rate": 1.8972583696743284e-05,
"loss": 0.519,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 232,
"tokens_per_second_per_gpu": 18391.38,
"total_tokens": 10475782
},
{
"epoch": 0.023273235778854318,
"grad_norm": 1.0703125,
"learning_rate": 1.895711760239413e-05,
"loss": 0.5142,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 233,
"tokens_per_second_per_gpu": 18456.78,
"total_tokens": 10520937
},
{
"epoch": 0.023373120910952404,
"grad_norm": 1.1640625,
"learning_rate": 1.8941542368393683e-05,
"loss": 0.5679,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 234,
"tokens_per_second_per_gpu": 18947.6,
"total_tokens": 10567203
},
{
"epoch": 0.02347300604305049,
"grad_norm": 0.94140625,
"learning_rate": 1.892585818452126e-05,
"loss": 0.5676,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 235,
"tokens_per_second_per_gpu": 21453.85,
"total_tokens": 10620507
},
{
"epoch": 0.023572891175148578,
"grad_norm": 1.09375,
"learning_rate": 1.891006524188368e-05,
"loss": 0.5218,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 236,
"tokens_per_second_per_gpu": 17346.43,
"total_tokens": 10662969
},
{
"epoch": 0.023672776307246665,
"grad_norm": 1.0390625,
"learning_rate": 1.889416373291298e-05,
"loss": 0.5512,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 237,
"tokens_per_second_per_gpu": 17718.31,
"total_tokens": 10707126
},
{
"epoch": 0.023772661439344752,
"grad_norm": 1.0234375,
"learning_rate": 1.8878153851364013e-05,
"loss": 0.5454,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 238,
"tokens_per_second_per_gpu": 19729.28,
"total_tokens": 10756150
},
{
"epoch": 0.023872546571442842,
"grad_norm": 1.0,
"learning_rate": 1.8862035792312148e-05,
"loss": 0.5673,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 239,
"tokens_per_second_per_gpu": 19230.59,
"total_tokens": 10803451
},
{
"epoch": 0.02397243170354093,
"grad_norm": 1.0625,
"learning_rate": 1.884580975215084e-05,
"loss": 0.4678,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 240,
"tokens_per_second_per_gpu": 16075.38,
"total_tokens": 10844152
},
{
"epoch": 0.024072316835639016,
"grad_norm": 1.0703125,
"learning_rate": 1.8829475928589272e-05,
"loss": 0.5216,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 241,
"tokens_per_second_per_gpu": 16947.62,
"total_tokens": 10886504
},
{
"epoch": 0.024172201967737103,
"grad_norm": 1.0390625,
"learning_rate": 1.8813034520649923e-05,
"loss": 0.495,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 242,
"tokens_per_second_per_gpu": 16490.15,
"total_tokens": 10927577
},
{
"epoch": 0.02427208709983519,
"grad_norm": 1.046875,
"learning_rate": 1.879648572866617e-05,
"loss": 0.521,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 243,
"tokens_per_second_per_gpu": 19398.3,
"total_tokens": 10975538
},
{
"epoch": 0.024371972231933277,
"grad_norm": 1.046875,
"learning_rate": 1.8779829754279806e-05,
"loss": 0.5261,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 244,
"tokens_per_second_per_gpu": 17232.62,
"total_tokens": 11017683
},
{
"epoch": 0.024471857364031364,
"grad_norm": 1.046875,
"learning_rate": 1.8763066800438638e-05,
"loss": 0.5228,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 245,
"tokens_per_second_per_gpu": 17577.85,
"total_tokens": 11061522
},
{
"epoch": 0.02457174249612945,
"grad_norm": 0.9765625,
"learning_rate": 1.874619707139396e-05,
"loss": 0.4951,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 246,
"tokens_per_second_per_gpu": 17959.41,
"total_tokens": 11106561
},
{
"epoch": 0.024671627628227537,
"grad_norm": 0.98828125,
"learning_rate": 1.8729220772698096e-05,
"loss": 0.5425,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 247,
"tokens_per_second_per_gpu": 18537.52,
"total_tokens": 11153308
},
{
"epoch": 0.024771512760325624,
"grad_norm": 1.03125,
"learning_rate": 1.8712138111201898e-05,
"loss": 0.5187,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 248,
"tokens_per_second_per_gpu": 16956.83,
"total_tokens": 11194195
},
{
"epoch": 0.02487139789242371,
"grad_norm": 1.03125,
"learning_rate": 1.869494929505219e-05,
"loss": 0.5101,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 249,
"tokens_per_second_per_gpu": 18105.97,
"total_tokens": 11238072
},
{
"epoch": 0.0249712830245218,
"grad_norm": 1.015625,
"learning_rate": 1.8677654533689287e-05,
"loss": 0.4723,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 250,
"tokens_per_second_per_gpu": 18915.71,
"total_tokens": 11283289
},
{
"epoch": 0.02507116815661989,
"grad_norm": 1.0234375,
"learning_rate": 1.866025403784439e-05,
"loss": 0.5126,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 251,
"tokens_per_second_per_gpu": 18381.16,
"total_tokens": 11327416
},
{
"epoch": 0.025171053288717975,
"grad_norm": 1.0859375,
"learning_rate": 1.864274801953705e-05,
"loss": 0.5325,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 252,
"tokens_per_second_per_gpu": 15937.65,
"total_tokens": 11367142
},
{
"epoch": 0.025270938420816062,
"grad_norm": 1.34375,
"learning_rate": 1.8625136692072577e-05,
"loss": 0.518,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 253,
"tokens_per_second_per_gpu": 18123.03,
"total_tokens": 11411442
},
{
"epoch": 0.02537082355291415,
"grad_norm": 1.0234375,
"learning_rate": 1.860742027003944e-05,
"loss": 0.5035,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 254,
"tokens_per_second_per_gpu": 17837.27,
"total_tokens": 11455512
},
{
"epoch": 0.025470708685012236,
"grad_norm": 1.0078125,
"learning_rate": 1.8589598969306646e-05,
"loss": 0.4812,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 255,
"tokens_per_second_per_gpu": 18677.68,
"total_tokens": 11501242
},
{
"epoch": 0.025570593817110323,
"grad_norm": 0.95703125,
"learning_rate": 1.8571673007021124e-05,
"loss": 0.4369,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 256,
"tokens_per_second_per_gpu": 16378.74,
"total_tokens": 11541449
},
{
"epoch": 0.02567047894920841,
"grad_norm": 1.0,
"learning_rate": 1.855364260160507e-05,
"loss": 0.4969,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 257,
"tokens_per_second_per_gpu": 18536.15,
"total_tokens": 11586387
},
{
"epoch": 0.025770364081306497,
"grad_norm": 1.0546875,
"learning_rate": 1.8535507972753275e-05,
"loss": 0.5515,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 258,
"tokens_per_second_per_gpu": 19604.96,
"total_tokens": 11635059
},
{
"epoch": 0.025870249213404584,
"grad_norm": 1.4609375,
"learning_rate": 1.851726934143048e-05,
"loss": 0.4852,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 259,
"tokens_per_second_per_gpu": 17284.37,
"total_tokens": 11677499
},
{
"epoch": 0.02597013434550267,
"grad_norm": 1.0859375,
"learning_rate": 1.849892692986864e-05,
"loss": 0.5206,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 260,
"tokens_per_second_per_gpu": 17155.87,
"total_tokens": 11719610
},
{
"epoch": 0.02607001947760076,
"grad_norm": 1.0078125,
"learning_rate": 1.848048096156426e-05,
"loss": 0.4847,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 261,
"tokens_per_second_per_gpu": 19597.03,
"total_tokens": 11766478
},
{
"epoch": 0.026169904609698848,
"grad_norm": 1.03125,
"learning_rate": 1.8461931661275642e-05,
"loss": 0.5008,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 262,
"tokens_per_second_per_gpu": 19182.2,
"total_tokens": 11814476
},
{
"epoch": 0.026269789741796935,
"grad_norm": 0.9375,
"learning_rate": 1.8443279255020153e-05,
"loss": 0.5125,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 263,
"tokens_per_second_per_gpu": 21126.2,
"total_tokens": 11865959
},
{
"epoch": 0.02636967487389502,
"grad_norm": 0.98828125,
"learning_rate": 1.842452397007148e-05,
"loss": 0.5473,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 264,
"tokens_per_second_per_gpu": 20107.78,
"total_tokens": 11915303
},
{
"epoch": 0.02646956000599311,
"grad_norm": 1.0625,
"learning_rate": 1.8405666034956842e-05,
"loss": 0.5118,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 265,
"tokens_per_second_per_gpu": 16581.5,
"total_tokens": 11956888
},
{
"epoch": 0.026569445138091195,
"grad_norm": 0.97265625,
"learning_rate": 1.8386705679454243e-05,
"loss": 0.4893,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 266,
"tokens_per_second_per_gpu": 18772.48,
"total_tokens": 12001866
},
{
"epoch": 0.026669330270189282,
"grad_norm": 1.0625,
"learning_rate": 1.836764313458962e-05,
"loss": 0.5045,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 267,
"tokens_per_second_per_gpu": 16038.98,
"total_tokens": 12041689
},
{
"epoch": 0.02676921540228737,
"grad_norm": 1.046875,
"learning_rate": 1.8348478632634067e-05,
"loss": 0.4969,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 268,
"tokens_per_second_per_gpu": 17039.71,
"total_tokens": 12084140
},
{
"epoch": 0.026869100534385456,
"grad_norm": 1.015625,
"learning_rate": 1.8329212407100996e-05,
"loss": 0.5459,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 269,
"tokens_per_second_per_gpu": 20357.58,
"total_tokens": 12133252
},
{
"epoch": 0.026968985666483543,
"grad_norm": 0.9765625,
"learning_rate": 1.8309844692743283e-05,
"loss": 0.5052,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 270,
"tokens_per_second_per_gpu": 19910.54,
"total_tokens": 12181918
},
{
"epoch": 0.02706887079858163,
"grad_norm": 1.0390625,
"learning_rate": 1.8290375725550417e-05,
"loss": 0.5141,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 271,
"tokens_per_second_per_gpu": 18294.76,
"total_tokens": 12226107
},
{
"epoch": 0.027168755930679717,
"grad_norm": 0.984375,
"learning_rate": 1.827080574274562e-05,
"loss": 0.5605,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 272,
"tokens_per_second_per_gpu": 19639.78,
"total_tokens": 12274774
},
{
"epoch": 0.027268641062777807,
"grad_norm": 0.97265625,
"learning_rate": 1.8251134982782952e-05,
"loss": 0.5569,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 273,
"tokens_per_second_per_gpu": 20715.49,
"total_tokens": 12325027
},
{
"epoch": 0.027368526194875894,
"grad_norm": 1.0390625,
"learning_rate": 1.8231363685344422e-05,
"loss": 0.5389,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 274,
"tokens_per_second_per_gpu": 18047.21,
"total_tokens": 12369086
},
{
"epoch": 0.02746841132697398,
"grad_norm": 1.015625,
"learning_rate": 1.821149209133704e-05,
"loss": 0.4896,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 275,
"tokens_per_second_per_gpu": 18340.18,
"total_tokens": 12414271
},
{
"epoch": 0.027568296459072068,
"grad_norm": 1.046875,
"learning_rate": 1.819152044288992e-05,
"loss": 0.4762,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 276,
"tokens_per_second_per_gpu": 19089.79,
"total_tokens": 12459681
},
{
"epoch": 0.027668181591170155,
"grad_norm": 0.984375,
"learning_rate": 1.8171448983351284e-05,
"loss": 0.5327,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 277,
"tokens_per_second_per_gpu": 18793.23,
"total_tokens": 12506046
},
{
"epoch": 0.02776806672326824,
"grad_norm": 1.03125,
"learning_rate": 1.815127795728554e-05,
"loss": 0.4785,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 278,
"tokens_per_second_per_gpu": 18361.93,
"total_tokens": 12550215
},
{
"epoch": 0.02786795185536633,
"grad_norm": 0.9609375,
"learning_rate": 1.8131007610470278e-05,
"loss": 0.4658,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 279,
"tokens_per_second_per_gpu": 20140.12,
"total_tokens": 12599109
},
{
"epoch": 0.027967836987464415,
"grad_norm": 0.984375,
"learning_rate": 1.8110638189893267e-05,
"loss": 0.5672,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 280,
"tokens_per_second_per_gpu": 21231.66,
"total_tokens": 12650418
},
{
"epoch": 0.028067722119562502,
"grad_norm": 1.0078125,
"learning_rate": 1.8090169943749477e-05,
"loss": 0.5241,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 281,
"tokens_per_second_per_gpu": 18823.79,
"total_tokens": 12696996
},
{
"epoch": 0.02816760725166059,
"grad_norm": 0.9765625,
"learning_rate": 1.806960312143802e-05,
"loss": 0.5084,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 282,
"tokens_per_second_per_gpu": 19507.52,
"total_tokens": 12744486
},
{
"epoch": 0.028267492383758676,
"grad_norm": 1.015625,
"learning_rate": 1.804893797355914e-05,
"loss": 0.537,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 283,
"tokens_per_second_per_gpu": 20278.56,
"total_tokens": 12794381
},
{
"epoch": 0.028367377515856766,
"grad_norm": 1.0390625,
"learning_rate": 1.8028174751911147e-05,
"loss": 0.5501,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 284,
"tokens_per_second_per_gpu": 19920.09,
"total_tokens": 12842353
},
{
"epoch": 0.028467262647954853,
"grad_norm": 1.03125,
"learning_rate": 1.8007313709487334e-05,
"loss": 0.487,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 285,
"tokens_per_second_per_gpu": 18472.14,
"total_tokens": 12887368
},
{
"epoch": 0.02856714778005294,
"grad_norm": 1.03125,
"learning_rate": 1.798635510047293e-05,
"loss": 0.4733,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 286,
"tokens_per_second_per_gpu": 17903.3,
"total_tokens": 12930372
},
{
"epoch": 0.028667032912151027,
"grad_norm": 1.015625,
"learning_rate": 1.7965299180241963e-05,
"loss": 0.478,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 287,
"tokens_per_second_per_gpu": 17593.31,
"total_tokens": 12974566
},
{
"epoch": 0.028766918044249114,
"grad_norm": 1.0546875,
"learning_rate": 1.7944146205354182e-05,
"loss": 0.5189,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 288,
"tokens_per_second_per_gpu": 18942.14,
"total_tokens": 13021241
},
{
"epoch": 0.0288668031763472,
"grad_norm": 1.0390625,
"learning_rate": 1.792289643355191e-05,
"loss": 0.5124,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 289,
"tokens_per_second_per_gpu": 18756.68,
"total_tokens": 13067207
},
{
"epoch": 0.028966688308445288,
"grad_norm": 1.09375,
"learning_rate": 1.7901550123756906e-05,
"loss": 0.4909,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 290,
"tokens_per_second_per_gpu": 17000.27,
"total_tokens": 13109482
},
{
"epoch": 0.029066573440543374,
"grad_norm": 1.0234375,
"learning_rate": 1.788010753606722e-05,
"loss": 0.5352,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 291,
"tokens_per_second_per_gpu": 19516.61,
"total_tokens": 13157090
},
{
"epoch": 0.02916645857264146,
"grad_norm": 1.0390625,
"learning_rate": 1.785856893175402e-05,
"loss": 0.5216,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 292,
"tokens_per_second_per_gpu": 17696.17,
"total_tokens": 13201391
},
{
"epoch": 0.029266343704739548,
"grad_norm": 1.1953125,
"learning_rate": 1.78369345732584e-05,
"loss": 0.5069,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 293,
"tokens_per_second_per_gpu": 17687.66,
"total_tokens": 13244732
},
{
"epoch": 0.029366228836837635,
"grad_norm": 1.015625,
"learning_rate": 1.781520472418819e-05,
"loss": 0.5491,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 294,
"tokens_per_second_per_gpu": 19981.45,
"total_tokens": 13293892
},
{
"epoch": 0.029466113968935725,
"grad_norm": 0.984375,
"learning_rate": 1.7793379649314743e-05,
"loss": 0.5243,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 295,
"tokens_per_second_per_gpu": 21192.86,
"total_tokens": 13345141
},
{
"epoch": 0.029565999101033812,
"grad_norm": 0.95703125,
"learning_rate": 1.777145961456971e-05,
"loss": 0.5119,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 296,
"tokens_per_second_per_gpu": 19742.73,
"total_tokens": 13393443
},
{
"epoch": 0.0296658842331319,
"grad_norm": 0.9921875,
"learning_rate": 1.7749444887041797e-05,
"loss": 0.521,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 297,
"tokens_per_second_per_gpu": 18918.37,
"total_tokens": 13440225
},
{
"epoch": 0.029765769365229986,
"grad_norm": 1.0234375,
"learning_rate": 1.7727335734973512e-05,
"loss": 0.5153,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 298,
"tokens_per_second_per_gpu": 19484.4,
"total_tokens": 13486462
},
{
"epoch": 0.029865654497328073,
"grad_norm": 1.0859375,
"learning_rate": 1.7705132427757895e-05,
"loss": 0.4908,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 299,
"tokens_per_second_per_gpu": 18590.01,
"total_tokens": 13532632
},
{
"epoch": 0.02996553962942616,
"grad_norm": 1.09375,
"learning_rate": 1.7682835235935236e-05,
"loss": 0.5278,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 300,
"tokens_per_second_per_gpu": 19135.09,
"total_tokens": 13579070
},
{
"epoch": 0.030065424761524247,
"grad_norm": 1.0703125,
"learning_rate": 1.766044443118978e-05,
"loss": 0.4759,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 301,
"tokens_per_second_per_gpu": 18283.95,
"total_tokens": 13623918
},
{
"epoch": 0.030165309893622334,
"grad_norm": 1.0234375,
"learning_rate": 1.7637960286346423e-05,
"loss": 0.5244,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 302,
"tokens_per_second_per_gpu": 19133.09,
"total_tokens": 13670237
},
{
"epoch": 0.03026519502572042,
"grad_norm": 1.0078125,
"learning_rate": 1.761538307536737e-05,
"loss": 0.5141,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 303,
"tokens_per_second_per_gpu": 19248.59,
"total_tokens": 13717299
},
{
"epoch": 0.030365080157818507,
"grad_norm": 1.03125,
"learning_rate": 1.759271307334881e-05,
"loss": 0.4981,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 304,
"tokens_per_second_per_gpu": 18204.05,
"total_tokens": 13762208
},
{
"epoch": 0.030464965289916594,
"grad_norm": 1.5390625,
"learning_rate": 1.7569950556517566e-05,
"loss": 0.487,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 305,
"tokens_per_second_per_gpu": 18663.58,
"total_tokens": 13807816
},
{
"epoch": 0.030564850422014685,
"grad_norm": 1.0390625,
"learning_rate": 1.7547095802227723e-05,
"loss": 0.5029,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 306,
"tokens_per_second_per_gpu": 18010.27,
"total_tokens": 13852296
},
{
"epoch": 0.03066473555411277,
"grad_norm": 0.953125,
"learning_rate": 1.7524149088957244e-05,
"loss": 0.4872,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 307,
"tokens_per_second_per_gpu": 19421.97,
"total_tokens": 13900538
},
{
"epoch": 0.03076462068621086,
"grad_norm": 1.03125,
"learning_rate": 1.7501110696304598e-05,
"loss": 0.4879,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 308,
"tokens_per_second_per_gpu": 17969.56,
"total_tokens": 13944542
},
{
"epoch": 0.030864505818308945,
"grad_norm": 0.98828125,
"learning_rate": 1.747798090498532e-05,
"loss": 0.4495,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 309,
"tokens_per_second_per_gpu": 18007.97,
"total_tokens": 13988178
},
{
"epoch": 0.030964390950407032,
"grad_norm": 0.94140625,
"learning_rate": 1.7454759996828622e-05,
"loss": 0.4651,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 310,
"tokens_per_second_per_gpu": 19010.71,
"total_tokens": 14036025
},
{
"epoch": 0.03106427608250512,
"grad_norm": 1.0390625,
"learning_rate": 1.7431448254773943e-05,
"loss": 0.532,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 311,
"tokens_per_second_per_gpu": 18966.75,
"total_tokens": 14081781
},
{
"epoch": 0.031164161214603206,
"grad_norm": 0.99609375,
"learning_rate": 1.74080459628675e-05,
"loss": 0.5316,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 312,
"tokens_per_second_per_gpu": 19665.56,
"total_tokens": 14129766
},
{
"epoch": 0.031264046346701296,
"grad_norm": 0.9609375,
"learning_rate": 1.7384553406258842e-05,
"loss": 0.4403,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 313,
"tokens_per_second_per_gpu": 17855.82,
"total_tokens": 14174008
},
{
"epoch": 0.03136393147879938,
"grad_norm": 0.95703125,
"learning_rate": 1.7360970871197347e-05,
"loss": 0.4727,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 314,
"tokens_per_second_per_gpu": 17841.13,
"total_tokens": 14218809
},
{
"epoch": 0.03146381661089747,
"grad_norm": 1.046875,
"learning_rate": 1.7337298645028764e-05,
"loss": 0.4913,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 315,
"tokens_per_second_per_gpu": 18165.77,
"total_tokens": 14263022
},
{
"epoch": 0.031563701742995554,
"grad_norm": 1.015625,
"learning_rate": 1.7313537016191706e-05,
"loss": 0.5182,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 316,
"tokens_per_second_per_gpu": 18308.57,
"total_tokens": 14308512
},
{
"epoch": 0.031663586875093644,
"grad_norm": 2.15625,
"learning_rate": 1.7289686274214116e-05,
"loss": 0.4994,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 317,
"tokens_per_second_per_gpu": 18705.09,
"total_tokens": 14354147
},
{
"epoch": 0.03176347200719173,
"grad_norm": 0.9921875,
"learning_rate": 1.7265746709709762e-05,
"loss": 0.5177,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 318,
"tokens_per_second_per_gpu": 19544.81,
"total_tokens": 14402261
},
{
"epoch": 0.03186335713928982,
"grad_norm": 0.99609375,
"learning_rate": 1.7241718614374678e-05,
"loss": 0.5126,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 319,
"tokens_per_second_per_gpu": 19536.76,
"total_tokens": 14449838
},
{
"epoch": 0.0319632422713879,
"grad_norm": 1.015625,
"learning_rate": 1.7217602280983622e-05,
"loss": 0.4438,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 320,
"tokens_per_second_per_gpu": 15149.65,
"total_tokens": 14488127
},
{
"epoch": 0.03206312740348599,
"grad_norm": 1.0625,
"learning_rate": 1.7193398003386514e-05,
"loss": 0.538,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 321,
"tokens_per_second_per_gpu": 19976.01,
"total_tokens": 14536446
},
{
"epoch": 0.032163012535584075,
"grad_norm": 1.0234375,
"learning_rate": 1.716910607650483e-05,
"loss": 0.4728,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 322,
"tokens_per_second_per_gpu": 17459.16,
"total_tokens": 14579841
},
{
"epoch": 0.032262897667682165,
"grad_norm": 1.0546875,
"learning_rate": 1.7144726796328034e-05,
"loss": 0.5182,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 323,
"tokens_per_second_per_gpu": 17770.82,
"total_tokens": 14624238
},
{
"epoch": 0.032362782799780256,
"grad_norm": 1.0,
"learning_rate": 1.712026045990997e-05,
"loss": 0.4697,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 324,
"tokens_per_second_per_gpu": 17157.62,
"total_tokens": 14665490
},
{
"epoch": 0.03246266793187834,
"grad_norm": 1.03125,
"learning_rate": 1.709570736536521e-05,
"loss": 0.5361,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 325,
"tokens_per_second_per_gpu": 16988.15,
"total_tokens": 14708874
},
{
"epoch": 0.03256255306397643,
"grad_norm": 1.0703125,
"learning_rate": 1.7071067811865477e-05,
"loss": 0.4871,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 326,
"tokens_per_second_per_gpu": 16881.81,
"total_tokens": 14750657
},
{
"epoch": 0.03266243819607451,
"grad_norm": 0.98828125,
"learning_rate": 1.7046342099635948e-05,
"loss": 0.5409,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 327,
"tokens_per_second_per_gpu": 21657.15,
"total_tokens": 14800847
},
{
"epoch": 0.0327623233281726,
"grad_norm": 1.0078125,
"learning_rate": 1.7021530529951627e-05,
"loss": 0.4758,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 328,
"tokens_per_second_per_gpu": 17620.93,
"total_tokens": 14843795
},
{
"epoch": 0.03286220846027069,
"grad_norm": 0.97265625,
"learning_rate": 1.6996633405133656e-05,
"loss": 0.4971,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 329,
"tokens_per_second_per_gpu": 19737.26,
"total_tokens": 14891678
},
{
"epoch": 0.03296209359236878,
"grad_norm": 1.0625,
"learning_rate": 1.697165102854565e-05,
"loss": 0.474,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 330,
"tokens_per_second_per_gpu": 17700.75,
"total_tokens": 14935021
},
{
"epoch": 0.03306197872446686,
"grad_norm": 1.078125,
"learning_rate": 1.6946583704589973e-05,
"loss": 0.5282,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 331,
"tokens_per_second_per_gpu": 19790.9,
"total_tokens": 14983388
},
{
"epoch": 0.03316186385656495,
"grad_norm": 1.015625,
"learning_rate": 1.692143173870407e-05,
"loss": 0.5258,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 332,
"tokens_per_second_per_gpu": 19972.01,
"total_tokens": 15031594
},
{
"epoch": 0.033261748988663034,
"grad_norm": 1.0078125,
"learning_rate": 1.68961954373567e-05,
"loss": 0.4575,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 333,
"tokens_per_second_per_gpu": 18681.11,
"total_tokens": 15076740
},
{
"epoch": 0.033361634120761124,
"grad_norm": 1.015625,
"learning_rate": 1.6870875108044233e-05,
"loss": 0.4903,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 334,
"tokens_per_second_per_gpu": 17157.67,
"total_tokens": 15120321
},
{
"epoch": 0.033461519252859215,
"grad_norm": 1.03125,
"learning_rate": 1.684547105928689e-05,
"loss": 0.4515,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 335,
"tokens_per_second_per_gpu": 15719.21,
"total_tokens": 15158224
},
{
"epoch": 0.0335614043849573,
"grad_norm": 1.015625,
"learning_rate": 1.6819983600624986e-05,
"loss": 0.4877,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 336,
"tokens_per_second_per_gpu": 18868.7,
"total_tokens": 15204132
},
{
"epoch": 0.03366128951705539,
"grad_norm": 1.234375,
"learning_rate": 1.6794413042615168e-05,
"loss": 0.4875,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 337,
"tokens_per_second_per_gpu": 16502.82,
"total_tokens": 15246281
},
{
"epoch": 0.03376117464915347,
"grad_norm": 2.015625,
"learning_rate": 1.6768759696826608e-05,
"loss": 0.5036,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 338,
"tokens_per_second_per_gpu": 18350.8,
"total_tokens": 15290200
},
{
"epoch": 0.03386105978125156,
"grad_norm": 0.96484375,
"learning_rate": 1.6743023875837233e-05,
"loss": 0.4749,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 339,
"tokens_per_second_per_gpu": 18822.04,
"total_tokens": 15337650
},
{
"epoch": 0.033960944913349646,
"grad_norm": 0.99609375,
"learning_rate": 1.6717205893229904e-05,
"loss": 0.4843,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 340,
"tokens_per_second_per_gpu": 18742.82,
"total_tokens": 15383316
},
{
"epoch": 0.034060830045447736,
"grad_norm": 1.0390625,
"learning_rate": 1.6691306063588583e-05,
"loss": 0.4261,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 341,
"tokens_per_second_per_gpu": 16470.5,
"total_tokens": 15424698
},
{
"epoch": 0.03416071517754582,
"grad_norm": 0.9921875,
"learning_rate": 1.6665324702494524e-05,
"loss": 0.4358,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 342,
"tokens_per_second_per_gpu": 17679.61,
"total_tokens": 15468208
},
{
"epoch": 0.03426060030964391,
"grad_norm": 0.98828125,
"learning_rate": 1.6639262126522417e-05,
"loss": 0.4723,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 343,
"tokens_per_second_per_gpu": 17795.33,
"total_tokens": 15512033
},
{
"epoch": 0.03436048544174199,
"grad_norm": 0.96875,
"learning_rate": 1.661311865323652e-05,
"loss": 0.5252,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 344,
"tokens_per_second_per_gpu": 21287.65,
"total_tokens": 15564114
},
{
"epoch": 0.034460370573840084,
"grad_norm": 1.0390625,
"learning_rate": 1.6586894601186804e-05,
"loss": 0.4677,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 345,
"tokens_per_second_per_gpu": 18457.17,
"total_tokens": 15609874
},
{
"epoch": 0.034560255705938174,
"grad_norm": 1.0,
"learning_rate": 1.6560590289905074e-05,
"loss": 0.5226,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 346,
"tokens_per_second_per_gpu": 19767.3,
"total_tokens": 15657417
},
{
"epoch": 0.03466014083803626,
"grad_norm": 0.98828125,
"learning_rate": 1.6534206039901057e-05,
"loss": 0.4562,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 347,
"tokens_per_second_per_gpu": 19409.52,
"total_tokens": 15704110
},
{
"epoch": 0.03476002597013435,
"grad_norm": 1.0703125,
"learning_rate": 1.650774217265851e-05,
"loss": 0.4736,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 348,
"tokens_per_second_per_gpu": 16562.36,
"total_tokens": 15745533
},
{
"epoch": 0.03485991110223243,
"grad_norm": 0.96484375,
"learning_rate": 1.6481199010631312e-05,
"loss": 0.4421,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 349,
"tokens_per_second_per_gpu": 18111.37,
"total_tokens": 15790497
},
{
"epoch": 0.03495979623433052,
"grad_norm": 1.0546875,
"learning_rate": 1.645457687723951e-05,
"loss": 0.5272,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 350,
"tokens_per_second_per_gpu": 18123.01,
"total_tokens": 15835918
},
{
"epoch": 0.035059681366428605,
"grad_norm": 0.98046875,
"learning_rate": 1.6427876096865394e-05,
"loss": 0.4796,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 351,
"tokens_per_second_per_gpu": 19074.82,
"total_tokens": 15882568
},
{
"epoch": 0.035159566498526695,
"grad_norm": 1.0390625,
"learning_rate": 1.6401096994849558e-05,
"loss": 0.4702,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 352,
"tokens_per_second_per_gpu": 18347.85,
"total_tokens": 15927874
},
{
"epoch": 0.03525945163062478,
"grad_norm": 0.9765625,
"learning_rate": 1.63742398974869e-05,
"loss": 0.4702,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 353,
"tokens_per_second_per_gpu": 18320.93,
"total_tokens": 15974025
},
{
"epoch": 0.03535933676272287,
"grad_norm": 0.98828125,
"learning_rate": 1.6347305132022677e-05,
"loss": 0.5038,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 354,
"tokens_per_second_per_gpu": 20266.97,
"total_tokens": 16023301
},
{
"epoch": 0.03545922189482095,
"grad_norm": 1.09375,
"learning_rate": 1.632029302664851e-05,
"loss": 0.4681,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 355,
"tokens_per_second_per_gpu": 14593.18,
"total_tokens": 16059415
},
{
"epoch": 0.03555910702691904,
"grad_norm": 1.046875,
"learning_rate": 1.6293203910498375e-05,
"loss": 0.5048,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 356,
"tokens_per_second_per_gpu": 17647.42,
"total_tokens": 16102167
},
{
"epoch": 0.03565899215901713,
"grad_norm": 1.015625,
"learning_rate": 1.6266038113644605e-05,
"loss": 0.4722,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 357,
"tokens_per_second_per_gpu": 19000.51,
"total_tokens": 16148416
},
{
"epoch": 0.03575887729111522,
"grad_norm": 1.0078125,
"learning_rate": 1.6238795967093865e-05,
"loss": 0.4886,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 358,
"tokens_per_second_per_gpu": 19107.42,
"total_tokens": 16194773
},
{
"epoch": 0.03585876242321331,
"grad_norm": 0.96875,
"learning_rate": 1.6211477802783105e-05,
"loss": 0.4529,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 359,
"tokens_per_second_per_gpu": 18559.33,
"total_tokens": 16240465
},
{
"epoch": 0.03595864755531139,
"grad_norm": 1.0,
"learning_rate": 1.6184083953575543e-05,
"loss": 0.4638,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 360,
"tokens_per_second_per_gpu": 18688.95,
"total_tokens": 16286759
},
{
"epoch": 0.03605853268740948,
"grad_norm": 1.0078125,
"learning_rate": 1.6156614753256583e-05,
"loss": 0.4533,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 361,
"tokens_per_second_per_gpu": 18258.2,
"total_tokens": 16331195
},
{
"epoch": 0.036158417819507564,
"grad_norm": 1.0078125,
"learning_rate": 1.6129070536529767e-05,
"loss": 0.5191,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 362,
"tokens_per_second_per_gpu": 18698.11,
"total_tokens": 16378176
},
{
"epoch": 0.036258302951605655,
"grad_norm": 1.0078125,
"learning_rate": 1.610145163901268e-05,
"loss": 0.4656,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 363,
"tokens_per_second_per_gpu": 18091.65,
"total_tokens": 16423031
},
{
"epoch": 0.03635818808370374,
"grad_norm": 1.0390625,
"learning_rate": 1.607375839723287e-05,
"loss": 0.5201,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 364,
"tokens_per_second_per_gpu": 20080.33,
"total_tokens": 16472899
},
{
"epoch": 0.03645807321580183,
"grad_norm": 1.0390625,
"learning_rate": 1.6045991148623752e-05,
"loss": 0.4326,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 365,
"tokens_per_second_per_gpu": 15284.04,
"total_tokens": 16510300
},
{
"epoch": 0.03655795834789991,
"grad_norm": 0.98046875,
"learning_rate": 1.6018150231520486e-05,
"loss": 0.5057,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 366,
"tokens_per_second_per_gpu": 18603.95,
"total_tokens": 16556259
},
{
"epoch": 0.036657843479998,
"grad_norm": 0.98046875,
"learning_rate": 1.599023598515586e-05,
"loss": 0.4852,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 367,
"tokens_per_second_per_gpu": 18972.85,
"total_tokens": 16602771
},
{
"epoch": 0.03675772861209609,
"grad_norm": 0.96484375,
"learning_rate": 1.5962248749656158e-05,
"loss": 0.4484,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 368,
"tokens_per_second_per_gpu": 18752.14,
"total_tokens": 16649863
},
{
"epoch": 0.036857613744194176,
"grad_norm": 1.0,
"learning_rate": 1.5934188866037017e-05,
"loss": 0.4871,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 369,
"tokens_per_second_per_gpu": 18747.7,
"total_tokens": 16695468
},
{
"epoch": 0.036957498876292266,
"grad_norm": 1.0078125,
"learning_rate": 1.5906056676199256e-05,
"loss": 0.4475,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 370,
"tokens_per_second_per_gpu": 16922.36,
"total_tokens": 16738176
},
{
"epoch": 0.03705738400839035,
"grad_norm": 0.9921875,
"learning_rate": 1.5877852522924733e-05,
"loss": 0.4742,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 371,
"tokens_per_second_per_gpu": 18042.51,
"total_tokens": 16782261
},
{
"epoch": 0.03715726914048844,
"grad_norm": 0.96484375,
"learning_rate": 1.584957674987216e-05,
"loss": 0.4745,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 372,
"tokens_per_second_per_gpu": 19842.54,
"total_tokens": 16830975
},
{
"epoch": 0.03725715427258652,
"grad_norm": 1.0390625,
"learning_rate": 1.5821229701572897e-05,
"loss": 0.4499,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 373,
"tokens_per_second_per_gpu": 16786.67,
"total_tokens": 16872527
},
{
"epoch": 0.037357039404684614,
"grad_norm": 0.984375,
"learning_rate": 1.5792811723426787e-05,
"loss": 0.4368,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 374,
"tokens_per_second_per_gpu": 16910.29,
"total_tokens": 16915897
},
{
"epoch": 0.0374569245367827,
"grad_norm": 1.0078125,
"learning_rate": 1.5764323161697933e-05,
"loss": 0.4274,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 375,
"tokens_per_second_per_gpu": 16770.9,
"total_tokens": 16956784
},
{
"epoch": 0.03755680966888079,
"grad_norm": 1.09375,
"learning_rate": 1.573576436351046e-05,
"loss": 0.4516,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 376,
"tokens_per_second_per_gpu": 15393.62,
"total_tokens": 16995210
},
{
"epoch": 0.03765669480097887,
"grad_norm": 1.0625,
"learning_rate": 1.570713567684432e-05,
"loss": 0.4663,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 377,
"tokens_per_second_per_gpu": 16725.72,
"total_tokens": 17036548
},
{
"epoch": 0.03775657993307696,
"grad_norm": 0.98046875,
"learning_rate": 1.5678437450531014e-05,
"loss": 0.529,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 378,
"tokens_per_second_per_gpu": 22398.24,
"total_tokens": 17089299
},
{
"epoch": 0.03785646506517505,
"grad_norm": 0.93359375,
"learning_rate": 1.564967003424938e-05,
"loss": 0.4789,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 379,
"tokens_per_second_per_gpu": 19240.37,
"total_tokens": 17135363
},
{
"epoch": 0.037956350197273135,
"grad_norm": 0.98828125,
"learning_rate": 1.5620833778521306e-05,
"loss": 0.4886,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 380,
"tokens_per_second_per_gpu": 19903.65,
"total_tokens": 17183281
},
{
"epoch": 0.038056235329371225,
"grad_norm": 1.0,
"learning_rate": 1.5591929034707468e-05,
"loss": 0.4449,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 381,
"tokens_per_second_per_gpu": 17219.67,
"total_tokens": 17225460
},
{
"epoch": 0.03815612046146931,
"grad_norm": 0.921875,
"learning_rate": 1.556295615500305e-05,
"loss": 0.488,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 382,
"tokens_per_second_per_gpu": 19687.26,
"total_tokens": 17273403
},
{
"epoch": 0.0382560055935674,
"grad_norm": 1.03125,
"learning_rate": 1.553391549243344e-05,
"loss": 0.4886,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 383,
"tokens_per_second_per_gpu": 16172.03,
"total_tokens": 17313336
},
{
"epoch": 0.03835589072566548,
"grad_norm": 1.046875,
"learning_rate": 1.5504807400849957e-05,
"loss": 0.4709,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 384,
"tokens_per_second_per_gpu": 16817.61,
"total_tokens": 17353898
},
{
"epoch": 0.03845577585776357,
"grad_norm": 0.96484375,
"learning_rate": 1.5475632234925505e-05,
"loss": 0.4926,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 385,
"tokens_per_second_per_gpu": 21137.73,
"total_tokens": 17404787
},
{
"epoch": 0.038555660989861656,
"grad_norm": 0.98046875,
"learning_rate": 1.5446390350150272e-05,
"loss": 0.4896,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 386,
"tokens_per_second_per_gpu": 20261.92,
"total_tokens": 17454159
},
{
"epoch": 0.03865554612195975,
"grad_norm": 1.0625,
"learning_rate": 1.54170821028274e-05,
"loss": 0.4817,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 387,
"tokens_per_second_per_gpu": 17798.83,
"total_tokens": 17498095
},
{
"epoch": 0.03875543125405783,
"grad_norm": 1.0,
"learning_rate": 1.5387707850068633e-05,
"loss": 0.4538,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 388,
"tokens_per_second_per_gpu": 18309.45,
"total_tokens": 17542740
},
{
"epoch": 0.03885531638615592,
"grad_norm": 1.03125,
"learning_rate": 1.5358267949789968e-05,
"loss": 0.4877,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 389,
"tokens_per_second_per_gpu": 18106.24,
"total_tokens": 17586608
},
{
"epoch": 0.03895520151825401,
"grad_norm": 1.109375,
"learning_rate": 1.53287627607073e-05,
"loss": 0.4918,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 390,
"tokens_per_second_per_gpu": 16000.74,
"total_tokens": 17626349
},
{
"epoch": 0.039055086650352094,
"grad_norm": 0.98828125,
"learning_rate": 1.529919264233205e-05,
"loss": 0.4838,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 391,
"tokens_per_second_per_gpu": 17778.35,
"total_tokens": 17670133
},
{
"epoch": 0.039154971782450185,
"grad_norm": 1.0078125,
"learning_rate": 1.5269557954966777e-05,
"loss": 0.4923,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 392,
"tokens_per_second_per_gpu": 18786.88,
"total_tokens": 17715930
},
{
"epoch": 0.03925485691454827,
"grad_norm": 1.046875,
"learning_rate": 1.5239859059700794e-05,
"loss": 0.5369,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 393,
"tokens_per_second_per_gpu": 19758.95,
"total_tokens": 17763863
},
{
"epoch": 0.03935474204664636,
"grad_norm": 1.0234375,
"learning_rate": 1.5210096318405768e-05,
"loss": 0.4702,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 394,
"tokens_per_second_per_gpu": 17953.26,
"total_tokens": 17807788
},
{
"epoch": 0.03945462717874444,
"grad_norm": 0.96875,
"learning_rate": 1.5180270093731305e-05,
"loss": 0.4827,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 395,
"tokens_per_second_per_gpu": 18853.02,
"total_tokens": 17852672
},
{
"epoch": 0.03955451231084253,
"grad_norm": 1.0390625,
"learning_rate": 1.5150380749100545e-05,
"loss": 0.4722,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 396,
"tokens_per_second_per_gpu": 15988.81,
"total_tokens": 17892790
},
{
"epoch": 0.039654397442940616,
"grad_norm": 1.0390625,
"learning_rate": 1.5120428648705716e-05,
"loss": 0.471,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 397,
"tokens_per_second_per_gpu": 15682.78,
"total_tokens": 17931167
},
{
"epoch": 0.039754282575038706,
"grad_norm": 0.984375,
"learning_rate": 1.5090414157503715e-05,
"loss": 0.4666,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 398,
"tokens_per_second_per_gpu": 19233.06,
"total_tokens": 17977266
},
{
"epoch": 0.03985416770713679,
"grad_norm": 1.03125,
"learning_rate": 1.5060337641211637e-05,
"loss": 0.4853,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 399,
"tokens_per_second_per_gpu": 17453.17,
"total_tokens": 18019796
},
{
"epoch": 0.03995405283923488,
"grad_norm": 1.015625,
"learning_rate": 1.5030199466302354e-05,
"loss": 0.4996,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 400,
"tokens_per_second_per_gpu": 20378.63,
"total_tokens": 18068913
},
{
"epoch": 0.04005393797133297,
"grad_norm": 0.99609375,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.4709,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 401,
"tokens_per_second_per_gpu": 16785.21,
"total_tokens": 18110434
},
{
"epoch": 0.040153823103431054,
"grad_norm": 1.015625,
"learning_rate": 1.4969739610275556e-05,
"loss": 0.4835,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 402,
"tokens_per_second_per_gpu": 18895.39,
"total_tokens": 18155666
},
{
"epoch": 0.040253708235529144,
"grad_norm": 0.93359375,
"learning_rate": 1.493941866584231e-05,
"loss": 0.4325,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 403,
"tokens_per_second_per_gpu": 17748.23,
"total_tokens": 18199979
},
{
"epoch": 0.04035359336762723,
"grad_norm": 0.984375,
"learning_rate": 1.490903753615141e-05,
"loss": 0.457,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 404,
"tokens_per_second_per_gpu": 18064.08,
"total_tokens": 18244991
},
{
"epoch": 0.04045347849972532,
"grad_norm": 0.96875,
"learning_rate": 1.4878596591387329e-05,
"loss": 0.4955,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 405,
"tokens_per_second_per_gpu": 20242.52,
"total_tokens": 18294163
},
{
"epoch": 0.0405533636318234,
"grad_norm": 1.3359375,
"learning_rate": 1.4848096202463373e-05,
"loss": 0.4543,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 406,
"tokens_per_second_per_gpu": 16321.01,
"total_tokens": 18334899
},
{
"epoch": 0.04065324876392149,
"grad_norm": 1.03125,
"learning_rate": 1.4817536741017153e-05,
"loss": 0.5023,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 407,
"tokens_per_second_per_gpu": 20600.84,
"total_tokens": 18384811
},
{
"epoch": 0.040753133896019575,
"grad_norm": 0.96875,
"learning_rate": 1.478691857940607e-05,
"loss": 0.4377,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 408,
"tokens_per_second_per_gpu": 16604.43,
"total_tokens": 18426521
},
{
"epoch": 0.040853019028117665,
"grad_norm": 0.9453125,
"learning_rate": 1.4756242090702756e-05,
"loss": 0.5098,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 409,
"tokens_per_second_per_gpu": 19356.84,
"total_tokens": 18475111
},
{
"epoch": 0.04095290416021575,
"grad_norm": 0.9609375,
"learning_rate": 1.4725507648690542e-05,
"loss": 0.4828,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 410,
"tokens_per_second_per_gpu": 20213.24,
"total_tokens": 18523782
},
{
"epoch": 0.04105278929231384,
"grad_norm": 0.99609375,
"learning_rate": 1.469471562785891e-05,
"loss": 0.499,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 411,
"tokens_per_second_per_gpu": 19154.22,
"total_tokens": 18569782
},
{
"epoch": 0.04115267442441193,
"grad_norm": 0.97265625,
"learning_rate": 1.4663866403398915e-05,
"loss": 0.4652,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 412,
"tokens_per_second_per_gpu": 17546.97,
"total_tokens": 18613362
},
{
"epoch": 0.04125255955651001,
"grad_norm": 0.984375,
"learning_rate": 1.463296035119862e-05,
"loss": 0.4443,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 413,
"tokens_per_second_per_gpu": 17136.09,
"total_tokens": 18656132
},
{
"epoch": 0.0413524446886081,
"grad_norm": 0.96484375,
"learning_rate": 1.4601997847838518e-05,
"loss": 0.495,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 414,
"tokens_per_second_per_gpu": 19173.21,
"total_tokens": 18703756
},
{
"epoch": 0.04145232982070619,
"grad_norm": 1.0,
"learning_rate": 1.4570979270586944e-05,
"loss": 0.481,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 415,
"tokens_per_second_per_gpu": 16812.7,
"total_tokens": 18745722
},
{
"epoch": 0.04155221495280428,
"grad_norm": 1.015625,
"learning_rate": 1.4539904997395468e-05,
"loss": 0.4299,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 416,
"tokens_per_second_per_gpu": 16677.75,
"total_tokens": 18786807
},
{
"epoch": 0.04165210008490236,
"grad_norm": 1.0234375,
"learning_rate": 1.4508775406894308e-05,
"loss": 0.4857,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 417,
"tokens_per_second_per_gpu": 17607.47,
"total_tokens": 18830485
},
{
"epoch": 0.04175198521700045,
"grad_norm": 0.94140625,
"learning_rate": 1.4477590878387697e-05,
"loss": 0.4711,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 418,
"tokens_per_second_per_gpu": 20045.44,
"total_tokens": 18878778
},
{
"epoch": 0.041851870349098534,
"grad_norm": 1.0,
"learning_rate": 1.4446351791849276e-05,
"loss": 0.4933,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 419,
"tokens_per_second_per_gpu": 18986.08,
"total_tokens": 18924651
},
{
"epoch": 0.041951755481196625,
"grad_norm": 0.96484375,
"learning_rate": 1.4415058527917454e-05,
"loss": 0.505,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 420,
"tokens_per_second_per_gpu": 19508.66,
"total_tokens": 18972299
},
{
"epoch": 0.04205164061329471,
"grad_norm": 0.99609375,
"learning_rate": 1.4383711467890776e-05,
"loss": 0.4733,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 421,
"tokens_per_second_per_gpu": 18690.55,
"total_tokens": 19018039
},
{
"epoch": 0.0421515257453928,
"grad_norm": 0.94921875,
"learning_rate": 1.4352310993723277e-05,
"loss": 0.4814,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 422,
"tokens_per_second_per_gpu": 19967.53,
"total_tokens": 19066992
},
{
"epoch": 0.04225141087749089,
"grad_norm": 0.953125,
"learning_rate": 1.4320857488019826e-05,
"loss": 0.5123,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 423,
"tokens_per_second_per_gpu": 20665.01,
"total_tokens": 19116238
},
{
"epoch": 0.04235129600958897,
"grad_norm": 1.015625,
"learning_rate": 1.4289351334031461e-05,
"loss": 0.4845,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 424,
"tokens_per_second_per_gpu": 18633.95,
"total_tokens": 19161635
},
{
"epoch": 0.04245118114168706,
"grad_norm": 1.0546875,
"learning_rate": 1.4257792915650728e-05,
"loss": 0.4953,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 425,
"tokens_per_second_per_gpu": 19149.34,
"total_tokens": 19207721
},
{
"epoch": 0.042551066273785146,
"grad_norm": 0.96875,
"learning_rate": 1.4226182617406996e-05,
"loss": 0.5075,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 426,
"tokens_per_second_per_gpu": 20615.55,
"total_tokens": 19257403
},
{
"epoch": 0.042650951405883236,
"grad_norm": 1.03125,
"learning_rate": 1.4194520824461773e-05,
"loss": 0.4779,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 427,
"tokens_per_second_per_gpu": 18010.14,
"total_tokens": 19300710
},
{
"epoch": 0.04275083653798132,
"grad_norm": 1.015625,
"learning_rate": 1.4162807922604014e-05,
"loss": 0.437,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 428,
"tokens_per_second_per_gpu": 16253.78,
"total_tokens": 19340967
},
{
"epoch": 0.04285072167007941,
"grad_norm": 0.91015625,
"learning_rate": 1.413104429824542e-05,
"loss": 0.477,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 429,
"tokens_per_second_per_gpu": 20601.91,
"total_tokens": 19390987
},
{
"epoch": 0.04295060680217749,
"grad_norm": 0.98828125,
"learning_rate": 1.4099230338415728e-05,
"loss": 0.4954,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 430,
"tokens_per_second_per_gpu": 19146.66,
"total_tokens": 19437852
},
{
"epoch": 0.043050491934275584,
"grad_norm": 1.0234375,
"learning_rate": 1.4067366430758004e-05,
"loss": 0.4818,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 431,
"tokens_per_second_per_gpu": 17260.63,
"total_tokens": 19480645
},
{
"epoch": 0.04315037706637367,
"grad_norm": 0.921875,
"learning_rate": 1.4035452963523903e-05,
"loss": 0.4795,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 432,
"tokens_per_second_per_gpu": 19292.06,
"total_tokens": 19529326
},
{
"epoch": 0.04325026219847176,
"grad_norm": 0.98828125,
"learning_rate": 1.4003490325568953e-05,
"loss": 0.527,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 433,
"tokens_per_second_per_gpu": 20906.46,
"total_tokens": 19580256
},
{
"epoch": 0.04335014733056985,
"grad_norm": 1.0390625,
"learning_rate": 1.3971478906347806e-05,
"loss": 0.4926,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 434,
"tokens_per_second_per_gpu": 16518.25,
"total_tokens": 19621920
},
{
"epoch": 0.04345003246266793,
"grad_norm": 0.93359375,
"learning_rate": 1.3939419095909513e-05,
"loss": 0.4155,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 435,
"tokens_per_second_per_gpu": 17755.02,
"total_tokens": 19666530
},
{
"epoch": 0.04354991759476602,
"grad_norm": 1.0234375,
"learning_rate": 1.3907311284892737e-05,
"loss": 0.4568,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 436,
"tokens_per_second_per_gpu": 17222.48,
"total_tokens": 19709675
},
{
"epoch": 0.043649802726864105,
"grad_norm": 1.0078125,
"learning_rate": 1.3875155864521031e-05,
"loss": 0.4477,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 437,
"tokens_per_second_per_gpu": 16412.82,
"total_tokens": 19750839
},
{
"epoch": 0.043749687858962195,
"grad_norm": 0.984375,
"learning_rate": 1.3842953226598036e-05,
"loss": 0.4441,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 438,
"tokens_per_second_per_gpu": 17774.0,
"total_tokens": 19794275
},
{
"epoch": 0.04384957299106028,
"grad_norm": 0.984375,
"learning_rate": 1.3810703763502744e-05,
"loss": 0.4864,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 439,
"tokens_per_second_per_gpu": 19258.91,
"total_tokens": 19841649
},
{
"epoch": 0.04394945812315837,
"grad_norm": 1.0,
"learning_rate": 1.3778407868184674e-05,
"loss": 0.4498,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 440,
"tokens_per_second_per_gpu": 18522.25,
"total_tokens": 19887636
},
{
"epoch": 0.04404934325525645,
"grad_norm": 1.078125,
"learning_rate": 1.3746065934159123e-05,
"loss": 0.4497,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 441,
"tokens_per_second_per_gpu": 15927.53,
"total_tokens": 19927026
},
{
"epoch": 0.04414922838735454,
"grad_norm": 1.0078125,
"learning_rate": 1.371367835550235e-05,
"loss": 0.4638,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 442,
"tokens_per_second_per_gpu": 18635.1,
"total_tokens": 19972837
},
{
"epoch": 0.044249113519452626,
"grad_norm": 0.96875,
"learning_rate": 1.3681245526846782e-05,
"loss": 0.4623,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 443,
"tokens_per_second_per_gpu": 18977.89,
"total_tokens": 20019104
},
{
"epoch": 0.04434899865155072,
"grad_norm": 1.015625,
"learning_rate": 1.3648767843376196e-05,
"loss": 0.4301,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 444,
"tokens_per_second_per_gpu": 15652.52,
"total_tokens": 20059232
},
{
"epoch": 0.04444888378364881,
"grad_norm": 1.0234375,
"learning_rate": 1.3616245700820922e-05,
"loss": 0.4843,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 445,
"tokens_per_second_per_gpu": 16862.53,
"total_tokens": 20100364
},
{
"epoch": 0.04454876891574689,
"grad_norm": 0.94921875,
"learning_rate": 1.3583679495453e-05,
"loss": 0.5102,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 446,
"tokens_per_second_per_gpu": 20530.87,
"total_tokens": 20150727
},
{
"epoch": 0.04464865404784498,
"grad_norm": 1.015625,
"learning_rate": 1.3551069624081372e-05,
"loss": 0.4832,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 447,
"tokens_per_second_per_gpu": 19254.04,
"total_tokens": 20196714
},
{
"epoch": 0.044748539179943064,
"grad_norm": 1.015625,
"learning_rate": 1.3518416484047018e-05,
"loss": 0.4832,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 448,
"tokens_per_second_per_gpu": 18247.57,
"total_tokens": 20241670
},
{
"epoch": 0.044848424312041155,
"grad_norm": 0.98046875,
"learning_rate": 1.3485720473218153e-05,
"loss": 0.4366,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 449,
"tokens_per_second_per_gpu": 16882.38,
"total_tokens": 20283970
},
{
"epoch": 0.04494830944413924,
"grad_norm": 0.9375,
"learning_rate": 1.3452981989985347e-05,
"loss": 0.4708,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 450,
"tokens_per_second_per_gpu": 21541.68,
"total_tokens": 20335291
},
{
"epoch": 0.04504819457623733,
"grad_norm": 0.98828125,
"learning_rate": 1.342020143325669e-05,
"loss": 0.5028,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 451,
"tokens_per_second_per_gpu": 19648.41,
"total_tokens": 20383351
},
{
"epoch": 0.04514807970833541,
"grad_norm": 0.984375,
"learning_rate": 1.3387379202452917e-05,
"loss": 0.4838,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 452,
"tokens_per_second_per_gpu": 19749.18,
"total_tokens": 20430693
},
{
"epoch": 0.0452479648404335,
"grad_norm": 0.98046875,
"learning_rate": 1.3354515697502552e-05,
"loss": 0.4566,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 453,
"tokens_per_second_per_gpu": 19588.54,
"total_tokens": 20478883
},
{
"epoch": 0.045347849972531586,
"grad_norm": 1.0234375,
"learning_rate": 1.3321611318837033e-05,
"loss": 0.4549,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 454,
"tokens_per_second_per_gpu": 18174.67,
"total_tokens": 20523444
},
{
"epoch": 0.045447735104629676,
"grad_norm": 1.0078125,
"learning_rate": 1.3288666467385834e-05,
"loss": 0.442,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 455,
"tokens_per_second_per_gpu": 16575.21,
"total_tokens": 20564379
},
{
"epoch": 0.045547620236727766,
"grad_norm": 0.93359375,
"learning_rate": 1.3255681544571568e-05,
"loss": 0.4501,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 456,
"tokens_per_second_per_gpu": 19453.63,
"total_tokens": 20611507
},
{
"epoch": 0.04564750536882585,
"grad_norm": 0.9765625,
"learning_rate": 1.3222656952305113e-05,
"loss": 0.4896,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 457,
"tokens_per_second_per_gpu": 18982.61,
"total_tokens": 20657825
},
{
"epoch": 0.04574739050092394,
"grad_norm": 0.984375,
"learning_rate": 1.3189593092980701e-05,
"loss": 0.4279,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 458,
"tokens_per_second_per_gpu": 18418.04,
"total_tokens": 20702133
},
{
"epoch": 0.045847275633022024,
"grad_norm": 1.0390625,
"learning_rate": 1.3156490369471026e-05,
"loss": 0.4607,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 459,
"tokens_per_second_per_gpu": 18412.72,
"total_tokens": 20747114
},
{
"epoch": 0.045947160765120114,
"grad_norm": 0.984375,
"learning_rate": 1.3123349185122328e-05,
"loss": 0.4671,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 460,
"tokens_per_second_per_gpu": 17640.8,
"total_tokens": 20790822
},
{
"epoch": 0.0460470458972182,
"grad_norm": 1.0234375,
"learning_rate": 1.3090169943749475e-05,
"loss": 0.5004,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 461,
"tokens_per_second_per_gpu": 18665.44,
"total_tokens": 20836379
},
{
"epoch": 0.04614693102931629,
"grad_norm": 0.953125,
"learning_rate": 1.3056953049631059e-05,
"loss": 0.4387,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 462,
"tokens_per_second_per_gpu": 19129.47,
"total_tokens": 20883986
},
{
"epoch": 0.04624681616141437,
"grad_norm": 1.1484375,
"learning_rate": 1.3023698907504447e-05,
"loss": 0.4174,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 463,
"tokens_per_second_per_gpu": 16146.06,
"total_tokens": 20923457
},
{
"epoch": 0.04634670129351246,
"grad_norm": 0.96875,
"learning_rate": 1.2990407922560869e-05,
"loss": 0.5199,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 464,
"tokens_per_second_per_gpu": 21859.89,
"total_tokens": 20975561
},
{
"epoch": 0.046446586425610545,
"grad_norm": 0.98828125,
"learning_rate": 1.2957080500440469e-05,
"loss": 0.4812,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 465,
"tokens_per_second_per_gpu": 19141.51,
"total_tokens": 21022847
},
{
"epoch": 0.046546471557708635,
"grad_norm": 1.046875,
"learning_rate": 1.2923717047227368e-05,
"loss": 0.4277,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 466,
"tokens_per_second_per_gpu": 16949.78,
"total_tokens": 21064452
},
{
"epoch": 0.046646356689806726,
"grad_norm": 1.0625,
"learning_rate": 1.2890317969444716e-05,
"loss": 0.4635,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 467,
"tokens_per_second_per_gpu": 17184.9,
"total_tokens": 21106486
},
{
"epoch": 0.04674624182190481,
"grad_norm": 0.98046875,
"learning_rate": 1.2856883674049736e-05,
"loss": 0.4734,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 468,
"tokens_per_second_per_gpu": 20115.21,
"total_tokens": 21154512
},
{
"epoch": 0.0468461269540029,
"grad_norm": 1.2734375,
"learning_rate": 1.2823414568428767e-05,
"loss": 0.412,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 469,
"tokens_per_second_per_gpu": 15311.39,
"total_tokens": 21193038
},
{
"epoch": 0.04694601208610098,
"grad_norm": 0.9765625,
"learning_rate": 1.2789911060392295e-05,
"loss": 0.4683,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 470,
"tokens_per_second_per_gpu": 19043.56,
"total_tokens": 21240442
},
{
"epoch": 0.04704589721819907,
"grad_norm": 1.0078125,
"learning_rate": 1.2756373558169992e-05,
"loss": 0.5045,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 471,
"tokens_per_second_per_gpu": 18973.02,
"total_tokens": 21286255
},
{
"epoch": 0.047145782350297157,
"grad_norm": 0.98046875,
"learning_rate": 1.2722802470405744e-05,
"loss": 0.4847,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 472,
"tokens_per_second_per_gpu": 19835.41,
"total_tokens": 21334376
},
{
"epoch": 0.04724566748239525,
"grad_norm": 0.94921875,
"learning_rate": 1.2689198206152657e-05,
"loss": 0.4578,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 473,
"tokens_per_second_per_gpu": 19532.41,
"total_tokens": 21382963
},
{
"epoch": 0.04734555261449333,
"grad_norm": 0.9921875,
"learning_rate": 1.265556117486809e-05,
"loss": 0.4629,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 474,
"tokens_per_second_per_gpu": 18454.21,
"total_tokens": 21428429
},
{
"epoch": 0.04744543774659142,
"grad_norm": 0.9609375,
"learning_rate": 1.2621891786408648e-05,
"loss": 0.4086,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 475,
"tokens_per_second_per_gpu": 17434.94,
"total_tokens": 21472056
},
{
"epoch": 0.047545322878689504,
"grad_norm": 1.0390625,
"learning_rate": 1.2588190451025209e-05,
"loss": 0.4546,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 476,
"tokens_per_second_per_gpu": 18143.91,
"total_tokens": 21516053
},
{
"epoch": 0.047645208010787594,
"grad_norm": 1.5234375,
"learning_rate": 1.2554457579357906e-05,
"loss": 0.4865,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 477,
"tokens_per_second_per_gpu": 17698.1,
"total_tokens": 21559736
},
{
"epoch": 0.047745093142885685,
"grad_norm": 1.0234375,
"learning_rate": 1.252069358243114e-05,
"loss": 0.4796,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 478,
"tokens_per_second_per_gpu": 18941.25,
"total_tokens": 21606119
},
{
"epoch": 0.04784497827498377,
"grad_norm": 1.0,
"learning_rate": 1.2486898871648552e-05,
"loss": 0.4479,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 479,
"tokens_per_second_per_gpu": 17602.65,
"total_tokens": 21649683
},
{
"epoch": 0.04794486340708186,
"grad_norm": 1.0234375,
"learning_rate": 1.2453073858788027e-05,
"loss": 0.4305,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 480,
"tokens_per_second_per_gpu": 16582.51,
"total_tokens": 21690429
},
{
"epoch": 0.04804474853917994,
"grad_norm": 1.03125,
"learning_rate": 1.2419218955996677e-05,
"loss": 0.5023,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 481,
"tokens_per_second_per_gpu": 19550.42,
"total_tokens": 21738074
},
{
"epoch": 0.04814463367127803,
"grad_norm": 0.9921875,
"learning_rate": 1.238533457578581e-05,
"loss": 0.4985,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 482,
"tokens_per_second_per_gpu": 19261.78,
"total_tokens": 21784804
},
{
"epoch": 0.048244518803376116,
"grad_norm": 0.984375,
"learning_rate": 1.23514211310259e-05,
"loss": 0.4767,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 483,
"tokens_per_second_per_gpu": 19732.31,
"total_tokens": 21832012
},
{
"epoch": 0.048344403935474206,
"grad_norm": 0.9921875,
"learning_rate": 1.2317479034941572e-05,
"loss": 0.4619,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 484,
"tokens_per_second_per_gpu": 20400.51,
"total_tokens": 21880113
},
{
"epoch": 0.04844428906757229,
"grad_norm": 0.9609375,
"learning_rate": 1.2283508701106559e-05,
"loss": 0.4387,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 485,
"tokens_per_second_per_gpu": 18048.0,
"total_tokens": 21924247
},
{
"epoch": 0.04854417419967038,
"grad_norm": 1.0078125,
"learning_rate": 1.2249510543438652e-05,
"loss": 0.4826,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 486,
"tokens_per_second_per_gpu": 17412.13,
"total_tokens": 21967561
},
{
"epoch": 0.04864405933176846,
"grad_norm": 1.0234375,
"learning_rate": 1.2215484976194675e-05,
"loss": 0.5,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 487,
"tokens_per_second_per_gpu": 19866.72,
"total_tokens": 22014791
},
{
"epoch": 0.048743944463866554,
"grad_norm": 0.98046875,
"learning_rate": 1.2181432413965428e-05,
"loss": 0.4481,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 488,
"tokens_per_second_per_gpu": 17658.1,
"total_tokens": 22058505
},
{
"epoch": 0.048843829595964644,
"grad_norm": 0.95703125,
"learning_rate": 1.2147353271670634e-05,
"loss": 0.4992,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 489,
"tokens_per_second_per_gpu": 21458.73,
"total_tokens": 22109615
},
{
"epoch": 0.04894371472806273,
"grad_norm": 1.0234375,
"learning_rate": 1.211324796455389e-05,
"loss": 0.451,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 490,
"tokens_per_second_per_gpu": 18087.11,
"total_tokens": 22153686
},
{
"epoch": 0.04904359986016082,
"grad_norm": 1.2265625,
"learning_rate": 1.2079116908177592e-05,
"loss": 0.483,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 491,
"tokens_per_second_per_gpu": 17660.42,
"total_tokens": 22197018
},
{
"epoch": 0.0491434849922589,
"grad_norm": 1.8359375,
"learning_rate": 1.2044960518417902e-05,
"loss": 0.4741,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 492,
"tokens_per_second_per_gpu": 17381.69,
"total_tokens": 22240331
},
{
"epoch": 0.04924337012435699,
"grad_norm": 0.953125,
"learning_rate": 1.2010779211459649e-05,
"loss": 0.4283,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 493,
"tokens_per_second_per_gpu": 17621.55,
"total_tokens": 22283909
},
{
"epoch": 0.049343255256455075,
"grad_norm": 0.9609375,
"learning_rate": 1.1976573403791263e-05,
"loss": 0.4636,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 494,
"tokens_per_second_per_gpu": 19438.53,
"total_tokens": 22332531
},
{
"epoch": 0.049443140388553165,
"grad_norm": 1.03125,
"learning_rate": 1.194234351219972e-05,
"loss": 0.464,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 495,
"tokens_per_second_per_gpu": 18317.53,
"total_tokens": 22376999
},
{
"epoch": 0.04954302552065125,
"grad_norm": 1.015625,
"learning_rate": 1.190808995376545e-05,
"loss": 0.509,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 496,
"tokens_per_second_per_gpu": 19293.59,
"total_tokens": 22424618
},
{
"epoch": 0.04964291065274934,
"grad_norm": 0.99609375,
"learning_rate": 1.187381314585725e-05,
"loss": 0.4516,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 497,
"tokens_per_second_per_gpu": 18270.3,
"total_tokens": 22469423
},
{
"epoch": 0.04974279578484742,
"grad_norm": 0.98046875,
"learning_rate": 1.1839513506127202e-05,
"loss": 0.4849,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 498,
"tokens_per_second_per_gpu": 19430.08,
"total_tokens": 22516368
},
{
"epoch": 0.04984268091694551,
"grad_norm": 1.09375,
"learning_rate": 1.1805191452505602e-05,
"loss": 0.4667,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 499,
"tokens_per_second_per_gpu": 16036.97,
"total_tokens": 22556525
},
{
"epoch": 0.0499425660490436,
"grad_norm": 1.015625,
"learning_rate": 1.1770847403195836e-05,
"loss": 0.4213,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 500,
"tokens_per_second_per_gpu": 17013.59,
"total_tokens": 22598669
},
{
"epoch": 0.05004245118114169,
"grad_norm": 1.015625,
"learning_rate": 1.1736481776669307e-05,
"loss": 0.4727,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 501,
"tokens_per_second_per_gpu": 19157.82,
"total_tokens": 22644756
},
{
"epoch": 0.05014233631323978,
"grad_norm": 0.8984375,
"learning_rate": 1.1702094991660326e-05,
"loss": 0.4344,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 502,
"tokens_per_second_per_gpu": 19863.48,
"total_tokens": 22694880
},
{
"epoch": 0.05024222144533786,
"grad_norm": 1.125,
"learning_rate": 1.1667687467161025e-05,
"loss": 0.4611,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 503,
"tokens_per_second_per_gpu": 18555.46,
"total_tokens": 22740472
},
{
"epoch": 0.05034210657743595,
"grad_norm": 1.03125,
"learning_rate": 1.1633259622416224e-05,
"loss": 0.4185,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 504,
"tokens_per_second_per_gpu": 16048.54,
"total_tokens": 22780599
},
{
"epoch": 0.050441991709534034,
"grad_norm": 0.96875,
"learning_rate": 1.159881187691835e-05,
"loss": 0.4438,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 505,
"tokens_per_second_per_gpu": 18750.75,
"total_tokens": 22826251
},
{
"epoch": 0.050541876841632125,
"grad_norm": 0.96484375,
"learning_rate": 1.156434465040231e-05,
"loss": 0.4806,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 506,
"tokens_per_second_per_gpu": 19927.32,
"total_tokens": 22875342
},
{
"epoch": 0.05064176197373021,
"grad_norm": 1.0078125,
"learning_rate": 1.1529858362840383e-05,
"loss": 0.4731,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 507,
"tokens_per_second_per_gpu": 17839.7,
"total_tokens": 22918851
},
{
"epoch": 0.0507416471058283,
"grad_norm": 0.9921875,
"learning_rate": 1.1495353434437098e-05,
"loss": 0.4712,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 508,
"tokens_per_second_per_gpu": 17955.09,
"total_tokens": 22963103
},
{
"epoch": 0.05084153223792638,
"grad_norm": 1.0625,
"learning_rate": 1.1460830285624119e-05,
"loss": 0.4683,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 509,
"tokens_per_second_per_gpu": 17845.55,
"total_tokens": 23007258
},
{
"epoch": 0.05094141737002447,
"grad_norm": 0.9765625,
"learning_rate": 1.1426289337055119e-05,
"loss": 0.4843,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 510,
"tokens_per_second_per_gpu": 19624.71,
"total_tokens": 23055166
},
{
"epoch": 0.05104130250212256,
"grad_norm": 1.0390625,
"learning_rate": 1.1391731009600655e-05,
"loss": 0.4588,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 511,
"tokens_per_second_per_gpu": 15869.37,
"total_tokens": 23095363
},
{
"epoch": 0.051141187634220646,
"grad_norm": 1.125,
"learning_rate": 1.1357155724343046e-05,
"loss": 0.4484,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 512,
"tokens_per_second_per_gpu": 17790.97,
"total_tokens": 23138552
},
{
"epoch": 0.051241072766318736,
"grad_norm": 0.94921875,
"learning_rate": 1.1322563902571227e-05,
"loss": 0.4843,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 513,
"tokens_per_second_per_gpu": 18872.7,
"total_tokens": 23186121
},
{
"epoch": 0.05134095789841682,
"grad_norm": 1.0078125,
"learning_rate": 1.128795596577563e-05,
"loss": 0.4933,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 514,
"tokens_per_second_per_gpu": 18641.21,
"total_tokens": 23231758
},
{
"epoch": 0.05144084303051491,
"grad_norm": 0.97265625,
"learning_rate": 1.1253332335643043e-05,
"loss": 0.4605,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 515,
"tokens_per_second_per_gpu": 19688.1,
"total_tokens": 23278872
},
{
"epoch": 0.05154072816261299,
"grad_norm": 0.9921875,
"learning_rate": 1.1218693434051475e-05,
"loss": 0.5027,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 516,
"tokens_per_second_per_gpu": 20087.61,
"total_tokens": 23327833
},
{
"epoch": 0.051640613294711084,
"grad_norm": 1.1328125,
"learning_rate": 1.1184039683065014e-05,
"loss": 0.4815,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 517,
"tokens_per_second_per_gpu": 18899.57,
"total_tokens": 23373189
},
{
"epoch": 0.05174049842680917,
"grad_norm": 0.98046875,
"learning_rate": 1.1149371504928667e-05,
"loss": 0.4454,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 518,
"tokens_per_second_per_gpu": 19441.21,
"total_tokens": 23420039
},
{
"epoch": 0.05184038355890726,
"grad_norm": 0.97265625,
"learning_rate": 1.1114689322063255e-05,
"loss": 0.4678,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 519,
"tokens_per_second_per_gpu": 18903.71,
"total_tokens": 23465844
},
{
"epoch": 0.05194026869100534,
"grad_norm": 1.078125,
"learning_rate": 1.1079993557060228e-05,
"loss": 0.4813,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 520,
"tokens_per_second_per_gpu": 17792.12,
"total_tokens": 23509432
},
{
"epoch": 0.05204015382310343,
"grad_norm": 0.98046875,
"learning_rate": 1.1045284632676535e-05,
"loss": 0.5014,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 521,
"tokens_per_second_per_gpu": 19237.45,
"total_tokens": 23556465
},
{
"epoch": 0.05214003895520152,
"grad_norm": 1.0703125,
"learning_rate": 1.1010562971829464e-05,
"loss": 0.4765,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 522,
"tokens_per_second_per_gpu": 17247.23,
"total_tokens": 23598540
},
{
"epoch": 0.052239924087299605,
"grad_norm": 1.0625,
"learning_rate": 1.0975828997591496e-05,
"loss": 0.4712,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 523,
"tokens_per_second_per_gpu": 18469.09,
"total_tokens": 23643815
},
{
"epoch": 0.052339809219397695,
"grad_norm": 1.0,
"learning_rate": 1.0941083133185146e-05,
"loss": 0.4282,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 524,
"tokens_per_second_per_gpu": 17815.45,
"total_tokens": 23686989
},
{
"epoch": 0.05243969435149578,
"grad_norm": 0.98828125,
"learning_rate": 1.0906325801977804e-05,
"loss": 0.4625,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 525,
"tokens_per_second_per_gpu": 17453.89,
"total_tokens": 23730806
},
{
"epoch": 0.05253957948359387,
"grad_norm": 0.9765625,
"learning_rate": 1.0871557427476585e-05,
"loss": 0.4538,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 526,
"tokens_per_second_per_gpu": 18837.53,
"total_tokens": 23776963
},
{
"epoch": 0.05263946461569195,
"grad_norm": 1.0078125,
"learning_rate": 1.083677843332316e-05,
"loss": 0.4792,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 527,
"tokens_per_second_per_gpu": 19107.38,
"total_tokens": 23824252
},
{
"epoch": 0.05273934974779004,
"grad_norm": 1.015625,
"learning_rate": 1.0801989243288588e-05,
"loss": 0.4795,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 528,
"tokens_per_second_per_gpu": 18085.61,
"total_tokens": 23869316
},
{
"epoch": 0.052839234879888126,
"grad_norm": 1.0234375,
"learning_rate": 1.0767190281268187e-05,
"loss": 0.4145,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 529,
"tokens_per_second_per_gpu": 16406.74,
"total_tokens": 23910350
},
{
"epoch": 0.05293912001198622,
"grad_norm": 1.0,
"learning_rate": 1.0732381971276318e-05,
"loss": 0.4572,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 530,
"tokens_per_second_per_gpu": 17710.36,
"total_tokens": 23954773
},
{
"epoch": 0.0530390051440843,
"grad_norm": 0.984375,
"learning_rate": 1.0697564737441254e-05,
"loss": 0.4703,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 531,
"tokens_per_second_per_gpu": 18903.27,
"total_tokens": 24000771
},
{
"epoch": 0.05313889027618239,
"grad_norm": 0.9921875,
"learning_rate": 1.0662739004000005e-05,
"loss": 0.4402,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 532,
"tokens_per_second_per_gpu": 18233.74,
"total_tokens": 24045859
},
{
"epoch": 0.053238775408280474,
"grad_norm": 0.98828125,
"learning_rate": 1.0627905195293135e-05,
"loss": 0.4951,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 533,
"tokens_per_second_per_gpu": 19692.99,
"total_tokens": 24093700
},
{
"epoch": 0.053338660540378564,
"grad_norm": 0.9453125,
"learning_rate": 1.0593063735759619e-05,
"loss": 0.4596,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 534,
"tokens_per_second_per_gpu": 18803.61,
"total_tokens": 24139870
},
{
"epoch": 0.053438545672476655,
"grad_norm": 0.96875,
"learning_rate": 1.055821504993164e-05,
"loss": 0.4681,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 535,
"tokens_per_second_per_gpu": 19578.83,
"total_tokens": 24186517
},
{
"epoch": 0.05353843080457474,
"grad_norm": 0.96875,
"learning_rate": 1.0523359562429441e-05,
"loss": 0.5205,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 536,
"tokens_per_second_per_gpu": 20269.17,
"total_tokens": 24236114
},
{
"epoch": 0.05363831593667283,
"grad_norm": 0.95703125,
"learning_rate": 1.0488497697956134e-05,
"loss": 0.4709,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 537,
"tokens_per_second_per_gpu": 18848.83,
"total_tokens": 24282417
},
{
"epoch": 0.05373820106877091,
"grad_norm": 0.953125,
"learning_rate": 1.0453629881292537e-05,
"loss": 0.4237,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 538,
"tokens_per_second_per_gpu": 18133.2,
"total_tokens": 24327150
},
{
"epoch": 0.053838086200869,
"grad_norm": 0.9921875,
"learning_rate": 1.0418756537291996e-05,
"loss": 0.3724,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 539,
"tokens_per_second_per_gpu": 15680.58,
"total_tokens": 24365728
},
{
"epoch": 0.053937971332967086,
"grad_norm": 0.96484375,
"learning_rate": 1.03838780908752e-05,
"loss": 0.428,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 540,
"tokens_per_second_per_gpu": 18189.04,
"total_tokens": 24410190
},
{
"epoch": 0.054037856465065176,
"grad_norm": 1.0703125,
"learning_rate": 1.0348994967025012e-05,
"loss": 0.4759,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 541,
"tokens_per_second_per_gpu": 17332.7,
"total_tokens": 24451894
},
{
"epoch": 0.05413774159716326,
"grad_norm": 0.9921875,
"learning_rate": 1.0314107590781284e-05,
"loss": 0.4739,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 542,
"tokens_per_second_per_gpu": 20634.37,
"total_tokens": 24502320
},
{
"epoch": 0.05423762672926135,
"grad_norm": 1.046875,
"learning_rate": 1.0279216387235691e-05,
"loss": 0.4531,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 543,
"tokens_per_second_per_gpu": 17057.94,
"total_tokens": 24545125
},
{
"epoch": 0.05433751186135943,
"grad_norm": 1.03125,
"learning_rate": 1.0244321781526533e-05,
"loss": 0.4646,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 544,
"tokens_per_second_per_gpu": 17576.91,
"total_tokens": 24587955
},
{
"epoch": 0.054437396993457524,
"grad_norm": 1.015625,
"learning_rate": 1.0209424198833571e-05,
"loss": 0.4518,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 545,
"tokens_per_second_per_gpu": 17794.97,
"total_tokens": 24632457
},
{
"epoch": 0.054537282125555614,
"grad_norm": 0.94921875,
"learning_rate": 1.0174524064372837e-05,
"loss": 0.4674,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 546,
"tokens_per_second_per_gpu": 21668.28,
"total_tokens": 24684174
},
{
"epoch": 0.0546371672576537,
"grad_norm": 1.0,
"learning_rate": 1.0139621803391454e-05,
"loss": 0.4574,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 547,
"tokens_per_second_per_gpu": 18638.05,
"total_tokens": 24731619
},
{
"epoch": 0.05473705238975179,
"grad_norm": 0.94921875,
"learning_rate": 1.010471784116246e-05,
"loss": 0.4437,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 548,
"tokens_per_second_per_gpu": 18738.17,
"total_tokens": 24777474
},
{
"epoch": 0.05483693752184987,
"grad_norm": 0.953125,
"learning_rate": 1.0069812602979617e-05,
"loss": 0.4953,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 549,
"tokens_per_second_per_gpu": 20376.59,
"total_tokens": 24827482
},
{
"epoch": 0.05493682265394796,
"grad_norm": 0.9296875,
"learning_rate": 1.0034906514152239e-05,
"loss": 0.4572,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 550,
"tokens_per_second_per_gpu": 19513.4,
"total_tokens": 24876158
},
{
"epoch": 0.055036707786046045,
"grad_norm": 0.9765625,
"learning_rate": 1e-05,
"loss": 0.4804,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 551,
"tokens_per_second_per_gpu": 19312.89,
"total_tokens": 24924673
},
{
"epoch": 0.055136592918144135,
"grad_norm": 0.953125,
"learning_rate": 9.965093485847766e-06,
"loss": 0.4502,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 552,
"tokens_per_second_per_gpu": 19002.55,
"total_tokens": 24970279
},
{
"epoch": 0.05523647805024222,
"grad_norm": 0.95703125,
"learning_rate": 9.930187397020385e-06,
"loss": 0.451,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 553,
"tokens_per_second_per_gpu": 19493.12,
"total_tokens": 25017731
},
{
"epoch": 0.05533636318234031,
"grad_norm": 0.94921875,
"learning_rate": 9.895282158837545e-06,
"loss": 0.4229,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 554,
"tokens_per_second_per_gpu": 17975.71,
"total_tokens": 25062533
},
{
"epoch": 0.05543624831443839,
"grad_norm": 1.0,
"learning_rate": 9.860378196608549e-06,
"loss": 0.4934,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 555,
"tokens_per_second_per_gpu": 18552.65,
"total_tokens": 25108295
},
{
"epoch": 0.05553613344653648,
"grad_norm": 1.0,
"learning_rate": 9.825475935627165e-06,
"loss": 0.3945,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 556,
"tokens_per_second_per_gpu": 16296.72,
"total_tokens": 25147946
},
{
"epoch": 0.05563601857863457,
"grad_norm": 0.96875,
"learning_rate": 9.790575801166432e-06,
"loss": 0.4595,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 557,
"tokens_per_second_per_gpu": 18774.27,
"total_tokens": 25194998
},
{
"epoch": 0.05573590371073266,
"grad_norm": 0.9453125,
"learning_rate": 9.75567821847347e-06,
"loss": 0.4485,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 558,
"tokens_per_second_per_gpu": 19033.71,
"total_tokens": 25242866
},
{
"epoch": 0.05583578884283075,
"grad_norm": 0.9921875,
"learning_rate": 9.720783612764314e-06,
"loss": 0.4339,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 559,
"tokens_per_second_per_gpu": 17784.22,
"total_tokens": 25286127
},
{
"epoch": 0.05593567397492883,
"grad_norm": 1.0,
"learning_rate": 9.685892409218718e-06,
"loss": 0.4305,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 560,
"tokens_per_second_per_gpu": 16979.23,
"total_tokens": 25327149
},
{
"epoch": 0.05603555910702692,
"grad_norm": 0.94921875,
"learning_rate": 9.651005032974994e-06,
"loss": 0.5061,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 561,
"tokens_per_second_per_gpu": 19737.19,
"total_tokens": 25375816
},
{
"epoch": 0.056135444239125004,
"grad_norm": 0.9921875,
"learning_rate": 9.616121909124801e-06,
"loss": 0.42,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 562,
"tokens_per_second_per_gpu": 17275.51,
"total_tokens": 25417750
},
{
"epoch": 0.056235329371223094,
"grad_norm": 0.98828125,
"learning_rate": 9.581243462708007e-06,
"loss": 0.475,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 563,
"tokens_per_second_per_gpu": 18029.63,
"total_tokens": 25463172
},
{
"epoch": 0.05633521450332118,
"grad_norm": 1.0,
"learning_rate": 9.546370118707463e-06,
"loss": 0.3855,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 564,
"tokens_per_second_per_gpu": 15926.85,
"total_tokens": 25501577
},
{
"epoch": 0.05643509963541927,
"grad_norm": 1.25,
"learning_rate": 9.511502302043867e-06,
"loss": 0.4714,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 565,
"tokens_per_second_per_gpu": 20540.94,
"total_tokens": 25550938
},
{
"epoch": 0.05653498476751735,
"grad_norm": 0.953125,
"learning_rate": 9.476640437570562e-06,
"loss": 0.4812,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 566,
"tokens_per_second_per_gpu": 18593.41,
"total_tokens": 25596861
},
{
"epoch": 0.05663486989961544,
"grad_norm": 1.0390625,
"learning_rate": 9.441784950068362e-06,
"loss": 0.4998,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 567,
"tokens_per_second_per_gpu": 19689.28,
"total_tokens": 25645516
},
{
"epoch": 0.05673475503171353,
"grad_norm": 1.0078125,
"learning_rate": 9.406936264240386e-06,
"loss": 0.4486,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 568,
"tokens_per_second_per_gpu": 17352.5,
"total_tokens": 25687800
},
{
"epoch": 0.056834640163811616,
"grad_norm": 1.0390625,
"learning_rate": 9.372094804706867e-06,
"loss": 0.4391,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 569,
"tokens_per_second_per_gpu": 17140.78,
"total_tokens": 25730650
},
{
"epoch": 0.056934525295909706,
"grad_norm": 1.0234375,
"learning_rate": 9.337260996000002e-06,
"loss": 0.4701,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 570,
"tokens_per_second_per_gpu": 16625.41,
"total_tokens": 25772206
},
{
"epoch": 0.05703441042800779,
"grad_norm": 0.95703125,
"learning_rate": 9.302435262558748e-06,
"loss": 0.467,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 571,
"tokens_per_second_per_gpu": 19677.5,
"total_tokens": 25821005
},
{
"epoch": 0.05713429556010588,
"grad_norm": 1.015625,
"learning_rate": 9.267618028723687e-06,
"loss": 0.4443,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 572,
"tokens_per_second_per_gpu": 17615.31,
"total_tokens": 25863849
},
{
"epoch": 0.05723418069220396,
"grad_norm": 0.96875,
"learning_rate": 9.232809718731815e-06,
"loss": 0.4633,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 573,
"tokens_per_second_per_gpu": 18647.94,
"total_tokens": 25909625
},
{
"epoch": 0.057334065824302054,
"grad_norm": 0.97265625,
"learning_rate": 9.198010756711413e-06,
"loss": 0.4333,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 574,
"tokens_per_second_per_gpu": 18315.95,
"total_tokens": 25954200
},
{
"epoch": 0.05743395095640014,
"grad_norm": 0.984375,
"learning_rate": 9.163221566676847e-06,
"loss": 0.5012,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 575,
"tokens_per_second_per_gpu": 20415.32,
"total_tokens": 26003168
},
{
"epoch": 0.05753383608849823,
"grad_norm": 0.99609375,
"learning_rate": 9.128442572523418e-06,
"loss": 0.4842,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 576,
"tokens_per_second_per_gpu": 18732.91,
"total_tokens": 26048948
},
{
"epoch": 0.05763372122059631,
"grad_norm": 0.93359375,
"learning_rate": 9.093674198022201e-06,
"loss": 0.4333,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 577,
"tokens_per_second_per_gpu": 19008.14,
"total_tokens": 26096480
},
{
"epoch": 0.0577336063526944,
"grad_norm": 0.97265625,
"learning_rate": 9.058916866814857e-06,
"loss": 0.4946,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 578,
"tokens_per_second_per_gpu": 20926.04,
"total_tokens": 26147269
},
{
"epoch": 0.05783349148479249,
"grad_norm": 1.015625,
"learning_rate": 9.024171002408507e-06,
"loss": 0.4802,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 579,
"tokens_per_second_per_gpu": 19004.89,
"total_tokens": 26194908
},
{
"epoch": 0.057933376616890575,
"grad_norm": 0.9765625,
"learning_rate": 8.989437028170537e-06,
"loss": 0.4595,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 580,
"tokens_per_second_per_gpu": 18815.85,
"total_tokens": 26241318
},
{
"epoch": 0.058033261748988665,
"grad_norm": 0.984375,
"learning_rate": 8.954715367323468e-06,
"loss": 0.4884,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 581,
"tokens_per_second_per_gpu": 20409.23,
"total_tokens": 26289950
},
{
"epoch": 0.05813314688108675,
"grad_norm": 1.0234375,
"learning_rate": 8.920006442939772e-06,
"loss": 0.4935,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 582,
"tokens_per_second_per_gpu": 19855.08,
"total_tokens": 26339613
},
{
"epoch": 0.05823303201318484,
"grad_norm": 0.984375,
"learning_rate": 8.885310677936746e-06,
"loss": 0.4837,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 583,
"tokens_per_second_per_gpu": 18973.04,
"total_tokens": 26385041
},
{
"epoch": 0.05833291714528292,
"grad_norm": 0.99609375,
"learning_rate": 8.850628495071336e-06,
"loss": 0.4396,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 584,
"tokens_per_second_per_gpu": 17273.93,
"total_tokens": 26427549
},
{
"epoch": 0.05843280227738101,
"grad_norm": 0.94140625,
"learning_rate": 8.815960316934991e-06,
"loss": 0.4523,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 585,
"tokens_per_second_per_gpu": 19089.53,
"total_tokens": 26473743
},
{
"epoch": 0.058532687409479096,
"grad_norm": 0.9921875,
"learning_rate": 8.781306565948528e-06,
"loss": 0.3876,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 586,
"tokens_per_second_per_gpu": 16731.9,
"total_tokens": 26514808
},
{
"epoch": 0.05863257254157719,
"grad_norm": 1.0390625,
"learning_rate": 8.746667664356957e-06,
"loss": 0.4639,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 587,
"tokens_per_second_per_gpu": 18593.43,
"total_tokens": 26560648
},
{
"epoch": 0.05873245767367527,
"grad_norm": 0.98046875,
"learning_rate": 8.712044034224374e-06,
"loss": 0.4661,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 588,
"tokens_per_second_per_gpu": 19767.89,
"total_tokens": 26607406
},
{
"epoch": 0.05883234280577336,
"grad_norm": 0.97265625,
"learning_rate": 8.677436097428775e-06,
"loss": 0.4576,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 589,
"tokens_per_second_per_gpu": 18286.53,
"total_tokens": 26652688
},
{
"epoch": 0.05893222793787145,
"grad_norm": 0.9609375,
"learning_rate": 8.642844275656957e-06,
"loss": 0.4869,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 590,
"tokens_per_second_per_gpu": 20452.72,
"total_tokens": 26701540
},
{
"epoch": 0.059032113069969534,
"grad_norm": 0.9765625,
"learning_rate": 8.60826899039935e-06,
"loss": 0.4703,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 591,
"tokens_per_second_per_gpu": 19659.34,
"total_tokens": 26748739
},
{
"epoch": 0.059131998202067625,
"grad_norm": 1.078125,
"learning_rate": 8.573710662944884e-06,
"loss": 0.4692,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 592,
"tokens_per_second_per_gpu": 15527.22,
"total_tokens": 26787153
},
{
"epoch": 0.05923188333416571,
"grad_norm": 1.0078125,
"learning_rate": 8.539169714375885e-06,
"loss": 0.4582,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 593,
"tokens_per_second_per_gpu": 18806.46,
"total_tokens": 26832537
},
{
"epoch": 0.0593317684662638,
"grad_norm": 0.93359375,
"learning_rate": 8.504646565562907e-06,
"loss": 0.4949,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 594,
"tokens_per_second_per_gpu": 20047.32,
"total_tokens": 26881583
},
{
"epoch": 0.05943165359836188,
"grad_norm": 0.96484375,
"learning_rate": 8.47014163715962e-06,
"loss": 0.4277,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 595,
"tokens_per_second_per_gpu": 17635.6,
"total_tokens": 26924740
},
{
"epoch": 0.05953153873045997,
"grad_norm": 1.0703125,
"learning_rate": 8.43565534959769e-06,
"loss": 0.4941,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 596,
"tokens_per_second_per_gpu": 16559.95,
"total_tokens": 26964699
},
{
"epoch": 0.059631423862558056,
"grad_norm": 1.0546875,
"learning_rate": 8.401188123081653e-06,
"loss": 0.503,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 597,
"tokens_per_second_per_gpu": 19523.28,
"total_tokens": 27013061
},
{
"epoch": 0.059731308994656146,
"grad_norm": 1.046875,
"learning_rate": 8.366740377583781e-06,
"loss": 0.4505,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 598,
"tokens_per_second_per_gpu": 16031.96,
"total_tokens": 27053346
},
{
"epoch": 0.05983119412675423,
"grad_norm": 1.828125,
"learning_rate": 8.332312532838978e-06,
"loss": 0.4278,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 599,
"tokens_per_second_per_gpu": 18183.46,
"total_tokens": 27097289
},
{
"epoch": 0.05993107925885232,
"grad_norm": 0.96484375,
"learning_rate": 8.297905008339677e-06,
"loss": 0.4328,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 600,
"tokens_per_second_per_gpu": 18745.84,
"total_tokens": 27142719
},
{
"epoch": 0.06003096439095041,
"grad_norm": 0.96875,
"learning_rate": 8.263518223330698e-06,
"loss": 0.4855,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 601,
"tokens_per_second_per_gpu": 19117.69,
"total_tokens": 27189704
},
{
"epoch": 0.060130849523048494,
"grad_norm": 0.9765625,
"learning_rate": 8.22915259680417e-06,
"loss": 0.4138,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 602,
"tokens_per_second_per_gpu": 17316.95,
"total_tokens": 27233327
},
{
"epoch": 0.060230734655146584,
"grad_norm": 0.9921875,
"learning_rate": 8.194808547494401e-06,
"loss": 0.4416,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 603,
"tokens_per_second_per_gpu": 18794.71,
"total_tokens": 27278463
},
{
"epoch": 0.06033061978724467,
"grad_norm": 0.9765625,
"learning_rate": 8.1604864938728e-06,
"loss": 0.4301,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 604,
"tokens_per_second_per_gpu": 17557.98,
"total_tokens": 27321255
},
{
"epoch": 0.06043050491934276,
"grad_norm": 0.984375,
"learning_rate": 8.126186854142752e-06,
"loss": 0.4919,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 605,
"tokens_per_second_per_gpu": 19766.28,
"total_tokens": 27368920
},
{
"epoch": 0.06053039005144084,
"grad_norm": 0.9765625,
"learning_rate": 8.091910046234552e-06,
"loss": 0.4667,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 606,
"tokens_per_second_per_gpu": 18862.83,
"total_tokens": 27415583
},
{
"epoch": 0.06063027518353893,
"grad_norm": 0.96484375,
"learning_rate": 8.057656487800283e-06,
"loss": 0.4607,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 607,
"tokens_per_second_per_gpu": 18179.15,
"total_tokens": 27459995
},
{
"epoch": 0.060730160315637015,
"grad_norm": 0.94140625,
"learning_rate": 8.023426596208739e-06,
"loss": 0.444,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 608,
"tokens_per_second_per_gpu": 19382.04,
"total_tokens": 27506679
},
{
"epoch": 0.060830045447735105,
"grad_norm": 1.015625,
"learning_rate": 7.989220788540356e-06,
"loss": 0.4386,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 609,
"tokens_per_second_per_gpu": 17295.19,
"total_tokens": 27549153
},
{
"epoch": 0.06092993057983319,
"grad_norm": 0.9453125,
"learning_rate": 7.955039481582098e-06,
"loss": 0.4129,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 610,
"tokens_per_second_per_gpu": 16833.12,
"total_tokens": 27591645
},
{
"epoch": 0.06102981571193128,
"grad_norm": 0.96484375,
"learning_rate": 7.92088309182241e-06,
"loss": 0.45,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 611,
"tokens_per_second_per_gpu": 19610.45,
"total_tokens": 27638598
},
{
"epoch": 0.06112970084402937,
"grad_norm": 0.9921875,
"learning_rate": 7.886752035446116e-06,
"loss": 0.4748,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 612,
"tokens_per_second_per_gpu": 19465.45,
"total_tokens": 27685299
},
{
"epoch": 0.06122958597612745,
"grad_norm": 0.9609375,
"learning_rate": 7.852646728329368e-06,
"loss": 0.4291,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 613,
"tokens_per_second_per_gpu": 17706.05,
"total_tokens": 27729722
},
{
"epoch": 0.06132947110822554,
"grad_norm": 0.9609375,
"learning_rate": 7.818567586034578e-06,
"loss": 0.4414,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 614,
"tokens_per_second_per_gpu": 19520.29,
"total_tokens": 27776771
},
{
"epoch": 0.061429356240323627,
"grad_norm": 0.96484375,
"learning_rate": 7.784515023805328e-06,
"loss": 0.4513,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 615,
"tokens_per_second_per_gpu": 19684.59,
"total_tokens": 27824101
},
{
"epoch": 0.06152924137242172,
"grad_norm": 1.0390625,
"learning_rate": 7.750489456561351e-06,
"loss": 0.4747,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 616,
"tokens_per_second_per_gpu": 17967.76,
"total_tokens": 27868858
},
{
"epoch": 0.0616291265045198,
"grad_norm": 0.96875,
"learning_rate": 7.716491298893443e-06,
"loss": 0.434,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 617,
"tokens_per_second_per_gpu": 16676.99,
"total_tokens": 27910582
},
{
"epoch": 0.06172901163661789,
"grad_norm": 1.015625,
"learning_rate": 7.68252096505843e-06,
"loss": 0.4833,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 618,
"tokens_per_second_per_gpu": 17930.32,
"total_tokens": 27955552
},
{
"epoch": 0.061828896768715974,
"grad_norm": 0.9921875,
"learning_rate": 7.6485788689741e-06,
"loss": 0.4148,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 619,
"tokens_per_second_per_gpu": 17313.44,
"total_tokens": 27997912
},
{
"epoch": 0.061928781900814064,
"grad_norm": 1.0078125,
"learning_rate": 7.6146654242141935e-06,
"loss": 0.3938,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 620,
"tokens_per_second_per_gpu": 16905.12,
"total_tokens": 28039699
},
{
"epoch": 0.06202866703291215,
"grad_norm": 0.9921875,
"learning_rate": 7.580781044003324e-06,
"loss": 0.466,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 621,
"tokens_per_second_per_gpu": 19457.54,
"total_tokens": 28085977
},
{
"epoch": 0.06212855216501024,
"grad_norm": 1.0234375,
"learning_rate": 7.546926141211975e-06,
"loss": 0.466,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 622,
"tokens_per_second_per_gpu": 17933.92,
"total_tokens": 28130081
},
{
"epoch": 0.06222843729710833,
"grad_norm": 1.0234375,
"learning_rate": 7.513101128351454e-06,
"loss": 0.4754,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 623,
"tokens_per_second_per_gpu": 18196.03,
"total_tokens": 28175362
},
{
"epoch": 0.06232832242920641,
"grad_norm": 0.93359375,
"learning_rate": 7.4793064175688635e-06,
"loss": 0.5095,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 624,
"tokens_per_second_per_gpu": 20367.97,
"total_tokens": 28225060
},
{
"epoch": 0.0624282075613045,
"grad_norm": 0.99609375,
"learning_rate": 7.445542420642097e-06,
"loss": 0.4894,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 625,
"tokens_per_second_per_gpu": 19088.22,
"total_tokens": 28272613
},
{
"epoch": 0.06252809269340259,
"grad_norm": 0.9609375,
"learning_rate": 7.411809548974792e-06,
"loss": 0.4331,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 626,
"tokens_per_second_per_gpu": 17644.45,
"total_tokens": 28318026
},
{
"epoch": 0.06262797782550067,
"grad_norm": 0.9609375,
"learning_rate": 7.378108213591355e-06,
"loss": 0.441,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 627,
"tokens_per_second_per_gpu": 18800.17,
"total_tokens": 28363693
},
{
"epoch": 0.06272786295759876,
"grad_norm": 0.984375,
"learning_rate": 7.344438825131912e-06,
"loss": 0.4471,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 628,
"tokens_per_second_per_gpu": 18055.68,
"total_tokens": 28407964
},
{
"epoch": 0.06282774808969685,
"grad_norm": 1.015625,
"learning_rate": 7.310801793847344e-06,
"loss": 0.4504,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 629,
"tokens_per_second_per_gpu": 17227.74,
"total_tokens": 28451011
},
{
"epoch": 0.06292763322179494,
"grad_norm": 1.0390625,
"learning_rate": 7.277197529594257e-06,
"loss": 0.407,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 630,
"tokens_per_second_per_gpu": 16007.4,
"total_tokens": 28490050
},
{
"epoch": 0.06302751835389302,
"grad_norm": 1.0,
"learning_rate": 7.243626441830009e-06,
"loss": 0.4627,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 631,
"tokens_per_second_per_gpu": 18154.11,
"total_tokens": 28534404
},
{
"epoch": 0.06312740348599111,
"grad_norm": 0.953125,
"learning_rate": 7.210088939607709e-06,
"loss": 0.4687,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 632,
"tokens_per_second_per_gpu": 19973.58,
"total_tokens": 28583112
},
{
"epoch": 0.0632272886180892,
"grad_norm": 0.98828125,
"learning_rate": 7.176585431571235e-06,
"loss": 0.4917,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 633,
"tokens_per_second_per_gpu": 19270.85,
"total_tokens": 28630682
},
{
"epoch": 0.06332717375018729,
"grad_norm": 1.015625,
"learning_rate": 7.143116325950266e-06,
"loss": 0.4501,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 634,
"tokens_per_second_per_gpu": 17027.02,
"total_tokens": 28672763
},
{
"epoch": 0.06342705888228538,
"grad_norm": 0.9375,
"learning_rate": 7.109682030555283e-06,
"loss": 0.4657,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 635,
"tokens_per_second_per_gpu": 18996.37,
"total_tokens": 28718962
},
{
"epoch": 0.06352694401438345,
"grad_norm": 0.99609375,
"learning_rate": 7.076282952772634e-06,
"loss": 0.462,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 636,
"tokens_per_second_per_gpu": 19421.75,
"total_tokens": 28766632
},
{
"epoch": 0.06362682914648154,
"grad_norm": 0.98828125,
"learning_rate": 7.042919499559538e-06,
"loss": 0.468,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 637,
"tokens_per_second_per_gpu": 18377.34,
"total_tokens": 28811631
},
{
"epoch": 0.06372671427857964,
"grad_norm": 0.984375,
"learning_rate": 7.009592077439135e-06,
"loss": 0.4868,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 638,
"tokens_per_second_per_gpu": 19772.77,
"total_tokens": 28859019
},
{
"epoch": 0.06382659941067773,
"grad_norm": 1.0078125,
"learning_rate": 6.976301092495556e-06,
"loss": 0.4605,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 639,
"tokens_per_second_per_gpu": 19948.77,
"total_tokens": 28907903
},
{
"epoch": 0.0639264845427758,
"grad_norm": 0.96484375,
"learning_rate": 6.943046950368944e-06,
"loss": 0.4336,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 640,
"tokens_per_second_per_gpu": 18449.02,
"total_tokens": 28952689
},
{
"epoch": 0.06402636967487389,
"grad_norm": 1.0546875,
"learning_rate": 6.909830056250527e-06,
"loss": 0.3581,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 641,
"tokens_per_second_per_gpu": 13364.83,
"total_tokens": 28986620
},
{
"epoch": 0.06412625480697198,
"grad_norm": 1.0390625,
"learning_rate": 6.876650814877675e-06,
"loss": 0.4532,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 642,
"tokens_per_second_per_gpu": 18207.0,
"total_tokens": 29030761
},
{
"epoch": 0.06422613993907007,
"grad_norm": 0.99609375,
"learning_rate": 6.843509630528977e-06,
"loss": 0.4339,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 643,
"tokens_per_second_per_gpu": 19715.45,
"total_tokens": 29077383
},
{
"epoch": 0.06432602507116815,
"grad_norm": 0.9921875,
"learning_rate": 6.8104069070193e-06,
"loss": 0.4657,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 644,
"tokens_per_second_per_gpu": 17513.0,
"total_tokens": 29120675
},
{
"epoch": 0.06442591020326624,
"grad_norm": 1.015625,
"learning_rate": 6.777343047694891e-06,
"loss": 0.4571,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 645,
"tokens_per_second_per_gpu": 17560.1,
"total_tokens": 29164085
},
{
"epoch": 0.06452579533536433,
"grad_norm": 1.03125,
"learning_rate": 6.744318455428436e-06,
"loss": 0.4452,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 646,
"tokens_per_second_per_gpu": 16974.5,
"total_tokens": 29205333
},
{
"epoch": 0.06462568046746242,
"grad_norm": 0.953125,
"learning_rate": 6.711333532614168e-06,
"loss": 0.4409,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 647,
"tokens_per_second_per_gpu": 18172.28,
"total_tokens": 29250861
},
{
"epoch": 0.06472556559956051,
"grad_norm": 1.1484375,
"learning_rate": 6.67838868116297e-06,
"loss": 0.4604,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 648,
"tokens_per_second_per_gpu": 17922.54,
"total_tokens": 29294375
},
{
"epoch": 0.06482545073165859,
"grad_norm": 0.9453125,
"learning_rate": 6.645484302497452e-06,
"loss": 0.4123,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 649,
"tokens_per_second_per_gpu": 19412.3,
"total_tokens": 29342090
},
{
"epoch": 0.06492533586375668,
"grad_norm": 0.98828125,
"learning_rate": 6.612620797547087e-06,
"loss": 0.4664,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 650,
"tokens_per_second_per_gpu": 20085.46,
"total_tokens": 29390943
},
{
"epoch": 0.06502522099585477,
"grad_norm": 0.953125,
"learning_rate": 6.579798566743314e-06,
"loss": 0.4643,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 651,
"tokens_per_second_per_gpu": 19639.71,
"total_tokens": 29438908
},
{
"epoch": 0.06512510612795286,
"grad_norm": 0.96875,
"learning_rate": 6.547018010014654e-06,
"loss": 0.5143,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 652,
"tokens_per_second_per_gpu": 20348.98,
"total_tokens": 29487963
},
{
"epoch": 0.06522499126005094,
"grad_norm": 1.140625,
"learning_rate": 6.5142795267818505e-06,
"loss": 0.4578,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 653,
"tokens_per_second_per_gpu": 21329.4,
"total_tokens": 29539358
},
{
"epoch": 0.06532487639214903,
"grad_norm": 0.9765625,
"learning_rate": 6.481583515952983e-06,
"loss": 0.4758,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 654,
"tokens_per_second_per_gpu": 20649.82,
"total_tokens": 29589170
},
{
"epoch": 0.06542476152424712,
"grad_norm": 1.0234375,
"learning_rate": 6.448930375918632e-06,
"loss": 0.4418,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 655,
"tokens_per_second_per_gpu": 15975.75,
"total_tokens": 29628635
},
{
"epoch": 0.0655246466563452,
"grad_norm": 0.94140625,
"learning_rate": 6.4163205045469975e-06,
"loss": 0.4991,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 656,
"tokens_per_second_per_gpu": 20504.51,
"total_tokens": 29680577
},
{
"epoch": 0.0656245317884433,
"grad_norm": 1.03125,
"learning_rate": 6.383754299179079e-06,
"loss": 0.4592,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 657,
"tokens_per_second_per_gpu": 17970.07,
"total_tokens": 29725176
},
{
"epoch": 0.06572441692054137,
"grad_norm": 0.99609375,
"learning_rate": 6.351232156623803e-06,
"loss": 0.4616,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 658,
"tokens_per_second_per_gpu": 18193.26,
"total_tokens": 29769698
},
{
"epoch": 0.06582430205263946,
"grad_norm": 1.0078125,
"learning_rate": 6.318754473153221e-06,
"loss": 0.4671,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 659,
"tokens_per_second_per_gpu": 18415.02,
"total_tokens": 29814583
},
{
"epoch": 0.06592418718473755,
"grad_norm": 0.9765625,
"learning_rate": 6.286321644497655e-06,
"loss": 0.4223,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 660,
"tokens_per_second_per_gpu": 17227.34,
"total_tokens": 29857378
},
{
"epoch": 0.06602407231683564,
"grad_norm": 0.92578125,
"learning_rate": 6.25393406584088e-06,
"loss": 0.392,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 661,
"tokens_per_second_per_gpu": 18097.88,
"total_tokens": 29901353
},
{
"epoch": 0.06612395744893372,
"grad_norm": 1.0703125,
"learning_rate": 6.22159213181533e-06,
"loss": 0.4632,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 662,
"tokens_per_second_per_gpu": 18450.85,
"total_tokens": 29946306
},
{
"epoch": 0.06622384258103181,
"grad_norm": 0.96875,
"learning_rate": 6.18929623649726e-06,
"loss": 0.4411,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 663,
"tokens_per_second_per_gpu": 17681.94,
"total_tokens": 29989951
},
{
"epoch": 0.0663237277131299,
"grad_norm": 0.9375,
"learning_rate": 6.157046773401964e-06,
"loss": 0.4539,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 664,
"tokens_per_second_per_gpu": 19703.0,
"total_tokens": 30038267
},
{
"epoch": 0.06642361284522799,
"grad_norm": 0.98046875,
"learning_rate": 6.124844135478971e-06,
"loss": 0.4563,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 665,
"tokens_per_second_per_gpu": 17008.48,
"total_tokens": 30081026
},
{
"epoch": 0.06652349797732607,
"grad_norm": 0.96484375,
"learning_rate": 6.092688715107265e-06,
"loss": 0.4441,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 666,
"tokens_per_second_per_gpu": 19151.18,
"total_tokens": 30127625
},
{
"epoch": 0.06662338310942416,
"grad_norm": 0.96484375,
"learning_rate": 6.06058090409049e-06,
"loss": 0.412,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 667,
"tokens_per_second_per_gpu": 16757.9,
"total_tokens": 30168887
},
{
"epoch": 0.06672326824152225,
"grad_norm": 0.9375,
"learning_rate": 6.028521093652195e-06,
"loss": 0.4656,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 668,
"tokens_per_second_per_gpu": 20171.11,
"total_tokens": 30218601
},
{
"epoch": 0.06682315337362034,
"grad_norm": 0.984375,
"learning_rate": 5.996509674431053e-06,
"loss": 0.4596,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 669,
"tokens_per_second_per_gpu": 18319.13,
"total_tokens": 30264663
},
{
"epoch": 0.06692303850571843,
"grad_norm": 0.98046875,
"learning_rate": 5.9645470364761e-06,
"loss": 0.4339,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 670,
"tokens_per_second_per_gpu": 17872.78,
"total_tokens": 30308631
},
{
"epoch": 0.0670229236378165,
"grad_norm": 0.95703125,
"learning_rate": 5.932633569242e-06,
"loss": 0.4562,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 671,
"tokens_per_second_per_gpu": 19672.95,
"total_tokens": 30355498
},
{
"epoch": 0.0671228087699146,
"grad_norm": 0.95703125,
"learning_rate": 5.900769661584273e-06,
"loss": 0.4612,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 672,
"tokens_per_second_per_gpu": 19593.86,
"total_tokens": 30403625
},
{
"epoch": 0.06722269390201269,
"grad_norm": 0.96484375,
"learning_rate": 5.868955701754584e-06,
"loss": 0.485,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 673,
"tokens_per_second_per_gpu": 21161.74,
"total_tokens": 30455157
},
{
"epoch": 0.06732257903411078,
"grad_norm": 0.984375,
"learning_rate": 5.83719207739599e-06,
"loss": 0.4095,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 674,
"tokens_per_second_per_gpu": 17658.46,
"total_tokens": 30498641
},
{
"epoch": 0.06742246416620885,
"grad_norm": 0.984375,
"learning_rate": 5.8054791755382286e-06,
"loss": 0.4319,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 675,
"tokens_per_second_per_gpu": 17117.92,
"total_tokens": 30540987
},
{
"epoch": 0.06752234929830694,
"grad_norm": 1.046875,
"learning_rate": 5.773817382593008e-06,
"loss": 0.4071,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 676,
"tokens_per_second_per_gpu": 14920.45,
"total_tokens": 30578470
},
{
"epoch": 0.06762223443040503,
"grad_norm": 1.265625,
"learning_rate": 5.742207084349274e-06,
"loss": 0.4675,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 677,
"tokens_per_second_per_gpu": 18108.81,
"total_tokens": 30623728
},
{
"epoch": 0.06772211956250312,
"grad_norm": 1.0078125,
"learning_rate": 5.710648665968543e-06,
"loss": 0.4302,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 678,
"tokens_per_second_per_gpu": 16796.52,
"total_tokens": 30665187
},
{
"epoch": 0.06782200469460122,
"grad_norm": 0.921875,
"learning_rate": 5.679142511980176e-06,
"loss": 0.4784,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 679,
"tokens_per_second_per_gpu": 20095.37,
"total_tokens": 30715056
},
{
"epoch": 0.06792188982669929,
"grad_norm": 1.0078125,
"learning_rate": 5.647689006276727e-06,
"loss": 0.4255,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 680,
"tokens_per_second_per_gpu": 16664.02,
"total_tokens": 30755522
},
{
"epoch": 0.06802177495879738,
"grad_norm": 1.0625,
"learning_rate": 5.616288532109225e-06,
"loss": 0.4976,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 681,
"tokens_per_second_per_gpu": 18576.48,
"total_tokens": 30801413
},
{
"epoch": 0.06812166009089547,
"grad_norm": 0.9765625,
"learning_rate": 5.584941472082549e-06,
"loss": 0.4152,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 682,
"tokens_per_second_per_gpu": 17569.33,
"total_tokens": 30844380
},
{
"epoch": 0.06822154522299356,
"grad_norm": 0.9765625,
"learning_rate": 5.553648208150728e-06,
"loss": 0.4134,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 683,
"tokens_per_second_per_gpu": 17369.95,
"total_tokens": 30887278
},
{
"epoch": 0.06832143035509164,
"grad_norm": 1.0078125,
"learning_rate": 5.522409121612304e-06,
"loss": 0.4959,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 684,
"tokens_per_second_per_gpu": 20798.58,
"total_tokens": 30937699
},
{
"epoch": 0.06842131548718973,
"grad_norm": 0.98046875,
"learning_rate": 5.491224593105695e-06,
"loss": 0.5075,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 685,
"tokens_per_second_per_gpu": 19341.32,
"total_tokens": 30985491
},
{
"epoch": 0.06852120061928782,
"grad_norm": 0.9375,
"learning_rate": 5.460095002604533e-06,
"loss": 0.4622,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 686,
"tokens_per_second_per_gpu": 20051.25,
"total_tokens": 31034575
},
{
"epoch": 0.06862108575138591,
"grad_norm": 0.9375,
"learning_rate": 5.429020729413062e-06,
"loss": 0.4696,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 687,
"tokens_per_second_per_gpu": 19922.24,
"total_tokens": 31084550
},
{
"epoch": 0.06872097088348399,
"grad_norm": 0.90625,
"learning_rate": 5.398002152161484e-06,
"loss": 0.4259,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 688,
"tokens_per_second_per_gpu": 19415.35,
"total_tokens": 31132179
},
{
"epoch": 0.06882085601558208,
"grad_norm": 1.015625,
"learning_rate": 5.367039648801386e-06,
"loss": 0.4633,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 689,
"tokens_per_second_per_gpu": 16543.47,
"total_tokens": 31173186
},
{
"epoch": 0.06892074114768017,
"grad_norm": 0.98046875,
"learning_rate": 5.336133596601089e-06,
"loss": 0.4457,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 690,
"tokens_per_second_per_gpu": 17956.14,
"total_tokens": 31217511
},
{
"epoch": 0.06902062627977826,
"grad_norm": 0.9765625,
"learning_rate": 5.305284372141095e-06,
"loss": 0.4806,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 691,
"tokens_per_second_per_gpu": 20201.6,
"total_tokens": 31266754
},
{
"epoch": 0.06912051141187635,
"grad_norm": 1.1796875,
"learning_rate": 5.274492351309462e-06,
"loss": 0.4487,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 692,
"tokens_per_second_per_gpu": 17138.32,
"total_tokens": 31310363
},
{
"epoch": 0.06922039654397442,
"grad_norm": 0.9375,
"learning_rate": 5.243757909297247e-06,
"loss": 0.409,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 693,
"tokens_per_second_per_gpu": 19496.06,
"total_tokens": 31357301
},
{
"epoch": 0.06932028167607251,
"grad_norm": 0.9140625,
"learning_rate": 5.213081420593933e-06,
"loss": 0.4808,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 694,
"tokens_per_second_per_gpu": 21254.7,
"total_tokens": 31408105
},
{
"epoch": 0.0694201668081706,
"grad_norm": 1.015625,
"learning_rate": 5.1824632589828465e-06,
"loss": 0.4155,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 695,
"tokens_per_second_per_gpu": 15527.07,
"total_tokens": 31447034
},
{
"epoch": 0.0695200519402687,
"grad_norm": 1.0,
"learning_rate": 5.151903797536631e-06,
"loss": 0.4171,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 696,
"tokens_per_second_per_gpu": 16471.4,
"total_tokens": 31486678
},
{
"epoch": 0.06961993707236677,
"grad_norm": 0.9921875,
"learning_rate": 5.121403408612672e-06,
"loss": 0.4642,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 697,
"tokens_per_second_per_gpu": 19309.6,
"total_tokens": 31533685
},
{
"epoch": 0.06971982220446486,
"grad_norm": 1.0234375,
"learning_rate": 5.090962463848592e-06,
"loss": 0.4628,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 698,
"tokens_per_second_per_gpu": 18263.08,
"total_tokens": 31578085
},
{
"epoch": 0.06981970733656295,
"grad_norm": 0.96484375,
"learning_rate": 5.060581334157693e-06,
"loss": 0.4488,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 699,
"tokens_per_second_per_gpu": 19174.1,
"total_tokens": 31624026
},
{
"epoch": 0.06991959246866104,
"grad_norm": 1.015625,
"learning_rate": 5.030260389724447e-06,
"loss": 0.4883,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 700,
"tokens_per_second_per_gpu": 20237.01,
"total_tokens": 31672429
},
{
"epoch": 0.07001947760075913,
"grad_norm": 0.96484375,
"learning_rate": 5.000000000000003e-06,
"loss": 0.4821,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 701,
"tokens_per_second_per_gpu": 19608.0,
"total_tokens": 31720678
},
{
"epoch": 0.07011936273285721,
"grad_norm": 0.99609375,
"learning_rate": 4.96980053369765e-06,
"loss": 0.4297,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 702,
"tokens_per_second_per_gpu": 16657.01,
"total_tokens": 31762205
},
{
"epoch": 0.0702192478649553,
"grad_norm": 0.9375,
"learning_rate": 4.939662358788364e-06,
"loss": 0.3997,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 703,
"tokens_per_second_per_gpu": 17215.9,
"total_tokens": 31805553
},
{
"epoch": 0.07031913299705339,
"grad_norm": 0.9375,
"learning_rate": 4.909585842496287e-06,
"loss": 0.417,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 704,
"tokens_per_second_per_gpu": 18258.52,
"total_tokens": 31849565
},
{
"epoch": 0.07041901812915148,
"grad_norm": 1.0546875,
"learning_rate": 4.879571351294287e-06,
"loss": 0.4359,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 705,
"tokens_per_second_per_gpu": 16860.81,
"total_tokens": 31890442
},
{
"epoch": 0.07051890326124956,
"grad_norm": 1.0,
"learning_rate": 4.849619250899458e-06,
"loss": 0.4084,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 706,
"tokens_per_second_per_gpu": 16599.91,
"total_tokens": 31931257
},
{
"epoch": 0.07061878839334765,
"grad_norm": 0.921875,
"learning_rate": 4.8197299062687e-06,
"loss": 0.4197,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 707,
"tokens_per_second_per_gpu": 18363.15,
"total_tokens": 31976058
},
{
"epoch": 0.07071867352544574,
"grad_norm": 1.2421875,
"learning_rate": 4.78990368159424e-06,
"loss": 0.432,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 708,
"tokens_per_second_per_gpu": 18281.44,
"total_tokens": 32020676
},
{
"epoch": 0.07081855865754383,
"grad_norm": 1.15625,
"learning_rate": 4.76014094029921e-06,
"loss": 0.4821,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 709,
"tokens_per_second_per_gpu": 17574.88,
"total_tokens": 32064101
},
{
"epoch": 0.0709184437896419,
"grad_norm": 1.0390625,
"learning_rate": 4.7304420450332244e-06,
"loss": 0.4496,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 710,
"tokens_per_second_per_gpu": 17647.12,
"total_tokens": 32107600
},
{
"epoch": 0.07101832892174,
"grad_norm": 0.953125,
"learning_rate": 4.700807357667953e-06,
"loss": 0.507,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 711,
"tokens_per_second_per_gpu": 21211.22,
"total_tokens": 32158910
},
{
"epoch": 0.07111821405383809,
"grad_norm": 0.96484375,
"learning_rate": 4.671237239292699e-06,
"loss": 0.3837,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 712,
"tokens_per_second_per_gpu": 17423.28,
"total_tokens": 32202315
},
{
"epoch": 0.07121809918593618,
"grad_norm": 0.98046875,
"learning_rate": 4.641732050210032e-06,
"loss": 0.4286,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 713,
"tokens_per_second_per_gpu": 17394.81,
"total_tokens": 32245312
},
{
"epoch": 0.07131798431803427,
"grad_norm": 0.9453125,
"learning_rate": 4.612292149931369e-06,
"loss": 0.3988,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 714,
"tokens_per_second_per_gpu": 16734.28,
"total_tokens": 32286805
},
{
"epoch": 0.07141786945013234,
"grad_norm": 0.93359375,
"learning_rate": 4.582917897172603e-06,
"loss": 0.4663,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 715,
"tokens_per_second_per_gpu": 19969.21,
"total_tokens": 32336588
},
{
"epoch": 0.07151775458223043,
"grad_norm": 1.0078125,
"learning_rate": 4.5536096498497295e-06,
"loss": 0.462,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 716,
"tokens_per_second_per_gpu": 17908.97,
"total_tokens": 32379697
},
{
"epoch": 0.07161763971432852,
"grad_norm": 0.9140625,
"learning_rate": 4.524367765074499e-06,
"loss": 0.4429,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 717,
"tokens_per_second_per_gpu": 20401.38,
"total_tokens": 32430056
},
{
"epoch": 0.07171752484642661,
"grad_norm": 1.0625,
"learning_rate": 4.495192599150045e-06,
"loss": 0.4642,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 718,
"tokens_per_second_per_gpu": 16419.87,
"total_tokens": 32470376
},
{
"epoch": 0.07181740997852469,
"grad_norm": 0.94921875,
"learning_rate": 4.46608450756656e-06,
"loss": 0.4466,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 719,
"tokens_per_second_per_gpu": 19404.36,
"total_tokens": 32518031
},
{
"epoch": 0.07191729511062278,
"grad_norm": 0.99609375,
"learning_rate": 4.437043844996952e-06,
"loss": 0.4127,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 720,
"tokens_per_second_per_gpu": 15533.69,
"total_tokens": 32557117
},
{
"epoch": 0.07201718024272087,
"grad_norm": 1.0234375,
"learning_rate": 4.408070965292534e-06,
"loss": 0.4562,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 721,
"tokens_per_second_per_gpu": 17043.35,
"total_tokens": 32598621
},
{
"epoch": 0.07211706537481896,
"grad_norm": 0.984375,
"learning_rate": 4.379166221478697e-06,
"loss": 0.4657,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 722,
"tokens_per_second_per_gpu": 18217.55,
"total_tokens": 32642986
},
{
"epoch": 0.07221695050691705,
"grad_norm": 0.98046875,
"learning_rate": 4.350329965750622e-06,
"loss": 0.4085,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 723,
"tokens_per_second_per_gpu": 17567.62,
"total_tokens": 32685773
},
{
"epoch": 0.07231683563901513,
"grad_norm": 0.98046875,
"learning_rate": 4.321562549468991e-06,
"loss": 0.4135,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 724,
"tokens_per_second_per_gpu": 16458.0,
"total_tokens": 32726348
},
{
"epoch": 0.07241672077111322,
"grad_norm": 1.0234375,
"learning_rate": 4.292864323155684e-06,
"loss": 0.4623,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 725,
"tokens_per_second_per_gpu": 18086.79,
"total_tokens": 32771186
},
{
"epoch": 0.07251660590321131,
"grad_norm": 0.96484375,
"learning_rate": 4.264235636489542e-06,
"loss": 0.5004,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 726,
"tokens_per_second_per_gpu": 20477.83,
"total_tokens": 32821377
},
{
"epoch": 0.0726164910353094,
"grad_norm": 0.96484375,
"learning_rate": 4.235676838302069e-06,
"loss": 0.4673,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 727,
"tokens_per_second_per_gpu": 19149.83,
"total_tokens": 32868608
},
{
"epoch": 0.07271637616740748,
"grad_norm": 1.015625,
"learning_rate": 4.207188276573214e-06,
"loss": 0.441,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 728,
"tokens_per_second_per_gpu": 17952.4,
"total_tokens": 32912104
},
{
"epoch": 0.07281626129950557,
"grad_norm": 1.0078125,
"learning_rate": 4.178770298427107e-06,
"loss": 0.4795,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 729,
"tokens_per_second_per_gpu": 18644.2,
"total_tokens": 32958514
},
{
"epoch": 0.07291614643160366,
"grad_norm": 1.0234375,
"learning_rate": 4.150423250127846e-06,
"loss": 0.4647,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 730,
"tokens_per_second_per_gpu": 17645.44,
"total_tokens": 33002672
},
{
"epoch": 0.07301603156370175,
"grad_norm": 1.0078125,
"learning_rate": 4.12214747707527e-06,
"loss": 0.4294,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 731,
"tokens_per_second_per_gpu": 17261.14,
"total_tokens": 33045478
},
{
"epoch": 0.07311591669579982,
"grad_norm": 1.0703125,
"learning_rate": 4.093943323800746e-06,
"loss": 0.4318,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 732,
"tokens_per_second_per_gpu": 18671.66,
"total_tokens": 33092420
},
{
"epoch": 0.07321580182789791,
"grad_norm": 0.96484375,
"learning_rate": 4.065811133962987e-06,
"loss": 0.4281,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 733,
"tokens_per_second_per_gpu": 18318.77,
"total_tokens": 33137357
},
{
"epoch": 0.073315686959996,
"grad_norm": 0.98046875,
"learning_rate": 4.037751250343841e-06,
"loss": 0.3983,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 734,
"tokens_per_second_per_gpu": 16528.32,
"total_tokens": 33178233
},
{
"epoch": 0.0734155720920941,
"grad_norm": 1.6015625,
"learning_rate": 4.009764014844143e-06,
"loss": 0.4226,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 735,
"tokens_per_second_per_gpu": 18004.58,
"total_tokens": 33222630
},
{
"epoch": 0.07351545722419218,
"grad_norm": 0.96484375,
"learning_rate": 3.981849768479516e-06,
"loss": 0.4552,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 736,
"tokens_per_second_per_gpu": 18603.91,
"total_tokens": 33268228
},
{
"epoch": 0.07361534235629026,
"grad_norm": 0.98046875,
"learning_rate": 3.954008851376252e-06,
"loss": 0.4387,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 737,
"tokens_per_second_per_gpu": 19583.86,
"total_tokens": 33315124
},
{
"epoch": 0.07371522748838835,
"grad_norm": 0.99609375,
"learning_rate": 3.9262416027671354e-06,
"loss": 0.4538,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 738,
"tokens_per_second_per_gpu": 16932.53,
"total_tokens": 33357686
},
{
"epoch": 0.07381511262048644,
"grad_norm": 0.98046875,
"learning_rate": 3.898548360987325e-06,
"loss": 0.4558,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 739,
"tokens_per_second_per_gpu": 19851.47,
"total_tokens": 33405913
},
{
"epoch": 0.07391499775258453,
"grad_norm": 0.9453125,
"learning_rate": 3.8709294634702374e-06,
"loss": 0.4522,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 740,
"tokens_per_second_per_gpu": 19943.32,
"total_tokens": 33452722
},
{
"epoch": 0.07401488288468261,
"grad_norm": 1.0078125,
"learning_rate": 3.8433852467434175e-06,
"loss": 0.4498,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 741,
"tokens_per_second_per_gpu": 17826.87,
"total_tokens": 33496275
},
{
"epoch": 0.0741147680167807,
"grad_norm": 1.0,
"learning_rate": 3.81591604642446e-06,
"loss": 0.4784,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 742,
"tokens_per_second_per_gpu": 19558.59,
"total_tokens": 33543831
},
{
"epoch": 0.07421465314887879,
"grad_norm": 1.0078125,
"learning_rate": 3.7885221972168974e-06,
"loss": 0.4276,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 743,
"tokens_per_second_per_gpu": 16763.51,
"total_tokens": 33584229
},
{
"epoch": 0.07431453828097688,
"grad_norm": 0.95703125,
"learning_rate": 3.7612040329061405e-06,
"loss": 0.4796,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 744,
"tokens_per_second_per_gpu": 19724.55,
"total_tokens": 33633472
},
{
"epoch": 0.07441442341307497,
"grad_norm": 0.953125,
"learning_rate": 3.7339618863553983e-06,
"loss": 0.4666,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 745,
"tokens_per_second_per_gpu": 19000.4,
"total_tokens": 33680629
},
{
"epoch": 0.07451430854517305,
"grad_norm": 0.9453125,
"learning_rate": 3.7067960895016277e-06,
"loss": 0.4235,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 746,
"tokens_per_second_per_gpu": 19174.22,
"total_tokens": 33726158
},
{
"epoch": 0.07461419367727114,
"grad_norm": 0.90625,
"learning_rate": 3.679706973351491e-06,
"loss": 0.4475,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 747,
"tokens_per_second_per_gpu": 19844.29,
"total_tokens": 33775074
},
{
"epoch": 0.07471407880936923,
"grad_norm": 0.96484375,
"learning_rate": 3.6526948679773256e-06,
"loss": 0.4764,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 748,
"tokens_per_second_per_gpu": 18010.15,
"total_tokens": 33819103
},
{
"epoch": 0.07481396394146732,
"grad_norm": 0.9453125,
"learning_rate": 3.625760102513103e-06,
"loss": 0.4354,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 749,
"tokens_per_second_per_gpu": 18519.92,
"total_tokens": 33864876
},
{
"epoch": 0.0749138490735654,
"grad_norm": 0.984375,
"learning_rate": 3.598903005150444e-06,
"loss": 0.4649,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 750,
"tokens_per_second_per_gpu": 20241.23,
"total_tokens": 33913338
},
{
"epoch": 0.07501373420566348,
"grad_norm": 0.921875,
"learning_rate": 3.5721239031346067e-06,
"loss": 0.4566,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 751,
"tokens_per_second_per_gpu": 19538.75,
"total_tokens": 33960877
},
{
"epoch": 0.07511361933776158,
"grad_norm": 0.96875,
"learning_rate": 3.545423122760493e-06,
"loss": 0.4134,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 752,
"tokens_per_second_per_gpu": 18120.67,
"total_tokens": 34005425
},
{
"epoch": 0.07521350446985967,
"grad_norm": 0.96484375,
"learning_rate": 3.5188009893686916e-06,
"loss": 0.4733,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 753,
"tokens_per_second_per_gpu": 20333.77,
"total_tokens": 34055315
},
{
"epoch": 0.07531338960195774,
"grad_norm": 0.96875,
"learning_rate": 3.492257827341492e-06,
"loss": 0.4752,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 754,
"tokens_per_second_per_gpu": 18882.58,
"total_tokens": 34101132
},
{
"epoch": 0.07541327473405583,
"grad_norm": 1.0859375,
"learning_rate": 3.4657939600989453e-06,
"loss": 0.4387,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 755,
"tokens_per_second_per_gpu": 16251.93,
"total_tokens": 34141329
},
{
"epoch": 0.07551315986615392,
"grad_norm": 0.984375,
"learning_rate": 3.4394097100949286e-06,
"loss": 0.4814,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 756,
"tokens_per_second_per_gpu": 19121.99,
"total_tokens": 34187810
},
{
"epoch": 0.07561304499825201,
"grad_norm": 1.0546875,
"learning_rate": 3.4131053988131947e-06,
"loss": 0.4285,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 757,
"tokens_per_second_per_gpu": 15534.14,
"total_tokens": 34226967
},
{
"epoch": 0.0757129301303501,
"grad_norm": 0.9296875,
"learning_rate": 3.3868813467634833e-06,
"loss": 0.5004,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 758,
"tokens_per_second_per_gpu": 20540.89,
"total_tokens": 34278318
},
{
"epoch": 0.07581281526244818,
"grad_norm": 0.92578125,
"learning_rate": 3.360737873477584e-06,
"loss": 0.478,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 759,
"tokens_per_second_per_gpu": 18996.35,
"total_tokens": 34326382
},
{
"epoch": 0.07591270039454627,
"grad_norm": 0.92578125,
"learning_rate": 3.3346752975054763e-06,
"loss": 0.4768,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 760,
"tokens_per_second_per_gpu": 20186.09,
"total_tokens": 34376644
},
{
"epoch": 0.07601258552664436,
"grad_norm": 1.0234375,
"learning_rate": 3.308693936411421e-06,
"loss": 0.4029,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 761,
"tokens_per_second_per_gpu": 16115.93,
"total_tokens": 34416708
},
{
"epoch": 0.07611247065874245,
"grad_norm": 1.0,
"learning_rate": 3.2827941067700996e-06,
"loss": 0.4444,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 762,
"tokens_per_second_per_gpu": 16823.28,
"total_tokens": 34458692
},
{
"epoch": 0.07621235579084053,
"grad_norm": 0.9921875,
"learning_rate": 3.2569761241627694e-06,
"loss": 0.468,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 763,
"tokens_per_second_per_gpu": 19727.4,
"total_tokens": 34506469
},
{
"epoch": 0.07631224092293862,
"grad_norm": 1.015625,
"learning_rate": 3.2312403031733943e-06,
"loss": 0.4906,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 764,
"tokens_per_second_per_gpu": 19245.83,
"total_tokens": 34553520
},
{
"epoch": 0.07641212605503671,
"grad_norm": 1.03125,
"learning_rate": 3.2055869573848374e-06,
"loss": 0.4225,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 765,
"tokens_per_second_per_gpu": 17663.42,
"total_tokens": 34595781
},
{
"epoch": 0.0765120111871348,
"grad_norm": 0.97265625,
"learning_rate": 3.1800163993750166e-06,
"loss": 0.469,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 766,
"tokens_per_second_per_gpu": 18474.22,
"total_tokens": 34640570
},
{
"epoch": 0.07661189631923289,
"grad_norm": 0.92578125,
"learning_rate": 3.1545289407131128e-06,
"loss": 0.4653,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 767,
"tokens_per_second_per_gpu": 19435.84,
"total_tokens": 34687529
},
{
"epoch": 0.07671178145133097,
"grad_norm": 1.2421875,
"learning_rate": 3.1291248919557717e-06,
"loss": 0.4393,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 768,
"tokens_per_second_per_gpu": 18703.97,
"total_tokens": 34734118
},
{
"epoch": 0.07681166658342906,
"grad_norm": 0.95703125,
"learning_rate": 3.103804562643302e-06,
"loss": 0.4458,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 769,
"tokens_per_second_per_gpu": 19402.21,
"total_tokens": 34781930
},
{
"epoch": 0.07691155171552715,
"grad_norm": 0.98828125,
"learning_rate": 3.0785682612959334e-06,
"loss": 0.4941,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 770,
"tokens_per_second_per_gpu": 19548.47,
"total_tokens": 34830220
},
{
"epoch": 0.07701143684762524,
"grad_norm": 1.015625,
"learning_rate": 3.0534162954100264e-06,
"loss": 0.4643,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 771,
"tokens_per_second_per_gpu": 17562.18,
"total_tokens": 34874221
},
{
"epoch": 0.07711132197972331,
"grad_norm": 0.93359375,
"learning_rate": 3.028348971454356e-06,
"loss": 0.4263,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 772,
"tokens_per_second_per_gpu": 18395.88,
"total_tokens": 34919409
},
{
"epoch": 0.0772112071118214,
"grad_norm": 0.93359375,
"learning_rate": 3.003366594866345e-06,
"loss": 0.4913,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 773,
"tokens_per_second_per_gpu": 20334.84,
"total_tokens": 34968740
},
{
"epoch": 0.0773110922439195,
"grad_norm": 0.92578125,
"learning_rate": 2.978469470048376e-06,
"loss": 0.4351,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 774,
"tokens_per_second_per_gpu": 19515.04,
"total_tokens": 35015775
},
{
"epoch": 0.07741097737601758,
"grad_norm": 0.953125,
"learning_rate": 2.953657900364053e-06,
"loss": 0.4445,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 775,
"tokens_per_second_per_gpu": 18606.56,
"total_tokens": 35061754
},
{
"epoch": 0.07751086250811566,
"grad_norm": 0.9453125,
"learning_rate": 2.9289321881345257e-06,
"loss": 0.4226,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 776,
"tokens_per_second_per_gpu": 17852.59,
"total_tokens": 35106349
},
{
"epoch": 0.07761074764021375,
"grad_norm": 1.0078125,
"learning_rate": 2.9042926346347932e-06,
"loss": 0.4751,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 777,
"tokens_per_second_per_gpu": 20021.82,
"total_tokens": 35155865
},
{
"epoch": 0.07771063277231184,
"grad_norm": 0.95703125,
"learning_rate": 2.8797395400900362e-06,
"loss": 0.4197,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 778,
"tokens_per_second_per_gpu": 18516.27,
"total_tokens": 35200487
},
{
"epoch": 0.07781051790440993,
"grad_norm": 0.97265625,
"learning_rate": 2.855273203671969e-06,
"loss": 0.472,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 779,
"tokens_per_second_per_gpu": 20415.64,
"total_tokens": 35249366
},
{
"epoch": 0.07791040303650802,
"grad_norm": 0.96484375,
"learning_rate": 2.830893923495173e-06,
"loss": 0.444,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 780,
"tokens_per_second_per_gpu": 18200.26,
"total_tokens": 35294028
},
{
"epoch": 0.0780102881686061,
"grad_norm": 0.9765625,
"learning_rate": 2.8066019966134907e-06,
"loss": 0.4302,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 781,
"tokens_per_second_per_gpu": 17184.21,
"total_tokens": 35336675
},
{
"epoch": 0.07811017330070419,
"grad_norm": 0.9375,
"learning_rate": 2.7823977190163788e-06,
"loss": 0.4132,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 782,
"tokens_per_second_per_gpu": 18695.96,
"total_tokens": 35381758
},
{
"epoch": 0.07821005843280228,
"grad_norm": 0.94921875,
"learning_rate": 2.7582813856253276e-06,
"loss": 0.4294,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 783,
"tokens_per_second_per_gpu": 18864.72,
"total_tokens": 35427540
},
{
"epoch": 0.07830994356490037,
"grad_norm": 0.94140625,
"learning_rate": 2.7342532902902418e-06,
"loss": 0.443,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 784,
"tokens_per_second_per_gpu": 18334.86,
"total_tokens": 35473265
},
{
"epoch": 0.07840982869699845,
"grad_norm": 0.94140625,
"learning_rate": 2.7103137257858867e-06,
"loss": 0.4663,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 785,
"tokens_per_second_per_gpu": 20488.92,
"total_tokens": 35522599
},
{
"epoch": 0.07850971382909654,
"grad_norm": 0.95703125,
"learning_rate": 2.6864629838082957e-06,
"loss": 0.486,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 786,
"tokens_per_second_per_gpu": 19238.21,
"total_tokens": 35569885
},
{
"epoch": 0.07860959896119463,
"grad_norm": 0.94140625,
"learning_rate": 2.6627013549712355e-06,
"loss": 0.4398,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 787,
"tokens_per_second_per_gpu": 20147.3,
"total_tokens": 35618490
},
{
"epoch": 0.07870948409329272,
"grad_norm": 0.9609375,
"learning_rate": 2.639029128802657e-06,
"loss": 0.4521,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 788,
"tokens_per_second_per_gpu": 19362.36,
"total_tokens": 35664805
},
{
"epoch": 0.07880936922539081,
"grad_norm": 0.98828125,
"learning_rate": 2.615446593741161e-06,
"loss": 0.4338,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 789,
"tokens_per_second_per_gpu": 17103.85,
"total_tokens": 35707639
},
{
"epoch": 0.07890925435748888,
"grad_norm": 1.0234375,
"learning_rate": 2.5919540371325005e-06,
"loss": 0.4768,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 790,
"tokens_per_second_per_gpu": 16934.67,
"total_tokens": 35749944
},
{
"epoch": 0.07900913948958697,
"grad_norm": 0.9609375,
"learning_rate": 2.5685517452260566e-06,
"loss": 0.486,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 791,
"tokens_per_second_per_gpu": 19332.58,
"total_tokens": 35797939
},
{
"epoch": 0.07910902462168506,
"grad_norm": 1.0234375,
"learning_rate": 2.5452400031713786e-06,
"loss": 0.4516,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 792,
"tokens_per_second_per_gpu": 16832.4,
"total_tokens": 35839936
},
{
"epoch": 0.07920890975378315,
"grad_norm": 1.015625,
"learning_rate": 2.522019095014683e-06,
"loss": 0.481,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 793,
"tokens_per_second_per_gpu": 20197.47,
"total_tokens": 35888927
},
{
"epoch": 0.07930879488588123,
"grad_norm": 0.96484375,
"learning_rate": 2.4988893036954045e-06,
"loss": 0.4085,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 794,
"tokens_per_second_per_gpu": 16791.69,
"total_tokens": 35930456
},
{
"epoch": 0.07940868001797932,
"grad_norm": 0.94921875,
"learning_rate": 2.4758509110427576e-06,
"loss": 0.4307,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 795,
"tokens_per_second_per_gpu": 18229.64,
"total_tokens": 35975399
},
{
"epoch": 0.07950856515007741,
"grad_norm": 0.9296875,
"learning_rate": 2.45290419777228e-06,
"loss": 0.4409,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 796,
"tokens_per_second_per_gpu": 19402.09,
"total_tokens": 36022487
},
{
"epoch": 0.0796084502821755,
"grad_norm": 0.9921875,
"learning_rate": 2.4300494434824373e-06,
"loss": 0.4787,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 797,
"tokens_per_second_per_gpu": 18897.54,
"total_tokens": 36067845
},
{
"epoch": 0.07970833541427358,
"grad_norm": 0.9609375,
"learning_rate": 2.407286926651192e-06,
"loss": 0.4174,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 798,
"tokens_per_second_per_gpu": 17302.79,
"total_tokens": 36110406
},
{
"epoch": 0.07980822054637167,
"grad_norm": 0.98046875,
"learning_rate": 2.3846169246326345e-06,
"loss": 0.4533,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 799,
"tokens_per_second_per_gpu": 18365.7,
"total_tokens": 36154887
},
{
"epoch": 0.07990810567846976,
"grad_norm": 0.9765625,
"learning_rate": 2.362039713653581e-06,
"loss": 0.4402,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 800,
"tokens_per_second_per_gpu": 17734.24,
"total_tokens": 36199208
},
{
"epoch": 0.08000799081056785,
"grad_norm": 1.0234375,
"learning_rate": 2.339555568810221e-06,
"loss": 0.448,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 801,
"tokens_per_second_per_gpu": 16035.94,
"total_tokens": 36238446
},
{
"epoch": 0.08010787594266594,
"grad_norm": 0.97265625,
"learning_rate": 2.317164764064769e-06,
"loss": 0.4579,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 802,
"tokens_per_second_per_gpu": 18881.91,
"total_tokens": 36285159
},
{
"epoch": 0.08020776107476402,
"grad_norm": 1.0,
"learning_rate": 2.2948675722421086e-06,
"loss": 0.4404,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 803,
"tokens_per_second_per_gpu": 16918.36,
"total_tokens": 36327565
},
{
"epoch": 0.08030764620686211,
"grad_norm": 0.9765625,
"learning_rate": 2.27266426502649e-06,
"loss": 0.4996,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 804,
"tokens_per_second_per_gpu": 19541.82,
"total_tokens": 36375461
},
{
"epoch": 0.0804075313389602,
"grad_norm": 1.0,
"learning_rate": 2.2505551129582047e-06,
"loss": 0.5001,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 805,
"tokens_per_second_per_gpu": 19301.34,
"total_tokens": 36422902
},
{
"epoch": 0.08050741647105829,
"grad_norm": 0.921875,
"learning_rate": 2.2285403854302912e-06,
"loss": 0.4725,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 806,
"tokens_per_second_per_gpu": 20864.66,
"total_tokens": 36473161
},
{
"epoch": 0.08060730160315636,
"grad_norm": 0.93359375,
"learning_rate": 2.206620350685257e-06,
"loss": 0.4983,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 807,
"tokens_per_second_per_gpu": 20616.17,
"total_tokens": 36523271
},
{
"epoch": 0.08070718673525445,
"grad_norm": 0.9375,
"learning_rate": 2.1847952758118118e-06,
"loss": 0.4219,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 808,
"tokens_per_second_per_gpu": 18241.16,
"total_tokens": 36568183
},
{
"epoch": 0.08080707186735255,
"grad_norm": 1.0078125,
"learning_rate": 2.163065426741603e-06,
"loss": 0.4266,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 809,
"tokens_per_second_per_gpu": 16330.06,
"total_tokens": 36608756
},
{
"epoch": 0.08090695699945064,
"grad_norm": 0.9296875,
"learning_rate": 2.1414310682459805e-06,
"loss": 0.4323,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 810,
"tokens_per_second_per_gpu": 20630.5,
"total_tokens": 36657416
},
{
"epoch": 0.08100684213154873,
"grad_norm": 1.0390625,
"learning_rate": 2.119892463932781e-06,
"loss": 0.4131,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 811,
"tokens_per_second_per_gpu": 14407.44,
"total_tokens": 36693932
},
{
"epoch": 0.0811067272636468,
"grad_norm": 0.93359375,
"learning_rate": 2.098449876243096e-06,
"loss": 0.4454,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 812,
"tokens_per_second_per_gpu": 19878.7,
"total_tokens": 36742759
},
{
"epoch": 0.08120661239574489,
"grad_norm": 0.96875,
"learning_rate": 2.0771035664480944e-06,
"loss": 0.416,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 813,
"tokens_per_second_per_gpu": 17948.12,
"total_tokens": 36786121
},
{
"epoch": 0.08130649752784298,
"grad_norm": 0.984375,
"learning_rate": 2.0558537946458177e-06,
"loss": 0.4597,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 814,
"tokens_per_second_per_gpu": 19428.04,
"total_tokens": 36833543
},
{
"epoch": 0.08140638265994107,
"grad_norm": 1.0625,
"learning_rate": 2.0347008197580376e-06,
"loss": 0.4773,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 815,
"tokens_per_second_per_gpu": 16797.44,
"total_tokens": 36875562
},
{
"epoch": 0.08150626779203915,
"grad_norm": 0.91015625,
"learning_rate": 2.013644899527074e-06,
"loss": 0.3689,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 816,
"tokens_per_second_per_gpu": 17852.4,
"total_tokens": 36920264
},
{
"epoch": 0.08160615292413724,
"grad_norm": 0.984375,
"learning_rate": 1.9926862905126663e-06,
"loss": 0.4804,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 817,
"tokens_per_second_per_gpu": 19627.76,
"total_tokens": 36968604
},
{
"epoch": 0.08170603805623533,
"grad_norm": 0.953125,
"learning_rate": 1.9718252480888567e-06,
"loss": 0.4784,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 818,
"tokens_per_second_per_gpu": 19471.58,
"total_tokens": 37017512
},
{
"epoch": 0.08180592318833342,
"grad_norm": 0.98046875,
"learning_rate": 1.95106202644086e-06,
"loss": 0.4654,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 819,
"tokens_per_second_per_gpu": 18562.62,
"total_tokens": 37063802
},
{
"epoch": 0.0819058083204315,
"grad_norm": 0.97265625,
"learning_rate": 1.930396878561983e-06,
"loss": 0.4418,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 820,
"tokens_per_second_per_gpu": 19439.85,
"total_tokens": 37111193
},
{
"epoch": 0.08200569345252959,
"grad_norm": 0.9296875,
"learning_rate": 1.9098300562505266e-06,
"loss": 0.4499,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 821,
"tokens_per_second_per_gpu": 19463.13,
"total_tokens": 37157706
},
{
"epoch": 0.08210557858462768,
"grad_norm": 0.95703125,
"learning_rate": 1.8893618101067357e-06,
"loss": 0.4992,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 822,
"tokens_per_second_per_gpu": 19661.97,
"total_tokens": 37206493
},
{
"epoch": 0.08220546371672577,
"grad_norm": 1.03125,
"learning_rate": 1.8689923895297247e-06,
"loss": 0.4788,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 823,
"tokens_per_second_per_gpu": 17584.68,
"total_tokens": 37250267
},
{
"epoch": 0.08230534884882386,
"grad_norm": 0.97265625,
"learning_rate": 1.848722042714457e-06,
"loss": 0.4918,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 824,
"tokens_per_second_per_gpu": 19572.39,
"total_tokens": 37297746
},
{
"epoch": 0.08240523398092194,
"grad_norm": 0.95703125,
"learning_rate": 1.8285510166487154e-06,
"loss": 0.4801,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 825,
"tokens_per_second_per_gpu": 18921.89,
"total_tokens": 37345014
},
{
"epoch": 0.08250511911302003,
"grad_norm": 0.9453125,
"learning_rate": 1.808479557110081e-06,
"loss": 0.4642,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 826,
"tokens_per_second_per_gpu": 19139.52,
"total_tokens": 37391906
},
{
"epoch": 0.08260500424511812,
"grad_norm": 0.9765625,
"learning_rate": 1.7885079086629598e-06,
"loss": 0.5035,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 827,
"tokens_per_second_per_gpu": 19019.06,
"total_tokens": 37438925
},
{
"epoch": 0.0827048893772162,
"grad_norm": 1.0390625,
"learning_rate": 1.7686363146555807e-06,
"loss": 0.4424,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 828,
"tokens_per_second_per_gpu": 16617.06,
"total_tokens": 37479544
},
{
"epoch": 0.08280477450931428,
"grad_norm": 0.984375,
"learning_rate": 1.7488650172170496e-06,
"loss": 0.4608,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 829,
"tokens_per_second_per_gpu": 17764.13,
"total_tokens": 37523449
},
{
"epoch": 0.08290465964141237,
"grad_norm": 1.0,
"learning_rate": 1.7291942572543806e-06,
"loss": 0.3988,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 830,
"tokens_per_second_per_gpu": 16056.31,
"total_tokens": 37562434
},
{
"epoch": 0.08300454477351046,
"grad_norm": 0.9921875,
"learning_rate": 1.709624274449584e-06,
"loss": 0.4472,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 831,
"tokens_per_second_per_gpu": 17246.78,
"total_tokens": 37603863
},
{
"epoch": 0.08310442990560855,
"grad_norm": 1.0078125,
"learning_rate": 1.6901553072567189e-06,
"loss": 0.4265,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 832,
"tokens_per_second_per_gpu": 16067.93,
"total_tokens": 37643783
},
{
"epoch": 0.08320431503770664,
"grad_norm": 1.0546875,
"learning_rate": 1.6707875928990059e-06,
"loss": 0.4405,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 833,
"tokens_per_second_per_gpu": 18527.37,
"total_tokens": 37688766
},
{
"epoch": 0.08330420016980472,
"grad_norm": 0.9453125,
"learning_rate": 1.651521367365936e-06,
"loss": 0.4489,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 834,
"tokens_per_second_per_gpu": 19398.06,
"total_tokens": 37735797
},
{
"epoch": 0.08340408530190281,
"grad_norm": 0.9609375,
"learning_rate": 1.6323568654103838e-06,
"loss": 0.4791,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 835,
"tokens_per_second_per_gpu": 19898.65,
"total_tokens": 37784821
},
{
"epoch": 0.0835039704340009,
"grad_norm": 1.0625,
"learning_rate": 1.6132943205457607e-06,
"loss": 0.5056,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 836,
"tokens_per_second_per_gpu": 18456.36,
"total_tokens": 37830582
},
{
"epoch": 0.08360385556609899,
"grad_norm": 0.99609375,
"learning_rate": 1.5943339650431578e-06,
"loss": 0.415,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 837,
"tokens_per_second_per_gpu": 17436.82,
"total_tokens": 37873914
},
{
"epoch": 0.08370374069819707,
"grad_norm": 0.99609375,
"learning_rate": 1.5754760299285255e-06,
"loss": 0.5159,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 838,
"tokens_per_second_per_gpu": 20230.5,
"total_tokens": 37922986
},
{
"epoch": 0.08380362583029516,
"grad_norm": 1.0546875,
"learning_rate": 1.5567207449798517e-06,
"loss": 0.4642,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 839,
"tokens_per_second_per_gpu": 16996.21,
"total_tokens": 37964539
},
{
"epoch": 0.08390351096239325,
"grad_norm": 0.9609375,
"learning_rate": 1.538068338724361e-06,
"loss": 0.4508,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 840,
"tokens_per_second_per_gpu": 18701.87,
"total_tokens": 38009445
},
{
"epoch": 0.08400339609449134,
"grad_norm": 1.0078125,
"learning_rate": 1.5195190384357405e-06,
"loss": 0.4494,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 841,
"tokens_per_second_per_gpu": 19390.21,
"total_tokens": 38056971
},
{
"epoch": 0.08410328122658942,
"grad_norm": 1.0078125,
"learning_rate": 1.5010730701313626e-06,
"loss": 0.456,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 842,
"tokens_per_second_per_gpu": 17713.22,
"total_tokens": 38099722
},
{
"epoch": 0.0842031663586875,
"grad_norm": 0.94921875,
"learning_rate": 1.4827306585695234e-06,
"loss": 0.4127,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 843,
"tokens_per_second_per_gpu": 18723.15,
"total_tokens": 38145587
},
{
"epoch": 0.0843030514907856,
"grad_norm": 0.97265625,
"learning_rate": 1.4644920272467245e-06,
"loss": 0.4542,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 844,
"tokens_per_second_per_gpu": 18546.4,
"total_tokens": 38191632
},
{
"epoch": 0.08440293662288369,
"grad_norm": 1.0546875,
"learning_rate": 1.446357398394934e-06,
"loss": 0.4214,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 845,
"tokens_per_second_per_gpu": 16288.16,
"total_tokens": 38232383
},
{
"epoch": 0.08450282175498178,
"grad_norm": 1.0078125,
"learning_rate": 1.4283269929788779e-06,
"loss": 0.4211,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 846,
"tokens_per_second_per_gpu": 15736.11,
"total_tokens": 38270618
},
{
"epoch": 0.08460270688707985,
"grad_norm": 0.953125,
"learning_rate": 1.4104010306933558e-06,
"loss": 0.5127,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 847,
"tokens_per_second_per_gpu": 20861.17,
"total_tokens": 38322839
},
{
"epoch": 0.08470259201917794,
"grad_norm": 0.93359375,
"learning_rate": 1.3925797299605649e-06,
"loss": 0.453,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 848,
"tokens_per_second_per_gpu": 19225.22,
"total_tokens": 38369114
},
{
"epoch": 0.08480247715127603,
"grad_norm": 0.9609375,
"learning_rate": 1.3748633079274254e-06,
"loss": 0.4638,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 849,
"tokens_per_second_per_gpu": 19435.5,
"total_tokens": 38415895
},
{
"epoch": 0.08490236228337412,
"grad_norm": 0.9609375,
"learning_rate": 1.3572519804629537e-06,
"loss": 0.4142,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 850,
"tokens_per_second_per_gpu": 17504.54,
"total_tokens": 38458330
},
{
"epoch": 0.0850022474154722,
"grad_norm": 0.9609375,
"learning_rate": 1.339745962155613e-06,
"loss": 0.4124,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 851,
"tokens_per_second_per_gpu": 18266.31,
"total_tokens": 38502745
},
{
"epoch": 0.08510213254757029,
"grad_norm": 0.921875,
"learning_rate": 1.322345466310717e-06,
"loss": 0.4658,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 852,
"tokens_per_second_per_gpu": 20271.47,
"total_tokens": 38552471
},
{
"epoch": 0.08520201767966838,
"grad_norm": 0.94140625,
"learning_rate": 1.30505070494781e-06,
"loss": 0.4,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 853,
"tokens_per_second_per_gpu": 17890.06,
"total_tokens": 38597351
},
{
"epoch": 0.08530190281176647,
"grad_norm": 0.9296875,
"learning_rate": 1.2878618887981064e-06,
"loss": 0.4398,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 854,
"tokens_per_second_per_gpu": 18875.61,
"total_tokens": 38643445
},
{
"epoch": 0.08540178794386455,
"grad_norm": 1.5859375,
"learning_rate": 1.2707792273019049e-06,
"loss": 0.4428,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 855,
"tokens_per_second_per_gpu": 16558.46,
"total_tokens": 38684289
},
{
"epoch": 0.08550167307596264,
"grad_norm": 0.93359375,
"learning_rate": 1.2538029286060428e-06,
"loss": 0.4257,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 856,
"tokens_per_second_per_gpu": 17797.76,
"total_tokens": 38728057
},
{
"epoch": 0.08560155820806073,
"grad_norm": 1.0,
"learning_rate": 1.2369331995613664e-06,
"loss": 0.4436,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 857,
"tokens_per_second_per_gpu": 16844.23,
"total_tokens": 38769231
},
{
"epoch": 0.08570144334015882,
"grad_norm": 0.95703125,
"learning_rate": 1.2201702457201948e-06,
"loss": 0.4174,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 858,
"tokens_per_second_per_gpu": 18295.02,
"total_tokens": 38813479
},
{
"epoch": 0.08580132847225691,
"grad_norm": 0.96875,
"learning_rate": 1.2035142713338366e-06,
"loss": 0.4569,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 859,
"tokens_per_second_per_gpu": 19348.96,
"total_tokens": 38861170
},
{
"epoch": 0.08590121360435499,
"grad_norm": 1.03125,
"learning_rate": 1.1869654793500784e-06,
"loss": 0.4735,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 860,
"tokens_per_second_per_gpu": 19254.08,
"total_tokens": 38907760
},
{
"epoch": 0.08600109873645308,
"grad_norm": 1.0546875,
"learning_rate": 1.1705240714107301e-06,
"loss": 0.4271,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 861,
"tokens_per_second_per_gpu": 15999.45,
"total_tokens": 38946661
},
{
"epoch": 0.08610098386855117,
"grad_norm": 0.97265625,
"learning_rate": 1.1541902478491607e-06,
"loss": 0.4408,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 862,
"tokens_per_second_per_gpu": 18010.12,
"total_tokens": 38991339
},
{
"epoch": 0.08620086900064926,
"grad_norm": 0.9296875,
"learning_rate": 1.1379642076878528e-06,
"loss": 0.4487,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 863,
"tokens_per_second_per_gpu": 20454.89,
"total_tokens": 39040982
},
{
"epoch": 0.08630075413274733,
"grad_norm": 0.9453125,
"learning_rate": 1.1218461486359878e-06,
"loss": 0.4596,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 864,
"tokens_per_second_per_gpu": 18759.82,
"total_tokens": 39087467
},
{
"epoch": 0.08640063926484542,
"grad_norm": 0.9453125,
"learning_rate": 1.1058362670870248e-06,
"loss": 0.4461,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 865,
"tokens_per_second_per_gpu": 19638.49,
"total_tokens": 39134944
},
{
"epoch": 0.08650052439694352,
"grad_norm": 0.984375,
"learning_rate": 1.0899347581163222e-06,
"loss": 0.3956,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 866,
"tokens_per_second_per_gpu": 15727.83,
"total_tokens": 39173576
},
{
"epoch": 0.0866004095290416,
"grad_norm": 0.9453125,
"learning_rate": 1.0741418154787443e-06,
"loss": 0.5174,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 867,
"tokens_per_second_per_gpu": 21853.95,
"total_tokens": 39225174
},
{
"epoch": 0.0867002946611397,
"grad_norm": 0.96875,
"learning_rate": 1.058457631606319e-06,
"loss": 0.42,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 868,
"tokens_per_second_per_gpu": 17794.46,
"total_tokens": 39269152
},
{
"epoch": 0.08680017979323777,
"grad_norm": 1.03125,
"learning_rate": 1.042882397605871e-06,
"loss": 0.4497,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 869,
"tokens_per_second_per_gpu": 17295.14,
"total_tokens": 39311626
},
{
"epoch": 0.08690006492533586,
"grad_norm": 1.0,
"learning_rate": 1.0274163032567165e-06,
"loss": 0.4458,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 870,
"tokens_per_second_per_gpu": 19380.27,
"total_tokens": 39358741
},
{
"epoch": 0.08699995005743395,
"grad_norm": 0.92578125,
"learning_rate": 1.012059537008332e-06,
"loss": 0.4043,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 871,
"tokens_per_second_per_gpu": 17965.13,
"total_tokens": 39403437
},
{
"epoch": 0.08709983518953204,
"grad_norm": 0.96484375,
"learning_rate": 9.968122859780648e-07,
"loss": 0.4402,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 872,
"tokens_per_second_per_gpu": 18130.63,
"total_tokens": 39446971
},
{
"epoch": 0.08719972032163012,
"grad_norm": 1.2265625,
"learning_rate": 9.816747359488632e-07,
"loss": 0.4494,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 873,
"tokens_per_second_per_gpu": 17179.87,
"total_tokens": 39489129
},
{
"epoch": 0.08729960545372821,
"grad_norm": 0.95703125,
"learning_rate": 9.666470713669918e-07,
"loss": 0.4177,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 874,
"tokens_per_second_per_gpu": 17429.74,
"total_tokens": 39531879
},
{
"epoch": 0.0873994905858263,
"grad_norm": 1.0390625,
"learning_rate": 9.517294753398066e-07,
"loss": 0.4372,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 875,
"tokens_per_second_per_gpu": 16985.12,
"total_tokens": 39573717
},
{
"epoch": 0.08749937571792439,
"grad_norm": 0.96875,
"learning_rate": 9.369221296335007e-07,
"loss": 0.5002,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 876,
"tokens_per_second_per_gpu": 19949.48,
"total_tokens": 39622256
},
{
"epoch": 0.08759926085002247,
"grad_norm": 0.9609375,
"learning_rate": 9.222252146709143e-07,
"loss": 0.4295,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 877,
"tokens_per_second_per_gpu": 18473.63,
"total_tokens": 39667518
},
{
"epoch": 0.08769914598212056,
"grad_norm": 0.9375,
"learning_rate": 9.076389095293148e-07,
"loss": 0.4607,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 878,
"tokens_per_second_per_gpu": 20108.44,
"total_tokens": 39715924
},
{
"epoch": 0.08779903111421865,
"grad_norm": 1.0546875,
"learning_rate": 8.931633919382299e-07,
"loss": 0.5037,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 879,
"tokens_per_second_per_gpu": 18996.83,
"total_tokens": 39762377
},
{
"epoch": 0.08789891624631674,
"grad_norm": 0.99609375,
"learning_rate": 8.787988382772705e-07,
"loss": 0.4251,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 880,
"tokens_per_second_per_gpu": 18702.25,
"total_tokens": 39808110
},
{
"epoch": 0.08799880137841483,
"grad_norm": 1.0390625,
"learning_rate": 8.645454235739903e-07,
"loss": 0.4113,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 881,
"tokens_per_second_per_gpu": 15478.48,
"total_tokens": 39846683
},
{
"epoch": 0.0880986865105129,
"grad_norm": 0.9375,
"learning_rate": 8.504033215017527e-07,
"loss": 0.4558,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 882,
"tokens_per_second_per_gpu": 19692.77,
"total_tokens": 39894219
},
{
"epoch": 0.088198571642611,
"grad_norm": 0.9296875,
"learning_rate": 8.363727043776037e-07,
"loss": 0.4684,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 883,
"tokens_per_second_per_gpu": 20459.8,
"total_tokens": 39943380
},
{
"epoch": 0.08829845677470909,
"grad_norm": 0.9765625,
"learning_rate": 8.224537431601886e-07,
"loss": 0.4421,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 884,
"tokens_per_second_per_gpu": 18059.44,
"total_tokens": 39987487
},
{
"epoch": 0.08839834190680718,
"grad_norm": 0.98828125,
"learning_rate": 8.086466074476562e-07,
"loss": 0.4255,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 885,
"tokens_per_second_per_gpu": 17447.58,
"total_tokens": 40031018
},
{
"epoch": 0.08849822703890525,
"grad_norm": 0.93359375,
"learning_rate": 7.949514654755963e-07,
"loss": 0.4196,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 886,
"tokens_per_second_per_gpu": 18557.05,
"total_tokens": 40076790
},
{
"epoch": 0.08859811217100334,
"grad_norm": 0.96484375,
"learning_rate": 7.81368484114996e-07,
"loss": 0.4456,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 887,
"tokens_per_second_per_gpu": 18626.23,
"total_tokens": 40123066
},
{
"epoch": 0.08869799730310143,
"grad_norm": 1.125,
"learning_rate": 7.678978288701911e-07,
"loss": 0.4572,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 888,
"tokens_per_second_per_gpu": 17197.3,
"total_tokens": 40165096
},
{
"epoch": 0.08879788243519952,
"grad_norm": 1.015625,
"learning_rate": 7.545396638768698e-07,
"loss": 0.4906,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 889,
"tokens_per_second_per_gpu": 18299.48,
"total_tokens": 40209850
},
{
"epoch": 0.08889776756729761,
"grad_norm": 0.9609375,
"learning_rate": 7.412941519000527e-07,
"loss": 0.407,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 890,
"tokens_per_second_per_gpu": 17714.45,
"total_tokens": 40253574
},
{
"epoch": 0.08899765269939569,
"grad_norm": 0.96875,
"learning_rate": 7.281614543321269e-07,
"loss": 0.4417,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 891,
"tokens_per_second_per_gpu": 18392.38,
"total_tokens": 40299387
},
{
"epoch": 0.08909753783149378,
"grad_norm": 0.93359375,
"learning_rate": 7.151417311908648e-07,
"loss": 0.4439,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 892,
"tokens_per_second_per_gpu": 18968.63,
"total_tokens": 40345374
},
{
"epoch": 0.08919742296359187,
"grad_norm": 0.97265625,
"learning_rate": 7.022351411174866e-07,
"loss": 0.4761,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 893,
"tokens_per_second_per_gpu": 18196.28,
"total_tokens": 40390659
},
{
"epoch": 0.08929730809568996,
"grad_norm": 1.8984375,
"learning_rate": 6.894418413747183e-07,
"loss": 0.457,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 894,
"tokens_per_second_per_gpu": 20565.68,
"total_tokens": 40440395
},
{
"epoch": 0.08939719322778804,
"grad_norm": 0.91796875,
"learning_rate": 6.767619878448783e-07,
"loss": 0.4651,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 895,
"tokens_per_second_per_gpu": 20380.49,
"total_tokens": 40489844
},
{
"epoch": 0.08949707835988613,
"grad_norm": 1.0,
"learning_rate": 6.641957350279838e-07,
"loss": 0.4452,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 896,
"tokens_per_second_per_gpu": 16799.21,
"total_tokens": 40531712
},
{
"epoch": 0.08959696349198422,
"grad_norm": 1.0390625,
"learning_rate": 6.517432360398556e-07,
"loss": 0.486,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 897,
"tokens_per_second_per_gpu": 16152.11,
"total_tokens": 40572211
},
{
"epoch": 0.08969684862408231,
"grad_norm": 0.9921875,
"learning_rate": 6.394046426102673e-07,
"loss": 0.4418,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 898,
"tokens_per_second_per_gpu": 16569.92,
"total_tokens": 40613429
},
{
"epoch": 0.08979673375618039,
"grad_norm": 0.96484375,
"learning_rate": 6.271801050810856e-07,
"loss": 0.4513,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 899,
"tokens_per_second_per_gpu": 20112.44,
"total_tokens": 40662338
},
{
"epoch": 0.08989661888827848,
"grad_norm": 0.9140625,
"learning_rate": 6.150697724044407e-07,
"loss": 0.4388,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 900,
"tokens_per_second_per_gpu": 19881.68,
"total_tokens": 40709871
},
{
"epoch": 0.08999650402037657,
"grad_norm": 1.1171875,
"learning_rate": 6.030737921409169e-07,
"loss": 0.4688,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 901,
"tokens_per_second_per_gpu": 19500.12,
"total_tokens": 40756767
},
{
"epoch": 0.09009638915247466,
"grad_norm": 0.98828125,
"learning_rate": 5.911923104577455e-07,
"loss": 0.4264,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 902,
"tokens_per_second_per_gpu": 17913.22,
"total_tokens": 40801044
},
{
"epoch": 0.09019627428457275,
"grad_norm": 1.0234375,
"learning_rate": 5.794254721270331e-07,
"loss": 0.4401,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 903,
"tokens_per_second_per_gpu": 17718.96,
"total_tokens": 40844019
},
{
"epoch": 0.09029615941667082,
"grad_norm": 0.984375,
"learning_rate": 5.677734205239904e-07,
"loss": 0.4382,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 904,
"tokens_per_second_per_gpu": 16599.4,
"total_tokens": 40885158
},
{
"epoch": 0.09039604454876891,
"grad_norm": 0.9453125,
"learning_rate": 5.562362976251901e-07,
"loss": 0.4395,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 905,
"tokens_per_second_per_gpu": 19135.68,
"total_tokens": 40931536
},
{
"epoch": 0.090495929680867,
"grad_norm": 4.21875,
"learning_rate": 5.448142440068316e-07,
"loss": 0.4645,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 906,
"tokens_per_second_per_gpu": 21138.2,
"total_tokens": 40982884
},
{
"epoch": 0.0905958148129651,
"grad_norm": 0.98828125,
"learning_rate": 5.335073988430373e-07,
"loss": 0.4415,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 907,
"tokens_per_second_per_gpu": 17495.75,
"total_tokens": 41026347
},
{
"epoch": 0.09069569994506317,
"grad_norm": 1.0234375,
"learning_rate": 5.223158999041444e-07,
"loss": 0.4052,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 908,
"tokens_per_second_per_gpu": 15924.71,
"total_tokens": 41066086
},
{
"epoch": 0.09079558507716126,
"grad_norm": 0.9140625,
"learning_rate": 5.112398835550348e-07,
"loss": 0.4484,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 909,
"tokens_per_second_per_gpu": 20056.15,
"total_tokens": 41114943
},
{
"epoch": 0.09089547020925935,
"grad_norm": 0.93359375,
"learning_rate": 5.002794847534765e-07,
"loss": 0.448,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 910,
"tokens_per_second_per_gpu": 20648.43,
"total_tokens": 41163509
},
{
"epoch": 0.09099535534135744,
"grad_norm": 1.0078125,
"learning_rate": 4.894348370484648e-07,
"loss": 0.4369,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 911,
"tokens_per_second_per_gpu": 18606.75,
"total_tokens": 41208775
},
{
"epoch": 0.09109524047345553,
"grad_norm": 0.94921875,
"learning_rate": 4.787060725786141e-07,
"loss": 0.466,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 912,
"tokens_per_second_per_gpu": 20156.85,
"total_tokens": 41256862
},
{
"epoch": 0.09119512560555361,
"grad_norm": 0.98046875,
"learning_rate": 4.6809332207053083e-07,
"loss": 0.4929,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 913,
"tokens_per_second_per_gpu": 21484.71,
"total_tokens": 41309518
},
{
"epoch": 0.0912950107376517,
"grad_norm": 0.94921875,
"learning_rate": 4.575967148372318e-07,
"loss": 0.4791,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 914,
"tokens_per_second_per_gpu": 20365.56,
"total_tokens": 41358382
},
{
"epoch": 0.09139489586974979,
"grad_norm": 1.0078125,
"learning_rate": 4.4721637877656377e-07,
"loss": 0.4525,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 915,
"tokens_per_second_per_gpu": 16982.25,
"total_tokens": 41399708
},
{
"epoch": 0.09149478100184788,
"grad_norm": 1.015625,
"learning_rate": 4.3695244036964567e-07,
"loss": 0.438,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 916,
"tokens_per_second_per_gpu": 18562.94,
"total_tokens": 41443914
},
{
"epoch": 0.09159466613394596,
"grad_norm": 0.99609375,
"learning_rate": 4.268050246793276e-07,
"loss": 0.4601,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 917,
"tokens_per_second_per_gpu": 17697.46,
"total_tokens": 41486925
},
{
"epoch": 0.09169455126604405,
"grad_norm": 0.94921875,
"learning_rate": 4.167742553486676e-07,
"loss": 0.3925,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 918,
"tokens_per_second_per_gpu": 16335.77,
"total_tokens": 41527017
},
{
"epoch": 0.09179443639814214,
"grad_norm": 0.98046875,
"learning_rate": 4.068602545994249e-07,
"loss": 0.4612,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 919,
"tokens_per_second_per_gpu": 17681.21,
"total_tokens": 41569225
},
{
"epoch": 0.09189432153024023,
"grad_norm": 1.0,
"learning_rate": 3.9706314323056936e-07,
"loss": 0.4546,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 920,
"tokens_per_second_per_gpu": 17947.92,
"total_tokens": 41612776
},
{
"epoch": 0.0919942066623383,
"grad_norm": 0.9609375,
"learning_rate": 3.8738304061681107e-07,
"loss": 0.4736,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 921,
"tokens_per_second_per_gpu": 19814.19,
"total_tokens": 41661306
},
{
"epoch": 0.0920940917944364,
"grad_norm": 1.0078125,
"learning_rate": 3.7782006470714614e-07,
"loss": 0.422,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 922,
"tokens_per_second_per_gpu": 17699.83,
"total_tokens": 41703850
},
{
"epoch": 0.09219397692653448,
"grad_norm": 0.93359375,
"learning_rate": 3.68374332023419e-07,
"loss": 0.4534,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 923,
"tokens_per_second_per_gpu": 19623.71,
"total_tokens": 41751685
},
{
"epoch": 0.09229386205863258,
"grad_norm": 0.9609375,
"learning_rate": 3.590459576589e-07,
"loss": 0.4743,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 924,
"tokens_per_second_per_gpu": 18825.06,
"total_tokens": 41798183
},
{
"epoch": 0.09239374719073067,
"grad_norm": 1.0078125,
"learning_rate": 3.498350552768859e-07,
"loss": 0.4961,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 925,
"tokens_per_second_per_gpu": 19234.73,
"total_tokens": 41845106
},
{
"epoch": 0.09249363232282874,
"grad_norm": 0.9375,
"learning_rate": 3.4074173710931804e-07,
"loss": 0.458,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 926,
"tokens_per_second_per_gpu": 20158.76,
"total_tokens": 41894753
},
{
"epoch": 0.09259351745492683,
"grad_norm": 0.9296875,
"learning_rate": 3.3176611395540625e-07,
"loss": 0.4375,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 927,
"tokens_per_second_per_gpu": 19702.47,
"total_tokens": 41941859
},
{
"epoch": 0.09269340258702492,
"grad_norm": 0.953125,
"learning_rate": 3.2290829518028867e-07,
"loss": 0.4647,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 928,
"tokens_per_second_per_gpu": 18735.78,
"total_tokens": 41987824
},
{
"epoch": 0.09279328771912301,
"grad_norm": 1.0234375,
"learning_rate": 3.1416838871368925e-07,
"loss": 0.5122,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 929,
"tokens_per_second_per_gpu": 19608.28,
"total_tokens": 42034861
},
{
"epoch": 0.09289317285122109,
"grad_norm": 1.15625,
"learning_rate": 3.0554650104861137e-07,
"loss": 0.5212,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 930,
"tokens_per_second_per_gpu": 18235.48,
"total_tokens": 42080690
},
{
"epoch": 0.09299305798331918,
"grad_norm": 1.1171875,
"learning_rate": 2.970427372400353e-07,
"loss": 0.4358,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 931,
"tokens_per_second_per_gpu": 17479.6,
"total_tokens": 42123904
},
{
"epoch": 0.09309294311541727,
"grad_norm": 0.953125,
"learning_rate": 2.8865720090364037e-07,
"loss": 0.4614,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 932,
"tokens_per_second_per_gpu": 19294.48,
"total_tokens": 42171026
},
{
"epoch": 0.09319282824751536,
"grad_norm": 0.96484375,
"learning_rate": 2.8038999421453827e-07,
"loss": 0.4582,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 933,
"tokens_per_second_per_gpu": 18575.01,
"total_tokens": 42217647
},
{
"epoch": 0.09329271337961345,
"grad_norm": 0.921875,
"learning_rate": 2.7224121790603517e-07,
"loss": 0.4831,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 934,
"tokens_per_second_per_gpu": 20795.01,
"total_tokens": 42267921
},
{
"epoch": 0.09339259851171153,
"grad_norm": 0.96875,
"learning_rate": 2.6421097126839714e-07,
"loss": 0.4428,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 935,
"tokens_per_second_per_gpu": 18775.03,
"total_tokens": 42314148
},
{
"epoch": 0.09349248364380962,
"grad_norm": 0.984375,
"learning_rate": 2.5629935214764866e-07,
"loss": 0.4708,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 936,
"tokens_per_second_per_gpu": 19374.28,
"total_tokens": 42361852
},
{
"epoch": 0.09359236877590771,
"grad_norm": 0.9453125,
"learning_rate": 2.4850645694436736e-07,
"loss": 0.4287,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 937,
"tokens_per_second_per_gpu": 18463.91,
"total_tokens": 42406661
},
{
"epoch": 0.0936922539080058,
"grad_norm": 1.1484375,
"learning_rate": 2.4083238061252565e-07,
"loss": 0.494,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 938,
"tokens_per_second_per_gpu": 18323.56,
"total_tokens": 42451040
},
{
"epoch": 0.09379213904010388,
"grad_norm": 0.9453125,
"learning_rate": 2.332772166583208e-07,
"loss": 0.4761,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 939,
"tokens_per_second_per_gpu": 19643.91,
"total_tokens": 42498816
},
{
"epoch": 0.09389202417220197,
"grad_norm": 0.953125,
"learning_rate": 2.2584105713904126e-07,
"loss": 0.4444,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 940,
"tokens_per_second_per_gpu": 18834.15,
"total_tokens": 42544756
},
{
"epoch": 0.09399190930430006,
"grad_norm": 1.0078125,
"learning_rate": 2.1852399266194312e-07,
"loss": 0.3685,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 941,
"tokens_per_second_per_gpu": 14666.73,
"total_tokens": 42581678
},
{
"epoch": 0.09409179443639815,
"grad_norm": 0.97265625,
"learning_rate": 2.1132611238315004e-07,
"loss": 0.5325,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 942,
"tokens_per_second_per_gpu": 19959.54,
"total_tokens": 42631711
},
{
"epoch": 0.09419167956849622,
"grad_norm": 1.1640625,
"learning_rate": 2.0424750400655947e-07,
"loss": 0.4282,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 943,
"tokens_per_second_per_gpu": 17580.27,
"total_tokens": 42674486
},
{
"epoch": 0.09429156470059431,
"grad_norm": 0.98828125,
"learning_rate": 1.9728825378278248e-07,
"loss": 0.4102,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 944,
"tokens_per_second_per_gpu": 17182.11,
"total_tokens": 42716862
},
{
"epoch": 0.0943914498326924,
"grad_norm": 0.96875,
"learning_rate": 1.9044844650808468e-07,
"loss": 0.4571,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 945,
"tokens_per_second_per_gpu": 18865.72,
"total_tokens": 42762569
},
{
"epoch": 0.0944913349647905,
"grad_norm": 0.9921875,
"learning_rate": 1.8372816552336025e-07,
"loss": 0.4254,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 946,
"tokens_per_second_per_gpu": 18081.22,
"total_tokens": 42806582
},
{
"epoch": 0.09459122009688858,
"grad_norm": 0.98828125,
"learning_rate": 1.7712749271311392e-07,
"loss": 0.4432,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 947,
"tokens_per_second_per_gpu": 18913.25,
"total_tokens": 42853391
},
{
"epoch": 0.09469110522898666,
"grad_norm": 0.99609375,
"learning_rate": 1.706465085044584e-07,
"loss": 0.4443,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 948,
"tokens_per_second_per_gpu": 16852.31,
"total_tokens": 42895962
},
{
"epoch": 0.09479099036108475,
"grad_norm": 0.94140625,
"learning_rate": 1.6428529186614195e-07,
"loss": 0.4545,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 949,
"tokens_per_second_per_gpu": 17871.59,
"total_tokens": 42940970
},
{
"epoch": 0.09489087549318284,
"grad_norm": 0.98046875,
"learning_rate": 1.580439203075812e-07,
"loss": 0.4492,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 950,
"tokens_per_second_per_gpu": 18162.74,
"total_tokens": 42985294
},
{
"epoch": 0.09499076062528093,
"grad_norm": 0.98046875,
"learning_rate": 1.519224698779198e-07,
"loss": 0.4666,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 951,
"tokens_per_second_per_gpu": 18369.89,
"total_tokens": 43031965
},
{
"epoch": 0.09509064575737901,
"grad_norm": 0.9453125,
"learning_rate": 1.4592101516509916e-07,
"loss": 0.4825,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 952,
"tokens_per_second_per_gpu": 19654.91,
"total_tokens": 43080342
},
{
"epoch": 0.0951905308894771,
"grad_norm": 0.9921875,
"learning_rate": 1.400396292949513e-07,
"loss": 0.4514,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 953,
"tokens_per_second_per_gpu": 19029.22,
"total_tokens": 43126884
},
{
"epoch": 0.09529041602157519,
"grad_norm": 0.96875,
"learning_rate": 1.3427838393030634e-07,
"loss": 0.4483,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 954,
"tokens_per_second_per_gpu": 18592.71,
"total_tokens": 43172457
},
{
"epoch": 0.09539030115367328,
"grad_norm": 0.984375,
"learning_rate": 1.2863734927012094e-07,
"loss": 0.4691,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 955,
"tokens_per_second_per_gpu": 17402.82,
"total_tokens": 43219624
},
{
"epoch": 0.09549018628577137,
"grad_norm": 0.953125,
"learning_rate": 1.231165940486234e-07,
"loss": 0.4377,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 956,
"tokens_per_second_per_gpu": 17255.75,
"total_tokens": 43262874
},
{
"epoch": 0.09559007141786945,
"grad_norm": 0.9765625,
"learning_rate": 1.1771618553447217e-07,
"loss": 0.4813,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 957,
"tokens_per_second_per_gpu": 19253.02,
"total_tokens": 43309217
},
{
"epoch": 0.09568995654996754,
"grad_norm": 0.98046875,
"learning_rate": 1.1243618952994195e-07,
"loss": 0.484,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 958,
"tokens_per_second_per_gpu": 20471.01,
"total_tokens": 43358319
},
{
"epoch": 0.09578984168206563,
"grad_norm": 1.0,
"learning_rate": 1.0727667037011668e-07,
"loss": 0.4488,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 959,
"tokens_per_second_per_gpu": 19270.76,
"total_tokens": 43404675
},
{
"epoch": 0.09588972681416372,
"grad_norm": 0.98828125,
"learning_rate": 1.0223769092211012e-07,
"loss": 0.4487,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 960,
"tokens_per_second_per_gpu": 17859.57,
"total_tokens": 43448494
},
{
"epoch": 0.0959896119462618,
"grad_norm": 0.94921875,
"learning_rate": 9.731931258429638e-08,
"loss": 0.5092,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 961,
"tokens_per_second_per_gpu": 22155.32,
"total_tokens": 43501117
},
{
"epoch": 0.09608949707835988,
"grad_norm": 0.95703125,
"learning_rate": 9.252159528556404e-08,
"loss": 0.4117,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 962,
"tokens_per_second_per_gpu": 17370.88,
"total_tokens": 43543709
},
{
"epoch": 0.09618938221045797,
"grad_norm": 0.99609375,
"learning_rate": 8.784459748458318e-08,
"loss": 0.4376,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 963,
"tokens_per_second_per_gpu": 16865.09,
"total_tokens": 43585580
},
{
"epoch": 0.09628926734255606,
"grad_norm": 1.4296875,
"learning_rate": 8.328837616909612e-08,
"loss": 0.4473,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 964,
"tokens_per_second_per_gpu": 16405.92,
"total_tokens": 43626251
},
{
"epoch": 0.09638915247465414,
"grad_norm": 1.046875,
"learning_rate": 7.885298685522235e-08,
"loss": 0.4357,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 965,
"tokens_per_second_per_gpu": 16632.25,
"total_tokens": 43666658
},
{
"epoch": 0.09648903760675223,
"grad_norm": 1.0703125,
"learning_rate": 7.453848358678018e-08,
"loss": 0.4606,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 966,
"tokens_per_second_per_gpu": 15540.97,
"total_tokens": 43705183
},
{
"epoch": 0.09658892273885032,
"grad_norm": 0.8984375,
"learning_rate": 7.034491893463059e-08,
"loss": 0.4517,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 967,
"tokens_per_second_per_gpu": 21108.92,
"total_tokens": 43756112
},
{
"epoch": 0.09668880787094841,
"grad_norm": 0.9765625,
"learning_rate": 6.627234399603554e-08,
"loss": 0.4383,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 968,
"tokens_per_second_per_gpu": 18476.27,
"total_tokens": 43801144
},
{
"epoch": 0.0967886930030465,
"grad_norm": 1.21875,
"learning_rate": 6.232080839403631e-08,
"loss": 0.3901,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 969,
"tokens_per_second_per_gpu": 16945.39,
"total_tokens": 43843371
},
{
"epoch": 0.09688857813514458,
"grad_norm": 0.98046875,
"learning_rate": 5.849036027684607e-08,
"loss": 0.4401,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 970,
"tokens_per_second_per_gpu": 16730.65,
"total_tokens": 43883917
},
{
"epoch": 0.09698846326724267,
"grad_norm": 1.015625,
"learning_rate": 5.4781046317267103e-08,
"loss": 0.4496,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 971,
"tokens_per_second_per_gpu": 18073.27,
"total_tokens": 43927712
},
{
"epoch": 0.09708834839934076,
"grad_norm": 0.984375,
"learning_rate": 5.119291171211793e-08,
"loss": 0.4021,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 972,
"tokens_per_second_per_gpu": 15942.93,
"total_tokens": 43967262
},
{
"epoch": 0.09718823353143885,
"grad_norm": 1.078125,
"learning_rate": 4.772600018168816e-08,
"loss": 0.4618,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 973,
"tokens_per_second_per_gpu": 17579.52,
"total_tokens": 44009445
},
{
"epoch": 0.09728811866353693,
"grad_norm": 0.9296875,
"learning_rate": 4.438035396920004e-08,
"loss": 0.4217,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 974,
"tokens_per_second_per_gpu": 18951.32,
"total_tokens": 44056001
},
{
"epoch": 0.09738800379563502,
"grad_norm": 0.91015625,
"learning_rate": 4.115601384029666e-08,
"loss": 0.4659,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 975,
"tokens_per_second_per_gpu": 20514.73,
"total_tokens": 44104705
},
{
"epoch": 0.09748788892773311,
"grad_norm": 0.9921875,
"learning_rate": 3.805301908254455e-08,
"loss": 0.4325,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 976,
"tokens_per_second_per_gpu": 16668.21,
"total_tokens": 44145662
},
{
"epoch": 0.0975877740598312,
"grad_norm": 0.97265625,
"learning_rate": 3.50714075049563e-08,
"loss": 0.4782,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 977,
"tokens_per_second_per_gpu": 18495.56,
"total_tokens": 44191132
},
{
"epoch": 0.09768765919192929,
"grad_norm": 0.95703125,
"learning_rate": 3.22112154375287e-08,
"loss": 0.4653,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 978,
"tokens_per_second_per_gpu": 19483.13,
"total_tokens": 44239031
},
{
"epoch": 0.09778754432402736,
"grad_norm": 0.9609375,
"learning_rate": 2.947247773079753e-08,
"loss": 0.4519,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 979,
"tokens_per_second_per_gpu": 19120.96,
"total_tokens": 44285415
},
{
"epoch": 0.09788742945612545,
"grad_norm": 0.94921875,
"learning_rate": 2.6855227755419046e-08,
"loss": 0.4424,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 980,
"tokens_per_second_per_gpu": 17814.06,
"total_tokens": 44330574
},
{
"epoch": 0.09798731458822355,
"grad_norm": 0.9453125,
"learning_rate": 2.4359497401758026e-08,
"loss": 0.4192,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 981,
"tokens_per_second_per_gpu": 19316.62,
"total_tokens": 44376868
},
{
"epoch": 0.09808719972032164,
"grad_norm": 0.9453125,
"learning_rate": 2.1985317079500358e-08,
"loss": 0.4946,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 982,
"tokens_per_second_per_gpu": 20238.67,
"total_tokens": 44425601
},
{
"epoch": 0.09818708485241971,
"grad_norm": 0.94921875,
"learning_rate": 1.973271571728441e-08,
"loss": 0.4305,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 983,
"tokens_per_second_per_gpu": 18249.19,
"total_tokens": 44469533
},
{
"epoch": 0.0982869699845178,
"grad_norm": 1.4296875,
"learning_rate": 1.7601720762346895e-08,
"loss": 0.4433,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 984,
"tokens_per_second_per_gpu": 17378.35,
"total_tokens": 44512930
},
{
"epoch": 0.09838685511661589,
"grad_norm": 0.99609375,
"learning_rate": 1.5592358180189782e-08,
"loss": 0.3867,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 985,
"tokens_per_second_per_gpu": 16166.91,
"total_tokens": 44552078
},
{
"epoch": 0.09848674024871398,
"grad_norm": 1.015625,
"learning_rate": 1.370465245426167e-08,
"loss": 0.4447,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 986,
"tokens_per_second_per_gpu": 19431.46,
"total_tokens": 44599621
},
{
"epoch": 0.09858662538081206,
"grad_norm": 0.9453125,
"learning_rate": 1.1938626585660252e-08,
"loss": 0.4789,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 987,
"tokens_per_second_per_gpu": 21446.01,
"total_tokens": 44651008
},
{
"epoch": 0.09868651051291015,
"grad_norm": 0.984375,
"learning_rate": 1.0294302092853647e-08,
"loss": 0.4412,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 988,
"tokens_per_second_per_gpu": 16777.56,
"total_tokens": 44692261
},
{
"epoch": 0.09878639564500824,
"grad_norm": 0.98828125,
"learning_rate": 8.771699011416169e-09,
"loss": 0.4817,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 989,
"tokens_per_second_per_gpu": 19002.23,
"total_tokens": 44738601
},
{
"epoch": 0.09888628077710633,
"grad_norm": 0.984375,
"learning_rate": 7.370835893788508e-09,
"loss": 0.4094,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 990,
"tokens_per_second_per_gpu": 16810.32,
"total_tokens": 44779354
},
{
"epoch": 0.09898616590920442,
"grad_norm": 0.94921875,
"learning_rate": 6.091729809042379e-09,
"loss": 0.4621,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 991,
"tokens_per_second_per_gpu": 19308.29,
"total_tokens": 44826450
},
{
"epoch": 0.0990860510413025,
"grad_norm": 0.9609375,
"learning_rate": 4.9343963426840006e-09,
"loss": 0.4263,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 992,
"tokens_per_second_per_gpu": 17706.22,
"total_tokens": 44869736
},
{
"epoch": 0.09918593617340059,
"grad_norm": 0.92578125,
"learning_rate": 3.898849596456477e-09,
"loss": 0.4745,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 993,
"tokens_per_second_per_gpu": 20667.54,
"total_tokens": 44920499
},
{
"epoch": 0.09928582130549868,
"grad_norm": 1.0234375,
"learning_rate": 2.9851021881688314e-09,
"loss": 0.4705,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 994,
"tokens_per_second_per_gpu": 17161.8,
"total_tokens": 44963646
},
{
"epoch": 0.09938570643759677,
"grad_norm": 0.94921875,
"learning_rate": 2.193165251545004e-09,
"loss": 0.4023,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 995,
"tokens_per_second_per_gpu": 16653.71,
"total_tokens": 45005897
},
{
"epoch": 0.09948559156969485,
"grad_norm": 0.984375,
"learning_rate": 1.5230484360873043e-09,
"loss": 0.4059,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 996,
"tokens_per_second_per_gpu": 16316.94,
"total_tokens": 45046570
},
{
"epoch": 0.09958547670179294,
"grad_norm": 1.0234375,
"learning_rate": 9.74759906957612e-10,
"loss": 0.4281,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 997,
"tokens_per_second_per_gpu": 16667.99,
"total_tokens": 45087565
},
{
"epoch": 0.09968536183389103,
"grad_norm": 0.9765625,
"learning_rate": 5.483063448785686e-10,
"loss": 0.516,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 998,
"tokens_per_second_per_gpu": 18744.23,
"total_tokens": 45134425
},
{
"epoch": 0.09978524696598912,
"grad_norm": 0.94921875,
"learning_rate": 2.436929460525317e-10,
"loss": 0.4697,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 999,
"tokens_per_second_per_gpu": 21031.15,
"total_tokens": 45185167
},
{
"epoch": 0.0998851320980872,
"grad_norm": 0.92578125,
"learning_rate": 6.092342209607083e-11,
"loss": 0.4754,
"memory/device_reserved (GiB)": 101.86,
"memory/max_active (GiB)": 91.58,
"memory/max_allocated (GiB)": 91.58,
"step": 1000,
"tokens_per_second_per_gpu": 20383.74,
"total_tokens": 45235592
}
],
"logging_steps": 1,
"max_steps": 1000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.428755029426176e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}