Image-to-Text
Transformers
Safetensors
qwen3_vl
Unicorn-VL-R3 / trainer_state.json
rin2401's picture
Upload folder using huggingface_hub
0953e12 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.956175298804781,
"eval_steps": 500,
"global_step": 186,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01593625498007968,
"grad_norm": 3.265625,
"learning_rate": 0.0,
"loss": 1.1802,
"memory/device_reserved (GiB)": 62.1,
"memory/max_active (GiB)": 50.46,
"memory/max_allocated (GiB)": 50.46,
"step": 1,
"tokens_per_second_per_gpu": 4078.9,
"total_tokens": 187960
},
{
"epoch": 0.03187250996015936,
"grad_norm": 3.53125,
"learning_rate": 1.111111111111111e-07,
"loss": 1.2461,
"memory/device_reserved (GiB)": 77.62,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 2,
"tokens_per_second_per_gpu": 5600.28,
"total_tokens": 380007
},
{
"epoch": 0.04780876494023904,
"grad_norm": 3.5625,
"learning_rate": 2.222222222222222e-07,
"loss": 1.3145,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 3,
"tokens_per_second_per_gpu": 5504.25,
"total_tokens": 566579
},
{
"epoch": 0.06374501992031872,
"grad_norm": 3.625,
"learning_rate": 3.333333333333333e-07,
"loss": 1.2505,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 4,
"tokens_per_second_per_gpu": 5678.77,
"total_tokens": 754678
},
{
"epoch": 0.0796812749003984,
"grad_norm": 3.46875,
"learning_rate": 4.444444444444444e-07,
"loss": 1.2344,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 5,
"tokens_per_second_per_gpu": 5644.44,
"total_tokens": 935084
},
{
"epoch": 0.09561752988047809,
"grad_norm": 3.71875,
"learning_rate": 5.555555555555555e-07,
"loss": 1.334,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 6,
"tokens_per_second_per_gpu": 5488.37,
"total_tokens": 1114037
},
{
"epoch": 0.11155378486055777,
"grad_norm": 3.4375,
"learning_rate": 6.666666666666666e-07,
"loss": 1.1704,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 7,
"tokens_per_second_per_gpu": 5468.62,
"total_tokens": 1302175
},
{
"epoch": 0.12749003984063745,
"grad_norm": 3.484375,
"learning_rate": 7.777777777777778e-07,
"loss": 1.2471,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 8,
"tokens_per_second_per_gpu": 5412.64,
"total_tokens": 1483342
},
{
"epoch": 0.14342629482071714,
"grad_norm": 3.375,
"learning_rate": 8.888888888888888e-07,
"loss": 1.2354,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 9,
"tokens_per_second_per_gpu": 5455.54,
"total_tokens": 1667332
},
{
"epoch": 0.1593625498007968,
"grad_norm": 3.546875,
"learning_rate": 1e-06,
"loss": 1.3232,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 10,
"tokens_per_second_per_gpu": 5574.32,
"total_tokens": 1849357
},
{
"epoch": 0.1752988047808765,
"grad_norm": 3.671875,
"learning_rate": 1.111111111111111e-06,
"loss": 1.3232,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 11,
"tokens_per_second_per_gpu": 5321.58,
"total_tokens": 2026853
},
{
"epoch": 0.19123505976095617,
"grad_norm": 3.359375,
"learning_rate": 1.2222222222222223e-06,
"loss": 1.2529,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 12,
"tokens_per_second_per_gpu": 5847.85,
"total_tokens": 2213285
},
{
"epoch": 0.20717131474103587,
"grad_norm": 3.328125,
"learning_rate": 1.3333333333333332e-06,
"loss": 1.2559,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 13,
"tokens_per_second_per_gpu": 5613.77,
"total_tokens": 2400076
},
{
"epoch": 0.22310756972111553,
"grad_norm": 3.15625,
"learning_rate": 1.4444444444444443e-06,
"loss": 1.2129,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 14,
"tokens_per_second_per_gpu": 5571.51,
"total_tokens": 2593112
},
{
"epoch": 0.23904382470119523,
"grad_norm": 3.125,
"learning_rate": 1.5555555555555556e-06,
"loss": 1.2153,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 15,
"tokens_per_second_per_gpu": 5626.56,
"total_tokens": 2782327
},
{
"epoch": 0.2549800796812749,
"grad_norm": 3.3125,
"learning_rate": 1.6666666666666667e-06,
"loss": 1.2598,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 16,
"tokens_per_second_per_gpu": 5731.24,
"total_tokens": 2966947
},
{
"epoch": 0.27091633466135456,
"grad_norm": 3.09375,
"learning_rate": 1.7777777777777775e-06,
"loss": 1.1714,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 17,
"tokens_per_second_per_gpu": 6088.78,
"total_tokens": 3161163
},
{
"epoch": 0.2868525896414343,
"grad_norm": 3.234375,
"learning_rate": 1.8888888888888888e-06,
"loss": 1.2402,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 18,
"tokens_per_second_per_gpu": 5499.3,
"total_tokens": 3343301
},
{
"epoch": 0.30278884462151395,
"grad_norm": 3.171875,
"learning_rate": 2e-06,
"loss": 1.2158,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 19,
"tokens_per_second_per_gpu": 5505.1,
"total_tokens": 3527203
},
{
"epoch": 0.3187250996015936,
"grad_norm": 3.21875,
"learning_rate": 1.9998251609127463e-06,
"loss": 1.2446,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 20,
"tokens_per_second_per_gpu": 5441.06,
"total_tokens": 3710228
},
{
"epoch": 0.3346613545816733,
"grad_norm": 2.875,
"learning_rate": 1.9993007047883984e-06,
"loss": 1.1895,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 21,
"tokens_per_second_per_gpu": 5817.48,
"total_tokens": 3903090
},
{
"epoch": 0.350597609561753,
"grad_norm": 3.078125,
"learning_rate": 1.9984268150178167e-06,
"loss": 1.209,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 22,
"tokens_per_second_per_gpu": 5823.26,
"total_tokens": 4085213
},
{
"epoch": 0.3665338645418327,
"grad_norm": 2.71875,
"learning_rate": 1.9972037971811797e-06,
"loss": 1.1602,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 23,
"tokens_per_second_per_gpu": 5595.44,
"total_tokens": 4271930
},
{
"epoch": 0.38247011952191234,
"grad_norm": 2.625,
"learning_rate": 1.9956320789411338e-06,
"loss": 1.1587,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 24,
"tokens_per_second_per_gpu": 5470.25,
"total_tokens": 4458282
},
{
"epoch": 0.398406374501992,
"grad_norm": 2.96875,
"learning_rate": 1.9937122098932426e-06,
"loss": 1.2295,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 25,
"tokens_per_second_per_gpu": 5381.09,
"total_tokens": 4638920
},
{
"epoch": 0.41434262948207173,
"grad_norm": 2.765625,
"learning_rate": 1.9914448613738106e-06,
"loss": 1.2695,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 26,
"tokens_per_second_per_gpu": 5413.01,
"total_tokens": 4823256
},
{
"epoch": 0.4302788844621514,
"grad_norm": 2.703125,
"learning_rate": 1.9888308262251284e-06,
"loss": 1.2412,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 27,
"tokens_per_second_per_gpu": 5555.69,
"total_tokens": 5004546
},
{
"epoch": 0.44621513944223107,
"grad_norm": 2.703125,
"learning_rate": 1.9858710185182355e-06,
"loss": 1.2666,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 28,
"tokens_per_second_per_gpu": 5321.57,
"total_tokens": 5180916
},
{
"epoch": 0.46215139442231074,
"grad_norm": 2.5625,
"learning_rate": 1.9825664732332882e-06,
"loss": 1.207,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 29,
"tokens_per_second_per_gpu": 5507.08,
"total_tokens": 5362870
},
{
"epoch": 0.47808764940239046,
"grad_norm": 2.671875,
"learning_rate": 1.9789183458976484e-06,
"loss": 1.1904,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 30,
"tokens_per_second_per_gpu": 5788.71,
"total_tokens": 5548692
},
{
"epoch": 0.4940239043824701,
"grad_norm": 2.4375,
"learning_rate": 1.9749279121818236e-06,
"loss": 1.1865,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 31,
"tokens_per_second_per_gpu": 5411.35,
"total_tokens": 5734611
},
{
"epoch": 0.5099601593625498,
"grad_norm": 2.5,
"learning_rate": 1.970596567453391e-06,
"loss": 1.1953,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 32,
"tokens_per_second_per_gpu": 5568.61,
"total_tokens": 5915483
},
{
"epoch": 0.5258964143426295,
"grad_norm": 2.328125,
"learning_rate": 1.965925826289068e-06,
"loss": 1.1885,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 33,
"tokens_per_second_per_gpu": 5527.28,
"total_tokens": 6103887
},
{
"epoch": 0.5418326693227091,
"grad_norm": 2.390625,
"learning_rate": 1.9609173219450997e-06,
"loss": 1.2578,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 34,
"tokens_per_second_per_gpu": 5786.79,
"total_tokens": 6291053
},
{
"epoch": 0.5577689243027888,
"grad_norm": 2.390625,
"learning_rate": 1.955572805786141e-06,
"loss": 1.2656,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 35,
"tokens_per_second_per_gpu": 5616.39,
"total_tokens": 6476080
},
{
"epoch": 0.5737051792828686,
"grad_norm": 2.40625,
"learning_rate": 1.9498941466728456e-06,
"loss": 1.2285,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 36,
"tokens_per_second_per_gpu": 5446.88,
"total_tokens": 6654173
},
{
"epoch": 0.5896414342629482,
"grad_norm": 2.1875,
"learning_rate": 1.9438833303083674e-06,
"loss": 1.2314,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 37,
"tokens_per_second_per_gpu": 5411.99,
"total_tokens": 6838496
},
{
"epoch": 0.6055776892430279,
"grad_norm": 2.140625,
"learning_rate": 1.937542458543999e-06,
"loss": 1.1685,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 38,
"tokens_per_second_per_gpu": 5285.57,
"total_tokens": 7020101
},
{
"epoch": 0.6215139442231076,
"grad_norm": 2.09375,
"learning_rate": 1.930873748644204e-06,
"loss": 1.1489,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 39,
"tokens_per_second_per_gpu": 5909.97,
"total_tokens": 7209728
},
{
"epoch": 0.6374501992031872,
"grad_norm": 2.21875,
"learning_rate": 1.9238795325112867e-06,
"loss": 1.1924,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 40,
"tokens_per_second_per_gpu": 5668.7,
"total_tokens": 7392542
},
{
"epoch": 0.6533864541832669,
"grad_norm": 2.1875,
"learning_rate": 1.916562255869976e-06,
"loss": 1.21,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 41,
"tokens_per_second_per_gpu": 5187.81,
"total_tokens": 7570380
},
{
"epoch": 0.6693227091633466,
"grad_norm": 2.03125,
"learning_rate": 1.908924477412211e-06,
"loss": 1.1787,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 42,
"tokens_per_second_per_gpu": 5502.78,
"total_tokens": 7753287
},
{
"epoch": 0.6852589641434262,
"grad_norm": 1.90625,
"learning_rate": 1.9009688679024189e-06,
"loss": 1.1504,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 43,
"tokens_per_second_per_gpu": 5664.86,
"total_tokens": 7942863
},
{
"epoch": 0.701195219123506,
"grad_norm": 1.9921875,
"learning_rate": 1.8926982092436114e-06,
"loss": 1.2075,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 44,
"tokens_per_second_per_gpu": 5493.62,
"total_tokens": 8127398
},
{
"epoch": 0.7171314741035857,
"grad_norm": 1.9140625,
"learning_rate": 1.8841153935046096e-06,
"loss": 1.1484,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 45,
"tokens_per_second_per_gpu": 5847.74,
"total_tokens": 8312603
},
{
"epoch": 0.7330677290836654,
"grad_norm": 1.90625,
"learning_rate": 1.8752234219087537e-06,
"loss": 1.167,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 46,
"tokens_per_second_per_gpu": 5502.76,
"total_tokens": 8497383
},
{
"epoch": 0.749003984063745,
"grad_norm": 1.8828125,
"learning_rate": 1.8660254037844386e-06,
"loss": 1.147,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 47,
"tokens_per_second_per_gpu": 5615.32,
"total_tokens": 8688166
},
{
"epoch": 0.7649402390438247,
"grad_norm": 1.9453125,
"learning_rate": 1.8565245554778515e-06,
"loss": 1.1992,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 48,
"tokens_per_second_per_gpu": 5767.9,
"total_tokens": 8874386
},
{
"epoch": 0.7808764940239044,
"grad_norm": 1.90625,
"learning_rate": 1.8467241992282841e-06,
"loss": 1.1475,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 49,
"tokens_per_second_per_gpu": 5789.02,
"total_tokens": 9059229
},
{
"epoch": 0.796812749003984,
"grad_norm": 1.75,
"learning_rate": 1.8366277620064197e-06,
"loss": 1.0986,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 50,
"tokens_per_second_per_gpu": 5192.74,
"total_tokens": 9255532
},
{
"epoch": 0.8127490039840638,
"grad_norm": 1.7578125,
"learning_rate": 1.8262387743159948e-06,
"loss": 1.0908,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 51,
"tokens_per_second_per_gpu": 5522.43,
"total_tokens": 9446438
},
{
"epoch": 0.8286852589641435,
"grad_norm": 1.7109375,
"learning_rate": 1.8155608689592601e-06,
"loss": 1.1084,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 52,
"tokens_per_second_per_gpu": 5467.0,
"total_tokens": 9632510
},
{
"epoch": 0.8446215139442231,
"grad_norm": 1.6171875,
"learning_rate": 1.8045977797666683e-06,
"loss": 1.0781,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 53,
"tokens_per_second_per_gpu": 5831.19,
"total_tokens": 9824144
},
{
"epoch": 0.8605577689243028,
"grad_norm": 1.78125,
"learning_rate": 1.7933533402912351e-06,
"loss": 1.1533,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 54,
"tokens_per_second_per_gpu": 5705.42,
"total_tokens": 10010140
},
{
"epoch": 0.8764940239043825,
"grad_norm": 1.7734375,
"learning_rate": 1.7818314824680298e-06,
"loss": 1.1689,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 55,
"tokens_per_second_per_gpu": 5696.79,
"total_tokens": 10195855
},
{
"epoch": 0.8924302788844621,
"grad_norm": 1.6796875,
"learning_rate": 1.770036235239263e-06,
"loss": 1.1182,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 56,
"tokens_per_second_per_gpu": 6045.85,
"total_tokens": 10381589
},
{
"epoch": 0.9083665338645418,
"grad_norm": 1.6875,
"learning_rate": 1.7579717231454529e-06,
"loss": 1.1738,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 57,
"tokens_per_second_per_gpu": 5658.53,
"total_tokens": 10573779
},
{
"epoch": 0.9243027888446215,
"grad_norm": 1.65625,
"learning_rate": 1.7456421648831654e-06,
"loss": 1.1553,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 58,
"tokens_per_second_per_gpu": 5601.69,
"total_tokens": 10762764
},
{
"epoch": 0.9402390438247012,
"grad_norm": 1.7109375,
"learning_rate": 1.733051871829826e-06,
"loss": 1.1416,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 59,
"tokens_per_second_per_gpu": 5873.82,
"total_tokens": 10948310
},
{
"epoch": 0.9561752988047809,
"grad_norm": 1.5859375,
"learning_rate": 1.7202052465361266e-06,
"loss": 1.1514,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 60,
"tokens_per_second_per_gpu": 5552.06,
"total_tokens": 11134379
},
{
"epoch": 0.9721115537848606,
"grad_norm": 1.5859375,
"learning_rate": 1.7071067811865474e-06,
"loss": 1.1243,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 61,
"tokens_per_second_per_gpu": 5016.43,
"total_tokens": 11320025
},
{
"epoch": 0.9880478087649402,
"grad_norm": 1.4921875,
"learning_rate": 1.6937610560285416e-06,
"loss": 1.0957,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 62,
"tokens_per_second_per_gpu": 5350.32,
"total_tokens": 11504140
},
{
"epoch": 1.0,
"grad_norm": 1.609375,
"learning_rate": 1.6801727377709191e-06,
"loss": 1.1582,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 63,
"tokens_per_second_per_gpu": 4173.22,
"total_tokens": 11627371
},
{
"epoch": 1.0159362549800797,
"grad_norm": 1.4609375,
"learning_rate": 1.6663465779520037e-06,
"loss": 1.0864,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 64,
"tokens_per_second_per_gpu": 5530.73,
"total_tokens": 11815331
},
{
"epoch": 1.0318725099601593,
"grad_norm": 1.6015625,
"learning_rate": 1.6522874112781212e-06,
"loss": 1.1445,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 65,
"tokens_per_second_per_gpu": 5941.29,
"total_tokens": 12007378
},
{
"epoch": 1.047808764940239,
"grad_norm": 1.546875,
"learning_rate": 1.6380001539330085e-06,
"loss": 1.209,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 66,
"tokens_per_second_per_gpu": 5598.11,
"total_tokens": 12193950
},
{
"epoch": 1.0637450199203187,
"grad_norm": 1.578125,
"learning_rate": 1.6234898018587336e-06,
"loss": 1.1421,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 67,
"tokens_per_second_per_gpu": 5671.24,
"total_tokens": 12382049
},
{
"epoch": 1.0796812749003983,
"grad_norm": 1.5,
"learning_rate": 1.6087614290087205e-06,
"loss": 1.1323,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 68,
"tokens_per_second_per_gpu": 5627.89,
"total_tokens": 12562455
},
{
"epoch": 1.095617529880478,
"grad_norm": 1.5859375,
"learning_rate": 1.5938201855735014e-06,
"loss": 1.2207,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 69,
"tokens_per_second_per_gpu": 5368.43,
"total_tokens": 12741408
},
{
"epoch": 1.1115537848605577,
"grad_norm": 1.46875,
"learning_rate": 1.578671296179806e-06,
"loss": 1.0659,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 70,
"tokens_per_second_per_gpu": 5466.98,
"total_tokens": 12929546
},
{
"epoch": 1.1274900398406373,
"grad_norm": 1.421875,
"learning_rate": 1.563320058063622e-06,
"loss": 1.1416,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 71,
"tokens_per_second_per_gpu": 5509.23,
"total_tokens": 13110713
},
{
"epoch": 1.1434262948207172,
"grad_norm": 1.34375,
"learning_rate": 1.5477718392178713e-06,
"loss": 1.1318,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 72,
"tokens_per_second_per_gpu": 5553.33,
"total_tokens": 13294703
},
{
"epoch": 1.159362549800797,
"grad_norm": 1.484375,
"learning_rate": 1.5320320765153365e-06,
"loss": 1.2129,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 73,
"tokens_per_second_per_gpu": 5457.5,
"total_tokens": 13476728
},
{
"epoch": 1.1752988047808766,
"grad_norm": 1.5,
"learning_rate": 1.5161062738075065e-06,
"loss": 1.2109,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 74,
"tokens_per_second_per_gpu": 5229.94,
"total_tokens": 13654224
},
{
"epoch": 1.1912350597609562,
"grad_norm": 1.3671875,
"learning_rate": 1.5e-06,
"loss": 1.1475,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 75,
"tokens_per_second_per_gpu": 5856.14,
"total_tokens": 13840656
},
{
"epoch": 1.207171314741036,
"grad_norm": 1.3984375,
"learning_rate": 1.4837188871052397e-06,
"loss": 1.1494,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 76,
"tokens_per_second_per_gpu": 5621.93,
"total_tokens": 14027447
},
{
"epoch": 1.2231075697211156,
"grad_norm": 1.3125,
"learning_rate": 1.467268628273062e-06,
"loss": 1.1133,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 77,
"tokens_per_second_per_gpu": 5678.9,
"total_tokens": 14220483
},
{
"epoch": 1.2390438247011952,
"grad_norm": 1.2734375,
"learning_rate": 1.4506549757999453e-06,
"loss": 1.1182,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 78,
"tokens_per_second_per_gpu": 5610.92,
"total_tokens": 14409698
},
{
"epoch": 1.254980079681275,
"grad_norm": 1.4296875,
"learning_rate": 1.433883739117558e-06,
"loss": 1.1582,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 79,
"tokens_per_second_per_gpu": 5624.06,
"total_tokens": 14594318
},
{
"epoch": 1.2709163346613546,
"grad_norm": 1.328125,
"learning_rate": 1.4169607827613282e-06,
"loss": 1.0762,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 80,
"tokens_per_second_per_gpu": 6081.02,
"total_tokens": 14788534
},
{
"epoch": 1.2868525896414342,
"grad_norm": 1.3125,
"learning_rate": 1.3998920243197408e-06,
"loss": 1.1372,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 81,
"tokens_per_second_per_gpu": 5511.47,
"total_tokens": 14970672
},
{
"epoch": 1.302788844621514,
"grad_norm": 1.3671875,
"learning_rate": 1.3826834323650898e-06,
"loss": 1.1167,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 82,
"tokens_per_second_per_gpu": 5614.12,
"total_tokens": 15154574
},
{
"epoch": 1.3187250996015936,
"grad_norm": 1.390625,
"learning_rate": 1.3653410243663951e-06,
"loss": 1.1455,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 83,
"tokens_per_second_per_gpu": 5438.76,
"total_tokens": 15337599
},
{
"epoch": 1.3346613545816732,
"grad_norm": 1.234375,
"learning_rate": 1.347870864585227e-06,
"loss": 1.103,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 84,
"tokens_per_second_per_gpu": 5795.88,
"total_tokens": 15530461
},
{
"epoch": 1.3505976095617531,
"grad_norm": 1.3515625,
"learning_rate": 1.3302790619551672e-06,
"loss": 1.1162,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 85,
"tokens_per_second_per_gpu": 5723.72,
"total_tokens": 15712584
},
{
"epoch": 1.3665338645418328,
"grad_norm": 1.2265625,
"learning_rate": 1.3125717679456444e-06,
"loss": 1.0786,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 86,
"tokens_per_second_per_gpu": 5598.4,
"total_tokens": 15899301
},
{
"epoch": 1.3824701195219125,
"grad_norm": 1.1875,
"learning_rate": 1.2947551744109043e-06,
"loss": 1.082,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 87,
"tokens_per_second_per_gpu": 5458.74,
"total_tokens": 16085653
},
{
"epoch": 1.3984063745019921,
"grad_norm": 1.3515625,
"learning_rate": 1.2768355114248492e-06,
"loss": 1.1436,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 88,
"tokens_per_second_per_gpu": 5579.84,
"total_tokens": 16266291
},
{
"epoch": 1.4143426294820718,
"grad_norm": 1.2734375,
"learning_rate": 1.2588190451025207e-06,
"loss": 1.1914,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 89,
"tokens_per_second_per_gpu": 5336.85,
"total_tokens": 16450627
},
{
"epoch": 1.4302788844621515,
"grad_norm": 1.25,
"learning_rate": 1.240712075408973e-06,
"loss": 1.167,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 90,
"tokens_per_second_per_gpu": 5445.98,
"total_tokens": 16631917
},
{
"epoch": 1.4462151394422311,
"grad_norm": 1.265625,
"learning_rate": 1.2225209339563143e-06,
"loss": 1.1895,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 91,
"tokens_per_second_per_gpu": 5330.92,
"total_tokens": 16808287
},
{
"epoch": 1.4621513944223108,
"grad_norm": 1.25,
"learning_rate": 1.2042519817896804e-06,
"loss": 1.1357,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 92,
"tokens_per_second_per_gpu": 5515.76,
"total_tokens": 16990241
},
{
"epoch": 1.4780876494023905,
"grad_norm": 1.328125,
"learning_rate": 1.1859116071629147e-06,
"loss": 1.1187,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 93,
"tokens_per_second_per_gpu": 5890.54,
"total_tokens": 17176063
},
{
"epoch": 1.4940239043824701,
"grad_norm": 1.2265625,
"learning_rate": 1.1675062233047363e-06,
"loss": 1.1211,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 94,
"tokens_per_second_per_gpu": 5413.95,
"total_tokens": 17361982
},
{
"epoch": 1.5099601593625498,
"grad_norm": 1.2734375,
"learning_rate": 1.1490422661761743e-06,
"loss": 1.1294,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 95,
"tokens_per_second_per_gpu": 5447.34,
"total_tokens": 17542854
},
{
"epoch": 1.5258964143426295,
"grad_norm": 1.1875,
"learning_rate": 1.1305261922200517e-06,
"loss": 1.125,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 96,
"tokens_per_second_per_gpu": 5531.41,
"total_tokens": 17731258
},
{
"epoch": 1.5418326693227091,
"grad_norm": 1.2109375,
"learning_rate": 1.1119644761033077e-06,
"loss": 1.1963,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 97,
"tokens_per_second_per_gpu": 5812.79,
"total_tokens": 17918424
},
{
"epoch": 1.5577689243027888,
"grad_norm": 1.203125,
"learning_rate": 1.0933636084529506e-06,
"loss": 1.2041,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 98,
"tokens_per_second_per_gpu": 5613.31,
"total_tokens": 18103451
},
{
"epoch": 1.5737051792828685,
"grad_norm": 1.2578125,
"learning_rate": 1.0747300935864243e-06,
"loss": 1.168,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 99,
"tokens_per_second_per_gpu": 5555.4,
"total_tokens": 18281544
},
{
"epoch": 1.5896414342629481,
"grad_norm": 1.171875,
"learning_rate": 1.0560704472371917e-06,
"loss": 1.1768,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 100,
"tokens_per_second_per_gpu": 5413.52,
"total_tokens": 18465867
},
{
"epoch": 1.6055776892430278,
"grad_norm": 1.15625,
"learning_rate": 1.037391194276326e-06,
"loss": 1.1162,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 101,
"tokens_per_second_per_gpu": 5209.56,
"total_tokens": 18647472
},
{
"epoch": 1.6215139442231075,
"grad_norm": 1.1328125,
"learning_rate": 1.0186988664309022e-06,
"loss": 1.0986,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 102,
"tokens_per_second_per_gpu": 5913.8,
"total_tokens": 18837099
},
{
"epoch": 1.6374501992031871,
"grad_norm": 1.21875,
"learning_rate": 1e-06,
"loss": 1.1392,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 103,
"tokens_per_second_per_gpu": 5669.98,
"total_tokens": 19019913
},
{
"epoch": 1.6533864541832668,
"grad_norm": 1.2109375,
"learning_rate": 9.81301133569098e-07,
"loss": 1.1582,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 104,
"tokens_per_second_per_gpu": 5370.41,
"total_tokens": 19197751
},
{
"epoch": 1.6693227091633465,
"grad_norm": 1.1796875,
"learning_rate": 9.626088057236744e-07,
"loss": 1.1318,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 105,
"tokens_per_second_per_gpu": 5391.77,
"total_tokens": 19380658
},
{
"epoch": 1.6852589641434261,
"grad_norm": 1.09375,
"learning_rate": 9.43929552762808e-07,
"loss": 1.1084,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 106,
"tokens_per_second_per_gpu": 5564.12,
"total_tokens": 19570234
},
{
"epoch": 1.701195219123506,
"grad_norm": 1.1640625,
"learning_rate": 9.252699064135758e-07,
"loss": 1.1616,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 107,
"tokens_per_second_per_gpu": 5485.18,
"total_tokens": 19754769
},
{
"epoch": 1.7171314741035857,
"grad_norm": 1.1328125,
"learning_rate": 9.066363915470494e-07,
"loss": 1.106,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 108,
"tokens_per_second_per_gpu": 5839.07,
"total_tokens": 19939974
},
{
"epoch": 1.7330677290836654,
"grad_norm": 1.1328125,
"learning_rate": 8.880355238966921e-07,
"loss": 1.125,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 109,
"tokens_per_second_per_gpu": 5601.59,
"total_tokens": 20124754
},
{
"epoch": 1.749003984063745,
"grad_norm": 1.15625,
"learning_rate": 8.694738077799486e-07,
"loss": 1.1084,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 110,
"tokens_per_second_per_gpu": 5601.31,
"total_tokens": 20315537
},
{
"epoch": 1.7649402390438247,
"grad_norm": 1.1640625,
"learning_rate": 8.509577338238254e-07,
"loss": 1.1602,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 111,
"tokens_per_second_per_gpu": 5767.0,
"total_tokens": 20501757
},
{
"epoch": 1.7808764940239044,
"grad_norm": 1.1796875,
"learning_rate": 8.324937766952636e-07,
"loss": 1.1094,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 112,
"tokens_per_second_per_gpu": 5679.05,
"total_tokens": 20686600
},
{
"epoch": 1.796812749003984,
"grad_norm": 1.1015625,
"learning_rate": 8.140883928370854e-07,
"loss": 1.0659,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 113,
"tokens_per_second_per_gpu": 5176.17,
"total_tokens": 20882903
},
{
"epoch": 1.812749003984064,
"grad_norm": 1.109375,
"learning_rate": 7.957480182103197e-07,
"loss": 1.0562,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 114,
"tokens_per_second_per_gpu": 5609.67,
"total_tokens": 21073809
},
{
"epoch": 1.8286852589641436,
"grad_norm": 1.09375,
"learning_rate": 7.774790660436857e-07,
"loss": 1.0747,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 115,
"tokens_per_second_per_gpu": 5480.8,
"total_tokens": 21259881
},
{
"epoch": 1.8446215139442232,
"grad_norm": 1.0390625,
"learning_rate": 7.592879245910272e-07,
"loss": 1.0459,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 116,
"tokens_per_second_per_gpu": 5854.74,
"total_tokens": 21451515
},
{
"epoch": 1.860557768924303,
"grad_norm": 1.1484375,
"learning_rate": 7.411809548974791e-07,
"loss": 1.1191,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 117,
"tokens_per_second_per_gpu": 5607.88,
"total_tokens": 21637511
},
{
"epoch": 1.8764940239043826,
"grad_norm": 1.1328125,
"learning_rate": 7.231644885751507e-07,
"loss": 1.1377,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 118,
"tokens_per_second_per_gpu": 5697.72,
"total_tokens": 21823226
},
{
"epoch": 1.8924302788844622,
"grad_norm": 1.1171875,
"learning_rate": 7.052448255890957e-07,
"loss": 1.0869,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 119,
"tokens_per_second_per_gpu": 6034.6,
"total_tokens": 22008960
},
{
"epoch": 1.908366533864542,
"grad_norm": 1.1171875,
"learning_rate": 6.874282320543556e-07,
"loss": 1.144,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 120,
"tokens_per_second_per_gpu": 5668.36,
"total_tokens": 22201150
},
{
"epoch": 1.9243027888446216,
"grad_norm": 1.0859375,
"learning_rate": 6.697209380448332e-07,
"loss": 1.1255,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 121,
"tokens_per_second_per_gpu": 5720.74,
"total_tokens": 22390135
},
{
"epoch": 1.9402390438247012,
"grad_norm": 1.15625,
"learning_rate": 6.521291354147726e-07,
"loss": 1.1104,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 122,
"tokens_per_second_per_gpu": 5870.28,
"total_tokens": 22575681
},
{
"epoch": 1.956175298804781,
"grad_norm": 1.0625,
"learning_rate": 6.34658975633605e-07,
"loss": 1.1221,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 123,
"tokens_per_second_per_gpu": 5453.27,
"total_tokens": 22761750
},
{
"epoch": 1.9721115537848606,
"grad_norm": 1.0859375,
"learning_rate": 6.173165676349102e-07,
"loss": 1.0967,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 124,
"tokens_per_second_per_gpu": 4989.67,
"total_tokens": 22947396
},
{
"epoch": 1.9880478087649402,
"grad_norm": 1.0390625,
"learning_rate": 6.001079756802592e-07,
"loss": 1.0703,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 125,
"tokens_per_second_per_gpu": 5385.24,
"total_tokens": 23131511
},
{
"epoch": 2.0,
"grad_norm": 1.140625,
"learning_rate": 5.830392172386722e-07,
"loss": 1.1328,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 126,
"tokens_per_second_per_gpu": 4540.27,
"total_tokens": 23254742
},
{
"epoch": 2.0159362549800797,
"grad_norm": 1.03125,
"learning_rate": 5.661162608824419e-07,
"loss": 1.061,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 127,
"tokens_per_second_per_gpu": 5523.13,
"total_tokens": 23442702
},
{
"epoch": 2.0318725099601593,
"grad_norm": 1.140625,
"learning_rate": 5.493450242000546e-07,
"loss": 1.1201,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 128,
"tokens_per_second_per_gpu": 5699.3,
"total_tokens": 23634749
},
{
"epoch": 2.047808764940239,
"grad_norm": 1.109375,
"learning_rate": 5.327313717269379e-07,
"loss": 1.1875,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 129,
"tokens_per_second_per_gpu": 5607.25,
"total_tokens": 23821321
},
{
"epoch": 2.0637450199203187,
"grad_norm": 1.15625,
"learning_rate": 5.162811128947602e-07,
"loss": 1.1191,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 130,
"tokens_per_second_per_gpu": 5781.65,
"total_tokens": 24009420
},
{
"epoch": 2.0796812749003983,
"grad_norm": 1.1015625,
"learning_rate": 5.000000000000002e-07,
"loss": 1.1104,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 131,
"tokens_per_second_per_gpu": 5505.55,
"total_tokens": 24189826
},
{
"epoch": 2.095617529880478,
"grad_norm": 1.1796875,
"learning_rate": 4.838937261924933e-07,
"loss": 1.1973,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 132,
"tokens_per_second_per_gpu": 5512.31,
"total_tokens": 24368779
},
{
"epoch": 2.1115537848605577,
"grad_norm": 1.1015625,
"learning_rate": 4.6796792348466353e-07,
"loss": 1.0444,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 133,
"tokens_per_second_per_gpu": 5465.08,
"total_tokens": 24556917
},
{
"epoch": 2.1274900398406373,
"grad_norm": 1.1015625,
"learning_rate": 4.522281607821288e-07,
"loss": 1.1206,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 134,
"tokens_per_second_per_gpu": 5419.8,
"total_tokens": 24738084
},
{
"epoch": 2.143426294820717,
"grad_norm": 1.03125,
"learning_rate": 4.366799419363779e-07,
"loss": 1.1143,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 135,
"tokens_per_second_per_gpu": 5539.41,
"total_tokens": 24922074
},
{
"epoch": 2.1593625498007967,
"grad_norm": 1.1328125,
"learning_rate": 4.2132870382019427e-07,
"loss": 1.1924,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 136,
"tokens_per_second_per_gpu": 5444.42,
"total_tokens": 25104099
},
{
"epoch": 2.1752988047808763,
"grad_norm": 1.1328125,
"learning_rate": 4.061798144264985e-07,
"loss": 1.1885,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 137,
"tokens_per_second_per_gpu": 5332.72,
"total_tokens": 25281595
},
{
"epoch": 2.191235059760956,
"grad_norm": 1.0703125,
"learning_rate": 3.912385709912793e-07,
"loss": 1.1318,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 138,
"tokens_per_second_per_gpu": 5857.46,
"total_tokens": 25468027
},
{
"epoch": 2.2071713147410357,
"grad_norm": 1.1015625,
"learning_rate": 3.765101981412665e-07,
"loss": 1.1328,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 139,
"tokens_per_second_per_gpu": 5613.35,
"total_tokens": 25654818
},
{
"epoch": 2.2231075697211153,
"grad_norm": 1.0390625,
"learning_rate": 3.6199984606699153e-07,
"loss": 1.0981,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 140,
"tokens_per_second_per_gpu": 5586.6,
"total_tokens": 25847854
},
{
"epoch": 2.239043824701195,
"grad_norm": 1.015625,
"learning_rate": 3.477125887218791e-07,
"loss": 1.105,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 141,
"tokens_per_second_per_gpu": 5723.95,
"total_tokens": 26037069
},
{
"epoch": 2.2549800796812747,
"grad_norm": 1.15625,
"learning_rate": 3.3365342204799606e-07,
"loss": 1.1416,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 142,
"tokens_per_second_per_gpu": 5609.66,
"total_tokens": 26221689
},
{
"epoch": 2.2709163346613543,
"grad_norm": 1.0703125,
"learning_rate": 3.198272622290804e-07,
"loss": 1.0625,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 143,
"tokens_per_second_per_gpu": 6065.87,
"total_tokens": 26415905
},
{
"epoch": 2.2868525896414345,
"grad_norm": 1.0703125,
"learning_rate": 3.0623894397145833e-07,
"loss": 1.123,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 144,
"tokens_per_second_per_gpu": 5513.26,
"total_tokens": 26598043
},
{
"epoch": 2.302788844621514,
"grad_norm": 1.078125,
"learning_rate": 2.9289321881345254e-07,
"loss": 1.103,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 145,
"tokens_per_second_per_gpu": 5505.93,
"total_tokens": 26781945
},
{
"epoch": 2.318725099601594,
"grad_norm": 1.140625,
"learning_rate": 2.797947534638736e-07,
"loss": 1.1348,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 146,
"tokens_per_second_per_gpu": 5529.92,
"total_tokens": 26964970
},
{
"epoch": 2.3346613545816735,
"grad_norm": 1.0234375,
"learning_rate": 2.6694812817017387e-07,
"loss": 1.0938,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 147,
"tokens_per_second_per_gpu": 5703.33,
"total_tokens": 27157832
},
{
"epoch": 2.350597609561753,
"grad_norm": 1.1640625,
"learning_rate": 2.543578351168344e-07,
"loss": 1.1045,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 148,
"tokens_per_second_per_gpu": 5833.69,
"total_tokens": 27339955
},
{
"epoch": 2.366533864541833,
"grad_norm": 1.03125,
"learning_rate": 2.4202827685454687e-07,
"loss": 1.0674,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 149,
"tokens_per_second_per_gpu": 5586.43,
"total_tokens": 27526672
},
{
"epoch": 2.3824701195219125,
"grad_norm": 1.03125,
"learning_rate": 2.299637647607372e-07,
"loss": 1.0728,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 150,
"tokens_per_second_per_gpu": 5369.71,
"total_tokens": 27713024
},
{
"epoch": 2.398406374501992,
"grad_norm": 1.1640625,
"learning_rate": 2.181685175319702e-07,
"loss": 1.1318,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 151,
"tokens_per_second_per_gpu": 5560.0,
"total_tokens": 27893662
},
{
"epoch": 2.414342629482072,
"grad_norm": 1.1015625,
"learning_rate": 2.0664665970876495e-07,
"loss": 1.1807,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 152,
"tokens_per_second_per_gpu": 5346.34,
"total_tokens": 28077998
},
{
"epoch": 2.4302788844621515,
"grad_norm": 1.078125,
"learning_rate": 1.9540222023333163e-07,
"loss": 1.1572,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 153,
"tokens_per_second_per_gpu": 5554.36,
"total_tokens": 28259288
},
{
"epoch": 2.446215139442231,
"grad_norm": 1.09375,
"learning_rate": 1.8443913104073982e-07,
"loss": 1.1807,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 154,
"tokens_per_second_per_gpu": 5321.03,
"total_tokens": 28435658
},
{
"epoch": 2.462151394422311,
"grad_norm": 1.1015625,
"learning_rate": 1.737612256840053e-07,
"loss": 1.127,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 155,
"tokens_per_second_per_gpu": 5503.74,
"total_tokens": 28617612
},
{
"epoch": 2.4780876494023905,
"grad_norm": 1.1640625,
"learning_rate": 1.6337223799358024e-07,
"loss": 1.1099,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 156,
"tokens_per_second_per_gpu": 5737.94,
"total_tokens": 28803434
},
{
"epoch": 2.49402390438247,
"grad_norm": 1.0859375,
"learning_rate": 1.5327580077171588e-07,
"loss": 1.1143,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 157,
"tokens_per_second_per_gpu": 5419.26,
"total_tokens": 28989353
},
{
"epoch": 2.50996015936255,
"grad_norm": 1.140625,
"learning_rate": 1.4347544452214867e-07,
"loss": 1.1216,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 158,
"tokens_per_second_per_gpu": 5564.03,
"total_tokens": 29170225
},
{
"epoch": 2.5258964143426295,
"grad_norm": 1.0703125,
"learning_rate": 1.3397459621556128e-07,
"loss": 1.1182,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 159,
"tokens_per_second_per_gpu": 5533.6,
"total_tokens": 29358629
},
{
"epoch": 2.541832669322709,
"grad_norm": 1.09375,
"learning_rate": 1.2477657809124632e-07,
"loss": 1.1895,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 160,
"tokens_per_second_per_gpu": 5801.01,
"total_tokens": 29545795
},
{
"epoch": 2.557768924302789,
"grad_norm": 1.09375,
"learning_rate": 1.1588460649539034e-07,
"loss": 1.1973,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 161,
"tokens_per_second_per_gpu": 5496.74,
"total_tokens": 29730822
},
{
"epoch": 2.5737051792828685,
"grad_norm": 1.1484375,
"learning_rate": 1.0730179075638868e-07,
"loss": 1.1611,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 162,
"tokens_per_second_per_gpu": 5661.86,
"total_tokens": 29908915
},
{
"epoch": 2.589641434262948,
"grad_norm": 1.078125,
"learning_rate": 9.903113209758096e-08,
"loss": 1.1709,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 163,
"tokens_per_second_per_gpu": 5334.67,
"total_tokens": 30093238
},
{
"epoch": 2.605577689243028,
"grad_norm": 1.0703125,
"learning_rate": 9.107552258778905e-08,
"loss": 1.1099,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 164,
"tokens_per_second_per_gpu": 5276.84,
"total_tokens": 30274843
},
{
"epoch": 2.6215139442231075,
"grad_norm": 1.046875,
"learning_rate": 8.34377441300238e-08,
"loss": 1.0933,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 165,
"tokens_per_second_per_gpu": 5920.44,
"total_tokens": 30464470
},
{
"epoch": 2.637450199203187,
"grad_norm": 1.1328125,
"learning_rate": 7.612046748871326e-08,
"loss": 1.1348,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 166,
"tokens_per_second_per_gpu": 5562.69,
"total_tokens": 30647284
},
{
"epoch": 2.653386454183267,
"grad_norm": 1.1328125,
"learning_rate": 6.912625135579586e-08,
"loss": 1.1553,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 167,
"tokens_per_second_per_gpu": 5381.44,
"total_tokens": 30825122
},
{
"epoch": 2.6693227091633465,
"grad_norm": 1.078125,
"learning_rate": 6.245754145600091e-08,
"loss": 1.126,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 168,
"tokens_per_second_per_gpu": 5392.77,
"total_tokens": 31008029
},
{
"epoch": 2.685258964143426,
"grad_norm": 1.03125,
"learning_rate": 5.611666969163242e-08,
"loss": 1.1025,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 169,
"tokens_per_second_per_gpu": 5636.12,
"total_tokens": 31197605
},
{
"epoch": 2.7011952191235062,
"grad_norm": 1.09375,
"learning_rate": 5.0105853327154004e-08,
"loss": 1.1572,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 170,
"tokens_per_second_per_gpu": 5442.72,
"total_tokens": 31382140
},
{
"epoch": 2.717131474103586,
"grad_norm": 1.0703125,
"learning_rate": 4.442719421385921e-08,
"loss": 1.1011,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 171,
"tokens_per_second_per_gpu": 5829.11,
"total_tokens": 31567345
},
{
"epoch": 2.7330677290836656,
"grad_norm": 1.078125,
"learning_rate": 3.908267805490051e-08,
"loss": 1.123,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 172,
"tokens_per_second_per_gpu": 5509.97,
"total_tokens": 31752125
},
{
"epoch": 2.7490039840637452,
"grad_norm": 1.1015625,
"learning_rate": 3.4074173710931796e-08,
"loss": 1.105,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 173,
"tokens_per_second_per_gpu": 5609.04,
"total_tokens": 31942908
},
{
"epoch": 2.764940239043825,
"grad_norm": 1.109375,
"learning_rate": 2.9403432546609043e-08,
"loss": 1.1533,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 174,
"tokens_per_second_per_gpu": 5753.07,
"total_tokens": 32129128
},
{
"epoch": 2.7808764940239046,
"grad_norm": 1.1328125,
"learning_rate": 2.507208781817638e-08,
"loss": 1.1074,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 175,
"tokens_per_second_per_gpu": 5763.07,
"total_tokens": 32313971
},
{
"epoch": 2.7968127490039842,
"grad_norm": 1.0625,
"learning_rate": 2.1081654102351632e-08,
"loss": 1.0635,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 176,
"tokens_per_second_per_gpu": 5200.76,
"total_tokens": 32510274
},
{
"epoch": 2.812749003984064,
"grad_norm": 1.078125,
"learning_rate": 1.7433526766711725e-08,
"loss": 1.0547,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 177,
"tokens_per_second_per_gpu": 5511.76,
"total_tokens": 32701180
},
{
"epoch": 2.8286852589641436,
"grad_norm": 1.0546875,
"learning_rate": 1.4128981481764113e-08,
"loss": 1.0728,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 178,
"tokens_per_second_per_gpu": 5453.23,
"total_tokens": 32887252
},
{
"epoch": 2.8446215139442232,
"grad_norm": 1.0,
"learning_rate": 1.1169173774871477e-08,
"loss": 1.0454,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 179,
"tokens_per_second_per_gpu": 5831.37,
"total_tokens": 33078886
},
{
"epoch": 2.860557768924303,
"grad_norm": 1.109375,
"learning_rate": 8.555138626189618e-09,
"loss": 1.1182,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 180,
"tokens_per_second_per_gpu": 5712.45,
"total_tokens": 33264882
},
{
"epoch": 2.8764940239043826,
"grad_norm": 1.09375,
"learning_rate": 6.2877901067573955e-09,
"loss": 1.1357,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 181,
"tokens_per_second_per_gpu": 5709.15,
"total_tokens": 33450597
},
{
"epoch": 2.8924302788844622,
"grad_norm": 1.1015625,
"learning_rate": 4.367921058866186e-09,
"loss": 1.085,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 182,
"tokens_per_second_per_gpu": 5923.67,
"total_tokens": 33636331
},
{
"epoch": 2.908366533864542,
"grad_norm": 1.0859375,
"learning_rate": 2.7962028188198706e-09,
"loss": 1.1416,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 183,
"tokens_per_second_per_gpu": 5643.13,
"total_tokens": 33828521
},
{
"epoch": 2.9243027888446216,
"grad_norm": 1.0625,
"learning_rate": 1.5731849821833953e-09,
"loss": 1.1255,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 184,
"tokens_per_second_per_gpu": 5703.08,
"total_tokens": 34017506
},
{
"epoch": 2.9402390438247012,
"grad_norm": 1.1328125,
"learning_rate": 6.992952116013917e-10,
"loss": 1.1094,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 185,
"tokens_per_second_per_gpu": 5872.3,
"total_tokens": 34203052
},
{
"epoch": 2.956175298804781,
"grad_norm": 1.046875,
"learning_rate": 1.7483908725357543e-10,
"loss": 1.1221,
"memory/device_reserved (GiB)": 77.63,
"memory/max_active (GiB)": 65.77,
"memory/max_allocated (GiB)": 65.77,
"step": 186,
"tokens_per_second_per_gpu": 5556.53,
"total_tokens": 34389121
}
],
"logging_steps": 1,
"max_steps": 186,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 62,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.3756016575819284e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}