Text Generation
Transformers
Safetensors
qwen3
conversational
text-generation-inference
Unicorn-R3 / trainer_state.json
rin2401's picture
Upload folder using huggingface_hub
e99638b verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.956175298804781,
"eval_steps": 500,
"global_step": 186,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01593625498007968,
"grad_norm": 4.5625,
"learning_rate": 0.0,
"loss": 1.4053,
"memory/device_reserved (GiB)": 61.34,
"memory/max_active (GiB)": 49.6,
"memory/max_allocated (GiB)": 49.6,
"step": 1,
"tokens_per_second_per_gpu": 4706.79,
"total_tokens": 180518
},
{
"epoch": 0.03187250996015936,
"grad_norm": 4.34375,
"learning_rate": 1.111111111111111e-06,
"loss": 1.3369,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 2,
"tokens_per_second_per_gpu": 5826.72,
"total_tokens": 363757
},
{
"epoch": 0.04780876494023904,
"grad_norm": 4.15625,
"learning_rate": 2.222222222222222e-06,
"loss": 1.3623,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 3,
"tokens_per_second_per_gpu": 5939.96,
"total_tokens": 558043
},
{
"epoch": 0.06374501992031872,
"grad_norm": 4.34375,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.3643,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 4,
"tokens_per_second_per_gpu": 5941.47,
"total_tokens": 743276
},
{
"epoch": 0.0796812749003984,
"grad_norm": 3.90625,
"learning_rate": 4.444444444444444e-06,
"loss": 1.2998,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 5,
"tokens_per_second_per_gpu": 5380.38,
"total_tokens": 929761
},
{
"epoch": 0.09561752988047809,
"grad_norm": 3.546875,
"learning_rate": 5.555555555555557e-06,
"loss": 1.3018,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 6,
"tokens_per_second_per_gpu": 5949.69,
"total_tokens": 1118316
},
{
"epoch": 0.11155378486055777,
"grad_norm": 3.171875,
"learning_rate": 6.666666666666667e-06,
"loss": 1.2793,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 7,
"tokens_per_second_per_gpu": 5785.23,
"total_tokens": 1301615
},
{
"epoch": 0.12749003984063745,
"grad_norm": 2.96875,
"learning_rate": 7.77777777777778e-06,
"loss": 1.3115,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 8,
"tokens_per_second_per_gpu": 5941.21,
"total_tokens": 1490474
},
{
"epoch": 0.14342629482071714,
"grad_norm": 2.296875,
"learning_rate": 8.888888888888888e-06,
"loss": 1.2588,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 9,
"tokens_per_second_per_gpu": 5534.49,
"total_tokens": 1667576
},
{
"epoch": 0.1593625498007968,
"grad_norm": 1.5625,
"learning_rate": 1e-05,
"loss": 1.1992,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 10,
"tokens_per_second_per_gpu": 6154.87,
"total_tokens": 1857807
},
{
"epoch": 0.1752988047808765,
"grad_norm": 1.0703125,
"learning_rate": 1.1111111111111113e-05,
"loss": 1.1436,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 11,
"tokens_per_second_per_gpu": 5715.6,
"total_tokens": 2041489
},
{
"epoch": 0.19123505976095617,
"grad_norm": 0.9765625,
"learning_rate": 1.2222222222222224e-05,
"loss": 1.2402,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 12,
"tokens_per_second_per_gpu": 5749.37,
"total_tokens": 2216014
},
{
"epoch": 0.20717131474103587,
"grad_norm": 0.9609375,
"learning_rate": 1.3333333333333333e-05,
"loss": 1.2051,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 13,
"tokens_per_second_per_gpu": 5748.94,
"total_tokens": 2397131
},
{
"epoch": 0.22310756972111553,
"grad_norm": 0.921875,
"learning_rate": 1.4444444444444446e-05,
"loss": 1.1211,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 14,
"tokens_per_second_per_gpu": 6171.1,
"total_tokens": 2590472
},
{
"epoch": 0.23904382470119523,
"grad_norm": 0.8984375,
"learning_rate": 1.555555555555556e-05,
"loss": 1.1777,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 15,
"tokens_per_second_per_gpu": 6160.5,
"total_tokens": 2780711
},
{
"epoch": 0.2549800796812749,
"grad_norm": 0.80078125,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.1025,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 16,
"tokens_per_second_per_gpu": 5706.58,
"total_tokens": 2968588
},
{
"epoch": 0.27091633466135456,
"grad_norm": 0.65234375,
"learning_rate": 1.7777777777777777e-05,
"loss": 1.2041,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 17,
"tokens_per_second_per_gpu": 5569.19,
"total_tokens": 3148691
},
{
"epoch": 0.2868525896414343,
"grad_norm": 0.59765625,
"learning_rate": 1.888888888888889e-05,
"loss": 1.168,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 18,
"tokens_per_second_per_gpu": 5894.91,
"total_tokens": 3332398
},
{
"epoch": 0.30278884462151395,
"grad_norm": 0.5625,
"learning_rate": 2e-05,
"loss": 1.0977,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 19,
"tokens_per_second_per_gpu": 6092.09,
"total_tokens": 3526610
},
{
"epoch": 0.3187250996015936,
"grad_norm": 0.54296875,
"learning_rate": 1.9998251609127465e-05,
"loss": 1.1372,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 20,
"tokens_per_second_per_gpu": 5971.19,
"total_tokens": 3711042
},
{
"epoch": 0.3346613545816733,
"grad_norm": 0.5078125,
"learning_rate": 1.9993007047883988e-05,
"loss": 1.0659,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 21,
"tokens_per_second_per_gpu": 5750.71,
"total_tokens": 3890841
},
{
"epoch": 0.350597609561753,
"grad_norm": 0.50390625,
"learning_rate": 1.998426815017817e-05,
"loss": 1.124,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 22,
"tokens_per_second_per_gpu": 5968.21,
"total_tokens": 4074024
},
{
"epoch": 0.3665338645418327,
"grad_norm": 0.4609375,
"learning_rate": 1.9972037971811802e-05,
"loss": 1.064,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 23,
"tokens_per_second_per_gpu": 5672.93,
"total_tokens": 4261426
},
{
"epoch": 0.38247011952191234,
"grad_norm": 0.458984375,
"learning_rate": 1.9956320789411338e-05,
"loss": 1.0977,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 24,
"tokens_per_second_per_gpu": 5947.63,
"total_tokens": 4448221
},
{
"epoch": 0.398406374501992,
"grad_norm": 0.400390625,
"learning_rate": 1.9937122098932428e-05,
"loss": 0.9438,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 25,
"tokens_per_second_per_gpu": 5830.3,
"total_tokens": 4643418
},
{
"epoch": 0.41434262948207173,
"grad_norm": 0.451171875,
"learning_rate": 1.9914448613738107e-05,
"loss": 1.0786,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 26,
"tokens_per_second_per_gpu": 5753.23,
"total_tokens": 4826564
},
{
"epoch": 0.4302788844621514,
"grad_norm": 0.41796875,
"learning_rate": 1.9888308262251286e-05,
"loss": 1.1084,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 27,
"tokens_per_second_per_gpu": 5786.21,
"total_tokens": 5008617
},
{
"epoch": 0.44621513944223107,
"grad_norm": 0.392578125,
"learning_rate": 1.985871018518236e-05,
"loss": 1.0488,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 28,
"tokens_per_second_per_gpu": 5935.98,
"total_tokens": 5194550
},
{
"epoch": 0.46215139442231074,
"grad_norm": 0.37109375,
"learning_rate": 1.9825664732332886e-05,
"loss": 1.0894,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 29,
"tokens_per_second_per_gpu": 5927.93,
"total_tokens": 5380376
},
{
"epoch": 0.47808764940239046,
"grad_norm": 0.35546875,
"learning_rate": 1.9789183458976485e-05,
"loss": 1.0869,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 30,
"tokens_per_second_per_gpu": 6097.05,
"total_tokens": 5567310
},
{
"epoch": 0.4940239043824701,
"grad_norm": 0.37109375,
"learning_rate": 1.9749279121818235e-05,
"loss": 1.0181,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 31,
"tokens_per_second_per_gpu": 6055.18,
"total_tokens": 5750982
},
{
"epoch": 0.5099601593625498,
"grad_norm": 0.380859375,
"learning_rate": 1.970596567453391e-05,
"loss": 1.0552,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 32,
"tokens_per_second_per_gpu": 6008.39,
"total_tokens": 5937332
},
{
"epoch": 0.5258964143426295,
"grad_norm": 0.376953125,
"learning_rate": 1.9659258262890683e-05,
"loss": 1.0439,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 33,
"tokens_per_second_per_gpu": 6030.07,
"total_tokens": 6120851
},
{
"epoch": 0.5418326693227091,
"grad_norm": 0.369140625,
"learning_rate": 1.9609173219450998e-05,
"loss": 1.0835,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 34,
"tokens_per_second_per_gpu": 5726.88,
"total_tokens": 6297402
},
{
"epoch": 0.5577689243027888,
"grad_norm": 0.396484375,
"learning_rate": 1.955572805786141e-05,
"loss": 1.1074,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 35,
"tokens_per_second_per_gpu": 5816.12,
"total_tokens": 6480316
},
{
"epoch": 0.5737051792828686,
"grad_norm": 0.357421875,
"learning_rate": 1.9498941466728462e-05,
"loss": 1.0391,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 36,
"tokens_per_second_per_gpu": 5765.66,
"total_tokens": 6665052
},
{
"epoch": 0.5896414342629482,
"grad_norm": 0.345703125,
"learning_rate": 1.9438833303083677e-05,
"loss": 1.0371,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 37,
"tokens_per_second_per_gpu": 5749.09,
"total_tokens": 6849283
},
{
"epoch": 0.6055776892430279,
"grad_norm": 0.34375,
"learning_rate": 1.9375424585439994e-05,
"loss": 1.0503,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 38,
"tokens_per_second_per_gpu": 5927.3,
"total_tokens": 7032513
},
{
"epoch": 0.6215139442231076,
"grad_norm": 0.330078125,
"learning_rate": 1.9308737486442045e-05,
"loss": 1.0479,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 39,
"tokens_per_second_per_gpu": 5905.92,
"total_tokens": 7214561
},
{
"epoch": 0.6374501992031872,
"grad_norm": 0.3359375,
"learning_rate": 1.9238795325112867e-05,
"loss": 1.0098,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 40,
"tokens_per_second_per_gpu": 5853.3,
"total_tokens": 7400854
},
{
"epoch": 0.6533864541832669,
"grad_norm": 0.36328125,
"learning_rate": 1.9165622558699763e-05,
"loss": 1.106,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 41,
"tokens_per_second_per_gpu": 5556.45,
"total_tokens": 7577263
},
{
"epoch": 0.6693227091633466,
"grad_norm": 0.396484375,
"learning_rate": 1.908924477412211e-05,
"loss": 1.0498,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 42,
"tokens_per_second_per_gpu": 5928.26,
"total_tokens": 7763586
},
{
"epoch": 0.6852589641434262,
"grad_norm": 0.32421875,
"learning_rate": 1.900968867902419e-05,
"loss": 1.0171,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 43,
"tokens_per_second_per_gpu": 6102.36,
"total_tokens": 7953595
},
{
"epoch": 0.701195219123506,
"grad_norm": 0.36328125,
"learning_rate": 1.8926982092436117e-05,
"loss": 1.0688,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 44,
"tokens_per_second_per_gpu": 6058.46,
"total_tokens": 8135608
},
{
"epoch": 0.7171314741035857,
"grad_norm": 0.359375,
"learning_rate": 1.8841153935046098e-05,
"loss": 0.978,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 45,
"tokens_per_second_per_gpu": 5806.45,
"total_tokens": 8328038
},
{
"epoch": 0.7330677290836654,
"grad_norm": 0.333984375,
"learning_rate": 1.8752234219087538e-05,
"loss": 1.0435,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 46,
"tokens_per_second_per_gpu": 5940.85,
"total_tokens": 8517629
},
{
"epoch": 0.749003984063745,
"grad_norm": 0.400390625,
"learning_rate": 1.866025403784439e-05,
"loss": 1.0317,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 47,
"tokens_per_second_per_gpu": 5929.8,
"total_tokens": 8700619
},
{
"epoch": 0.7649402390438247,
"grad_norm": 0.328125,
"learning_rate": 1.8565245554778516e-05,
"loss": 0.9819,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 48,
"tokens_per_second_per_gpu": 5991.89,
"total_tokens": 8886726
},
{
"epoch": 0.7808764940239044,
"grad_norm": 0.34765625,
"learning_rate": 1.8467241992282842e-05,
"loss": 1.0396,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 49,
"tokens_per_second_per_gpu": 5941.59,
"total_tokens": 9074210
},
{
"epoch": 0.796812749003984,
"grad_norm": 0.345703125,
"learning_rate": 1.83662776200642e-05,
"loss": 1.0703,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 50,
"tokens_per_second_per_gpu": 5856.71,
"total_tokens": 9253264
},
{
"epoch": 0.8127490039840638,
"grad_norm": 0.33203125,
"learning_rate": 1.826238774315995e-05,
"loss": 1.0078,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 51,
"tokens_per_second_per_gpu": 5883.97,
"total_tokens": 9437019
},
{
"epoch": 0.8286852589641435,
"grad_norm": 0.326171875,
"learning_rate": 1.8155608689592604e-05,
"loss": 1.0352,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 52,
"tokens_per_second_per_gpu": 6284.45,
"total_tokens": 9624777
},
{
"epoch": 0.8446215139442231,
"grad_norm": 0.34375,
"learning_rate": 1.8045977797666685e-05,
"loss": 1.0015,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 53,
"tokens_per_second_per_gpu": 6227.78,
"total_tokens": 9816093
},
{
"epoch": 0.8605577689243028,
"grad_norm": 0.32421875,
"learning_rate": 1.7933533402912354e-05,
"loss": 1.0205,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 54,
"tokens_per_second_per_gpu": 5562.75,
"total_tokens": 10003875
},
{
"epoch": 0.8764940239043825,
"grad_norm": 0.3125,
"learning_rate": 1.78183148246803e-05,
"loss": 0.9985,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 55,
"tokens_per_second_per_gpu": 6029.45,
"total_tokens": 10195261
},
{
"epoch": 0.8924302788844621,
"grad_norm": 0.328125,
"learning_rate": 1.7700362352392632e-05,
"loss": 1.0151,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 56,
"tokens_per_second_per_gpu": 5824.93,
"total_tokens": 10378607
},
{
"epoch": 0.9083665338645418,
"grad_norm": 0.345703125,
"learning_rate": 1.757971723145453e-05,
"loss": 1.0737,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 57,
"tokens_per_second_per_gpu": 5758.69,
"total_tokens": 10565102
},
{
"epoch": 0.9243027888446215,
"grad_norm": 0.330078125,
"learning_rate": 1.7456421648831658e-05,
"loss": 1.0444,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 58,
"tokens_per_second_per_gpu": 5699.09,
"total_tokens": 10743645
},
{
"epoch": 0.9402390438247012,
"grad_norm": 0.337890625,
"learning_rate": 1.7330518718298263e-05,
"loss": 0.998,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 59,
"tokens_per_second_per_gpu": 5772.72,
"total_tokens": 10926325
},
{
"epoch": 0.9561752988047809,
"grad_norm": 0.361328125,
"learning_rate": 1.7202052465361268e-05,
"loss": 1.0659,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 60,
"tokens_per_second_per_gpu": 5781.99,
"total_tokens": 11105741
},
{
"epoch": 0.9721115537848606,
"grad_norm": 0.326171875,
"learning_rate": 1.7071067811865477e-05,
"loss": 1.0024,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 61,
"tokens_per_second_per_gpu": 5416.25,
"total_tokens": 11283752
},
{
"epoch": 0.9880478087649402,
"grad_norm": 0.314453125,
"learning_rate": 1.693761056028542e-05,
"loss": 0.9429,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 62,
"tokens_per_second_per_gpu": 6080.81,
"total_tokens": 11476891
},
{
"epoch": 1.0,
"grad_norm": 1.03125,
"learning_rate": 1.6801727377709195e-05,
"loss": 0.8979,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 63,
"tokens_per_second_per_gpu": 4586.33,
"total_tokens": 11600559
},
{
"epoch": 1.0159362549800797,
"grad_norm": 0.33203125,
"learning_rate": 1.6663465779520042e-05,
"loss": 1.0391,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 64,
"tokens_per_second_per_gpu": 5765.65,
"total_tokens": 11781077
},
{
"epoch": 1.0318725099601593,
"grad_norm": 0.328125,
"learning_rate": 1.6522874112781213e-05,
"loss": 0.9893,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 65,
"tokens_per_second_per_gpu": 5812.65,
"total_tokens": 11964316
},
{
"epoch": 1.047808764940239,
"grad_norm": 0.33203125,
"learning_rate": 1.6380001539330088e-05,
"loss": 1.019,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 66,
"tokens_per_second_per_gpu": 5958.35,
"total_tokens": 12158602
},
{
"epoch": 1.0637450199203187,
"grad_norm": 0.318359375,
"learning_rate": 1.6234898018587336e-05,
"loss": 1.0098,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 67,
"tokens_per_second_per_gpu": 5947.9,
"total_tokens": 12343835
},
{
"epoch": 1.0796812749003983,
"grad_norm": 0.31640625,
"learning_rate": 1.608761429008721e-05,
"loss": 0.959,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 68,
"tokens_per_second_per_gpu": 5410.16,
"total_tokens": 12530320
},
{
"epoch": 1.095617529880478,
"grad_norm": 0.337890625,
"learning_rate": 1.5938201855735017e-05,
"loss": 0.998,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 69,
"tokens_per_second_per_gpu": 5950.0,
"total_tokens": 12718875
},
{
"epoch": 1.1115537848605577,
"grad_norm": 0.31640625,
"learning_rate": 1.578671296179806e-05,
"loss": 0.9834,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 70,
"tokens_per_second_per_gpu": 5806.97,
"total_tokens": 12902174
},
{
"epoch": 1.1274900398406373,
"grad_norm": 0.322265625,
"learning_rate": 1.563320058063622e-05,
"loss": 1.02,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 71,
"tokens_per_second_per_gpu": 5964.93,
"total_tokens": 13091033
},
{
"epoch": 1.1434262948207172,
"grad_norm": 0.328125,
"learning_rate": 1.5477718392178716e-05,
"loss": 1.001,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 72,
"tokens_per_second_per_gpu": 5543.95,
"total_tokens": 13268135
},
{
"epoch": 1.159362549800797,
"grad_norm": 0.322265625,
"learning_rate": 1.5320320765153367e-05,
"loss": 0.9868,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 73,
"tokens_per_second_per_gpu": 6161.46,
"total_tokens": 13458366
},
{
"epoch": 1.1752988047808766,
"grad_norm": 0.33203125,
"learning_rate": 1.5161062738075068e-05,
"loss": 0.9404,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 74,
"tokens_per_second_per_gpu": 5781.1,
"total_tokens": 13642048
},
{
"epoch": 1.1912350597609562,
"grad_norm": 0.423828125,
"learning_rate": 1.5000000000000002e-05,
"loss": 1.0273,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 75,
"tokens_per_second_per_gpu": 5754.51,
"total_tokens": 13816573
},
{
"epoch": 1.207171314741036,
"grad_norm": 0.376953125,
"learning_rate": 1.4837188871052399e-05,
"loss": 0.999,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 76,
"tokens_per_second_per_gpu": 5745.16,
"total_tokens": 13997690
},
{
"epoch": 1.2231075697211156,
"grad_norm": 0.30859375,
"learning_rate": 1.4672686282730622e-05,
"loss": 0.9365,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 77,
"tokens_per_second_per_gpu": 6187.18,
"total_tokens": 14191031
},
{
"epoch": 1.2390438247011952,
"grad_norm": 0.310546875,
"learning_rate": 1.4506549757999456e-05,
"loss": 0.9932,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 78,
"tokens_per_second_per_gpu": 6189.26,
"total_tokens": 14381270
},
{
"epoch": 1.254980079681275,
"grad_norm": 0.361328125,
"learning_rate": 1.4338837391175582e-05,
"loss": 0.9253,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 79,
"tokens_per_second_per_gpu": 5694.92,
"total_tokens": 14569147
},
{
"epoch": 1.2709163346613546,
"grad_norm": 0.349609375,
"learning_rate": 1.4169607827613284e-05,
"loss": 1.0249,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 80,
"tokens_per_second_per_gpu": 5574.13,
"total_tokens": 14749250
},
{
"epoch": 1.2868525896414342,
"grad_norm": 0.33984375,
"learning_rate": 1.3998920243197408e-05,
"loss": 1.0044,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 81,
"tokens_per_second_per_gpu": 5892.86,
"total_tokens": 14932957
},
{
"epoch": 1.302788844621514,
"grad_norm": 0.31640625,
"learning_rate": 1.3826834323650899e-05,
"loss": 0.9443,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 82,
"tokens_per_second_per_gpu": 6084.92,
"total_tokens": 15127169
},
{
"epoch": 1.3187250996015936,
"grad_norm": 0.328125,
"learning_rate": 1.3653410243663953e-05,
"loss": 0.9878,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 83,
"tokens_per_second_per_gpu": 5984.25,
"total_tokens": 15311601
},
{
"epoch": 1.3346613545816732,
"grad_norm": 0.32421875,
"learning_rate": 1.3478708645852272e-05,
"loss": 0.9248,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 84,
"tokens_per_second_per_gpu": 5744.73,
"total_tokens": 15491400
},
{
"epoch": 1.3505976095617531,
"grad_norm": 0.33203125,
"learning_rate": 1.3302790619551673e-05,
"loss": 0.9824,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 85,
"tokens_per_second_per_gpu": 6009.5,
"total_tokens": 15674583
},
{
"epoch": 1.3665338645418328,
"grad_norm": 0.314453125,
"learning_rate": 1.3125717679456447e-05,
"loss": 0.9404,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 86,
"tokens_per_second_per_gpu": 5690.82,
"total_tokens": 15861985
},
{
"epoch": 1.3824701195219125,
"grad_norm": 0.34765625,
"learning_rate": 1.2947551744109044e-05,
"loss": 0.9731,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 87,
"tokens_per_second_per_gpu": 5962.66,
"total_tokens": 16048780
},
{
"epoch": 1.3984063745019921,
"grad_norm": 0.318359375,
"learning_rate": 1.2768355114248493e-05,
"loss": 0.8406,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 88,
"tokens_per_second_per_gpu": 5796.13,
"total_tokens": 16243977
},
{
"epoch": 1.4143426294820718,
"grad_norm": 0.337890625,
"learning_rate": 1.2588190451025209e-05,
"loss": 0.9692,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 89,
"tokens_per_second_per_gpu": 5748.01,
"total_tokens": 16427123
},
{
"epoch": 1.4302788844621515,
"grad_norm": 0.345703125,
"learning_rate": 1.2407120754089733e-05,
"loss": 0.998,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 90,
"tokens_per_second_per_gpu": 5897.24,
"total_tokens": 16609176
},
{
"epoch": 1.4462151394422311,
"grad_norm": 0.33203125,
"learning_rate": 1.2225209339563144e-05,
"loss": 0.9507,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 91,
"tokens_per_second_per_gpu": 5936.81,
"total_tokens": 16795109
},
{
"epoch": 1.4621513944223108,
"grad_norm": 0.328125,
"learning_rate": 1.2042519817896805e-05,
"loss": 0.9912,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 92,
"tokens_per_second_per_gpu": 5949.43,
"total_tokens": 16980935
},
{
"epoch": 1.4780876494023905,
"grad_norm": 0.333984375,
"learning_rate": 1.1859116071629148e-05,
"loss": 0.9888,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 93,
"tokens_per_second_per_gpu": 6095.07,
"total_tokens": 17167869
},
{
"epoch": 1.4940239043824701,
"grad_norm": 0.322265625,
"learning_rate": 1.1675062233047365e-05,
"loss": 0.9219,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 94,
"tokens_per_second_per_gpu": 6067.77,
"total_tokens": 17351541
},
{
"epoch": 1.5099601593625498,
"grad_norm": 0.3828125,
"learning_rate": 1.1490422661761744e-05,
"loss": 0.9648,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 95,
"tokens_per_second_per_gpu": 6008.67,
"total_tokens": 17537891
},
{
"epoch": 1.5258964143426295,
"grad_norm": 0.328125,
"learning_rate": 1.130526192220052e-05,
"loss": 0.9556,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 96,
"tokens_per_second_per_gpu": 5955.34,
"total_tokens": 17721410
},
{
"epoch": 1.5418326693227091,
"grad_norm": 0.3359375,
"learning_rate": 1.1119644761033079e-05,
"loss": 0.9951,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 97,
"tokens_per_second_per_gpu": 5732.24,
"total_tokens": 17897961
},
{
"epoch": 1.5577689243027888,
"grad_norm": 0.330078125,
"learning_rate": 1.0933636084529507e-05,
"loss": 1.02,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 98,
"tokens_per_second_per_gpu": 5813.99,
"total_tokens": 18080875
},
{
"epoch": 1.5737051792828685,
"grad_norm": 0.330078125,
"learning_rate": 1.0747300935864245e-05,
"loss": 0.958,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 99,
"tokens_per_second_per_gpu": 5769.52,
"total_tokens": 18265611
},
{
"epoch": 1.5896414342629481,
"grad_norm": 0.326171875,
"learning_rate": 1.0560704472371919e-05,
"loss": 0.9561,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 100,
"tokens_per_second_per_gpu": 5726.38,
"total_tokens": 18449842
},
{
"epoch": 1.6055776892430278,
"grad_norm": 0.326171875,
"learning_rate": 1.037391194276326e-05,
"loss": 0.9707,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 101,
"tokens_per_second_per_gpu": 5944.93,
"total_tokens": 18633072
},
{
"epoch": 1.6215139442231075,
"grad_norm": 0.3203125,
"learning_rate": 1.0186988664309023e-05,
"loss": 0.9707,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 102,
"tokens_per_second_per_gpu": 5897.0,
"total_tokens": 18815120
},
{
"epoch": 1.6374501992031871,
"grad_norm": 0.328125,
"learning_rate": 1e-05,
"loss": 0.9385,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 103,
"tokens_per_second_per_gpu": 5841.82,
"total_tokens": 19001413
},
{
"epoch": 1.6533864541832668,
"grad_norm": 0.341796875,
"learning_rate": 9.81301133569098e-06,
"loss": 1.0303,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 104,
"tokens_per_second_per_gpu": 5567.16,
"total_tokens": 19177822
},
{
"epoch": 1.6693227091633465,
"grad_norm": 0.330078125,
"learning_rate": 9.626088057236745e-06,
"loss": 0.9814,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 105,
"tokens_per_second_per_gpu": 5850.82,
"total_tokens": 19364145
},
{
"epoch": 1.6852589641434261,
"grad_norm": 0.31640625,
"learning_rate": 9.439295527628083e-06,
"loss": 0.9531,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 106,
"tokens_per_second_per_gpu": 6142.09,
"total_tokens": 19554154
},
{
"epoch": 1.701195219123506,
"grad_norm": 0.333984375,
"learning_rate": 9.252699064135759e-06,
"loss": 0.998,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 107,
"tokens_per_second_per_gpu": 6059.46,
"total_tokens": 19736167
},
{
"epoch": 1.7171314741035857,
"grad_norm": 0.33203125,
"learning_rate": 9.066363915470494e-06,
"loss": 0.9204,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 108,
"tokens_per_second_per_gpu": 5807.52,
"total_tokens": 19928597
},
{
"epoch": 1.7330677290836654,
"grad_norm": 0.3515625,
"learning_rate": 8.880355238966923e-06,
"loss": 0.978,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 109,
"tokens_per_second_per_gpu": 5987.33,
"total_tokens": 20118188
},
{
"epoch": 1.749003984063745,
"grad_norm": 0.3359375,
"learning_rate": 8.694738077799487e-06,
"loss": 0.9702,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 110,
"tokens_per_second_per_gpu": 5888.18,
"total_tokens": 20301178
},
{
"epoch": 1.7649402390438247,
"grad_norm": 0.357421875,
"learning_rate": 8.509577338238255e-06,
"loss": 0.9253,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 111,
"tokens_per_second_per_gpu": 5972.68,
"total_tokens": 20487285
},
{
"epoch": 1.7808764940239044,
"grad_norm": 0.337890625,
"learning_rate": 8.324937766952638e-06,
"loss": 0.9814,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 112,
"tokens_per_second_per_gpu": 5932.16,
"total_tokens": 20674769
},
{
"epoch": 1.796812749003984,
"grad_norm": 0.341796875,
"learning_rate": 8.140883928370855e-06,
"loss": 1.0088,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 113,
"tokens_per_second_per_gpu": 5830.81,
"total_tokens": 20853823
},
{
"epoch": 1.812749003984064,
"grad_norm": 0.322265625,
"learning_rate": 7.957480182103198e-06,
"loss": 0.9487,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 114,
"tokens_per_second_per_gpu": 5865.22,
"total_tokens": 21037578
},
{
"epoch": 1.8286852589641436,
"grad_norm": 0.328125,
"learning_rate": 7.774790660436857e-06,
"loss": 0.9819,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 115,
"tokens_per_second_per_gpu": 6252.84,
"total_tokens": 21225336
},
{
"epoch": 1.8446215139442232,
"grad_norm": 0.33203125,
"learning_rate": 7.592879245910273e-06,
"loss": 0.9482,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 116,
"tokens_per_second_per_gpu": 6223.23,
"total_tokens": 21416652
},
{
"epoch": 1.860557768924303,
"grad_norm": 0.322265625,
"learning_rate": 7.411809548974792e-06,
"loss": 0.9697,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 117,
"tokens_per_second_per_gpu": 5561.76,
"total_tokens": 21604434
},
{
"epoch": 1.8764940239043826,
"grad_norm": 0.30859375,
"learning_rate": 7.2316448857515076e-06,
"loss": 0.9468,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 118,
"tokens_per_second_per_gpu": 6026.88,
"total_tokens": 21795820
},
{
"epoch": 1.8924302788844622,
"grad_norm": 0.32421875,
"learning_rate": 7.052448255890958e-06,
"loss": 0.9624,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 119,
"tokens_per_second_per_gpu": 5817.1,
"total_tokens": 21979166
},
{
"epoch": 1.908366533864542,
"grad_norm": 0.33984375,
"learning_rate": 6.874282320543557e-06,
"loss": 1.022,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 120,
"tokens_per_second_per_gpu": 5653.71,
"total_tokens": 22165661
},
{
"epoch": 1.9243027888446216,
"grad_norm": 0.32421875,
"learning_rate": 6.697209380448333e-06,
"loss": 0.9961,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 121,
"tokens_per_second_per_gpu": 5699.1,
"total_tokens": 22344204
},
{
"epoch": 1.9402390438247012,
"grad_norm": 0.33203125,
"learning_rate": 6.521291354147727e-06,
"loss": 0.9521,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 122,
"tokens_per_second_per_gpu": 5765.79,
"total_tokens": 22526884
},
{
"epoch": 1.956175298804781,
"grad_norm": 0.349609375,
"learning_rate": 6.34658975633605e-06,
"loss": 1.0171,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 123,
"tokens_per_second_per_gpu": 5780.38,
"total_tokens": 22706300
},
{
"epoch": 1.9721115537848606,
"grad_norm": 0.318359375,
"learning_rate": 6.173165676349103e-06,
"loss": 0.957,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 124,
"tokens_per_second_per_gpu": 5399.07,
"total_tokens": 22884311
},
{
"epoch": 1.9880478087649402,
"grad_norm": 0.357421875,
"learning_rate": 6.001079756802592e-06,
"loss": 0.9028,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 125,
"tokens_per_second_per_gpu": 5850.46,
"total_tokens": 23077450
},
{
"epoch": 2.0,
"grad_norm": 0.39453125,
"learning_rate": 5.830392172386723e-06,
"loss": 0.8589,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 126,
"tokens_per_second_per_gpu": 4564.2,
"total_tokens": 23201118
},
{
"epoch": 2.0159362549800797,
"grad_norm": 0.32421875,
"learning_rate": 5.66116260882442e-06,
"loss": 0.9985,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 127,
"tokens_per_second_per_gpu": 5832.26,
"total_tokens": 23381636
},
{
"epoch": 2.0318725099601593,
"grad_norm": 0.328125,
"learning_rate": 5.493450242000546e-06,
"loss": 0.9521,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 128,
"tokens_per_second_per_gpu": 5774.6,
"total_tokens": 23564875
},
{
"epoch": 2.047808764940239,
"grad_norm": 0.328125,
"learning_rate": 5.32731371726938e-06,
"loss": 0.98,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 129,
"tokens_per_second_per_gpu": 5972.38,
"total_tokens": 23759161
},
{
"epoch": 2.0637450199203187,
"grad_norm": 0.328125,
"learning_rate": 5.1628111289476025e-06,
"loss": 0.9746,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 130,
"tokens_per_second_per_gpu": 5919.54,
"total_tokens": 23944394
},
{
"epoch": 2.0796812749003983,
"grad_norm": 0.31640625,
"learning_rate": 5.000000000000003e-06,
"loss": 0.9229,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 131,
"tokens_per_second_per_gpu": 5414.05,
"total_tokens": 24130879
},
{
"epoch": 2.095617529880478,
"grad_norm": 0.33203125,
"learning_rate": 4.838937261924933e-06,
"loss": 0.9639,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 132,
"tokens_per_second_per_gpu": 5968.88,
"total_tokens": 24319434
},
{
"epoch": 2.1115537848605577,
"grad_norm": 0.31640625,
"learning_rate": 4.679679234846636e-06,
"loss": 0.9502,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 133,
"tokens_per_second_per_gpu": 5802.86,
"total_tokens": 24502733
},
{
"epoch": 2.1274900398406373,
"grad_norm": 0.318359375,
"learning_rate": 4.522281607821288e-06,
"loss": 0.9854,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 134,
"tokens_per_second_per_gpu": 5970.96,
"total_tokens": 24691592
},
{
"epoch": 2.143426294820717,
"grad_norm": 0.373046875,
"learning_rate": 4.3667994193637794e-06,
"loss": 0.9683,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 135,
"tokens_per_second_per_gpu": 5528.1,
"total_tokens": 24868694
},
{
"epoch": 2.1593625498007967,
"grad_norm": 0.318359375,
"learning_rate": 4.213287038201943e-06,
"loss": 0.9561,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 136,
"tokens_per_second_per_gpu": 6105.7,
"total_tokens": 25058925
},
{
"epoch": 2.1752988047808763,
"grad_norm": 0.322265625,
"learning_rate": 4.061798144264986e-06,
"loss": 0.9116,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 137,
"tokens_per_second_per_gpu": 5771.48,
"total_tokens": 25242607
},
{
"epoch": 2.191235059760956,
"grad_norm": 0.3359375,
"learning_rate": 3.912385709912794e-06,
"loss": 0.9966,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 138,
"tokens_per_second_per_gpu": 5723.02,
"total_tokens": 25417132
},
{
"epoch": 2.2071713147410357,
"grad_norm": 0.318359375,
"learning_rate": 3.7651019814126656e-06,
"loss": 0.9712,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 139,
"tokens_per_second_per_gpu": 5739.14,
"total_tokens": 25598249
},
{
"epoch": 2.2231075697211153,
"grad_norm": 0.306640625,
"learning_rate": 3.619998460669916e-06,
"loss": 0.9106,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 140,
"tokens_per_second_per_gpu": 6168.5,
"total_tokens": 25791590
},
{
"epoch": 2.239043824701195,
"grad_norm": 0.31640625,
"learning_rate": 3.4771258872187917e-06,
"loss": 0.9673,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 141,
"tokens_per_second_per_gpu": 6156.05,
"total_tokens": 25981829
},
{
"epoch": 2.2549800796812747,
"grad_norm": 0.33203125,
"learning_rate": 3.3365342204799613e-06,
"loss": 0.9019,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 142,
"tokens_per_second_per_gpu": 5766.33,
"total_tokens": 26169706
},
{
"epoch": 2.2709163346613543,
"grad_norm": 0.50390625,
"learning_rate": 3.1982726222908046e-06,
"loss": 0.9995,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 143,
"tokens_per_second_per_gpu": 5566.19,
"total_tokens": 26349809
},
{
"epoch": 2.2868525896414345,
"grad_norm": 0.359375,
"learning_rate": 3.0623894397145837e-06,
"loss": 0.9805,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 144,
"tokens_per_second_per_gpu": 5897.52,
"total_tokens": 26533516
},
{
"epoch": 2.302788844621514,
"grad_norm": 0.375,
"learning_rate": 2.9289321881345257e-06,
"loss": 0.9219,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 145,
"tokens_per_second_per_gpu": 6065.68,
"total_tokens": 26727728
},
{
"epoch": 2.318725099601594,
"grad_norm": 0.3359375,
"learning_rate": 2.7979475346387363e-06,
"loss": 0.9639,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 146,
"tokens_per_second_per_gpu": 5976.74,
"total_tokens": 26912160
},
{
"epoch": 2.3346613545816735,
"grad_norm": 0.34765625,
"learning_rate": 2.669481281701739e-06,
"loss": 0.9038,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 147,
"tokens_per_second_per_gpu": 5726.82,
"total_tokens": 27091959
},
{
"epoch": 2.350597609561753,
"grad_norm": 0.341796875,
"learning_rate": 2.5435783511683444e-06,
"loss": 0.9614,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 148,
"tokens_per_second_per_gpu": 5973.36,
"total_tokens": 27275142
},
{
"epoch": 2.366533864541833,
"grad_norm": 0.33203125,
"learning_rate": 2.420282768545469e-06,
"loss": 0.9219,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 149,
"tokens_per_second_per_gpu": 5654.39,
"total_tokens": 27462544
},
{
"epoch": 2.3824701195219125,
"grad_norm": 0.322265625,
"learning_rate": 2.2996376476073724e-06,
"loss": 0.9526,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 150,
"tokens_per_second_per_gpu": 5952.85,
"total_tokens": 27649339
},
{
"epoch": 2.398406374501992,
"grad_norm": 0.3203125,
"learning_rate": 2.1816851753197023e-06,
"loss": 0.8235,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 151,
"tokens_per_second_per_gpu": 5837.21,
"total_tokens": 27844536
},
{
"epoch": 2.414342629482072,
"grad_norm": 0.333984375,
"learning_rate": 2.0664665970876496e-06,
"loss": 0.9521,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 152,
"tokens_per_second_per_gpu": 5754.33,
"total_tokens": 28027682
},
{
"epoch": 2.4302788844621515,
"grad_norm": 0.326171875,
"learning_rate": 1.9540222023333165e-06,
"loss": 0.9805,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 153,
"tokens_per_second_per_gpu": 5902.39,
"total_tokens": 28209735
},
{
"epoch": 2.446215139442231,
"grad_norm": 0.32421875,
"learning_rate": 1.8443913104073984e-06,
"loss": 0.9321,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 154,
"tokens_per_second_per_gpu": 5930.45,
"total_tokens": 28395668
},
{
"epoch": 2.462151394422311,
"grad_norm": 0.3203125,
"learning_rate": 1.7376122568400533e-06,
"loss": 0.9756,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 155,
"tokens_per_second_per_gpu": 5945.66,
"total_tokens": 28581494
},
{
"epoch": 2.4780876494023905,
"grad_norm": 0.322265625,
"learning_rate": 1.6337223799358025e-06,
"loss": 0.9736,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 156,
"tokens_per_second_per_gpu": 6107.11,
"total_tokens": 28768428
},
{
"epoch": 2.49402390438247,
"grad_norm": 0.31640625,
"learning_rate": 1.5327580077171589e-06,
"loss": 0.9067,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 157,
"tokens_per_second_per_gpu": 6059.61,
"total_tokens": 28952100
},
{
"epoch": 2.50996015936255,
"grad_norm": 0.326171875,
"learning_rate": 1.4347544452214869e-06,
"loss": 0.9512,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 158,
"tokens_per_second_per_gpu": 6010.47,
"total_tokens": 29138450
},
{
"epoch": 2.5258964143426295,
"grad_norm": 0.376953125,
"learning_rate": 1.339745962155613e-06,
"loss": 0.9409,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 159,
"tokens_per_second_per_gpu": 6038.89,
"total_tokens": 29321969
},
{
"epoch": 2.541832669322709,
"grad_norm": 0.330078125,
"learning_rate": 1.2477657809124632e-06,
"loss": 0.9824,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 160,
"tokens_per_second_per_gpu": 5740.71,
"total_tokens": 29498520
},
{
"epoch": 2.557768924302789,
"grad_norm": 0.328125,
"learning_rate": 1.1588460649539036e-06,
"loss": 1.0068,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 161,
"tokens_per_second_per_gpu": 5804.04,
"total_tokens": 29681434
},
{
"epoch": 2.5737051792828685,
"grad_norm": 0.337890625,
"learning_rate": 1.073017907563887e-06,
"loss": 0.9453,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 162,
"tokens_per_second_per_gpu": 5741.44,
"total_tokens": 29866170
},
{
"epoch": 2.589641434262948,
"grad_norm": 0.3203125,
"learning_rate": 9.903113209758098e-07,
"loss": 0.9443,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 163,
"tokens_per_second_per_gpu": 5756.27,
"total_tokens": 30050401
},
{
"epoch": 2.605577689243028,
"grad_norm": 0.32421875,
"learning_rate": 9.107552258778907e-07,
"loss": 0.9585,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 164,
"tokens_per_second_per_gpu": 5959.88,
"total_tokens": 30233631
},
{
"epoch": 2.6215139442231075,
"grad_norm": 0.3203125,
"learning_rate": 8.343774413002382e-07,
"loss": 0.9604,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 165,
"tokens_per_second_per_gpu": 5899.57,
"total_tokens": 30415679
},
{
"epoch": 2.637450199203187,
"grad_norm": 0.326171875,
"learning_rate": 7.612046748871327e-07,
"loss": 0.9277,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 166,
"tokens_per_second_per_gpu": 5845.3,
"total_tokens": 30601972
},
{
"epoch": 2.653386454183267,
"grad_norm": 0.33984375,
"learning_rate": 6.912625135579587e-07,
"loss": 1.022,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 167,
"tokens_per_second_per_gpu": 5574.11,
"total_tokens": 30778381
},
{
"epoch": 2.6693227091633465,
"grad_norm": 0.31640625,
"learning_rate": 6.245754145600091e-07,
"loss": 0.9707,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 168,
"tokens_per_second_per_gpu": 5930.11,
"total_tokens": 30964704
},
{
"epoch": 2.685258964143426,
"grad_norm": 0.318359375,
"learning_rate": 5.611666969163243e-07,
"loss": 0.9448,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 169,
"tokens_per_second_per_gpu": 6128.63,
"total_tokens": 31154713
},
{
"epoch": 2.7011952191235062,
"grad_norm": 0.333984375,
"learning_rate": 5.010585332715401e-07,
"loss": 0.9883,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 170,
"tokens_per_second_per_gpu": 6064.77,
"total_tokens": 31336726
},
{
"epoch": 2.717131474103586,
"grad_norm": 0.328125,
"learning_rate": 4.4427194213859216e-07,
"loss": 0.9131,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 171,
"tokens_per_second_per_gpu": 5798.36,
"total_tokens": 31529156
},
{
"epoch": 2.7330677290836656,
"grad_norm": 0.318359375,
"learning_rate": 3.908267805490051e-07,
"loss": 0.9697,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 172,
"tokens_per_second_per_gpu": 5977.29,
"total_tokens": 31718747
},
{
"epoch": 2.7490039840637452,
"grad_norm": 0.328125,
"learning_rate": 3.4074173710931804e-07,
"loss": 0.9619,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 173,
"tokens_per_second_per_gpu": 5934.4,
"total_tokens": 31901737
},
{
"epoch": 2.764940239043825,
"grad_norm": 0.322265625,
"learning_rate": 2.940343254660905e-07,
"loss": 0.9185,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 174,
"tokens_per_second_per_gpu": 5978.89,
"total_tokens": 32087844
},
{
"epoch": 2.7808764940239046,
"grad_norm": 0.328125,
"learning_rate": 2.507208781817638e-07,
"loss": 0.9751,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 175,
"tokens_per_second_per_gpu": 5946.29,
"total_tokens": 32275328
},
{
"epoch": 2.7968127490039842,
"grad_norm": 0.337890625,
"learning_rate": 2.1081654102351634e-07,
"loss": 1.0015,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 176,
"tokens_per_second_per_gpu": 5871.96,
"total_tokens": 32454382
},
{
"epoch": 2.812749003984064,
"grad_norm": 0.318359375,
"learning_rate": 1.7433526766711727e-07,
"loss": 0.9429,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 177,
"tokens_per_second_per_gpu": 5872.24,
"total_tokens": 32638137
},
{
"epoch": 2.8286852589641436,
"grad_norm": 0.3203125,
"learning_rate": 1.4128981481764115e-07,
"loss": 0.9746,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 178,
"tokens_per_second_per_gpu": 6173.8,
"total_tokens": 32825895
},
{
"epoch": 2.8446215139442232,
"grad_norm": 0.322265625,
"learning_rate": 1.1169173774871478e-07,
"loss": 0.9434,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 179,
"tokens_per_second_per_gpu": 6232.85,
"total_tokens": 33017211
},
{
"epoch": 2.860557768924303,
"grad_norm": 0.3203125,
"learning_rate": 8.555138626189619e-08,
"loss": 0.9644,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 180,
"tokens_per_second_per_gpu": 5578.64,
"total_tokens": 33204993
},
{
"epoch": 2.8764940239043826,
"grad_norm": 0.310546875,
"learning_rate": 6.287790106757396e-08,
"loss": 0.9429,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 181,
"tokens_per_second_per_gpu": 6034.66,
"total_tokens": 33396379
},
{
"epoch": 2.8924302788844622,
"grad_norm": 0.318359375,
"learning_rate": 4.367921058866187e-08,
"loss": 0.959,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 182,
"tokens_per_second_per_gpu": 5841.56,
"total_tokens": 33579725
},
{
"epoch": 2.908366533864542,
"grad_norm": 0.33984375,
"learning_rate": 2.796202818819871e-08,
"loss": 1.0166,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 183,
"tokens_per_second_per_gpu": 5736.29,
"total_tokens": 33766220
},
{
"epoch": 2.9243027888446216,
"grad_norm": 0.322265625,
"learning_rate": 1.5731849821833955e-08,
"loss": 0.9907,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 184,
"tokens_per_second_per_gpu": 5704.78,
"total_tokens": 33944763
},
{
"epoch": 2.9402390438247012,
"grad_norm": 0.326171875,
"learning_rate": 6.992952116013918e-09,
"loss": 0.9478,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 185,
"tokens_per_second_per_gpu": 5773.31,
"total_tokens": 34127443
},
{
"epoch": 2.956175298804781,
"grad_norm": 0.349609375,
"learning_rate": 1.7483908725357546e-09,
"loss": 1.0122,
"memory/device_reserved (GiB)": 76.38,
"memory/max_active (GiB)": 64.91,
"memory/max_allocated (GiB)": 64.91,
"step": 186,
"tokens_per_second_per_gpu": 5785.44,
"total_tokens": 34306859
}
],
"logging_steps": 1,
"max_steps": 186,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 62,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.2082055574021734e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}