{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 8790, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11376564277588168, "grad_norm": 6.126019477844238, "learning_rate": 4.9731121281464535e-05, "loss": 2.4955, "step": 100 }, { "epoch": 0.22753128555176336, "grad_norm": 3.398350954055786, "learning_rate": 4.916475972540046e-05, "loss": 2.1647, "step": 200 }, { "epoch": 0.3412969283276451, "grad_norm": 4.663970947265625, "learning_rate": 4.859267734553776e-05, "loss": 2.115, "step": 300 }, { "epoch": 0.4550625711035267, "grad_norm": 7.414068222045898, "learning_rate": 4.802059496567506e-05, "loss": 2.0582, "step": 400 }, { "epoch": 0.5688282138794084, "grad_norm": 5.257181644439697, "learning_rate": 4.744851258581236e-05, "loss": 2.0307, "step": 500 }, { "epoch": 0.6825938566552902, "grad_norm": 2.71687388420105, "learning_rate": 4.687643020594966e-05, "loss": 1.9912, "step": 600 }, { "epoch": 0.7963594994311718, "grad_norm": 6.777807712554932, "learning_rate": 4.630434782608696e-05, "loss": 1.9838, "step": 700 }, { "epoch": 0.9101251422070534, "grad_norm": 7.886149883270264, "learning_rate": 4.573226544622426e-05, "loss": 1.9829, "step": 800 }, { "epoch": 1.0, "eval_gen_len": 210.01, "eval_loss": 2.0051724910736084, "eval_rouge1": 44.2025, "eval_rouge2": 21.4118, "eval_rougeL": 29.8753, "eval_rougeLsum": 39.0475, "eval_runtime": 123.0269, "eval_samples_per_second": 0.813, "eval_steps_per_second": 0.106, "step": 879 }, { "epoch": 1.023890784982935, "grad_norm": 8.102137565612793, "learning_rate": 4.516018306636156e-05, "loss": 1.9554, "step": 900 }, { "epoch": 1.1376564277588168, "grad_norm": 11.607733726501465, "learning_rate": 4.458810068649886e-05, "loss": 1.955, "step": 1000 }, { "epoch": 1.2514220705346986, "grad_norm": 6.75238037109375, "learning_rate": 4.401601830663616e-05, "loss": 1.8595, "step": 1100 }, { "epoch": 1.36518771331058, "grad_norm": 3.8955740928649902, "learning_rate": 4.344393592677346e-05, "loss": 1.8723, "step": 1200 }, { "epoch": 1.4789533560864618, "grad_norm": 3.7848293781280518, "learning_rate": 4.287757437070938e-05, "loss": 1.8681, "step": 1300 }, { "epoch": 1.5927189988623436, "grad_norm": 5.047945976257324, "learning_rate": 4.2305491990846686e-05, "loss": 1.8742, "step": 1400 }, { "epoch": 1.7064846416382253, "grad_norm": 4.739316463470459, "learning_rate": 4.173340961098398e-05, "loss": 1.8819, "step": 1500 }, { "epoch": 1.820250284414107, "grad_norm": 5.646695613861084, "learning_rate": 4.1161327231121284e-05, "loss": 1.8737, "step": 1600 }, { "epoch": 1.9340159271899886, "grad_norm": 3.755201578140259, "learning_rate": 4.0589244851258586e-05, "loss": 1.8785, "step": 1700 }, { "epoch": 2.0, "eval_gen_len": 146.16, "eval_loss": 1.9213957786560059, "eval_rouge1": 46.698, "eval_rouge2": 22.1329, "eval_rougeL": 31.017, "eval_rougeLsum": 41.3027, "eval_runtime": 88.692, "eval_samples_per_second": 1.127, "eval_steps_per_second": 0.147, "step": 1758 }, { "epoch": 2.04778156996587, "grad_norm": 2.962921619415283, "learning_rate": 4.001716247139588e-05, "loss": 1.8208, "step": 1800 }, { "epoch": 2.161547212741752, "grad_norm": 8.434884071350098, "learning_rate": 3.9445080091533184e-05, "loss": 1.803, "step": 1900 }, { "epoch": 2.2753128555176336, "grad_norm": 9.340300559997559, "learning_rate": 3.8872997711670486e-05, "loss": 1.8278, "step": 2000 }, { "epoch": 2.3890784982935154, "grad_norm": 3.50976300239563, "learning_rate": 3.830091533180778e-05, "loss": 1.8069, "step": 2100 }, { "epoch": 2.502844141069397, "grad_norm": 4.721962928771973, "learning_rate": 3.7728832951945084e-05, "loss": 1.8098, "step": 2200 }, { "epoch": 2.616609783845279, "grad_norm": 3.864901542663574, "learning_rate": 3.715675057208238e-05, "loss": 1.7589, "step": 2300 }, { "epoch": 2.73037542662116, "grad_norm": 24.329442977905273, "learning_rate": 3.658466819221968e-05, "loss": 1.7621, "step": 2400 }, { "epoch": 2.8441410693970424, "grad_norm": 8.203035354614258, "learning_rate": 3.601258581235698e-05, "loss": 1.7758, "step": 2500 }, { "epoch": 2.9579067121729237, "grad_norm": 5.4326066970825195, "learning_rate": 3.544050343249428e-05, "loss": 1.7493, "step": 2600 }, { "epoch": 3.0, "eval_gen_len": 135.9, "eval_loss": 1.86639404296875, "eval_rouge1": 47.237, "eval_rouge2": 23.0343, "eval_rougeL": 31.7155, "eval_rougeLsum": 42.1807, "eval_runtime": 76.7658, "eval_samples_per_second": 1.303, "eval_steps_per_second": 0.169, "step": 2637 }, { "epoch": 3.0716723549488054, "grad_norm": 3.5585150718688965, "learning_rate": 3.4868421052631575e-05, "loss": 1.7432, "step": 2700 }, { "epoch": 3.185437997724687, "grad_norm": 3.9954323768615723, "learning_rate": 3.429633867276888e-05, "loss": 1.7035, "step": 2800 }, { "epoch": 3.299203640500569, "grad_norm": 4.328066825866699, "learning_rate": 3.372425629290618e-05, "loss": 1.7343, "step": 2900 }, { "epoch": 3.4129692832764507, "grad_norm": 4.497200965881348, "learning_rate": 3.3152173913043475e-05, "loss": 1.7319, "step": 3000 }, { "epoch": 3.526734926052332, "grad_norm": 4.723243236541748, "learning_rate": 3.258009153318078e-05, "loss": 1.7294, "step": 3100 }, { "epoch": 3.640500568828214, "grad_norm": 6.760339260101318, "learning_rate": 3.200800915331808e-05, "loss": 1.7024, "step": 3200 }, { "epoch": 3.7542662116040955, "grad_norm": 3.54321026802063, "learning_rate": 3.1435926773455376e-05, "loss": 1.7029, "step": 3300 }, { "epoch": 3.868031854379977, "grad_norm": 5.660515785217285, "learning_rate": 3.086384439359268e-05, "loss": 1.7313, "step": 3400 }, { "epoch": 3.981797497155859, "grad_norm": 6.904107570648193, "learning_rate": 3.029176201372998e-05, "loss": 1.6599, "step": 3500 }, { "epoch": 4.0, "eval_gen_len": 133.89, "eval_loss": 1.8406709432601929, "eval_rouge1": 46.8883, "eval_rouge2": 22.317, "eval_rougeL": 30.9894, "eval_rougeLsum": 41.5511, "eval_runtime": 75.0208, "eval_samples_per_second": 1.333, "eval_steps_per_second": 0.173, "step": 3516 }, { "epoch": 4.09556313993174, "grad_norm": 3.598436117172241, "learning_rate": 2.9719679633867276e-05, "loss": 1.6754, "step": 3600 }, { "epoch": 4.2093287827076225, "grad_norm": 3.4669225215911865, "learning_rate": 2.9147597254004578e-05, "loss": 1.685, "step": 3700 }, { "epoch": 4.323094425483504, "grad_norm": 4.923774242401123, "learning_rate": 2.857551487414188e-05, "loss": 1.6468, "step": 3800 }, { "epoch": 4.436860068259386, "grad_norm": 5.548232078552246, "learning_rate": 2.8009153318077803e-05, "loss": 1.6389, "step": 3900 }, { "epoch": 4.550625711035267, "grad_norm": 6.222611904144287, "learning_rate": 2.7437070938215102e-05, "loss": 1.6251, "step": 4000 }, { "epoch": 4.664391353811149, "grad_norm": 4.012085437774658, "learning_rate": 2.6864988558352404e-05, "loss": 1.6708, "step": 4100 }, { "epoch": 4.778156996587031, "grad_norm": 4.607513904571533, "learning_rate": 2.62929061784897e-05, "loss": 1.6539, "step": 4200 }, { "epoch": 4.891922639362912, "grad_norm": 7.459988594055176, "learning_rate": 2.5720823798627002e-05, "loss": 1.6442, "step": 4300 }, { "epoch": 5.0, "eval_gen_len": 130.6, "eval_loss": 1.8186066150665283, "eval_rouge1": 46.7324, "eval_rouge2": 22.5522, "eval_rougeL": 30.8932, "eval_rougeLsum": 41.6596, "eval_runtime": 71.9117, "eval_samples_per_second": 1.391, "eval_steps_per_second": 0.181, "step": 4395 }, { "epoch": 5.005688282138794, "grad_norm": 3.9666900634765625, "learning_rate": 2.5148741418764304e-05, "loss": 1.6693, "step": 4400 }, { "epoch": 5.1194539249146755, "grad_norm": 3.8383851051330566, "learning_rate": 2.4576659038901603e-05, "loss": 1.5846, "step": 4500 }, { "epoch": 5.233219567690558, "grad_norm": 3.5106499195098877, "learning_rate": 2.4004576659038902e-05, "loss": 1.6043, "step": 4600 }, { "epoch": 5.346985210466439, "grad_norm": 4.501423358917236, "learning_rate": 2.3432494279176205e-05, "loss": 1.6203, "step": 4700 }, { "epoch": 5.460750853242321, "grad_norm": 3.3335440158843994, "learning_rate": 2.2860411899313504e-05, "loss": 1.6109, "step": 4800 }, { "epoch": 5.5745164960182025, "grad_norm": 10.799994468688965, "learning_rate": 2.2288329519450803e-05, "loss": 1.6036, "step": 4900 }, { "epoch": 5.688282138794084, "grad_norm": 3.463279962539673, "learning_rate": 2.17162471395881e-05, "loss": 1.6034, "step": 5000 }, { "epoch": 5.802047781569966, "grad_norm": 5.357439994812012, "learning_rate": 2.1149885583524028e-05, "loss": 1.6073, "step": 5100 }, { "epoch": 5.915813424345847, "grad_norm": 6.183532238006592, "learning_rate": 2.0577803203661326e-05, "loss": 1.65, "step": 5200 }, { "epoch": 6.0, "eval_gen_len": 129.34, "eval_loss": 1.804569959640503, "eval_rouge1": 46.7244, "eval_rouge2": 22.3848, "eval_rougeL": 31.2658, "eval_rougeLsum": 41.6427, "eval_runtime": 65.0318, "eval_samples_per_second": 1.538, "eval_steps_per_second": 0.2, "step": 5274 }, { "epoch": 6.0295790671217295, "grad_norm": 4.417309761047363, "learning_rate": 2.000572082379863e-05, "loss": 1.6041, "step": 5300 }, { "epoch": 6.143344709897611, "grad_norm": 5.918379306793213, "learning_rate": 1.9433638443935928e-05, "loss": 1.5789, "step": 5400 }, { "epoch": 6.257110352673493, "grad_norm": 3.504812240600586, "learning_rate": 1.8861556064073227e-05, "loss": 1.6221, "step": 5500 }, { "epoch": 6.370875995449374, "grad_norm": 4.689468860626221, "learning_rate": 1.828947368421053e-05, "loss": 1.5967, "step": 5600 }, { "epoch": 6.484641638225256, "grad_norm": 3.1115574836730957, "learning_rate": 1.7717391304347828e-05, "loss": 1.5745, "step": 5700 }, { "epoch": 6.598407281001138, "grad_norm": 5.300652503967285, "learning_rate": 1.7145308924485127e-05, "loss": 1.5666, "step": 5800 }, { "epoch": 6.712172923777019, "grad_norm": 3.5895206928253174, "learning_rate": 1.657322654462243e-05, "loss": 1.5531, "step": 5900 }, { "epoch": 6.825938566552901, "grad_norm": 3.9184463024139404, "learning_rate": 1.6001144164759728e-05, "loss": 1.583, "step": 6000 }, { "epoch": 6.939704209328783, "grad_norm": 4.801300048828125, "learning_rate": 1.5429061784897027e-05, "loss": 1.5859, "step": 6100 }, { "epoch": 7.0, "eval_gen_len": 128.86, "eval_loss": 1.7970900535583496, "eval_rouge1": 47.0912, "eval_rouge2": 22.2605, "eval_rougeL": 31.1363, "eval_rougeLsum": 41.6028, "eval_runtime": 65.7365, "eval_samples_per_second": 1.521, "eval_steps_per_second": 0.198, "step": 6153 }, { "epoch": 7.053469852104665, "grad_norm": 36.4532470703125, "learning_rate": 1.4856979405034328e-05, "loss": 1.539, "step": 6200 }, { "epoch": 7.167235494880546, "grad_norm": 5.765852451324463, "learning_rate": 1.4284897025171627e-05, "loss": 1.5805, "step": 6300 }, { "epoch": 7.281001137656427, "grad_norm": 5.349630832672119, "learning_rate": 1.3712814645308924e-05, "loss": 1.5332, "step": 6400 }, { "epoch": 7.39476678043231, "grad_norm": 100.44608306884766, "learning_rate": 1.3140732265446226e-05, "loss": 1.5277, "step": 6500 }, { "epoch": 7.508532423208191, "grad_norm": 3.3179357051849365, "learning_rate": 1.2568649885583525e-05, "loss": 1.5304, "step": 6600 }, { "epoch": 7.622298065984073, "grad_norm": 5.867196083068848, "learning_rate": 1.1996567505720824e-05, "loss": 1.5314, "step": 6700 }, { "epoch": 7.736063708759954, "grad_norm": 3.5859735012054443, "learning_rate": 1.1424485125858125e-05, "loss": 1.5665, "step": 6800 }, { "epoch": 7.849829351535837, "grad_norm": 3.854527711868286, "learning_rate": 1.0852402745995424e-05, "loss": 1.5633, "step": 6900 }, { "epoch": 7.963594994311718, "grad_norm": 3.261179208755493, "learning_rate": 1.0280320366132722e-05, "loss": 1.5773, "step": 7000 }, { "epoch": 8.0, "eval_gen_len": 128.57, "eval_loss": 1.7826117277145386, "eval_rouge1": 47.1155, "eval_rouge2": 22.756, "eval_rougeL": 31.6846, "eval_rougeLsum": 41.8634, "eval_runtime": 68.6593, "eval_samples_per_second": 1.456, "eval_steps_per_second": 0.189, "step": 7032 }, { "epoch": 8.0773606370876, "grad_norm": 3.9632034301757812, "learning_rate": 9.708237986270023e-06, "loss": 1.5591, "step": 7100 }, { "epoch": 8.19112627986348, "grad_norm": 6.473509311676025, "learning_rate": 9.136155606407324e-06, "loss": 1.5121, "step": 7200 }, { "epoch": 8.304891922639364, "grad_norm": 4.013639450073242, "learning_rate": 8.564073226544623e-06, "loss": 1.5342, "step": 7300 }, { "epoch": 8.418657565415245, "grad_norm": 6.20673942565918, "learning_rate": 7.991990846681922e-06, "loss": 1.536, "step": 7400 }, { "epoch": 8.532423208191126, "grad_norm": 3.9642581939697266, "learning_rate": 7.419908466819222e-06, "loss": 1.5527, "step": 7500 }, { "epoch": 8.646188850967008, "grad_norm": 4.914712429046631, "learning_rate": 6.847826086956521e-06, "loss": 1.5377, "step": 7600 }, { "epoch": 8.759954493742889, "grad_norm": 3.1526217460632324, "learning_rate": 6.275743707093822e-06, "loss": 1.5192, "step": 7700 }, { "epoch": 8.873720136518772, "grad_norm": 4.830049991607666, "learning_rate": 5.7036613272311215e-06, "loss": 1.5166, "step": 7800 }, { "epoch": 8.987485779294653, "grad_norm": 5.3716020584106445, "learning_rate": 5.131578947368421e-06, "loss": 1.5268, "step": 7900 }, { "epoch": 9.0, "eval_gen_len": 128.39, "eval_loss": 1.7820578813552856, "eval_rouge1": 47.0113, "eval_rouge2": 23.0256, "eval_rougeL": 31.9372, "eval_rougeLsum": 42.0294, "eval_runtime": 68.0652, "eval_samples_per_second": 1.469, "eval_steps_per_second": 0.191, "step": 7911 }, { "epoch": 9.101251422070535, "grad_norm": 5.304577350616455, "learning_rate": 4.559496567505721e-06, "loss": 1.5266, "step": 8000 }, { "epoch": 9.215017064846416, "grad_norm": 41.307308197021484, "learning_rate": 3.987414187643021e-06, "loss": 1.5269, "step": 8100 }, { "epoch": 9.328782707622299, "grad_norm": 3.451307773590088, "learning_rate": 3.4153318077803206e-06, "loss": 1.5135, "step": 8200 }, { "epoch": 9.44254835039818, "grad_norm": 3.4578561782836914, "learning_rate": 2.8432494279176204e-06, "loss": 1.5204, "step": 8300 }, { "epoch": 9.556313993174061, "grad_norm": 4.3368096351623535, "learning_rate": 2.2711670480549198e-06, "loss": 1.5315, "step": 8400 }, { "epoch": 9.670079635949943, "grad_norm": 4.57019567489624, "learning_rate": 1.6990846681922198e-06, "loss": 1.5206, "step": 8500 }, { "epoch": 9.783845278725824, "grad_norm": 4.861795425415039, "learning_rate": 1.1270022883295195e-06, "loss": 1.5193, "step": 8600 }, { "epoch": 9.897610921501707, "grad_norm": 3.933220863342285, "learning_rate": 5.549199084668192e-07, "loss": 1.5362, "step": 8700 }, { "epoch": 10.0, "eval_gen_len": 128.35, "eval_loss": 1.7812010049819946, "eval_rouge1": 46.8688, "eval_rouge2": 23.0889, "eval_rougeL": 31.9785, "eval_rougeLsum": 41.911, "eval_runtime": 68.0802, "eval_samples_per_second": 1.469, "eval_steps_per_second": 0.191, "step": 8790 } ], "logging_steps": 100, "max_steps": 8790, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.031293483188224e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }