{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.6413997492305938, "eval_steps": 720, "global_step": 28800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "grad_norm": 0.535019040107727, "learning_rate": 9.864353971540931e-05, "loss": 0.3127, "step": 720 }, { "epoch": 0.04, "eval_bertscore": 0.7034942507743835, "eval_loss": 0.12081495672464371, "eval_rouge1": 0.5240594026277678, "eval_rouge2": 0.3067757525245376, "eval_rougeL": 0.39633186458379827, "eval_rougeLsum": 0.39613387865495875, "eval_runtime": 80.0774, "eval_samples_per_second": 0.674, "eval_steps_per_second": 0.337, "step": 720 }, { "epoch": 0.08, "grad_norm": 0.42372021079063416, "learning_rate": 9.72756806048977e-05, "loss": 0.1212, "step": 1440 }, { "epoch": 0.08, "eval_bertscore": 0.7000990509986877, "eval_loss": 0.11524277925491333, "eval_rouge1": 0.5216793756151761, "eval_rouge2": 0.2934274790368596, "eval_rougeL": 0.3886043968581182, "eval_rougeLsum": 0.38807827948983176, "eval_runtime": 76.9989, "eval_samples_per_second": 0.701, "eval_steps_per_second": 0.351, "step": 1440 }, { "epoch": 0.12, "grad_norm": 0.42828959226608276, "learning_rate": 9.590782149438608e-05, "loss": 0.1222, "step": 2160 }, { "epoch": 0.12, "eval_bertscore": 0.6457424163818359, "eval_loss": 0.1191863939166069, "eval_rouge1": 0.4637245182501467, "eval_rouge2": 0.25286895940302717, "eval_rougeL": 0.34111914002345234, "eval_rougeLsum": 0.3407154814401842, "eval_runtime": 103.8897, "eval_samples_per_second": 0.52, "eval_steps_per_second": 0.26, "step": 2160 }, { "epoch": 0.16, "grad_norm": 0.44599342346191406, "learning_rate": 9.453996238387447e-05, "loss": 0.1172, "step": 2880 }, { "epoch": 0.16, "eval_bertscore": 0.6841260194778442, "eval_loss": 0.11394956707954407, "eval_rouge1": 0.48661400042430136, "eval_rouge2": 0.27298564925044316, "eval_rougeL": 0.371859286615548, "eval_rougeLsum": 0.37045985133751724, "eval_runtime": 67.7701, "eval_samples_per_second": 0.797, "eval_steps_per_second": 0.398, "step": 2880 }, { "epoch": 0.21, "grad_norm": 0.4306412935256958, "learning_rate": 9.317210327336284e-05, "loss": 0.1265, "step": 3600 }, { "epoch": 0.21, "eval_bertscore": 0.5902894139289856, "eval_loss": 0.1217198297381401, "eval_rouge1": 0.3684193429088964, "eval_rouge2": 0.19503224358227592, "eval_rougeL": 0.29399973276202795, "eval_rougeLsum": 0.29348115386759427, "eval_runtime": 58.6683, "eval_samples_per_second": 0.92, "eval_steps_per_second": 0.46, "step": 3600 }, { "epoch": 0.25, "grad_norm": 0.4569800794124603, "learning_rate": 9.180424416285122e-05, "loss": 0.1201, "step": 4320 }, { "epoch": 0.25, "eval_bertscore": 0.6664375066757202, "eval_loss": 0.11453992873430252, "eval_rouge1": 0.4849966834372761, "eval_rouge2": 0.27797369581531306, "eval_rougeL": 0.3629296708263681, "eval_rougeLsum": 0.3623800251579623, "eval_runtime": 81.2927, "eval_samples_per_second": 0.664, "eval_steps_per_second": 0.332, "step": 4320 }, { "epoch": 0.29, "grad_norm": 0.4236726462841034, "learning_rate": 9.043638505233961e-05, "loss": 0.1171, "step": 5040 }, { "epoch": 0.29, "eval_bertscore": 0.6511832475662231, "eval_loss": 0.1140449047088623, "eval_rouge1": 0.4622762558490229, "eval_rouge2": 0.2720739144786822, "eval_rougeL": 0.35563312683133363, "eval_rougeLsum": 0.3553109181928958, "eval_runtime": 96.9669, "eval_samples_per_second": 0.557, "eval_steps_per_second": 0.278, "step": 5040 }, { "epoch": 0.33, "grad_norm": 0.4153118431568146, "learning_rate": 8.9068525941828e-05, "loss": 0.1182, "step": 5760 }, { "epoch": 0.33, "eval_bertscore": 0.7056278586387634, "eval_loss": 0.11447593569755554, "eval_rouge1": 0.535980541670685, "eval_rouge2": 0.3085487182685619, "eval_rougeL": 0.3952747177668595, "eval_rougeLsum": 0.39487374036594847, "eval_runtime": 78.3273, "eval_samples_per_second": 0.689, "eval_steps_per_second": 0.345, "step": 5760 }, { "epoch": 0.37, "grad_norm": 0.30859124660491943, "learning_rate": 8.770066683131638e-05, "loss": 0.1161, "step": 6480 }, { "epoch": 0.37, "eval_bertscore": 0.717333197593689, "eval_loss": 0.11316747963428497, "eval_rouge1": 0.5395779598124536, "eval_rouge2": 0.3235379995103774, "eval_rougeL": 0.40115447322823283, "eval_rougeLsum": 0.40261399344054405, "eval_runtime": 77.4572, "eval_samples_per_second": 0.697, "eval_steps_per_second": 0.349, "step": 6480 }, { "epoch": 0.41, "grad_norm": 0.3857922852039337, "learning_rate": 8.633280772080476e-05, "loss": 0.1151, "step": 7200 }, { "epoch": 0.41, "eval_bertscore": 0.7019616365432739, "eval_loss": 0.11000501364469528, "eval_rouge1": 0.5236493302484355, "eval_rouge2": 0.3068191529551719, "eval_rougeL": 0.4036664284755191, "eval_rougeLsum": 0.4040689187486951, "eval_runtime": 76.1117, "eval_samples_per_second": 0.709, "eval_steps_per_second": 0.355, "step": 7200 }, { "epoch": 0.45, "grad_norm": 0.3432803452014923, "learning_rate": 8.496494861029315e-05, "loss": 0.1144, "step": 7920 }, { "epoch": 0.45, "eval_bertscore": 0.7042035460472107, "eval_loss": 0.11131834983825684, "eval_rouge1": 0.5294383728708514, "eval_rouge2": 0.3003098501119716, "eval_rougeL": 0.3967273111533241, "eval_rougeLsum": 0.39619485281011757, "eval_runtime": 77.3811, "eval_samples_per_second": 0.698, "eval_steps_per_second": 0.349, "step": 7920 }, { "epoch": 0.49, "grad_norm": 0.3402859568595886, "learning_rate": 8.359708949978152e-05, "loss": 0.1126, "step": 8640 }, { "epoch": 0.49, "eval_bertscore": 0.71971595287323, "eval_loss": 0.11067274957895279, "eval_rouge1": 0.551885774991056, "eval_rouge2": 0.33499475588298316, "eval_rougeL": 0.4160407628361842, "eval_rougeLsum": 0.4164543392695917, "eval_runtime": 77.4902, "eval_samples_per_second": 0.697, "eval_steps_per_second": 0.348, "step": 8640 }, { "epoch": 0.53, "grad_norm": 0.4391550123691559, "learning_rate": 8.223113019359007e-05, "loss": 0.1116, "step": 9360 }, { "epoch": 0.53, "eval_bertscore": 0.7158631086349487, "eval_loss": 0.1099533885717392, "eval_rouge1": 0.5557272797557176, "eval_rouge2": 0.332779980249166, "eval_rougeL": 0.4155444723963883, "eval_rougeLsum": 0.41657130732656783, "eval_runtime": 80.0828, "eval_samples_per_second": 0.674, "eval_steps_per_second": 0.337, "step": 9360 }, { "epoch": 0.57, "grad_norm": 0.3907322287559509, "learning_rate": 8.086327108307846e-05, "loss": 0.1141, "step": 10080 }, { "epoch": 0.57, "eval_bertscore": 0.7130799293518066, "eval_loss": 0.11258435994386673, "eval_rouge1": 0.5457292777447704, "eval_rouge2": 0.3214033358835623, "eval_rougeL": 0.40814606110656115, "eval_rougeLsum": 0.4086806368595041, "eval_runtime": 73.8407, "eval_samples_per_second": 0.731, "eval_steps_per_second": 0.366, "step": 10080 }, { "epoch": 0.62, "grad_norm": 0.38848328590393066, "learning_rate": 7.949541197256683e-05, "loss": 0.1132, "step": 10800 }, { "epoch": 0.62, "eval_bertscore": 0.7245057225227356, "eval_loss": 0.11034353822469711, "eval_rouge1": 0.5603983140826179, "eval_rouge2": 0.3445987526625777, "eval_rougeL": 0.43423536113182604, "eval_rougeLsum": 0.43398163455334016, "eval_runtime": 77.732, "eval_samples_per_second": 0.695, "eval_steps_per_second": 0.347, "step": 10800 }, { "epoch": 0.66, "grad_norm": 0.37145286798477173, "learning_rate": 7.812945266637537e-05, "loss": 0.112, "step": 11520 }, { "epoch": 0.66, "eval_bertscore": 0.7207842469215393, "eval_loss": 0.11157318204641342, "eval_rouge1": 0.5554178283606811, "eval_rouge2": 0.3317069905744905, "eval_rougeL": 0.4209451268922738, "eval_rougeLsum": 0.42120272115590573, "eval_runtime": 81.6277, "eval_samples_per_second": 0.662, "eval_steps_per_second": 0.331, "step": 11520 }, { "epoch": 0.7, "grad_norm": 0.32327908277511597, "learning_rate": 7.676349336018391e-05, "loss": 0.1118, "step": 12240 }, { "epoch": 0.7, "eval_bertscore": 0.7193225622177124, "eval_loss": 0.11037024855613708, "eval_rouge1": 0.5534709029702873, "eval_rouge2": 0.33508595975393674, "eval_rougeL": 0.4220660586810759, "eval_rougeLsum": 0.42394444829473793, "eval_runtime": 79.4778, "eval_samples_per_second": 0.679, "eval_steps_per_second": 0.34, "step": 12240 }, { "epoch": 0.74, "grad_norm": 0.296165406703949, "learning_rate": 7.539563424967229e-05, "loss": 0.1096, "step": 12960 }, { "epoch": 0.74, "eval_bertscore": 0.7183234691619873, "eval_loss": 0.10668845474720001, "eval_rouge1": 0.5527080110711662, "eval_rouge2": 0.3304597058226536, "eval_rougeL": 0.4176676998826935, "eval_rougeLsum": 0.41906982236369805, "eval_runtime": 74.609, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.362, "step": 12960 }, { "epoch": 0.78, "grad_norm": 0.3004627525806427, "learning_rate": 7.402777513916068e-05, "loss": 0.1105, "step": 13680 }, { "epoch": 0.78, "eval_bertscore": 0.7093863487243652, "eval_loss": 0.1068594753742218, "eval_rouge1": 0.5386027107080774, "eval_rouge2": 0.3174670612173311, "eval_rougeL": 0.4089464604886982, "eval_rougeLsum": 0.40954043741634194, "eval_runtime": 76.1174, "eval_samples_per_second": 0.709, "eval_steps_per_second": 0.355, "step": 13680 }, { "epoch": 0.82, "grad_norm": 0.4122227132320404, "learning_rate": 7.265991602864905e-05, "loss": 0.1094, "step": 14400 }, { "epoch": 0.82, "eval_bertscore": 0.7156451344490051, "eval_loss": 0.10706545412540436, "eval_rouge1": 0.5522097394348282, "eval_rouge2": 0.3376815877629147, "eval_rougeL": 0.41094798705443536, "eval_rougeLsum": 0.41185755780524297, "eval_runtime": 79.5068, "eval_samples_per_second": 0.679, "eval_steps_per_second": 0.34, "step": 14400 }, { "epoch": 0.86, "grad_norm": 0.3019055128097534, "learning_rate": 7.129205691813743e-05, "loss": 0.1047, "step": 15120 }, { "epoch": 0.86, "eval_bertscore": 0.723181962966919, "eval_loss": 0.10513070970773697, "eval_rouge1": 0.5597833953895566, "eval_rouge2": 0.3368159976094224, "eval_rougeL": 0.4251112326345452, "eval_rougeLsum": 0.4271018761152323, "eval_runtime": 72.9275, "eval_samples_per_second": 0.74, "eval_steps_per_second": 0.37, "step": 15120 }, { "epoch": 0.9, "grad_norm": 0.39543616771698, "learning_rate": 6.992609761194597e-05, "loss": 0.106, "step": 15840 }, { "epoch": 0.9, "eval_bertscore": 0.7264233231544495, "eval_loss": 0.10471142083406448, "eval_rouge1": 0.5607444186855683, "eval_rouge2": 0.32933852525922336, "eval_rougeL": 0.4164104876659622, "eval_rougeLsum": 0.4178921783444509, "eval_runtime": 79.4632, "eval_samples_per_second": 0.68, "eval_steps_per_second": 0.34, "step": 15840 }, { "epoch": 0.94, "grad_norm": 0.17296220362186432, "learning_rate": 6.855823850143436e-05, "loss": 0.1085, "step": 16560 }, { "epoch": 0.94, "eval_bertscore": 0.7186797261238098, "eval_loss": 0.10288402438163757, "eval_rouge1": 0.5499937534452628, "eval_rouge2": 0.33202955320606253, "eval_rougeL": 0.41109499153735635, "eval_rougeLsum": 0.4129325173744952, "eval_runtime": 74.2816, "eval_samples_per_second": 0.727, "eval_steps_per_second": 0.363, "step": 16560 }, { "epoch": 0.98, "grad_norm": 0.34968239068984985, "learning_rate": 6.719037939092274e-05, "loss": 0.1064, "step": 17280 }, { "epoch": 0.98, "eval_bertscore": 0.715437650680542, "eval_loss": 0.10678575932979584, "eval_rouge1": 0.5487884139639068, "eval_rouge2": 0.3287484312214649, "eval_rougeL": 0.4115546192599129, "eval_rougeLsum": 0.4129129108454481, "eval_runtime": 77.9977, "eval_samples_per_second": 0.692, "eval_steps_per_second": 0.346, "step": 17280 }, { "epoch": 1.03, "grad_norm": 0.22770258784294128, "learning_rate": 6.582252028041113e-05, "loss": 0.094, "step": 18000 }, { "epoch": 1.03, "eval_bertscore": 0.7268933653831482, "eval_loss": 0.10910864919424057, "eval_rouge1": 0.5644640432420631, "eval_rouge2": 0.34856910757450765, "eval_rougeL": 0.4334348850734425, "eval_rougeLsum": 0.4322774316283801, "eval_runtime": 70.7723, "eval_samples_per_second": 0.763, "eval_steps_per_second": 0.382, "step": 18000 }, { "epoch": 1.07, "grad_norm": 0.2036217600107193, "learning_rate": 6.44546611698995e-05, "loss": 0.0864, "step": 18720 }, { "epoch": 1.07, "eval_bertscore": 0.7298507690429688, "eval_loss": 0.1051657572388649, "eval_rouge1": 0.5693416283658175, "eval_rouge2": 0.3547090481291705, "eval_rougeL": 0.4367412765285528, "eval_rougeLsum": 0.4370252833034207, "eval_runtime": 74.7048, "eval_samples_per_second": 0.723, "eval_steps_per_second": 0.361, "step": 18720 }, { "epoch": 1.11, "grad_norm": 0.3400803804397583, "learning_rate": 6.308680205938788e-05, "loss": 0.0846, "step": 19440 }, { "epoch": 1.11, "eval_bertscore": 0.7288545966148376, "eval_loss": 0.1069113239645958, "eval_rouge1": 0.5633722381222108, "eval_rouge2": 0.337377454492796, "eval_rougeL": 0.4349115421710151, "eval_rougeLsum": 0.43561356852158567, "eval_runtime": 73.5552, "eval_samples_per_second": 0.734, "eval_steps_per_second": 0.367, "step": 19440 }, { "epoch": 1.15, "grad_norm": 0.3360745310783386, "learning_rate": 6.172084275319642e-05, "loss": 0.0875, "step": 20160 }, { "epoch": 1.15, "eval_bertscore": 0.715953528881073, "eval_loss": 0.10425002127885818, "eval_rouge1": 0.5548398737384996, "eval_rouge2": 0.33589277481599067, "eval_rougeL": 0.42137114864331937, "eval_rougeLsum": 0.4231469615029759, "eval_runtime": 78.0304, "eval_samples_per_second": 0.692, "eval_steps_per_second": 0.346, "step": 20160 }, { "epoch": 1.19, "grad_norm": 0.4246189594268799, "learning_rate": 6.03529836426848e-05, "loss": 0.0868, "step": 20880 }, { "epoch": 1.19, "eval_bertscore": 0.7299396395683289, "eval_loss": 0.10365325212478638, "eval_rouge1": 0.5715394315498017, "eval_rouge2": 0.34427400662165897, "eval_rougeL": 0.433027526044127, "eval_rougeLsum": 0.4347450430032858, "eval_runtime": 74.0473, "eval_samples_per_second": 0.729, "eval_steps_per_second": 0.365, "step": 20880 }, { "epoch": 1.23, "grad_norm": 0.2776849865913391, "learning_rate": 5.898512453217319e-05, "loss": 0.0854, "step": 21600 }, { "epoch": 1.23, "eval_bertscore": 0.7214290499687195, "eval_loss": 0.10122980922460556, "eval_rouge1": 0.5565823263453793, "eval_rouge2": 0.3393375143994867, "eval_rougeL": 0.4156140884756716, "eval_rougeLsum": 0.41819540905867203, "eval_runtime": 73.2235, "eval_samples_per_second": 0.737, "eval_steps_per_second": 0.369, "step": 21600 }, { "epoch": 1.27, "grad_norm": 0.3710538446903229, "learning_rate": 5.761726542166157e-05, "loss": 0.0845, "step": 22320 }, { "epoch": 1.27, "eval_bertscore": 0.7201518416404724, "eval_loss": 0.10378885269165039, "eval_rouge1": 0.5441295123980776, "eval_rouge2": 0.33155064058257405, "eval_rougeL": 0.42094247090226844, "eval_rougeLsum": 0.42274633038817555, "eval_runtime": 76.6313, "eval_samples_per_second": 0.705, "eval_steps_per_second": 0.352, "step": 22320 }, { "epoch": 1.31, "grad_norm": 0.5819060206413269, "learning_rate": 5.624940631114996e-05, "loss": 0.0861, "step": 23040 }, { "epoch": 1.31, "eval_bertscore": 0.7142701148986816, "eval_loss": 0.10384026169776917, "eval_rouge1": 0.5458184588249984, "eval_rouge2": 0.3290922169442115, "eval_rougeL": 0.4214855047650181, "eval_rougeLsum": 0.4239018723206239, "eval_runtime": 79.4541, "eval_samples_per_second": 0.68, "eval_steps_per_second": 0.34, "step": 23040 }, { "epoch": 1.35, "grad_norm": 0.2952657639980316, "learning_rate": 5.488154720063834e-05, "loss": 0.0862, "step": 23760 }, { "epoch": 1.35, "eval_bertscore": 0.7302463054656982, "eval_loss": 0.10171066224575043, "eval_rouge1": 0.564237466077122, "eval_rouge2": 0.346632021192653, "eval_rougeL": 0.44007571581541377, "eval_rougeLsum": 0.4408434182223313, "eval_runtime": 70.4368, "eval_samples_per_second": 0.767, "eval_steps_per_second": 0.383, "step": 23760 }, { "epoch": 1.4, "grad_norm": 0.3152740001678467, "learning_rate": 5.351368809012672e-05, "loss": 0.0858, "step": 24480 }, { "epoch": 1.4, "eval_bertscore": 0.7205690741539001, "eval_loss": 0.10222817957401276, "eval_rouge1": 0.561963492920594, "eval_rouge2": 0.3366175149143015, "eval_rougeL": 0.4370056834486044, "eval_rougeLsum": 0.4383325343921459, "eval_runtime": 70.7631, "eval_samples_per_second": 0.763, "eval_steps_per_second": 0.382, "step": 24480 }, { "epoch": 1.44, "grad_norm": 0.3384862542152405, "learning_rate": 5.214772878393526e-05, "loss": 0.0868, "step": 25200 }, { "epoch": 1.44, "eval_bertscore": 0.7201054096221924, "eval_loss": 0.10058918595314026, "eval_rouge1": 0.5506843793300468, "eval_rouge2": 0.3305447880283259, "eval_rougeL": 0.4221671281003694, "eval_rougeLsum": 0.42405735392085775, "eval_runtime": 73.3661, "eval_samples_per_second": 0.736, "eval_steps_per_second": 0.368, "step": 25200 }, { "epoch": 1.48, "grad_norm": 0.2858143150806427, "learning_rate": 5.078176947774379e-05, "loss": 0.0851, "step": 25920 }, { "epoch": 1.48, "eval_bertscore": 0.7324591875076294, "eval_loss": 0.10030569136142731, "eval_rouge1": 0.5711881175991272, "eval_rouge2": 0.35036140380824915, "eval_rougeL": 0.44736244718696055, "eval_rougeLsum": 0.44882200145887735, "eval_runtime": 73.2375, "eval_samples_per_second": 0.737, "eval_steps_per_second": 0.369, "step": 25920 }, { "epoch": 1.52, "grad_norm": 0.34586507081985474, "learning_rate": 4.941391036723218e-05, "loss": 0.0839, "step": 26640 }, { "epoch": 1.52, "eval_bertscore": 0.7323827147483826, "eval_loss": 0.10078810900449753, "eval_rouge1": 0.5643411922408847, "eval_rouge2": 0.35335509416724475, "eval_rougeL": 0.4412030311945061, "eval_rougeLsum": 0.4423071630624772, "eval_runtime": 78.0237, "eval_samples_per_second": 0.692, "eval_steps_per_second": 0.346, "step": 26640 }, { "epoch": 1.56, "grad_norm": 0.35009488463401794, "learning_rate": 4.804605125672056e-05, "loss": 0.0843, "step": 27360 }, { "epoch": 1.56, "eval_bertscore": 0.7391833662986755, "eval_loss": 0.09879420697689056, "eval_rouge1": 0.5744063451356638, "eval_rouge2": 0.3631199161982914, "eval_rougeL": 0.44665302719291095, "eval_rougeLsum": 0.44897406269269213, "eval_runtime": 80.5403, "eval_samples_per_second": 0.67, "eval_steps_per_second": 0.335, "step": 27360 }, { "epoch": 1.6, "grad_norm": 0.3002821207046509, "learning_rate": 4.667819214620894e-05, "loss": 0.085, "step": 28080 }, { "epoch": 1.6, "eval_bertscore": 0.7407130002975464, "eval_loss": 0.09699860215187073, "eval_rouge1": 0.5762741233248756, "eval_rouge2": 0.3544722421313946, "eval_rougeL": 0.4384246085216507, "eval_rougeLsum": 0.4390526517186611, "eval_runtime": 78.2681, "eval_samples_per_second": 0.69, "eval_steps_per_second": 0.345, "step": 28080 }, { "epoch": 1.64, "grad_norm": 0.241718590259552, "learning_rate": 4.5310333035697325e-05, "loss": 0.0845, "step": 28800 }, { "epoch": 1.64, "eval_bertscore": 0.7392789125442505, "eval_loss": 0.09867523610591888, "eval_rouge1": 0.580719658129176, "eval_rouge2": 0.3694474172593357, "eval_rougeL": 0.456964934995113, "eval_rougeLsum": 0.45917370226539334, "eval_runtime": 80.3113, "eval_samples_per_second": 0.672, "eval_steps_per_second": 0.336, "step": 28800 } ], "logging_steps": 720, "max_steps": 52638, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 2880, "total_flos": 2.496482830587003e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }