| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 10.0, |
| "eval_steps": 500, |
| "global_step": 7030, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.1422475106685633, |
| "grad_norm": 4.714789390563965, |
| "learning_rate": 4.9670487106017194e-05, |
| "loss": 2.0134, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.2844950213371266, |
| "grad_norm": 6.441630840301514, |
| "learning_rate": 4.89541547277937e-05, |
| "loss": 1.5541, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.4267425320056899, |
| "grad_norm": 4.592851638793945, |
| "learning_rate": 4.823782234957021e-05, |
| "loss": 1.489, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.5689900426742532, |
| "grad_norm": 3.297266960144043, |
| "learning_rate": 4.7521489971346707e-05, |
| "loss": 1.4536, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.7112375533428165, |
| "grad_norm": 2.3128268718719482, |
| "learning_rate": 4.680515759312321e-05, |
| "loss": 1.4203, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8534850640113798, |
| "grad_norm": 3.049949884414673, |
| "learning_rate": 4.608882521489972e-05, |
| "loss": 1.4094, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.9957325746799431, |
| "grad_norm": 2.860804319381714, |
| "learning_rate": 4.5372492836676226e-05, |
| "loss": 1.3611, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_gen_len": 192.4972, |
| "eval_loss": 1.2695759534835815, |
| "eval_rouge1": 49.1217, |
| "eval_rouge2": 25.983, |
| "eval_rougeL": 34.4639, |
| "eval_rougeLsum": 44.2763, |
| "eval_runtime": 1625.5106, |
| "eval_samples_per_second": 0.865, |
| "eval_steps_per_second": 0.108, |
| "step": 703 |
| }, |
| { |
| "epoch": 1.1379800853485065, |
| "grad_norm": 5.325246334075928, |
| "learning_rate": 4.4656160458452725e-05, |
| "loss": 1.347, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.2802275960170697, |
| "grad_norm": 4.638139724731445, |
| "learning_rate": 4.393982808022923e-05, |
| "loss": 1.294, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.422475106685633, |
| "grad_norm": 4.863597869873047, |
| "learning_rate": 4.322349570200573e-05, |
| "loss": 1.3274, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.5647226173541964, |
| "grad_norm": 4.364685535430908, |
| "learning_rate": 4.250716332378224e-05, |
| "loss": 1.305, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.7069701280227596, |
| "grad_norm": 3.9026546478271484, |
| "learning_rate": 4.179083094555874e-05, |
| "loss": 1.2867, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.8492176386913228, |
| "grad_norm": 18.247657775878906, |
| "learning_rate": 4.1074498567335244e-05, |
| "loss": 1.2879, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.991465149359886, |
| "grad_norm": 3.737813949584961, |
| "learning_rate": 4.035816618911175e-05, |
| "loss": 1.2731, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_gen_len": 145.7745, |
| "eval_loss": 1.2143858671188354, |
| "eval_rouge1": 54.2525, |
| "eval_rouge2": 29.3843, |
| "eval_rougeL": 37.7286, |
| "eval_rougeLsum": 49.2444, |
| "eval_runtime": 1195.305, |
| "eval_samples_per_second": 1.176, |
| "eval_steps_per_second": 0.147, |
| "step": 1406 |
| }, |
| { |
| "epoch": 2.1337126600284497, |
| "grad_norm": 3.5325093269348145, |
| "learning_rate": 3.964183381088825e-05, |
| "loss": 1.2249, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.275960170697013, |
| "grad_norm": 4.705758094787598, |
| "learning_rate": 3.8925501432664756e-05, |
| "loss": 1.228, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.418207681365576, |
| "grad_norm": 4.521617412567139, |
| "learning_rate": 3.820916905444126e-05, |
| "loss": 1.2261, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.5604551920341394, |
| "grad_norm": 2.7349600791931152, |
| "learning_rate": 3.749283667621777e-05, |
| "loss": 1.2261, |
| "step": 1800 |
| }, |
| { |
| "epoch": 2.7027027027027026, |
| "grad_norm": 3.1735689640045166, |
| "learning_rate": 3.677650429799427e-05, |
| "loss": 1.2177, |
| "step": 1900 |
| }, |
| { |
| "epoch": 2.844950213371266, |
| "grad_norm": 4.88203763961792, |
| "learning_rate": 3.6060171919770775e-05, |
| "loss": 1.2055, |
| "step": 2000 |
| }, |
| { |
| "epoch": 2.987197724039829, |
| "grad_norm": 2.383328437805176, |
| "learning_rate": 3.534383954154728e-05, |
| "loss": 1.2075, |
| "step": 2100 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_gen_len": 135.436, |
| "eval_loss": 1.1897257566452026, |
| "eval_rouge1": 54.4867, |
| "eval_rouge2": 29.2535, |
| "eval_rougeL": 37.5847, |
| "eval_rougeLsum": 49.4878, |
| "eval_runtime": 1293.6847, |
| "eval_samples_per_second": 1.087, |
| "eval_steps_per_second": 0.136, |
| "step": 2109 |
| }, |
| { |
| "epoch": 3.1294452347083928, |
| "grad_norm": 2.6979541778564453, |
| "learning_rate": 3.462750716332379e-05, |
| "loss": 1.1771, |
| "step": 2200 |
| }, |
| { |
| "epoch": 3.271692745376956, |
| "grad_norm": 3.8193747997283936, |
| "learning_rate": 3.391117478510029e-05, |
| "loss": 1.1714, |
| "step": 2300 |
| }, |
| { |
| "epoch": 3.413940256045519, |
| "grad_norm": 2.840989589691162, |
| "learning_rate": 3.3194842406876794e-05, |
| "loss": 1.161, |
| "step": 2400 |
| }, |
| { |
| "epoch": 3.5561877667140824, |
| "grad_norm": 2.8911194801330566, |
| "learning_rate": 3.24785100286533e-05, |
| "loss": 1.1764, |
| "step": 2500 |
| }, |
| { |
| "epoch": 3.6984352773826457, |
| "grad_norm": 4.388571739196777, |
| "learning_rate": 3.1762177650429806e-05, |
| "loss": 1.1403, |
| "step": 2600 |
| }, |
| { |
| "epoch": 3.8406827880512093, |
| "grad_norm": 3.452425956726074, |
| "learning_rate": 3.1045845272206306e-05, |
| "loss": 1.1293, |
| "step": 2700 |
| }, |
| { |
| "epoch": 3.9829302987197726, |
| "grad_norm": 3.1834287643432617, |
| "learning_rate": 3.032951289398281e-05, |
| "loss": 1.1413, |
| "step": 2800 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_gen_len": 132.9936, |
| "eval_loss": 1.1736302375793457, |
| "eval_rouge1": 54.9178, |
| "eval_rouge2": 29.6386, |
| "eval_rougeL": 37.8747, |
| "eval_rougeLsum": 49.9072, |
| "eval_runtime": 1855.2918, |
| "eval_samples_per_second": 0.758, |
| "eval_steps_per_second": 0.095, |
| "step": 2812 |
| }, |
| { |
| "epoch": 4.125177809388336, |
| "grad_norm": 5.237063407897949, |
| "learning_rate": 2.9613180515759315e-05, |
| "loss": 1.124, |
| "step": 2900 |
| }, |
| { |
| "epoch": 4.2674253200568995, |
| "grad_norm": 4.409717082977295, |
| "learning_rate": 2.889684813753582e-05, |
| "loss": 1.1012, |
| "step": 3000 |
| }, |
| { |
| "epoch": 4.409672830725462, |
| "grad_norm": 6.317205905914307, |
| "learning_rate": 2.818051575931232e-05, |
| "loss": 1.1045, |
| "step": 3100 |
| }, |
| { |
| "epoch": 4.551920341394026, |
| "grad_norm": 3.054473400115967, |
| "learning_rate": 2.7464183381088828e-05, |
| "loss": 1.0951, |
| "step": 3200 |
| }, |
| { |
| "epoch": 4.694167852062589, |
| "grad_norm": 7.386185169219971, |
| "learning_rate": 2.674785100286533e-05, |
| "loss": 1.1265, |
| "step": 3300 |
| }, |
| { |
| "epoch": 4.836415362731152, |
| "grad_norm": 2.541593074798584, |
| "learning_rate": 2.6031518624641837e-05, |
| "loss": 1.0964, |
| "step": 3400 |
| }, |
| { |
| "epoch": 4.978662873399715, |
| "grad_norm": 2.9089510440826416, |
| "learning_rate": 2.5322349570200578e-05, |
| "loss": 1.0824, |
| "step": 3500 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_gen_len": 129.9488, |
| "eval_loss": 1.1544321775436401, |
| "eval_rouge1": 55.1013, |
| "eval_rouge2": 29.8133, |
| "eval_rougeL": 37.9405, |
| "eval_rougeLsum": 50.0896, |
| "eval_runtime": 1082.5396, |
| "eval_samples_per_second": 1.299, |
| "eval_steps_per_second": 0.163, |
| "step": 3515 |
| }, |
| { |
| "epoch": 5.120910384068279, |
| "grad_norm": 4.814157962799072, |
| "learning_rate": 2.4606017191977078e-05, |
| "loss": 1.0571, |
| "step": 3600 |
| }, |
| { |
| "epoch": 5.2631578947368425, |
| "grad_norm": 3.1615419387817383, |
| "learning_rate": 2.388968481375358e-05, |
| "loss": 1.0891, |
| "step": 3700 |
| }, |
| { |
| "epoch": 5.405405405405405, |
| "grad_norm": 2.753258466720581, |
| "learning_rate": 2.3173352435530087e-05, |
| "loss": 1.0621, |
| "step": 3800 |
| }, |
| { |
| "epoch": 5.547652916073969, |
| "grad_norm": 2.6968796253204346, |
| "learning_rate": 2.245702005730659e-05, |
| "loss": 1.0714, |
| "step": 3900 |
| }, |
| { |
| "epoch": 5.689900426742532, |
| "grad_norm": 4.843164920806885, |
| "learning_rate": 2.1740687679083096e-05, |
| "loss": 1.0527, |
| "step": 4000 |
| }, |
| { |
| "epoch": 5.832147937411095, |
| "grad_norm": 4.297841548919678, |
| "learning_rate": 2.10243553008596e-05, |
| "loss": 1.0665, |
| "step": 4100 |
| }, |
| { |
| "epoch": 5.974395448079658, |
| "grad_norm": 3.3593056201934814, |
| "learning_rate": 2.0308022922636106e-05, |
| "loss": 1.0649, |
| "step": 4200 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_gen_len": 129.5334, |
| "eval_loss": 1.147682785987854, |
| "eval_rouge1": 55.3737, |
| "eval_rouge2": 30.0994, |
| "eval_rougeL": 38.1751, |
| "eval_rougeLsum": 50.2305, |
| "eval_runtime": 1585.0234, |
| "eval_samples_per_second": 0.887, |
| "eval_steps_per_second": 0.111, |
| "step": 4218 |
| }, |
| { |
| "epoch": 6.116642958748222, |
| "grad_norm": 2.796093225479126, |
| "learning_rate": 1.959169054441261e-05, |
| "loss": 1.0388, |
| "step": 4300 |
| }, |
| { |
| "epoch": 6.2588904694167855, |
| "grad_norm": 4.34324312210083, |
| "learning_rate": 1.8875358166189115e-05, |
| "loss": 1.0477, |
| "step": 4400 |
| }, |
| { |
| "epoch": 6.401137980085348, |
| "grad_norm": 5.817513465881348, |
| "learning_rate": 1.8159025787965618e-05, |
| "loss": 1.0484, |
| "step": 4500 |
| }, |
| { |
| "epoch": 6.543385490753912, |
| "grad_norm": 10.839672088623047, |
| "learning_rate": 1.744269340974212e-05, |
| "loss": 1.0401, |
| "step": 4600 |
| }, |
| { |
| "epoch": 6.685633001422475, |
| "grad_norm": 5.399308681488037, |
| "learning_rate": 1.6726361031518624e-05, |
| "loss": 1.0273, |
| "step": 4700 |
| }, |
| { |
| "epoch": 6.827880512091038, |
| "grad_norm": 3.037004232406616, |
| "learning_rate": 1.601002865329513e-05, |
| "loss": 1.0158, |
| "step": 4800 |
| }, |
| { |
| "epoch": 6.970128022759602, |
| "grad_norm": 4.032598972320557, |
| "learning_rate": 1.5293696275071634e-05, |
| "loss": 1.031, |
| "step": 4900 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_gen_len": 128.6415, |
| "eval_loss": 1.1399182081222534, |
| "eval_rouge1": 55.4367, |
| "eval_rouge2": 30.2958, |
| "eval_rougeL": 38.402, |
| "eval_rougeLsum": 50.4346, |
| "eval_runtime": 1073.7936, |
| "eval_samples_per_second": 1.309, |
| "eval_steps_per_second": 0.164, |
| "step": 4921 |
| }, |
| { |
| "epoch": 7.112375533428165, |
| "grad_norm": 5.663075923919678, |
| "learning_rate": 1.4577363896848137e-05, |
| "loss": 1.0312, |
| "step": 5000 |
| }, |
| { |
| "epoch": 7.2546230440967285, |
| "grad_norm": 3.34541654586792, |
| "learning_rate": 1.3861031518624643e-05, |
| "loss": 1.0059, |
| "step": 5100 |
| }, |
| { |
| "epoch": 7.396870554765291, |
| "grad_norm": 7.185023784637451, |
| "learning_rate": 1.3144699140401146e-05, |
| "loss": 1.0154, |
| "step": 5200 |
| }, |
| { |
| "epoch": 7.539118065433855, |
| "grad_norm": 3.5461268424987793, |
| "learning_rate": 1.242836676217765e-05, |
| "loss": 1.0037, |
| "step": 5300 |
| }, |
| { |
| "epoch": 7.681365576102419, |
| "grad_norm": 3.1910974979400635, |
| "learning_rate": 1.1712034383954155e-05, |
| "loss": 1.0116, |
| "step": 5400 |
| }, |
| { |
| "epoch": 7.823613086770981, |
| "grad_norm": 7.539901256561279, |
| "learning_rate": 1.099570200573066e-05, |
| "loss": 1.0162, |
| "step": 5500 |
| }, |
| { |
| "epoch": 7.965860597439545, |
| "grad_norm": 2.6586227416992188, |
| "learning_rate": 1.0279369627507165e-05, |
| "loss": 1.0169, |
| "step": 5600 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_gen_len": 128.6607, |
| "eval_loss": 1.1396645307540894, |
| "eval_rouge1": 55.3171, |
| "eval_rouge2": 30.1359, |
| "eval_rougeL": 38.2241, |
| "eval_rougeLsum": 50.2819, |
| "eval_runtime": 1048.6479, |
| "eval_samples_per_second": 1.341, |
| "eval_steps_per_second": 0.168, |
| "step": 5624 |
| }, |
| { |
| "epoch": 8.108108108108109, |
| "grad_norm": 13.898772239685059, |
| "learning_rate": 9.563037249283668e-06, |
| "loss": 1.009, |
| "step": 5700 |
| }, |
| { |
| "epoch": 8.250355618776672, |
| "grad_norm": 2.7014620304107666, |
| "learning_rate": 8.846704871060172e-06, |
| "loss": 0.9918, |
| "step": 5800 |
| }, |
| { |
| "epoch": 8.392603129445234, |
| "grad_norm": 3.315486192703247, |
| "learning_rate": 8.130372492836677e-06, |
| "loss": 1.0118, |
| "step": 5900 |
| }, |
| { |
| "epoch": 8.534850640113799, |
| "grad_norm": 3.5905227661132812, |
| "learning_rate": 7.414040114613182e-06, |
| "loss": 0.9963, |
| "step": 6000 |
| }, |
| { |
| "epoch": 8.677098150782362, |
| "grad_norm": 7.71924352645874, |
| "learning_rate": 6.6977077363896855e-06, |
| "loss": 0.9937, |
| "step": 6100 |
| }, |
| { |
| "epoch": 8.819345661450924, |
| "grad_norm": 2.987886905670166, |
| "learning_rate": 5.981375358166189e-06, |
| "loss": 0.983, |
| "step": 6200 |
| }, |
| { |
| "epoch": 8.961593172119487, |
| "grad_norm": 30.301179885864258, |
| "learning_rate": 5.265042979942693e-06, |
| "loss": 1.0011, |
| "step": 6300 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_gen_len": 128.5121, |
| "eval_loss": 1.1343382596969604, |
| "eval_rouge1": 55.7259, |
| "eval_rouge2": 30.5158, |
| "eval_rougeL": 38.443, |
| "eval_rougeLsum": 50.6675, |
| "eval_runtime": 1061.373, |
| "eval_samples_per_second": 1.325, |
| "eval_steps_per_second": 0.166, |
| "step": 6327 |
| }, |
| { |
| "epoch": 9.103840682788052, |
| "grad_norm": 3.3131797313690186, |
| "learning_rate": 4.548710601719198e-06, |
| "loss": 0.9831, |
| "step": 6400 |
| }, |
| { |
| "epoch": 9.246088193456615, |
| "grad_norm": 2.9355273246765137, |
| "learning_rate": 3.839541547277937e-06, |
| "loss": 0.9688, |
| "step": 6500 |
| }, |
| { |
| "epoch": 9.388335704125177, |
| "grad_norm": 3.5932939052581787, |
| "learning_rate": 3.1232091690544415e-06, |
| "loss": 0.975, |
| "step": 6600 |
| }, |
| { |
| "epoch": 9.530583214793742, |
| "grad_norm": 9.377333641052246, |
| "learning_rate": 2.4068767908309457e-06, |
| "loss": 0.9926, |
| "step": 6700 |
| }, |
| { |
| "epoch": 9.672830725462305, |
| "grad_norm": 2.7796216011047363, |
| "learning_rate": 1.69054441260745e-06, |
| "loss": 1.0019, |
| "step": 6800 |
| }, |
| { |
| "epoch": 9.815078236130867, |
| "grad_norm": 3.6320714950561523, |
| "learning_rate": 9.742120343839543e-07, |
| "loss": 0.971, |
| "step": 6900 |
| }, |
| { |
| "epoch": 9.95732574679943, |
| "grad_norm": 3.324120283126831, |
| "learning_rate": 2.5787965616045843e-07, |
| "loss": 1.004, |
| "step": 7000 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_gen_len": 128.5939, |
| "eval_loss": 1.1363346576690674, |
| "eval_rouge1": 55.76, |
| "eval_rouge2": 30.6092, |
| "eval_rougeL": 38.5818, |
| "eval_rougeLsum": 50.678, |
| "eval_runtime": 876.6103, |
| "eval_samples_per_second": 1.604, |
| "eval_steps_per_second": 0.201, |
| "step": 7030 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 7030, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.6250347865505792e+17, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|