{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 13.329551900170165,
  "eval_steps": 500,
  "global_step": 23500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.28360748723766305,
      "grad_norm": 4.999021530151367,
      "learning_rate": 1.971639251276234e-05,
      "loss": 6.7525,
      "step": 500
    },
    {
      "epoch": 0.5672149744753261,
      "grad_norm": 8.877035140991211,
      "learning_rate": 1.9432785025524678e-05,
      "loss": 4.8633,
      "step": 1000
    },
    {
      "epoch": 0.8508224617129893,
      "grad_norm": 5.504563808441162,
      "learning_rate": 1.9149177538287012e-05,
      "loss": 4.7848,
      "step": 1500
    },
    {
      "epoch": 1.0,
      "eval_runtime": 39.1159,
      "eval_samples_per_second": 5.011,
      "eval_steps_per_second": 5.011,
      "step": 1763
    },
    {
      "epoch": 1.1344299489506522,
      "grad_norm": 4.3093953132629395,
      "learning_rate": 1.886557005104935e-05,
      "loss": 4.7304,
      "step": 2000
    },
    {
      "epoch": 1.4180374361883152,
      "grad_norm": 8.88564395904541,
      "learning_rate": 1.8581962563811688e-05,
      "loss": 4.7621,
      "step": 2500
    },
    {
      "epoch": 1.7016449234259783,
      "grad_norm": 4.924105644226074,
      "learning_rate": 1.8298355076574022e-05,
      "loss": 4.7163,
      "step": 3000
    },
    {
      "epoch": 1.9852524106636416,
      "grad_norm": 5.222480773925781,
      "learning_rate": 1.801474758933636e-05,
      "loss": 4.6234,
      "step": 3500
    },
    {
      "epoch": 2.0,
      "eval_runtime": 40.3571,
      "eval_samples_per_second": 4.857,
      "eval_steps_per_second": 4.857,
      "step": 3526
    },
    {
      "epoch": 2.2688598979013044,
      "grad_norm": 4.8733978271484375,
      "learning_rate": 1.7731140102098695e-05,
      "loss": 4.7049,
      "step": 4000
    },
    {
      "epoch": 2.552467385138968,
      "grad_norm": 5.571890830993652,
      "learning_rate": 1.7447532614861033e-05,
      "loss": 4.5987,
      "step": 4500
    },
    {
      "epoch": 2.8360748723766305,
      "grad_norm": 5.371450424194336,
      "learning_rate": 1.716392512762337e-05,
      "loss": 4.6503,
      "step": 5000
    },
    {
      "epoch": 3.0,
      "eval_runtime": 36.7471,
      "eval_samples_per_second": 5.334,
      "eval_steps_per_second": 5.334,
      "step": 5289
    },
    {
      "epoch": 3.119682359614294,
      "grad_norm": 6.941244602203369,
      "learning_rate": 1.688031764038571e-05,
      "loss": 4.7147,
      "step": 5500
    },
    {
      "epoch": 3.403289846851957,
      "grad_norm": 7.103884220123291,
      "learning_rate": 1.6596710153148043e-05,
      "loss": 4.6078,
      "step": 6000
    },
    {
      "epoch": 3.68689733408962,
      "grad_norm": 6.286114692687988,
      "learning_rate": 1.631310266591038e-05,
      "loss": 4.6098,
      "step": 6500
    },
    {
      "epoch": 3.970504821327283,
      "grad_norm": 11.397751808166504,
      "learning_rate": 1.602949517867272e-05,
      "loss": 4.583,
      "step": 7000
    },
    {
      "epoch": 4.0,
      "eval_runtime": 39.4552,
      "eval_samples_per_second": 4.968,
      "eval_steps_per_second": 4.968,
      "step": 7052
    },
    {
      "epoch": 4.254112308564946,
      "grad_norm": 6.588546276092529,
      "learning_rate": 1.5745887691435057e-05,
      "loss": 4.6439,
      "step": 7500
    },
    {
      "epoch": 4.537719795802609,
      "grad_norm": 6.508044719696045,
      "learning_rate": 1.546228020419739e-05,
      "loss": 4.5772,
      "step": 8000
    },
    {
      "epoch": 4.821327283040272,
      "grad_norm": 13.856376647949219,
      "learning_rate": 1.5178672716959728e-05,
      "loss": 4.5647,
      "step": 8500
    },
    {
      "epoch": 5.0,
      "eval_runtime": 37.0746,
      "eval_samples_per_second": 5.287,
      "eval_steps_per_second": 5.287,
      "step": 8815
    },
    {
      "epoch": 5.104934770277936,
      "grad_norm": 11.172317504882812,
      "learning_rate": 1.4895065229722066e-05,
      "loss": 4.6058,
      "step": 9000
    },
    {
      "epoch": 5.388542257515598,
      "grad_norm": 9.843143463134766,
      "learning_rate": 1.4611457742484402e-05,
      "loss": 4.5818,
      "step": 9500
    },
    {
      "epoch": 5.672149744753262,
      "grad_norm": 6.524191856384277,
      "learning_rate": 1.432785025524674e-05,
      "loss": 4.5783,
      "step": 10000
    },
    {
      "epoch": 5.955757231990924,
      "grad_norm": 13.230880737304688,
      "learning_rate": 1.4044242768009078e-05,
      "loss": 4.527,
      "step": 10500
    },
    {
      "epoch": 6.0,
      "eval_runtime": 39.9935,
      "eval_samples_per_second": 4.901,
      "eval_steps_per_second": 4.901,
      "step": 10578
    },
    {
      "epoch": 6.239364719228588,
      "grad_norm": 10.95142936706543,
      "learning_rate": 1.3760635280771412e-05,
      "loss": 4.5873,
      "step": 11000
    },
    {
      "epoch": 6.5229722064662505,
      "grad_norm": 7.862865924835205,
      "learning_rate": 1.347702779353375e-05,
      "loss": 4.5154,
      "step": 11500
    },
    {
      "epoch": 6.806579693703914,
      "grad_norm": 7.10567045211792,
      "learning_rate": 1.3193420306296087e-05,
      "loss": 4.5425,
      "step": 12000
    },
    {
      "epoch": 7.0,
      "eval_runtime": 37.9349,
      "eval_samples_per_second": 5.167,
      "eval_steps_per_second": 5.167,
      "step": 12341
    },
    {
      "epoch": 7.090187180941577,
      "grad_norm": 8.122684478759766,
      "learning_rate": 1.2909812819058425e-05,
      "loss": 4.5825,
      "step": 12500
    },
    {
      "epoch": 7.37379466817924,
      "grad_norm": 10.316974639892578,
      "learning_rate": 1.262620533182076e-05,
      "loss": 4.5289,
      "step": 13000
    },
    {
      "epoch": 7.657402155416903,
      "grad_norm": 7.87723445892334,
      "learning_rate": 1.2342597844583097e-05,
      "loss": 4.4922,
      "step": 13500
    },
    {
      "epoch": 7.941009642654566,
      "grad_norm": 7.334928035736084,
      "learning_rate": 1.2058990357345435e-05,
      "loss": 4.5097,
      "step": 14000
    },
    {
      "epoch": 8.0,
      "eval_runtime": 38.0104,
      "eval_samples_per_second": 5.156,
      "eval_steps_per_second": 5.156,
      "step": 14104
    },
    {
      "epoch": 8.224617129892229,
      "grad_norm": 47.94816970825195,
      "learning_rate": 1.1775382870107773e-05,
      "loss": 4.4971,
      "step": 14500
    },
    {
      "epoch": 8.508224617129892,
      "grad_norm": 6.203786849975586,
      "learning_rate": 1.149177538287011e-05,
      "loss": 4.5237,
      "step": 15000
    },
    {
      "epoch": 8.791832104367556,
      "grad_norm": 7.99770975112915,
      "learning_rate": 1.1208167895632445e-05,
      "loss": 4.5136,
      "step": 15500
    },
    {
      "epoch": 9.0,
      "eval_runtime": 39.5756,
      "eval_samples_per_second": 4.953,
      "eval_steps_per_second": 4.953,
      "step": 15867
    },
    {
      "epoch": 9.075439591605218,
      "grad_norm": 7.574156761169434,
      "learning_rate": 1.0924560408394782e-05,
      "loss": 4.5636,
      "step": 16000
    },
    {
      "epoch": 9.359047078842881,
      "grad_norm": 17.46339225769043,
      "learning_rate": 1.064095292115712e-05,
      "loss": 4.4819,
      "step": 16500
    },
    {
      "epoch": 9.642654566080544,
      "grad_norm": 11.29720401763916,
      "learning_rate": 1.0357345433919458e-05,
      "loss": 4.5055,
      "step": 17000
    },
    {
      "epoch": 9.926262053318208,
      "grad_norm": 8.761754035949707,
      "learning_rate": 1.0073737946681794e-05,
      "loss": 4.4984,
      "step": 17500
    },
    {
      "epoch": 10.0,
      "eval_runtime": 38.8099,
      "eval_samples_per_second": 5.05,
      "eval_steps_per_second": 5.05,
      "step": 17630
    },
    {
      "epoch": 10.209869540555871,
      "grad_norm": 30.768444061279297,
      "learning_rate": 9.79013045944413e-06,
      "loss": 4.4927,
      "step": 18000
    },
    {
      "epoch": 10.493477027793533,
      "grad_norm": 9.923407554626465,
      "learning_rate": 9.506522972206466e-06,
      "loss": 4.5199,
      "step": 18500
    },
    {
      "epoch": 10.777084515031197,
      "grad_norm": 7.2937140464782715,
      "learning_rate": 9.222915484968804e-06,
      "loss": 4.4576,
      "step": 19000
    },
    {
      "epoch": 11.0,
      "eval_runtime": 39.2629,
      "eval_samples_per_second": 4.992,
      "eval_steps_per_second": 4.992,
      "step": 19393
    },
    {
      "epoch": 11.06069200226886,
      "grad_norm": 8.553272247314453,
      "learning_rate": 8.93930799773114e-06,
      "loss": 4.4701,
      "step": 19500
    },
    {
      "epoch": 11.344299489506524,
      "grad_norm": 11.599752426147461,
      "learning_rate": 8.655700510493478e-06,
      "loss": 4.48,
      "step": 20000
    },
    {
      "epoch": 11.627906976744185,
      "grad_norm": 11.62510871887207,
      "learning_rate": 8.372093023255815e-06,
      "loss": 4.5049,
      "step": 20500
    },
    {
      "epoch": 11.911514463981849,
      "grad_norm": 10.712797164916992,
      "learning_rate": 8.08848553601815e-06,
      "loss": 4.4717,
      "step": 21000
    },
    {
      "epoch": 12.0,
      "eval_runtime": 38.3604,
      "eval_samples_per_second": 5.109,
      "eval_steps_per_second": 5.109,
      "step": 21156
    },
    {
      "epoch": 12.195121951219512,
      "grad_norm": 10.394380569458008,
      "learning_rate": 7.804878048780489e-06,
      "loss": 4.4372,
      "step": 21500
    },
    {
      "epoch": 12.478729438457176,
      "grad_norm": 10.594198226928711,
      "learning_rate": 7.521270561542825e-06,
      "loss": 4.4598,
      "step": 22000
    },
    {
      "epoch": 12.762336925694838,
      "grad_norm": 7.086946487426758,
      "learning_rate": 7.237663074305162e-06,
      "loss": 4.4651,
      "step": 22500
    },
    {
      "epoch": 13.0,
      "eval_runtime": 38.5026,
      "eval_samples_per_second": 5.091,
      "eval_steps_per_second": 5.091,
      "step": 22919
    },
    {
      "epoch": 13.045944412932501,
      "grad_norm": 6.127635955810547,
      "learning_rate": 6.954055587067498e-06,
      "loss": 4.5153,
      "step": 23000
    },
    {
      "epoch": 13.329551900170165,
      "grad_norm": 9.139135360717773,
      "learning_rate": 6.670448099829836e-06,
      "loss": 4.4323,
      "step": 23500
    }
  ],
  "logging_steps": 500,
  "max_steps": 35260,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 20,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 4.77304876695552e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}