| { |
| "best_metric": 7.7564, |
| "best_model_checkpoint": "moore_MT/checkpoint-5000", |
| "epoch": 23.964017991004496, |
| "eval_steps": 5000, |
| "global_step": 7992, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.29985007496251875, |
| "grad_norm": 14.79597282409668, |
| "learning_rate": 1.2000000000000002e-06, |
| "loss": 14.0578, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.5997001499250375, |
| "grad_norm": 12.179410934448242, |
| "learning_rate": 2.4000000000000003e-06, |
| "loss": 11.5256, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.8995502248875562, |
| "grad_norm": 11.735830307006836, |
| "learning_rate": 3.6e-06, |
| "loss": 9.8654, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.199400299850075, |
| "grad_norm": 11.794581413269043, |
| "learning_rate": 4.800000000000001e-06, |
| "loss": 8.9024, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.4992503748125938, |
| "grad_norm": 12.569659233093262, |
| "learning_rate": 6e-06, |
| "loss": 8.0761, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.7991004497751124, |
| "grad_norm": 11.602639198303223, |
| "learning_rate": 7.2e-06, |
| "loss": 7.5104, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.098950524737631, |
| "grad_norm": 9.385009765625, |
| "learning_rate": 8.400000000000001e-06, |
| "loss": 7.0449, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.39880059970015, |
| "grad_norm": 9.674553871154785, |
| "learning_rate": 9.600000000000001e-06, |
| "loss": 6.5535, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.6986506746626686, |
| "grad_norm": 10.834465980529785, |
| "learning_rate": 1.08e-05, |
| "loss": 6.1979, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.9985007496251876, |
| "grad_norm": 11.001913070678711, |
| "learning_rate": 1.2e-05, |
| "loss": 6.091, |
| "step": 1000 |
| }, |
| { |
| "epoch": 3.2983508245877062, |
| "grad_norm": 10.693511962890625, |
| "learning_rate": 1.32e-05, |
| "loss": 5.6784, |
| "step": 1100 |
| }, |
| { |
| "epoch": 3.598200899550225, |
| "grad_norm": 12.01541519165039, |
| "learning_rate": 1.44e-05, |
| "loss": 5.506, |
| "step": 1200 |
| }, |
| { |
| "epoch": 3.898050974512744, |
| "grad_norm": 11.18288516998291, |
| "learning_rate": 1.56e-05, |
| "loss": 5.3811, |
| "step": 1300 |
| }, |
| { |
| "epoch": 4.197901049475262, |
| "grad_norm": 11.982161521911621, |
| "learning_rate": 1.6800000000000002e-05, |
| "loss": 5.0964, |
| "step": 1400 |
| }, |
| { |
| "epoch": 4.497751124437781, |
| "grad_norm": 9.406704902648926, |
| "learning_rate": 1.8e-05, |
| "loss": 4.9189, |
| "step": 1500 |
| }, |
| { |
| "epoch": 4.7976011994003, |
| "grad_norm": 11.488901138305664, |
| "learning_rate": 1.9200000000000003e-05, |
| "loss": 4.8511, |
| "step": 1600 |
| }, |
| { |
| "epoch": 5.097451274362818, |
| "grad_norm": 11.708683967590332, |
| "learning_rate": 2.04e-05, |
| "loss": 4.6344, |
| "step": 1700 |
| }, |
| { |
| "epoch": 5.397301349325337, |
| "grad_norm": 9.575274467468262, |
| "learning_rate": 2.16e-05, |
| "loss": 4.4337, |
| "step": 1800 |
| }, |
| { |
| "epoch": 5.697151424287856, |
| "grad_norm": 12.235701560974121, |
| "learning_rate": 2.2800000000000002e-05, |
| "loss": 4.2986, |
| "step": 1900 |
| }, |
| { |
| "epoch": 5.997001499250375, |
| "grad_norm": 9.145214080810547, |
| "learning_rate": 2.4e-05, |
| "loss": 4.3928, |
| "step": 2000 |
| }, |
| { |
| "epoch": 6.296851574212893, |
| "grad_norm": 9.224031448364258, |
| "learning_rate": 2.52e-05, |
| "loss": 3.8942, |
| "step": 2100 |
| }, |
| { |
| "epoch": 6.5967016491754125, |
| "grad_norm": 13.213356018066406, |
| "learning_rate": 2.64e-05, |
| "loss": 3.9475, |
| "step": 2200 |
| }, |
| { |
| "epoch": 6.896551724137931, |
| "grad_norm": 10.31118106842041, |
| "learning_rate": 2.7600000000000003e-05, |
| "loss": 3.9677, |
| "step": 2300 |
| }, |
| { |
| "epoch": 7.19640179910045, |
| "grad_norm": 8.905426979064941, |
| "learning_rate": 2.88e-05, |
| "loss": 3.6528, |
| "step": 2400 |
| }, |
| { |
| "epoch": 7.496251874062969, |
| "grad_norm": 12.035221099853516, |
| "learning_rate": 3e-05, |
| "loss": 3.4446, |
| "step": 2500 |
| }, |
| { |
| "epoch": 7.796101949025488, |
| "grad_norm": 10.558347702026367, |
| "learning_rate": 2.9453750910415148e-05, |
| "loss": 3.5438, |
| "step": 2600 |
| }, |
| { |
| "epoch": 8.095952023988007, |
| "grad_norm": 9.570464134216309, |
| "learning_rate": 2.8907501820830298e-05, |
| "loss": 3.4205, |
| "step": 2700 |
| }, |
| { |
| "epoch": 8.395802098950524, |
| "grad_norm": 8.80653190612793, |
| "learning_rate": 2.836125273124545e-05, |
| "loss": 3.0806, |
| "step": 2800 |
| }, |
| { |
| "epoch": 8.695652173913043, |
| "grad_norm": 11.195171356201172, |
| "learning_rate": 2.78150036416606e-05, |
| "loss": 3.091, |
| "step": 2900 |
| }, |
| { |
| "epoch": 8.995502248875562, |
| "grad_norm": 11.424571990966797, |
| "learning_rate": 2.7268754552075746e-05, |
| "loss": 3.1087, |
| "step": 3000 |
| }, |
| { |
| "epoch": 9.295352323838081, |
| "grad_norm": 10.09681224822998, |
| "learning_rate": 2.6722505462490896e-05, |
| "loss": 2.6535, |
| "step": 3100 |
| }, |
| { |
| "epoch": 9.5952023988006, |
| "grad_norm": 10.604998588562012, |
| "learning_rate": 2.6176256372906047e-05, |
| "loss": 2.6928, |
| "step": 3200 |
| }, |
| { |
| "epoch": 9.89505247376312, |
| "grad_norm": 11.570308685302734, |
| "learning_rate": 2.5630007283321194e-05, |
| "loss": 2.6886, |
| "step": 3300 |
| }, |
| { |
| "epoch": 10.194902548725636, |
| "grad_norm": 14.508749961853027, |
| "learning_rate": 2.5083758193736344e-05, |
| "loss": 2.3882, |
| "step": 3400 |
| }, |
| { |
| "epoch": 10.494752623688155, |
| "grad_norm": 8.163262367248535, |
| "learning_rate": 2.453750910415149e-05, |
| "loss": 2.3441, |
| "step": 3500 |
| }, |
| { |
| "epoch": 10.794602698650674, |
| "grad_norm": 9.08784294128418, |
| "learning_rate": 2.3991260014566645e-05, |
| "loss": 2.3557, |
| "step": 3600 |
| }, |
| { |
| "epoch": 11.094452773613193, |
| "grad_norm": 10.316121101379395, |
| "learning_rate": 2.3445010924981792e-05, |
| "loss": 2.2552, |
| "step": 3700 |
| }, |
| { |
| "epoch": 11.394302848575713, |
| "grad_norm": 9.336089134216309, |
| "learning_rate": 2.2898761835396942e-05, |
| "loss": 2.0136, |
| "step": 3800 |
| }, |
| { |
| "epoch": 11.694152923538232, |
| "grad_norm": 12.008231163024902, |
| "learning_rate": 2.235251274581209e-05, |
| "loss": 2.041, |
| "step": 3900 |
| }, |
| { |
| "epoch": 11.994002998500749, |
| "grad_norm": 10.343401908874512, |
| "learning_rate": 2.180626365622724e-05, |
| "loss": 2.0542, |
| "step": 4000 |
| }, |
| { |
| "epoch": 12.293853073463268, |
| "grad_norm": 10.85839557647705, |
| "learning_rate": 2.126001456664239e-05, |
| "loss": 1.7134, |
| "step": 4100 |
| }, |
| { |
| "epoch": 12.593703148425787, |
| "grad_norm": 9.820749282836914, |
| "learning_rate": 2.071922796795339e-05, |
| "loss": 1.7959, |
| "step": 4200 |
| }, |
| { |
| "epoch": 12.893553223388306, |
| "grad_norm": 10.130011558532715, |
| "learning_rate": 2.0172978878368536e-05, |
| "loss": 1.827, |
| "step": 4300 |
| }, |
| { |
| "epoch": 13.193403298350825, |
| "grad_norm": 10.762105941772461, |
| "learning_rate": 1.9626729788783686e-05, |
| "loss": 1.6057, |
| "step": 4400 |
| }, |
| { |
| "epoch": 13.493253373313344, |
| "grad_norm": 10.962275505065918, |
| "learning_rate": 1.9080480699198837e-05, |
| "loss": 1.5081, |
| "step": 4500 |
| }, |
| { |
| "epoch": 13.793103448275861, |
| "grad_norm": 9.258560180664062, |
| "learning_rate": 1.8534231609613984e-05, |
| "loss": 1.5604, |
| "step": 4600 |
| }, |
| { |
| "epoch": 14.09295352323838, |
| "grad_norm": 9.558040618896484, |
| "learning_rate": 1.7987982520029134e-05, |
| "loss": 1.5125, |
| "step": 4700 |
| }, |
| { |
| "epoch": 14.3928035982009, |
| "grad_norm": 9.487316131591797, |
| "learning_rate": 1.744173343044428e-05, |
| "loss": 1.3777, |
| "step": 4800 |
| }, |
| { |
| "epoch": 14.692653673163418, |
| "grad_norm": 9.571447372436523, |
| "learning_rate": 1.6895484340859435e-05, |
| "loss": 1.3723, |
| "step": 4900 |
| }, |
| { |
| "epoch": 14.992503748125937, |
| "grad_norm": 9.926942825317383, |
| "learning_rate": 1.6349235251274582e-05, |
| "loss": 1.3635, |
| "step": 5000 |
| }, |
| { |
| "epoch": 14.992503748125937, |
| "eval_bleu": 7.7564, |
| "eval_gen_len": 59.1124, |
| "eval_loss": 2.4898841381073, |
| "eval_runtime": 358.0616, |
| "eval_samples_per_second": 1.243, |
| "eval_steps_per_second": 0.209, |
| "step": 5000 |
| }, |
| { |
| "epoch": 15.292353823088456, |
| "grad_norm": 10.60864543914795, |
| "learning_rate": 1.5802986161689732e-05, |
| "loss": 1.1936, |
| "step": 5100 |
| }, |
| { |
| "epoch": 15.592203898050975, |
| "grad_norm": 9.889521598815918, |
| "learning_rate": 1.525673707210488e-05, |
| "loss": 1.1642, |
| "step": 5200 |
| }, |
| { |
| "epoch": 15.892053973013493, |
| "grad_norm": 9.8224515914917, |
| "learning_rate": 1.471048798252003e-05, |
| "loss": 1.2187, |
| "step": 5300 |
| }, |
| { |
| "epoch": 16.191904047976013, |
| "grad_norm": 7.935319900512695, |
| "learning_rate": 1.416423889293518e-05, |
| "loss": 1.1355, |
| "step": 5400 |
| }, |
| { |
| "epoch": 16.49175412293853, |
| "grad_norm": 11.877537727355957, |
| "learning_rate": 1.3617989803350329e-05, |
| "loss": 0.9916, |
| "step": 5500 |
| }, |
| { |
| "epoch": 16.791604197901048, |
| "grad_norm": 8.424215316772461, |
| "learning_rate": 1.3071740713765478e-05, |
| "loss": 1.035, |
| "step": 5600 |
| }, |
| { |
| "epoch": 17.09145427286357, |
| "grad_norm": 13.638501167297363, |
| "learning_rate": 1.2525491624180626e-05, |
| "loss": 1.0131, |
| "step": 5700 |
| }, |
| { |
| "epoch": 17.391304347826086, |
| "grad_norm": 8.009260177612305, |
| "learning_rate": 1.1979242534595777e-05, |
| "loss": 0.8849, |
| "step": 5800 |
| }, |
| { |
| "epoch": 17.691154422788607, |
| "grad_norm": 8.497699737548828, |
| "learning_rate": 1.1432993445010925e-05, |
| "loss": 0.9022, |
| "step": 5900 |
| }, |
| { |
| "epoch": 17.991004497751124, |
| "grad_norm": 9.464252471923828, |
| "learning_rate": 1.0886744355426076e-05, |
| "loss": 0.9143, |
| "step": 6000 |
| }, |
| { |
| "epoch": 18.290854572713645, |
| "grad_norm": 8.998556137084961, |
| "learning_rate": 1.0340495265841224e-05, |
| "loss": 0.7642, |
| "step": 6100 |
| }, |
| { |
| "epoch": 18.590704647676162, |
| "grad_norm": 7.708502769470215, |
| "learning_rate": 9.794246176256375e-06, |
| "loss": 0.818, |
| "step": 6200 |
| }, |
| { |
| "epoch": 18.89055472263868, |
| "grad_norm": 10.839265823364258, |
| "learning_rate": 9.25345957756737e-06, |
| "loss": 0.8269, |
| "step": 6300 |
| }, |
| { |
| "epoch": 19.1904047976012, |
| "grad_norm": 7.9040117263793945, |
| "learning_rate": 8.70721048798252e-06, |
| "loss": 0.7836, |
| "step": 6400 |
| }, |
| { |
| "epoch": 19.490254872563717, |
| "grad_norm": 9.677469253540039, |
| "learning_rate": 8.16096139839767e-06, |
| "loss": 0.7262, |
| "step": 6500 |
| }, |
| { |
| "epoch": 19.79010494752624, |
| "grad_norm": 12.025920867919922, |
| "learning_rate": 7.61471230881282e-06, |
| "loss": 0.7191, |
| "step": 6600 |
| }, |
| { |
| "epoch": 20.089955022488756, |
| "grad_norm": 7.995722770690918, |
| "learning_rate": 7.068463219227968e-06, |
| "loss": 0.7145, |
| "step": 6700 |
| }, |
| { |
| "epoch": 20.389805097451273, |
| "grad_norm": 7.845085144042969, |
| "learning_rate": 6.522214129643117e-06, |
| "loss": 0.6646, |
| "step": 6800 |
| }, |
| { |
| "epoch": 20.689655172413794, |
| "grad_norm": 10.144831657409668, |
| "learning_rate": 5.975965040058267e-06, |
| "loss": 0.6903, |
| "step": 6900 |
| }, |
| { |
| "epoch": 20.98950524737631, |
| "grad_norm": 8.53653335571289, |
| "learning_rate": 5.4297159504734155e-06, |
| "loss": 0.674, |
| "step": 7000 |
| }, |
| { |
| "epoch": 21.28935532233883, |
| "grad_norm": 6.819238185882568, |
| "learning_rate": 4.883466860888565e-06, |
| "loss": 0.6452, |
| "step": 7100 |
| }, |
| { |
| "epoch": 21.58920539730135, |
| "grad_norm": 5.426570415496826, |
| "learning_rate": 4.337217771303715e-06, |
| "loss": 0.6358, |
| "step": 7200 |
| }, |
| { |
| "epoch": 21.88905547226387, |
| "grad_norm": 7.385246753692627, |
| "learning_rate": 3.7909686817188637e-06, |
| "loss": 0.5839, |
| "step": 7300 |
| }, |
| { |
| "epoch": 22.188905547226387, |
| "grad_norm": 8.341054916381836, |
| "learning_rate": 3.2447195921340133e-06, |
| "loss": 0.5601, |
| "step": 7400 |
| }, |
| { |
| "epoch": 22.488755622188904, |
| "grad_norm": 7.411434650421143, |
| "learning_rate": 2.6984705025491624e-06, |
| "loss": 0.5808, |
| "step": 7500 |
| }, |
| { |
| "epoch": 22.788605697151425, |
| "grad_norm": 6.588235855102539, |
| "learning_rate": 2.1522214129643115e-06, |
| "loss": 0.584, |
| "step": 7600 |
| }, |
| { |
| "epoch": 23.088455772113942, |
| "grad_norm": 7.494542598724365, |
| "learning_rate": 1.605972323379461e-06, |
| "loss": 0.5442, |
| "step": 7700 |
| }, |
| { |
| "epoch": 23.388305847076463, |
| "grad_norm": 8.254023551940918, |
| "learning_rate": 1.0597232337946102e-06, |
| "loss": 0.5392, |
| "step": 7800 |
| }, |
| { |
| "epoch": 23.68815592203898, |
| "grad_norm": 6.483549118041992, |
| "learning_rate": 5.134741442097597e-07, |
| "loss": 0.5271, |
| "step": 7900 |
| }, |
| { |
| "epoch": 23.964017991004496, |
| "step": 7992, |
| "total_flos": 1.716600802443264e+16, |
| "train_loss": 2.9129727500098364, |
| "train_runtime": 6318.3948, |
| "train_samples_per_second": 15.201, |
| "train_steps_per_second": 1.265 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 7992, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 24, |
| "save_steps": 5000, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 3, |
| "early_stopping_threshold": 0.0 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 0 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.716600802443264e+16, |
| "train_batch_size": 6, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|