{
  "best_metric": 0.7297702431678772,
  "best_model_checkpoint": "./results/phi2-mentalchat16k/checkpoint-2000",
  "epoch": 3.590127150336574,
  "eval_steps": 100,
  "global_step": 2400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0014958863126402393,
      "grad_norm": 0.3071393668651581,
      "learning_rate": 2.469135802469136e-06,
      "loss": 1.4302,
      "step": 1
    },
    {
      "epoch": 0.07479431563201197,
      "grad_norm": 0.5085623860359192,
      "learning_rate": 0.0001234567901234568,
      "loss": 1.2645,
      "step": 50
    },
    {
      "epoch": 0.14958863126402394,
      "grad_norm": 0.35505935549736023,
      "learning_rate": 0.00019997346476623897,
      "loss": 0.8915,
      "step": 100
    },
    {
      "epoch": 0.14958863126402394,
      "eval_loss": 0.8518401384353638,
      "eval_runtime": 3732.5936,
      "eval_samples_per_second": 0.126,
      "eval_steps_per_second": 0.032,
      "step": 100
    },
    {
      "epoch": 0.2243829468960359,
      "grad_norm": 0.33412566781044006,
      "learning_rate": 0.00019965023224267117,
      "loss": 0.8522,
      "step": 150
    },
    {
      "epoch": 0.2991772625280479,
      "grad_norm": 0.2724510431289673,
      "learning_rate": 0.0001989608571521631,
      "loss": 0.8464,
      "step": 200
    },
    {
      "epoch": 0.2991772625280479,
      "eval_loss": 0.8101955056190491,
      "eval_runtime": 38.8184,
      "eval_samples_per_second": 12.159,
      "eval_steps_per_second": 3.04,
      "step": 200
    },
    {
      "epoch": 0.3739715781600598,
      "grad_norm": 0.2906951308250427,
      "learning_rate": 0.00019790787244982738,
      "loss": 0.8286,
      "step": 250
    },
    {
      "epoch": 0.4487658937920718,
      "grad_norm": 0.2903044819831848,
      "learning_rate": 0.00019649514709324878,
      "loss": 0.7874,
      "step": 300
    },
    {
      "epoch": 0.4487658937920718,
      "eval_loss": 0.7927271127700806,
      "eval_runtime": 38.811,
      "eval_samples_per_second": 12.162,
      "eval_steps_per_second": 3.04,
      "step": 300
    },
    {
      "epoch": 0.5235602094240838,
      "grad_norm": 0.2807984948158264,
      "learning_rate": 0.0001947278718268621,
      "loss": 0.8088,
      "step": 350
    },
    {
      "epoch": 0.5983545250560958,
      "grad_norm": 0.2716442942619324,
      "learning_rate": 0.00019261254010971866,
      "loss": 0.7999,
      "step": 400
    },
    {
      "epoch": 0.5983545250560958,
      "eval_loss": 0.7814475297927856,
      "eval_runtime": 38.7939,
      "eval_samples_per_second": 12.167,
      "eval_steps_per_second": 3.042,
      "step": 400
    },
    {
      "epoch": 0.6731488406881077,
      "grad_norm": 0.2673608362674713,
      "learning_rate": 0.00019015692425671844,
      "loss": 0.784,
      "step": 450
    },
    {
      "epoch": 0.7479431563201197,
      "grad_norm": 0.2807273268699646,
      "learning_rate": 0.00018737004688097132,
      "loss": 0.7901,
      "step": 500
    },
    {
      "epoch": 0.7479431563201197,
      "eval_loss": 0.7731401920318604,
      "eval_runtime": 38.8073,
      "eval_samples_per_second": 12.163,
      "eval_steps_per_second": 3.041,
      "step": 500
    },
    {
      "epoch": 0.8227374719521316,
      "grad_norm": 0.26429691910743713,
      "learning_rate": 0.00018426214774221636,
      "loss": 0.7912,
      "step": 550
    },
    {
      "epoch": 0.8975317875841436,
      "grad_norm": 0.2670207917690277,
      "learning_rate": 0.00018084464612310694,
      "loss": 0.7801,
      "step": 600
    },
    {
      "epoch": 0.8975317875841436,
      "eval_loss": 0.764725923538208,
      "eval_runtime": 38.8024,
      "eval_samples_per_second": 12.164,
      "eval_steps_per_second": 3.041,
      "step": 600
    },
    {
      "epoch": 0.9723261032161555,
      "grad_norm": 0.27276232838630676,
      "learning_rate": 0.00017713009887160194,
      "loss": 0.7833,
      "step": 650
    },
    {
      "epoch": 1.0471204188481675,
      "grad_norm": 0.30611422657966614,
      "learning_rate": 0.00017313215426362693,
      "loss": 0.7585,
      "step": 700
    },
    {
      "epoch": 1.0471204188481675,
      "eval_loss": 0.7639469504356384,
      "eval_runtime": 38.8221,
      "eval_samples_per_second": 12.158,
      "eval_steps_per_second": 3.04,
      "step": 700
    },
    {
      "epoch": 1.1219147344801794,
      "grad_norm": 0.29138803482055664,
      "learning_rate": 0.00016886550185552613,
      "loss": 0.757,
      "step": 750
    },
    {
      "epoch": 1.1967090501121915,
      "grad_norm": 0.33125239610671997,
      "learning_rate": 0.00016434581851056202,
      "loss": 0.7592,
      "step": 800
    },
    {
      "epoch": 1.1967090501121915,
      "eval_loss": 0.7576190233230591,
      "eval_runtime": 38.8188,
      "eval_samples_per_second": 12.159,
      "eval_steps_per_second": 3.04,
      "step": 800
    },
    {
      "epoch": 1.2715033657442034,
      "grad_norm": 0.2843407094478607,
      "learning_rate": 0.00015958971079777556,
      "loss": 0.7654,
      "step": 850
    },
    {
      "epoch": 1.3462976813762153,
      "grad_norm": 0.27510127425193787,
      "learning_rate": 0.00015461465397484964,
      "loss": 0.7431,
      "step": 900
    },
    {
      "epoch": 1.3462976813762153,
      "eval_loss": 0.754165768623352,
      "eval_runtime": 38.8204,
      "eval_samples_per_second": 12.159,
      "eval_steps_per_second": 3.04,
      "step": 900
    },
    {
      "epoch": 1.4210919970082274,
      "grad_norm": 0.3000487983226776,
      "learning_rate": 0.00014943892777916998,
      "loss": 0.734,
      "step": 950
    },
    {
      "epoch": 1.4958863126402393,
      "grad_norm": 0.2898794710636139,
      "learning_rate": 0.00014408154926300447,
      "loss": 0.7555,
      "step": 1000
    },
    {
      "epoch": 1.4958863126402393,
      "eval_loss": 0.7501235008239746,
      "eval_runtime": 38.8206,
      "eval_samples_per_second": 12.158,
      "eval_steps_per_second": 3.04,
      "step": 1000
    },
    {
      "epoch": 1.5706806282722514,
      "grad_norm": 0.2648974359035492,
      "learning_rate": 0.00013856220291958335,
      "loss": 0.7524,
      "step": 1050
    },
    {
      "epoch": 1.6454749439042633,
      "grad_norm": 0.3109261095523834,
      "learning_rate": 0.0001329011683568166,
      "loss": 0.7354,
      "step": 1100
    },
    {
      "epoch": 1.6454749439042633,
      "eval_loss": 0.745599627494812,
      "eval_runtime": 38.804,
      "eval_samples_per_second": 12.164,
      "eval_steps_per_second": 3.041,
      "step": 1100
    },
    {
      "epoch": 1.7202692595362752,
      "grad_norm": 0.28738629817962646,
      "learning_rate": 0.00012711924578439465,
      "loss": 0.7424,
      "step": 1150
    },
    {
      "epoch": 1.795063575168287,
      "grad_norm": 0.2660214602947235,
      "learning_rate": 0.00012123767958805418,
      "loss": 0.7281,
      "step": 1200
    },
    {
      "epoch": 1.795063575168287,
      "eval_loss": 0.7421949505805969,
      "eval_runtime": 38.8011,
      "eval_samples_per_second": 12.165,
      "eval_steps_per_second": 3.041,
      "step": 1200
    },
    {
      "epoch": 1.8698578908002992,
      "grad_norm": 0.33451542258262634,
      "learning_rate": 0.00011527808027181803,
      "loss": 0.7434,
      "step": 1250
    },
    {
      "epoch": 1.9446522064323113,
      "grad_norm": 0.28617632389068604,
      "learning_rate": 0.00010926234505501502,
      "loss": 0.7312,
      "step": 1300
    },
    {
      "epoch": 1.9446522064323113,
      "eval_loss": 0.7395272850990295,
      "eval_runtime": 38.813,
      "eval_samples_per_second": 12.161,
      "eval_steps_per_second": 3.04,
      "step": 1300
    },
    {
      "epoch": 2.019446522064323,
      "grad_norm": 0.3253704011440277,
      "learning_rate": 0.00010321257741582816,
      "loss": 0.7145,
      "step": 1350
    },
    {
      "epoch": 2.094240837696335,
      "grad_norm": 0.28912901878356934,
      "learning_rate": 9.715100587699098e-05,
      "loss": 0.6984,
      "step": 1400
    },
    {
      "epoch": 2.094240837696335,
      "eval_loss": 0.7388814687728882,
      "eval_runtime": 38.7905,
      "eval_samples_per_second": 12.168,
      "eval_steps_per_second": 3.042,
      "step": 1400
    },
    {
      "epoch": 2.169035153328347,
      "grad_norm": 0.31665706634521484,
      "learning_rate": 9.10999023320352e-05,
      "loss": 0.6954,
      "step": 1450
    },
    {
      "epoch": 2.243829468960359,
      "grad_norm": 0.31037911772727966,
      "learning_rate": 8.508150021218224e-05,
      "loss": 0.7037,
      "step": 1500
    },
    {
      "epoch": 2.243829468960359,
      "eval_loss": 0.7381539344787598,
      "eval_runtime": 38.7987,
      "eval_samples_per_second": 12.165,
      "eval_steps_per_second": 3.041,
      "step": 1500
    },
    {
      "epoch": 2.318623784592371,
      "grad_norm": 0.3227658271789551,
      "learning_rate": 7.911791279455607e-05,
      "loss": 0.7047,
      "step": 1550
    },
    {
      "epoch": 2.393418100224383,
      "grad_norm": 0.3283180892467499,
      "learning_rate": 7.323105195187506e-05,
      "loss": 0.6913,
      "step": 1600
    },
    {
      "epoch": 2.393418100224383,
      "eval_loss": 0.7356892228126526,
      "eval_runtime": 38.8154,
      "eval_samples_per_second": 12.16,
      "eval_steps_per_second": 3.04,
      "step": 1600
    },
    {
      "epoch": 2.468212415856395,
      "grad_norm": 0.30508577823638916,
      "learning_rate": 6.744254764215987e-05,
      "loss": 0.7206,
      "step": 1650
    },
    {
      "epoch": 2.543006731488407,
      "grad_norm": 0.3187327980995178,
      "learning_rate": 6.177366843427392e-05,
      "loss": 0.7229,
      "step": 1700
    },
    {
      "epoch": 2.543006731488407,
      "eval_loss": 0.7340988516807556,
      "eval_runtime": 38.799,
      "eval_samples_per_second": 12.165,
      "eval_steps_per_second": 3.041,
      "step": 1700
    },
    {
      "epoch": 2.6178010471204187,
      "grad_norm": 0.3277007043361664,
      "learning_rate": 5.624524336130754e-05,
      "loss": 0.6965,
      "step": 1750
    },
    {
      "epoch": 2.6925953627524306,
      "grad_norm": 0.3015810251235962,
      "learning_rate": 5.087758538893881e-05,
      "loss": 0.7095,
      "step": 1800
    },
    {
      "epoch": 2.6925953627524306,
      "eval_loss": 0.7326057553291321,
      "eval_runtime": 38.7888,
      "eval_samples_per_second": 12.168,
      "eval_steps_per_second": 3.042,
      "step": 1800
    },
    {
      "epoch": 2.767389678384443,
      "grad_norm": 0.30385613441467285,
      "learning_rate": 4.569041677996858e-05,
      "loss": 0.7131,
      "step": 1850
    },
    {
      "epoch": 2.842183994016455,
      "grad_norm": 0.344064325094223,
      "learning_rate": 4.0702796629261964e-05,
      "loss": 0.6994,
      "step": 1900
    },
    {
      "epoch": 2.842183994016455,
      "eval_loss": 0.7319425344467163,
      "eval_runtime": 38.8045,
      "eval_samples_per_second": 12.164,
      "eval_steps_per_second": 3.041,
      "step": 1900
    },
    {
      "epoch": 2.9169783096484667,
      "grad_norm": 0.3289170563220978,
      "learning_rate": 3.593305083535229e-05,
      "loss": 0.7064,
      "step": 1950
    },
    {
      "epoch": 2.9917726252804786,
      "grad_norm": 0.32823771238327026,
      "learning_rate": 3.139870476601171e-05,
      "loss": 0.6995,
      "step": 2000
    },
    {
      "epoch": 2.9917726252804786,
      "eval_loss": 0.7297702431678772,
      "eval_runtime": 38.8084,
      "eval_samples_per_second": 12.162,
      "eval_steps_per_second": 3.041,
      "step": 2000
    },
    {
      "epoch": 3.0665669409124905,
      "grad_norm": 0.31735172867774963,
      "learning_rate": 2.7116418865193638e-05,
      "loss": 0.6928,
      "step": 2050
    },
    {
      "epoch": 3.141361256544503,
      "grad_norm": 0.36334428191185,
      "learning_rate": 2.310192743794496e-05,
      "loss": 0.6887,
      "step": 2100
    },
    {
      "epoch": 3.141361256544503,
      "eval_loss": 0.7313625812530518,
      "eval_runtime": 38.8213,
      "eval_samples_per_second": 12.158,
      "eval_steps_per_second": 3.04,
      "step": 2100
    },
    {
      "epoch": 3.2161555721765147,
      "grad_norm": 0.3707718849182129,
      "learning_rate": 1.9369980838209156e-05,
      "loss": 0.6852,
      "step": 2150
    },
    {
      "epoch": 3.2909498878085266,
      "grad_norm": 0.28901323676109314,
      "learning_rate": 1.5934291271938596e-05,
      "loss": 0.6712,
      "step": 2200
    },
    {
      "epoch": 3.2909498878085266,
      "eval_loss": 0.7308326363563538,
      "eval_runtime": 154.853,
      "eval_samples_per_second": 3.048,
      "eval_steps_per_second": 0.762,
      "step": 2200
    },
    {
      "epoch": 3.3657442034405385,
      "grad_norm": 0.3087999224662781,
      "learning_rate": 1.2807482414650063e-05,
      "loss": 0.6879,
      "step": 2250
    },
    {
      "epoch": 3.4405385190725504,
      "grad_norm": 0.3264073431491852,
      "learning_rate": 1.0001043028542834e-05,
      "loss": 0.6867,
      "step": 2300
    },
    {
      "epoch": 3.4405385190725504,
      "eval_loss": 0.7300211787223816,
      "eval_runtime": 38.8364,
      "eval_samples_per_second": 12.154,
      "eval_steps_per_second": 3.038,
      "step": 2300
    },
    {
      "epoch": 3.5153328347045623,
      "grad_norm": 0.35010454058647156,
      "learning_rate": 7.5252847496027565e-06,
      "loss": 0.6721,
      "step": 2350
    },
    {
      "epoch": 3.590127150336574,
      "grad_norm": 0.31919124722480774,
      "learning_rate": 5.389304199794209e-06,
      "loss": 0.6817,
      "step": 2400
    },
    {
      "epoch": 3.590127150336574,
      "eval_loss": 0.7298696637153625,
      "eval_runtime": 38.8245,
      "eval_samples_per_second": 12.157,
      "eval_steps_per_second": 3.039,
      "step": 2400
    }
  ],
  "logging_steps": 50,
  "max_steps": 2672,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 200,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 5,
        "early_stopping_threshold": 0.001
      },
      "attributes": {
        "early_stopping_patience_counter": 4
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.464313302280704e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}