{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9908925318761383,
  "eval_steps": 500,
  "global_step": 274,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.007285974499089253,
      "grad_norm": 2.1298219418311506,
      "learning_rate": 3.5714285714285716e-07,
      "loss": 2.1697,
      "step": 1
    },
    {
      "epoch": 0.03642987249544627,
      "grad_norm": 1.5904593931711113,
      "learning_rate": 1.7857142857142859e-06,
      "loss": 2.0772,
      "step": 5
    },
    {
      "epoch": 0.07285974499089254,
      "grad_norm": 0.9900223045213828,
      "learning_rate": 3.5714285714285718e-06,
      "loss": 1.7424,
      "step": 10
    },
    {
      "epoch": 0.1092896174863388,
      "grad_norm": 0.3236620060106758,
      "learning_rate": 5.357142857142857e-06,
      "loss": 0.9981,
      "step": 15
    },
    {
      "epoch": 0.14571948998178508,
      "grad_norm": 0.1835982348721837,
      "learning_rate": 7.1428571428571436e-06,
      "loss": 0.783,
      "step": 20
    },
    {
      "epoch": 0.18214936247723132,
      "grad_norm": 0.13846392947417183,
      "learning_rate": 8.92857142857143e-06,
      "loss": 0.6361,
      "step": 25
    },
    {
      "epoch": 0.2185792349726776,
      "grad_norm": 0.131685675809341,
      "learning_rate": 9.998369180404283e-06,
      "loss": 0.6016,
      "step": 30
    },
    {
      "epoch": 0.2550091074681239,
      "grad_norm": 0.11211293437608845,
      "learning_rate": 9.980034675172274e-06,
      "loss": 0.5256,
      "step": 35
    },
    {
      "epoch": 0.29143897996357016,
      "grad_norm": 0.1256840427418023,
      "learning_rate": 9.941402118901743e-06,
      "loss": 0.512,
      "step": 40
    },
    {
      "epoch": 0.32786885245901637,
      "grad_norm": 0.08619323519274144,
      "learning_rate": 9.882628973467972e-06,
      "loss": 0.4838,
      "step": 45
    },
    {
      "epoch": 0.36429872495446264,
      "grad_norm": 0.10060824077061277,
      "learning_rate": 9.803954791481239e-06,
      "loss": 0.4745,
      "step": 50
    },
    {
      "epoch": 0.4007285974499089,
      "grad_norm": 0.08221062885139536,
      "learning_rate": 9.705700239897809e-06,
      "loss": 0.4216,
      "step": 55
    },
    {
      "epoch": 0.4371584699453552,
      "grad_norm": 0.08645648156860204,
      "learning_rate": 9.588265793018141e-06,
      "loss": 0.3991,
      "step": 60
    },
    {
      "epoch": 0.47358834244080145,
      "grad_norm": 0.06648083726688385,
      "learning_rate": 9.452130100199504e-06,
      "loss": 0.4144,
      "step": 65
    },
    {
      "epoch": 0.5100182149362478,
      "grad_norm": 0.09770009593503494,
      "learning_rate": 9.297848034936007e-06,
      "loss": 0.3942,
      "step": 70
    },
    {
      "epoch": 0.546448087431694,
      "grad_norm": 0.0671484746212119,
      "learning_rate": 9.12604843325778e-06,
      "loss": 0.42,
      "step": 75
    },
    {
      "epoch": 0.5828779599271403,
      "grad_norm": 0.0603539555693471,
      "learning_rate": 8.937431530667329e-06,
      "loss": 0.4106,
      "step": 80
    },
    {
      "epoch": 0.6193078324225865,
      "grad_norm": 0.06542285449991517,
      "learning_rate": 8.732766108059814e-06,
      "loss": 0.4068,
      "step": 85
    },
    {
      "epoch": 0.6557377049180327,
      "grad_norm": 0.05318309132873638,
      "learning_rate": 8.512886358260162e-06,
      "loss": 0.4024,
      "step": 90
    },
    {
      "epoch": 0.692167577413479,
      "grad_norm": 0.04710411361135824,
      "learning_rate": 8.278688485948634e-06,
      "loss": 0.3873,
      "step": 95
    },
    {
      "epoch": 0.7285974499089253,
      "grad_norm": 0.08147624187638997,
      "learning_rate": 8.031127054833192e-06,
      "loss": 0.3897,
      "step": 100
    },
    {
      "epoch": 0.7650273224043715,
      "grad_norm": 0.062085885052453355,
      "learning_rate": 7.771211096957125e-06,
      "loss": 0.3602,
      "step": 105
    },
    {
      "epoch": 0.8014571948998178,
      "grad_norm": 0.04741226219798684,
      "learning_rate": 7.500000000000001e-06,
      "loss": 0.3615,
      "step": 110
    },
    {
      "epoch": 0.8378870673952641,
      "grad_norm": 0.052623392395663965,
      "learning_rate": 7.218599189334799e-06,
      "loss": 0.4025,
      "step": 115
    },
    {
      "epoch": 0.8743169398907104,
      "grad_norm": 0.05506759208754912,
      "learning_rate": 6.92815562244068e-06,
      "loss": 0.4107,
      "step": 120
    },
    {
      "epoch": 0.9107468123861566,
      "grad_norm": 0.06450278629900923,
      "learning_rate": 6.629853114035643e-06,
      "loss": 0.4019,
      "step": 125
    },
    {
      "epoch": 0.9471766848816029,
      "grad_norm": 0.061387693531274025,
      "learning_rate": 6.32490751098331e-06,
      "loss": 0.38,
      "step": 130
    },
    {
      "epoch": 0.9836065573770492,
      "grad_norm": 0.049633792420609966,
      "learning_rate": 6.014561736640334e-06,
      "loss": 0.3686,
      "step": 135
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.3376566767692566,
      "eval_runtime": 4.9911,
      "eval_samples_per_second": 13.424,
      "eval_steps_per_second": 3.406,
      "step": 138
    },
    {
      "epoch": 1.0145719489981786,
      "grad_norm": 0.04294549801679507,
      "learning_rate": 5.7000807248431466e-06,
      "loss": 0.3648,
      "step": 140
    },
    {
      "epoch": 1.0510018214936248,
      "grad_norm": 0.055870041096427674,
      "learning_rate": 5.38274626418248e-06,
      "loss": 0.3584,
      "step": 145
    },
    {
      "epoch": 1.0874316939890711,
      "grad_norm": 0.050163331468631514,
      "learning_rate": 5.06385177357987e-06,
      "loss": 0.3755,
      "step": 150
    },
    {
      "epoch": 1.1238615664845173,
      "grad_norm": 0.05681787153693805,
      "learning_rate": 4.744697030460248e-06,
      "loss": 0.3752,
      "step": 155
    },
    {
      "epoch": 1.1602914389799635,
      "grad_norm": 0.04904612115661219,
      "learning_rate": 4.426582873007999e-06,
      "loss": 0.3113,
      "step": 160
    },
    {
      "epoch": 1.1967213114754098,
      "grad_norm": 0.059727388905107905,
      "learning_rate": 4.110805898099492e-06,
      "loss": 0.3419,
      "step": 165
    },
    {
      "epoch": 1.2331511839708562,
      "grad_norm": 0.052958928558906763,
      "learning_rate": 3.7986531765226965e-06,
      "loss": 0.3863,
      "step": 170
    },
    {
      "epoch": 1.2695810564663024,
      "grad_norm": 0.05007371301185279,
      "learning_rate": 3.4913970070240388e-06,
      "loss": 0.3645,
      "step": 175
    },
    {
      "epoch": 1.3060109289617485,
      "grad_norm": 0.04620417976687398,
      "learning_rate": 3.19028973056441e-06,
      "loss": 0.382,
      "step": 180
    },
    {
      "epoch": 1.342440801457195,
      "grad_norm": 0.054148147501344494,
      "learning_rate": 2.8965586259208295e-06,
      "loss": 0.3844,
      "step": 185
    },
    {
      "epoch": 1.3788706739526413,
      "grad_norm": 0.053468499689767635,
      "learning_rate": 2.611400907438685e-06,
      "loss": 0.3751,
      "step": 190
    },
    {
      "epoch": 1.4153005464480874,
      "grad_norm": 0.04625165040154575,
      "learning_rate": 2.3359788453231723e-06,
      "loss": 0.3438,
      "step": 195
    },
    {
      "epoch": 1.4517304189435336,
      "grad_norm": 0.051639778293774724,
      "learning_rate": 2.071415028359026e-06,
      "loss": 0.3676,
      "step": 200
    },
    {
      "epoch": 1.48816029143898,
      "grad_norm": 0.06074450594251965,
      "learning_rate": 1.8187877883672024e-06,
      "loss": 0.3478,
      "step": 205
    },
    {
      "epoch": 1.5245901639344264,
      "grad_norm": 0.046655956476477536,
      "learning_rate": 1.5791268050478487e-06,
      "loss": 0.3481,
      "step": 210
    },
    {
      "epoch": 1.5610200364298725,
      "grad_norm": 0.05558551651820435,
      "learning_rate": 1.3534089091237757e-06,
      "loss": 0.3564,
      "step": 215
    },
    {
      "epoch": 1.5974499089253187,
      "grad_norm": 0.05059129991621213,
      "learning_rate": 1.1425541008902852e-06,
      "loss": 0.37,
      "step": 220
    },
    {
      "epoch": 1.633879781420765,
      "grad_norm": 0.04834415844975134,
      "learning_rate": 9.474218003993275e-07,
      "loss": 0.3603,
      "step": 225
    },
    {
      "epoch": 1.6703096539162114,
      "grad_norm": 0.048945517314366804,
      "learning_rate": 7.6880734456178e-07,
      "loss": 0.3275,
      "step": 230
    },
    {
      "epoch": 1.7067395264116576,
      "grad_norm": 0.04895423242141336,
      "learning_rate": 6.074387454452891e-07,
      "loss": 0.3759,
      "step": 235
    },
    {
      "epoch": 1.7431693989071038,
      "grad_norm": 0.04715267859111255,
      "learning_rate": 4.639737229804403e-07,
      "loss": 0.3232,
      "step": 240
    },
    {
      "epoch": 1.7795992714025501,
      "grad_norm": 0.05976045666178133,
      "learning_rate": 3.3899702416965166e-07,
      "loss": 0.3491,
      "step": 245
    },
    {
      "epoch": 1.8160291438979965,
      "grad_norm": 0.04227868742024836,
      "learning_rate": 2.330180397253473e-07,
      "loss": 0.3548,
      "step": 250
    },
    {
      "epoch": 1.8524590163934427,
      "grad_norm": 0.04541947483086337,
      "learning_rate": 1.4646872785175182e-07,
      "loss": 0.3609,
      "step": 255
    },
    {
      "epoch": 1.8888888888888888,
      "grad_norm": 0.05573935065784949,
      "learning_rate": 7.970185363271432e-08,
      "loss": 0.3546,
      "step": 260
    },
    {
      "epoch": 1.925318761384335,
      "grad_norm": 0.047007609680595135,
      "learning_rate": 3.2989551201624836e-08,
      "loss": 0.368,
      "step": 265
    },
    {
      "epoch": 1.9617486338797814,
      "grad_norm": 0.04280029371219344,
      "learning_rate": 6.5222145538501595e-09,
      "loss": 0.35,
      "step": 270
    },
    {
      "epoch": 1.9908925318761383,
      "eval_loss": 0.32549938559532166,
      "eval_runtime": 3.7346,
      "eval_samples_per_second": 17.94,
      "eval_steps_per_second": 4.552,
      "step": 274
    },
    {
      "epoch": 1.9908925318761383,
      "step": 274,
      "total_flos": 7.092604275943014e+17,
      "train_loss": 0.4666876497059843,
      "train_runtime": 3355.0572,
      "train_samples_per_second": 3.926,
      "train_steps_per_second": 0.082
    }
  ],
  "logging_steps": 5,
  "max_steps": 274,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7.092604275943014e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}