{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.001000500250125,
  "eval_steps": 25,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05002501250625312,
      "grad_norm": 0.517996072769165,
      "learning_rate": 0.0001951951951951952,
      "loss": 1.677,
      "step": 25
    },
    {
      "epoch": 0.05002501250625312,
      "eval_loss": 1.3813503980636597,
      "eval_runtime": 148.0614,
      "eval_samples_per_second": 3.37,
      "eval_steps_per_second": 0.425,
      "step": 25
    },
    {
      "epoch": 0.10005002501250625,
      "grad_norm": 0.5020231604576111,
      "learning_rate": 0.0001901901901901902,
      "loss": 1.2016,
      "step": 50
    },
    {
      "epoch": 0.10005002501250625,
      "eval_loss": 1.347744107246399,
      "eval_runtime": 151.7258,
      "eval_samples_per_second": 3.289,
      "eval_steps_per_second": 0.415,
      "step": 50
    },
    {
      "epoch": 0.1500750375187594,
      "grad_norm": 0.3798060119152069,
      "learning_rate": 0.0001851851851851852,
      "loss": 1.4491,
      "step": 75
    },
    {
      "epoch": 0.1500750375187594,
      "eval_loss": 1.3210723400115967,
      "eval_runtime": 150.0032,
      "eval_samples_per_second": 3.327,
      "eval_steps_per_second": 0.42,
      "step": 75
    },
    {
      "epoch": 0.2001000500250125,
      "grad_norm": 0.3365944027900696,
      "learning_rate": 0.00018018018018018018,
      "loss": 1.2076,
      "step": 100
    },
    {
      "epoch": 0.2001000500250125,
      "eval_loss": 1.3334178924560547,
      "eval_runtime": 151.2551,
      "eval_samples_per_second": 3.299,
      "eval_steps_per_second": 0.417,
      "step": 100
    },
    {
      "epoch": 0.25012506253126565,
      "grad_norm": 0.22820694744586945,
      "learning_rate": 0.0001751751751751752,
      "loss": 1.4415,
      "step": 125
    },
    {
      "epoch": 0.25012506253126565,
      "eval_loss": 1.309592366218567,
      "eval_runtime": 149.299,
      "eval_samples_per_second": 3.342,
      "eval_steps_per_second": 0.422,
      "step": 125
    },
    {
      "epoch": 0.3001500750375188,
      "grad_norm": 0.3848935663700104,
      "learning_rate": 0.0001701701701701702,
      "loss": 1.139,
      "step": 150
    },
    {
      "epoch": 0.3001500750375188,
      "eval_loss": 1.3208202123641968,
      "eval_runtime": 149.5811,
      "eval_samples_per_second": 3.336,
      "eval_steps_per_second": 0.421,
      "step": 150
    },
    {
      "epoch": 0.3501750875437719,
      "grad_norm": 0.2774136960506439,
      "learning_rate": 0.00016516516516516518,
      "loss": 1.4055,
      "step": 175
    },
    {
      "epoch": 0.3501750875437719,
      "eval_loss": 1.3086917400360107,
      "eval_runtime": 150.1042,
      "eval_samples_per_second": 3.324,
      "eval_steps_per_second": 0.42,
      "step": 175
    },
    {
      "epoch": 0.400200100050025,
      "grad_norm": 0.32166117429733276,
      "learning_rate": 0.00016016016016016018,
      "loss": 1.1459,
      "step": 200
    },
    {
      "epoch": 0.400200100050025,
      "eval_loss": 1.306862473487854,
      "eval_runtime": 150.7168,
      "eval_samples_per_second": 3.311,
      "eval_steps_per_second": 0.418,
      "step": 200
    },
    {
      "epoch": 0.4502251125562781,
      "grad_norm": 0.23773141205310822,
      "learning_rate": 0.00015515515515515516,
      "loss": 1.4444,
      "step": 225
    },
    {
      "epoch": 0.4502251125562781,
      "eval_loss": 1.3020325899124146,
      "eval_runtime": 148.5364,
      "eval_samples_per_second": 3.359,
      "eval_steps_per_second": 0.424,
      "step": 225
    },
    {
      "epoch": 0.5002501250625313,
      "grad_norm": 0.37095341086387634,
      "learning_rate": 0.00015015015015015014,
      "loss": 1.2264,
      "step": 250
    },
    {
      "epoch": 0.5002501250625313,
      "eval_loss": 1.3001904487609863,
      "eval_runtime": 152.658,
      "eval_samples_per_second": 3.269,
      "eval_steps_per_second": 0.413,
      "step": 250
    },
    {
      "epoch": 0.5502751375687844,
      "grad_norm": 0.2519828677177429,
      "learning_rate": 0.00014514514514514515,
      "loss": 1.4605,
      "step": 275
    },
    {
      "epoch": 0.5502751375687844,
      "eval_loss": 1.299567699432373,
      "eval_runtime": 148.4653,
      "eval_samples_per_second": 3.361,
      "eval_steps_per_second": 0.424,
      "step": 275
    },
    {
      "epoch": 0.6003001500750376,
      "grad_norm": 0.3685779273509979,
      "learning_rate": 0.00014014014014014013,
      "loss": 1.1655,
      "step": 300
    },
    {
      "epoch": 0.6003001500750376,
      "eval_loss": 1.2988265752792358,
      "eval_runtime": 151.1788,
      "eval_samples_per_second": 3.301,
      "eval_steps_per_second": 0.417,
      "step": 300
    },
    {
      "epoch": 0.6503251625812907,
      "grad_norm": 0.26966241002082825,
      "learning_rate": 0.00013513513513513514,
      "loss": 1.4313,
      "step": 325
    },
    {
      "epoch": 0.6503251625812907,
      "eval_loss": 1.298296332359314,
      "eval_runtime": 152.0718,
      "eval_samples_per_second": 3.281,
      "eval_steps_per_second": 0.414,
      "step": 325
    },
    {
      "epoch": 0.7003501750875438,
      "grad_norm": 0.35637611150741577,
      "learning_rate": 0.00013013013013013014,
      "loss": 1.2002,
      "step": 350
    },
    {
      "epoch": 0.7003501750875438,
      "eval_loss": 1.2959158420562744,
      "eval_runtime": 151.1585,
      "eval_samples_per_second": 3.301,
      "eval_steps_per_second": 0.417,
      "step": 350
    },
    {
      "epoch": 0.7503751875937968,
      "grad_norm": 0.22513383626937866,
      "learning_rate": 0.00012512512512512512,
      "loss": 1.3994,
      "step": 375
    },
    {
      "epoch": 0.7503751875937968,
      "eval_loss": 1.2951635122299194,
      "eval_runtime": 148.5372,
      "eval_samples_per_second": 3.359,
      "eval_steps_per_second": 0.424,
      "step": 375
    },
    {
      "epoch": 0.80040020010005,
      "grad_norm": 0.35314086079597473,
      "learning_rate": 0.00012012012012012013,
      "loss": 1.1836,
      "step": 400
    },
    {
      "epoch": 0.80040020010005,
      "eval_loss": 1.294690728187561,
      "eval_runtime": 149.3769,
      "eval_samples_per_second": 3.341,
      "eval_steps_per_second": 0.422,
      "step": 400
    },
    {
      "epoch": 0.8504252126063031,
      "grad_norm": 0.240916907787323,
      "learning_rate": 0.00011511511511511512,
      "loss": 1.4378,
      "step": 425
    },
    {
      "epoch": 0.8504252126063031,
      "eval_loss": 1.2916043996810913,
      "eval_runtime": 152.0772,
      "eval_samples_per_second": 3.281,
      "eval_steps_per_second": 0.414,
      "step": 425
    },
    {
      "epoch": 0.9004502251125562,
      "grad_norm": 0.31087398529052734,
      "learning_rate": 0.00011011011011011012,
      "loss": 1.1989,
      "step": 450
    },
    {
      "epoch": 0.9004502251125562,
      "eval_loss": 1.2893831729888916,
      "eval_runtime": 150.4895,
      "eval_samples_per_second": 3.316,
      "eval_steps_per_second": 0.419,
      "step": 450
    },
    {
      "epoch": 0.9504752376188094,
      "grad_norm": 0.2413586527109146,
      "learning_rate": 0.00010510510510510511,
      "loss": 1.4508,
      "step": 475
    },
    {
      "epoch": 0.9504752376188094,
      "eval_loss": 1.2888984680175781,
      "eval_runtime": 151.1108,
      "eval_samples_per_second": 3.302,
      "eval_steps_per_second": 0.417,
      "step": 475
    },
    {
      "epoch": 1.0005002501250626,
      "grad_norm": 0.40069064497947693,
      "learning_rate": 0.00010010010010010012,
      "loss": 1.2076,
      "step": 500
    },
    {
      "epoch": 1.0005002501250626,
      "eval_loss": 1.2911962270736694,
      "eval_runtime": 148.6843,
      "eval_samples_per_second": 3.356,
      "eval_steps_per_second": 0.424,
      "step": 500
    },
    {
      "epoch": 1.0505252626313157,
      "grad_norm": 0.22050493955612183,
      "learning_rate": 9.50950950950951e-05,
      "loss": 1.3994,
      "step": 525
    },
    {
      "epoch": 1.0505252626313157,
      "eval_loss": 1.2921332120895386,
      "eval_runtime": 149.3015,
      "eval_samples_per_second": 3.342,
      "eval_steps_per_second": 0.422,
      "step": 525
    },
    {
      "epoch": 1.1005502751375689,
      "grad_norm": 0.3588818907737732,
      "learning_rate": 9.009009009009009e-05,
      "loss": 1.177,
      "step": 550
    },
    {
      "epoch": 1.1005502751375689,
      "eval_loss": 1.2903811931610107,
      "eval_runtime": 149.8093,
      "eval_samples_per_second": 3.331,
      "eval_steps_per_second": 0.421,
      "step": 550
    },
    {
      "epoch": 1.150575287643822,
      "grad_norm": 0.2672303020954132,
      "learning_rate": 8.50850850850851e-05,
      "loss": 1.4015,
      "step": 575
    },
    {
      "epoch": 1.150575287643822,
      "eval_loss": 1.2898900508880615,
      "eval_runtime": 149.8311,
      "eval_samples_per_second": 3.33,
      "eval_steps_per_second": 0.42,
      "step": 575
    },
    {
      "epoch": 1.2006003001500751,
      "grad_norm": 0.31220486760139465,
      "learning_rate": 8.008008008008009e-05,
      "loss": 1.192,
      "step": 600
    },
    {
      "epoch": 1.2006003001500751,
      "eval_loss": 1.288824439048767,
      "eval_runtime": 151.038,
      "eval_samples_per_second": 3.304,
      "eval_steps_per_second": 0.417,
      "step": 600
    },
    {
      "epoch": 1.2506253126563283,
      "grad_norm": 0.2526504695415497,
      "learning_rate": 7.507507507507507e-05,
      "loss": 1.3829,
      "step": 625
    },
    {
      "epoch": 1.2506253126563283,
      "eval_loss": 1.2878332138061523,
      "eval_runtime": 151.5015,
      "eval_samples_per_second": 3.294,
      "eval_steps_per_second": 0.416,
      "step": 625
    },
    {
      "epoch": 1.3006503251625814,
      "grad_norm": 0.28051283955574036,
      "learning_rate": 7.007007007007007e-05,
      "loss": 1.1514,
      "step": 650
    },
    {
      "epoch": 1.3006503251625814,
      "eval_loss": 1.2859280109405518,
      "eval_runtime": 150.4738,
      "eval_samples_per_second": 3.316,
      "eval_steps_per_second": 0.419,
      "step": 650
    },
    {
      "epoch": 1.3506753376688345,
      "grad_norm": 0.26419979333877563,
      "learning_rate": 6.506506506506507e-05,
      "loss": 1.4028,
      "step": 675
    },
    {
      "epoch": 1.3506753376688345,
      "eval_loss": 1.2848296165466309,
      "eval_runtime": 149.0963,
      "eval_samples_per_second": 3.347,
      "eval_steps_per_second": 0.423,
      "step": 675
    },
    {
      "epoch": 1.4007003501750876,
      "grad_norm": 0.3227976858615875,
      "learning_rate": 6.0060060060060066e-05,
      "loss": 1.1778,
      "step": 700
    },
    {
      "epoch": 1.4007003501750876,
      "eval_loss": 1.285400152206421,
      "eval_runtime": 149.1519,
      "eval_samples_per_second": 3.346,
      "eval_steps_per_second": 0.422,
      "step": 700
    },
    {
      "epoch": 1.4507253626813408,
      "grad_norm": 0.24903441965579987,
      "learning_rate": 5.505505505505506e-05,
      "loss": 1.4058,
      "step": 725
    },
    {
      "epoch": 1.4507253626813408,
      "eval_loss": 1.2824435234069824,
      "eval_runtime": 149.5232,
      "eval_samples_per_second": 3.337,
      "eval_steps_per_second": 0.421,
      "step": 725
    },
    {
      "epoch": 1.500750375187594,
      "grad_norm": 0.31187903881073,
      "learning_rate": 5.005005005005006e-05,
      "loss": 1.1698,
      "step": 750
    },
    {
      "epoch": 1.500750375187594,
      "eval_loss": 1.2831988334655762,
      "eval_runtime": 150.4227,
      "eval_samples_per_second": 3.317,
      "eval_steps_per_second": 0.419,
      "step": 750
    },
    {
      "epoch": 1.550775387693847,
      "grad_norm": 0.2889004051685333,
      "learning_rate": 4.5045045045045046e-05,
      "loss": 1.3516,
      "step": 775
    },
    {
      "epoch": 1.550775387693847,
      "eval_loss": 1.2823545932769775,
      "eval_runtime": 149.8614,
      "eval_samples_per_second": 3.33,
      "eval_steps_per_second": 0.42,
      "step": 775
    },
    {
      "epoch": 1.6008004002001002,
      "grad_norm": 0.37189939618110657,
      "learning_rate": 4.0040040040040046e-05,
      "loss": 1.1264,
      "step": 800
    },
    {
      "epoch": 1.6008004002001002,
      "eval_loss": 1.2828818559646606,
      "eval_runtime": 150.672,
      "eval_samples_per_second": 3.312,
      "eval_steps_per_second": 0.418,
      "step": 800
    },
    {
      "epoch": 1.6508254127063533,
      "grad_norm": 0.25290611386299133,
      "learning_rate": 3.503503503503503e-05,
      "loss": 1.4113,
      "step": 825
    },
    {
      "epoch": 1.6508254127063533,
      "eval_loss": 1.2822470664978027,
      "eval_runtime": 149.3988,
      "eval_samples_per_second": 3.34,
      "eval_steps_per_second": 0.422,
      "step": 825
    },
    {
      "epoch": 1.7008504252126064,
      "grad_norm": 0.3559873104095459,
      "learning_rate": 3.0030030030030033e-05,
      "loss": 1.1248,
      "step": 850
    },
    {
      "epoch": 1.7008504252126064,
      "eval_loss": 1.2828270196914673,
      "eval_runtime": 149.9897,
      "eval_samples_per_second": 3.327,
      "eval_steps_per_second": 0.42,
      "step": 850
    },
    {
      "epoch": 1.7508754377188596,
      "grad_norm": 0.3052867352962494,
      "learning_rate": 2.502502502502503e-05,
      "loss": 1.336,
      "step": 875
    },
    {
      "epoch": 1.7508754377188596,
      "eval_loss": 1.282852053642273,
      "eval_runtime": 151.397,
      "eval_samples_per_second": 3.296,
      "eval_steps_per_second": 0.416,
      "step": 875
    },
    {
      "epoch": 1.8009004502251127,
      "grad_norm": 0.33662667870521545,
      "learning_rate": 2.0020020020020023e-05,
      "loss": 1.0725,
      "step": 900
    },
    {
      "epoch": 1.8009004502251127,
      "eval_loss": 1.2822794914245605,
      "eval_runtime": 150.7632,
      "eval_samples_per_second": 3.31,
      "eval_steps_per_second": 0.418,
      "step": 900
    },
    {
      "epoch": 1.8509254627313658,
      "grad_norm": 0.29956212639808655,
      "learning_rate": 1.5015015015015016e-05,
      "loss": 1.3989,
      "step": 925
    },
    {
      "epoch": 1.8509254627313658,
      "eval_loss": 1.2824186086654663,
      "eval_runtime": 150.6938,
      "eval_samples_per_second": 3.311,
      "eval_steps_per_second": 0.418,
      "step": 925
    },
    {
      "epoch": 1.900950475237619,
      "grad_norm": 0.3255136013031006,
      "learning_rate": 1.0010010010010011e-05,
      "loss": 1.112,
      "step": 950
    },
    {
      "epoch": 1.900950475237619,
      "eval_loss": 1.28144371509552,
      "eval_runtime": 149.8969,
      "eval_samples_per_second": 3.329,
      "eval_steps_per_second": 0.42,
      "step": 950
    },
    {
      "epoch": 1.950975487743872,
      "grad_norm": 0.2689700424671173,
      "learning_rate": 5.005005005005006e-06,
      "loss": 1.3972,
      "step": 975
    },
    {
      "epoch": 1.950975487743872,
      "eval_loss": 1.280760645866394,
      "eval_runtime": 149.8977,
      "eval_samples_per_second": 3.329,
      "eval_steps_per_second": 0.42,
      "step": 975
    },
    {
      "epoch": 2.001000500250125,
      "grad_norm": 0.3633726239204407,
      "learning_rate": 0.0,
      "loss": 1.1746,
      "step": 1000
    },
    {
      "epoch": 2.001000500250125,
      "eval_loss": 1.2818013429641724,
      "eval_runtime": 149.8121,
      "eval_samples_per_second": 3.331,
      "eval_steps_per_second": 0.421,
      "step": 1000
    }
  ],
  "logging_steps": 25,
  "max_steps": 1000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 25,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.844485620424704e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}