{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 10, "global_step": 198, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.30303030303030304, "grad_norm": 6.751555442810059, "learning_rate": 6.5224903051542e-06, "loss": 0.1213, "step": 10 }, { "epoch": 0.30303030303030304, "eval_loss": 0.9380465149879456, "eval_runtime": 12.8667, "eval_samples_per_second": 10.104, "eval_steps_per_second": 0.699, "step": 10 }, { "epoch": 0.6060606060606061, "grad_norm": 1.701265811920166, "learning_rate": 1.30449806103084e-05, "loss": 0.0674, "step": 20 }, { "epoch": 0.6060606060606061, "eval_loss": 0.7714130282402039, "eval_runtime": 13.1222, "eval_samples_per_second": 9.907, "eval_steps_per_second": 0.686, "step": 20 }, { "epoch": 0.9090909090909091, "grad_norm": 0.47370675206184387, "learning_rate": 1.9567470915462598e-05, "loss": 0.0152, "step": 30 }, { "epoch": 0.9090909090909091, "eval_loss": 0.6980556845664978, "eval_runtime": 13.1297, "eval_samples_per_second": 9.901, "eval_steps_per_second": 0.685, "step": 30 }, { "epoch": 1.2121212121212122, "grad_norm": 0.05018394812941551, "learning_rate": 2.60899612206168e-05, "loss": 0.0092, "step": 40 }, { "epoch": 1.2121212121212122, "eval_loss": 0.6736735105514526, "eval_runtime": 13.2489, "eval_samples_per_second": 9.812, "eval_steps_per_second": 0.679, "step": 40 }, { "epoch": 1.5151515151515151, "grad_norm": 0.09279070049524307, "learning_rate": 3.2612451525771e-05, "loss": 0.0061, "step": 50 }, { "epoch": 1.5151515151515151, "eval_loss": 0.6550866961479187, "eval_runtime": 13.7983, "eval_samples_per_second": 9.421, "eval_steps_per_second": 0.652, "step": 50 }, { "epoch": 1.8181818181818183, "grad_norm": 0.15316803753376007, "learning_rate": 3.9134941830925195e-05, "loss": 0.0054, "step": 60 }, { "epoch": 1.8181818181818183, "eval_loss": 0.6054828763008118, "eval_runtime": 13.111, "eval_samples_per_second": 9.915, "eval_steps_per_second": 0.686, "step": 60 }, { "epoch": 2.121212121212121, "grad_norm": 0.038146719336509705, "learning_rate": 4.56574321360794e-05, "loss": 0.005, "step": 70 }, { "epoch": 2.121212121212121, "eval_loss": 0.5666611194610596, "eval_runtime": 13.2994, "eval_samples_per_second": 9.775, "eval_steps_per_second": 0.677, "step": 70 }, { "epoch": 2.4242424242424243, "grad_norm": 0.03521318733692169, "learning_rate": 5.21799224412336e-05, "loss": 0.0031, "step": 80 }, { "epoch": 2.4242424242424243, "eval_loss": 0.5637892484664917, "eval_runtime": 13.5395, "eval_samples_per_second": 9.602, "eval_steps_per_second": 0.665, "step": 80 }, { "epoch": 2.7272727272727275, "grad_norm": 0.021656330674886703, "learning_rate": 5.8702412746387796e-05, "loss": 0.0025, "step": 90 }, { "epoch": 2.7272727272727275, "eval_loss": 0.5629878044128418, "eval_runtime": 12.8928, "eval_samples_per_second": 10.083, "eval_steps_per_second": 0.698, "step": 90 }, { "epoch": 3.0303030303030303, "grad_norm": 0.023006413131952286, "learning_rate": 6.5224903051542e-05, "loss": 0.0023, "step": 100 }, { "epoch": 3.0303030303030303, "eval_loss": 0.5643568634986877, "eval_runtime": 14.0003, "eval_samples_per_second": 9.285, "eval_steps_per_second": 0.643, "step": 100 }, { "epoch": 3.3333333333333335, "grad_norm": 0.018860142678022385, "learning_rate": 7.17473933566962e-05, "loss": 0.0017, "step": 110 }, { "epoch": 3.3333333333333335, "eval_loss": 0.5654418468475342, "eval_runtime": 12.8973, "eval_samples_per_second": 10.08, "eval_steps_per_second": 0.698, "step": 110 }, { "epoch": 3.6363636363636362, "grad_norm": 0.011563817039132118, "learning_rate": 7.826988366185039e-05, "loss": 0.0016, "step": 120 }, { "epoch": 3.6363636363636362, "eval_loss": 0.5633814930915833, "eval_runtime": 13.2174, "eval_samples_per_second": 9.836, "eval_steps_per_second": 0.681, "step": 120 }, { "epoch": 3.9393939393939394, "grad_norm": 0.013678439892828465, "learning_rate": 8.47923739670046e-05, "loss": 0.0015, "step": 130 }, { "epoch": 3.9393939393939394, "eval_loss": 0.5603265166282654, "eval_runtime": 13.4745, "eval_samples_per_second": 9.648, "eval_steps_per_second": 0.668, "step": 130 }, { "epoch": 4.242424242424242, "grad_norm": 0.012644356116652489, "learning_rate": 9.13148642721588e-05, "loss": 0.0013, "step": 140 }, { "epoch": 4.242424242424242, "eval_loss": 0.5610027313232422, "eval_runtime": 13.2941, "eval_samples_per_second": 9.779, "eval_steps_per_second": 0.677, "step": 140 }, { "epoch": 4.545454545454545, "grad_norm": 0.010118371807038784, "learning_rate": 9.783735457731299e-05, "loss": 0.001, "step": 150 }, { "epoch": 4.545454545454545, "eval_loss": 0.5560722947120667, "eval_runtime": 13.2972, "eval_samples_per_second": 9.777, "eval_steps_per_second": 0.677, "step": 150 }, { "epoch": 4.848484848484849, "grad_norm": 0.00965981837362051, "learning_rate": 0.0001043598448824672, "loss": 0.001, "step": 160 }, { "epoch": 4.848484848484849, "eval_loss": 0.5564282536506653, "eval_runtime": 13.6075, "eval_samples_per_second": 9.554, "eval_steps_per_second": 0.661, "step": 160 }, { "epoch": 5.151515151515151, "grad_norm": 0.01098481472581625, "learning_rate": 0.0001108823351876214, "loss": 0.0009, "step": 170 }, { "epoch": 5.151515151515151, "eval_loss": 0.5595658421516418, "eval_runtime": 13.1715, "eval_samples_per_second": 9.87, "eval_steps_per_second": 0.683, "step": 170 }, { "epoch": 5.454545454545454, "grad_norm": 0.006646531634032726, "learning_rate": 0.00011740482549277559, "loss": 0.0008, "step": 180 }, { "epoch": 5.454545454545454, "eval_loss": 0.5603924989700317, "eval_runtime": 13.6131, "eval_samples_per_second": 9.55, "eval_steps_per_second": 0.661, "step": 180 }, { "epoch": 5.757575757575758, "grad_norm": 0.0069460393860936165, "learning_rate": 0.0001239273157979298, "loss": 0.0007, "step": 190 }, { "epoch": 5.757575757575758, "eval_loss": 0.5604722499847412, "eval_runtime": 13.0651, "eval_samples_per_second": 9.95, "eval_steps_per_second": 0.689, "step": 190 } ], "logging_steps": 10, "max_steps": 198, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.41827014467584e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }