| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.983050847457627, | |
| "eval_steps": 500, | |
| "global_step": 330, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0903954802259887, | |
| "grad_norm": 3.1035618914383156, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7917, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.1807909604519774, | |
| "grad_norm": 0.8607691188024904, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6851, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.2711864406779661, | |
| "grad_norm": 1.1913942325018136, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6577, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.3615819209039548, | |
| "grad_norm": 0.548623444538096, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6389, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.4519774011299435, | |
| "grad_norm": 0.585455352514535, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6294, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.5423728813559322, | |
| "grad_norm": 0.8347995665944788, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6195, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.632768361581921, | |
| "grad_norm": 0.7995568610573858, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6108, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.7231638418079096, | |
| "grad_norm": 0.4720233842026471, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5995, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.8135593220338984, | |
| "grad_norm": 0.7085911417350107, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5964, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.903954802259887, | |
| "grad_norm": 0.8836062378579512, | |
| "learning_rate": 5e-06, | |
| "loss": 0.594, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.9943502824858758, | |
| "grad_norm": 0.47743399780572227, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5971, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.9943502824858758, | |
| "eval_loss": 0.593043327331543, | |
| "eval_runtime": 76.0822, | |
| "eval_samples_per_second": 39.181, | |
| "eval_steps_per_second": 0.618, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.0847457627118644, | |
| "grad_norm": 0.5787192502215555, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5786, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.1751412429378532, | |
| "grad_norm": 0.5658782261763434, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5597, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.2655367231638417, | |
| "grad_norm": 0.9890980264503261, | |
| "learning_rate": 5e-06, | |
| "loss": 0.556, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.3559322033898304, | |
| "grad_norm": 0.663801200308687, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5637, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.4463276836158192, | |
| "grad_norm": 0.5545663342924204, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5552, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.536723163841808, | |
| "grad_norm": 0.6217937754745557, | |
| "learning_rate": 5e-06, | |
| "loss": 0.557, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.6271186440677967, | |
| "grad_norm": 0.5052808547840478, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5561, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.7175141242937855, | |
| "grad_norm": 0.4796090218442844, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5583, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.807909604519774, | |
| "grad_norm": 0.5000197369268986, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5572, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.8983050847457628, | |
| "grad_norm": 0.5617846633897506, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5549, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.9887005649717513, | |
| "grad_norm": 0.5583576310929224, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5574, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.9977401129943502, | |
| "eval_loss": 0.580794095993042, | |
| "eval_runtime": 76.6088, | |
| "eval_samples_per_second": 38.912, | |
| "eval_steps_per_second": 0.614, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 2.07909604519774, | |
| "grad_norm": 0.7175040219630014, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5432, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 2.169491525423729, | |
| "grad_norm": 0.5582961983522623, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5182, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 2.2598870056497176, | |
| "grad_norm": 0.5642582951260645, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5201, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.3502824858757063, | |
| "grad_norm": 0.6344116026906986, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5186, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.440677966101695, | |
| "grad_norm": 0.6118175703856888, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5193, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 2.5310734463276834, | |
| "grad_norm": 0.9063214098694031, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5262, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 2.621468926553672, | |
| "grad_norm": 0.7840215083427163, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5232, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 2.711864406779661, | |
| "grad_norm": 0.720684294573406, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5192, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.8022598870056497, | |
| "grad_norm": 0.5415059736199705, | |
| "learning_rate": 5e-06, | |
| "loss": 0.526, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 2.8926553672316384, | |
| "grad_norm": 0.4878846503316281, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5174, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 2.983050847457627, | |
| "grad_norm": 0.5471913311499952, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5233, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 2.983050847457627, | |
| "eval_loss": 0.5797294974327087, | |
| "eval_runtime": 75.4018, | |
| "eval_samples_per_second": 39.535, | |
| "eval_steps_per_second": 0.623, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 2.983050847457627, | |
| "step": 330, | |
| "total_flos": 552552911339520.0, | |
| "train_loss": 0.5736125353610877, | |
| "train_runtime": 10701.1707, | |
| "train_samples_per_second": 15.874, | |
| "train_steps_per_second": 0.031 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 330, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 552552911339520.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |