{ "best_global_step": null, "best_metric": 0.5865987539291382, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 57100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 5.180218696594238, "learning_rate": 3.600420315236428e-05, "loss": 1.2756, "step": 5710 }, { "epoch": 1.0, "eval_loss": 0.9367296695709229, "eval_runtime": 76.8888, "eval_samples_per_second": 132.971, "eval_steps_per_second": 16.621, "step": 5710 }, { "epoch": 2.0, "grad_norm": 5.356524467468262, "learning_rate": 3.200630472854641e-05, "loss": 0.9522, "step": 11420 }, { "epoch": 2.0, "eval_loss": 0.8216637969017029, "eval_runtime": 73.2993, "eval_samples_per_second": 139.483, "eval_steps_per_second": 17.435, "step": 11420 }, { "epoch": 3.0, "grad_norm": 4.990707874298096, "learning_rate": 2.800700525394046e-05, "loss": 0.8493, "step": 17130 }, { "epoch": 3.0, "eval_loss": 0.7453346848487854, "eval_runtime": 76.8549, "eval_samples_per_second": 133.03, "eval_steps_per_second": 16.629, "step": 17130 }, { "epoch": 4.0, "grad_norm": 4.565978527069092, "learning_rate": 2.4009106830122595e-05, "loss": 0.781, "step": 22840 }, { "epoch": 4.0, "eval_loss": 0.6874940991401672, "eval_runtime": 38.7504, "eval_samples_per_second": 263.842, "eval_steps_per_second": 32.98, "step": 22840 }, { "epoch": 5.0, "grad_norm": 4.659147262573242, "learning_rate": 2.0010507880910684e-05, "loss": 0.7316, "step": 28550 }, { "epoch": 5.0, "eval_loss": 0.6624111533164978, "eval_runtime": 36.5212, "eval_samples_per_second": 279.947, "eval_steps_per_second": 34.993, "step": 28550 }, { "epoch": 6.0, "grad_norm": 4.955957412719727, "learning_rate": 1.6011908931698776e-05, "loss": 0.6985, "step": 34260 }, { "epoch": 6.0, "eval_loss": 0.6377598643302917, "eval_runtime": 35.9349, "eval_samples_per_second": 284.514, "eval_steps_per_second": 35.564, "step": 34260 }, { "epoch": 7.0, "grad_norm": 4.549160003662109, "learning_rate": 1.2012609457092821e-05, "loss": 0.6736, "step": 39970 }, { "epoch": 7.0, "eval_loss": 0.6155329346656799, "eval_runtime": 37.8645, "eval_samples_per_second": 270.015, "eval_steps_per_second": 33.752, "step": 39970 }, { "epoch": 8.0, "grad_norm": 4.006523609161377, "learning_rate": 8.01260945709282e-06, "loss": 0.6502, "step": 45680 }, { "epoch": 8.0, "eval_loss": 0.6062974333763123, "eval_runtime": 36.5102, "eval_samples_per_second": 280.031, "eval_steps_per_second": 35.004, "step": 45680 }, { "epoch": 9.0, "grad_norm": 3.7265231609344482, "learning_rate": 4.013309982486865e-06, "loss": 0.6345, "step": 51390 }, { "epoch": 9.0, "eval_loss": 0.5894550681114197, "eval_runtime": 43.2376, "eval_samples_per_second": 236.461, "eval_steps_per_second": 29.558, "step": 51390 }, { "epoch": 10.0, "grad_norm": 3.897273302078247, "learning_rate": 1.611208406304729e-08, "loss": 0.6232, "step": 57100 }, { "epoch": 10.0, "eval_loss": 0.5865987539291382, "eval_runtime": 36.5729, "eval_samples_per_second": 279.551, "eval_steps_per_second": 34.944, "step": 57100 }, { "epoch": 10.0, "step": 57100, "total_flos": 6.01310491705344e+16, "train_loss": 0.7869737379939251, "train_runtime": 10124.3653, "train_samples_per_second": 90.247, "train_steps_per_second": 5.64 }, { "epoch": 10.0, "eval_loss": 0.5878283381462097, "eval_runtime": 36.4992, "eval_samples_per_second": 280.116, "eval_steps_per_second": 35.014, "step": 57100 } ], "logging_steps": 500, "max_steps": 57100, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 2, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.01310491705344e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }