{ "best_global_step": 360, "best_metric": 0.36397936940193176, "best_model_checkpoint": null, "epoch": 0.4540295119182747, "eval_steps": 40, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04540295119182747, "grad_norm": 5.84375, "learning_rate": 9.75e-06, "loss": 2.131, "step": 40 }, { "epoch": 0.04540295119182747, "eval_loss": 1.8176805973052979, "eval_model_preparation_time": 0.0081, "eval_runtime": 12.8325, "eval_samples_per_second": 79.798, "eval_steps_per_second": 2.494, "step": 40 }, { "epoch": 0.09080590238365494, "grad_norm": 10.5625, "learning_rate": 9.713207455460893e-06, "loss": 1.4527, "step": 80 }, { "epoch": 0.09080590238365494, "eval_loss": 1.1334214210510254, "eval_model_preparation_time": 0.0081, "eval_runtime": 12.7653, "eval_samples_per_second": 80.218, "eval_steps_per_second": 2.507, "step": 80 }, { "epoch": 0.1362088535754824, "grad_norm": 4.09375, "learning_rate": 8.858122916938601e-06, "loss": 0.9438, "step": 120 }, { "epoch": 0.1362088535754824, "eval_loss": 0.7638623714447021, "eval_model_preparation_time": 0.0081, "eval_runtime": 12.8084, "eval_samples_per_second": 79.947, "eval_steps_per_second": 2.498, "step": 120 }, { "epoch": 0.18161180476730987, "grad_norm": 2.734375, "learning_rate": 7.537691814803522e-06, "loss": 0.6657, "step": 160 }, { "epoch": 0.18161180476730987, "eval_loss": 0.5563021898269653, "eval_model_preparation_time": 0.0081, "eval_runtime": 12.785, "eval_samples_per_second": 80.094, "eval_steps_per_second": 2.503, "step": 160 }, { "epoch": 0.22701475595913734, "grad_norm": 2.65625, "learning_rate": 5.911177627460739e-06, "loss": 0.4941, "step": 200 }, { "epoch": 0.22701475595913734, "eval_loss": 0.4392661154270172, "eval_model_preparation_time": 0.0081, "eval_runtime": 12.8151, "eval_samples_per_second": 79.906, "eval_steps_per_second": 2.497, "step": 200 }, { "epoch": 0.2724177071509648, "grad_norm": 1.9921875, "learning_rate": 4.174761970696612e-06, "loss": 0.4092, "step": 240 }, { "epoch": 0.2724177071509648, "eval_loss": 0.3819335103034973, "eval_model_preparation_time": 0.0081, "eval_runtime": 12.831, "eval_samples_per_second": 79.807, "eval_steps_per_second": 2.494, "step": 240 }, { "epoch": 0.3178206583427923, "grad_norm": 1.625, "learning_rate": 2.5378821994826654e-06, "loss": 0.3775, "step": 280 }, { "epoch": 0.3178206583427923, "eval_loss": 0.3678199350833893, "eval_model_preparation_time": 0.0081, "eval_runtime": 12.8047, "eval_samples_per_second": 79.971, "eval_steps_per_second": 2.499, "step": 280 }, { "epoch": 0.36322360953461974, "grad_norm": 1.71875, "learning_rate": 1.1979701719998454e-06, "loss": 0.3693, "step": 320 }, { "epoch": 0.36322360953461974, "eval_loss": 0.36460334062576294, "eval_model_preparation_time": 0.0081, "eval_runtime": 12.7939, "eval_samples_per_second": 80.038, "eval_steps_per_second": 2.501, "step": 320 }, { "epoch": 0.4086265607264472, "grad_norm": 2.25, "learning_rate": 3.166390537580122e-07, "loss": 0.3772, "step": 360 }, { "epoch": 0.4086265607264472, "eval_loss": 0.36397936940193176, "eval_model_preparation_time": 0.0081, "eval_runtime": 12.7983, "eval_samples_per_second": 80.011, "eval_steps_per_second": 2.5, "step": 360 }, { "epoch": 0.4540295119182747, "grad_norm": 1.734375, "learning_rate": 1.903846791434516e-10, "loss": 0.3684, "step": 400 }, { "epoch": 0.4540295119182747, "eval_loss": 0.36452436447143555, "eval_model_preparation_time": 0.0081, "eval_runtime": 12.7892, "eval_samples_per_second": 80.067, "eval_steps_per_second": 2.502, "step": 400 } ], "logging_steps": 40, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.208164803588915e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }