{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 11350, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.8810572687224669, "grad_norm": 18084.95703125, "learning_rate": 0.0005993999999999999, "loss": 1.1021, "step": 1000 }, { "epoch": 1.0, "eval_accuracy": 0.2970001674393294, "eval_loss": 4.178283214569092, "eval_runtime": 615.0579, "eval_samples_per_second": 60.807, "eval_steps_per_second": 1.901, "step": 1135 }, { "epoch": 1.7621145374449338, "grad_norm": 26064.150390625, "learning_rate": 0.0005420869565217391, "loss": 0.9898, "step": 2000 }, { "epoch": 2.0, "eval_accuracy": 0.3064508094645081, "eval_loss": 4.065672397613525, "eval_runtime": 614.7114, "eval_samples_per_second": 60.842, "eval_steps_per_second": 1.902, "step": 2270 }, { "epoch": 2.643171806167401, "grad_norm": 14842.248046875, "learning_rate": 0.00048411594202898544, "loss": 0.9617, "step": 3000 }, { "epoch": 3.0, "eval_accuracy": 0.3191253911278085, "eval_loss": 3.9150826930999756, "eval_runtime": 613.7049, "eval_samples_per_second": 60.941, "eval_steps_per_second": 1.905, "step": 3405 }, { "epoch": 3.5242290748898677, "grad_norm": 9546.3798828125, "learning_rate": 0.00042614492753623184, "loss": 0.9291, "step": 4000 }, { "epoch": 4.0, "eval_accuracy": 0.33467485375221073, "eval_loss": 3.7638590335845947, "eval_runtime": 614.245, "eval_samples_per_second": 60.888, "eval_steps_per_second": 1.903, "step": 4540 }, { "epoch": 4.405286343612334, "grad_norm": 6838.88623046875, "learning_rate": 0.00036817391304347824, "loss": 0.8903, "step": 5000 }, { "epoch": 5.0, "eval_accuracy": 0.34916521029333275, "eval_loss": 3.6331233978271484, "eval_runtime": 614.1793, "eval_samples_per_second": 60.894, "eval_steps_per_second": 1.903, "step": 5675 }, { "epoch": 5.286343612334802, "grad_norm": 6897.5400390625, "learning_rate": 0.00031020289855072464, "loss": 0.8546, "step": 6000 }, { "epoch": 6.0, "eval_accuracy": 0.3598596125872516, "eval_loss": 3.546229124069214, "eval_runtime": 613.9133, "eval_samples_per_second": 60.921, "eval_steps_per_second": 1.904, "step": 6810 }, { "epoch": 6.167400881057269, "grad_norm": 7466.97265625, "learning_rate": 0.000252231884057971, "loss": 0.8321, "step": 7000 }, { "epoch": 7.0, "eval_accuracy": 0.36795211235179004, "eval_loss": 3.4829158782958984, "eval_runtime": 613.499, "eval_samples_per_second": 60.962, "eval_steps_per_second": 1.905, "step": 7945 }, { "epoch": 7.048458149779735, "grad_norm": 7247.333984375, "learning_rate": 0.00019426086956521736, "loss": 0.8128, "step": 8000 }, { "epoch": 7.929515418502203, "grad_norm": 7852.26904296875, "learning_rate": 0.00013628985507246376, "loss": 0.798, "step": 9000 }, { "epoch": 8.0, "eval_accuracy": 0.3742137676988604, "eval_loss": 3.4337821006774902, "eval_runtime": 612.7115, "eval_samples_per_second": 61.04, "eval_steps_per_second": 1.908, "step": 9080 }, { "epoch": 8.810572687224669, "grad_norm": 7869.26025390625, "learning_rate": 7.831884057971013e-05, "loss": 0.7836, "step": 10000 }, { "epoch": 9.0, "eval_accuracy": 0.3786116140104859, "eval_loss": 3.4001269340515137, "eval_runtime": 613.2076, "eval_samples_per_second": 60.991, "eval_steps_per_second": 1.906, "step": 10215 }, { "epoch": 9.691629955947137, "grad_norm": 8053.40234375, "learning_rate": 2.034782608695652e-05, "loss": 0.7747, "step": 11000 }, { "epoch": 10.0, "eval_accuracy": 0.3821338572789016, "eval_loss": 3.3799285888671875, "eval_runtime": 613.3109, "eval_samples_per_second": 60.98, "eval_steps_per_second": 1.906, "step": 11350 }, { "epoch": 10.0, "step": 11350, "total_flos": 9.489081434112e+16, "train_loss": 0.8809447758208288, "train_runtime": 16232.7867, "train_samples_per_second": 22.372, "train_steps_per_second": 0.699 } ], "logging_steps": 1000, "max_steps": 11350, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.489081434112e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }