{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "grad_norm": 5.274852752685547, "learning_rate": 0.00019525, "loss": 3.313, "step": 20 }, { "epoch": 0.1, "grad_norm": 8.226648330688477, "learning_rate": 0.00019025000000000002, "loss": 2.2679, "step": 40 }, { "epoch": 0.15, "grad_norm": 4.02825403213501, "learning_rate": 0.00018525, "loss": 0.7816, "step": 60 }, { "epoch": 0.2, "grad_norm": 12.396927833557129, "learning_rate": 0.00018025000000000002, "loss": 0.6099, "step": 80 }, { "epoch": 0.25, "grad_norm": 3.590737819671631, "learning_rate": 0.00017525, "loss": 0.4067, "step": 100 }, { "epoch": 0.3, "grad_norm": 3.1620190143585205, "learning_rate": 0.00017025, "loss": 0.2761, "step": 120 }, { "epoch": 0.35, "grad_norm": 5.800012588500977, "learning_rate": 0.00016525, "loss": 0.2304, "step": 140 }, { "epoch": 0.4, "grad_norm": 3.7974071502685547, "learning_rate": 0.00016025000000000002, "loss": 0.1981, "step": 160 }, { "epoch": 0.45, "grad_norm": 2.664266347885132, "learning_rate": 0.00015525, "loss": 0.1579, "step": 180 }, { "epoch": 0.5, "grad_norm": 9.237320899963379, "learning_rate": 0.00015025, "loss": 0.1837, "step": 200 }, { "epoch": 0.55, "grad_norm": 2.3988049030303955, "learning_rate": 0.00014525, "loss": 0.1168, "step": 220 }, { "epoch": 0.6, "grad_norm": 6.759559631347656, "learning_rate": 0.00014025000000000002, "loss": 0.1017, "step": 240 }, { "epoch": 0.65, "grad_norm": 1.841704249382019, "learning_rate": 0.00013525, "loss": 0.093, "step": 260 }, { "epoch": 0.7, "grad_norm": 1.0974807739257812, "learning_rate": 0.00013025, "loss": 0.0944, "step": 280 }, { "epoch": 0.75, "grad_norm": 1.219509243965149, "learning_rate": 0.00012525, "loss": 0.0921, "step": 300 }, { "epoch": 0.8, "grad_norm": 1.0698885917663574, "learning_rate": 0.00012025, "loss": 0.0796, "step": 320 }, { "epoch": 0.85, "grad_norm": 1.5936706066131592, "learning_rate": 0.00011525000000000001, "loss": 0.0647, "step": 340 }, { "epoch": 0.9, "grad_norm": 3.975679636001587, "learning_rate": 0.00011025000000000001, "loss": 0.0711, "step": 360 }, { "epoch": 0.95, "grad_norm": 0.7344488501548767, "learning_rate": 0.00010525000000000001, "loss": 0.0598, "step": 380 }, { "epoch": 1.0, "grad_norm": 0.47527560591697693, "learning_rate": 0.00010025, "loss": 0.0688, "step": 400 }, { "epoch": 1.0, "eval_loss": 0.04188907518982887, "eval_runtime": 42.5156, "eval_samples_per_second": 2.352, "eval_steps_per_second": 0.306, "step": 400 } ], "logging_steps": 20, "max_steps": 800, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.41750580543488e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }