{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 11320, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.8833922261484098, "grad_norm": 13501.556640625, "learning_rate": 0.0005993999999999999, "loss": 1.1808, "step": 1000 }, { "epoch": 1.0, "eval_accuracy": 0.22445885182474118, "eval_loss": 4.3484416007995605, "eval_runtime": 7.0864, "eval_samples_per_second": 36.972, "eval_steps_per_second": 1.27, "step": 1132 }, { "epoch": 1.76678445229682, "grad_norm": 13690.3916015625, "learning_rate": 0.0005419186046511628, "loss": 0.9761, "step": 2000 }, { "epoch": 2.0, "eval_accuracy": 0.27711716287477034, "eval_loss": 3.9408960342407227, "eval_runtime": 6.3793, "eval_samples_per_second": 41.07, "eval_steps_per_second": 1.411, "step": 2264 }, { "epoch": 2.65017667844523, "grad_norm": 11980.6181640625, "learning_rate": 0.00048377906976744176, "loss": 0.9067, "step": 3000 }, { "epoch": 3.0, "eval_accuracy": 0.3071958889171061, "eval_loss": 3.7296414375305176, "eval_runtime": 6.3284, "eval_samples_per_second": 41.401, "eval_steps_per_second": 1.422, "step": 3396 }, { "epoch": 3.53356890459364, "grad_norm": 10367.0361328125, "learning_rate": 0.0004256395348837209, "loss": 0.8612, "step": 4000 }, { "epoch": 4.0, "eval_accuracy": 0.3292451561823098, "eval_loss": 3.5685932636260986, "eval_runtime": 6.4008, "eval_samples_per_second": 40.933, "eval_steps_per_second": 1.406, "step": 4528 }, { "epoch": 4.41696113074205, "grad_norm": 10069.853515625, "learning_rate": 0.0003675, "loss": 0.8249, "step": 5000 }, { "epoch": 5.0, "eval_accuracy": 0.3458941455908935, "eval_loss": 3.458752393722534, "eval_runtime": 6.5228, "eval_samples_per_second": 40.167, "eval_steps_per_second": 1.38, "step": 5660 }, { "epoch": 5.30035335689046, "grad_norm": 10210.6884765625, "learning_rate": 0.00030936046511627905, "loss": 0.7961, "step": 6000 }, { "epoch": 6.0, "eval_accuracy": 0.3591072735692625, "eval_loss": 3.375469446182251, "eval_runtime": 6.438, "eval_samples_per_second": 40.696, "eval_steps_per_second": 1.398, "step": 6792 }, { "epoch": 6.18374558303887, "grad_norm": 10369.4931640625, "learning_rate": 0.0002512209302325581, "loss": 0.7736, "step": 7000 }, { "epoch": 7.0, "eval_accuracy": 0.3692057184685021, "eval_loss": 3.306090831756592, "eval_runtime": 6.274, "eval_samples_per_second": 41.76, "eval_steps_per_second": 1.434, "step": 7924 }, { "epoch": 7.067137809187279, "grad_norm": 10323.146484375, "learning_rate": 0.00019308139534883722, "loss": 0.7547, "step": 8000 }, { "epoch": 7.950530035335689, "grad_norm": 10506.6484375, "learning_rate": 0.00013494186046511625, "loss": 0.7365, "step": 9000 }, { "epoch": 8.0, "eval_accuracy": 0.3775862326526344, "eval_loss": 3.2525575160980225, "eval_runtime": 6.6899, "eval_samples_per_second": 39.164, "eval_steps_per_second": 1.345, "step": 9056 }, { "epoch": 8.8339222614841, "grad_norm": 10966.0029296875, "learning_rate": 7.680232558139534e-05, "loss": 0.7209, "step": 10000 }, { "epoch": 9.0, "eval_accuracy": 0.3828520637576373, "eval_loss": 3.2175469398498535, "eval_runtime": 6.6835, "eval_samples_per_second": 39.201, "eval_steps_per_second": 1.347, "step": 10188 }, { "epoch": 9.717314487632509, "grad_norm": 11004.7744140625, "learning_rate": 1.8662790697674418e-05, "loss": 0.7088, "step": 11000 }, { "epoch": 10.0, "eval_accuracy": 0.3858024230292347, "eval_loss": 3.1980981826782227, "eval_runtime": 6.3685, "eval_samples_per_second": 41.14, "eval_steps_per_second": 1.413, "step": 11320 }, { "epoch": 10.0, "step": 11320, "total_flos": 9.464519983104e+16, "train_loss": 0.8361858516194374, "train_runtime": 10174.633, "train_samples_per_second": 35.6, "train_steps_per_second": 1.113 } ], "logging_steps": 1000, "max_steps": 11320, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.464519983104e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }