{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 1320, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.22727272727272727, "grad_norm": 0.2216796875, "learning_rate": 4.943181818181818e-05, "loss": 0.5826, "step": 50 }, { "epoch": 0.45454545454545453, "grad_norm": 0.150390625, "learning_rate": 4.886363636363637e-05, "loss": 0.3128, "step": 100 }, { "epoch": 0.6818181818181818, "grad_norm": 0.1298828125, "learning_rate": 4.829545454545455e-05, "loss": 0.289, "step": 150 }, { "epoch": 0.9090909090909091, "grad_norm": 0.1435546875, "learning_rate": 4.772727272727273e-05, "loss": 0.2702, "step": 200 }, { "epoch": 1.0, "eval_loss": 0.24845334887504578, "eval_runtime": 52.787, "eval_samples_per_second": 14.89, "eval_steps_per_second": 0.474, "step": 220 }, { "epoch": 1.1363636363636362, "grad_norm": 0.1396484375, "learning_rate": 4.715909090909091e-05, "loss": 0.2663, "step": 250 }, { "epoch": 1.3636363636363638, "grad_norm": 0.166015625, "learning_rate": 4.659090909090909e-05, "loss": 0.2591, "step": 300 }, { "epoch": 1.5909090909090908, "grad_norm": 0.162109375, "learning_rate": 4.602272727272727e-05, "loss": 0.2498, "step": 350 }, { "epoch": 1.8181818181818183, "grad_norm": 0.119140625, "learning_rate": 4.545454545454546e-05, "loss": 0.2557, "step": 400 }, { "epoch": 2.0, "eval_loss": 0.2350398302078247, "eval_runtime": 53.0739, "eval_samples_per_second": 14.81, "eval_steps_per_second": 0.471, "step": 440 }, { "epoch": 2.0454545454545454, "grad_norm": 0.1259765625, "learning_rate": 4.488636363636364e-05, "loss": 0.2466, "step": 450 }, { "epoch": 2.2727272727272725, "grad_norm": 0.1416015625, "learning_rate": 4.431818181818182e-05, "loss": 0.2361, "step": 500 }, { "epoch": 2.5, "grad_norm": 0.1337890625, "learning_rate": 4.375e-05, "loss": 0.253, "step": 550 }, { "epoch": 2.7272727272727275, "grad_norm": 0.1513671875, "learning_rate": 4.318181818181819e-05, "loss": 0.2409, "step": 600 }, { "epoch": 2.9545454545454546, "grad_norm": 0.1640625, "learning_rate": 4.261363636363637e-05, "loss": 0.2436, "step": 650 }, { "epoch": 3.0, "eval_loss": 0.22992941737174988, "eval_runtime": 53.1353, "eval_samples_per_second": 14.792, "eval_steps_per_second": 0.47, "step": 660 }, { "epoch": 3.1818181818181817, "grad_norm": 0.16015625, "learning_rate": 4.204545454545455e-05, "loss": 0.2363, "step": 700 }, { "epoch": 3.409090909090909, "grad_norm": 0.1650390625, "learning_rate": 4.1477272727272734e-05, "loss": 0.2256, "step": 750 }, { "epoch": 3.6363636363636362, "grad_norm": 0.158203125, "learning_rate": 4.0909090909090915e-05, "loss": 0.2389, "step": 800 }, { "epoch": 3.8636363636363638, "grad_norm": 0.169921875, "learning_rate": 4.034090909090909e-05, "loss": 0.2291, "step": 850 }, { "epoch": 4.0, "eval_loss": 0.22730520367622375, "eval_runtime": 53.1918, "eval_samples_per_second": 14.777, "eval_steps_per_second": 0.47, "step": 880 }, { "epoch": 4.090909090909091, "grad_norm": 0.1708984375, "learning_rate": 3.9772727272727275e-05, "loss": 0.2318, "step": 900 }, { "epoch": 4.318181818181818, "grad_norm": 0.181640625, "learning_rate": 3.9204545454545456e-05, "loss": 0.2322, "step": 950 }, { "epoch": 4.545454545454545, "grad_norm": 0.1591796875, "learning_rate": 3.8636363636363636e-05, "loss": 0.2194, "step": 1000 }, { "epoch": 4.7727272727272725, "grad_norm": 0.166015625, "learning_rate": 3.8068181818181816e-05, "loss": 0.2277, "step": 1050 }, { "epoch": 5.0, "grad_norm": 0.1767578125, "learning_rate": 3.7500000000000003e-05, "loss": 0.2329, "step": 1100 }, { "epoch": 5.0, "eval_loss": 0.2264343947172165, "eval_runtime": 54.0094, "eval_samples_per_second": 14.553, "eval_steps_per_second": 0.463, "step": 1100 }, { "epoch": 5.2272727272727275, "grad_norm": 0.1806640625, "learning_rate": 3.6931818181818184e-05, "loss": 0.2234, "step": 1150 }, { "epoch": 5.454545454545454, "grad_norm": 0.19140625, "learning_rate": 3.6363636363636364e-05, "loss": 0.2154, "step": 1200 }, { "epoch": 5.681818181818182, "grad_norm": 0.173828125, "learning_rate": 3.579545454545455e-05, "loss": 0.2349, "step": 1250 }, { "epoch": 5.909090909090909, "grad_norm": 0.162109375, "learning_rate": 3.522727272727273e-05, "loss": 0.2172, "step": 1300 }, { "epoch": 6.0, "eval_loss": 0.22602269053459167, "eval_runtime": 54.0842, "eval_samples_per_second": 14.533, "eval_steps_per_second": 0.462, "step": 1320 } ], "logging_steps": 50, "max_steps": 4400, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "total_flos": 9.285391673846661e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }