{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 234, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.042735042735042736, "grad_norm": 25.421951293945312, "learning_rate": 3.888888888888889e-05, "loss": 14.74276123046875, "step": 10 }, { "epoch": 0.08547008547008547, "grad_norm": 12.622594833374023, "learning_rate": 9.444444444444444e-05, "loss": 9.163805389404297, "step": 20 }, { "epoch": 0.1282051282051282, "grad_norm": 8.006195068359375, "learning_rate": 0.00015000000000000001, "loss": 5.28853759765625, "step": 30 }, { "epoch": 0.17094017094017094, "grad_norm": 3.498350143432617, "learning_rate": 0.00019999888744757143, "loss": 4.076284027099609, "step": 40 }, { "epoch": 0.21367521367521367, "grad_norm": 4.330902576446533, "learning_rate": 0.00019986541110764565, "loss": 3.210728073120117, "step": 50 }, { "epoch": 0.2564102564102564, "grad_norm": 5.267370700836182, "learning_rate": 0.0001995097645450266, "loss": 2.6838237762451174, "step": 60 }, { "epoch": 0.29914529914529914, "grad_norm": 3.2072625160217285, "learning_rate": 0.00019893273896534936, "loss": 2.4369382858276367, "step": 70 }, { "epoch": 0.3418803418803419, "grad_norm": 3.1016528606414795, "learning_rate": 0.00019813561807535598, "loss": 2.205874443054199, "step": 80 }, { "epoch": 0.38461538461538464, "grad_norm": 3.8450214862823486, "learning_rate": 0.00019712017522703764, "loss": 1.9279813766479492, "step": 90 }, { "epoch": 0.42735042735042733, "grad_norm": 2.348071575164795, "learning_rate": 0.00019588866947246498, "loss": 1.8235645294189453, "step": 100 }, { "epoch": 0.4700854700854701, "grad_norm": 3.2652463912963867, "learning_rate": 0.00019444384053808288, "loss": 1.8220790863037108, "step": 110 }, { "epoch": 0.5128205128205128, "grad_norm": 2.6423192024230957, "learning_rate": 0.00019278890272965096, "loss": 1.7959518432617188, "step": 120 }, { "epoch": 0.5555555555555556, "grad_norm": 2.6279354095458984, "learning_rate": 0.00019092753778138886, "loss": 1.7804344177246094, "step": 130 }, { "epoch": 0.5982905982905983, "grad_norm": 2.6313953399658203, "learning_rate": 0.0001888638866652356, "loss": 1.642679214477539, "step": 140 }, { "epoch": 0.6410256410256411, "grad_norm": 2.1009438037872314, "learning_rate": 0.00018660254037844388, "loss": 1.545415496826172, "step": 150 }, { "epoch": 0.6837606837606838, "grad_norm": 2.672374963760376, "learning_rate": 0.00018414852973000503, "loss": 1.5645628929138184, "step": 160 }, { "epoch": 0.7264957264957265, "grad_norm": 2.6783759593963623, "learning_rate": 0.00018150731414862622, "loss": 1.5343215942382813, "step": 170 }, { "epoch": 0.7692307692307693, "grad_norm": 2.3677117824554443, "learning_rate": 0.000178684769537159, "loss": 1.5453574180603027, "step": 180 }, { "epoch": 0.811965811965812, "grad_norm": 2.3082728385925293, "learning_rate": 0.0001756871752004992, "loss": 1.5324308395385742, "step": 190 }, { "epoch": 0.8547008547008547, "grad_norm": 1.969205617904663, "learning_rate": 0.00017252119987603973, "loss": 1.5409900665283203, "step": 200 }, { "epoch": 0.8974358974358975, "grad_norm": 2.5397582054138184, "learning_rate": 0.00016919388689775464, "loss": 1.4344990730285645, "step": 210 }, { "epoch": 0.9401709401709402, "grad_norm": 2.0636305809020996, "learning_rate": 0.00016571263852691888, "loss": 1.4311028480529786, "step": 220 }, { "epoch": 0.9829059829059829, "grad_norm": 2.4687087535858154, "learning_rate": 0.0001620851994843244, "loss": 1.461498737335205, "step": 230 } ], "logging_steps": 10, "max_steps": 702, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.961214772268672e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }