{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9990645463049579, "eval_steps": 100, "global_step": 267, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.037418147801683815, "grad_norm": 3.605011224746704, "learning_rate": 1.6666666666666667e-05, "loss": 1.3411, "mean_token_accuracy": 0.7953277874737978, "num_tokens": 40960.0, "step": 10 }, { "epoch": 0.07483629560336763, "grad_norm": 3.451061964035034, "learning_rate": 3.518518518518519e-05, "loss": 0.1369, "mean_token_accuracy": 0.9616682939231396, "num_tokens": 81920.0, "step": 20 }, { "epoch": 0.11225444340505145, "grad_norm": 2.5540621280670166, "learning_rate": 4.958333333333334e-05, "loss": 0.1633, "mean_token_accuracy": 0.954109588265419, "num_tokens": 122880.0, "step": 30 }, { "epoch": 0.14967259120673526, "grad_norm": 4.433518886566162, "learning_rate": 4.75e-05, "loss": 0.1509, "mean_token_accuracy": 0.9584148712456226, "num_tokens": 163840.0, "step": 40 }, { "epoch": 0.18709073900841908, "grad_norm": 2.500624179840088, "learning_rate": 4.541666666666667e-05, "loss": 0.1638, "mean_token_accuracy": 0.9535224996507168, "num_tokens": 204800.0, "step": 50 }, { "epoch": 0.2245088868101029, "grad_norm": 1.5342501401901245, "learning_rate": 4.3333333333333334e-05, "loss": 0.1634, "mean_token_accuracy": 0.9534246526658535, "num_tokens": 245760.0, "step": 60 }, { "epoch": 0.26192703461178674, "grad_norm": 1.9312411546707153, "learning_rate": 4.125e-05, "loss": 0.141, "mean_token_accuracy": 0.9598091915249825, "num_tokens": 286720.0, "step": 70 }, { "epoch": 0.2993451824134705, "grad_norm": 1.3097331523895264, "learning_rate": 3.9166666666666665e-05, "loss": 0.1598, "mean_token_accuracy": 0.954647745192051, "num_tokens": 327680.0, "step": 80 }, { "epoch": 0.33676333021515437, "grad_norm": 1.5079143047332764, "learning_rate": 3.708333333333334e-05, "loss": 0.142, "mean_token_accuracy": 0.958488255739212, "num_tokens": 368640.0, "step": 90 }, { "epoch": 0.37418147801683815, "grad_norm": 1.202209234237671, "learning_rate": 3.5e-05, "loss": 0.1459, "mean_token_accuracy": 0.9584882512688637, "num_tokens": 409600.0, "step": 100 }, { "epoch": 0.411599625818522, "grad_norm": 1.2866814136505127, "learning_rate": 3.291666666666667e-05, "loss": 0.1466, "mean_token_accuracy": 0.9581213280558586, "num_tokens": 450560.0, "step": 110 }, { "epoch": 0.4490177736202058, "grad_norm": 1.4433410167694092, "learning_rate": 3.0833333333333335e-05, "loss": 0.1553, "mean_token_accuracy": 0.957167312502861, "num_tokens": 491520.0, "step": 120 }, { "epoch": 0.4864359214218896, "grad_norm": 1.7865726947784424, "learning_rate": 2.8749999999999997e-05, "loss": 0.1292, "mean_token_accuracy": 0.9638209342956543, "num_tokens": 532480.0, "step": 130 }, { "epoch": 0.5238540692235735, "grad_norm": 1.4343348741531372, "learning_rate": 2.6666666666666667e-05, "loss": 0.1454, "mean_token_accuracy": 0.9584393322467804, "num_tokens": 573440.0, "step": 140 }, { "epoch": 0.5612722170252572, "grad_norm": 1.2116364240646362, "learning_rate": 2.4583333333333332e-05, "loss": 0.1336, "mean_token_accuracy": 0.9633561626076699, "num_tokens": 614400.0, "step": 150 }, { "epoch": 0.598690364826941, "grad_norm": 1.7633224725723267, "learning_rate": 2.25e-05, "loss": 0.1319, "mean_token_accuracy": 0.9626712270081044, "num_tokens": 655360.0, "step": 160 }, { "epoch": 0.6361085126286249, "grad_norm": 1.3809901475906372, "learning_rate": 2.0416666666666667e-05, "loss": 0.129, "mean_token_accuracy": 0.963209392875433, "num_tokens": 696320.0, "step": 170 }, { "epoch": 0.6735266604303087, "grad_norm": 1.4324010610580444, "learning_rate": 1.8333333333333333e-05, "loss": 0.1278, "mean_token_accuracy": 0.9635273940861225, "num_tokens": 737280.0, "step": 180 }, { "epoch": 0.7109448082319925, "grad_norm": 1.3217487335205078, "learning_rate": 1.6250000000000002e-05, "loss": 0.2005, "mean_token_accuracy": 0.9594178041443229, "num_tokens": 778240.0, "step": 190 }, { "epoch": 0.7483629560336763, "grad_norm": 1.0903115272521973, "learning_rate": 1.4166666666666668e-05, "loss": 0.1028, "mean_token_accuracy": 0.9706457890570164, "num_tokens": 819200.0, "step": 200 }, { "epoch": 0.7857811038353602, "grad_norm": 1.6808840036392212, "learning_rate": 1.2083333333333333e-05, "loss": 0.108, "mean_token_accuracy": 0.9681751407682896, "num_tokens": 860160.0, "step": 210 }, { "epoch": 0.823199251637044, "grad_norm": 1.3859535455703735, "learning_rate": 1e-05, "loss": 0.1081, "mean_token_accuracy": 0.9688111506402493, "num_tokens": 901120.0, "step": 220 }, { "epoch": 0.8606173994387278, "grad_norm": 0.9109633564949036, "learning_rate": 7.916666666666667e-06, "loss": 0.1121, "mean_token_accuracy": 0.9680039115250111, "num_tokens": 942080.0, "step": 230 }, { "epoch": 0.8980355472404116, "grad_norm": 1.237545132637024, "learning_rate": 5.833333333333334e-06, "loss": 0.1042, "mean_token_accuracy": 0.9695939309895039, "num_tokens": 983040.0, "step": 240 }, { "epoch": 0.9354536950420954, "grad_norm": 1.4165068864822388, "learning_rate": 3.75e-06, "loss": 0.0941, "mean_token_accuracy": 0.9729452036321163, "num_tokens": 1024000.0, "step": 250 }, { "epoch": 0.9728718428437793, "grad_norm": 1.478573203086853, "learning_rate": 1.6666666666666667e-06, "loss": 0.0988, "mean_token_accuracy": 0.9710616409778595, "num_tokens": 1064960.0, "step": 260 } ], "logging_steps": 10, "max_steps": 267, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2890255829041152.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }