{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.15016685205784205, "eval_steps": 500, "global_step": 135, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.8225839495658874, "epoch": 0.0055617352614015575, "grad_norm": 18.125, "learning_rate": 9.757583950719854e-06, "loss": 1.456, "mean_token_accuracy": 0.7172651767730713, "num_tokens": 10200.0, "step": 5 }, { "entropy": 0.990528690814972, "epoch": 0.011123470522803115, "grad_norm": 5.59375, "learning_rate": 2.195456388911967e-05, "loss": 1.1324, "mean_token_accuracy": 0.7504398703575135, "num_tokens": 20440.0, "step": 10 }, { "entropy": 1.0786633253097535, "epoch": 0.01668520578420467, "grad_norm": 4.375, "learning_rate": 3.4151543827519494e-05, "loss": 0.8838, "mean_token_accuracy": 0.7859237432479859, "num_tokens": 30680.0, "step": 15 }, { "entropy": 0.49569674730300906, "epoch": 0.02224694104560623, "grad_norm": 5.65625, "learning_rate": 4.6348523765919305e-05, "loss": 0.4962, "mean_token_accuracy": 0.8680351972579956, "num_tokens": 40920.0, "step": 20 }, { "entropy": 0.3256936728954315, "epoch": 0.027808676307007785, "grad_norm": 3.96875, "learning_rate": 5.854550370431913e-05, "loss": 0.342, "mean_token_accuracy": 0.9047898292541504, "num_tokens": 51160.0, "step": 25 }, { "entropy": 0.34028403759002684, "epoch": 0.03337041156840934, "grad_norm": 2.734375, "learning_rate": 7.074248364271895e-05, "loss": 0.3832, "mean_token_accuracy": 0.8867057681083679, "num_tokens": 61400.0, "step": 30 }, { "entropy": 0.397564172744751, "epoch": 0.0389321468298109, "grad_norm": 3.46875, "learning_rate": 8.293946358111876e-05, "loss": 0.4357, "mean_token_accuracy": 0.8780049562454224, "num_tokens": 71634.0, "step": 35 }, { "entropy": 0.39086252450942993, "epoch": 0.04449388209121246, "grad_norm": 2.171875, "learning_rate": 8.537885171853211e-05, "loss": 0.4262, "mean_token_accuracy": 0.8823036670684814, "num_tokens": 81859.0, "step": 40 }, { "entropy": 0.43406811356544495, "epoch": 0.05005561735261402, "grad_norm": 3.5, "learning_rate": 8.53788198268306e-05, "loss": 0.4289, "mean_token_accuracy": 0.8754657983779908, "num_tokens": 91643.0, "step": 45 }, { "entropy": 0.33308460712432864, "epoch": 0.05561735261401557, "grad_norm": 2.046875, "learning_rate": 8.537876340307694e-05, "loss": 0.3425, "mean_token_accuracy": 0.9025415539741516, "num_tokens": 101883.0, "step": 50 }, { "entropy": 0.3366609990596771, "epoch": 0.06117908787541713, "grad_norm": 2.203125, "learning_rate": 8.537868244731438e-05, "loss": 0.3744, "mean_token_accuracy": 0.8893450617790222, "num_tokens": 112123.0, "step": 55 }, { "entropy": 0.36333391070365906, "epoch": 0.06674082313681869, "grad_norm": 1.8203125, "learning_rate": 8.537857695960494e-05, "loss": 0.401, "mean_token_accuracy": 0.8837732076644897, "num_tokens": 122363.0, "step": 60 }, { "entropy": 0.5008107841014862, "epoch": 0.07230255839822025, "grad_norm": 2.484375, "learning_rate": 8.537844694002943e-05, "loss": 0.5051, "mean_token_accuracy": 0.857869005203247, "num_tokens": 132603.0, "step": 65 }, { "entropy": 0.5081609010696411, "epoch": 0.0778642936596218, "grad_norm": 2.515625, "learning_rate": 8.537829238868749e-05, "loss": 0.5758, "mean_token_accuracy": 0.8529759764671325, "num_tokens": 142833.0, "step": 70 }, { "entropy": 1.2066489636898041, "epoch": 0.08342602892102335, "grad_norm": 338.0, "learning_rate": 8.537811330569756e-05, "loss": 1.6885, "mean_token_accuracy": 0.7405156970024109, "num_tokens": 153063.0, "step": 75 }, { "entropy": 0.3763828158378601, "epoch": 0.08898776418242492, "grad_norm": 1.546875, "learning_rate": 8.537790969119681e-05, "loss": 0.407, "mean_token_accuracy": 0.8820860385894775, "num_tokens": 163299.0, "step": 80 }, { "entropy": 0.371926474571228, "epoch": 0.09454949944382647, "grad_norm": 1.78125, "learning_rate": 8.537768154534127e-05, "loss": 0.3733, "mean_token_accuracy": 0.8954056620597839, "num_tokens": 173539.0, "step": 85 }, { "entropy": 0.3801657140254974, "epoch": 0.10011123470522804, "grad_norm": 1.375, "learning_rate": 8.537742886830578e-05, "loss": 0.3588, "mean_token_accuracy": 0.8942199230194092, "num_tokens": 183779.0, "step": 90 }, { "entropy": 0.26907028555870055, "epoch": 0.10567296996662959, "grad_norm": 1.828125, "learning_rate": 8.537715166028392e-05, "loss": 0.2725, "mean_token_accuracy": 0.9173020601272583, "num_tokens": 194019.0, "step": 95 }, { "entropy": 0.28473606407642366, "epoch": 0.11123470522803114, "grad_norm": 2.046875, "learning_rate": 8.537684992148809e-05, "loss": 0.3359, "mean_token_accuracy": 0.9006433963775635, "num_tokens": 204222.0, "step": 100 }, { "entropy": 0.34853672981262207, "epoch": 0.1167964404894327, "grad_norm": 1.90625, "learning_rate": 8.537652365214949e-05, "loss": 0.3321, "mean_token_accuracy": 0.9035966873168946, "num_tokens": 214237.0, "step": 105 }, { "entropy": 0.3243670523166656, "epoch": 0.12235817575083426, "grad_norm": 1.484375, "learning_rate": 8.537617285251812e-05, "loss": 0.3225, "mean_token_accuracy": 0.9064516067504883, "num_tokens": 224477.0, "step": 110 }, { "entropy": 0.3252596139907837, "epoch": 0.12791991101223582, "grad_norm": 1.5546875, "learning_rate": 8.537579752286277e-05, "loss": 0.3515, "mean_token_accuracy": 0.8956989169120788, "num_tokens": 234717.0, "step": 115 }, { "entropy": 0.27094421088695525, "epoch": 0.13348164627363737, "grad_norm": 3.71875, "learning_rate": 8.537539766347103e-05, "loss": 0.302, "mean_token_accuracy": 0.9114369511604309, "num_tokens": 244957.0, "step": 120 }, { "entropy": 0.3085549890995026, "epoch": 0.13904338153503892, "grad_norm": 1.4921875, "learning_rate": 8.537497327464926e-05, "loss": 0.3061, "mean_token_accuracy": 0.9045943140983581, "num_tokens": 255197.0, "step": 125 }, { "entropy": 0.31254134476184847, "epoch": 0.1446051167964405, "grad_norm": 1.7109375, "learning_rate": 8.537452435672265e-05, "loss": 0.3357, "mean_token_accuracy": 0.9013739109039307, "num_tokens": 265354.0, "step": 130 }, { "entropy": 0.282534995675087, "epoch": 0.15016685205784205, "grad_norm": 1.3203125, "learning_rate": 8.537405091003517e-05, "loss": 0.2944, "mean_token_accuracy": 0.9155425190925598, "num_tokens": 275594.0, "step": 135 } ], "logging_steps": 5, "max_steps": 17980, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4588266032463872.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }