| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.15016685205784205, |
| "eval_steps": 500, |
| "global_step": 135, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 0.8225839495658874, |
| "epoch": 0.0055617352614015575, |
| "grad_norm": 18.125, |
| "learning_rate": 9.757583950719854e-06, |
| "loss": 1.456, |
| "mean_token_accuracy": 0.7172651767730713, |
| "num_tokens": 10200.0, |
| "step": 5 |
| }, |
| { |
| "entropy": 0.990528690814972, |
| "epoch": 0.011123470522803115, |
| "grad_norm": 5.59375, |
| "learning_rate": 2.195456388911967e-05, |
| "loss": 1.1324, |
| "mean_token_accuracy": 0.7504398703575135, |
| "num_tokens": 20440.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 1.0786633253097535, |
| "epoch": 0.01668520578420467, |
| "grad_norm": 4.375, |
| "learning_rate": 3.4151543827519494e-05, |
| "loss": 0.8838, |
| "mean_token_accuracy": 0.7859237432479859, |
| "num_tokens": 30680.0, |
| "step": 15 |
| }, |
| { |
| "entropy": 0.49569674730300906, |
| "epoch": 0.02224694104560623, |
| "grad_norm": 5.65625, |
| "learning_rate": 4.6348523765919305e-05, |
| "loss": 0.4962, |
| "mean_token_accuracy": 0.8680351972579956, |
| "num_tokens": 40920.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 0.3256936728954315, |
| "epoch": 0.027808676307007785, |
| "grad_norm": 3.96875, |
| "learning_rate": 5.854550370431913e-05, |
| "loss": 0.342, |
| "mean_token_accuracy": 0.9047898292541504, |
| "num_tokens": 51160.0, |
| "step": 25 |
| }, |
| { |
| "entropy": 0.34028403759002684, |
| "epoch": 0.03337041156840934, |
| "grad_norm": 2.734375, |
| "learning_rate": 7.074248364271895e-05, |
| "loss": 0.3832, |
| "mean_token_accuracy": 0.8867057681083679, |
| "num_tokens": 61400.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 0.397564172744751, |
| "epoch": 0.0389321468298109, |
| "grad_norm": 3.46875, |
| "learning_rate": 8.293946358111876e-05, |
| "loss": 0.4357, |
| "mean_token_accuracy": 0.8780049562454224, |
| "num_tokens": 71634.0, |
| "step": 35 |
| }, |
| { |
| "entropy": 0.39086252450942993, |
| "epoch": 0.04449388209121246, |
| "grad_norm": 2.171875, |
| "learning_rate": 8.537885171853211e-05, |
| "loss": 0.4262, |
| "mean_token_accuracy": 0.8823036670684814, |
| "num_tokens": 81859.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.43406811356544495, |
| "epoch": 0.05005561735261402, |
| "grad_norm": 3.5, |
| "learning_rate": 8.53788198268306e-05, |
| "loss": 0.4289, |
| "mean_token_accuracy": 0.8754657983779908, |
| "num_tokens": 91643.0, |
| "step": 45 |
| }, |
| { |
| "entropy": 0.33308460712432864, |
| "epoch": 0.05561735261401557, |
| "grad_norm": 2.046875, |
| "learning_rate": 8.537876340307694e-05, |
| "loss": 0.3425, |
| "mean_token_accuracy": 0.9025415539741516, |
| "num_tokens": 101883.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.3366609990596771, |
| "epoch": 0.06117908787541713, |
| "grad_norm": 2.203125, |
| "learning_rate": 8.537868244731438e-05, |
| "loss": 0.3744, |
| "mean_token_accuracy": 0.8893450617790222, |
| "num_tokens": 112123.0, |
| "step": 55 |
| }, |
| { |
| "entropy": 0.36333391070365906, |
| "epoch": 0.06674082313681869, |
| "grad_norm": 1.8203125, |
| "learning_rate": 8.537857695960494e-05, |
| "loss": 0.401, |
| "mean_token_accuracy": 0.8837732076644897, |
| "num_tokens": 122363.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.5008107841014862, |
| "epoch": 0.07230255839822025, |
| "grad_norm": 2.484375, |
| "learning_rate": 8.537844694002943e-05, |
| "loss": 0.5051, |
| "mean_token_accuracy": 0.857869005203247, |
| "num_tokens": 132603.0, |
| "step": 65 |
| }, |
| { |
| "entropy": 0.5081609010696411, |
| "epoch": 0.0778642936596218, |
| "grad_norm": 2.515625, |
| "learning_rate": 8.537829238868749e-05, |
| "loss": 0.5758, |
| "mean_token_accuracy": 0.8529759764671325, |
| "num_tokens": 142833.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 1.2066489636898041, |
| "epoch": 0.08342602892102335, |
| "grad_norm": 338.0, |
| "learning_rate": 8.537811330569756e-05, |
| "loss": 1.6885, |
| "mean_token_accuracy": 0.7405156970024109, |
| "num_tokens": 153063.0, |
| "step": 75 |
| }, |
| { |
| "entropy": 0.3763828158378601, |
| "epoch": 0.08898776418242492, |
| "grad_norm": 1.546875, |
| "learning_rate": 8.537790969119681e-05, |
| "loss": 0.407, |
| "mean_token_accuracy": 0.8820860385894775, |
| "num_tokens": 163299.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.371926474571228, |
| "epoch": 0.09454949944382647, |
| "grad_norm": 1.78125, |
| "learning_rate": 8.537768154534127e-05, |
| "loss": 0.3733, |
| "mean_token_accuracy": 0.8954056620597839, |
| "num_tokens": 173539.0, |
| "step": 85 |
| }, |
| { |
| "entropy": 0.3801657140254974, |
| "epoch": 0.10011123470522804, |
| "grad_norm": 1.375, |
| "learning_rate": 8.537742886830578e-05, |
| "loss": 0.3588, |
| "mean_token_accuracy": 0.8942199230194092, |
| "num_tokens": 183779.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 0.26907028555870055, |
| "epoch": 0.10567296996662959, |
| "grad_norm": 1.828125, |
| "learning_rate": 8.537715166028392e-05, |
| "loss": 0.2725, |
| "mean_token_accuracy": 0.9173020601272583, |
| "num_tokens": 194019.0, |
| "step": 95 |
| }, |
| { |
| "entropy": 0.28473606407642366, |
| "epoch": 0.11123470522803114, |
| "grad_norm": 2.046875, |
| "learning_rate": 8.537684992148809e-05, |
| "loss": 0.3359, |
| "mean_token_accuracy": 0.9006433963775635, |
| "num_tokens": 204222.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.34853672981262207, |
| "epoch": 0.1167964404894327, |
| "grad_norm": 1.90625, |
| "learning_rate": 8.537652365214949e-05, |
| "loss": 0.3321, |
| "mean_token_accuracy": 0.9035966873168946, |
| "num_tokens": 214237.0, |
| "step": 105 |
| }, |
| { |
| "entropy": 0.3243670523166656, |
| "epoch": 0.12235817575083426, |
| "grad_norm": 1.484375, |
| "learning_rate": 8.537617285251812e-05, |
| "loss": 0.3225, |
| "mean_token_accuracy": 0.9064516067504883, |
| "num_tokens": 224477.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 0.3252596139907837, |
| "epoch": 0.12791991101223582, |
| "grad_norm": 1.5546875, |
| "learning_rate": 8.537579752286277e-05, |
| "loss": 0.3515, |
| "mean_token_accuracy": 0.8956989169120788, |
| "num_tokens": 234717.0, |
| "step": 115 |
| }, |
| { |
| "entropy": 0.27094421088695525, |
| "epoch": 0.13348164627363737, |
| "grad_norm": 3.71875, |
| "learning_rate": 8.537539766347103e-05, |
| "loss": 0.302, |
| "mean_token_accuracy": 0.9114369511604309, |
| "num_tokens": 244957.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.3085549890995026, |
| "epoch": 0.13904338153503892, |
| "grad_norm": 1.4921875, |
| "learning_rate": 8.537497327464926e-05, |
| "loss": 0.3061, |
| "mean_token_accuracy": 0.9045943140983581, |
| "num_tokens": 255197.0, |
| "step": 125 |
| }, |
| { |
| "entropy": 0.31254134476184847, |
| "epoch": 0.1446051167964405, |
| "grad_norm": 1.7109375, |
| "learning_rate": 8.537452435672265e-05, |
| "loss": 0.3357, |
| "mean_token_accuracy": 0.9013739109039307, |
| "num_tokens": 265354.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 0.282534995675087, |
| "epoch": 0.15016685205784205, |
| "grad_norm": 1.3203125, |
| "learning_rate": 8.537405091003517e-05, |
| "loss": 0.2944, |
| "mean_token_accuracy": 0.9155425190925598, |
| "num_tokens": 275594.0, |
| "step": 135 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 17980, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 20, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4588266032463872.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|