{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 177,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 0.8946975544095039,
      "epoch": 0.17167381974248927,
      "grad_norm": 3.2923269271850586,
      "learning_rate": 6e-06,
      "loss": 0.9271,
      "mean_token_accuracy": 0.7838601619005203,
      "num_tokens": 646810.0,
      "step": 10
    },
    {
      "entropy": 0.8304631188511848,
      "epoch": 0.34334763948497854,
      "grad_norm": 1.4505877494812012,
      "learning_rate": 9.994965332706574e-06,
      "loss": 0.8398,
      "mean_token_accuracy": 0.8009490087628365,
      "num_tokens": 1291203.0,
      "step": 20
    },
    {
      "entropy": 0.768204678595066,
      "epoch": 0.5150214592274678,
      "grad_norm": 1.083687663078308,
      "learning_rate": 9.938441702975689e-06,
      "loss": 0.7803,
      "mean_token_accuracy": 0.8115046098828316,
      "num_tokens": 1941398.0,
      "step": 30
    },
    {
      "entropy": 0.7275782853364945,
      "epoch": 0.6866952789699571,
      "grad_norm": 0.7802343368530273,
      "learning_rate": 9.819814303479268e-06,
      "loss": 0.7392,
      "mean_token_accuracy": 0.8186714142560959,
      "num_tokens": 2586314.0,
      "step": 40
    },
    {
      "entropy": 0.7106381312012673,
      "epoch": 0.8583690987124464,
      "grad_norm": 0.7659251689910889,
      "learning_rate": 9.640574942595195e-06,
      "loss": 0.7229,
      "mean_token_accuracy": 0.8213993713259697,
      "num_tokens": 3239144.0,
      "step": 50
    },
    {
      "entropy": 0.7049121647267729,
      "epoch": 1.0171673819742488,
      "grad_norm": 0.7232816815376282,
      "learning_rate": 9.40297765928369e-06,
      "loss": 0.7128,
      "mean_token_accuracy": 0.8236586129343187,
      "num_tokens": 3846086.0,
      "step": 60
    },
    {
      "entropy": 0.66157948076725,
      "epoch": 1.1888412017167382,
      "grad_norm": 0.6707538962364197,
      "learning_rate": 9.110010377239552e-06,
      "loss": 0.6685,
      "mean_token_accuracy": 0.832102257013321,
      "num_tokens": 4500217.0,
      "step": 70
    },
    {
      "entropy": 0.6357302084565163,
      "epoch": 1.3605150214592274,
      "grad_norm": 0.695083737373352,
      "learning_rate": 8.765357330018056e-06,
      "loss": 0.6488,
      "mean_token_accuracy": 0.8356550931930542,
      "num_tokens": 5142939.0,
      "step": 80
    },
    {
      "entropy": 0.6433110848069191,
      "epoch": 1.5321888412017168,
      "grad_norm": 0.6362840533256531,
      "learning_rate": 8.373352729660373e-06,
      "loss": 0.6497,
      "mean_token_accuracy": 0.8368361875414848,
      "num_tokens": 5795423.0,
      "step": 90
    },
    {
      "entropy": 0.6335700437426567,
      "epoch": 1.703862660944206,
      "grad_norm": 0.608460545539856,
      "learning_rate": 7.938926261462366e-06,
      "loss": 0.646,
      "mean_token_accuracy": 0.8386218667030334,
      "num_tokens": 6441560.0,
      "step": 100
    },
    {
      "entropy": 0.6372291177511216,
      "epoch": 1.8755364806866952,
      "grad_norm": 0.5837402939796448,
      "learning_rate": 7.467541090321735e-06,
      "loss": 0.6432,
      "mean_token_accuracy": 0.8375322207808494,
      "num_tokens": 7088692.0,
      "step": 110
    },
    {
      "entropy": 0.6296658838117445,
      "epoch": 2.0343347639484977,
      "grad_norm": 0.6868466138839722,
      "learning_rate": 6.965125158269619e-06,
      "loss": 0.6355,
      "mean_token_accuracy": 0.8384278191102518,
      "num_tokens": 7689634.0,
      "step": 120
    },
    {
      "entropy": 0.5824477970600128,
      "epoch": 2.2060085836909873,
      "grad_norm": 0.6492967009544373,
      "learning_rate": 6.437996637160086e-06,
      "loss": 0.5903,
      "mean_token_accuracy": 0.8480394512414933,
      "num_tokens": 8335925.0,
      "step": 130
    },
    {
      "entropy": 0.5835598841309547,
      "epoch": 2.3776824034334765,
      "grad_norm": 0.6394692063331604,
      "learning_rate": 5.892784473993184e-06,
      "loss": 0.5887,
      "mean_token_accuracy": 0.8477904468774795,
      "num_tokens": 8984053.0,
      "step": 140
    },
    {
      "entropy": 0.5792419567704201,
      "epoch": 2.5493562231759657,
      "grad_norm": 0.6243847608566284,
      "learning_rate": 5.336345028060199e-06,
      "loss": 0.5894,
      "mean_token_accuracy": 0.8487418398261071,
      "num_tokens": 9632178.0,
      "step": 150
    },
    {
      "entropy": 0.5750040769577026,
      "epoch": 2.721030042918455,
      "grad_norm": 0.6083446741104126,
      "learning_rate": 4.775675848247427e-06,
      "loss": 0.5818,
      "mean_token_accuracy": 0.8494960919022561,
      "num_tokens": 10281754.0,
      "step": 160
    },
    {
      "entropy": 0.584269268810749,
      "epoch": 2.8927038626609445,
      "grad_norm": 0.6467194557189941,
      "learning_rate": 4.217827674798845e-06,
      "loss": 0.5897,
      "mean_token_accuracy": 0.848649799823761,
      "num_tokens": 10930722.0,
      "step": 170
    }
  ],
  "logging_steps": 10,
  "max_steps": 295,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 8.385177303604265e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}