| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.14135049993151624, |
| "eval_steps": 500, |
| "global_step": 258, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 3.362587633728981, |
| "epoch": 0.005478701547733187, |
| "grad_norm": 0.0142822265625, |
| "learning_rate": 2.1232558139534884e-05, |
| "loss": 3.311968994140625, |
| "mean_token_accuracy": 0.37255842238664627, |
| "num_tokens": 79935.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 3.397061613202095, |
| "epoch": 0.010957403095466374, |
| "grad_norm": 0.01409912109375, |
| "learning_rate": 2.037984496124031e-05, |
| "loss": 3.316569137573242, |
| "mean_token_accuracy": 0.36559674218297006, |
| "num_tokens": 160237.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 3.4269614964723587, |
| "epoch": 0.016436104643199563, |
| "grad_norm": 0.01373291015625, |
| "learning_rate": 1.9527131782945736e-05, |
| "loss": 3.297005844116211, |
| "mean_token_accuracy": 0.36753762811422347, |
| "num_tokens": 240388.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 3.52002138197422, |
| "epoch": 0.021914806190932747, |
| "grad_norm": 0.01397705078125, |
| "learning_rate": 1.867441860465116e-05, |
| "loss": 3.3458625793457033, |
| "mean_token_accuracy": 0.36172508671879766, |
| "num_tokens": 320540.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 3.54078314602375, |
| "epoch": 0.027393507738665935, |
| "grad_norm": 0.01373291015625, |
| "learning_rate": 1.7821705426356588e-05, |
| "loss": 3.3246700286865236, |
| "mean_token_accuracy": 0.36284139938652515, |
| "num_tokens": 400670.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 3.5925668478012085, |
| "epoch": 0.032872209286399126, |
| "grad_norm": 0.0145263671875, |
| "learning_rate": 1.6968992248062016e-05, |
| "loss": 3.371358108520508, |
| "mean_token_accuracy": 0.3544324830174446, |
| "num_tokens": 480752.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 3.594508448243141, |
| "epoch": 0.038350910834132314, |
| "grad_norm": 0.01446533203125, |
| "learning_rate": 1.611627906976744e-05, |
| "loss": 3.347163772583008, |
| "mean_token_accuracy": 0.3613624542951584, |
| "num_tokens": 560695.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 3.577978181838989, |
| "epoch": 0.043829612381865495, |
| "grad_norm": 0.01531982421875, |
| "learning_rate": 1.5263565891472868e-05, |
| "loss": 3.31346435546875, |
| "mean_token_accuracy": 0.36706827245652673, |
| "num_tokens": 640768.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 3.65379436314106, |
| "epoch": 0.04930831392959868, |
| "grad_norm": 0.01513671875, |
| "learning_rate": 1.4410852713178296e-05, |
| "loss": 3.376618576049805, |
| "mean_token_accuracy": 0.3594830695539713, |
| "num_tokens": 720755.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 3.6422270834445953, |
| "epoch": 0.05478701547733187, |
| "grad_norm": 0.0147705078125, |
| "learning_rate": 1.355813953488372e-05, |
| "loss": 3.340700149536133, |
| "mean_token_accuracy": 0.36250820718705656, |
| "num_tokens": 800718.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 3.6474091559648514, |
| "epoch": 0.06026571702506506, |
| "grad_norm": 0.0152587890625, |
| "learning_rate": 1.2705426356589148e-05, |
| "loss": 3.351934051513672, |
| "mean_token_accuracy": 0.3618617424741387, |
| "num_tokens": 880453.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 3.5958647608757017, |
| "epoch": 0.06574441857279825, |
| "grad_norm": 0.01519775390625, |
| "learning_rate": 1.1852713178294572e-05, |
| "loss": 3.292824935913086, |
| "mean_token_accuracy": 0.37089960649609566, |
| "num_tokens": 960655.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 3.654010236263275, |
| "epoch": 0.07122312012053143, |
| "grad_norm": 0.0162353515625, |
| "learning_rate": 1.1e-05, |
| "loss": 3.3539878845214846, |
| "mean_token_accuracy": 0.3581444948911667, |
| "num_tokens": 1040605.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 3.6406412810087203, |
| "epoch": 0.07670182166826463, |
| "grad_norm": 0.0159912109375, |
| "learning_rate": 1.0147286821705426e-05, |
| "loss": 3.309850311279297, |
| "mean_token_accuracy": 0.36814334206283095, |
| "num_tokens": 1120658.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 3.6587454110383986, |
| "epoch": 0.08218052321599781, |
| "grad_norm": 0.0157470703125, |
| "learning_rate": 9.294573643410852e-06, |
| "loss": 3.327889251708984, |
| "mean_token_accuracy": 0.36664336957037447, |
| "num_tokens": 1200045.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 3.707397770881653, |
| "epoch": 0.08765922476373099, |
| "grad_norm": 0.0174560546875, |
| "learning_rate": 8.44186046511628e-06, |
| "loss": 3.3876434326171876, |
| "mean_token_accuracy": 0.36042023114860056, |
| "num_tokens": 1279910.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 3.6571301251649855, |
| "epoch": 0.09313792631146418, |
| "grad_norm": 0.016357421875, |
| "learning_rate": 7.5891472868217055e-06, |
| "loss": 3.3133255004882813, |
| "mean_token_accuracy": 0.3639252860099077, |
| "num_tokens": 1359687.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 3.6417709648609162, |
| "epoch": 0.09861662785919736, |
| "grad_norm": 0.016845703125, |
| "learning_rate": 6.7364341085271315e-06, |
| "loss": 3.3142974853515623, |
| "mean_token_accuracy": 0.3656363181769848, |
| "num_tokens": 1439345.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 3.708697122335434, |
| "epoch": 0.10409532940693056, |
| "grad_norm": 0.0167236328125, |
| "learning_rate": 5.8837209302325576e-06, |
| "loss": 3.3883724212646484, |
| "mean_token_accuracy": 0.3580680161714554, |
| "num_tokens": 1519760.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 3.6610186755657197, |
| "epoch": 0.10957403095466374, |
| "grad_norm": 0.016845703125, |
| "learning_rate": 5.0310077519379844e-06, |
| "loss": 3.310716247558594, |
| "mean_token_accuracy": 0.3715970482677221, |
| "num_tokens": 1598992.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 3.6523694813251497, |
| "epoch": 0.11505273250239693, |
| "grad_norm": 0.0164794921875, |
| "learning_rate": 4.1782945736434104e-06, |
| "loss": 3.3050559997558593, |
| "mean_token_accuracy": 0.368355280905962, |
| "num_tokens": 1678133.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 3.6562148123979568, |
| "epoch": 0.12053143405013012, |
| "grad_norm": 0.0167236328125, |
| "learning_rate": 3.325581395348837e-06, |
| "loss": 3.312104034423828, |
| "mean_token_accuracy": 0.3710305690765381, |
| "num_tokens": 1757654.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 3.6466243118047714, |
| "epoch": 0.1260101355978633, |
| "grad_norm": 0.01708984375, |
| "learning_rate": 2.4728682170542638e-06, |
| "loss": 3.3163944244384767, |
| "mean_token_accuracy": 0.37261501625180243, |
| "num_tokens": 1837077.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 3.655566415190697, |
| "epoch": 0.1314888371455965, |
| "grad_norm": 0.017333984375, |
| "learning_rate": 1.62015503875969e-06, |
| "loss": 3.315351867675781, |
| "mean_token_accuracy": 0.36515464186668395, |
| "num_tokens": 1917482.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 3.6800541818141936, |
| "epoch": 0.13696753869332967, |
| "grad_norm": 0.01806640625, |
| "learning_rate": 7.674418604651162e-07, |
| "loss": 3.3684162139892577, |
| "mean_token_accuracy": 0.35846460834145544, |
| "num_tokens": 1997079.0, |
| "step": 250 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 258, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3893922899263488.0, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|