{
    "best_global_step": null,
    "best_metric": null,
    "best_model_checkpoint": null,
    "epoch": 3.0,
    "eval_steps": 500,
    "global_step": 1407,
    "is_hyper_param_search": false,
    "is_local_process_zero": true,
    "is_world_process_zero": true,
    "log_history": [
        {
            "entropy": 0.7955821046978235,
            "epoch": 0.21344717182497333,
            "grad_norm": 0.259765625,
            "learning_rate": 0.0001859275053304904,
            "loss": 0.8123,
            "mean_token_accuracy": 0.7977147643268109,
            "num_tokens": 3815906.0,
            "step": 100
        },
        {
            "entropy": 0.7014639886468649,
            "epoch": 0.42689434364994666,
            "grad_norm": 0.2265625,
            "learning_rate": 0.0001717128642501777,
            "loss": 0.7209,
            "mean_token_accuracy": 0.8144587337970733,
            "num_tokens": 7626566.0,
            "step": 200
        },
        {
            "entropy": 0.684154222086072,
            "epoch": 0.6403415154749199,
            "grad_norm": 0.2470703125,
            "learning_rate": 0.00015749822316986497,
            "loss": 0.7017,
            "mean_token_accuracy": 0.8176216109097004,
            "num_tokens": 11456784.0,
            "step": 300
        },
        {
            "entropy": 0.6706647833436727,
            "epoch": 0.8537886872998933,
            "grad_norm": 0.2451171875,
            "learning_rate": 0.00014328358208955225,
            "loss": 0.6881,
            "mean_token_accuracy": 0.8207742583751678,
            "num_tokens": 15297842.0,
            "step": 400
        },
        {
            "entropy": 0.5925532557332336,
            "epoch": 1.0661686232657417,
            "grad_norm": 0.2119140625,
            "learning_rate": 0.00012906894100923953,
            "loss": 0.5889,
            "mean_token_accuracy": 0.8439057000617886,
            "num_tokens": 19106846.0,
            "step": 500
        },
        {
            "entropy": 0.42009303748607635,
            "epoch": 1.279615795090715,
            "grad_norm": 0.228515625,
            "learning_rate": 0.0001148542999289268,
            "loss": 0.3836,
            "mean_token_accuracy": 0.8929252660274506,
            "num_tokens": 22944206.0,
            "step": 600
        },
        {
            "entropy": 0.4144257218763232,
            "epoch": 1.4930629669156883,
            "grad_norm": 0.2333984375,
            "learning_rate": 0.00010063965884861408,
            "loss": 0.3809,
            "mean_token_accuracy": 0.8935985819995403,
            "num_tokens": 26787706.0,
            "step": 700
        },
        {
            "entropy": 0.4126653341576457,
            "epoch": 1.7065101387406618,
            "grad_norm": 0.23828125,
            "learning_rate": 8.642501776830136e-05,
            "loss": 0.3796,
            "mean_token_accuracy": 0.8937636642158031,
            "num_tokens": 30652821.0,
            "step": 800
        },
        {
            "entropy": 0.4055024874210358,
            "epoch": 1.9199573105656351,
            "grad_norm": 0.255859375,
            "learning_rate": 7.221037668798864e-05,
            "loss": 0.3719,
            "mean_token_accuracy": 0.896048932671547,
            "num_tokens": 34453009.0,
            "step": 900
        },
        {
            "entropy": 0.33940730929075175,
            "epoch": 2.1323372465314834,
            "grad_norm": 0.2001953125,
            "learning_rate": 5.7995735607675904e-05,
            "loss": 0.2844,
            "mean_token_accuracy": 0.922690433024162,
            "num_tokens": 38246767.0,
            "step": 1000
        },
        {
            "entropy": 0.28708830222487447,
            "epoch": 2.3457844183564567,
            "grad_norm": 0.20703125,
            "learning_rate": 4.3781094527363184e-05,
            "loss": 0.2259,
            "mean_token_accuracy": 0.9395377920567989,
            "num_tokens": 42074506.0,
            "step": 1100
        },
        {
            "entropy": 0.28266524378210306,
            "epoch": 2.55923159018143,
            "grad_norm": 0.20703125,
            "learning_rate": 2.9566453447050464e-05,
            "loss": 0.2225,
            "mean_token_accuracy": 0.9406452259421348,
            "num_tokens": 45883112.0,
            "step": 1200
        },
        {
            "entropy": 0.28105670753866435,
            "epoch": 2.7726787620064033,
            "grad_norm": 0.19921875,
            "learning_rate": 1.535181236673774e-05,
            "loss": 0.2224,
            "mean_token_accuracy": 0.9406881707906724,
            "num_tokens": 49691137.0,
            "step": 1300
        },
        {
            "entropy": 0.28744990050792696,
            "epoch": 2.9861259338313766,
            "grad_norm": 0.201171875,
            "learning_rate": 1.1371712864250178e-06,
            "loss": 0.2261,
            "mean_token_accuracy": 0.9393656353652478,
            "num_tokens": 53554833.0,
            "step": 1400
        }
    ],
    "logging_steps": 100,
    "max_steps": 1407,
    "num_input_tokens_seen": 0,
    "num_train_epochs": 3,
    "save_steps": 500,
    "stateful_callbacks": {
        "TrainerControl": {
            "args": {
                "should_epoch_stop": false,
                "should_evaluate": false,
                "should_log": false,
                "should_save": true,
                "should_training_stop": true
            },
            "attributes": {}
        }
    },
    "total_flos": 1.0385484190580736e+18,
    "train_batch_size": 2,
    "trial_name": null,
    "trial_params": null
}