{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 1407,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 0.806774509884417,
      "epoch": 0.21344717182497333,
      "grad_norm": 0.26953125,
      "learning_rate": 0.0001859275053304904,
      "loss": 0.8233,
      "mean_token_accuracy": 0.7962292967736722,
      "num_tokens": 3815906.0,
      "step": 100
    },
    {
      "entropy": 0.7081028638407588,
      "epoch": 0.42689434364994666,
      "grad_norm": 0.2333984375,
      "learning_rate": 0.0001717128642501777,
      "loss": 0.7291,
      "mean_token_accuracy": 0.8129953134059906,
      "num_tokens": 7626566.0,
      "step": 200
    },
    {
      "entropy": 0.6894140053912997,
      "epoch": 0.6403415154749199,
      "grad_norm": 0.25390625,
      "learning_rate": 0.00015749822316986497,
      "loss": 0.7077,
      "mean_token_accuracy": 0.8166575956344605,
      "num_tokens": 11456784.0,
      "step": 300
    },
    {
      "entropy": 0.6752224664017558,
      "epoch": 0.8537886872998933,
      "grad_norm": 0.248046875,
      "learning_rate": 0.00014328358208955225,
      "loss": 0.6934,
      "mean_token_accuracy": 0.819763490408659,
      "num_tokens": 15297842.0,
      "step": 400
    },
    {
      "entropy": 0.5943935247671664,
      "epoch": 1.0661686232657417,
      "grad_norm": 0.212890625,
      "learning_rate": 0.00012906894100923953,
      "loss": 0.5923,
      "mean_token_accuracy": 0.8431494048791914,
      "num_tokens": 19106846.0,
      "step": 500
    },
    {
      "entropy": 0.4187217493355274,
      "epoch": 1.279615795090715,
      "grad_norm": 0.2265625,
      "learning_rate": 0.0001148542999289268,
      "loss": 0.3844,
      "mean_token_accuracy": 0.892728195488453,
      "num_tokens": 22944206.0,
      "step": 600
    },
    {
      "entropy": 0.411714296489954,
      "epoch": 1.4930629669156883,
      "grad_norm": 0.2236328125,
      "learning_rate": 0.00010063965884861408,
      "loss": 0.3803,
      "mean_token_accuracy": 0.8937244528532028,
      "num_tokens": 26787706.0,
      "step": 700
    },
    {
      "entropy": 0.4113427203521132,
      "epoch": 1.7065101387406618,
      "grad_norm": 0.2353515625,
      "learning_rate": 8.642501776830136e-05,
      "loss": 0.3803,
      "mean_token_accuracy": 0.8936202809214592,
      "num_tokens": 30652821.0,
      "step": 800
    },
    {
      "entropy": 0.40327337332069874,
      "epoch": 1.9199573105656351,
      "grad_norm": 0.2470703125,
      "learning_rate": 7.221037668798864e-05,
      "loss": 0.3717,
      "mean_token_accuracy": 0.896033379137516,
      "num_tokens": 34453009.0,
      "step": 900
    },
    {
      "entropy": 0.3362068770518854,
      "epoch": 2.1323372465314834,
      "grad_norm": 0.2001953125,
      "learning_rate": 5.7995735607675904e-05,
      "loss": 0.2826,
      "mean_token_accuracy": 0.9232019716171763,
      "num_tokens": 38246767.0,
      "step": 1000
    },
    {
      "entropy": 0.282884752638638,
      "epoch": 2.3457844183564567,
      "grad_norm": 0.2109375,
      "learning_rate": 4.3781094527363184e-05,
      "loss": 0.2226,
      "mean_token_accuracy": 0.9403098250925541,
      "num_tokens": 42074506.0,
      "step": 1100
    },
    {
      "entropy": 0.27763547843322156,
      "epoch": 2.55923159018143,
      "grad_norm": 0.2138671875,
      "learning_rate": 2.9566453447050464e-05,
      "loss": 0.2188,
      "mean_token_accuracy": 0.9416492365300655,
      "num_tokens": 45883112.0,
      "step": 1200
    },
    {
      "entropy": 0.2768232163786888,
      "epoch": 2.7726787620064033,
      "grad_norm": 0.23828125,
      "learning_rate": 1.535181236673774e-05,
      "loss": 0.2191,
      "mean_token_accuracy": 0.9414364543557167,
      "num_tokens": 49691137.0,
      "step": 1300
    },
    {
      "entropy": 0.28273184813559055,
      "epoch": 2.9861259338313766,
      "grad_norm": 0.1962890625,
      "learning_rate": 1.1371712864250178e-06,
      "loss": 0.2225,
      "mean_token_accuracy": 0.9403267921507359,
      "num_tokens": 53554833.0,
      "step": 1400
    }
  ],
  "logging_steps": 100,
  "max_steps": 1407,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.0385484190580736e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}