{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 290,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 0.7135222218930721,
      "epoch": 0.1386481802426343,
      "grad_norm": 2.765625,
      "learning_rate": 3.2758620689655175e-06,
      "loss": 1.0037,
      "mean_token_accuracy": 0.766643451154232,
      "num_tokens": 6832690.0,
      "step": 20
    },
    {
      "entropy": 0.6983387872576714,
      "epoch": 0.2772963604852686,
      "grad_norm": 1.171875,
      "learning_rate": 6.724137931034484e-06,
      "loss": 0.7837,
      "mean_token_accuracy": 0.7993580244481564,
      "num_tokens": 13664933.0,
      "step": 40
    },
    {
      "entropy": 0.5203136764466763,
      "epoch": 0.41594454072790293,
      "grad_norm": 0.44140625,
      "learning_rate": 9.999541586764836e-06,
      "loss": 0.5293,
      "mean_token_accuracy": 0.8474333696067333,
      "num_tokens": 20500452.0,
      "step": 60
    },
    {
      "entropy": 0.4632941197603941,
      "epoch": 0.5545927209705372,
      "grad_norm": 0.34765625,
      "learning_rate": 9.799195340909569e-06,
      "loss": 0.4664,
      "mean_token_accuracy": 0.8605481564998627,
      "num_tokens": 27342335.0,
      "step": 80
    },
    {
      "entropy": 0.44385356418788435,
      "epoch": 0.6932409012131716,
      "grad_norm": 0.384765625,
      "learning_rate": 9.248987682898576e-06,
      "loss": 0.4448,
      "mean_token_accuracy": 0.8655192881822587,
      "num_tokens": 34182590.0,
      "step": 100
    },
    {
      "entropy": 0.44103220105171204,
      "epoch": 0.8318890814558059,
      "grad_norm": 0.341796875,
      "learning_rate": 8.389028759232816e-06,
      "loss": 0.4425,
      "mean_token_accuracy": 0.8660077638924122,
      "num_tokens": 41024570.0,
      "step": 120
    },
    {
      "entropy": 0.4328003875911236,
      "epoch": 0.9705372616984402,
      "grad_norm": 0.318359375,
      "learning_rate": 7.2820095883138456e-06,
      "loss": 0.4334,
      "mean_token_accuracy": 0.8682045668363572,
      "num_tokens": 47861377.0,
      "step": 140
    },
    {
      "entropy": 0.4284165660282234,
      "epoch": 1.1039861351819757,
      "grad_norm": 0.326171875,
      "learning_rate": 6.008631884264387e-06,
      "loss": 0.4289,
      "mean_token_accuracy": 0.868948469688366,
      "num_tokens": 54391813.0,
      "step": 160
    },
    {
      "entropy": 0.4236792534589767,
      "epoch": 1.24263431542461,
      "grad_norm": 0.341796875,
      "learning_rate": 4.661724900761355e-06,
      "loss": 0.4239,
      "mean_token_accuracy": 0.8704770557582379,
      "num_tokens": 61227970.0,
      "step": 180
    },
    {
      "entropy": 0.42442810237407685,
      "epoch": 1.3812824956672443,
      "grad_norm": 0.33984375,
      "learning_rate": 3.3394781770539406e-06,
      "loss": 0.4245,
      "mean_token_accuracy": 0.8702129699289799,
      "num_tokens": 68065726.0,
      "step": 200
    },
    {
      "entropy": 0.42482112273573874,
      "epoch": 1.5199306759098787,
      "grad_norm": 0.318359375,
      "learning_rate": 2.138283519083281e-06,
      "loss": 0.4249,
      "mean_token_accuracy": 0.8700512439012528,
      "num_tokens": 74903041.0,
      "step": 220
    },
    {
      "entropy": 0.4213219854980707,
      "epoch": 1.658578856152513,
      "grad_norm": 0.32421875,
      "learning_rate": 1.145708035387177e-06,
      "loss": 0.4219,
      "mean_token_accuracy": 0.8707552805542946,
      "num_tokens": 81743295.0,
      "step": 240
    },
    {
      "entropy": 0.422089908644557,
      "epoch": 1.7972270363951472,
      "grad_norm": 0.322265625,
      "learning_rate": 4.341104935775442e-07,
      "loss": 0.4229,
      "mean_token_accuracy": 0.8708024740219116,
      "num_tokens": 88577789.0,
      "step": 260
    },
    {
      "entropy": 0.4224289160221815,
      "epoch": 1.9358752166377817,
      "grad_norm": 0.322265625,
      "learning_rate": 5.536636509891225e-08,
      "loss": 0.4232,
      "mean_token_accuracy": 0.8705480195581913,
      "num_tokens": 95417722.0,
      "step": 280
    },
    {
      "entropy": 0.4213579764237275,
      "epoch": 2.0,
      "mean_token_accuracy": 0.8710838395196039,
      "num_tokens": 98528536.0,
      "step": 290,
      "total_flos": 2.1561577524323942e+18,
      "train_loss": 0.5024350297862086,
      "train_runtime": 9862.6956,
      "train_samples_per_second": 9.819,
      "train_steps_per_second": 0.029
    }
  ],
  "logging_steps": 20,
  "max_steps": 290,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.1561577524323942e+18,
  "train_batch_size": 84,
  "trial_name": null,
  "trial_params": null
}