{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9990645463049579,
  "eval_steps": 100,
  "global_step": 267,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.037418147801683815,
      "grad_norm": 3.605011224746704,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 1.3411,
      "mean_token_accuracy": 0.7953277874737978,
      "num_tokens": 40960.0,
      "step": 10
    },
    {
      "epoch": 0.07483629560336763,
      "grad_norm": 3.451061964035034,
      "learning_rate": 3.518518518518519e-05,
      "loss": 0.1369,
      "mean_token_accuracy": 0.9616682939231396,
      "num_tokens": 81920.0,
      "step": 20
    },
    {
      "epoch": 0.11225444340505145,
      "grad_norm": 2.5540621280670166,
      "learning_rate": 4.958333333333334e-05,
      "loss": 0.1633,
      "mean_token_accuracy": 0.954109588265419,
      "num_tokens": 122880.0,
      "step": 30
    },
    {
      "epoch": 0.14967259120673526,
      "grad_norm": 4.433518886566162,
      "learning_rate": 4.75e-05,
      "loss": 0.1509,
      "mean_token_accuracy": 0.9584148712456226,
      "num_tokens": 163840.0,
      "step": 40
    },
    {
      "epoch": 0.18709073900841908,
      "grad_norm": 2.500624179840088,
      "learning_rate": 4.541666666666667e-05,
      "loss": 0.1638,
      "mean_token_accuracy": 0.9535224996507168,
      "num_tokens": 204800.0,
      "step": 50
    },
    {
      "epoch": 0.2245088868101029,
      "grad_norm": 1.5342501401901245,
      "learning_rate": 4.3333333333333334e-05,
      "loss": 0.1634,
      "mean_token_accuracy": 0.9534246526658535,
      "num_tokens": 245760.0,
      "step": 60
    },
    {
      "epoch": 0.26192703461178674,
      "grad_norm": 1.9312411546707153,
      "learning_rate": 4.125e-05,
      "loss": 0.141,
      "mean_token_accuracy": 0.9598091915249825,
      "num_tokens": 286720.0,
      "step": 70
    },
    {
      "epoch": 0.2993451824134705,
      "grad_norm": 1.3097331523895264,
      "learning_rate": 3.9166666666666665e-05,
      "loss": 0.1598,
      "mean_token_accuracy": 0.954647745192051,
      "num_tokens": 327680.0,
      "step": 80
    },
    {
      "epoch": 0.33676333021515437,
      "grad_norm": 1.5079143047332764,
      "learning_rate": 3.708333333333334e-05,
      "loss": 0.142,
      "mean_token_accuracy": 0.958488255739212,
      "num_tokens": 368640.0,
      "step": 90
    },
    {
      "epoch": 0.37418147801683815,
      "grad_norm": 1.202209234237671,
      "learning_rate": 3.5e-05,
      "loss": 0.1459,
      "mean_token_accuracy": 0.9584882512688637,
      "num_tokens": 409600.0,
      "step": 100
    },
    {
      "epoch": 0.411599625818522,
      "grad_norm": 1.2866814136505127,
      "learning_rate": 3.291666666666667e-05,
      "loss": 0.1466,
      "mean_token_accuracy": 0.9581213280558586,
      "num_tokens": 450560.0,
      "step": 110
    },
    {
      "epoch": 0.4490177736202058,
      "grad_norm": 1.4433410167694092,
      "learning_rate": 3.0833333333333335e-05,
      "loss": 0.1553,
      "mean_token_accuracy": 0.957167312502861,
      "num_tokens": 491520.0,
      "step": 120
    },
    {
      "epoch": 0.4864359214218896,
      "grad_norm": 1.7865726947784424,
      "learning_rate": 2.8749999999999997e-05,
      "loss": 0.1292,
      "mean_token_accuracy": 0.9638209342956543,
      "num_tokens": 532480.0,
      "step": 130
    },
    {
      "epoch": 0.5238540692235735,
      "grad_norm": 1.4343348741531372,
      "learning_rate": 2.6666666666666667e-05,
      "loss": 0.1454,
      "mean_token_accuracy": 0.9584393322467804,
      "num_tokens": 573440.0,
      "step": 140
    },
    {
      "epoch": 0.5612722170252572,
      "grad_norm": 1.2116364240646362,
      "learning_rate": 2.4583333333333332e-05,
      "loss": 0.1336,
      "mean_token_accuracy": 0.9633561626076699,
      "num_tokens": 614400.0,
      "step": 150
    },
    {
      "epoch": 0.598690364826941,
      "grad_norm": 1.7633224725723267,
      "learning_rate": 2.25e-05,
      "loss": 0.1319,
      "mean_token_accuracy": 0.9626712270081044,
      "num_tokens": 655360.0,
      "step": 160
    },
    {
      "epoch": 0.6361085126286249,
      "grad_norm": 1.3809901475906372,
      "learning_rate": 2.0416666666666667e-05,
      "loss": 0.129,
      "mean_token_accuracy": 0.963209392875433,
      "num_tokens": 696320.0,
      "step": 170
    },
    {
      "epoch": 0.6735266604303087,
      "grad_norm": 1.4324010610580444,
      "learning_rate": 1.8333333333333333e-05,
      "loss": 0.1278,
      "mean_token_accuracy": 0.9635273940861225,
      "num_tokens": 737280.0,
      "step": 180
    },
    {
      "epoch": 0.7109448082319925,
      "grad_norm": 1.3217487335205078,
      "learning_rate": 1.6250000000000002e-05,
      "loss": 0.2005,
      "mean_token_accuracy": 0.9594178041443229,
      "num_tokens": 778240.0,
      "step": 190
    },
    {
      "epoch": 0.7483629560336763,
      "grad_norm": 1.0903115272521973,
      "learning_rate": 1.4166666666666668e-05,
      "loss": 0.1028,
      "mean_token_accuracy": 0.9706457890570164,
      "num_tokens": 819200.0,
      "step": 200
    },
    {
      "epoch": 0.7857811038353602,
      "grad_norm": 1.6808840036392212,
      "learning_rate": 1.2083333333333333e-05,
      "loss": 0.108,
      "mean_token_accuracy": 0.9681751407682896,
      "num_tokens": 860160.0,
      "step": 210
    },
    {
      "epoch": 0.823199251637044,
      "grad_norm": 1.3859535455703735,
      "learning_rate": 1e-05,
      "loss": 0.1081,
      "mean_token_accuracy": 0.9688111506402493,
      "num_tokens": 901120.0,
      "step": 220
    },
    {
      "epoch": 0.8606173994387278,
      "grad_norm": 0.9109633564949036,
      "learning_rate": 7.916666666666667e-06,
      "loss": 0.1121,
      "mean_token_accuracy": 0.9680039115250111,
      "num_tokens": 942080.0,
      "step": 230
    },
    {
      "epoch": 0.8980355472404116,
      "grad_norm": 1.237545132637024,
      "learning_rate": 5.833333333333334e-06,
      "loss": 0.1042,
      "mean_token_accuracy": 0.9695939309895039,
      "num_tokens": 983040.0,
      "step": 240
    },
    {
      "epoch": 0.9354536950420954,
      "grad_norm": 1.4165068864822388,
      "learning_rate": 3.75e-06,
      "loss": 0.0941,
      "mean_token_accuracy": 0.9729452036321163,
      "num_tokens": 1024000.0,
      "step": 250
    },
    {
      "epoch": 0.9728718428437793,
      "grad_norm": 1.478573203086853,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 0.0988,
      "mean_token_accuracy": 0.9710616409778595,
      "num_tokens": 1064960.0,
      "step": 260
    }
  ],
  "logging_steps": 10,
  "max_steps": 267,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2890255829041152.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}