| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 10.0, |
| "eval_steps": 500, |
| "global_step": 450, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.22346368715083798, |
| "grad_norm": 2.651468276977539, |
| "learning_rate": 1.777777777777778e-05, |
| "loss": 3.6282, |
| "mean_token_accuracy": 0.4188130386173725, |
| "num_tokens": 25449.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.44692737430167595, |
| "grad_norm": 1.5548869371414185, |
| "learning_rate": 4e-05, |
| "loss": 3.2525, |
| "mean_token_accuracy": 0.45206254720687866, |
| "num_tokens": 49961.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.6703910614525139, |
| "grad_norm": 1.3944281339645386, |
| "learning_rate": 6.222222222222222e-05, |
| "loss": 2.7616, |
| "mean_token_accuracy": 0.4963876515626907, |
| "num_tokens": 74318.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.8938547486033519, |
| "grad_norm": 1.1315394639968872, |
| "learning_rate": 8.444444444444444e-05, |
| "loss": 2.282, |
| "mean_token_accuracy": 0.5633564636111259, |
| "num_tokens": 99279.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 1.111731843575419, |
| "grad_norm": 1.4461969137191772, |
| "learning_rate": 9.998646205897309e-05, |
| "loss": 1.7485, |
| "mean_token_accuracy": 0.6460915727493091, |
| "num_tokens": 123123.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 1.3351955307262569, |
| "grad_norm": 1.323089599609375, |
| "learning_rate": 9.974599143895107e-05, |
| "loss": 1.2522, |
| "mean_token_accuracy": 0.7487043648958206, |
| "num_tokens": 147861.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 1.558659217877095, |
| "grad_norm": 1.4107621908187866, |
| "learning_rate": 9.920634257308216e-05, |
| "loss": 0.6935, |
| "mean_token_accuracy": 0.8642275393009186, |
| "num_tokens": 173170.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 1.7821229050279328, |
| "grad_norm": 1.5649642944335938, |
| "learning_rate": 9.837076097314319e-05, |
| "loss": 0.4389, |
| "mean_token_accuracy": 0.9179576024413109, |
| "num_tokens": 198201.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.7766520380973816, |
| "learning_rate": 9.72442719251944e-05, |
| "loss": 0.2782, |
| "mean_token_accuracy": 0.9465488699766306, |
| "num_tokens": 223068.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 2.223463687150838, |
| "grad_norm": 0.8976675868034363, |
| "learning_rate": 9.583365026691784e-05, |
| "loss": 0.1842, |
| "mean_token_accuracy": 0.9619620949029922, |
| "num_tokens": 247368.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 2.446927374301676, |
| "grad_norm": 1.256354808807373, |
| "learning_rate": 9.414737964294636e-05, |
| "loss": 0.1671, |
| "mean_token_accuracy": 0.9649594113230705, |
| "num_tokens": 272690.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 2.6703910614525137, |
| "grad_norm": 0.5383216142654419, |
| "learning_rate": 9.219560148322654e-05, |
| "loss": 0.1653, |
| "mean_token_accuracy": 0.9639525949954987, |
| "num_tokens": 297796.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 2.893854748603352, |
| "grad_norm": 0.49042603373527527, |
| "learning_rate": 8.99900540112658e-05, |
| "loss": 0.161, |
| "mean_token_accuracy": 0.9645605370402336, |
| "num_tokens": 323013.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 3.111731843575419, |
| "grad_norm": 0.5440304279327393, |
| "learning_rate": 8.754400164907497e-05, |
| "loss": 0.1292, |
| "mean_token_accuracy": 0.9709367400560623, |
| "num_tokens": 346094.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 3.335195530726257, |
| "grad_norm": 0.7135694026947021, |
| "learning_rate": 8.487215524337357e-05, |
| "loss": 0.1022, |
| "mean_token_accuracy": 0.9754688128829002, |
| "num_tokens": 370649.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 3.558659217877095, |
| "grad_norm": 0.7237837910652161, |
| "learning_rate": 8.199058359282674e-05, |
| "loss": 0.1168, |
| "mean_token_accuracy": 0.9735488459467888, |
| "num_tokens": 396363.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 3.782122905027933, |
| "grad_norm": 0.768436849117279, |
| "learning_rate": 7.891661680839932e-05, |
| "loss": 0.1004, |
| "mean_token_accuracy": 0.9762797430157661, |
| "num_tokens": 420757.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 0.5389193892478943, |
| "learning_rate": 7.566874208802938e-05, |
| "loss": 0.0921, |
| "mean_token_accuracy": 0.9779695364145132, |
| "num_tokens": 446136.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 4.223463687150838, |
| "grad_norm": 0.8165420889854431, |
| "learning_rate": 7.226649253244448e-05, |
| "loss": 0.0758, |
| "mean_token_accuracy": 0.9822951450943946, |
| "num_tokens": 470938.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 4.446927374301676, |
| "grad_norm": 0.596653938293457, |
| "learning_rate": 6.873032967079561e-05, |
| "loss": 0.0759, |
| "mean_token_accuracy": 0.9825226783752441, |
| "num_tokens": 494883.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 4.670391061452514, |
| "grad_norm": 0.5423166751861572, |
| "learning_rate": 6.508152040261328e-05, |
| "loss": 0.0647, |
| "mean_token_accuracy": 0.9844422772526741, |
| "num_tokens": 521194.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 4.893854748603352, |
| "grad_norm": 0.4943588376045227, |
| "learning_rate": 6.134200909617135e-05, |
| "loss": 0.0686, |
| "mean_token_accuracy": 0.9831502199172973, |
| "num_tokens": 546333.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 5.111731843575419, |
| "grad_norm": 0.5471933484077454, |
| "learning_rate": 5.753428561247416e-05, |
| "loss": 0.0727, |
| "mean_token_accuracy": 0.9841189751258264, |
| "num_tokens": 569821.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 5.335195530726257, |
| "grad_norm": 0.792306661605835, |
| "learning_rate": 5.368125004858624e-05, |
| "loss": 0.0567, |
| "mean_token_accuracy": 0.987368130683899, |
| "num_tokens": 595559.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 5.558659217877095, |
| "grad_norm": 0.4141499698162079, |
| "learning_rate": 4.9806075013753995e-05, |
| "loss": 0.0519, |
| "mean_token_accuracy": 0.9876407846808434, |
| "num_tokens": 620822.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 5.782122905027933, |
| "grad_norm": 0.560502290725708, |
| "learning_rate": 4.593206626660709e-05, |
| "loss": 0.0534, |
| "mean_token_accuracy": 0.9872279047966004, |
| "num_tokens": 645049.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 6.0, |
| "grad_norm": 0.6270470023155212, |
| "learning_rate": 4.2082522551583867e-05, |
| "loss": 0.0556, |
| "mean_token_accuracy": 0.9871117854729677, |
| "num_tokens": 669204.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 6.223463687150838, |
| "grad_norm": 0.41528642177581787, |
| "learning_rate": 3.828059547754077e-05, |
| "loss": 0.0426, |
| "mean_token_accuracy": 0.9900245934724807, |
| "num_tokens": 693949.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 6.446927374301676, |
| "grad_norm": 0.3252829611301422, |
| "learning_rate": 3.4549150281252636e-05, |
| "loss": 0.0453, |
| "mean_token_accuracy": 0.9897840306162834, |
| "num_tokens": 718020.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 6.670391061452514, |
| "grad_norm": 0.2989633083343506, |
| "learning_rate": 3.091062831318825e-05, |
| "loss": 0.0457, |
| "mean_token_accuracy": 0.9903687924146652, |
| "num_tokens": 742902.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 6.893854748603352, |
| "grad_norm": 0.2880214750766754, |
| "learning_rate": 2.738691207258812e-05, |
| "loss": 0.0449, |
| "mean_token_accuracy": 0.9903770983219147, |
| "num_tokens": 768980.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 7.111731843575419, |
| "grad_norm": 0.25206810235977173, |
| "learning_rate": 2.399919360353923e-05, |
| "loss": 0.0438, |
| "mean_token_accuracy": 0.990071585545173, |
| "num_tokens": 792798.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 7.335195530726257, |
| "grad_norm": 0.3471860885620117, |
| "learning_rate": 2.076784704352835e-05, |
| "loss": 0.0396, |
| "mean_token_accuracy": 0.9913395941257477, |
| "num_tokens": 817561.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 7.558659217877095, |
| "grad_norm": 0.21744246780872345, |
| "learning_rate": 1.7712306090981896e-05, |
| "loss": 0.0395, |
| "mean_token_accuracy": 0.9915344551205635, |
| "num_tokens": 843470.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 7.782122905027933, |
| "grad_norm": 0.2922864556312561, |
| "learning_rate": 1.4850947128716913e-05, |
| "loss": 0.0387, |
| "mean_token_accuracy": 0.9912592649459839, |
| "num_tokens": 867744.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 8.0, |
| "grad_norm": 0.3212120831012726, |
| "learning_rate": 1.2200978706212607e-05, |
| "loss": 0.0401, |
| "mean_token_accuracy": 0.9908862557166662, |
| "num_tokens": 892272.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 8.223463687150838, |
| "grad_norm": 0.2715793550014496, |
| "learning_rate": 9.7783380453689e-06, |
| "loss": 0.0381, |
| "mean_token_accuracy": 0.9915784135460853, |
| "num_tokens": 917435.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 8.446927374301676, |
| "grad_norm": 0.26999297738075256, |
| "learning_rate": 7.597595192178702e-06, |
| "loss": 0.0382, |
| "mean_token_accuracy": 0.9912244379520416, |
| "num_tokens": 942005.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 8.670391061452515, |
| "grad_norm": 0.2515369653701782, |
| "learning_rate": 5.6718653907569475e-06, |
| "loss": 0.0381, |
| "mean_token_accuracy": 0.9913900807499886, |
| "num_tokens": 966911.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 8.893854748603353, |
| "grad_norm": 0.24382378160953522, |
| "learning_rate": 4.012730206719229e-06, |
| "loss": 0.0379, |
| "mean_token_accuracy": 0.9922321572899818, |
| "num_tokens": 992856.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 9.111731843575418, |
| "grad_norm": 0.2050442099571228, |
| "learning_rate": 2.63016787428354e-06, |
| "loss": 0.0353, |
| "mean_token_accuracy": 0.9915406214885223, |
| "num_tokens": 1016741.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 9.335195530726256, |
| "grad_norm": 0.3232128918170929, |
| "learning_rate": 1.5324932859955399e-06, |
| "loss": 0.0374, |
| "mean_token_accuracy": 0.9917459413409233, |
| "num_tokens": 1042654.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 9.558659217877095, |
| "grad_norm": 0.21526314318180084, |
| "learning_rate": 7.263079859864297e-07, |
| "loss": 0.0374, |
| "mean_token_accuracy": 0.9920469373464584, |
| "num_tokens": 1066827.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 9.782122905027933, |
| "grad_norm": 0.20906753838062286, |
| "learning_rate": 2.1646046750978254e-07, |
| "loss": 0.0378, |
| "mean_token_accuracy": 0.9913564190268517, |
| "num_tokens": 1091448.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.21541918814182281, |
| "learning_rate": 6.017013532627624e-09, |
| "loss": 0.0337, |
| "mean_token_accuracy": 0.9919140201348525, |
| "num_tokens": 1115340.0, |
| "step": 450 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 450, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.0661326932393984e+16, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|