| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.49797696856520385, | |
| "eval_steps": 10, | |
| "global_step": 400, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0012449424214130097, | |
|       "grad_norm": null, | |
| "learning_rate": 0.0, | |
| "loss": 2.7527, | |
| "mean_token_accuracy": 0.6098426431417465, | |
| "num_tokens": 4577.0, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.012449424214130096, | |
| "grad_norm": 16.535795211791992, | |
| "learning_rate": 9.876543209876543e-06, | |
| "loss": 2.4622, | |
| "mean_token_accuracy": 0.6381922413905462, | |
| "num_tokens": 40313.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.012449424214130096, | |
| "eval_loss": 2.009472608566284, | |
| "eval_mean_token_accuracy": 0.652686527967453, | |
| "eval_num_tokens": 40313.0, | |
| "eval_runtime": 22.1397, | |
| "eval_samples_per_second": 2.258, | |
| "eval_steps_per_second": 2.258, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.024898848428260192, | |
| "grad_norm": 6.280624866485596, | |
| "learning_rate": 2.2222222222222223e-05, | |
| "loss": 1.6438, | |
| "mean_token_accuracy": 0.6907053753733635, | |
| "num_tokens": 82255.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.024898848428260192, | |
| "eval_loss": 1.292786717414856, | |
| "eval_mean_token_accuracy": 0.7414309787750244, | |
| "eval_num_tokens": 82255.0, | |
| "eval_runtime": 22.3447, | |
| "eval_samples_per_second": 2.238, | |
| "eval_steps_per_second": 2.238, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.03734827264239029, | |
| "grad_norm": 3.875969648361206, | |
| "learning_rate": 3.45679012345679e-05, | |
| "loss": 1.0252, | |
| "mean_token_accuracy": 0.7984871238470077, | |
| "num_tokens": 120722.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.03734827264239029, | |
| "eval_loss": 0.8175942897796631, | |
| "eval_mean_token_accuracy": 0.8347543370723725, | |
| "eval_num_tokens": 120722.0, | |
| "eval_runtime": 22.4004, | |
| "eval_samples_per_second": 2.232, | |
| "eval_steps_per_second": 2.232, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.049797696856520385, | |
| "grad_norm": 2.5878844261169434, | |
| "learning_rate": 4.691358024691358e-05, | |
| "loss": 0.7542, | |
| "mean_token_accuracy": 0.8444898426532745, | |
| "num_tokens": 161994.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.049797696856520385, | |
| "eval_loss": 0.6629099249839783, | |
| "eval_mean_token_accuracy": 0.8559932851791382, | |
| "eval_num_tokens": 161994.0, | |
| "eval_runtime": 22.2776, | |
| "eval_samples_per_second": 2.244, | |
| "eval_steps_per_second": 2.244, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.06224712107065048, | |
| "grad_norm": 1.784697413444519, | |
| "learning_rate": 5.925925925925926e-05, | |
| "loss": 0.6373, | |
| "mean_token_accuracy": 0.8567869439721107, | |
| "num_tokens": 202956.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.06224712107065048, | |
| "eval_loss": 0.5834522843360901, | |
| "eval_mean_token_accuracy": 0.8706429171562194, | |
| "eval_num_tokens": 202956.0, | |
| "eval_runtime": 22.3133, | |
| "eval_samples_per_second": 2.241, | |
| "eval_steps_per_second": 2.241, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.07469654528478058, | |
| "grad_norm": 2.509762763977051, | |
| "learning_rate": 7.160493827160494e-05, | |
| "loss": 0.5859, | |
| "mean_token_accuracy": 0.8670632347464562, | |
| "num_tokens": 245518.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.07469654528478058, | |
| "eval_loss": 0.5327284932136536, | |
| "eval_mean_token_accuracy": 0.8751986050605773, | |
| "eval_num_tokens": 245518.0, | |
| "eval_runtime": 22.29, | |
| "eval_samples_per_second": 2.243, | |
| "eval_steps_per_second": 2.243, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.08714596949891068, | |
| "grad_norm": 1.8014663457870483, | |
| "learning_rate": 8.395061728395062e-05, | |
| "loss": 0.5217, | |
| "mean_token_accuracy": 0.8772685110569001, | |
| "num_tokens": 286841.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.08714596949891068, | |
| "eval_loss": 0.4707345962524414, | |
| "eval_mean_token_accuracy": 0.8786909699440002, | |
| "eval_num_tokens": 286841.0, | |
| "eval_runtime": 22.2481, | |
| "eval_samples_per_second": 2.247, | |
| "eval_steps_per_second": 2.247, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.09959539371304077, | |
| "grad_norm": 1.7159992456436157, | |
| "learning_rate": 9.62962962962963e-05, | |
| "loss": 0.4529, | |
| "mean_token_accuracy": 0.8761002823710442, | |
| "num_tokens": 330576.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.09959539371304077, | |
| "eval_loss": 0.38634419441223145, | |
| "eval_mean_token_accuracy": 0.8834530138969421, | |
| "eval_num_tokens": 330576.0, | |
| "eval_runtime": 22.3312, | |
| "eval_samples_per_second": 2.239, | |
| "eval_steps_per_second": 2.239, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.11204481792717087, | |
| "grad_norm": 2.0236318111419678, | |
| "learning_rate": 9.997687265620273e-05, | |
| "loss": 0.3827, | |
| "mean_token_accuracy": 0.8846412658691406, | |
| "num_tokens": 371225.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.11204481792717087, | |
| "eval_loss": 0.37090864777565, | |
| "eval_mean_token_accuracy": 0.8859660923480988, | |
| "eval_num_tokens": 371225.0, | |
| "eval_runtime": 22.3399, | |
| "eval_samples_per_second": 2.238, | |
| "eval_steps_per_second": 2.238, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.12449424214130096, | |
| "grad_norm": 2.2110610008239746, | |
| "learning_rate": 9.98636473719804e-05, | |
| "loss": 0.3966, | |
| "mean_token_accuracy": 0.8800512284040451, | |
| "num_tokens": 415356.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.12449424214130096, | |
| "eval_loss": 0.3614565134048462, | |
| "eval_mean_token_accuracy": 0.8883592307567596, | |
| "eval_num_tokens": 415356.0, | |
| "eval_runtime": 22.2481, | |
| "eval_samples_per_second": 2.247, | |
| "eval_steps_per_second": 2.247, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.13694366635543107, | |
| "grad_norm": 2.259178876876831, | |
| "learning_rate": 9.965628974662144e-05, | |
| "loss": 0.3822, | |
| "mean_token_accuracy": 0.8851513683795929, | |
| "num_tokens": 453785.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.13694366635543107, | |
| "eval_loss": 0.35502663254737854, | |
| "eval_mean_token_accuracy": 0.8901030969619751, | |
| "eval_num_tokens": 453785.0, | |
| "eval_runtime": 22.0185, | |
| "eval_samples_per_second": 2.271, | |
| "eval_steps_per_second": 2.271, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.14939309056956115, | |
| "grad_norm": 1.4706709384918213, | |
| "learning_rate": 9.935519122880152e-05, | |
| "loss": 0.3597, | |
| "mean_token_accuracy": 0.8902897864580155, | |
| "num_tokens": 493793.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.14939309056956115, | |
| "eval_loss": 0.34856370091438293, | |
| "eval_mean_token_accuracy": 0.8903208661079407, | |
| "eval_num_tokens": 493793.0, | |
| "eval_runtime": 22.339, | |
| "eval_samples_per_second": 2.238, | |
| "eval_steps_per_second": 2.238, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.16184251478369124, | |
| "grad_norm": 1.2550042867660522, | |
| "learning_rate": 9.896092023077865e-05, | |
| "loss": 0.3342, | |
| "mean_token_accuracy": 0.8952602833509445, | |
| "num_tokens": 533894.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.16184251478369124, | |
| "eval_loss": 0.34542137384414673, | |
| "eval_mean_token_accuracy": 0.891883670091629, | |
| "eval_num_tokens": 533894.0, | |
| "eval_runtime": 22.258, | |
| "eval_samples_per_second": 2.246, | |
| "eval_steps_per_second": 2.246, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.17429193899782136, | |
| "grad_norm": 1.4467822313308716, | |
| "learning_rate": 9.847422105534739e-05, | |
| "loss": 0.364, | |
| "mean_token_accuracy": 0.8903828382492065, | |
| "num_tokens": 574730.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.17429193899782136, | |
| "eval_loss": 0.3417295515537262, | |
| "eval_mean_token_accuracy": 0.8922731137275696, | |
| "eval_num_tokens": 574730.0, | |
| "eval_runtime": 22.3005, | |
| "eval_samples_per_second": 2.242, | |
| "eval_steps_per_second": 2.242, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.18674136321195145, | |
| "grad_norm": 1.324472188949585, | |
| "learning_rate": 9.78960124907478e-05, | |
| "loss": 0.3447, | |
| "mean_token_accuracy": 0.893255865573883, | |
| "num_tokens": 616097.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.18674136321195145, | |
| "eval_loss": 0.3354889154434204, | |
| "eval_mean_token_accuracy": 0.8933182156085968, | |
| "eval_num_tokens": 616097.0, | |
| "eval_runtime": 22.248, | |
| "eval_samples_per_second": 2.247, | |
| "eval_steps_per_second": 2.247, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.19919078742608154, | |
| "grad_norm": 1.350631833076477, | |
| "learning_rate": 9.722738607618171e-05, | |
| "loss": 0.3799, | |
| "mean_token_accuracy": 0.8845137342810631, | |
| "num_tokens": 658786.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.19919078742608154, | |
| "eval_loss": 0.3358314633369446, | |
| "eval_mean_token_accuracy": 0.8949730741977692, | |
| "eval_num_tokens": 658786.0, | |
| "eval_runtime": 22.2937, | |
| "eval_samples_per_second": 2.243, | |
| "eval_steps_per_second": 2.243, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.21164021164021163, | |
| "grad_norm": 1.375606656074524, | |
| "learning_rate": 9.646960404121042e-05, | |
| "loss": 0.3545, | |
| "mean_token_accuracy": 0.894567859172821, | |
| "num_tokens": 699144.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.21164021164021163, | |
| "eval_loss": 0.33176079392433167, | |
| "eval_mean_token_accuracy": 0.8978442597389221, | |
| "eval_num_tokens": 699144.0, | |
| "eval_runtime": 22.321, | |
| "eval_samples_per_second": 2.24, | |
| "eval_steps_per_second": 2.24, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.22408963585434175, | |
| "grad_norm": 1.2101880311965942, | |
| "learning_rate": 9.562409692292424e-05, | |
| "loss": 0.3595, | |
| "mean_token_accuracy": 0.8943482771515846, | |
| "num_tokens": 739037.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.22408963585434175, | |
| "eval_loss": 0.3300569951534271, | |
| "eval_mean_token_accuracy": 0.8991416382789612, | |
| "eval_num_tokens": 739037.0, | |
| "eval_runtime": 22.3075, | |
| "eval_samples_per_second": 2.241, | |
| "eval_steps_per_second": 2.241, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.23653906006847183, | |
| "grad_norm": 1.403489589691162, | |
| "learning_rate": 9.469246086538175e-05, | |
| "loss": 0.3525, | |
| "mean_token_accuracy": 0.892733770608902, | |
| "num_tokens": 780872.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.23653906006847183, | |
| "eval_loss": 0.32495585083961487, | |
| "eval_mean_token_accuracy": 0.9010837972164154, | |
| "eval_num_tokens": 780872.0, | |
| "eval_runtime": 22.2334, | |
| "eval_samples_per_second": 2.249, | |
| "eval_steps_per_second": 2.249, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.24898848428260192, | |
| "grad_norm": 2.186126470565796, | |
| "learning_rate": 9.367645460641716e-05, | |
| "loss": 0.333, | |
| "mean_token_accuracy": 0.8986451297998428, | |
| "num_tokens": 821731.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.24898848428260192, | |
| "eval_loss": 0.32020682096481323, | |
| "eval_mean_token_accuracy": 0.901833120584488, | |
| "eval_num_tokens": 821731.0, | |
| "eval_runtime": 22.2962, | |
| "eval_samples_per_second": 2.243, | |
| "eval_steps_per_second": 2.243, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.26143790849673204, | |
| "grad_norm": 1.3674637079238892, | |
| "learning_rate": 9.257799615750385e-05, | |
| "loss": 0.3311, | |
| "mean_token_accuracy": 0.8988270297646522, | |
| "num_tokens": 863318.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.26143790849673204, | |
| "eval_loss": 0.312335729598999, | |
| "eval_mean_token_accuracy": 0.9045632266998291, | |
| "eval_num_tokens": 863318.0, | |
| "eval_runtime": 21.8783, | |
| "eval_samples_per_second": 2.285, | |
| "eval_steps_per_second": 2.285, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.27388733271086213, | |
| "grad_norm": 1.2784392833709717, | |
| "learning_rate": 9.139915918294213e-05, | |
| "loss": 0.3258, | |
| "mean_token_accuracy": 0.9025416001677513, | |
| "num_tokens": 906386.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.27388733271086213, | |
| "eval_loss": 0.30771327018737793, | |
| "eval_mean_token_accuracy": 0.9066664147377014, | |
| "eval_num_tokens": 906386.0, | |
| "eval_runtime": 22.2842, | |
| "eval_samples_per_second": 2.244, | |
| "eval_steps_per_second": 2.244, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.2863367569249922, | |
| "grad_norm": 1.1845546960830688, | |
| "learning_rate": 9.014216908520618e-05, | |
| "loss": 0.3005, | |
| "mean_token_accuracy": 0.9102896004915237, | |
| "num_tokens": 946683.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.2863367569249922, | |
| "eval_loss": 0.306180477142334, | |
| "eval_mean_token_accuracy": 0.9058228933811188, | |
| "eval_num_tokens": 946683.0, | |
| "eval_runtime": 22.2708, | |
| "eval_samples_per_second": 2.245, | |
| "eval_steps_per_second": 2.245, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.2987861811391223, | |
| "grad_norm": 1.3700170516967773, | |
| "learning_rate": 8.88093988038406e-05, | |
| "loss": 0.3457, | |
| "mean_token_accuracy": 0.8991452261805535, | |
| "num_tokens": 988276.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.2987861811391223, | |
| "eval_loss": 0.30229029059410095, | |
| "eval_mean_token_accuracy": 0.9061234760284423, | |
| "eval_num_tokens": 988276.0, | |
| "eval_runtime": 22.3208, | |
| "eval_samples_per_second": 2.24, | |
| "eval_steps_per_second": 2.24, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.3112356053532524, | |
| "grad_norm": 1.7757388353347778, | |
| "learning_rate": 8.740336433583704e-05, | |
| "loss": 0.2971, | |
| "mean_token_accuracy": 0.9136147618293762, | |
| "num_tokens": 1027142.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.3112356053532524, | |
| "eval_loss": 0.3006957471370697, | |
| "eval_mean_token_accuracy": 0.9066220009326935, | |
| "eval_num_tokens": 1027142.0, | |
| "eval_runtime": 22.3249, | |
| "eval_samples_per_second": 2.24, | |
| "eval_steps_per_second": 2.24, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.3236850295673825, | |
| "grad_norm": 1.1962240934371948, | |
| "learning_rate": 8.592671998594794e-05, | |
| "loss": 0.3075, | |
| "mean_token_accuracy": 0.9059017911553383, | |
| "num_tokens": 1066879.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.3236850295673825, | |
| "eval_loss": 0.2966723144054413, | |
| "eval_mean_token_accuracy": 0.908552759885788, | |
| "eval_num_tokens": 1066879.0, | |
| "eval_runtime": 22.2952, | |
| "eval_samples_per_second": 2.243, | |
| "eval_steps_per_second": 2.243, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.33613445378151263, | |
| "grad_norm": 1.075523018836975, | |
| "learning_rate": 8.438225335590333e-05, | |
| "loss": 0.3203, | |
| "mean_token_accuracy": 0.9030415132641793, | |
| "num_tokens": 1107335.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.33613445378151263, | |
| "eval_loss": 0.2949363589286804, | |
| "eval_mean_token_accuracy": 0.9083844482898712, | |
| "eval_num_tokens": 1107335.0, | |
| "eval_runtime": 22.2589, | |
| "eval_samples_per_second": 2.246, | |
| "eval_steps_per_second": 2.246, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.3485838779956427, | |
| "grad_norm": 1.1951340436935425, | |
| "learning_rate": 8.27728800819905e-05, | |
| "loss": 0.3404, | |
| "mean_token_accuracy": 0.8968317583203316, | |
| "num_tokens": 1148632.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.3485838779956427, | |
| "eval_loss": 0.2915548086166382, | |
| "eval_mean_token_accuracy": 0.9077938544750214, | |
| "eval_num_tokens": 1148632.0, | |
| "eval_runtime": 22.2818, | |
| "eval_samples_per_second": 2.244, | |
| "eval_steps_per_second": 2.244, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.3610333022097728, | |
| "grad_norm": 0.9940551519393921, | |
| "learning_rate": 8.11016383309305e-05, | |
| "loss": 0.2957, | |
| "mean_token_accuracy": 0.9087760657072067, | |
| "num_tokens": 1188823.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.3610333022097728, | |
| "eval_loss": 0.2891790270805359, | |
| "eval_mean_token_accuracy": 0.9090994548797607, | |
| "eval_num_tokens": 1188823.0, | |
| "eval_runtime": 22.27, | |
| "eval_samples_per_second": 2.245, | |
| "eval_steps_per_second": 2.245, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.3734827264239029, | |
| "grad_norm": 1.0126793384552002, | |
| "learning_rate": 7.937168306444242e-05, | |
| "loss": 0.3636, | |
| "mean_token_accuracy": 0.89151521474123, | |
| "num_tokens": 1230550.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.3734827264239029, | |
| "eval_loss": 0.2865450978279114, | |
| "eval_mean_token_accuracy": 0.9095333778858185, | |
| "eval_num_tokens": 1230550.0, | |
| "eval_runtime": 22.2643, | |
| "eval_samples_per_second": 2.246, | |
| "eval_steps_per_second": 2.246, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.385932150638033, | |
| "grad_norm": 1.1640918254852295, | |
| "learning_rate": 7.758628008332261e-05, | |
| "loss": 0.298, | |
| "mean_token_accuracy": 0.9120915725827217, | |
| "num_tokens": 1269322.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.385932150638033, | |
| "eval_loss": 0.28298419713974, | |
| "eval_mean_token_accuracy": 0.9108501935005188, | |
| "eval_num_tokens": 1269322.0, | |
| "eval_runtime": 22.2371, | |
| "eval_samples_per_second": 2.248, | |
| "eval_steps_per_second": 2.248, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.3983815748521631, | |
| "grad_norm": 1.2163608074188232, | |
| "learning_rate": 7.574879986228245e-05, | |
| "loss": 0.304, | |
| "mean_token_accuracy": 0.9081651225686074, | |
| "num_tokens": 1311242.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.3983815748521631, | |
| "eval_loss": 0.2835235297679901, | |
| "eval_mean_token_accuracy": 0.9103049302101135, | |
| "eval_num_tokens": 1311242.0, | |
| "eval_runtime": 22.3204, | |
| "eval_samples_per_second": 2.24, | |
| "eval_steps_per_second": 2.24, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.41083099906629317, | |
| "grad_norm": 2.2140297889709473, | |
| "learning_rate": 7.38627111871833e-05, | |
| "loss": 0.3334, | |
| "mean_token_accuracy": 0.9038107171654701, | |
| "num_tokens": 1351414.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.41083099906629317, | |
| "eval_loss": 0.2817688286304474, | |
| "eval_mean_token_accuracy": 0.9108499121665955, | |
| "eval_num_tokens": 1351414.0, | |
| "eval_runtime": 22.3432, | |
| "eval_samples_per_second": 2.238, | |
| "eval_steps_per_second": 2.238, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.42328042328042326, | |
| "grad_norm": 1.1960259675979614, | |
| "learning_rate": 7.193157460668005e-05, | |
| "loss": 0.2933, | |
| "mean_token_accuracy": 0.9120548516511917, | |
| "num_tokens": 1392404.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.42328042328042326, | |
| "eval_loss": 0.2820216417312622, | |
| "eval_mean_token_accuracy": 0.9112770104408264, | |
| "eval_num_tokens": 1392404.0, | |
| "eval_runtime": 22.2719, | |
| "eval_samples_per_second": 2.245, | |
| "eval_steps_per_second": 2.245, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.4357298474945534, | |
| "grad_norm": 0.950002908706665, | |
| "learning_rate": 6.99590357106354e-05, | |
| "loss": 0.3248, | |
| "mean_token_accuracy": 0.9039641574025155, | |
| "num_tokens": 1433105.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.4357298474945534, | |
| "eval_loss": 0.2805749773979187, | |
| "eval_mean_token_accuracy": 0.9103002834320069, | |
| "eval_num_tokens": 1433105.0, | |
| "eval_runtime": 22.3037, | |
| "eval_samples_per_second": 2.242, | |
| "eval_steps_per_second": 2.242, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.4481792717086835, | |
| "grad_norm": 0.9667631983757019, | |
| "learning_rate": 6.79488182479938e-05, | |
| "loss": 0.2963, | |
| "mean_token_accuracy": 0.9092229396104813, | |
| "num_tokens": 1474722.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.4481792717086835, | |
| "eval_loss": 0.2767864167690277, | |
| "eval_mean_token_accuracy": 0.911848417520523, | |
| "eval_num_tokens": 1474722.0, | |
| "eval_runtime": 22.3405, | |
| "eval_samples_per_second": 2.238, | |
| "eval_steps_per_second": 2.238, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.4606286959228136, | |
| "grad_norm": 1.0764143466949463, | |
| "learning_rate": 6.590471709710703e-05, | |
| "loss": 0.3335, | |
| "mean_token_accuracy": 0.8984953165054321, | |
| "num_tokens": 1516420.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.4606286959228136, | |
| "eval_loss": 0.2774994671344757, | |
| "eval_mean_token_accuracy": 0.911427743434906, | |
| "eval_num_tokens": 1516420.0, | |
| "eval_runtime": 22.3317, | |
| "eval_samples_per_second": 2.239, | |
| "eval_steps_per_second": 2.239, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.47307812013694367, | |
| "grad_norm": 1.0027621984481812, | |
| "learning_rate": 6.383059110178204e-05, | |
| "loss": 0.3434, | |
| "mean_token_accuracy": 0.9006192669272423, | |
| "num_tokens": 1557572.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.47307812013694367, | |
| "eval_loss": 0.2765714228153229, | |
| "eval_mean_token_accuracy": 0.9118164765834809, | |
| "eval_num_tokens": 1557572.0, | |
| "eval_runtime": 22.3501, | |
| "eval_samples_per_second": 2.237, | |
| "eval_steps_per_second": 2.237, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.48552754435107376, | |
| "grad_norm": 1.2028086185455322, | |
| "learning_rate": 6.173035578657512e-05, | |
| "loss": 0.3116, | |
| "mean_token_accuracy": 0.9067506313323974, | |
| "num_tokens": 1600657.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.48552754435107376, | |
| "eval_loss": 0.2748585045337677, | |
| "eval_mean_token_accuracy": 0.9116013216972351, | |
| "eval_num_tokens": 1600657.0, | |
| "eval_runtime": 22.3017, | |
| "eval_samples_per_second": 2.242, | |
| "eval_steps_per_second": 2.242, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.49797696856520385, | |
| "grad_norm": 0.9993801116943359, | |
| "learning_rate": 5.9607975965084526e-05, | |
| "loss": 0.2804, | |
| "mean_token_accuracy": 0.9129077598452568, | |
| "num_tokens": 1643316.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.49797696856520385, | |
| "eval_loss": 0.27263733744621277, | |
| "eval_mean_token_accuracy": 0.9128299736976624, | |
| "eval_num_tokens": 1643316.0, | |
| "eval_runtime": 22.3273, | |
| "eval_samples_per_second": 2.239, | |
| "eval_steps_per_second": 2.239, | |
| "step": 400 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 804, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.064033082156032e+16, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |