{
  "best_metric": 0.15249475836753845,
  "best_model_checkpoint": "/Users/davidbirkenberger/Documents/rifel_models/final/checkpoint-378",
  "epoch": 9.0,
  "eval_steps": 500,
  "global_step": 378,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.23809523809523808,
      "grad_norm": 1.2485231161117554,
      "learning_rate": 5.061503329725875e-05,
      "loss": 0.4969,
      "step": 10
    },
    {
      "epoch": 0.47619047619047616,
      "grad_norm": 1.6543331146240234,
      "learning_rate": 4.979866179246426e-05,
      "loss": 0.4438,
      "step": 20
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 1.400197148323059,
      "learning_rate": 4.8982290287669756e-05,
      "loss": 0.378,
      "step": 30
    },
    {
      "epoch": 0.9523809523809523,
      "grad_norm": 1.311822533607483,
      "learning_rate": 4.8165918782875265e-05,
      "loss": 0.4147,
      "step": 40
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.3573644757270813,
      "eval_runtime": 1.6061,
      "eval_samples_per_second": 19.924,
      "eval_steps_per_second": 6.849,
      "step": 42
    },
    {
      "epoch": 1.1904761904761905,
      "grad_norm": 1.5031193494796753,
      "learning_rate": 4.734954727808077e-05,
      "loss": 0.337,
      "step": 50
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 1.6817210912704468,
      "learning_rate": 4.6533175773286276e-05,
      "loss": 0.3037,
      "step": 60
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 1.1621549129486084,
      "learning_rate": 4.571680426849177e-05,
      "loss": 0.3029,
      "step": 70
    },
    {
      "epoch": 1.9047619047619047,
      "grad_norm": 1.644331693649292,
      "learning_rate": 4.490043276369728e-05,
      "loss": 0.2911,
      "step": 80
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.271997332572937,
      "eval_runtime": 1.65,
      "eval_samples_per_second": 19.394,
      "eval_steps_per_second": 6.667,
      "step": 84
    },
    {
      "epoch": 2.142857142857143,
      "grad_norm": 1.0107003450393677,
      "learning_rate": 4.408406125890278e-05,
      "loss": 0.2386,
      "step": 90
    },
    {
      "epoch": 2.380952380952381,
      "grad_norm": 0.835885763168335,
      "learning_rate": 4.326768975410829e-05,
      "loss": 0.2218,
      "step": 100
    },
    {
      "epoch": 2.619047619047619,
      "grad_norm": 0.7079904079437256,
      "learning_rate": 4.245131824931379e-05,
      "loss": 0.2033,
      "step": 110
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 1.4346115589141846,
      "learning_rate": 4.16349467445193e-05,
      "loss": 0.2206,
      "step": 120
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.2123033106327057,
      "eval_runtime": 1.5676,
      "eval_samples_per_second": 20.414,
      "eval_steps_per_second": 7.017,
      "step": 126
    },
    {
      "epoch": 3.0952380952380953,
      "grad_norm": 1.273030400276184,
      "learning_rate": 4.08185752397248e-05,
      "loss": 0.1684,
      "step": 130
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 0.7772669792175293,
      "learning_rate": 4.000220373493031e-05,
      "loss": 0.155,
      "step": 140
    },
    {
      "epoch": 3.571428571428571,
      "grad_norm": 1.3734729290008545,
      "learning_rate": 3.9185832230135803e-05,
      "loss": 0.1655,
      "step": 150
    },
    {
      "epoch": 3.8095238095238093,
      "grad_norm": 1.454413652420044,
      "learning_rate": 3.836946072534131e-05,
      "loss": 0.1598,
      "step": 160
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.2093825340270996,
      "eval_runtime": 1.6719,
      "eval_samples_per_second": 19.14,
      "eval_steps_per_second": 6.579,
      "step": 168
    },
    {
      "epoch": 4.0476190476190474,
      "grad_norm": 1.0927952527999878,
      "learning_rate": 3.7553089220546815e-05,
      "loss": 0.1394,
      "step": 170
    },
    {
      "epoch": 4.285714285714286,
      "grad_norm": 1.0157573223114014,
      "learning_rate": 3.6736717715752324e-05,
      "loss": 0.1571,
      "step": 180
    },
    {
      "epoch": 4.523809523809524,
      "grad_norm": 0.43212223052978516,
      "learning_rate": 3.5920346210957826e-05,
      "loss": 0.1281,
      "step": 190
    },
    {
      "epoch": 4.761904761904762,
      "grad_norm": 0.6443906426429749,
      "learning_rate": 3.510397470616333e-05,
      "loss": 0.131,
      "step": 200
    },
    {
      "epoch": 5.0,
      "grad_norm": 1.5671747922897339,
      "learning_rate": 3.428760320136883e-05,
      "loss": 0.1466,
      "step": 210
    },
    {
      "epoch": 5.0,
      "eval_loss": 0.1823369264602661,
      "eval_runtime": 1.6446,
      "eval_samples_per_second": 19.458,
      "eval_steps_per_second": 6.689,
      "step": 210
    },
    {
      "epoch": 5.238095238095238,
      "grad_norm": 1.444640040397644,
      "learning_rate": 3.347123169657434e-05,
      "loss": 0.1252,
      "step": 220
    },
    {
      "epoch": 5.476190476190476,
      "grad_norm": 1.4842709302902222,
      "learning_rate": 3.265486019177984e-05,
      "loss": 0.1176,
      "step": 230
    },
    {
      "epoch": 5.714285714285714,
      "grad_norm": 1.338508129119873,
      "learning_rate": 3.1838488686985344e-05,
      "loss": 0.1013,
      "step": 240
    },
    {
      "epoch": 5.9523809523809526,
      "grad_norm": 0.5218590497970581,
      "learning_rate": 3.1022117182190846e-05,
      "loss": 0.0871,
      "step": 250
    },
    {
      "epoch": 6.0,
      "eval_loss": 0.16389165818691254,
      "eval_runtime": 1.7452,
      "eval_samples_per_second": 18.336,
      "eval_steps_per_second": 6.303,
      "step": 252
    },
    {
      "epoch": 6.190476190476191,
      "grad_norm": 1.4809467792510986,
      "learning_rate": 3.0205745677396355e-05,
      "loss": 0.1164,
      "step": 260
    },
    {
      "epoch": 6.428571428571429,
      "grad_norm": 0.4164857566356659,
      "learning_rate": 2.9389374172601854e-05,
      "loss": 0.074,
      "step": 270
    },
    {
      "epoch": 6.666666666666667,
      "grad_norm": 0.37571170926094055,
      "learning_rate": 2.8573002667807363e-05,
      "loss": 0.0815,
      "step": 280
    },
    {
      "epoch": 6.904761904761905,
      "grad_norm": 1.2223498821258545,
      "learning_rate": 2.7756631163012862e-05,
      "loss": 0.088,
      "step": 290
    },
    {
      "epoch": 7.0,
      "eval_loss": 0.15850776433944702,
      "eval_runtime": 1.5802,
      "eval_samples_per_second": 20.251,
      "eval_steps_per_second": 6.961,
      "step": 294
    },
    {
      "epoch": 7.142857142857143,
      "grad_norm": 1.3939220905303955,
      "learning_rate": 2.694025965821837e-05,
      "loss": 0.1117,
      "step": 300
    },
    {
      "epoch": 7.380952380952381,
      "grad_norm": 0.8872294425964355,
      "learning_rate": 2.612388815342387e-05,
      "loss": 0.0748,
      "step": 310
    },
    {
      "epoch": 7.619047619047619,
      "grad_norm": 0.7376847267150879,
      "learning_rate": 2.5307516648629376e-05,
      "loss": 0.0768,
      "step": 320
    },
    {
      "epoch": 7.857142857142857,
      "grad_norm": 1.4566947221755981,
      "learning_rate": 2.4491145143834878e-05,
      "loss": 0.0675,
      "step": 330
    },
    {
      "epoch": 8.0,
      "eval_loss": 0.16645725071430206,
      "eval_runtime": 1.6616,
      "eval_samples_per_second": 19.258,
      "eval_steps_per_second": 6.62,
      "step": 336
    },
    {
      "epoch": 8.095238095238095,
      "grad_norm": 0.39620542526245117,
      "learning_rate": 2.3674773639040384e-05,
      "loss": 0.0763,
      "step": 340
    },
    {
      "epoch": 8.333333333333334,
      "grad_norm": 0.24305318295955658,
      "learning_rate": 2.2858402134245886e-05,
      "loss": 0.052,
      "step": 350
    },
    {
      "epoch": 8.571428571428571,
      "grad_norm": 0.3784918487071991,
      "learning_rate": 2.204203062945139e-05,
      "loss": 0.0625,
      "step": 360
    },
    {
      "epoch": 8.80952380952381,
      "grad_norm": 0.25161460041999817,
      "learning_rate": 2.1225659124656894e-05,
      "loss": 0.0673,
      "step": 370
    },
    {
      "epoch": 9.0,
      "eval_loss": 0.15249475836753845,
      "eval_runtime": 1.6247,
      "eval_samples_per_second": 19.697,
      "eval_steps_per_second": 6.771,
      "step": 378
    }
  ],
  "logging_steps": 10,
  "max_steps": 630,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 15,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 2,
        "early_stopping_threshold": 0.001
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 280687197719796.0,
  "train_batch_size": 3,
  "trial_name": null,
  "trial_params": null
}