{ "best_metric": 0.15249475836753845, "best_model_checkpoint": "/Users/davidbirkenberger/Documents/rifel_models/final/checkpoint-378", "epoch": 9.0, "eval_steps": 500, "global_step": 378, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.23809523809523808, "grad_norm": 1.2485231161117554, "learning_rate": 5.061503329725875e-05, "loss": 0.4969, "step": 10 }, { "epoch": 0.47619047619047616, "grad_norm": 1.6543331146240234, "learning_rate": 4.979866179246426e-05, "loss": 0.4438, "step": 20 }, { "epoch": 0.7142857142857143, "grad_norm": 1.400197148323059, "learning_rate": 4.8982290287669756e-05, "loss": 0.378, "step": 30 }, { "epoch": 0.9523809523809523, "grad_norm": 1.311822533607483, "learning_rate": 4.8165918782875265e-05, "loss": 0.4147, "step": 40 }, { "epoch": 1.0, "eval_loss": 0.3573644757270813, "eval_runtime": 1.6061, "eval_samples_per_second": 19.924, "eval_steps_per_second": 6.849, "step": 42 }, { "epoch": 1.1904761904761905, "grad_norm": 1.5031193494796753, "learning_rate": 4.734954727808077e-05, "loss": 0.337, "step": 50 }, { "epoch": 1.4285714285714286, "grad_norm": 1.6817210912704468, "learning_rate": 4.6533175773286276e-05, "loss": 0.3037, "step": 60 }, { "epoch": 1.6666666666666665, "grad_norm": 1.1621549129486084, "learning_rate": 4.571680426849177e-05, "loss": 0.3029, "step": 70 }, { "epoch": 1.9047619047619047, "grad_norm": 1.644331693649292, "learning_rate": 4.490043276369728e-05, "loss": 0.2911, "step": 80 }, { "epoch": 2.0, "eval_loss": 0.271997332572937, "eval_runtime": 1.65, "eval_samples_per_second": 19.394, "eval_steps_per_second": 6.667, "step": 84 }, { "epoch": 2.142857142857143, "grad_norm": 1.0107003450393677, "learning_rate": 4.408406125890278e-05, "loss": 0.2386, "step": 90 }, { "epoch": 2.380952380952381, "grad_norm": 0.835885763168335, "learning_rate": 4.326768975410829e-05, "loss": 0.2218, "step": 100 }, { "epoch": 2.619047619047619, "grad_norm": 0.7079904079437256, "learning_rate": 4.245131824931379e-05, "loss": 0.2033, "step": 110 }, { "epoch": 2.857142857142857, "grad_norm": 1.4346115589141846, "learning_rate": 4.16349467445193e-05, "loss": 0.2206, "step": 120 }, { "epoch": 3.0, "eval_loss": 0.2123033106327057, "eval_runtime": 1.5676, "eval_samples_per_second": 20.414, "eval_steps_per_second": 7.017, "step": 126 }, { "epoch": 3.0952380952380953, "grad_norm": 1.273030400276184, "learning_rate": 4.08185752397248e-05, "loss": 0.1684, "step": 130 }, { "epoch": 3.3333333333333335, "grad_norm": 0.7772669792175293, "learning_rate": 4.000220373493031e-05, "loss": 0.155, "step": 140 }, { "epoch": 3.571428571428571, "grad_norm": 1.3734729290008545, "learning_rate": 3.9185832230135803e-05, "loss": 0.1655, "step": 150 }, { "epoch": 3.8095238095238093, "grad_norm": 1.454413652420044, "learning_rate": 3.836946072534131e-05, "loss": 0.1598, "step": 160 }, { "epoch": 4.0, "eval_loss": 0.2093825340270996, "eval_runtime": 1.6719, "eval_samples_per_second": 19.14, "eval_steps_per_second": 6.579, "step": 168 }, { "epoch": 4.0476190476190474, "grad_norm": 1.0927952527999878, "learning_rate": 3.7553089220546815e-05, "loss": 0.1394, "step": 170 }, { "epoch": 4.285714285714286, "grad_norm": 1.0157573223114014, "learning_rate": 3.6736717715752324e-05, "loss": 0.1571, "step": 180 }, { "epoch": 4.523809523809524, "grad_norm": 0.43212223052978516, "learning_rate": 3.5920346210957826e-05, "loss": 0.1281, "step": 190 }, { "epoch": 4.761904761904762, "grad_norm": 0.6443906426429749, "learning_rate": 3.510397470616333e-05, "loss": 0.131, "step": 200 }, { "epoch": 5.0, "grad_norm": 1.5671747922897339, "learning_rate": 3.428760320136883e-05, "loss": 0.1466, "step": 210 }, { "epoch": 5.0, "eval_loss": 0.1823369264602661, "eval_runtime": 1.6446, "eval_samples_per_second": 19.458, "eval_steps_per_second": 6.689, "step": 210 }, { "epoch": 5.238095238095238, "grad_norm": 1.444640040397644, "learning_rate": 3.347123169657434e-05, "loss": 0.1252, "step": 220 }, { "epoch": 5.476190476190476, "grad_norm": 1.4842709302902222, "learning_rate": 3.265486019177984e-05, "loss": 0.1176, "step": 230 }, { "epoch": 5.714285714285714, "grad_norm": 1.338508129119873, "learning_rate": 3.1838488686985344e-05, "loss": 0.1013, "step": 240 }, { "epoch": 5.9523809523809526, "grad_norm": 0.5218590497970581, "learning_rate": 3.1022117182190846e-05, "loss": 0.0871, "step": 250 }, { "epoch": 6.0, "eval_loss": 0.16389165818691254, "eval_runtime": 1.7452, "eval_samples_per_second": 18.336, "eval_steps_per_second": 6.303, "step": 252 }, { "epoch": 6.190476190476191, "grad_norm": 1.4809467792510986, "learning_rate": 3.0205745677396355e-05, "loss": 0.1164, "step": 260 }, { "epoch": 6.428571428571429, "grad_norm": 0.4164857566356659, "learning_rate": 2.9389374172601854e-05, "loss": 0.074, "step": 270 }, { "epoch": 6.666666666666667, "grad_norm": 0.37571170926094055, "learning_rate": 2.8573002667807363e-05, "loss": 0.0815, "step": 280 }, { "epoch": 6.904761904761905, "grad_norm": 1.2223498821258545, "learning_rate": 2.7756631163012862e-05, "loss": 0.088, "step": 290 }, { "epoch": 7.0, "eval_loss": 0.15850776433944702, "eval_runtime": 1.5802, "eval_samples_per_second": 20.251, "eval_steps_per_second": 6.961, "step": 294 }, { "epoch": 7.142857142857143, "grad_norm": 1.3939220905303955, "learning_rate": 2.694025965821837e-05, "loss": 0.1117, "step": 300 }, { "epoch": 7.380952380952381, "grad_norm": 0.8872294425964355, "learning_rate": 2.612388815342387e-05, "loss": 0.0748, "step": 310 }, { "epoch": 7.619047619047619, "grad_norm": 0.7376847267150879, "learning_rate": 2.5307516648629376e-05, "loss": 0.0768, "step": 320 }, { "epoch": 7.857142857142857, "grad_norm": 1.4566947221755981, "learning_rate": 2.4491145143834878e-05, "loss": 0.0675, "step": 330 }, { "epoch": 8.0, "eval_loss": 0.16645725071430206, "eval_runtime": 1.6616, "eval_samples_per_second": 19.258, "eval_steps_per_second": 6.62, "step": 336 }, { "epoch": 8.095238095238095, "grad_norm": 0.39620542526245117, "learning_rate": 2.3674773639040384e-05, "loss": 0.0763, "step": 340 }, { "epoch": 8.333333333333334, "grad_norm": 0.24305318295955658, "learning_rate": 2.2858402134245886e-05, "loss": 0.052, "step": 350 }, { "epoch": 8.571428571428571, "grad_norm": 0.3784918487071991, "learning_rate": 2.204203062945139e-05, "loss": 0.0625, "step": 360 }, { "epoch": 8.80952380952381, "grad_norm": 0.25161460041999817, "learning_rate": 2.1225659124656894e-05, "loss": 0.0673, "step": 370 }, { "epoch": 9.0, "eval_loss": 0.15249475836753845, "eval_runtime": 1.6247, "eval_samples_per_second": 19.697, "eval_steps_per_second": 6.771, "step": 378 } ], "logging_steps": 10, "max_steps": 630, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 2, "early_stopping_threshold": 0.001 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 280687197719796.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }