| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 4.0, | |
| "eval_steps": 100, | |
| "global_step": 3416, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.117096018735363, | |
| "eval_accuracy": 0.16272885212172436, | |
| "eval_loss": 6.373325347900391, | |
| "eval_runtime": 1.492, | |
| "eval_samples_per_second": 153.483, | |
| "eval_steps_per_second": 1.34, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.234192037470726, | |
| "grad_norm": 0.5273725986480713, | |
| "learning_rate": 0.00029824050644124145, | |
| "loss": 6.8543, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.234192037470726, | |
| "eval_accuracy": 0.19693341358364602, | |
| "eval_loss": 5.696830749511719, | |
| "eval_runtime": 1.1659, | |
| "eval_samples_per_second": 196.414, | |
| "eval_steps_per_second": 1.715, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.351288056206089, | |
| "eval_accuracy": 0.21933947162852643, | |
| "eval_loss": 5.292416095733643, | |
| "eval_runtime": 1.1779, | |
| "eval_samples_per_second": 194.406, | |
| "eval_steps_per_second": 1.698, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.468384074941452, | |
| "grad_norm": 0.7039262056350708, | |
| "learning_rate": 0.00029145544445670097, | |
| "loss": 5.3976, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.468384074941452, | |
| "eval_accuracy": 0.2335839021287676, | |
| "eval_loss": 5.054598808288574, | |
| "eval_runtime": 1.1808, | |
| "eval_samples_per_second": 193.929, | |
| "eval_steps_per_second": 1.694, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.585480093676815, | |
| "eval_accuracy": 0.24265474864150735, | |
| "eval_loss": 4.891258716583252, | |
| "eval_runtime": 1.1832, | |
| "eval_samples_per_second": 193.538, | |
| "eval_steps_per_second": 1.69, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.702576112412178, | |
| "grad_norm": 0.7881871461868286, | |
| "learning_rate": 0.0002797991505064349, | |
| "loss": 4.9739, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.702576112412178, | |
| "eval_accuracy": 0.2506498994736775, | |
| "eval_loss": 4.759438514709473, | |
| "eval_runtime": 1.1726, | |
| "eval_samples_per_second": 195.299, | |
| "eval_steps_per_second": 1.706, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.819672131147541, | |
| "eval_accuracy": 0.2589993468990511, | |
| "eval_loss": 4.651967525482178, | |
| "eval_runtime": 1.1752, | |
| "eval_samples_per_second": 194.864, | |
| "eval_steps_per_second": 1.702, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.936768149882904, | |
| "grad_norm": 0.5959051251411438, | |
| "learning_rate": 0.00026367302668500736, | |
| "loss": 4.7436, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.936768149882904, | |
| "eval_accuracy": 0.2653638796757546, | |
| "eval_loss": 4.566330432891846, | |
| "eval_runtime": 1.1821, | |
| "eval_samples_per_second": 193.727, | |
| "eval_steps_per_second": 1.692, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.053864168618267, | |
| "eval_accuracy": 0.270379524218093, | |
| "eval_loss": 4.497960567474365, | |
| "eval_runtime": 1.1842, | |
| "eval_samples_per_second": 193.385, | |
| "eval_steps_per_second": 1.689, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.17096018735363, | |
| "grad_norm": 0.6118926405906677, | |
| "learning_rate": 0.00024363240043240403, | |
| "loss": 4.5595, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.17096018735363, | |
| "eval_accuracy": 0.2755872572748189, | |
| "eval_loss": 4.44118070602417, | |
| "eval_runtime": 1.1779, | |
| "eval_samples_per_second": 194.409, | |
| "eval_steps_per_second": 1.698, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.288056206088993, | |
| "eval_accuracy": 0.2790832682366701, | |
| "eval_loss": 4.388593673706055, | |
| "eval_runtime": 1.1719, | |
| "eval_samples_per_second": 195.408, | |
| "eval_steps_per_second": 1.707, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.405152224824356, | |
| "grad_norm": 0.6060830354690552, | |
| "learning_rate": 0.00022036740099454973, | |
| "loss": 4.44, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.405152224824356, | |
| "eval_accuracy": 0.28338605095894853, | |
| "eval_loss": 4.339376449584961, | |
| "eval_runtime": 1.1734, | |
| "eval_samples_per_second": 195.157, | |
| "eval_steps_per_second": 1.704, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.5222482435597189, | |
| "eval_accuracy": 0.28774005728506363, | |
| "eval_loss": 4.292956352233887, | |
| "eval_runtime": 1.1856, | |
| "eval_samples_per_second": 193.144, | |
| "eval_steps_per_second": 1.687, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.639344262295082, | |
| "grad_norm": 0.5661224126815796, | |
| "learning_rate": 0.0001946791937799555, | |
| "loss": 4.3482, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.639344262295082, | |
| "eval_accuracy": 0.2911549642075068, | |
| "eval_loss": 4.25685977935791, | |
| "eval_runtime": 1.175, | |
| "eval_samples_per_second": 194.899, | |
| "eval_steps_per_second": 1.702, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.756440281030445, | |
| "eval_accuracy": 0.29346429501380905, | |
| "eval_loss": 4.224252700805664, | |
| "eval_runtime": 1.1755, | |
| "eval_samples_per_second": 194.807, | |
| "eval_steps_per_second": 1.701, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.8735362997658078, | |
| "grad_norm": 0.5346641540527344, | |
| "learning_rate": 0.00016745239101833442, | |
| "loss": 4.2765, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.8735362997658078, | |
| "eval_accuracy": 0.29705421591602743, | |
| "eval_loss": 4.186337947845459, | |
| "eval_runtime": 1.1767, | |
| "eval_samples_per_second": 194.606, | |
| "eval_steps_per_second": 1.7, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.990632318501171, | |
| "eval_accuracy": 0.3002471538885118, | |
| "eval_loss": 4.158440113067627, | |
| "eval_runtime": 1.1742, | |
| "eval_samples_per_second": 195.024, | |
| "eval_steps_per_second": 1.703, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.107728337236534, | |
| "grad_norm": 0.5907180309295654, | |
| "learning_rate": 0.0001396245888018435, | |
| "loss": 4.1869, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.107728337236534, | |
| "eval_accuracy": 0.3025009924573243, | |
| "eval_loss": 4.138100624084473, | |
| "eval_runtime": 1.1874, | |
| "eval_samples_per_second": 192.866, | |
| "eval_steps_per_second": 1.684, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.2248243559718968, | |
| "eval_accuracy": 0.30516461985683, | |
| "eval_loss": 4.114769458770752, | |
| "eval_runtime": 1.1787, | |
| "eval_samples_per_second": 194.278, | |
| "eval_steps_per_second": 1.697, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2.34192037470726, | |
| "grad_norm": 0.5956910848617554, | |
| "learning_rate": 0.00011215407954696412, | |
| "loss": 4.1154, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.34192037470726, | |
| "eval_accuracy": 0.30651777672484815, | |
| "eval_loss": 4.09398889541626, | |
| "eval_runtime": 1.1823, | |
| "eval_samples_per_second": 193.693, | |
| "eval_steps_per_second": 1.692, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.459016393442623, | |
| "eval_accuracy": 0.30912164325321106, | |
| "eval_loss": 4.074470043182373, | |
| "eval_runtime": 1.1705, | |
| "eval_samples_per_second": 195.64, | |
| "eval_steps_per_second": 1.709, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.576112412177986, | |
| "grad_norm": 0.6125728487968445, | |
| "learning_rate": 8.611292881304631e-05, | |
| "loss": 4.0822, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.576112412177986, | |
| "eval_accuracy": 0.31070530633849414, | |
| "eval_loss": 4.0601091384887695, | |
| "eval_runtime": 1.176, | |
| "eval_samples_per_second": 194.735, | |
| "eval_steps_per_second": 1.701, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.693208430913349, | |
| "eval_accuracy": 0.31145231722777855, | |
| "eval_loss": 4.047569274902344, | |
| "eval_runtime": 1.1721, | |
| "eval_samples_per_second": 195.374, | |
| "eval_steps_per_second": 1.706, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.810304449648712, | |
| "grad_norm": 0.5131784081459045, | |
| "learning_rate": 6.21369402825085e-05, | |
| "loss": 4.0496, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.810304449648712, | |
| "eval_accuracy": 0.3138342148061827, | |
| "eval_loss": 4.032547950744629, | |
| "eval_runtime": 1.1719, | |
| "eval_samples_per_second": 195.415, | |
| "eval_steps_per_second": 1.707, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.927400468384075, | |
| "eval_accuracy": 0.31531969931744547, | |
| "eval_loss": 4.0211310386657715, | |
| "eval_runtime": 1.1766, | |
| "eval_samples_per_second": 194.63, | |
| "eval_steps_per_second": 1.7, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 3.0444964871194378, | |
| "grad_norm": 0.5091509819030762, | |
| "learning_rate": 4.118664897236514e-05, | |
| "loss": 4.0193, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 3.0444964871194378, | |
| "eval_accuracy": 0.31634844002783147, | |
| "eval_loss": 4.012126445770264, | |
| "eval_runtime": 1.1748, | |
| "eval_samples_per_second": 194.929, | |
| "eval_steps_per_second": 1.702, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 3.161592505854801, | |
| "eval_accuracy": 0.31667712481911664, | |
| "eval_loss": 4.007138729095459, | |
| "eval_runtime": 1.1742, | |
| "eval_samples_per_second": 195.034, | |
| "eval_steps_per_second": 1.703, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 3.278688524590164, | |
| "grad_norm": 0.4789488911628723, | |
| "learning_rate": 2.39835098192799e-05, | |
| "loss": 3.9665, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 3.278688524590164, | |
| "eval_accuracy": 0.31719362949113616, | |
| "eval_loss": 4.001045227050781, | |
| "eval_runtime": 1.1742, | |
| "eval_samples_per_second": 195.029, | |
| "eval_steps_per_second": 1.703, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 3.3957845433255267, | |
| "eval_accuracy": 0.31759488105452327, | |
| "eval_loss": 3.997783899307251, | |
| "eval_runtime": 1.1723, | |
| "eval_samples_per_second": 195.344, | |
| "eval_steps_per_second": 1.706, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 3.51288056206089, | |
| "grad_norm": 0.4739971458911896, | |
| "learning_rate": 1.1119938913529075e-05, | |
| "loss": 3.9575, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 3.51288056206089, | |
| "eval_accuracy": 0.3181796838649917, | |
| "eval_loss": 3.993601083755493, | |
| "eval_runtime": 1.1865, | |
| "eval_samples_per_second": 192.998, | |
| "eval_steps_per_second": 1.686, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 3.629976580796253, | |
| "eval_accuracy": 0.3181924897659508, | |
| "eval_loss": 3.991323947906494, | |
| "eval_runtime": 1.1742, | |
| "eval_samples_per_second": 195.026, | |
| "eval_steps_per_second": 1.703, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 3.747072599531616, | |
| "grad_norm": 0.47446030378341675, | |
| "learning_rate": 3.038912755806222e-06, | |
| "loss": 3.9558, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 3.747072599531616, | |
| "eval_accuracy": 0.3182010270332569, | |
| "eval_loss": 3.9895448684692383, | |
| "eval_runtime": 1.1801, | |
| "eval_samples_per_second": 194.046, | |
| "eval_steps_per_second": 1.695, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 3.8641686182669788, | |
| "eval_accuracy": 0.3184998313889707, | |
| "eval_loss": 3.9884603023529053, | |
| "eval_runtime": 1.1735, | |
| "eval_samples_per_second": 195.146, | |
| "eval_steps_per_second": 1.704, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 3.981264637002342, | |
| "grad_norm": 0.4558132290840149, | |
| "learning_rate": 1.8713692093474952e-08, | |
| "loss": 3.9531, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 3.981264637002342, | |
| "eval_accuracy": 0.3183333546765016, | |
| "eval_loss": 3.988285541534424, | |
| "eval_runtime": 1.1729, | |
| "eval_samples_per_second": 195.237, | |
| "eval_steps_per_second": 1.705, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "step": 3416, | |
| "total_flos": 1.543347550373806e+17, | |
| "train_loss": 4.46121194826077, | |
| "train_runtime": 2449.295, | |
| "train_samples_per_second": 178.438, | |
| "train_steps_per_second": 1.395 | |
| } | |
| ], | |
| "logging_steps": 200, | |
| "max_steps": 3416, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 5000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.543347550373806e+17, | |
| "train_batch_size": 128, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |