{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 100, "global_step": 3416, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.117096018735363, "eval_accuracy": 0.16272885212172436, "eval_loss": 6.373325347900391, "eval_runtime": 1.492, "eval_samples_per_second": 153.483, "eval_steps_per_second": 1.34, "step": 100 }, { "epoch": 0.234192037470726, "grad_norm": 0.5273725986480713, "learning_rate": 0.00029824050644124145, "loss": 6.8543, "step": 200 }, { "epoch": 0.234192037470726, "eval_accuracy": 0.19693341358364602, "eval_loss": 5.696830749511719, "eval_runtime": 1.1659, "eval_samples_per_second": 196.414, "eval_steps_per_second": 1.715, "step": 200 }, { "epoch": 0.351288056206089, "eval_accuracy": 0.21933947162852643, "eval_loss": 5.292416095733643, "eval_runtime": 1.1779, "eval_samples_per_second": 194.406, "eval_steps_per_second": 1.698, "step": 300 }, { "epoch": 0.468384074941452, "grad_norm": 0.7039262056350708, "learning_rate": 0.00029145544445670097, "loss": 5.3976, "step": 400 }, { "epoch": 0.468384074941452, "eval_accuracy": 0.2335839021287676, "eval_loss": 5.054598808288574, "eval_runtime": 1.1808, "eval_samples_per_second": 193.929, "eval_steps_per_second": 1.694, "step": 400 }, { "epoch": 0.585480093676815, "eval_accuracy": 0.24265474864150735, "eval_loss": 4.891258716583252, "eval_runtime": 1.1832, "eval_samples_per_second": 193.538, "eval_steps_per_second": 1.69, "step": 500 }, { "epoch": 0.702576112412178, "grad_norm": 0.7881871461868286, "learning_rate": 0.0002797991505064349, "loss": 4.9739, "step": 600 }, { "epoch": 0.702576112412178, "eval_accuracy": 0.2506498994736775, "eval_loss": 4.759438514709473, "eval_runtime": 1.1726, "eval_samples_per_second": 195.299, "eval_steps_per_second": 1.706, "step": 600 }, { "epoch": 0.819672131147541, "eval_accuracy": 0.2589993468990511, "eval_loss": 4.651967525482178, "eval_runtime": 1.1752, "eval_samples_per_second": 194.864, "eval_steps_per_second": 1.702, "step": 700 }, { "epoch": 0.936768149882904, "grad_norm": 0.5959051251411438, "learning_rate": 0.00026367302668500736, "loss": 4.7436, "step": 800 }, { "epoch": 0.936768149882904, "eval_accuracy": 0.2653638796757546, "eval_loss": 4.566330432891846, "eval_runtime": 1.1821, "eval_samples_per_second": 193.727, "eval_steps_per_second": 1.692, "step": 800 }, { "epoch": 1.053864168618267, "eval_accuracy": 0.270379524218093, "eval_loss": 4.497960567474365, "eval_runtime": 1.1842, "eval_samples_per_second": 193.385, "eval_steps_per_second": 1.689, "step": 900 }, { "epoch": 1.17096018735363, "grad_norm": 0.6118926405906677, "learning_rate": 0.00024363240043240403, "loss": 4.5595, "step": 1000 }, { "epoch": 1.17096018735363, "eval_accuracy": 0.2755872572748189, "eval_loss": 4.44118070602417, "eval_runtime": 1.1779, "eval_samples_per_second": 194.409, "eval_steps_per_second": 1.698, "step": 1000 }, { "epoch": 1.288056206088993, "eval_accuracy": 0.2790832682366701, "eval_loss": 4.388593673706055, "eval_runtime": 1.1719, "eval_samples_per_second": 195.408, "eval_steps_per_second": 1.707, "step": 1100 }, { "epoch": 1.405152224824356, "grad_norm": 0.6060830354690552, "learning_rate": 0.00022036740099454973, "loss": 4.44, "step": 1200 }, { "epoch": 1.405152224824356, "eval_accuracy": 0.28338605095894853, "eval_loss": 4.339376449584961, "eval_runtime": 1.1734, "eval_samples_per_second": 195.157, "eval_steps_per_second": 1.704, "step": 1200 }, { "epoch": 1.5222482435597189, "eval_accuracy": 0.28774005728506363, "eval_loss": 4.292956352233887, "eval_runtime": 1.1856, "eval_samples_per_second": 193.144, "eval_steps_per_second": 1.687, "step": 1300 }, { "epoch": 1.639344262295082, "grad_norm": 0.5661224126815796, "learning_rate": 0.0001946791937799555, "loss": 4.3482, "step": 1400 }, { "epoch": 1.639344262295082, "eval_accuracy": 0.2911549642075068, "eval_loss": 4.25685977935791, "eval_runtime": 1.175, "eval_samples_per_second": 194.899, "eval_steps_per_second": 1.702, "step": 1400 }, { "epoch": 1.756440281030445, "eval_accuracy": 0.29346429501380905, "eval_loss": 4.224252700805664, "eval_runtime": 1.1755, "eval_samples_per_second": 194.807, "eval_steps_per_second": 1.701, "step": 1500 }, { "epoch": 1.8735362997658078, "grad_norm": 0.5346641540527344, "learning_rate": 0.00016745239101833442, "loss": 4.2765, "step": 1600 }, { "epoch": 1.8735362997658078, "eval_accuracy": 0.29705421591602743, "eval_loss": 4.186337947845459, "eval_runtime": 1.1767, "eval_samples_per_second": 194.606, "eval_steps_per_second": 1.7, "step": 1600 }, { "epoch": 1.990632318501171, "eval_accuracy": 0.3002471538885118, "eval_loss": 4.158440113067627, "eval_runtime": 1.1742, "eval_samples_per_second": 195.024, "eval_steps_per_second": 1.703, "step": 1700 }, { "epoch": 2.107728337236534, "grad_norm": 0.5907180309295654, "learning_rate": 0.0001396245888018435, "loss": 4.1869, "step": 1800 }, { "epoch": 2.107728337236534, "eval_accuracy": 0.3025009924573243, "eval_loss": 4.138100624084473, "eval_runtime": 1.1874, "eval_samples_per_second": 192.866, "eval_steps_per_second": 1.684, "step": 1800 }, { "epoch": 2.2248243559718968, "eval_accuracy": 0.30516461985683, "eval_loss": 4.114769458770752, "eval_runtime": 1.1787, "eval_samples_per_second": 194.278, "eval_steps_per_second": 1.697, "step": 1900 }, { "epoch": 2.34192037470726, "grad_norm": 0.5956910848617554, "learning_rate": 0.00011215407954696412, "loss": 4.1154, "step": 2000 }, { "epoch": 2.34192037470726, "eval_accuracy": 0.30651777672484815, "eval_loss": 4.09398889541626, "eval_runtime": 1.1823, "eval_samples_per_second": 193.693, "eval_steps_per_second": 1.692, "step": 2000 }, { "epoch": 2.459016393442623, "eval_accuracy": 0.30912164325321106, "eval_loss": 4.074470043182373, "eval_runtime": 1.1705, "eval_samples_per_second": 195.64, "eval_steps_per_second": 1.709, "step": 2100 }, { "epoch": 2.576112412177986, "grad_norm": 0.6125728487968445, "learning_rate": 8.611292881304631e-05, "loss": 4.0822, "step": 2200 }, { "epoch": 2.576112412177986, "eval_accuracy": 0.31070530633849414, "eval_loss": 4.0601091384887695, "eval_runtime": 1.176, "eval_samples_per_second": 194.735, "eval_steps_per_second": 1.701, "step": 2200 }, { "epoch": 2.693208430913349, "eval_accuracy": 0.31145231722777855, "eval_loss": 4.047569274902344, "eval_runtime": 1.1721, "eval_samples_per_second": 195.374, "eval_steps_per_second": 1.706, "step": 2300 }, { "epoch": 2.810304449648712, "grad_norm": 0.5131784081459045, "learning_rate": 6.21369402825085e-05, "loss": 4.0496, "step": 2400 }, { "epoch": 2.810304449648712, "eval_accuracy": 0.3138342148061827, "eval_loss": 4.032547950744629, "eval_runtime": 1.1719, "eval_samples_per_second": 195.415, "eval_steps_per_second": 1.707, "step": 2400 }, { "epoch": 2.927400468384075, "eval_accuracy": 0.31531969931744547, "eval_loss": 4.0211310386657715, "eval_runtime": 1.1766, "eval_samples_per_second": 194.63, "eval_steps_per_second": 1.7, "step": 2500 }, { "epoch": 3.0444964871194378, "grad_norm": 0.5091509819030762, "learning_rate": 4.118664897236514e-05, "loss": 4.0193, "step": 2600 }, { "epoch": 3.0444964871194378, "eval_accuracy": 0.31634844002783147, "eval_loss": 4.012126445770264, "eval_runtime": 1.1748, "eval_samples_per_second": 194.929, "eval_steps_per_second": 1.702, "step": 2600 }, { "epoch": 3.161592505854801, "eval_accuracy": 0.31667712481911664, "eval_loss": 4.007138729095459, "eval_runtime": 1.1742, "eval_samples_per_second": 195.034, "eval_steps_per_second": 1.703, "step": 2700 }, { "epoch": 3.278688524590164, "grad_norm": 0.4789488911628723, "learning_rate": 2.39835098192799e-05, "loss": 3.9665, "step": 2800 }, { "epoch": 3.278688524590164, "eval_accuracy": 0.31719362949113616, "eval_loss": 4.001045227050781, "eval_runtime": 1.1742, "eval_samples_per_second": 195.029, "eval_steps_per_second": 1.703, "step": 2800 }, { "epoch": 3.3957845433255267, "eval_accuracy": 0.31759488105452327, "eval_loss": 3.997783899307251, "eval_runtime": 1.1723, "eval_samples_per_second": 195.344, "eval_steps_per_second": 1.706, "step": 2900 }, { "epoch": 3.51288056206089, "grad_norm": 0.4739971458911896, "learning_rate": 1.1119938913529075e-05, "loss": 3.9575, "step": 3000 }, { "epoch": 3.51288056206089, "eval_accuracy": 0.3181796838649917, "eval_loss": 3.993601083755493, "eval_runtime": 1.1865, "eval_samples_per_second": 192.998, "eval_steps_per_second": 1.686, "step": 3000 }, { "epoch": 3.629976580796253, "eval_accuracy": 0.3181924897659508, "eval_loss": 3.991323947906494, "eval_runtime": 1.1742, "eval_samples_per_second": 195.026, "eval_steps_per_second": 1.703, "step": 3100 }, { "epoch": 3.747072599531616, "grad_norm": 0.47446030378341675, "learning_rate": 3.038912755806222e-06, "loss": 3.9558, "step": 3200 }, { "epoch": 3.747072599531616, "eval_accuracy": 0.3182010270332569, "eval_loss": 3.9895448684692383, "eval_runtime": 1.1801, "eval_samples_per_second": 194.046, "eval_steps_per_second": 1.695, "step": 3200 }, { "epoch": 3.8641686182669788, "eval_accuracy": 0.3184998313889707, "eval_loss": 3.9884603023529053, "eval_runtime": 1.1735, "eval_samples_per_second": 195.146, "eval_steps_per_second": 1.704, "step": 3300 }, { "epoch": 3.981264637002342, "grad_norm": 0.4558132290840149, "learning_rate": 1.8713692093474952e-08, "loss": 3.9531, "step": 3400 }, { "epoch": 3.981264637002342, "eval_accuracy": 0.3183333546765016, "eval_loss": 3.988285541534424, "eval_runtime": 1.1729, "eval_samples_per_second": 195.237, "eval_steps_per_second": 1.705, "step": 3400 }, { "epoch": 4.0, "step": 3416, "total_flos": 1.543347550373806e+17, "train_loss": 4.46121194826077, "train_runtime": 2449.295, "train_samples_per_second": 178.438, "train_steps_per_second": 1.395 } ], "logging_steps": 200, "max_steps": 3416, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.543347550373806e+17, "train_batch_size": 128, "trial_name": null, "trial_params": null }