{ "best_metric": 0.6739130434782609, "best_model_checkpoint": "SW2-DMAE-2\\checkpoint-168", "epoch": 68.57142857142857, "eval_steps": 500, "global_step": 240, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.86, "eval_accuracy": 0.10869565217391304, "eval_loss": 1.6269055604934692, "eval_runtime": 0.2761, "eval_samples_per_second": 166.629, "eval_steps_per_second": 10.867, "step": 3 }, { "epoch": 2.0, "eval_accuracy": 0.10869565217391304, "eval_loss": 1.6078405380249023, "eval_runtime": 0.2401, "eval_samples_per_second": 191.623, "eval_steps_per_second": 12.497, "step": 7 }, { "epoch": 2.86, "learning_rate": 1.4375e-05, "loss": 1.618, "step": 10 }, { "epoch": 2.86, "eval_accuracy": 0.10869565217391304, "eval_loss": 1.585246205329895, "eval_runtime": 0.3111, "eval_samples_per_second": 147.877, "eval_steps_per_second": 9.644, "step": 10 }, { "epoch": 4.0, "eval_accuracy": 0.10869565217391304, "eval_loss": 1.539771556854248, "eval_runtime": 0.2925, "eval_samples_per_second": 157.267, "eval_steps_per_second": 10.257, "step": 14 }, { "epoch": 4.86, "eval_accuracy": 0.10869565217391304, "eval_loss": 1.4947997331619263, "eval_runtime": 0.2826, "eval_samples_per_second": 162.793, "eval_steps_per_second": 10.617, "step": 17 }, { "epoch": 5.71, "learning_rate": 1.375e-05, "loss": 1.5162, "step": 20 }, { "epoch": 6.0, "eval_accuracy": 0.10869565217391304, "eval_loss": 1.434383749961853, "eval_runtime": 0.2591, "eval_samples_per_second": 177.566, "eval_steps_per_second": 11.58, "step": 21 }, { "epoch": 6.86, "eval_accuracy": 0.10869565217391304, "eval_loss": 1.3878703117370605, "eval_runtime": 0.2371, "eval_samples_per_second": 194.049, "eval_steps_per_second": 12.655, "step": 24 }, { "epoch": 8.0, "eval_accuracy": 0.17391304347826086, "eval_loss": 1.328822135925293, "eval_runtime": 0.2481, "eval_samples_per_second": 185.442, "eval_steps_per_second": 12.094, "step": 28 }, { "epoch": 8.57, "learning_rate": 1.3125e-05, "loss": 1.3459, "step": 30 }, { "epoch": 8.86, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.2925546169281006, "eval_runtime": 0.2481, "eval_samples_per_second": 185.442, "eval_steps_per_second": 12.094, "step": 31 }, { "epoch": 10.0, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.2562181949615479, "eval_runtime": 0.2541, "eval_samples_per_second": 181.006, "eval_steps_per_second": 11.805, "step": 35 }, { "epoch": 10.86, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.238446593284607, "eval_runtime": 0.2411, "eval_samples_per_second": 190.828, "eval_steps_per_second": 12.445, "step": 38 }, { "epoch": 11.43, "learning_rate": 1.25e-05, "loss": 1.2384, "step": 40 }, { "epoch": 12.0, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.2205413579940796, "eval_runtime": 0.2537, "eval_samples_per_second": 181.326, "eval_steps_per_second": 11.826, "step": 42 }, { "epoch": 12.86, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.2173599004745483, "eval_runtime": 0.2401, "eval_samples_per_second": 191.624, "eval_steps_per_second": 12.497, "step": 45 }, { "epoch": 14.0, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.2131370306015015, "eval_runtime": 0.2691, "eval_samples_per_second": 170.965, "eval_steps_per_second": 11.15, "step": 49 }, { "epoch": 14.29, "learning_rate": 1.1874999999999999e-05, "loss": 1.2049, "step": 50 }, { "epoch": 14.86, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.2104469537734985, "eval_runtime": 0.2431, "eval_samples_per_second": 189.258, "eval_steps_per_second": 12.343, "step": 52 }, { "epoch": 16.0, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.208552598953247, "eval_runtime": 0.2451, "eval_samples_per_second": 187.713, "eval_steps_per_second": 12.242, "step": 56 }, { "epoch": 16.86, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.2076021432876587, "eval_runtime": 0.2556, "eval_samples_per_second": 179.993, "eval_steps_per_second": 11.739, "step": 59 }, { "epoch": 17.14, "learning_rate": 1.125e-05, "loss": 1.1815, "step": 60 }, { "epoch": 18.0, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.2051774263381958, "eval_runtime": 0.2431, "eval_samples_per_second": 189.258, "eval_steps_per_second": 12.343, "step": 63 }, { "epoch": 18.86, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.204942226409912, "eval_runtime": 0.3966, "eval_samples_per_second": 115.986, "eval_steps_per_second": 7.564, "step": 66 }, { "epoch": 20.0, "learning_rate": 1.0625e-05, "loss": 1.1826, "step": 70 }, { "epoch": 20.0, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.2018660306930542, "eval_runtime": 0.3121, "eval_samples_per_second": 147.402, "eval_steps_per_second": 9.613, "step": 70 }, { "epoch": 20.86, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.1960418224334717, "eval_runtime": 0.3191, "eval_samples_per_second": 144.168, "eval_steps_per_second": 9.402, "step": 73 }, { "epoch": 22.0, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.1926813125610352, "eval_runtime": 0.3096, "eval_samples_per_second": 148.592, "eval_steps_per_second": 9.691, "step": 77 }, { "epoch": 22.86, "learning_rate": 9.999999999999999e-06, "loss": 1.1647, "step": 80 }, { "epoch": 22.86, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.1927775144577026, "eval_runtime": 0.2491, "eval_samples_per_second": 184.697, "eval_steps_per_second": 12.045, "step": 80 }, { "epoch": 24.0, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.1924192905426025, "eval_runtime": 0.2991, "eval_samples_per_second": 153.812, "eval_steps_per_second": 10.031, "step": 84 }, { "epoch": 24.86, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.1902908086776733, "eval_runtime": 0.3005, "eval_samples_per_second": 153.097, "eval_steps_per_second": 9.985, "step": 87 }, { "epoch": 25.71, "learning_rate": 9.375000000000001e-06, "loss": 1.1568, "step": 90 }, { "epoch": 26.0, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.1878631114959717, "eval_runtime": 0.2861, "eval_samples_per_second": 160.803, "eval_steps_per_second": 10.487, "step": 91 }, { "epoch": 26.86, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.1913325786590576, "eval_runtime": 0.2806, "eval_samples_per_second": 163.951, "eval_steps_per_second": 10.692, "step": 94 }, { "epoch": 28.0, "eval_accuracy": 0.4782608695652174, "eval_loss": 1.204640507698059, "eval_runtime": 0.2621, "eval_samples_per_second": 175.533, "eval_steps_per_second": 11.448, "step": 98 }, { "epoch": 28.57, "learning_rate": 8.750000000000001e-06, "loss": 1.1432, "step": 100 }, { "epoch": 28.86, "eval_accuracy": 0.4782608695652174, "eval_loss": 1.193393588066101, "eval_runtime": 0.2361, "eval_samples_per_second": 194.871, "eval_steps_per_second": 12.709, "step": 101 }, { "epoch": 30.0, "eval_accuracy": 0.4782608695652174, "eval_loss": 1.166512131690979, "eval_runtime": 0.2421, "eval_samples_per_second": 190.04, "eval_steps_per_second": 12.394, "step": 105 }, { "epoch": 30.86, "eval_accuracy": 0.4782608695652174, "eval_loss": 1.1600818634033203, "eval_runtime": 0.2381, "eval_samples_per_second": 193.234, "eval_steps_per_second": 12.602, "step": 108 }, { "epoch": 31.43, "learning_rate": 8.125e-06, "loss": 1.1112, "step": 110 }, { "epoch": 32.0, "eval_accuracy": 0.5, "eval_loss": 1.1623895168304443, "eval_runtime": 0.2801, "eval_samples_per_second": 164.249, "eval_steps_per_second": 10.712, "step": 112 }, { "epoch": 32.86, "eval_accuracy": 0.5217391304347826, "eval_loss": 1.1663668155670166, "eval_runtime": 0.3126, "eval_samples_per_second": 147.162, "eval_steps_per_second": 9.597, "step": 115 }, { "epoch": 34.0, "eval_accuracy": 0.5, "eval_loss": 1.1692047119140625, "eval_runtime": 0.2586, "eval_samples_per_second": 177.904, "eval_steps_per_second": 11.602, "step": 119 }, { "epoch": 34.29, "learning_rate": 7.5e-06, "loss": 1.1132, "step": 120 }, { "epoch": 34.86, "eval_accuracy": 0.5434782608695652, "eval_loss": 1.1513336896896362, "eval_runtime": 0.2416, "eval_samples_per_second": 190.427, "eval_steps_per_second": 12.419, "step": 122 }, { "epoch": 36.0, "eval_accuracy": 0.5869565217391305, "eval_loss": 1.1384443044662476, "eval_runtime": 0.2461, "eval_samples_per_second": 186.95, "eval_steps_per_second": 12.192, "step": 126 }, { "epoch": 36.86, "eval_accuracy": 0.6086956521739131, "eval_loss": 1.127366542816162, "eval_runtime": 0.2446, "eval_samples_per_second": 188.091, "eval_steps_per_second": 12.267, "step": 129 }, { "epoch": 37.14, "learning_rate": 6.875e-06, "loss": 1.0642, "step": 130 }, { "epoch": 38.0, "eval_accuracy": 0.5869565217391305, "eval_loss": 1.1442575454711914, "eval_runtime": 0.2586, "eval_samples_per_second": 177.903, "eval_steps_per_second": 11.602, "step": 133 }, { "epoch": 38.86, "eval_accuracy": 0.5, "eval_loss": 1.1651057004928589, "eval_runtime": 0.2496, "eval_samples_per_second": 184.324, "eval_steps_per_second": 12.021, "step": 136 }, { "epoch": 40.0, "learning_rate": 6.25e-06, "loss": 1.0439, "step": 140 }, { "epoch": 40.0, "eval_accuracy": 0.5, "eval_loss": 1.149288296699524, "eval_runtime": 0.2441, "eval_samples_per_second": 188.478, "eval_steps_per_second": 12.292, "step": 140 }, { "epoch": 40.86, "eval_accuracy": 0.5217391304347826, "eval_loss": 1.1330839395523071, "eval_runtime": 0.2811, "eval_samples_per_second": 163.664, "eval_steps_per_second": 10.674, "step": 143 }, { "epoch": 42.0, "eval_accuracy": 0.5869565217391305, "eval_loss": 1.1032251119613647, "eval_runtime": 0.2521, "eval_samples_per_second": 182.499, "eval_steps_per_second": 11.902, "step": 147 }, { "epoch": 42.86, "learning_rate": 5.625e-06, "loss": 1.0362, "step": 150 }, { "epoch": 42.86, "eval_accuracy": 0.6304347826086957, "eval_loss": 1.0988132953643799, "eval_runtime": 0.2916, "eval_samples_per_second": 157.767, "eval_steps_per_second": 10.289, "step": 150 }, { "epoch": 44.0, "eval_accuracy": 0.5869565217391305, "eval_loss": 1.1092532873153687, "eval_runtime": 0.2506, "eval_samples_per_second": 183.583, "eval_steps_per_second": 11.973, "step": 154 }, { "epoch": 44.86, "eval_accuracy": 0.5869565217391305, "eval_loss": 1.1101136207580566, "eval_runtime": 0.22, "eval_samples_per_second": 209.044, "eval_steps_per_second": 13.633, "step": 157 }, { "epoch": 45.71, "learning_rate": 4.9999999999999996e-06, "loss": 1.0177, "step": 160 }, { "epoch": 46.0, "eval_accuracy": 0.6304347826086957, "eval_loss": 1.0903021097183228, "eval_runtime": 0.3931, "eval_samples_per_second": 117.022, "eval_steps_per_second": 7.632, "step": 161 }, { "epoch": 46.86, "eval_accuracy": 0.6521739130434783, "eval_loss": 1.069130301475525, "eval_runtime": 0.4281, "eval_samples_per_second": 107.447, "eval_steps_per_second": 7.007, "step": 164 }, { "epoch": 48.0, "eval_accuracy": 0.6739130434782609, "eval_loss": 1.0509947538375854, "eval_runtime": 0.2701, "eval_samples_per_second": 170.332, "eval_steps_per_second": 11.109, "step": 168 }, { "epoch": 48.57, "learning_rate": 4.3750000000000005e-06, "loss": 1.0, "step": 170 }, { "epoch": 48.86, "eval_accuracy": 0.6521739130434783, "eval_loss": 1.0451492071151733, "eval_runtime": 0.2466, "eval_samples_per_second": 186.563, "eval_steps_per_second": 12.167, "step": 171 }, { "epoch": 50.0, "eval_accuracy": 0.6521739130434783, "eval_loss": 1.0425117015838623, "eval_runtime": 0.2591, "eval_samples_per_second": 177.566, "eval_steps_per_second": 11.58, "step": 175 }, { "epoch": 50.86, "eval_accuracy": 0.6086956521739131, "eval_loss": 1.0512455701828003, "eval_runtime": 0.2671, "eval_samples_per_second": 172.246, "eval_steps_per_second": 11.233, "step": 178 }, { "epoch": 51.43, "learning_rate": 3.75e-06, "loss": 0.9636, "step": 180 }, { "epoch": 52.0, "eval_accuracy": 0.6304347826086957, "eval_loss": 1.044124722480774, "eval_runtime": 0.2541, "eval_samples_per_second": 181.062, "eval_steps_per_second": 11.808, "step": 182 }, { "epoch": 52.86, "eval_accuracy": 0.6521739130434783, "eval_loss": 1.0401920080184937, "eval_runtime": 0.2676, "eval_samples_per_second": 171.917, "eval_steps_per_second": 11.212, "step": 185 }, { "epoch": 54.0, "eval_accuracy": 0.6521739130434783, "eval_loss": 1.0161322355270386, "eval_runtime": 0.2421, "eval_samples_per_second": 190.039, "eval_steps_per_second": 12.394, "step": 189 }, { "epoch": 54.29, "learning_rate": 3.125e-06, "loss": 0.9744, "step": 190 }, { "epoch": 54.86, "eval_accuracy": 0.6521739130434783, "eval_loss": 1.0072776079177856, "eval_runtime": 0.3861, "eval_samples_per_second": 119.144, "eval_steps_per_second": 7.77, "step": 192 }, { "epoch": 56.0, "eval_accuracy": 0.6521739130434783, "eval_loss": 1.0047576427459717, "eval_runtime": 0.2431, "eval_samples_per_second": 189.257, "eval_steps_per_second": 12.343, "step": 196 }, { "epoch": 56.86, "eval_accuracy": 0.6521739130434783, "eval_loss": 0.9993048310279846, "eval_runtime": 0.2251, "eval_samples_per_second": 204.398, "eval_steps_per_second": 13.33, "step": 199 }, { "epoch": 57.14, "learning_rate": 2.4999999999999998e-06, "loss": 0.9233, "step": 200 }, { "epoch": 58.0, "eval_accuracy": 0.6521739130434783, "eval_loss": 0.9939430952072144, "eval_runtime": 0.2431, "eval_samples_per_second": 189.258, "eval_steps_per_second": 12.343, "step": 203 }, { "epoch": 58.86, "eval_accuracy": 0.6521739130434783, "eval_loss": 0.9938895106315613, "eval_runtime": 0.2396, "eval_samples_per_second": 192.017, "eval_steps_per_second": 12.523, "step": 206 }, { "epoch": 60.0, "learning_rate": 1.875e-06, "loss": 0.9452, "step": 210 }, { "epoch": 60.0, "eval_accuracy": 0.6521739130434783, "eval_loss": 0.9975183010101318, "eval_runtime": 0.2521, "eval_samples_per_second": 182.499, "eval_steps_per_second": 11.902, "step": 210 }, { "epoch": 60.86, "eval_accuracy": 0.6086956521739131, "eval_loss": 0.998058557510376, "eval_runtime": 0.2471, "eval_samples_per_second": 186.193, "eval_steps_per_second": 12.143, "step": 213 }, { "epoch": 62.0, "eval_accuracy": 0.6086956521739131, "eval_loss": 0.9985237121582031, "eval_runtime": 0.2421, "eval_samples_per_second": 190.04, "eval_steps_per_second": 12.394, "step": 217 }, { "epoch": 62.86, "learning_rate": 1.2499999999999999e-06, "loss": 0.9183, "step": 220 }, { "epoch": 62.86, "eval_accuracy": 0.6086956521739131, "eval_loss": 0.9968777298927307, "eval_runtime": 0.2391, "eval_samples_per_second": 192.425, "eval_steps_per_second": 12.549, "step": 220 }, { "epoch": 64.0, "eval_accuracy": 0.6304347826086957, "eval_loss": 0.99575275182724, "eval_runtime": 0.2351, "eval_samples_per_second": 195.701, "eval_steps_per_second": 12.763, "step": 224 }, { "epoch": 64.86, "eval_accuracy": 0.6086956521739131, "eval_loss": 0.9928344488143921, "eval_runtime": 0.2406, "eval_samples_per_second": 191.216, "eval_steps_per_second": 12.471, "step": 227 }, { "epoch": 65.71, "learning_rate": 6.249999999999999e-07, "loss": 0.9449, "step": 230 }, { "epoch": 66.0, "eval_accuracy": 0.6086956521739131, "eval_loss": 0.9906012415885925, "eval_runtime": 0.2456, "eval_samples_per_second": 187.316, "eval_steps_per_second": 12.216, "step": 231 }, { "epoch": 66.86, "eval_accuracy": 0.6304347826086957, "eval_loss": 0.9892796874046326, "eval_runtime": 0.2501, "eval_samples_per_second": 183.959, "eval_steps_per_second": 11.997, "step": 234 }, { "epoch": 68.0, "eval_accuracy": 0.6521739130434783, "eval_loss": 0.9880576133728027, "eval_runtime": 0.2471, "eval_samples_per_second": 186.193, "eval_steps_per_second": 12.143, "step": 238 }, { "epoch": 68.57, "learning_rate": 0.0, "loss": 0.9154, "step": 240 }, { "epoch": 68.57, "eval_accuracy": 0.6521739130434783, "eval_loss": 0.9880411028862, "eval_runtime": 0.2566, "eval_samples_per_second": 179.291, "eval_steps_per_second": 11.693, "step": 240 }, { "epoch": 68.57, "step": 240, "total_flos": 4.754181186964685e+17, "train_loss": 1.113492695490519, "train_runtime": 374.3307, "train_samples_per_second": 45.521, "train_steps_per_second": 0.641 } ], "logging_steps": 10, "max_steps": 240, "num_input_tokens_seen": 0, "num_train_epochs": 80, "save_steps": 500, "total_flos": 4.754181186964685e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }