| { | |
| "best_metric": 0.9193548387096774, | |
| "best_model_checkpoint": "SW2-TO-DA\\checkpoint-130", | |
| "epoch": 38.62068965517241, | |
| "eval_steps": 500, | |
| "global_step": 560, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.69, | |
| "learning_rate": 2.6785714285714284e-05, | |
| "loss": 1.4955, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "eval_accuracy": 0.08064516129032258, | |
| "eval_loss": 1.558031439781189, | |
| "eval_runtime": 2.4051, | |
| "eval_samples_per_second": 25.778, | |
| "eval_steps_per_second": 1.663, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "learning_rate": 5.357142857142857e-05, | |
| "loss": 1.3943, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_accuracy": 0.6451612903225806, | |
| "eval_loss": 1.1315828561782837, | |
| "eval_runtime": 1.9485, | |
| "eval_samples_per_second": 31.82, | |
| "eval_steps_per_second": 2.053, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "learning_rate": 8.035714285714285e-05, | |
| "loss": 1.2678, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "learning_rate": 0.00010714285714285714, | |
| "loss": 1.0056, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 2.97, | |
| "eval_accuracy": 0.7419354838709677, | |
| "eval_loss": 0.6406522393226624, | |
| "eval_runtime": 2.1315, | |
| "eval_samples_per_second": 29.087, | |
| "eval_steps_per_second": 1.877, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 3.45, | |
| "learning_rate": 0.00013392857142857144, | |
| "loss": 0.7744, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_accuracy": 0.8709677419354839, | |
| "eval_loss": 0.4265057146549225, | |
| "eval_runtime": 1.9565, | |
| "eval_samples_per_second": 31.69, | |
| "eval_steps_per_second": 2.045, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 4.14, | |
| "learning_rate": 0.0001488095238095238, | |
| "loss": 0.7109, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 4.83, | |
| "learning_rate": 0.00014583333333333332, | |
| "loss": 0.6022, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 4.97, | |
| "eval_accuracy": 0.8548387096774194, | |
| "eval_loss": 0.4360726773738861, | |
| "eval_runtime": 1.949, | |
| "eval_samples_per_second": 31.812, | |
| "eval_steps_per_second": 2.052, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 5.52, | |
| "learning_rate": 0.00014285714285714284, | |
| "loss": 0.5854, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_accuracy": 0.8064516129032258, | |
| "eval_loss": 0.5508103370666504, | |
| "eval_runtime": 1.948, | |
| "eval_samples_per_second": 31.828, | |
| "eval_steps_per_second": 2.053, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 6.21, | |
| "learning_rate": 0.00013988095238095236, | |
| "loss": 0.5151, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 6.9, | |
| "learning_rate": 0.00013690476190476189, | |
| "loss": 0.4581, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 6.97, | |
| "eval_accuracy": 0.8548387096774194, | |
| "eval_loss": 0.3123740255832672, | |
| "eval_runtime": 1.9775, | |
| "eval_samples_per_second": 31.353, | |
| "eval_steps_per_second": 2.023, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 7.59, | |
| "learning_rate": 0.00013392857142857144, | |
| "loss": 0.386, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_accuracy": 0.8548387096774194, | |
| "eval_loss": 0.31687411665916443, | |
| "eval_runtime": 2.0035, | |
| "eval_samples_per_second": 30.946, | |
| "eval_steps_per_second": 1.997, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 8.28, | |
| "learning_rate": 0.00013095238095238093, | |
| "loss": 0.4182, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 8.97, | |
| "learning_rate": 0.00012797619047619045, | |
| "loss": 0.347, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 8.97, | |
| "eval_accuracy": 0.9193548387096774, | |
| "eval_loss": 0.22072885930538177, | |
| "eval_runtime": 2.034, | |
| "eval_samples_per_second": 30.482, | |
| "eval_steps_per_second": 1.967, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 9.66, | |
| "learning_rate": 0.000125, | |
| "loss": 0.3873, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_accuracy": 0.8225806451612904, | |
| "eval_loss": 0.5968738198280334, | |
| "eval_runtime": 1.986, | |
| "eval_samples_per_second": 31.219, | |
| "eval_steps_per_second": 2.014, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 10.34, | |
| "learning_rate": 0.00012202380952380951, | |
| "loss": 0.3508, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 10.97, | |
| "eval_accuracy": 0.8870967741935484, | |
| "eval_loss": 0.342462956905365, | |
| "eval_runtime": 1.981, | |
| "eval_samples_per_second": 31.298, | |
| "eval_steps_per_second": 2.019, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 11.03, | |
| "learning_rate": 0.00011904761904761903, | |
| "loss": 0.3437, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 11.72, | |
| "learning_rate": 0.00011607142857142857, | |
| "loss": 0.274, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_accuracy": 0.8709677419354839, | |
| "eval_loss": 0.3376210927963257, | |
| "eval_runtime": 2.0573, | |
| "eval_samples_per_second": 30.136, | |
| "eval_steps_per_second": 1.944, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 12.41, | |
| "learning_rate": 0.00011309523809523808, | |
| "loss": 0.2615, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 12.97, | |
| "eval_accuracy": 0.8709677419354839, | |
| "eval_loss": 0.4912601709365845, | |
| "eval_runtime": 2.019, | |
| "eval_samples_per_second": 30.709, | |
| "eval_steps_per_second": 1.981, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 13.1, | |
| "learning_rate": 0.0001101190476190476, | |
| "loss": 0.2471, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 13.79, | |
| "learning_rate": 0.00010714285714285714, | |
| "loss": 0.3118, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "eval_accuracy": 0.8870967741935484, | |
| "eval_loss": 0.40341296792030334, | |
| "eval_runtime": 1.9425, | |
| "eval_samples_per_second": 31.918, | |
| "eval_steps_per_second": 2.059, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 14.48, | |
| "learning_rate": 0.00010416666666666666, | |
| "loss": 0.2205, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 14.97, | |
| "eval_accuracy": 0.8709677419354839, | |
| "eval_loss": 0.3167090117931366, | |
| "eval_runtime": 1.966, | |
| "eval_samples_per_second": 31.537, | |
| "eval_steps_per_second": 2.035, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 15.17, | |
| "learning_rate": 0.00010119047619047618, | |
| "loss": 0.2102, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 15.86, | |
| "learning_rate": 9.82142857142857e-05, | |
| "loss": 0.2325, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_accuracy": 0.8870967741935484, | |
| "eval_loss": 0.3042737543582916, | |
| "eval_runtime": 1.9585, | |
| "eval_samples_per_second": 31.658, | |
| "eval_steps_per_second": 2.042, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 16.55, | |
| "learning_rate": 9.523809523809523e-05, | |
| "loss": 0.1914, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 16.97, | |
| "eval_accuracy": 0.8225806451612904, | |
| "eval_loss": 0.4256087839603424, | |
| "eval_runtime": 1.9625, | |
| "eval_samples_per_second": 31.593, | |
| "eval_steps_per_second": 2.038, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 17.24, | |
| "learning_rate": 9.226190476190476e-05, | |
| "loss": 0.2044, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 17.93, | |
| "learning_rate": 8.928571428571427e-05, | |
| "loss": 0.1997, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "eval_accuracy": 0.8548387096774194, | |
| "eval_loss": 0.37694820761680603, | |
| "eval_runtime": 2.009, | |
| "eval_samples_per_second": 30.862, | |
| "eval_steps_per_second": 1.991, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 18.62, | |
| "learning_rate": 8.63095238095238e-05, | |
| "loss": 0.1752, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 18.97, | |
| "eval_accuracy": 0.8548387096774194, | |
| "eval_loss": 0.587546169757843, | |
| "eval_runtime": 2.1045, | |
| "eval_samples_per_second": 29.461, | |
| "eval_steps_per_second": 1.901, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 19.31, | |
| "learning_rate": 8.333333333333333e-05, | |
| "loss": 0.237, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "learning_rate": 8.035714285714285e-05, | |
| "loss": 0.1685, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_accuracy": 0.8870967741935484, | |
| "eval_loss": 0.41043660044670105, | |
| "eval_runtime": 1.8789, | |
| "eval_samples_per_second": 32.997, | |
| "eval_steps_per_second": 2.129, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 20.69, | |
| "learning_rate": 7.738095238095239e-05, | |
| "loss": 0.1736, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 20.97, | |
| "eval_accuracy": 0.8548387096774194, | |
| "eval_loss": 0.5480897426605225, | |
| "eval_runtime": 1.8865, | |
| "eval_samples_per_second": 32.866, | |
| "eval_steps_per_second": 2.12, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 21.38, | |
| "learning_rate": 7.44047619047619e-05, | |
| "loss": 0.1901, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "eval_accuracy": 0.9032258064516129, | |
| "eval_loss": 0.3800370693206787, | |
| "eval_runtime": 1.916, | |
| "eval_samples_per_second": 32.36, | |
| "eval_steps_per_second": 2.088, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 22.07, | |
| "learning_rate": 7.142857142857142e-05, | |
| "loss": 0.1718, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 22.76, | |
| "learning_rate": 6.845238095238094e-05, | |
| "loss": 0.1426, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 22.97, | |
| "eval_accuracy": 0.8870967741935484, | |
| "eval_loss": 0.44246000051498413, | |
| "eval_runtime": 1.939, | |
| "eval_samples_per_second": 31.976, | |
| "eval_steps_per_second": 2.063, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 23.45, | |
| "learning_rate": 6.547619047619047e-05, | |
| "loss": 0.1251, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "eval_accuracy": 0.9032258064516129, | |
| "eval_loss": 0.3373814523220062, | |
| "eval_runtime": 1.9735, | |
| "eval_samples_per_second": 31.416, | |
| "eval_steps_per_second": 2.027, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 24.14, | |
| "learning_rate": 6.25e-05, | |
| "loss": 0.1329, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 24.83, | |
| "learning_rate": 5.952380952380952e-05, | |
| "loss": 0.1326, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 24.97, | |
| "eval_accuracy": 0.8870967741935484, | |
| "eval_loss": 0.3627336323261261, | |
| "eval_runtime": 1.951, | |
| "eval_samples_per_second": 31.779, | |
| "eval_steps_per_second": 2.05, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 25.52, | |
| "learning_rate": 5.654761904761904e-05, | |
| "loss": 0.1271, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 26.0, | |
| "eval_accuracy": 0.8709677419354839, | |
| "eval_loss": 0.4767535328865051, | |
| "eval_runtime": 1.977, | |
| "eval_samples_per_second": 31.361, | |
| "eval_steps_per_second": 2.023, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 26.21, | |
| "learning_rate": 5.357142857142857e-05, | |
| "loss": 0.1415, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 26.9, | |
| "learning_rate": 5.059523809523809e-05, | |
| "loss": 0.1835, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 26.97, | |
| "eval_accuracy": 0.8709677419354839, | |
| "eval_loss": 0.5603958368301392, | |
| "eval_runtime": 2.171, | |
| "eval_samples_per_second": 28.558, | |
| "eval_steps_per_second": 1.842, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 27.59, | |
| "learning_rate": 4.7619047619047614e-05, | |
| "loss": 0.1378, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 28.0, | |
| "eval_accuracy": 0.8870967741935484, | |
| "eval_loss": 0.4130818247795105, | |
| "eval_runtime": 2.0055, | |
| "eval_samples_per_second": 30.916, | |
| "eval_steps_per_second": 1.995, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 28.28, | |
| "learning_rate": 4.4642857142857136e-05, | |
| "loss": 0.1253, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 28.97, | |
| "learning_rate": 4.1666666666666665e-05, | |
| "loss": 0.1349, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 28.97, | |
| "eval_accuracy": 0.8548387096774194, | |
| "eval_loss": 0.5103474259376526, | |
| "eval_runtime": 1.9155, | |
| "eval_samples_per_second": 32.368, | |
| "eval_steps_per_second": 2.088, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 29.66, | |
| "learning_rate": 3.8690476190476195e-05, | |
| "loss": 0.0999, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "eval_accuracy": 0.9193548387096774, | |
| "eval_loss": 0.37231481075286865, | |
| "eval_runtime": 1.94, | |
| "eval_samples_per_second": 31.959, | |
| "eval_steps_per_second": 2.062, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 30.34, | |
| "learning_rate": 3.571428571428571e-05, | |
| "loss": 0.1198, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 30.97, | |
| "eval_accuracy": 0.8709677419354839, | |
| "eval_loss": 0.5360597968101501, | |
| "eval_runtime": 1.9235, | |
| "eval_samples_per_second": 32.233, | |
| "eval_steps_per_second": 2.08, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 31.03, | |
| "learning_rate": 3.273809523809523e-05, | |
| "loss": 0.1301, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 31.72, | |
| "learning_rate": 2.976190476190476e-05, | |
| "loss": 0.1195, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 32.0, | |
| "eval_accuracy": 0.8870967741935484, | |
| "eval_loss": 0.41935569047927856, | |
| "eval_runtime": 1.9165, | |
| "eval_samples_per_second": 32.351, | |
| "eval_steps_per_second": 2.087, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 32.41, | |
| "learning_rate": 2.6785714285714284e-05, | |
| "loss": 0.0766, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 32.97, | |
| "eval_accuracy": 0.8870967741935484, | |
| "eval_loss": 0.4133478105068207, | |
| "eval_runtime": 1.924, | |
| "eval_samples_per_second": 32.225, | |
| "eval_steps_per_second": 2.079, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 33.1, | |
| "learning_rate": 2.3809523809523807e-05, | |
| "loss": 0.1043, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 33.79, | |
| "learning_rate": 2.0833333333333333e-05, | |
| "loss": 0.0862, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 34.0, | |
| "eval_accuracy": 0.9032258064516129, | |
| "eval_loss": 0.42390120029449463, | |
| "eval_runtime": 1.9, | |
| "eval_samples_per_second": 32.632, | |
| "eval_steps_per_second": 2.105, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 34.48, | |
| "learning_rate": 1.7857142857142855e-05, | |
| "loss": 0.1048, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 34.97, | |
| "eval_accuracy": 0.9193548387096774, | |
| "eval_loss": 0.4120253920555115, | |
| "eval_runtime": 1.954, | |
| "eval_samples_per_second": 31.73, | |
| "eval_steps_per_second": 2.047, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 35.17, | |
| "learning_rate": 1.488095238095238e-05, | |
| "loss": 0.0884, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 35.86, | |
| "learning_rate": 1.1904761904761903e-05, | |
| "loss": 0.0902, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 36.0, | |
| "eval_accuracy": 0.9032258064516129, | |
| "eval_loss": 0.44083285331726074, | |
| "eval_runtime": 1.9034, | |
| "eval_samples_per_second": 32.573, | |
| "eval_steps_per_second": 2.101, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 36.55, | |
| "learning_rate": 8.928571428571428e-06, | |
| "loss": 0.088, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 36.97, | |
| "eval_accuracy": 0.9032258064516129, | |
| "eval_loss": 0.4435848295688629, | |
| "eval_runtime": 1.8659, | |
| "eval_samples_per_second": 33.227, | |
| "eval_steps_per_second": 2.144, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 37.24, | |
| "learning_rate": 5.952380952380952e-06, | |
| "loss": 0.0864, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 37.93, | |
| "learning_rate": 2.976190476190476e-06, | |
| "loss": 0.089, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 38.0, | |
| "eval_accuracy": 0.9032258064516129, | |
| "eval_loss": 0.46484920382499695, | |
| "eval_runtime": 1.9695, | |
| "eval_samples_per_second": 31.481, | |
| "eval_steps_per_second": 2.031, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 38.62, | |
| "learning_rate": 0.0, | |
| "loss": 0.1089, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 38.62, | |
| "eval_accuracy": 0.8870967741935484, | |
| "eval_loss": 0.46501168608665466, | |
| "eval_runtime": 2.034, | |
| "eval_samples_per_second": 30.482, | |
| "eval_steps_per_second": 1.967, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 38.62, | |
| "step": 560, | |
| "total_flos": 1.1660953582043136e+18, | |
| "train_loss": 0.30870234380875317, | |
| "train_runtime": 1509.6082, | |
| "train_samples_per_second": 24.589, | |
| "train_steps_per_second": 0.371 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 560, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 40, | |
| "save_steps": 500, | |
| "total_flos": 1.1660953582043136e+18, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |