{ "best_metric": 0.9193548387096774, "best_model_checkpoint": "SW2-TO-DA\\checkpoint-130", "epoch": 38.62068965517241, "eval_steps": 500, "global_step": 560, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.69, "learning_rate": 2.6785714285714284e-05, "loss": 1.4955, "step": 10 }, { "epoch": 0.97, "eval_accuracy": 0.08064516129032258, "eval_loss": 1.558031439781189, "eval_runtime": 2.4051, "eval_samples_per_second": 25.778, "eval_steps_per_second": 1.663, "step": 14 }, { "epoch": 1.38, "learning_rate": 5.357142857142857e-05, "loss": 1.3943, "step": 20 }, { "epoch": 2.0, "eval_accuracy": 0.6451612903225806, "eval_loss": 1.1315828561782837, "eval_runtime": 1.9485, "eval_samples_per_second": 31.82, "eval_steps_per_second": 2.053, "step": 29 }, { "epoch": 2.07, "learning_rate": 8.035714285714285e-05, "loss": 1.2678, "step": 30 }, { "epoch": 2.76, "learning_rate": 0.00010714285714285714, "loss": 1.0056, "step": 40 }, { "epoch": 2.97, "eval_accuracy": 0.7419354838709677, "eval_loss": 0.6406522393226624, "eval_runtime": 2.1315, "eval_samples_per_second": 29.087, "eval_steps_per_second": 1.877, "step": 43 }, { "epoch": 3.45, "learning_rate": 0.00013392857142857144, "loss": 0.7744, "step": 50 }, { "epoch": 4.0, "eval_accuracy": 0.8709677419354839, "eval_loss": 0.4265057146549225, "eval_runtime": 1.9565, "eval_samples_per_second": 31.69, "eval_steps_per_second": 2.045, "step": 58 }, { "epoch": 4.14, "learning_rate": 0.0001488095238095238, "loss": 0.7109, "step": 60 }, { "epoch": 4.83, "learning_rate": 0.00014583333333333332, "loss": 0.6022, "step": 70 }, { "epoch": 4.97, "eval_accuracy": 0.8548387096774194, "eval_loss": 0.4360726773738861, "eval_runtime": 1.949, "eval_samples_per_second": 31.812, "eval_steps_per_second": 2.052, "step": 72 }, { "epoch": 5.52, "learning_rate": 0.00014285714285714284, "loss": 0.5854, "step": 80 }, { "epoch": 6.0, "eval_accuracy": 0.8064516129032258, "eval_loss": 0.5508103370666504, "eval_runtime": 1.948, "eval_samples_per_second": 31.828, "eval_steps_per_second": 2.053, "step": 87 }, { "epoch": 6.21, "learning_rate": 0.00013988095238095236, "loss": 0.5151, "step": 90 }, { "epoch": 6.9, "learning_rate": 0.00013690476190476189, "loss": 0.4581, "step": 100 }, { "epoch": 6.97, "eval_accuracy": 0.8548387096774194, "eval_loss": 0.3123740255832672, "eval_runtime": 1.9775, "eval_samples_per_second": 31.353, "eval_steps_per_second": 2.023, "step": 101 }, { "epoch": 7.59, "learning_rate": 0.00013392857142857144, "loss": 0.386, "step": 110 }, { "epoch": 8.0, "eval_accuracy": 0.8548387096774194, "eval_loss": 0.31687411665916443, "eval_runtime": 2.0035, "eval_samples_per_second": 30.946, "eval_steps_per_second": 1.997, "step": 116 }, { "epoch": 8.28, "learning_rate": 0.00013095238095238093, "loss": 0.4182, "step": 120 }, { "epoch": 8.97, "learning_rate": 0.00012797619047619045, "loss": 0.347, "step": 130 }, { "epoch": 8.97, "eval_accuracy": 0.9193548387096774, "eval_loss": 0.22072885930538177, "eval_runtime": 2.034, "eval_samples_per_second": 30.482, "eval_steps_per_second": 1.967, "step": 130 }, { "epoch": 9.66, "learning_rate": 0.000125, "loss": 0.3873, "step": 140 }, { "epoch": 10.0, "eval_accuracy": 0.8225806451612904, "eval_loss": 0.5968738198280334, "eval_runtime": 1.986, "eval_samples_per_second": 31.219, "eval_steps_per_second": 2.014, "step": 145 }, { "epoch": 10.34, "learning_rate": 0.00012202380952380951, "loss": 0.3508, "step": 150 }, { "epoch": 10.97, "eval_accuracy": 0.8870967741935484, "eval_loss": 0.342462956905365, "eval_runtime": 1.981, "eval_samples_per_second": 31.298, "eval_steps_per_second": 2.019, "step": 159 }, { "epoch": 11.03, "learning_rate": 0.00011904761904761903, "loss": 0.3437, "step": 160 }, { "epoch": 11.72, "learning_rate": 0.00011607142857142857, "loss": 0.274, "step": 170 }, { "epoch": 12.0, "eval_accuracy": 0.8709677419354839, "eval_loss": 0.3376210927963257, "eval_runtime": 2.0573, "eval_samples_per_second": 30.136, "eval_steps_per_second": 1.944, "step": 174 }, { "epoch": 12.41, "learning_rate": 0.00011309523809523808, "loss": 0.2615, "step": 180 }, { "epoch": 12.97, "eval_accuracy": 0.8709677419354839, "eval_loss": 0.4912601709365845, "eval_runtime": 2.019, "eval_samples_per_second": 30.709, "eval_steps_per_second": 1.981, "step": 188 }, { "epoch": 13.1, "learning_rate": 0.0001101190476190476, "loss": 0.2471, "step": 190 }, { "epoch": 13.79, "learning_rate": 0.00010714285714285714, "loss": 0.3118, "step": 200 }, { "epoch": 14.0, "eval_accuracy": 0.8870967741935484, "eval_loss": 0.40341296792030334, "eval_runtime": 1.9425, "eval_samples_per_second": 31.918, "eval_steps_per_second": 2.059, "step": 203 }, { "epoch": 14.48, "learning_rate": 0.00010416666666666666, "loss": 0.2205, "step": 210 }, { "epoch": 14.97, "eval_accuracy": 0.8709677419354839, "eval_loss": 0.3167090117931366, "eval_runtime": 1.966, "eval_samples_per_second": 31.537, "eval_steps_per_second": 2.035, "step": 217 }, { "epoch": 15.17, "learning_rate": 0.00010119047619047618, "loss": 0.2102, "step": 220 }, { "epoch": 15.86, "learning_rate": 9.82142857142857e-05, "loss": 0.2325, "step": 230 }, { "epoch": 16.0, "eval_accuracy": 0.8870967741935484, "eval_loss": 0.3042737543582916, "eval_runtime": 1.9585, "eval_samples_per_second": 31.658, "eval_steps_per_second": 2.042, "step": 232 }, { "epoch": 16.55, "learning_rate": 9.523809523809523e-05, "loss": 0.1914, "step": 240 }, { "epoch": 16.97, "eval_accuracy": 0.8225806451612904, "eval_loss": 0.4256087839603424, "eval_runtime": 1.9625, "eval_samples_per_second": 31.593, "eval_steps_per_second": 2.038, "step": 246 }, { "epoch": 17.24, "learning_rate": 9.226190476190476e-05, "loss": 0.2044, "step": 250 }, { "epoch": 17.93, "learning_rate": 8.928571428571427e-05, "loss": 0.1997, "step": 260 }, { "epoch": 18.0, "eval_accuracy": 0.8548387096774194, "eval_loss": 0.37694820761680603, "eval_runtime": 2.009, "eval_samples_per_second": 30.862, "eval_steps_per_second": 1.991, "step": 261 }, { "epoch": 18.62, "learning_rate": 8.63095238095238e-05, "loss": 0.1752, "step": 270 }, { "epoch": 18.97, "eval_accuracy": 0.8548387096774194, "eval_loss": 0.587546169757843, "eval_runtime": 2.1045, "eval_samples_per_second": 29.461, "eval_steps_per_second": 1.901, "step": 275 }, { "epoch": 19.31, "learning_rate": 8.333333333333333e-05, "loss": 0.237, "step": 280 }, { "epoch": 20.0, "learning_rate": 8.035714285714285e-05, "loss": 0.1685, "step": 290 }, { "epoch": 20.0, "eval_accuracy": 0.8870967741935484, "eval_loss": 0.41043660044670105, "eval_runtime": 1.8789, "eval_samples_per_second": 32.997, "eval_steps_per_second": 2.129, "step": 290 }, { "epoch": 20.69, "learning_rate": 7.738095238095239e-05, "loss": 0.1736, "step": 300 }, { "epoch": 20.97, "eval_accuracy": 0.8548387096774194, "eval_loss": 0.5480897426605225, "eval_runtime": 1.8865, "eval_samples_per_second": 32.866, "eval_steps_per_second": 2.12, "step": 304 }, { "epoch": 21.38, "learning_rate": 7.44047619047619e-05, "loss": 0.1901, "step": 310 }, { "epoch": 22.0, "eval_accuracy": 0.9032258064516129, "eval_loss": 0.3800370693206787, "eval_runtime": 1.916, "eval_samples_per_second": 32.36, "eval_steps_per_second": 2.088, "step": 319 }, { "epoch": 22.07, "learning_rate": 7.142857142857142e-05, "loss": 0.1718, "step": 320 }, { "epoch": 22.76, "learning_rate": 6.845238095238094e-05, "loss": 0.1426, "step": 330 }, { "epoch": 22.97, "eval_accuracy": 0.8870967741935484, "eval_loss": 0.44246000051498413, "eval_runtime": 1.939, "eval_samples_per_second": 31.976, "eval_steps_per_second": 2.063, "step": 333 }, { "epoch": 23.45, "learning_rate": 6.547619047619047e-05, "loss": 0.1251, "step": 340 }, { "epoch": 24.0, "eval_accuracy": 0.9032258064516129, "eval_loss": 0.3373814523220062, "eval_runtime": 1.9735, "eval_samples_per_second": 31.416, "eval_steps_per_second": 2.027, "step": 348 }, { "epoch": 24.14, "learning_rate": 6.25e-05, "loss": 0.1329, "step": 350 }, { "epoch": 24.83, "learning_rate": 5.952380952380952e-05, "loss": 0.1326, "step": 360 }, { "epoch": 24.97, "eval_accuracy": 0.8870967741935484, "eval_loss": 0.3627336323261261, "eval_runtime": 1.951, "eval_samples_per_second": 31.779, "eval_steps_per_second": 2.05, "step": 362 }, { "epoch": 25.52, "learning_rate": 5.654761904761904e-05, "loss": 0.1271, "step": 370 }, { "epoch": 26.0, "eval_accuracy": 0.8709677419354839, "eval_loss": 0.4767535328865051, "eval_runtime": 1.977, "eval_samples_per_second": 31.361, "eval_steps_per_second": 2.023, "step": 377 }, { "epoch": 26.21, "learning_rate": 5.357142857142857e-05, "loss": 0.1415, "step": 380 }, { "epoch": 26.9, "learning_rate": 5.059523809523809e-05, "loss": 0.1835, "step": 390 }, { "epoch": 26.97, "eval_accuracy": 0.8709677419354839, "eval_loss": 0.5603958368301392, "eval_runtime": 2.171, "eval_samples_per_second": 28.558, "eval_steps_per_second": 1.842, "step": 391 }, { "epoch": 27.59, "learning_rate": 4.7619047619047614e-05, "loss": 0.1378, "step": 400 }, { "epoch": 28.0, "eval_accuracy": 0.8870967741935484, "eval_loss": 0.4130818247795105, "eval_runtime": 2.0055, "eval_samples_per_second": 30.916, "eval_steps_per_second": 1.995, "step": 406 }, { "epoch": 28.28, "learning_rate": 4.4642857142857136e-05, "loss": 0.1253, "step": 410 }, { "epoch": 28.97, "learning_rate": 4.1666666666666665e-05, "loss": 0.1349, "step": 420 }, { "epoch": 28.97, "eval_accuracy": 0.8548387096774194, "eval_loss": 0.5103474259376526, "eval_runtime": 1.9155, "eval_samples_per_second": 32.368, "eval_steps_per_second": 2.088, "step": 420 }, { "epoch": 29.66, "learning_rate": 3.8690476190476195e-05, "loss": 0.0999, "step": 430 }, { "epoch": 30.0, "eval_accuracy": 0.9193548387096774, "eval_loss": 0.37231481075286865, "eval_runtime": 1.94, "eval_samples_per_second": 31.959, "eval_steps_per_second": 2.062, "step": 435 }, { "epoch": 30.34, "learning_rate": 3.571428571428571e-05, "loss": 0.1198, "step": 440 }, { "epoch": 30.97, "eval_accuracy": 0.8709677419354839, "eval_loss": 0.5360597968101501, "eval_runtime": 1.9235, "eval_samples_per_second": 32.233, "eval_steps_per_second": 2.08, "step": 449 }, { "epoch": 31.03, "learning_rate": 3.273809523809523e-05, "loss": 0.1301, "step": 450 }, { "epoch": 31.72, "learning_rate": 2.976190476190476e-05, "loss": 0.1195, "step": 460 }, { "epoch": 32.0, "eval_accuracy": 0.8870967741935484, "eval_loss": 0.41935569047927856, "eval_runtime": 1.9165, "eval_samples_per_second": 32.351, "eval_steps_per_second": 2.087, "step": 464 }, { "epoch": 32.41, "learning_rate": 2.6785714285714284e-05, "loss": 0.0766, "step": 470 }, { "epoch": 32.97, "eval_accuracy": 0.8870967741935484, "eval_loss": 0.4133478105068207, "eval_runtime": 1.924, "eval_samples_per_second": 32.225, "eval_steps_per_second": 2.079, "step": 478 }, { "epoch": 33.1, "learning_rate": 2.3809523809523807e-05, "loss": 0.1043, "step": 480 }, { "epoch": 33.79, "learning_rate": 2.0833333333333333e-05, "loss": 0.0862, "step": 490 }, { "epoch": 34.0, "eval_accuracy": 0.9032258064516129, "eval_loss": 0.42390120029449463, "eval_runtime": 1.9, "eval_samples_per_second": 32.632, "eval_steps_per_second": 2.105, "step": 493 }, { "epoch": 34.48, "learning_rate": 1.7857142857142855e-05, "loss": 0.1048, "step": 500 }, { "epoch": 34.97, "eval_accuracy": 0.9193548387096774, "eval_loss": 0.4120253920555115, "eval_runtime": 1.954, "eval_samples_per_second": 31.73, "eval_steps_per_second": 2.047, "step": 507 }, { "epoch": 35.17, "learning_rate": 1.488095238095238e-05, "loss": 0.0884, "step": 510 }, { "epoch": 35.86, "learning_rate": 1.1904761904761903e-05, "loss": 0.0902, "step": 520 }, { "epoch": 36.0, "eval_accuracy": 0.9032258064516129, "eval_loss": 0.44083285331726074, "eval_runtime": 1.9034, "eval_samples_per_second": 32.573, "eval_steps_per_second": 2.101, "step": 522 }, { "epoch": 36.55, "learning_rate": 8.928571428571428e-06, "loss": 0.088, "step": 530 }, { "epoch": 36.97, "eval_accuracy": 0.9032258064516129, "eval_loss": 0.4435848295688629, "eval_runtime": 1.8659, "eval_samples_per_second": 33.227, "eval_steps_per_second": 2.144, "step": 536 }, { "epoch": 37.24, "learning_rate": 5.952380952380952e-06, "loss": 0.0864, "step": 540 }, { "epoch": 37.93, "learning_rate": 2.976190476190476e-06, "loss": 0.089, "step": 550 }, { "epoch": 38.0, "eval_accuracy": 0.9032258064516129, "eval_loss": 0.46484920382499695, "eval_runtime": 1.9695, "eval_samples_per_second": 31.481, "eval_steps_per_second": 2.031, "step": 551 }, { "epoch": 38.62, "learning_rate": 0.0, "loss": 0.1089, "step": 560 }, { "epoch": 38.62, "eval_accuracy": 0.8870967741935484, "eval_loss": 0.46501168608665466, "eval_runtime": 2.034, "eval_samples_per_second": 30.482, "eval_steps_per_second": 1.967, "step": 560 }, { "epoch": 38.62, "step": 560, "total_flos": 1.1660953582043136e+18, "train_loss": 0.30870234380875317, "train_runtime": 1509.6082, "train_samples_per_second": 24.589, "train_steps_per_second": 0.371 } ], "logging_steps": 10, "max_steps": 560, "num_input_tokens_seen": 0, "num_train_epochs": 40, "save_steps": 500, "total_flos": 1.1660953582043136e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }