{ "best_metric": 0.616822429906542, "best_model_checkpoint": "SW2-RHS-DA\\checkpoint-292", "epoch": 39.111111111111114, "eval_steps": 500, "global_step": 880, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.44, "learning_rate": 3.954545454545455e-05, "loss": 1.6539, "step": 10 }, { "epoch": 0.89, "learning_rate": 3.909090909090909e-05, "loss": 1.302, "step": 20 }, { "epoch": 0.98, "eval_accuracy": 0.411214953271028, "eval_loss": 2.847358226776123, "eval_runtime": 2.4491, "eval_samples_per_second": 43.69, "eval_steps_per_second": 2.858, "step": 22 }, { "epoch": 1.33, "learning_rate": 3.863636363636364e-05, "loss": 0.8548, "step": 30 }, { "epoch": 1.78, "learning_rate": 3.818181818181819e-05, "loss": 0.6224, "step": 40 }, { "epoch": 2.0, "eval_accuracy": 0.411214953271028, "eval_loss": 1.4649972915649414, "eval_runtime": 2.5381, "eval_samples_per_second": 42.157, "eval_steps_per_second": 2.758, "step": 45 }, { "epoch": 2.22, "learning_rate": 3.772727272727273e-05, "loss": 0.4983, "step": 50 }, { "epoch": 2.67, "learning_rate": 3.7272727272727276e-05, "loss": 0.3905, "step": 60 }, { "epoch": 2.98, "eval_accuracy": 0.411214953271028, "eval_loss": 1.486470103263855, "eval_runtime": 2.5569, "eval_samples_per_second": 41.847, "eval_steps_per_second": 2.738, "step": 67 }, { "epoch": 3.11, "learning_rate": 3.681818181818182e-05, "loss": 0.3143, "step": 70 }, { "epoch": 3.56, "learning_rate": 3.6363636363636364e-05, "loss": 0.2038, "step": 80 }, { "epoch": 4.0, "learning_rate": 3.590909090909091e-05, "loss": 0.1416, "step": 90 }, { "epoch": 4.0, "eval_accuracy": 0.5981308411214953, "eval_loss": 0.9452531933784485, "eval_runtime": 2.8177, "eval_samples_per_second": 37.974, "eval_steps_per_second": 2.484, "step": 90 }, { "epoch": 4.44, "learning_rate": 3.545454545454546e-05, "loss": 0.1267, "step": 100 }, { "epoch": 4.89, "learning_rate": 3.5000000000000004e-05, "loss": 0.1116, "step": 110 }, { "epoch": 4.98, "eval_accuracy": 0.5514018691588785, "eval_loss": 0.9800727963447571, "eval_runtime": 2.4796, "eval_samples_per_second": 43.152, "eval_steps_per_second": 2.823, "step": 112 }, { "epoch": 5.33, "learning_rate": 3.454545454545455e-05, "loss": 0.1516, "step": 120 }, { "epoch": 5.78, "learning_rate": 3.409090909090909e-05, "loss": 0.0866, "step": 130 }, { "epoch": 6.0, "eval_accuracy": 0.6074766355140186, "eval_loss": 1.520142912864685, "eval_runtime": 2.5261, "eval_samples_per_second": 42.358, "eval_steps_per_second": 2.771, "step": 135 }, { "epoch": 6.22, "learning_rate": 3.363636363636364e-05, "loss": 0.1082, "step": 140 }, { "epoch": 6.67, "learning_rate": 3.318181818181819e-05, "loss": 0.0579, "step": 150 }, { "epoch": 6.98, "eval_accuracy": 0.5981308411214953, "eval_loss": 1.723357915878296, "eval_runtime": 2.4956, "eval_samples_per_second": 42.876, "eval_steps_per_second": 2.805, "step": 157 }, { "epoch": 7.11, "learning_rate": 3.272727272727273e-05, "loss": 0.0679, "step": 160 }, { "epoch": 7.56, "learning_rate": 3.2272727272727276e-05, "loss": 0.1047, "step": 170 }, { "epoch": 8.0, "learning_rate": 3.181818181818182e-05, "loss": 0.0667, "step": 180 }, { "epoch": 8.0, "eval_accuracy": 0.5981308411214953, "eval_loss": 1.9649068117141724, "eval_runtime": 2.4861, "eval_samples_per_second": 43.04, "eval_steps_per_second": 2.816, "step": 180 }, { "epoch": 8.44, "learning_rate": 3.1363636363636365e-05, "loss": 0.0661, "step": 190 }, { "epoch": 8.89, "learning_rate": 3.090909090909091e-05, "loss": 0.0664, "step": 200 }, { "epoch": 8.98, "eval_accuracy": 0.5981308411214953, "eval_loss": 1.9504597187042236, "eval_runtime": 2.4306, "eval_samples_per_second": 44.022, "eval_steps_per_second": 2.88, "step": 202 }, { "epoch": 9.33, "learning_rate": 3.0454545454545456e-05, "loss": 0.0647, "step": 210 }, { "epoch": 9.78, "learning_rate": 3.0000000000000004e-05, "loss": 0.0742, "step": 220 }, { "epoch": 10.0, "eval_accuracy": 0.5981308411214953, "eval_loss": 1.9448089599609375, "eval_runtime": 2.4836, "eval_samples_per_second": 43.083, "eval_steps_per_second": 2.819, "step": 225 }, { "epoch": 10.22, "learning_rate": 2.954545454545455e-05, "loss": 0.0588, "step": 230 }, { "epoch": 10.67, "learning_rate": 2.9090909090909093e-05, "loss": 0.0558, "step": 240 }, { "epoch": 10.98, "eval_accuracy": 0.5981308411214953, "eval_loss": 1.9545286893844604, "eval_runtime": 2.5206, "eval_samples_per_second": 42.45, "eval_steps_per_second": 2.777, "step": 247 }, { "epoch": 11.11, "learning_rate": 2.863636363636364e-05, "loss": 0.0699, "step": 250 }, { "epoch": 11.56, "learning_rate": 2.8181818181818185e-05, "loss": 0.0564, "step": 260 }, { "epoch": 12.0, "learning_rate": 2.7727272727272732e-05, "loss": 0.0475, "step": 270 }, { "epoch": 12.0, "eval_accuracy": 0.5887850467289719, "eval_loss": 2.1516401767730713, "eval_runtime": 2.4679, "eval_samples_per_second": 43.356, "eval_steps_per_second": 2.836, "step": 270 }, { "epoch": 12.44, "learning_rate": 2.7272727272727273e-05, "loss": 0.0408, "step": 280 }, { "epoch": 12.89, "learning_rate": 2.6818181818181817e-05, "loss": 0.114, "step": 290 }, { "epoch": 12.98, "eval_accuracy": 0.616822429906542, "eval_loss": 2.1001620292663574, "eval_runtime": 2.4814, "eval_samples_per_second": 43.122, "eval_steps_per_second": 2.821, "step": 292 }, { "epoch": 13.33, "learning_rate": 2.6363636363636365e-05, "loss": 0.0592, "step": 300 }, { "epoch": 13.78, "learning_rate": 2.590909090909091e-05, "loss": 0.051, "step": 310 }, { "epoch": 14.0, "eval_accuracy": 0.5981308411214953, "eval_loss": 2.2643392086029053, "eval_runtime": 2.4411, "eval_samples_per_second": 43.833, "eval_steps_per_second": 2.868, "step": 315 }, { "epoch": 14.22, "learning_rate": 2.5454545454545457e-05, "loss": 0.0575, "step": 320 }, { "epoch": 14.67, "learning_rate": 2.5e-05, "loss": 0.0318, "step": 330 }, { "epoch": 14.98, "eval_accuracy": 0.5981308411214953, "eval_loss": 2.346831798553467, "eval_runtime": 2.4771, "eval_samples_per_second": 43.196, "eval_steps_per_second": 2.826, "step": 337 }, { "epoch": 15.11, "learning_rate": 2.454545454545455e-05, "loss": 0.0273, "step": 340 }, { "epoch": 15.56, "learning_rate": 2.4090909090909093e-05, "loss": 0.0232, "step": 350 }, { "epoch": 16.0, "learning_rate": 2.363636363636364e-05, "loss": 0.0673, "step": 360 }, { "epoch": 16.0, "eval_accuracy": 0.6074766355140186, "eval_loss": 2.334057092666626, "eval_runtime": 2.5831, "eval_samples_per_second": 41.422, "eval_steps_per_second": 2.71, "step": 360 }, { "epoch": 16.44, "learning_rate": 2.3181818181818185e-05, "loss": 0.0402, "step": 370 }, { "epoch": 16.89, "learning_rate": 2.2727272727272733e-05, "loss": 0.0566, "step": 380 }, { "epoch": 16.98, "eval_accuracy": 0.6074766355140186, "eval_loss": 2.319139003753662, "eval_runtime": 2.4296, "eval_samples_per_second": 44.04, "eval_steps_per_second": 2.881, "step": 382 }, { "epoch": 17.33, "learning_rate": 2.2272727272727274e-05, "loss": 0.0445, "step": 390 }, { "epoch": 17.78, "learning_rate": 2.1818181818181818e-05, "loss": 0.0543, "step": 400 }, { "epoch": 18.0, "eval_accuracy": 0.616822429906542, "eval_loss": 2.280747413635254, "eval_runtime": 2.5226, "eval_samples_per_second": 42.417, "eval_steps_per_second": 2.775, "step": 405 }, { "epoch": 18.22, "learning_rate": 2.1363636363636365e-05, "loss": 0.0419, "step": 410 }, { "epoch": 18.67, "learning_rate": 2.090909090909091e-05, "loss": 0.0458, "step": 420 }, { "epoch": 18.98, "eval_accuracy": 0.616822429906542, "eval_loss": 2.2148241996765137, "eval_runtime": 2.5536, "eval_samples_per_second": 41.901, "eval_steps_per_second": 2.741, "step": 427 }, { "epoch": 19.11, "learning_rate": 2.0454545454545457e-05, "loss": 0.0255, "step": 430 }, { "epoch": 19.56, "learning_rate": 2e-05, "loss": 0.0306, "step": 440 }, { "epoch": 20.0, "learning_rate": 1.9545454545454546e-05, "loss": 0.0421, "step": 450 }, { "epoch": 20.0, "eval_accuracy": 0.5887850467289719, "eval_loss": 2.5467782020568848, "eval_runtime": 2.4846, "eval_samples_per_second": 43.065, "eval_steps_per_second": 2.817, "step": 450 }, { "epoch": 20.44, "learning_rate": 1.9090909090909094e-05, "loss": 0.0509, "step": 460 }, { "epoch": 20.89, "learning_rate": 1.8636363636363638e-05, "loss": 0.0139, "step": 470 }, { "epoch": 20.98, "eval_accuracy": 0.616822429906542, "eval_loss": 2.2408316135406494, "eval_runtime": 2.5411, "eval_samples_per_second": 42.108, "eval_steps_per_second": 2.755, "step": 472 }, { "epoch": 21.33, "learning_rate": 1.8181818181818182e-05, "loss": 0.0208, "step": 480 }, { "epoch": 21.78, "learning_rate": 1.772727272727273e-05, "loss": 0.012, "step": 490 }, { "epoch": 22.0, "eval_accuracy": 0.5887850467289719, "eval_loss": 2.568880796432495, "eval_runtime": 2.4491, "eval_samples_per_second": 43.69, "eval_steps_per_second": 2.858, "step": 495 }, { "epoch": 22.22, "learning_rate": 1.7272727272727274e-05, "loss": 0.0375, "step": 500 }, { "epoch": 22.67, "learning_rate": 1.681818181818182e-05, "loss": 0.017, "step": 510 }, { "epoch": 22.98, "eval_accuracy": 0.616822429906542, "eval_loss": 2.548487901687622, "eval_runtime": 2.5416, "eval_samples_per_second": 42.1, "eval_steps_per_second": 2.754, "step": 517 }, { "epoch": 23.11, "learning_rate": 1.6363636363636366e-05, "loss": 0.0293, "step": 520 }, { "epoch": 23.56, "learning_rate": 1.590909090909091e-05, "loss": 0.0482, "step": 530 }, { "epoch": 24.0, "learning_rate": 1.5454545454545454e-05, "loss": 0.0529, "step": 540 }, { "epoch": 24.0, "eval_accuracy": 0.6074766355140186, "eval_loss": 2.674633502960205, "eval_runtime": 2.5767, "eval_samples_per_second": 41.526, "eval_steps_per_second": 2.717, "step": 540 }, { "epoch": 24.44, "learning_rate": 1.5000000000000002e-05, "loss": 0.0063, "step": 550 }, { "epoch": 24.89, "learning_rate": 1.4545454545454546e-05, "loss": 0.0414, "step": 560 }, { "epoch": 24.98, "eval_accuracy": 0.5887850467289719, "eval_loss": 2.7693288326263428, "eval_runtime": 2.6239, "eval_samples_per_second": 40.778, "eval_steps_per_second": 2.668, "step": 562 }, { "epoch": 25.33, "learning_rate": 1.4090909090909092e-05, "loss": 0.0203, "step": 570 }, { "epoch": 25.78, "learning_rate": 1.3636363636363637e-05, "loss": 0.0158, "step": 580 }, { "epoch": 26.0, "eval_accuracy": 0.5887850467289719, "eval_loss": 2.7446506023406982, "eval_runtime": 2.5651, "eval_samples_per_second": 41.714, "eval_steps_per_second": 2.729, "step": 585 }, { "epoch": 26.22, "learning_rate": 1.3181818181818183e-05, "loss": 0.0185, "step": 590 }, { "epoch": 26.67, "learning_rate": 1.2727272727272728e-05, "loss": 0.0205, "step": 600 }, { "epoch": 26.98, "eval_accuracy": 0.5887850467289719, "eval_loss": 2.8566343784332275, "eval_runtime": 2.5211, "eval_samples_per_second": 42.442, "eval_steps_per_second": 2.777, "step": 607 }, { "epoch": 27.11, "learning_rate": 1.2272727272727274e-05, "loss": 0.04, "step": 610 }, { "epoch": 27.56, "learning_rate": 1.181818181818182e-05, "loss": 0.0133, "step": 620 }, { "epoch": 28.0, "learning_rate": 1.1363636363636366e-05, "loss": 0.0205, "step": 630 }, { "epoch": 28.0, "eval_accuracy": 0.5887850467289719, "eval_loss": 2.846874713897705, "eval_runtime": 2.4371, "eval_samples_per_second": 43.905, "eval_steps_per_second": 2.872, "step": 630 }, { "epoch": 28.44, "learning_rate": 1.0909090909090909e-05, "loss": 0.0211, "step": 640 }, { "epoch": 28.89, "learning_rate": 1.0454545454545455e-05, "loss": 0.006, "step": 650 }, { "epoch": 28.98, "eval_accuracy": 0.5887850467289719, "eval_loss": 2.9508450031280518, "eval_runtime": 2.4616, "eval_samples_per_second": 43.468, "eval_steps_per_second": 2.844, "step": 652 }, { "epoch": 29.33, "learning_rate": 1e-05, "loss": 0.0198, "step": 660 }, { "epoch": 29.78, "learning_rate": 9.545454545454547e-06, "loss": 0.0061, "step": 670 }, { "epoch": 30.0, "eval_accuracy": 0.5887850467289719, "eval_loss": 3.0560247898101807, "eval_runtime": 2.6005, "eval_samples_per_second": 41.145, "eval_steps_per_second": 2.692, "step": 675 }, { "epoch": 30.22, "learning_rate": 9.090909090909091e-06, "loss": 0.0124, "step": 680 }, { "epoch": 30.67, "learning_rate": 8.636363636363637e-06, "loss": 0.0227, "step": 690 }, { "epoch": 30.98, "eval_accuracy": 0.5887850467289719, "eval_loss": 3.0431346893310547, "eval_runtime": 2.5396, "eval_samples_per_second": 42.132, "eval_steps_per_second": 2.756, "step": 697 }, { "epoch": 31.11, "learning_rate": 8.181818181818183e-06, "loss": 0.0261, "step": 700 }, { "epoch": 31.56, "learning_rate": 7.727272727272727e-06, "loss": 0.0174, "step": 710 }, { "epoch": 32.0, "learning_rate": 7.272727272727273e-06, "loss": 0.034, "step": 720 }, { "epoch": 32.0, "eval_accuracy": 0.5887850467289719, "eval_loss": 3.0496559143066406, "eval_runtime": 2.4746, "eval_samples_per_second": 43.24, "eval_steps_per_second": 2.829, "step": 720 }, { "epoch": 32.44, "learning_rate": 6.818181818181818e-06, "loss": 0.0277, "step": 730 }, { "epoch": 32.89, "learning_rate": 6.363636363636364e-06, "loss": 0.0039, "step": 740 }, { "epoch": 32.98, "eval_accuracy": 0.5887850467289719, "eval_loss": 3.0935721397399902, "eval_runtime": 2.5656, "eval_samples_per_second": 41.706, "eval_steps_per_second": 2.728, "step": 742 }, { "epoch": 33.33, "learning_rate": 5.90909090909091e-06, "loss": 0.02, "step": 750 }, { "epoch": 33.78, "learning_rate": 5.4545454545454545e-06, "loss": 0.0031, "step": 760 }, { "epoch": 34.0, "eval_accuracy": 0.5887850467289719, "eval_loss": 3.1158316135406494, "eval_runtime": 2.4251, "eval_samples_per_second": 44.123, "eval_steps_per_second": 2.887, "step": 765 }, { "epoch": 34.22, "learning_rate": 5e-06, "loss": 0.0178, "step": 770 }, { "epoch": 34.67, "learning_rate": 4.5454545454545455e-06, "loss": 0.0118, "step": 780 }, { "epoch": 34.98, "eval_accuracy": 0.5887850467289719, "eval_loss": 3.109339714050293, "eval_runtime": 2.5796, "eval_samples_per_second": 41.479, "eval_steps_per_second": 2.714, "step": 787 }, { "epoch": 35.11, "learning_rate": 4.0909090909090915e-06, "loss": 0.0061, "step": 790 }, { "epoch": 35.56, "learning_rate": 3.6363636363636366e-06, "loss": 0.018, "step": 800 }, { "epoch": 36.0, "learning_rate": 3.181818181818182e-06, "loss": 0.0045, "step": 810 }, { "epoch": 36.0, "eval_accuracy": 0.5887850467289719, "eval_loss": 3.088502883911133, "eval_runtime": 2.4691, "eval_samples_per_second": 43.336, "eval_steps_per_second": 2.835, "step": 810 }, { "epoch": 36.44, "learning_rate": 2.7272727272727272e-06, "loss": 0.0224, "step": 820 }, { "epoch": 36.89, "learning_rate": 2.2727272727272728e-06, "loss": 0.0168, "step": 830 }, { "epoch": 36.98, "eval_accuracy": 0.5887850467289719, "eval_loss": 3.0813817977905273, "eval_runtime": 2.46, "eval_samples_per_second": 43.495, "eval_steps_per_second": 2.845, "step": 832 }, { "epoch": 37.33, "learning_rate": 1.8181818181818183e-06, "loss": 0.0106, "step": 840 }, { "epoch": 37.78, "learning_rate": 1.3636363636363636e-06, "loss": 0.0071, "step": 850 }, { "epoch": 38.0, "eval_accuracy": 0.5887850467289719, "eval_loss": 3.073270320892334, "eval_runtime": 2.8765, "eval_samples_per_second": 37.198, "eval_steps_per_second": 2.433, "step": 855 }, { "epoch": 38.22, "learning_rate": 9.090909090909091e-07, "loss": 0.0123, "step": 860 }, { "epoch": 38.67, "learning_rate": 4.5454545454545457e-07, "loss": 0.0238, "step": 870 }, { "epoch": 38.98, "eval_accuracy": 0.5887850467289719, "eval_loss": 3.0825016498565674, "eval_runtime": 2.6096, "eval_samples_per_second": 41.002, "eval_steps_per_second": 2.682, "step": 877 }, { "epoch": 39.11, "learning_rate": 0.0, "loss": 0.0072, "step": 880 }, { "epoch": 39.11, "eval_accuracy": 0.5887850467289719, "eval_loss": 3.082833766937256, "eval_runtime": 2.596, "eval_samples_per_second": 41.217, "eval_steps_per_second": 2.696, "step": 880 }, { "epoch": 39.11, "step": 880, "total_flos": 1.8272580263466762e+18, "train_loss": 0.10380522197493437, "train_runtime": 1214.1801, "train_samples_per_second": 47.308, "train_steps_per_second": 0.725 } ], "logging_steps": 10, "max_steps": 880, "num_input_tokens_seen": 0, "num_train_epochs": 40, "save_steps": 500, "total_flos": 1.8272580263466762e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }