{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.2, "eval_steps": 100, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.7168930960819125, "epoch": 0.016, "grad_norm": 73.5, "learning_rate": 1.8e-07, "loss": 15.0906, "mean_token_accuracy": 0.7680471498519182, "num_tokens": 280941.0, "step": 10 }, { "entropy": 0.741888347826898, "epoch": 0.032, "grad_norm": 78.0, "learning_rate": 3.8e-07, "loss": 15.4631, "mean_token_accuracy": 0.7632880255579948, "num_tokens": 558056.0, "step": 20 }, { "entropy": 0.7365243999287486, "epoch": 0.048, "grad_norm": 65.0, "learning_rate": 5.800000000000001e-07, "loss": 15.1017, "mean_token_accuracy": 0.768239913508296, "num_tokens": 836753.0, "step": 30 }, { "entropy": 0.7818019269034266, "epoch": 0.064, "grad_norm": 56.25, "learning_rate": 7.8e-07, "loss": 15.6995, "mean_token_accuracy": 0.7602430328726768, "num_tokens": 1126446.0, "step": 40 }, { "entropy": 0.8105136282742024, "epoch": 0.08, "grad_norm": 47.25, "learning_rate": 9.800000000000001e-07, "loss": 15.0321, "mean_token_accuracy": 0.7664048440754414, "num_tokens": 1413596.0, "step": 50 }, { "entropy": 0.8216251201927662, "epoch": 0.096, "grad_norm": 39.75, "learning_rate": 1.1800000000000001e-06, "loss": 14.6285, "mean_token_accuracy": 0.7707466218620539, "num_tokens": 1701193.0, "step": 60 }, { "entropy": 0.8549969043582678, "epoch": 0.112, "grad_norm": 39.0, "learning_rate": 1.3800000000000001e-06, "loss": 14.8679, "mean_token_accuracy": 0.7673181220889091, "num_tokens": 1979232.0, "step": 70 }, { "entropy": 0.8556341353803874, "epoch": 0.128, "grad_norm": 34.0, "learning_rate": 1.5800000000000001e-06, "loss": 14.6467, "mean_token_accuracy": 0.7710235182195901, "num_tokens": 2274177.0, "step": 80 }, { "entropy": 0.8478256281465292, "epoch": 0.144, "grad_norm": 31.0, "learning_rate": 1.7800000000000001e-06, "loss": 14.3261, "mean_token_accuracy": 0.7732654966413974, "num_tokens": 2548445.0, "step": 90 }, { "entropy": 0.8438362725079059, "epoch": 0.16, "grad_norm": 38.5, "learning_rate": 1.98e-06, "loss": 14.0318, "mean_token_accuracy": 0.7773757755756379, "num_tokens": 2824418.0, "step": 100 }, { "epoch": 0.16, "eval_biology_entropy": 1.211377203464508, "eval_biology_loss": 1.1644827127456665, "eval_biology_mean_token_accuracy": 0.7046201548576355, "eval_biology_num_tokens": 2824418.0, "eval_biology_runtime": 20.5128, "eval_biology_samples_per_second": 24.375, "eval_biology_steps_per_second": 6.094, "step": 100 }, { "epoch": 0.16, "eval_math_entropy": 0.875089626789093, "eval_math_loss": 0.8965557217597961, "eval_math_mean_token_accuracy": 0.7736486663818359, "eval_math_num_tokens": 2824418.0, "eval_math_runtime": 25.0963, "eval_math_samples_per_second": 19.923, "eval_math_steps_per_second": 4.981, "step": 100 }, { "entropy": 0.8555477414280176, "epoch": 0.176, "grad_norm": 47.75, "learning_rate": 2.1800000000000003e-06, "loss": 14.0356, "mean_token_accuracy": 0.7775060940533877, "num_tokens": 3110313.0, "step": 110 }, { "entropy": 0.8610258311033249, "epoch": 0.192, "grad_norm": 31.375, "learning_rate": 2.38e-06, "loss": 14.025, "mean_token_accuracy": 0.7762446004897356, "num_tokens": 3394170.0, "step": 120 }, { "entropy": 0.8509209487587214, "epoch": 0.208, "grad_norm": 30.0, "learning_rate": 2.5800000000000003e-06, "loss": 13.8558, "mean_token_accuracy": 0.7793015491217374, "num_tokens": 3673600.0, "step": 130 }, { "entropy": 0.8540813602507115, "epoch": 0.224, "grad_norm": 30.125, "learning_rate": 2.7800000000000005e-06, "loss": 13.8247, "mean_token_accuracy": 0.7789989039301872, "num_tokens": 3953732.0, "step": 140 }, { "entropy": 0.8311325689777732, "epoch": 0.24, "grad_norm": 30.25, "learning_rate": 2.9800000000000003e-06, "loss": 13.4688, "mean_token_accuracy": 0.7846441507339478, "num_tokens": 4243655.0, "step": 150 }, { "entropy": 0.8143093746155501, "epoch": 0.256, "grad_norm": 27.625, "learning_rate": 3.1800000000000005e-06, "loss": 13.0712, "mean_token_accuracy": 0.7881167802959681, "num_tokens": 4531471.0, "step": 160 }, { "entropy": 0.8313567344099283, "epoch": 0.272, "grad_norm": 24.0, "learning_rate": 3.3800000000000007e-06, "loss": 13.3645, "mean_token_accuracy": 0.7852793108671904, "num_tokens": 4810284.0, "step": 170 }, { "entropy": 0.8148340426385403, "epoch": 0.288, "grad_norm": 26.375, "learning_rate": 3.58e-06, "loss": 13.0548, "mean_token_accuracy": 0.7882425185292959, "num_tokens": 5095104.0, "step": 180 }, { "entropy": 0.8263534324243664, "epoch": 0.304, "grad_norm": 25.125, "learning_rate": 3.7800000000000002e-06, "loss": 13.2184, "mean_token_accuracy": 0.7860081434249878, "num_tokens": 5383732.0, "step": 190 }, { "entropy": 0.7958642322570085, "epoch": 0.32, "grad_norm": 23.75, "learning_rate": 3.980000000000001e-06, "loss": 12.7072, "mean_token_accuracy": 0.7937722463160753, "num_tokens": 5676334.0, "step": 200 }, { "epoch": 0.32, "eval_biology_entropy": 1.1982407326698303, "eval_biology_loss": 1.1808913946151733, "eval_biology_mean_token_accuracy": 0.6998598065376281, "eval_biology_num_tokens": 5676334.0, "eval_biology_runtime": 19.7103, "eval_biology_samples_per_second": 25.367, "eval_biology_steps_per_second": 6.342, "step": 200 }, { "epoch": 0.32, "eval_math_entropy": 0.8109114770889282, "eval_math_loss": 0.822318971157074, "eval_math_mean_token_accuracy": 0.7874419956207276, "eval_math_num_tokens": 5676334.0, "eval_math_runtime": 24.5627, "eval_math_samples_per_second": 20.356, "eval_math_steps_per_second": 5.089, "step": 200 }, { "entropy": 0.8209991015493869, "epoch": 0.336, "grad_norm": 23.375, "learning_rate": 4.18e-06, "loss": 13.1713, "mean_token_accuracy": 0.78699039965868, "num_tokens": 5958480.0, "step": 210 }, { "entropy": 0.7989038350060582, "epoch": 0.352, "grad_norm": 25.0, "learning_rate": 4.38e-06, "loss": 12.7482, "mean_token_accuracy": 0.7930307753384114, "num_tokens": 6242161.0, "step": 220 }, { "entropy": 0.7915343299508095, "epoch": 0.368, "grad_norm": 24.875, "learning_rate": 4.58e-06, "loss": 12.6757, "mean_token_accuracy": 0.7925275303423405, "num_tokens": 6523679.0, "step": 230 }, { "entropy": 0.7743825454264879, "epoch": 0.384, "grad_norm": 23.375, "learning_rate": 4.78e-06, "loss": 12.3704, "mean_token_accuracy": 0.79692403934896, "num_tokens": 6810978.0, "step": 240 }, { "entropy": 0.8141555316746235, "epoch": 0.4, "grad_norm": 24.125, "learning_rate": 4.980000000000001e-06, "loss": 13.0618, "mean_token_accuracy": 0.7885617177933455, "num_tokens": 7096903.0, "step": 250 }, { "entropy": 0.7800503006204963, "epoch": 0.416, "grad_norm": 22.25, "learning_rate": 5.18e-06, "loss": 12.498, "mean_token_accuracy": 0.7941101636737585, "num_tokens": 7377181.0, "step": 260 }, { "entropy": 0.7713520534336566, "epoch": 0.432, "grad_norm": 25.625, "learning_rate": 5.380000000000001e-06, "loss": 12.2429, "mean_token_accuracy": 0.798516258224845, "num_tokens": 7650523.0, "step": 270 }, { "entropy": 0.767449575662613, "epoch": 0.448, "grad_norm": 24.875, "learning_rate": 5.580000000000001e-06, "loss": 12.2843, "mean_token_accuracy": 0.7973004225641489, "num_tokens": 7936788.0, "step": 280 }, { "entropy": 0.7630951976403594, "epoch": 0.464, "grad_norm": 21.375, "learning_rate": 5.78e-06, "loss": 12.2689, "mean_token_accuracy": 0.7984702557325363, "num_tokens": 8223147.0, "step": 290 }, { "entropy": 0.7839587140828371, "epoch": 0.48, "grad_norm": 22.125, "learning_rate": 5.98e-06, "loss": 12.4483, "mean_token_accuracy": 0.7962926685810089, "num_tokens": 8506030.0, "step": 300 }, { "epoch": 0.48, "eval_biology_entropy": 1.2097046246528627, "eval_biology_loss": 1.1880755424499512, "eval_biology_mean_token_accuracy": 0.6989095420837402, "eval_biology_num_tokens": 8506030.0, "eval_biology_runtime": 19.7532, "eval_biology_samples_per_second": 25.312, "eval_biology_steps_per_second": 6.328, "step": 300 }, { "epoch": 0.48, "eval_math_entropy": 0.7799157240390777, "eval_math_loss": 0.7781939506530762, "eval_math_mean_token_accuracy": 0.7959079008102417, "eval_math_num_tokens": 8506030.0, "eval_math_runtime": 24.5905, "eval_math_samples_per_second": 20.333, "eval_math_steps_per_second": 5.083, "step": 300 }, { "entropy": 0.7540466286242008, "epoch": 0.496, "grad_norm": 24.375, "learning_rate": 6.18e-06, "loss": 12.0453, "mean_token_accuracy": 0.8007366862148046, "num_tokens": 8788726.0, "step": 310 }, { "entropy": 0.7493760107085109, "epoch": 0.512, "grad_norm": 24.0, "learning_rate": 6.380000000000001e-06, "loss": 11.9382, "mean_token_accuracy": 0.8025602623820305, "num_tokens": 9078039.0, "step": 320 }, { "entropy": 0.7366351887583733, "epoch": 0.528, "grad_norm": 23.0, "learning_rate": 6.5800000000000005e-06, "loss": 11.7509, "mean_token_accuracy": 0.8059428248554468, "num_tokens": 9373860.0, "step": 330 }, { "entropy": 0.7388057533651591, "epoch": 0.544, "grad_norm": 24.125, "learning_rate": 6.780000000000001e-06, "loss": 11.827, "mean_token_accuracy": 0.8040345013141632, "num_tokens": 9660940.0, "step": 340 }, { "entropy": 0.7601569008082152, "epoch": 0.56, "grad_norm": 26.625, "learning_rate": 6.98e-06, "loss": 12.1324, "mean_token_accuracy": 0.797855831682682, "num_tokens": 9932302.0, "step": 350 }, { "entropy": 0.7458819771185518, "epoch": 0.576, "grad_norm": 22.375, "learning_rate": 7.180000000000001e-06, "loss": 11.9451, "mean_token_accuracy": 0.8013740532100201, "num_tokens": 10215462.0, "step": 360 }, { "entropy": 0.7404385067522525, "epoch": 0.592, "grad_norm": 21.625, "learning_rate": 7.3800000000000005e-06, "loss": 11.8309, "mean_token_accuracy": 0.802900119498372, "num_tokens": 10504396.0, "step": 370 }, { "entropy": 0.7532710742205382, "epoch": 0.608, "grad_norm": 27.75, "learning_rate": 7.58e-06, "loss": 12.0281, "mean_token_accuracy": 0.801684994623065, "num_tokens": 10793126.0, "step": 380 }, { "entropy": 0.7331796364858747, "epoch": 0.624, "grad_norm": 21.25, "learning_rate": 7.78e-06, "loss": 11.697, "mean_token_accuracy": 0.8045222193002701, "num_tokens": 11081768.0, "step": 390 }, { "entropy": 0.7177777705714107, "epoch": 0.64, "grad_norm": 20.0, "learning_rate": 7.980000000000002e-06, "loss": 11.4977, "mean_token_accuracy": 0.8074667323380709, "num_tokens": 11370320.0, "step": 400 }, { "epoch": 0.64, "eval_biology_entropy": 1.199187099456787, "eval_biology_loss": 1.194938063621521, "eval_biology_mean_token_accuracy": 0.6980597639083862, "eval_biology_num_tokens": 11370320.0, "eval_biology_runtime": 19.7518, "eval_biology_samples_per_second": 25.314, "eval_biology_steps_per_second": 6.329, "step": 400 }, { "epoch": 0.64, "eval_math_entropy": 0.7331400663852692, "eval_math_loss": 0.746539831161499, "eval_math_mean_token_accuracy": 0.8015342946052552, "eval_math_num_tokens": 11370320.0, "eval_math_runtime": 24.6063, "eval_math_samples_per_second": 20.32, "eval_math_steps_per_second": 5.08, "step": 400 }, { "entropy": 0.7210552679374814, "epoch": 0.656, "grad_norm": 20.75, "learning_rate": 8.18e-06, "loss": 11.4447, "mean_token_accuracy": 0.8082952104508877, "num_tokens": 11657835.0, "step": 410 }, { "entropy": 0.7324020706117154, "epoch": 0.672, "grad_norm": 22.5, "learning_rate": 8.380000000000001e-06, "loss": 11.7039, "mean_token_accuracy": 0.8040592070668936, "num_tokens": 11949262.0, "step": 420 }, { "entropy": 0.7221599837765098, "epoch": 0.688, "grad_norm": 19.5, "learning_rate": 8.580000000000001e-06, "loss": 11.5232, "mean_token_accuracy": 0.8065517093986273, "num_tokens": 12227640.0, "step": 430 }, { "entropy": 0.7129955545067788, "epoch": 0.704, "grad_norm": 19.0, "learning_rate": 8.78e-06, "loss": 11.4179, "mean_token_accuracy": 0.8087377645075321, "num_tokens": 12516641.0, "step": 440 }, { "entropy": 0.7289297079667449, "epoch": 0.72, "grad_norm": 23.125, "learning_rate": 8.98e-06, "loss": 11.5855, "mean_token_accuracy": 0.8067171189934015, "num_tokens": 12793343.0, "step": 450 }, { "entropy": 0.7121220523491502, "epoch": 0.736, "grad_norm": 21.0, "learning_rate": 9.180000000000002e-06, "loss": 11.3422, "mean_token_accuracy": 0.8094062607735395, "num_tokens": 13077981.0, "step": 460 }, { "entropy": 0.6984126020222903, "epoch": 0.752, "grad_norm": 18.625, "learning_rate": 9.38e-06, "loss": 11.0848, "mean_token_accuracy": 0.811941733583808, "num_tokens": 13358957.0, "step": 470 }, { "entropy": 0.677340486086905, "epoch": 0.768, "grad_norm": 20.875, "learning_rate": 9.58e-06, "loss": 10.8164, "mean_token_accuracy": 0.8181491158902645, "num_tokens": 13653412.0, "step": 480 }, { "entropy": 0.717779103666544, "epoch": 0.784, "grad_norm": 20.875, "learning_rate": 9.780000000000001e-06, "loss": 11.5008, "mean_token_accuracy": 0.8082873310893774, "num_tokens": 13940856.0, "step": 490 }, { "entropy": 0.7195664433762431, "epoch": 0.8, "grad_norm": 18.625, "learning_rate": 9.980000000000001e-06, "loss": 11.4516, "mean_token_accuracy": 0.8075944270938635, "num_tokens": 14230754.0, "step": 500 }, { "epoch": 0.8, "eval_biology_entropy": 1.2116008014678956, "eval_biology_loss": 1.1978343725204468, "eval_biology_mean_token_accuracy": 0.6975936050415039, "eval_biology_num_tokens": 14230754.0, "eval_biology_runtime": 19.7814, "eval_biology_samples_per_second": 25.276, "eval_biology_steps_per_second": 6.319, "step": 500 }, { "epoch": 0.8, "eval_math_entropy": 0.7416743865013122, "eval_math_loss": 0.7205922603607178, "eval_math_mean_token_accuracy": 0.8071010875701904, "eval_math_num_tokens": 14230754.0, "eval_math_runtime": 24.6035, "eval_math_samples_per_second": 20.322, "eval_math_steps_per_second": 5.081, "step": 500 }, { "entropy": 0.7075521990656852, "epoch": 0.816, "grad_norm": 19.875, "learning_rate": 1.018e-05, "loss": 11.2548, "mean_token_accuracy": 0.8095884408801794, "num_tokens": 14519893.0, "step": 510 }, { "entropy": 0.6945433892309666, "epoch": 0.832, "grad_norm": 19.375, "learning_rate": 1.038e-05, "loss": 11.0849, "mean_token_accuracy": 0.8136709745973348, "num_tokens": 14805088.0, "step": 520 }, { "entropy": 0.7229658916592598, "epoch": 0.848, "grad_norm": 21.625, "learning_rate": 1.0580000000000002e-05, "loss": 11.4838, "mean_token_accuracy": 0.8065179891884326, "num_tokens": 15086306.0, "step": 530 }, { "entropy": 0.7076279081404209, "epoch": 0.864, "grad_norm": 18.25, "learning_rate": 1.0780000000000002e-05, "loss": 11.2928, "mean_token_accuracy": 0.8091448776423931, "num_tokens": 15370985.0, "step": 540 }, { "entropy": 0.7115106744691729, "epoch": 0.88, "grad_norm": 18.5, "learning_rate": 1.0980000000000002e-05, "loss": 11.3236, "mean_token_accuracy": 0.8087493713945151, "num_tokens": 15653836.0, "step": 550 }, { "entropy": 0.7007610065862536, "epoch": 0.896, "grad_norm": 19.25, "learning_rate": 1.1180000000000001e-05, "loss": 11.1858, "mean_token_accuracy": 0.8117571648210287, "num_tokens": 15932179.0, "step": 560 }, { "entropy": 0.689027976989746, "epoch": 0.912, "grad_norm": 19.0, "learning_rate": 1.138e-05, "loss": 11.0305, "mean_token_accuracy": 0.8121056731790304, "num_tokens": 16219842.0, "step": 570 }, { "entropy": 0.6829117186367512, "epoch": 0.928, "grad_norm": 17.75, "learning_rate": 1.1580000000000001e-05, "loss": 10.8991, "mean_token_accuracy": 0.814700061455369, "num_tokens": 16499842.0, "step": 580 }, { "entropy": 0.6976756127551198, "epoch": 0.944, "grad_norm": 33.5, "learning_rate": 1.178e-05, "loss": 11.1931, "mean_token_accuracy": 0.8115118339657783, "num_tokens": 16781882.0, "step": 590 }, { "entropy": 0.7033443799242377, "epoch": 0.96, "grad_norm": 16.75, "learning_rate": 1.198e-05, "loss": 11.1514, "mean_token_accuracy": 0.8120843637734652, "num_tokens": 17067407.0, "step": 600 }, { "epoch": 0.96, "eval_biology_entropy": 1.186503161907196, "eval_biology_loss": 1.2035058736801147, "eval_biology_mean_token_accuracy": 0.6964959187507629, "eval_biology_num_tokens": 17067407.0, "eval_biology_runtime": 20.0132, "eval_biology_samples_per_second": 24.983, "eval_biology_steps_per_second": 6.246, "step": 600 }, { "epoch": 0.96, "eval_math_entropy": 0.7170893518924714, "eval_math_loss": 0.7009586095809937, "eval_math_mean_token_accuracy": 0.8111450595855713, "eval_math_num_tokens": 17067407.0, "eval_math_runtime": 24.6137, "eval_math_samples_per_second": 20.314, "eval_math_steps_per_second": 5.078, "step": 600 }, { "entropy": 0.6821664813905954, "epoch": 0.976, "grad_norm": 22.0, "learning_rate": 1.218e-05, "loss": 10.9331, "mean_token_accuracy": 0.8155146226286888, "num_tokens": 17350994.0, "step": 610 }, { "entropy": 0.7061214720830321, "epoch": 0.992, "grad_norm": 21.375, "learning_rate": 1.2380000000000002e-05, "loss": 11.2138, "mean_token_accuracy": 0.8097808599472046, "num_tokens": 17637514.0, "step": 620 }, { "entropy": 0.688976364955306, "epoch": 1.008, "grad_norm": 18.75, "learning_rate": 1.2580000000000002e-05, "loss": 10.9803, "mean_token_accuracy": 0.8123320799320937, "num_tokens": 17926570.0, "step": 630 }, { "entropy": 0.66466862000525, "epoch": 1.024, "grad_norm": 17.625, "learning_rate": 1.2780000000000001e-05, "loss": 10.6644, "mean_token_accuracy": 0.8173153955489397, "num_tokens": 18207652.0, "step": 640 }, { "entropy": 0.6729175634682178, "epoch": 1.04, "grad_norm": 18.5, "learning_rate": 1.2980000000000001e-05, "loss": 10.5877, "mean_token_accuracy": 0.8187220424413681, "num_tokens": 18484931.0, "step": 650 }, { "entropy": 0.6688125738874078, "epoch": 1.056, "grad_norm": 30.0, "learning_rate": 1.3180000000000001e-05, "loss": 10.7513, "mean_token_accuracy": 0.8168547667562962, "num_tokens": 18773457.0, "step": 660 }, { "entropy": 0.677242561429739, "epoch": 1.072, "grad_norm": 21.0, "learning_rate": 1.3380000000000002e-05, "loss": 10.7823, "mean_token_accuracy": 0.8151846762746573, "num_tokens": 19055365.0, "step": 670 }, { "entropy": 0.6669268727302551, "epoch": 1.088, "grad_norm": 18.5, "learning_rate": 1.3580000000000002e-05, "loss": 10.6585, "mean_token_accuracy": 0.8173960983753205, "num_tokens": 19345730.0, "step": 680 }, { "entropy": 0.6672801608219743, "epoch": 1.104, "grad_norm": 16.5, "learning_rate": 1.378e-05, "loss": 10.6607, "mean_token_accuracy": 0.8167301990091801, "num_tokens": 19637390.0, "step": 690 }, { "entropy": 0.6486887495964766, "epoch": 1.12, "grad_norm": 17.375, "learning_rate": 1.398e-05, "loss": 10.3743, "mean_token_accuracy": 0.8213667117059231, "num_tokens": 19923914.0, "step": 700 }, { "epoch": 1.12, "eval_biology_entropy": 1.1550931658744812, "eval_biology_loss": 1.2134206295013428, "eval_biology_mean_token_accuracy": 0.6951225929260254, "eval_biology_num_tokens": 19923914.0, "eval_biology_runtime": 19.789, "eval_biology_samples_per_second": 25.267, "eval_biology_steps_per_second": 6.317, "step": 700 }, { "epoch": 1.12, "eval_math_entropy": 0.6779097893238067, "eval_math_loss": 0.687623143196106, "eval_math_mean_token_accuracy": 0.8132782921791076, "eval_math_num_tokens": 19923914.0, "eval_math_runtime": 24.5984, "eval_math_samples_per_second": 20.327, "eval_math_steps_per_second": 5.082, "step": 700 }, { "entropy": 0.6534717444330453, "epoch": 1.1360000000000001, "grad_norm": 18.625, "learning_rate": 1.418e-05, "loss": 10.4612, "mean_token_accuracy": 0.8200784765183926, "num_tokens": 20201892.0, "step": 710 }, { "entropy": 0.6626596201211215, "epoch": 1.152, "grad_norm": 17.375, "learning_rate": 1.4380000000000001e-05, "loss": 10.6065, "mean_token_accuracy": 0.8195517498999834, "num_tokens": 20490282.0, "step": 720 }, { "entropy": 0.6553794769570231, "epoch": 1.168, "grad_norm": 16.875, "learning_rate": 1.4580000000000001e-05, "loss": 10.4507, "mean_token_accuracy": 0.8203363090753555, "num_tokens": 20785786.0, "step": 730 }, { "entropy": 0.6596008328720927, "epoch": 1.184, "grad_norm": 16.75, "learning_rate": 1.478e-05, "loss": 10.4924, "mean_token_accuracy": 0.8198904592543841, "num_tokens": 21074205.0, "step": 740 }, { "entropy": 0.6442235017195344, "epoch": 1.2, "grad_norm": 17.75, "learning_rate": 1.498e-05, "loss": 10.3329, "mean_token_accuracy": 0.8222478657960892, "num_tokens": 21369159.0, "step": 750 }, { "entropy": 0.6584161130711437, "epoch": 1.216, "grad_norm": 17.75, "learning_rate": 1.5180000000000002e-05, "loss": 10.4952, "mean_token_accuracy": 0.8194109592586756, "num_tokens": 21649178.0, "step": 760 }, { "entropy": 0.6401060940697789, "epoch": 1.232, "grad_norm": 17.875, "learning_rate": 1.5380000000000002e-05, "loss": 10.2162, "mean_token_accuracy": 0.8237158339470625, "num_tokens": 21930239.0, "step": 770 }, { "entropy": 0.6497831366956234, "epoch": 1.248, "grad_norm": 17.375, "learning_rate": 1.5580000000000003e-05, "loss": 10.3038, "mean_token_accuracy": 0.8218781109899282, "num_tokens": 22216387.0, "step": 780 }, { "entropy": 0.6619962759315967, "epoch": 1.264, "grad_norm": 17.5, "learning_rate": 1.578e-05, "loss": 10.6292, "mean_token_accuracy": 0.8176151167601347, "num_tokens": 22501002.0, "step": 790 }, { "entropy": 0.6531268676742912, "epoch": 1.28, "grad_norm": 17.0, "learning_rate": 1.5980000000000003e-05, "loss": 10.4508, "mean_token_accuracy": 0.8208171010017395, "num_tokens": 22779682.0, "step": 800 }, { "epoch": 1.28, "eval_biology_entropy": 1.195213288784027, "eval_biology_loss": 1.2141798734664917, "eval_biology_mean_token_accuracy": 0.6942030134201049, "eval_biology_num_tokens": 22779682.0, "eval_biology_runtime": 19.7104, "eval_biology_samples_per_second": 25.367, "eval_biology_steps_per_second": 6.342, "step": 800 }, { "epoch": 1.28, "eval_math_entropy": 0.6831141312122345, "eval_math_loss": 0.6743567585945129, "eval_math_mean_token_accuracy": 0.8159358091354371, "eval_math_num_tokens": 22779682.0, "eval_math_runtime": 24.5582, "eval_math_samples_per_second": 20.36, "eval_math_steps_per_second": 5.09, "step": 800 }, { "entropy": 0.6429138701409102, "epoch": 1.296, "grad_norm": 17.125, "learning_rate": 1.618e-05, "loss": 10.2614, "mean_token_accuracy": 0.8221869930624962, "num_tokens": 23057744.0, "step": 810 }, { "entropy": 0.6462734818458558, "epoch": 1.312, "grad_norm": 20.75, "learning_rate": 1.638e-05, "loss": 10.4226, "mean_token_accuracy": 0.8203614544123411, "num_tokens": 23344644.0, "step": 820 }, { "entropy": 0.6614464454352855, "epoch": 1.328, "grad_norm": 17.75, "learning_rate": 1.658e-05, "loss": 10.5671, "mean_token_accuracy": 0.8176218140870333, "num_tokens": 23622405.0, "step": 830 }, { "entropy": 0.6431225946173071, "epoch": 1.3439999999999999, "grad_norm": 15.25, "learning_rate": 1.6780000000000002e-05, "loss": 10.2817, "mean_token_accuracy": 0.8216937210410833, "num_tokens": 23899771.0, "step": 840 }, { "entropy": 0.6422285752370953, "epoch": 1.3599999999999999, "grad_norm": 15.9375, "learning_rate": 1.698e-05, "loss": 10.2688, "mean_token_accuracy": 0.8234549313783646, "num_tokens": 24187023.0, "step": 850 }, { "entropy": 0.6847162164747715, "epoch": 1.376, "grad_norm": 17.25, "learning_rate": 1.718e-05, "loss": 10.9876, "mean_token_accuracy": 0.81151960529387, "num_tokens": 24466132.0, "step": 860 }, { "entropy": 0.6464430714026094, "epoch": 1.392, "grad_norm": 16.25, "learning_rate": 1.7380000000000003e-05, "loss": 10.3124, "mean_token_accuracy": 0.821755214035511, "num_tokens": 24748043.0, "step": 870 }, { "entropy": 0.6374656381085515, "epoch": 1.408, "grad_norm": 17.125, "learning_rate": 1.758e-05, "loss": 10.1759, "mean_token_accuracy": 0.823258052393794, "num_tokens": 25036674.0, "step": 880 }, { "entropy": 0.6316773502156139, "epoch": 1.424, "grad_norm": 16.375, "learning_rate": 1.7780000000000003e-05, "loss": 10.1508, "mean_token_accuracy": 0.8241602942347527, "num_tokens": 25324579.0, "step": 890 }, { "entropy": 0.6475198846310377, "epoch": 1.44, "grad_norm": 18.125, "learning_rate": 1.798e-05, "loss": 10.3668, "mean_token_accuracy": 0.8207426533102989, "num_tokens": 25606824.0, "step": 900 }, { "epoch": 1.44, "eval_biology_entropy": 1.1267103943824768, "eval_biology_loss": 1.223482370376587, "eval_biology_mean_token_accuracy": 0.6937152419090271, "eval_biology_num_tokens": 25606824.0, "eval_biology_runtime": 19.7672, "eval_biology_samples_per_second": 25.294, "eval_biology_steps_per_second": 6.324, "step": 900 }, { "epoch": 1.44, "eval_math_entropy": 0.65911474609375, "eval_math_loss": 0.6649472713470459, "eval_math_mean_token_accuracy": 0.8173848538398742, "eval_math_num_tokens": 25606824.0, "eval_math_runtime": 24.6125, "eval_math_samples_per_second": 20.315, "eval_math_steps_per_second": 5.079, "step": 900 }, { "entropy": 0.6390923649072647, "epoch": 1.456, "grad_norm": 16.375, "learning_rate": 1.8180000000000002e-05, "loss": 10.2031, "mean_token_accuracy": 0.8223831083625555, "num_tokens": 25886396.0, "step": 910 }, { "entropy": 0.641916100680828, "epoch": 1.472, "grad_norm": 15.4375, "learning_rate": 1.8380000000000004e-05, "loss": 10.3424, "mean_token_accuracy": 0.8202576618641615, "num_tokens": 26163618.0, "step": 920 }, { "entropy": 0.662255228124559, "epoch": 1.488, "grad_norm": 17.5, "learning_rate": 1.858e-05, "loss": 10.5833, "mean_token_accuracy": 0.8174156688153744, "num_tokens": 26438338.0, "step": 930 }, { "entropy": 0.6319910818710923, "epoch": 1.504, "grad_norm": 17.5, "learning_rate": 1.878e-05, "loss": 10.1189, "mean_token_accuracy": 0.8249218709766865, "num_tokens": 26729255.0, "step": 940 }, { "entropy": 0.6524576544761658, "epoch": 1.52, "grad_norm": 15.6875, "learning_rate": 1.898e-05, "loss": 10.3845, "mean_token_accuracy": 0.8204564996063709, "num_tokens": 27017935.0, "step": 950 }, { "entropy": 0.6368671843782068, "epoch": 1.536, "grad_norm": 15.3125, "learning_rate": 1.918e-05, "loss": 10.2034, "mean_token_accuracy": 0.8228708405047656, "num_tokens": 27306339.0, "step": 960 }, { "entropy": 0.6371303182095289, "epoch": 1.552, "grad_norm": 16.5, "learning_rate": 1.938e-05, "loss": 10.1556, "mean_token_accuracy": 0.8237581226974726, "num_tokens": 27591959.0, "step": 970 }, { "entropy": 0.6189220814034343, "epoch": 1.568, "grad_norm": 15.4375, "learning_rate": 1.9580000000000002e-05, "loss": 9.9336, "mean_token_accuracy": 0.8277939360588789, "num_tokens": 27884398.0, "step": 980 }, { "entropy": 0.6375723648816347, "epoch": 1.584, "grad_norm": 15.3125, "learning_rate": 1.978e-05, "loss": 10.116, "mean_token_accuracy": 0.824637695401907, "num_tokens": 28171274.0, "step": 990 }, { "entropy": 0.631741807423532, "epoch": 1.6, "grad_norm": 16.125, "learning_rate": 1.9980000000000002e-05, "loss": 10.1091, "mean_token_accuracy": 0.8227341767400503, "num_tokens": 28457624.0, "step": 1000 }, { "epoch": 1.6, "eval_biology_entropy": 1.133832766532898, "eval_biology_loss": 1.2268259525299072, "eval_biology_mean_token_accuracy": 0.6920726819038391, "eval_biology_num_tokens": 28457624.0, "eval_biology_runtime": 19.738, "eval_biology_samples_per_second": 25.332, "eval_biology_steps_per_second": 6.333, "step": 1000 }, { "epoch": 1.6, "eval_math_entropy": 0.6353513326644897, "eval_math_loss": 0.6570390462875366, "eval_math_mean_token_accuracy": 0.8185423817634583, "eval_math_num_tokens": 28457624.0, "eval_math_runtime": 24.6162, "eval_math_samples_per_second": 20.312, "eval_math_steps_per_second": 5.078, "step": 1000 }, { "entropy": 0.6374903971329331, "epoch": 1.616, "grad_norm": 16.125, "learning_rate": 1.9980000000000002e-05, "loss": 10.264, "mean_token_accuracy": 0.8219246376305819, "num_tokens": 28743099.0, "step": 1010 }, { "entropy": 0.6523007312789559, "epoch": 1.6320000000000001, "grad_norm": 15.375, "learning_rate": 1.995777777777778e-05, "loss": 10.326, "mean_token_accuracy": 0.8201606553047895, "num_tokens": 29017297.0, "step": 1020 }, { "entropy": 0.629386986978352, "epoch": 1.6480000000000001, "grad_norm": 14.6875, "learning_rate": 1.9935555555555557e-05, "loss": 10.0254, "mean_token_accuracy": 0.8264750462025404, "num_tokens": 29303707.0, "step": 1030 }, { "entropy": 0.6322049422189593, "epoch": 1.6640000000000001, "grad_norm": 16.625, "learning_rate": 1.9913333333333335e-05, "loss": 10.1151, "mean_token_accuracy": 0.8231775060296058, "num_tokens": 29597156.0, "step": 1040 }, { "entropy": 0.6406657313928008, "epoch": 1.6800000000000002, "grad_norm": 14.0, "learning_rate": 1.9891111111111112e-05, "loss": 10.2285, "mean_token_accuracy": 0.8236899144947529, "num_tokens": 29883879.0, "step": 1050 }, { "entropy": 0.6394492890685797, "epoch": 1.696, "grad_norm": 16.5, "learning_rate": 1.986888888888889e-05, "loss": 10.2443, "mean_token_accuracy": 0.8218765918165445, "num_tokens": 30165760.0, "step": 1060 }, { "entropy": 0.6265557751059532, "epoch": 1.712, "grad_norm": 13.6875, "learning_rate": 1.9846666666666668e-05, "loss": 10.0545, "mean_token_accuracy": 0.82537433616817, "num_tokens": 30460367.0, "step": 1070 }, { "entropy": 0.6222736675292253, "epoch": 1.728, "grad_norm": 16.375, "learning_rate": 1.9824444444444445e-05, "loss": 9.8823, "mean_token_accuracy": 0.8277810603380203, "num_tokens": 30739137.0, "step": 1080 }, { "entropy": 0.6292094394564629, "epoch": 1.744, "grad_norm": 15.375, "learning_rate": 1.9802222222222226e-05, "loss": 10.0169, "mean_token_accuracy": 0.8262683913111687, "num_tokens": 31022663.0, "step": 1090 }, { "entropy": 0.6284451805055141, "epoch": 1.76, "grad_norm": 15.9375, "learning_rate": 1.978e-05, "loss": 10.033, "mean_token_accuracy": 0.8236148204654455, "num_tokens": 31306494.0, "step": 1100 }, { "epoch": 1.76, "eval_biology_entropy": 1.1596141772270203, "eval_biology_loss": 1.2293517589569092, "eval_biology_mean_token_accuracy": 0.6917372670173645, "eval_biology_num_tokens": 31306494.0, "eval_biology_runtime": 19.7887, "eval_biology_samples_per_second": 25.267, "eval_biology_steps_per_second": 6.317, "step": 1100 }, { "epoch": 1.76, "eval_math_entropy": 0.6379002649784088, "eval_math_loss": 0.6466652154922485, "eval_math_mean_token_accuracy": 0.8209902768135071, "eval_math_num_tokens": 31306494.0, "eval_math_runtime": 24.6327, "eval_math_samples_per_second": 20.298, "eval_math_steps_per_second": 5.075, "step": 1100 }, { "entropy": 0.625352057442069, "epoch": 1.776, "grad_norm": 14.25, "learning_rate": 1.975777777777778e-05, "loss": 10.0185, "mean_token_accuracy": 0.8257606349885463, "num_tokens": 31595542.0, "step": 1110 }, { "entropy": 0.6339781129732728, "epoch": 1.792, "grad_norm": 15.0625, "learning_rate": 1.9735555555555556e-05, "loss": 10.206, "mean_token_accuracy": 0.823058757558465, "num_tokens": 31881189.0, "step": 1120 }, { "entropy": 0.6271994180977345, "epoch": 1.808, "grad_norm": 14.0625, "learning_rate": 1.9713333333333337e-05, "loss": 10.0454, "mean_token_accuracy": 0.824696258828044, "num_tokens": 32164196.0, "step": 1130 }, { "entropy": 0.6018361985683441, "epoch": 1.8239999999999998, "grad_norm": 15.25, "learning_rate": 1.969111111111111e-05, "loss": 9.6036, "mean_token_accuracy": 0.8306465744972229, "num_tokens": 32441530.0, "step": 1140 }, { "entropy": 0.6176456701010465, "epoch": 1.8399999999999999, "grad_norm": 14.75, "learning_rate": 1.9668888888888892e-05, "loss": 9.9561, "mean_token_accuracy": 0.82696249820292, "num_tokens": 32723145.0, "step": 1150 }, { "entropy": 0.6277465337887407, "epoch": 1.8559999999999999, "grad_norm": 15.1875, "learning_rate": 1.9646666666666666e-05, "loss": 10.0615, "mean_token_accuracy": 0.8258139468729496, "num_tokens": 33011263.0, "step": 1160 }, { "entropy": 0.6127156307920814, "epoch": 1.8719999999999999, "grad_norm": 15.75, "learning_rate": 1.9624444444444447e-05, "loss": 9.8454, "mean_token_accuracy": 0.8283385183662176, "num_tokens": 33298921.0, "step": 1170 }, { "entropy": 0.626422967761755, "epoch": 1.888, "grad_norm": 16.25, "learning_rate": 1.9602222222222225e-05, "loss": 10.0059, "mean_token_accuracy": 0.8256520442664623, "num_tokens": 33576243.0, "step": 1180 }, { "entropy": 0.6264065893366932, "epoch": 1.904, "grad_norm": 15.4375, "learning_rate": 1.9580000000000002e-05, "loss": 9.9977, "mean_token_accuracy": 0.8253488805145025, "num_tokens": 33850968.0, "step": 1190 }, { "entropy": 0.6095137868076563, "epoch": 1.92, "grad_norm": 14.9375, "learning_rate": 1.955777777777778e-05, "loss": 9.7316, "mean_token_accuracy": 0.8285220514982938, "num_tokens": 34128558.0, "step": 1200 }, { "epoch": 1.92, "eval_biology_entropy": 1.182666036605835, "eval_biology_loss": 1.2319380044937134, "eval_biology_mean_token_accuracy": 0.6909053907394409, "eval_biology_num_tokens": 34128558.0, "eval_biology_runtime": 19.7849, "eval_biology_samples_per_second": 25.272, "eval_biology_steps_per_second": 6.318, "step": 1200 }, { "epoch": 1.92, "eval_math_entropy": 0.649978009223938, "eval_math_loss": 0.63917076587677, "eval_math_mean_token_accuracy": 0.8233149046897889, "eval_math_num_tokens": 34128558.0, "eval_math_runtime": 24.6175, "eval_math_samples_per_second": 20.311, "eval_math_steps_per_second": 5.078, "step": 1200 }, { "entropy": 0.6194202324375511, "epoch": 1.936, "grad_norm": 16.25, "learning_rate": 1.9535555555555557e-05, "loss": 9.9383, "mean_token_accuracy": 0.8252742733806372, "num_tokens": 34408056.0, "step": 1210 }, { "entropy": 0.6192464983090759, "epoch": 1.952, "grad_norm": 14.625, "learning_rate": 1.9513333333333335e-05, "loss": 9.7913, "mean_token_accuracy": 0.8282815985381603, "num_tokens": 34684679.0, "step": 1220 }, { "entropy": 0.6408920273184776, "epoch": 1.968, "grad_norm": 16.875, "learning_rate": 1.9491111111111113e-05, "loss": 10.1921, "mean_token_accuracy": 0.8224945243448019, "num_tokens": 34971038.0, "step": 1230 }, { "entropy": 0.6087088288739324, "epoch": 1.984, "grad_norm": 14.25, "learning_rate": 1.946888888888889e-05, "loss": 9.7506, "mean_token_accuracy": 0.8292552776634693, "num_tokens": 35262281.0, "step": 1240 }, { "entropy": 0.6189011264592409, "epoch": 2.0, "grad_norm": 14.5625, "learning_rate": 1.9446666666666668e-05, "loss": 9.8391, "mean_token_accuracy": 0.8280998166650534, "num_tokens": 35560864.0, "step": 1250 }, { "entropy": 0.5718940345570445, "epoch": 2.016, "grad_norm": 17.875, "learning_rate": 1.9424444444444446e-05, "loss": 8.9755, "mean_token_accuracy": 0.8381088264286518, "num_tokens": 35846704.0, "step": 1260 }, { "entropy": 0.5528001293540001, "epoch": 2.032, "grad_norm": 17.375, "learning_rate": 1.9402222222222223e-05, "loss": 8.7959, "mean_token_accuracy": 0.8400239538401365, "num_tokens": 36128775.0, "step": 1270 }, { "entropy": 0.5431115614250303, "epoch": 2.048, "grad_norm": 16.875, "learning_rate": 1.938e-05, "loss": 8.6342, "mean_token_accuracy": 0.8440989479422569, "num_tokens": 36419504.0, "step": 1280 }, { "entropy": 0.5422856478020549, "epoch": 2.064, "grad_norm": 16.125, "learning_rate": 1.935777777777778e-05, "loss": 8.6381, "mean_token_accuracy": 0.8434138212352991, "num_tokens": 36706816.0, "step": 1290 }, { "entropy": 0.5466266760602594, "epoch": 2.08, "grad_norm": 17.25, "learning_rate": 1.9335555555555556e-05, "loss": 8.6792, "mean_token_accuracy": 0.8412496495991946, "num_tokens": 36988475.0, "step": 1300 }, { "epoch": 2.08, "eval_biology_entropy": 0.9598336253166199, "eval_biology_loss": 1.2842097282409668, "eval_biology_mean_token_accuracy": 0.6866690034866333, "eval_biology_num_tokens": 36988475.0, "eval_biology_runtime": 19.8089, "eval_biology_samples_per_second": 25.241, "eval_biology_steps_per_second": 6.31, "step": 1300 }, { "epoch": 2.08, "eval_math_entropy": 0.5634605071544647, "eval_math_loss": 0.6450303792953491, "eval_math_mean_token_accuracy": 0.823544692993164, "eval_math_num_tokens": 36988475.0, "eval_math_runtime": 24.6182, "eval_math_samples_per_second": 20.31, "eval_math_steps_per_second": 5.078, "step": 1300 }, { "entropy": 0.5461967477574945, "epoch": 2.096, "grad_norm": 16.25, "learning_rate": 1.9313333333333334e-05, "loss": 8.7832, "mean_token_accuracy": 0.8421301823109388, "num_tokens": 37270131.0, "step": 1310 }, { "entropy": 0.5379752703011036, "epoch": 2.112, "grad_norm": 17.125, "learning_rate": 1.9291111111111115e-05, "loss": 8.6125, "mean_token_accuracy": 0.8422962158918381, "num_tokens": 37563537.0, "step": 1320 }, { "entropy": 0.5374840356409549, "epoch": 2.128, "grad_norm": 16.75, "learning_rate": 1.926888888888889e-05, "loss": 8.4982, "mean_token_accuracy": 0.8446350190788507, "num_tokens": 37843959.0, "step": 1330 }, { "entropy": 0.5455164171755313, "epoch": 2.144, "grad_norm": 16.375, "learning_rate": 1.924666666666667e-05, "loss": 8.6663, "mean_token_accuracy": 0.842664523050189, "num_tokens": 38133092.0, "step": 1340 }, { "entropy": 0.5403652492910623, "epoch": 2.16, "grad_norm": 15.0625, "learning_rate": 1.9224444444444444e-05, "loss": 8.6681, "mean_token_accuracy": 0.8432158157229424, "num_tokens": 38421229.0, "step": 1350 }, { "entropy": 0.5242220051586628, "epoch": 2.176, "grad_norm": 16.875, "learning_rate": 1.9202222222222225e-05, "loss": 8.3559, "mean_token_accuracy": 0.8481345418840647, "num_tokens": 38708043.0, "step": 1360 }, { "entropy": 0.552289474569261, "epoch": 2.192, "grad_norm": 16.25, "learning_rate": 1.918e-05, "loss": 8.8236, "mean_token_accuracy": 0.8398358784615993, "num_tokens": 38996930.0, "step": 1370 }, { "entropy": 0.5456716753542423, "epoch": 2.208, "grad_norm": 18.625, "learning_rate": 1.915777777777778e-05, "loss": 8.7515, "mean_token_accuracy": 0.8416260961443186, "num_tokens": 39279481.0, "step": 1380 }, { "entropy": 0.5409996012225747, "epoch": 2.224, "grad_norm": 17.5, "learning_rate": 1.9135555555555555e-05, "loss": 8.6161, "mean_token_accuracy": 0.843621500954032, "num_tokens": 39569030.0, "step": 1390 }, { "entropy": 0.5462250377982855, "epoch": 2.24, "grad_norm": 16.5, "learning_rate": 1.9113333333333336e-05, "loss": 8.7545, "mean_token_accuracy": 0.8413930989801883, "num_tokens": 39854873.0, "step": 1400 }, { "epoch": 2.24, "eval_biology_entropy": 0.9318458199501037, "eval_biology_loss": 1.3069241046905518, "eval_biology_mean_token_accuracy": 0.6855153131484986, "eval_biology_num_tokens": 39854873.0, "eval_biology_runtime": 19.7633, "eval_biology_samples_per_second": 25.299, "eval_biology_steps_per_second": 6.325, "step": 1400 }, { "epoch": 2.24, "eval_math_entropy": 0.5731087529659271, "eval_math_loss": 0.6433758735656738, "eval_math_mean_token_accuracy": 0.8230452270507812, "eval_math_num_tokens": 39854873.0, "eval_math_runtime": 24.8539, "eval_math_samples_per_second": 20.118, "eval_math_steps_per_second": 5.029, "step": 1400 }, { "entropy": 0.543942479789257, "epoch": 2.2560000000000002, "grad_norm": 16.875, "learning_rate": 1.9091111111111113e-05, "loss": 8.6955, "mean_token_accuracy": 0.8418730091303587, "num_tokens": 40141190.0, "step": 1410 }, { "entropy": 0.5582456098869443, "epoch": 2.2720000000000002, "grad_norm": 19.625, "learning_rate": 1.906888888888889e-05, "loss": 8.8471, "mean_token_accuracy": 0.8396125495433807, "num_tokens": 40415203.0, "step": 1420 }, { "entropy": 0.5530563285574317, "epoch": 2.288, "grad_norm": 17.875, "learning_rate": 1.904666666666667e-05, "loss": 8.823, "mean_token_accuracy": 0.8394552428275347, "num_tokens": 40702393.0, "step": 1430 }, { "entropy": 0.5304178670048714, "epoch": 2.304, "grad_norm": 18.0, "learning_rate": 1.9024444444444446e-05, "loss": 8.4621, "mean_token_accuracy": 0.8458537045866251, "num_tokens": 40982775.0, "step": 1440 }, { "entropy": 0.5507002430036664, "epoch": 2.32, "grad_norm": 17.25, "learning_rate": 1.9002222222222224e-05, "loss": 8.778, "mean_token_accuracy": 0.8414905358105897, "num_tokens": 41263356.0, "step": 1450 }, { "entropy": 0.5461155388504266, "epoch": 2.336, "grad_norm": 16.375, "learning_rate": 1.898e-05, "loss": 8.7082, "mean_token_accuracy": 0.8419744338840246, "num_tokens": 41545235.0, "step": 1460 }, { "entropy": 0.5644198174588382, "epoch": 2.352, "grad_norm": 17.0, "learning_rate": 1.895777777777778e-05, "loss": 9.0111, "mean_token_accuracy": 0.837278475239873, "num_tokens": 41833417.0, "step": 1470 }, { "entropy": 0.5465062925592065, "epoch": 2.368, "grad_norm": 16.625, "learning_rate": 1.8935555555555556e-05, "loss": 8.7822, "mean_token_accuracy": 0.8409049317240715, "num_tokens": 42117030.0, "step": 1480 }, { "entropy": 0.5634627625346184, "epoch": 2.384, "grad_norm": 17.0, "learning_rate": 1.8913333333333334e-05, "loss": 8.9984, "mean_token_accuracy": 0.8382249467074872, "num_tokens": 42410990.0, "step": 1490 }, { "entropy": 0.5541804760694504, "epoch": 2.4, "grad_norm": 19.125, "learning_rate": 1.8891111111111115e-05, "loss": 8.8381, "mean_token_accuracy": 0.8405785549432039, "num_tokens": 42691890.0, "step": 1500 }, { "epoch": 2.4, "eval_biology_entropy": 0.9389902620315552, "eval_biology_loss": 1.3136844635009766, "eval_biology_mean_token_accuracy": 0.6836875596046448, "eval_biology_num_tokens": 42691890.0, "eval_biology_runtime": 19.8044, "eval_biology_samples_per_second": 25.247, "eval_biology_steps_per_second": 6.312, "step": 1500 }, { "epoch": 2.4, "eval_math_entropy": 0.5745205206871032, "eval_math_loss": 0.6360605955123901, "eval_math_mean_token_accuracy": 0.824403573513031, "eval_math_num_tokens": 42691890.0, "eval_math_runtime": 24.8829, "eval_math_samples_per_second": 20.094, "eval_math_steps_per_second": 5.024, "step": 1500 }, { "entropy": 0.5681238017976284, "epoch": 2.416, "grad_norm": 17.375, "learning_rate": 1.886888888888889e-05, "loss": 9.0358, "mean_token_accuracy": 0.8371844127774238, "num_tokens": 42971588.0, "step": 1510 }, { "entropy": 0.5386728642508387, "epoch": 2.432, "grad_norm": 17.875, "learning_rate": 1.884666666666667e-05, "loss": 8.6841, "mean_token_accuracy": 0.8422587804496289, "num_tokens": 43253821.0, "step": 1520 }, { "entropy": 0.5443267293274403, "epoch": 2.448, "grad_norm": 18.125, "learning_rate": 1.8824444444444445e-05, "loss": 8.67, "mean_token_accuracy": 0.8435158774256706, "num_tokens": 43550902.0, "step": 1530 }, { "entropy": 0.5498035730794072, "epoch": 2.464, "grad_norm": 18.0, "learning_rate": 1.8802222222222226e-05, "loss": 8.7914, "mean_token_accuracy": 0.8404768038541078, "num_tokens": 43844259.0, "step": 1540 }, { "entropy": 0.5709992805495858, "epoch": 2.48, "grad_norm": 17.5, "learning_rate": 1.878e-05, "loss": 9.1217, "mean_token_accuracy": 0.8360334102064371, "num_tokens": 44115701.0, "step": 1550 }, { "entropy": 0.5559496510773897, "epoch": 2.496, "grad_norm": 17.125, "learning_rate": 1.875777777777778e-05, "loss": 8.8877, "mean_token_accuracy": 0.8398744653910398, "num_tokens": 44405520.0, "step": 1560 }, { "entropy": 0.5509569091722369, "epoch": 2.512, "grad_norm": 16.875, "learning_rate": 1.873555555555556e-05, "loss": 8.7783, "mean_token_accuracy": 0.8410256687551737, "num_tokens": 44686477.0, "step": 1570 }, { "entropy": 0.5495854092761874, "epoch": 2.528, "grad_norm": 16.0, "learning_rate": 1.8713333333333336e-05, "loss": 8.7681, "mean_token_accuracy": 0.8412394899874925, "num_tokens": 44969760.0, "step": 1580 }, { "entropy": 0.5426954831928015, "epoch": 2.544, "grad_norm": 17.375, "learning_rate": 1.8691111111111114e-05, "loss": 8.7142, "mean_token_accuracy": 0.8419138621538877, "num_tokens": 45255326.0, "step": 1590 }, { "entropy": 0.5427656076848507, "epoch": 2.56, "grad_norm": 16.75, "learning_rate": 1.866888888888889e-05, "loss": 8.6495, "mean_token_accuracy": 0.8414557803422212, "num_tokens": 45532525.0, "step": 1600 }, { "epoch": 2.56, "eval_biology_entropy": 0.9223047132492066, "eval_biology_loss": 1.3209964036941528, "eval_biology_mean_token_accuracy": 0.6842733683586121, "eval_biology_num_tokens": 45532525.0, "eval_biology_runtime": 19.7451, "eval_biology_samples_per_second": 25.323, "eval_biology_steps_per_second": 6.331, "step": 1600 }, { "epoch": 2.56, "eval_math_entropy": 0.5608251221179962, "eval_math_loss": 0.6343366503715515, "eval_math_mean_token_accuracy": 0.8256231875419616, "eval_math_num_tokens": 45532525.0, "eval_math_runtime": 24.5926, "eval_math_samples_per_second": 20.331, "eval_math_steps_per_second": 5.083, "step": 1600 }, { "entropy": 0.5462278285995126, "epoch": 2.576, "grad_norm": 17.375, "learning_rate": 1.864666666666667e-05, "loss": 8.7304, "mean_token_accuracy": 0.8419133082032204, "num_tokens": 45817478.0, "step": 1610 }, { "entropy": 0.5377364344894886, "epoch": 2.592, "grad_norm": 16.125, "learning_rate": 1.8624444444444446e-05, "loss": 8.6128, "mean_token_accuracy": 0.8431659761816264, "num_tokens": 46109575.0, "step": 1620 }, { "entropy": 0.5603145483881236, "epoch": 2.608, "grad_norm": 17.625, "learning_rate": 1.8602222222222224e-05, "loss": 8.926, "mean_token_accuracy": 0.8384825445711612, "num_tokens": 46391461.0, "step": 1630 }, { "entropy": 0.5244756257161498, "epoch": 2.624, "grad_norm": 16.0, "learning_rate": 1.858e-05, "loss": 8.3548, "mean_token_accuracy": 0.8485719878226519, "num_tokens": 46683117.0, "step": 1640 }, { "entropy": 0.5536964586004615, "epoch": 2.64, "grad_norm": 17.375, "learning_rate": 1.855777777777778e-05, "loss": 8.8373, "mean_token_accuracy": 0.839028225839138, "num_tokens": 46965534.0, "step": 1650 }, { "entropy": 0.5332709014415741, "epoch": 2.656, "grad_norm": 16.75, "learning_rate": 1.8535555555555557e-05, "loss": 8.4657, "mean_token_accuracy": 0.844694945588708, "num_tokens": 47245852.0, "step": 1660 }, { "entropy": 0.5504178514704108, "epoch": 2.672, "grad_norm": 18.0, "learning_rate": 1.8513333333333335e-05, "loss": 8.8108, "mean_token_accuracy": 0.8397566247731447, "num_tokens": 47524916.0, "step": 1670 }, { "entropy": 0.5476151436567307, "epoch": 2.6879999999999997, "grad_norm": 15.5, "learning_rate": 1.8491111111111112e-05, "loss": 8.7263, "mean_token_accuracy": 0.8412932168692351, "num_tokens": 47807131.0, "step": 1680 }, { "entropy": 0.5529261413961649, "epoch": 2.7039999999999997, "grad_norm": 16.75, "learning_rate": 1.846888888888889e-05, "loss": 8.8394, "mean_token_accuracy": 0.838797665014863, "num_tokens": 48099654.0, "step": 1690 }, { "entropy": 0.5456200305372476, "epoch": 2.7199999999999998, "grad_norm": 17.625, "learning_rate": 1.8446666666666667e-05, "loss": 8.6682, "mean_token_accuracy": 0.841873237863183, "num_tokens": 48375019.0, "step": 1700 }, { "epoch": 2.7199999999999998, "eval_biology_entropy": 0.9218110795021057, "eval_biology_loss": 1.3180720806121826, "eval_biology_mean_token_accuracy": 0.684489251613617, "eval_biology_num_tokens": 48375019.0, "eval_biology_runtime": 19.7671, "eval_biology_samples_per_second": 25.295, "eval_biology_steps_per_second": 6.324, "step": 1700 }, { "epoch": 2.7199999999999998, "eval_math_entropy": 0.5653176684379577, "eval_math_loss": 0.6293387413024902, "eval_math_mean_token_accuracy": 0.8265204019546509, "eval_math_num_tokens": 48375019.0, "eval_math_runtime": 24.6242, "eval_math_samples_per_second": 20.305, "eval_math_steps_per_second": 5.076, "step": 1700 }, { "entropy": 0.5463435992598533, "epoch": 2.7359999999999998, "grad_norm": 17.875, "learning_rate": 1.842444444444445e-05, "loss": 8.7467, "mean_token_accuracy": 0.8409269347786903, "num_tokens": 48659284.0, "step": 1710 }, { "entropy": 0.5540915697813034, "epoch": 2.752, "grad_norm": 16.625, "learning_rate": 1.8402222222222223e-05, "loss": 8.8179, "mean_token_accuracy": 0.8404988449066877, "num_tokens": 48943804.0, "step": 1720 }, { "entropy": 0.5476498136296868, "epoch": 2.768, "grad_norm": 15.75, "learning_rate": 1.8380000000000004e-05, "loss": 8.8006, "mean_token_accuracy": 0.841009271889925, "num_tokens": 49230939.0, "step": 1730 }, { "entropy": 0.5640784077346325, "epoch": 2.784, "grad_norm": 17.0, "learning_rate": 1.8357777777777778e-05, "loss": 9.005, "mean_token_accuracy": 0.8373467523604632, "num_tokens": 49504425.0, "step": 1740 }, { "entropy": 0.5430868171155453, "epoch": 2.8, "grad_norm": 17.25, "learning_rate": 1.833555555555556e-05, "loss": 8.6415, "mean_token_accuracy": 0.8431323904544115, "num_tokens": 49782661.0, "step": 1750 }, { "entropy": 0.5524541085585952, "epoch": 2.816, "grad_norm": 17.0, "learning_rate": 1.8313333333333333e-05, "loss": 8.7568, "mean_token_accuracy": 0.8402639802545309, "num_tokens": 50073632.0, "step": 1760 }, { "entropy": 0.5354580119252205, "epoch": 2.832, "grad_norm": 17.875, "learning_rate": 1.8291111111111114e-05, "loss": 8.632, "mean_token_accuracy": 0.8425567515194416, "num_tokens": 50356964.0, "step": 1770 }, { "entropy": 0.5553580898791551, "epoch": 2.848, "grad_norm": 15.625, "learning_rate": 1.8268888888888888e-05, "loss": 8.8708, "mean_token_accuracy": 0.8403212446719408, "num_tokens": 50644535.0, "step": 1780 }, { "entropy": 0.5478905290365219, "epoch": 2.864, "grad_norm": 18.375, "learning_rate": 1.824666666666667e-05, "loss": 8.7886, "mean_token_accuracy": 0.841050173342228, "num_tokens": 50925653.0, "step": 1790 }, { "entropy": 0.5450881006196141, "epoch": 2.88, "grad_norm": 16.625, "learning_rate": 1.8224444444444447e-05, "loss": 8.6642, "mean_token_accuracy": 0.84115383438766, "num_tokens": 51204374.0, "step": 1800 }, { "epoch": 2.88, "eval_biology_entropy": 0.9217254042625427, "eval_biology_loss": 1.3202892541885376, "eval_biology_mean_token_accuracy": 0.6841764874458313, "eval_biology_num_tokens": 51204374.0, "eval_biology_runtime": 19.813, "eval_biology_samples_per_second": 25.236, "eval_biology_steps_per_second": 6.309, "step": 1800 }, { "epoch": 2.88, "eval_math_entropy": 0.5681756961345673, "eval_math_loss": 0.6243875026702881, "eval_math_mean_token_accuracy": 0.8276074986457824, "eval_math_num_tokens": 51204374.0, "eval_math_runtime": 24.6389, "eval_math_samples_per_second": 20.293, "eval_math_steps_per_second": 5.073, "step": 1800 }, { "entropy": 0.5379522321745753, "epoch": 2.896, "grad_norm": 17.75, "learning_rate": 1.8202222222222225e-05, "loss": 8.6271, "mean_token_accuracy": 0.8437154974788428, "num_tokens": 51483944.0, "step": 1810 }, { "entropy": 0.5471471425145864, "epoch": 2.912, "grad_norm": 17.0, "learning_rate": 1.8180000000000002e-05, "loss": 8.6769, "mean_token_accuracy": 0.8418111637234688, "num_tokens": 51765755.0, "step": 1820 }, { "entropy": 0.5508731028065086, "epoch": 2.928, "grad_norm": 17.0, "learning_rate": 1.815777777777778e-05, "loss": 8.7862, "mean_token_accuracy": 0.8419726848602295, "num_tokens": 52056379.0, "step": 1830 }, { "entropy": 0.5354436157271266, "epoch": 2.944, "grad_norm": 16.375, "learning_rate": 1.8135555555555557e-05, "loss": 8.5928, "mean_token_accuracy": 0.8430151861160994, "num_tokens": 52346232.0, "step": 1840 }, { "entropy": 0.5504492402076722, "epoch": 2.96, "grad_norm": 17.5, "learning_rate": 1.8113333333333335e-05, "loss": 8.7501, "mean_token_accuracy": 0.8406570095568895, "num_tokens": 52633789.0, "step": 1850 }, { "entropy": 0.5475983273237943, "epoch": 2.976, "grad_norm": 15.25, "learning_rate": 1.8091111111111113e-05, "loss": 8.714, "mean_token_accuracy": 0.8408999726176262, "num_tokens": 52911755.0, "step": 1860 }, { "entropy": 0.5406377092003822, "epoch": 2.992, "grad_norm": 16.0, "learning_rate": 1.806888888888889e-05, "loss": 8.6328, "mean_token_accuracy": 0.8425012800842524, "num_tokens": 53198176.0, "step": 1870 }, { "entropy": 0.5113964939489961, "epoch": 3.008, "grad_norm": 22.875, "learning_rate": 1.8046666666666668e-05, "loss": 7.822, "mean_token_accuracy": 0.8548128705471754, "num_tokens": 53481893.0, "step": 1880 }, { "entropy": 0.4152779897674918, "epoch": 3.024, "grad_norm": 22.0, "learning_rate": 1.8024444444444445e-05, "loss": 6.7546, "mean_token_accuracy": 0.8709000959992409, "num_tokens": 53771717.0, "step": 1890 }, { "entropy": 0.40918179890140893, "epoch": 3.04, "grad_norm": 22.75, "learning_rate": 1.8002222222222223e-05, "loss": 6.5585, "mean_token_accuracy": 0.8754206687211991, "num_tokens": 54058045.0, "step": 1900 }, { "epoch": 3.04, "eval_biology_entropy": 0.6926028978824615, "eval_biology_loss": 1.5856647491455078, "eval_biology_mean_token_accuracy": 0.6686906161308288, "eval_biology_num_tokens": 54058045.0, "eval_biology_runtime": 19.7886, "eval_biology_samples_per_second": 25.267, "eval_biology_steps_per_second": 6.317, "step": 1900 }, { "epoch": 3.04, "eval_math_entropy": 0.46198054814338685, "eval_math_loss": 0.6858065724372864, "eval_math_mean_token_accuracy": 0.822696931362152, "eval_math_num_tokens": 54058045.0, "eval_math_runtime": 24.6254, "eval_math_samples_per_second": 20.304, "eval_math_steps_per_second": 5.076, "step": 1900 }, { "entropy": 0.40489907208830117, "epoch": 3.056, "grad_norm": 25.25, "learning_rate": 1.798e-05, "loss": 6.3683, "mean_token_accuracy": 0.8761902552098035, "num_tokens": 54334332.0, "step": 1910 }, { "entropy": 0.40934212449938057, "epoch": 3.072, "grad_norm": 23.5, "learning_rate": 1.7957777777777778e-05, "loss": 6.4642, "mean_token_accuracy": 0.8751021821051836, "num_tokens": 54624543.0, "step": 1920 }, { "entropy": 0.39203624669462445, "epoch": 3.088, "grad_norm": 25.375, "learning_rate": 1.7935555555555556e-05, "loss": 6.2116, "mean_token_accuracy": 0.8796712458133698, "num_tokens": 54907550.0, "step": 1930 }, { "entropy": 0.40762526309117675, "epoch": 3.104, "grad_norm": 23.375, "learning_rate": 1.7913333333333337e-05, "loss": 6.4546, "mean_token_accuracy": 0.8756711948662996, "num_tokens": 55190959.0, "step": 1940 }, { "entropy": 0.39818487148731946, "epoch": 3.12, "grad_norm": 24.0, "learning_rate": 1.789111111111111e-05, "loss": 6.3256, "mean_token_accuracy": 0.8780492424964905, "num_tokens": 55481635.0, "step": 1950 }, { "entropy": 0.4032851942814887, "epoch": 3.136, "grad_norm": 24.5, "learning_rate": 1.7868888888888892e-05, "loss": 6.4233, "mean_token_accuracy": 0.8760235741734504, "num_tokens": 55769010.0, "step": 1960 }, { "entropy": 0.40782611249014733, "epoch": 3.152, "grad_norm": 23.25, "learning_rate": 1.7846666666666666e-05, "loss": 6.4685, "mean_token_accuracy": 0.8753455895930529, "num_tokens": 56053160.0, "step": 1970 }, { "entropy": 0.41834324020892383, "epoch": 3.168, "grad_norm": 24.75, "learning_rate": 1.7824444444444447e-05, "loss": 6.6597, "mean_token_accuracy": 0.8723560575395822, "num_tokens": 56337066.0, "step": 1980 }, { "entropy": 0.4165022653527558, "epoch": 3.184, "grad_norm": 27.375, "learning_rate": 1.780222222222222e-05, "loss": 6.5528, "mean_token_accuracy": 0.8731590420007705, "num_tokens": 56618899.0, "step": 1990 }, { "entropy": 0.40481978207826613, "epoch": 3.2, "grad_norm": 22.625, "learning_rate": 1.7780000000000003e-05, "loss": 6.4003, "mean_token_accuracy": 0.8771062396466732, "num_tokens": 56910071.0, "step": 2000 }, { "epoch": 3.2, "eval_biology_entropy": 0.6016733210086822, "eval_biology_loss": 1.7224782705307007, "eval_biology_mean_token_accuracy": 0.6672822990417481, "eval_biology_num_tokens": 56910071.0, "eval_biology_runtime": 19.7656, "eval_biology_samples_per_second": 25.297, "eval_biology_steps_per_second": 6.324, "step": 2000 }, { "epoch": 3.2, "eval_math_entropy": 0.4420904459953308, "eval_math_loss": 0.6964770555496216, "eval_math_mean_token_accuracy": 0.8225251660346985, "eval_math_num_tokens": 56910071.0, "eval_math_runtime": 24.578, "eval_math_samples_per_second": 20.343, "eval_math_steps_per_second": 5.086, "step": 2000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 16, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.791878293573609e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }