| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.2, |
| "eval_steps": 100, |
| "global_step": 2000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 0.7168930960819125, |
| "epoch": 0.016, |
| "grad_norm": 73.5, |
| "learning_rate": 1.8e-07, |
| "loss": 15.0906, |
| "mean_token_accuracy": 0.7680471498519182, |
| "num_tokens": 280941.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 0.741888347826898, |
| "epoch": 0.032, |
| "grad_norm": 78.0, |
| "learning_rate": 3.8e-07, |
| "loss": 15.4631, |
| "mean_token_accuracy": 0.7632880255579948, |
| "num_tokens": 558056.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 0.7365243999287486, |
| "epoch": 0.048, |
| "grad_norm": 65.0, |
| "learning_rate": 5.800000000000001e-07, |
| "loss": 15.1017, |
| "mean_token_accuracy": 0.768239913508296, |
| "num_tokens": 836753.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 0.7818019269034266, |
| "epoch": 0.064, |
| "grad_norm": 56.25, |
| "learning_rate": 7.8e-07, |
| "loss": 15.6995, |
| "mean_token_accuracy": 0.7602430328726768, |
| "num_tokens": 1126446.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.8105136282742024, |
| "epoch": 0.08, |
| "grad_norm": 47.25, |
| "learning_rate": 9.800000000000001e-07, |
| "loss": 15.0321, |
| "mean_token_accuracy": 0.7664048440754414, |
| "num_tokens": 1413596.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.8216251201927662, |
| "epoch": 0.096, |
| "grad_norm": 39.75, |
| "learning_rate": 1.1800000000000001e-06, |
| "loss": 14.6285, |
| "mean_token_accuracy": 0.7707466218620539, |
| "num_tokens": 1701193.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.8549969043582678, |
| "epoch": 0.112, |
| "grad_norm": 39.0, |
| "learning_rate": 1.3800000000000001e-06, |
| "loss": 14.8679, |
| "mean_token_accuracy": 0.7673181220889091, |
| "num_tokens": 1979232.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.8556341353803874, |
| "epoch": 0.128, |
| "grad_norm": 34.0, |
| "learning_rate": 1.5800000000000001e-06, |
| "loss": 14.6467, |
| "mean_token_accuracy": 0.7710235182195901, |
| "num_tokens": 2274177.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.8478256281465292, |
| "epoch": 0.144, |
| "grad_norm": 31.0, |
| "learning_rate": 1.7800000000000001e-06, |
| "loss": 14.3261, |
| "mean_token_accuracy": 0.7732654966413974, |
| "num_tokens": 2548445.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 0.8438362725079059, |
| "epoch": 0.16, |
| "grad_norm": 38.5, |
| "learning_rate": 1.98e-06, |
| "loss": 14.0318, |
| "mean_token_accuracy": 0.7773757755756379, |
| "num_tokens": 2824418.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.16, |
| "eval_biology_entropy": 1.211377203464508, |
| "eval_biology_loss": 1.1644827127456665, |
| "eval_biology_mean_token_accuracy": 0.7046201548576355, |
| "eval_biology_num_tokens": 2824418.0, |
| "eval_biology_runtime": 20.5128, |
| "eval_biology_samples_per_second": 24.375, |
| "eval_biology_steps_per_second": 6.094, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.16, |
| "eval_math_entropy": 0.875089626789093, |
| "eval_math_loss": 0.8965557217597961, |
| "eval_math_mean_token_accuracy": 0.7736486663818359, |
| "eval_math_num_tokens": 2824418.0, |
| "eval_math_runtime": 25.0963, |
| "eval_math_samples_per_second": 19.923, |
| "eval_math_steps_per_second": 4.981, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.8555477414280176, |
| "epoch": 0.176, |
| "grad_norm": 47.75, |
| "learning_rate": 2.1800000000000003e-06, |
| "loss": 14.0356, |
| "mean_token_accuracy": 0.7775060940533877, |
| "num_tokens": 3110313.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 0.8610258311033249, |
| "epoch": 0.192, |
| "grad_norm": 31.375, |
| "learning_rate": 2.38e-06, |
| "loss": 14.025, |
| "mean_token_accuracy": 0.7762446004897356, |
| "num_tokens": 3394170.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.8509209487587214, |
| "epoch": 0.208, |
| "grad_norm": 30.0, |
| "learning_rate": 2.5800000000000003e-06, |
| "loss": 13.8558, |
| "mean_token_accuracy": 0.7793015491217374, |
| "num_tokens": 3673600.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 0.8540813602507115, |
| "epoch": 0.224, |
| "grad_norm": 30.125, |
| "learning_rate": 2.7800000000000005e-06, |
| "loss": 13.8247, |
| "mean_token_accuracy": 0.7789989039301872, |
| "num_tokens": 3953732.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.8311325689777732, |
| "epoch": 0.24, |
| "grad_norm": 30.25, |
| "learning_rate": 2.9800000000000003e-06, |
| "loss": 13.4688, |
| "mean_token_accuracy": 0.7846441507339478, |
| "num_tokens": 4243655.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.8143093746155501, |
| "epoch": 0.256, |
| "grad_norm": 27.625, |
| "learning_rate": 3.1800000000000005e-06, |
| "loss": 13.0712, |
| "mean_token_accuracy": 0.7881167802959681, |
| "num_tokens": 4531471.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 0.8313567344099283, |
| "epoch": 0.272, |
| "grad_norm": 24.0, |
| "learning_rate": 3.3800000000000007e-06, |
| "loss": 13.3645, |
| "mean_token_accuracy": 0.7852793108671904, |
| "num_tokens": 4810284.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 0.8148340426385403, |
| "epoch": 0.288, |
| "grad_norm": 26.375, |
| "learning_rate": 3.58e-06, |
| "loss": 13.0548, |
| "mean_token_accuracy": 0.7882425185292959, |
| "num_tokens": 5095104.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 0.8263534324243664, |
| "epoch": 0.304, |
| "grad_norm": 25.125, |
| "learning_rate": 3.7800000000000002e-06, |
| "loss": 13.2184, |
| "mean_token_accuracy": 0.7860081434249878, |
| "num_tokens": 5383732.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 0.7958642322570085, |
| "epoch": 0.32, |
| "grad_norm": 23.75, |
| "learning_rate": 3.980000000000001e-06, |
| "loss": 12.7072, |
| "mean_token_accuracy": 0.7937722463160753, |
| "num_tokens": 5676334.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.32, |
| "eval_biology_entropy": 1.1982407326698303, |
| "eval_biology_loss": 1.1808913946151733, |
| "eval_biology_mean_token_accuracy": 0.6998598065376281, |
| "eval_biology_num_tokens": 5676334.0, |
| "eval_biology_runtime": 19.7103, |
| "eval_biology_samples_per_second": 25.367, |
| "eval_biology_steps_per_second": 6.342, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.32, |
| "eval_math_entropy": 0.8109114770889282, |
| "eval_math_loss": 0.822318971157074, |
| "eval_math_mean_token_accuracy": 0.7874419956207276, |
| "eval_math_num_tokens": 5676334.0, |
| "eval_math_runtime": 24.5627, |
| "eval_math_samples_per_second": 20.356, |
| "eval_math_steps_per_second": 5.089, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.8209991015493869, |
| "epoch": 0.336, |
| "grad_norm": 23.375, |
| "learning_rate": 4.18e-06, |
| "loss": 13.1713, |
| "mean_token_accuracy": 0.78699039965868, |
| "num_tokens": 5958480.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 0.7989038350060582, |
| "epoch": 0.352, |
| "grad_norm": 25.0, |
| "learning_rate": 4.38e-06, |
| "loss": 12.7482, |
| "mean_token_accuracy": 0.7930307753384114, |
| "num_tokens": 6242161.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 0.7915343299508095, |
| "epoch": 0.368, |
| "grad_norm": 24.875, |
| "learning_rate": 4.58e-06, |
| "loss": 12.6757, |
| "mean_token_accuracy": 0.7925275303423405, |
| "num_tokens": 6523679.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 0.7743825454264879, |
| "epoch": 0.384, |
| "grad_norm": 23.375, |
| "learning_rate": 4.78e-06, |
| "loss": 12.3704, |
| "mean_token_accuracy": 0.79692403934896, |
| "num_tokens": 6810978.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 0.8141555316746235, |
| "epoch": 0.4, |
| "grad_norm": 24.125, |
| "learning_rate": 4.980000000000001e-06, |
| "loss": 13.0618, |
| "mean_token_accuracy": 0.7885617177933455, |
| "num_tokens": 7096903.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.7800503006204963, |
| "epoch": 0.416, |
| "grad_norm": 22.25, |
| "learning_rate": 5.18e-06, |
| "loss": 12.498, |
| "mean_token_accuracy": 0.7941101636737585, |
| "num_tokens": 7377181.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 0.7713520534336566, |
| "epoch": 0.432, |
| "grad_norm": 25.625, |
| "learning_rate": 5.380000000000001e-06, |
| "loss": 12.2429, |
| "mean_token_accuracy": 0.798516258224845, |
| "num_tokens": 7650523.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 0.767449575662613, |
| "epoch": 0.448, |
| "grad_norm": 24.875, |
| "learning_rate": 5.580000000000001e-06, |
| "loss": 12.2843, |
| "mean_token_accuracy": 0.7973004225641489, |
| "num_tokens": 7936788.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 0.7630951976403594, |
| "epoch": 0.464, |
| "grad_norm": 21.375, |
| "learning_rate": 5.78e-06, |
| "loss": 12.2689, |
| "mean_token_accuracy": 0.7984702557325363, |
| "num_tokens": 8223147.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 0.7839587140828371, |
| "epoch": 0.48, |
| "grad_norm": 22.125, |
| "learning_rate": 5.98e-06, |
| "loss": 12.4483, |
| "mean_token_accuracy": 0.7962926685810089, |
| "num_tokens": 8506030.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.48, |
| "eval_biology_entropy": 1.2097046246528627, |
| "eval_biology_loss": 1.1880755424499512, |
| "eval_biology_mean_token_accuracy": 0.6989095420837402, |
| "eval_biology_num_tokens": 8506030.0, |
| "eval_biology_runtime": 19.7532, |
| "eval_biology_samples_per_second": 25.312, |
| "eval_biology_steps_per_second": 6.328, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.48, |
| "eval_math_entropy": 0.7799157240390777, |
| "eval_math_loss": 0.7781939506530762, |
| "eval_math_mean_token_accuracy": 0.7959079008102417, |
| "eval_math_num_tokens": 8506030.0, |
| "eval_math_runtime": 24.5905, |
| "eval_math_samples_per_second": 20.333, |
| "eval_math_steps_per_second": 5.083, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.7540466286242008, |
| "epoch": 0.496, |
| "grad_norm": 24.375, |
| "learning_rate": 6.18e-06, |
| "loss": 12.0453, |
| "mean_token_accuracy": 0.8007366862148046, |
| "num_tokens": 8788726.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 0.7493760107085109, |
| "epoch": 0.512, |
| "grad_norm": 24.0, |
| "learning_rate": 6.380000000000001e-06, |
| "loss": 11.9382, |
| "mean_token_accuracy": 0.8025602623820305, |
| "num_tokens": 9078039.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 0.7366351887583733, |
| "epoch": 0.528, |
| "grad_norm": 23.0, |
| "learning_rate": 6.5800000000000005e-06, |
| "loss": 11.7509, |
| "mean_token_accuracy": 0.8059428248554468, |
| "num_tokens": 9373860.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 0.7388057533651591, |
| "epoch": 0.544, |
| "grad_norm": 24.125, |
| "learning_rate": 6.780000000000001e-06, |
| "loss": 11.827, |
| "mean_token_accuracy": 0.8040345013141632, |
| "num_tokens": 9660940.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 0.7601569008082152, |
| "epoch": 0.56, |
| "grad_norm": 26.625, |
| "learning_rate": 6.98e-06, |
| "loss": 12.1324, |
| "mean_token_accuracy": 0.797855831682682, |
| "num_tokens": 9932302.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.7458819771185518, |
| "epoch": 0.576, |
| "grad_norm": 22.375, |
| "learning_rate": 7.180000000000001e-06, |
| "loss": 11.9451, |
| "mean_token_accuracy": 0.8013740532100201, |
| "num_tokens": 10215462.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 0.7404385067522525, |
| "epoch": 0.592, |
| "grad_norm": 21.625, |
| "learning_rate": 7.3800000000000005e-06, |
| "loss": 11.8309, |
| "mean_token_accuracy": 0.802900119498372, |
| "num_tokens": 10504396.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 0.7532710742205382, |
| "epoch": 0.608, |
| "grad_norm": 27.75, |
| "learning_rate": 7.58e-06, |
| "loss": 12.0281, |
| "mean_token_accuracy": 0.801684994623065, |
| "num_tokens": 10793126.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 0.7331796364858747, |
| "epoch": 0.624, |
| "grad_norm": 21.25, |
| "learning_rate": 7.78e-06, |
| "loss": 11.697, |
| "mean_token_accuracy": 0.8045222193002701, |
| "num_tokens": 11081768.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 0.7177777705714107, |
| "epoch": 0.64, |
| "grad_norm": 20.0, |
| "learning_rate": 7.980000000000002e-06, |
| "loss": 11.4977, |
| "mean_token_accuracy": 0.8074667323380709, |
| "num_tokens": 11370320.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.64, |
| "eval_biology_entropy": 1.199187099456787, |
| "eval_biology_loss": 1.194938063621521, |
| "eval_biology_mean_token_accuracy": 0.6980597639083862, |
| "eval_biology_num_tokens": 11370320.0, |
| "eval_biology_runtime": 19.7518, |
| "eval_biology_samples_per_second": 25.314, |
| "eval_biology_steps_per_second": 6.329, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.64, |
| "eval_math_entropy": 0.7331400663852692, |
| "eval_math_loss": 0.746539831161499, |
| "eval_math_mean_token_accuracy": 0.8015342946052552, |
| "eval_math_num_tokens": 11370320.0, |
| "eval_math_runtime": 24.6063, |
| "eval_math_samples_per_second": 20.32, |
| "eval_math_steps_per_second": 5.08, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.7210552679374814, |
| "epoch": 0.656, |
| "grad_norm": 20.75, |
| "learning_rate": 8.18e-06, |
| "loss": 11.4447, |
| "mean_token_accuracy": 0.8082952104508877, |
| "num_tokens": 11657835.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 0.7324020706117154, |
| "epoch": 0.672, |
| "grad_norm": 22.5, |
| "learning_rate": 8.380000000000001e-06, |
| "loss": 11.7039, |
| "mean_token_accuracy": 0.8040592070668936, |
| "num_tokens": 11949262.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 0.7221599837765098, |
| "epoch": 0.688, |
| "grad_norm": 19.5, |
| "learning_rate": 8.580000000000001e-06, |
| "loss": 11.5232, |
| "mean_token_accuracy": 0.8065517093986273, |
| "num_tokens": 12227640.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 0.7129955545067788, |
| "epoch": 0.704, |
| "grad_norm": 19.0, |
| "learning_rate": 8.78e-06, |
| "loss": 11.4179, |
| "mean_token_accuracy": 0.8087377645075321, |
| "num_tokens": 12516641.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 0.7289297079667449, |
| "epoch": 0.72, |
| "grad_norm": 23.125, |
| "learning_rate": 8.98e-06, |
| "loss": 11.5855, |
| "mean_token_accuracy": 0.8067171189934015, |
| "num_tokens": 12793343.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.7121220523491502, |
| "epoch": 0.736, |
| "grad_norm": 21.0, |
| "learning_rate": 9.180000000000002e-06, |
| "loss": 11.3422, |
| "mean_token_accuracy": 0.8094062607735395, |
| "num_tokens": 13077981.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 0.6984126020222903, |
| "epoch": 0.752, |
| "grad_norm": 18.625, |
| "learning_rate": 9.38e-06, |
| "loss": 11.0848, |
| "mean_token_accuracy": 0.811941733583808, |
| "num_tokens": 13358957.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 0.677340486086905, |
| "epoch": 0.768, |
| "grad_norm": 20.875, |
| "learning_rate": 9.58e-06, |
| "loss": 10.8164, |
| "mean_token_accuracy": 0.8181491158902645, |
| "num_tokens": 13653412.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 0.717779103666544, |
| "epoch": 0.784, |
| "grad_norm": 20.875, |
| "learning_rate": 9.780000000000001e-06, |
| "loss": 11.5008, |
| "mean_token_accuracy": 0.8082873310893774, |
| "num_tokens": 13940856.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 0.7195664433762431, |
| "epoch": 0.8, |
| "grad_norm": 18.625, |
| "learning_rate": 9.980000000000001e-06, |
| "loss": 11.4516, |
| "mean_token_accuracy": 0.8075944270938635, |
| "num_tokens": 14230754.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8, |
| "eval_biology_entropy": 1.2116008014678956, |
| "eval_biology_loss": 1.1978343725204468, |
| "eval_biology_mean_token_accuracy": 0.6975936050415039, |
| "eval_biology_num_tokens": 14230754.0, |
| "eval_biology_runtime": 19.7814, |
| "eval_biology_samples_per_second": 25.276, |
| "eval_biology_steps_per_second": 6.319, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8, |
| "eval_math_entropy": 0.7416743865013122, |
| "eval_math_loss": 0.7205922603607178, |
| "eval_math_mean_token_accuracy": 0.8071010875701904, |
| "eval_math_num_tokens": 14230754.0, |
| "eval_math_runtime": 24.6035, |
| "eval_math_samples_per_second": 20.322, |
| "eval_math_steps_per_second": 5.081, |
| "step": 500 |
| }, |
| { |
| "entropy": 0.7075521990656852, |
| "epoch": 0.816, |
| "grad_norm": 19.875, |
| "learning_rate": 1.018e-05, |
| "loss": 11.2548, |
| "mean_token_accuracy": 0.8095884408801794, |
| "num_tokens": 14519893.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 0.6945433892309666, |
| "epoch": 0.832, |
| "grad_norm": 19.375, |
| "learning_rate": 1.038e-05, |
| "loss": 11.0849, |
| "mean_token_accuracy": 0.8136709745973348, |
| "num_tokens": 14805088.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 0.7229658916592598, |
| "epoch": 0.848, |
| "grad_norm": 21.625, |
| "learning_rate": 1.0580000000000002e-05, |
| "loss": 11.4838, |
| "mean_token_accuracy": 0.8065179891884326, |
| "num_tokens": 15086306.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 0.7076279081404209, |
| "epoch": 0.864, |
| "grad_norm": 18.25, |
| "learning_rate": 1.0780000000000002e-05, |
| "loss": 11.2928, |
| "mean_token_accuracy": 0.8091448776423931, |
| "num_tokens": 15370985.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 0.7115106744691729, |
| "epoch": 0.88, |
| "grad_norm": 18.5, |
| "learning_rate": 1.0980000000000002e-05, |
| "loss": 11.3236, |
| "mean_token_accuracy": 0.8087493713945151, |
| "num_tokens": 15653836.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 0.7007610065862536, |
| "epoch": 0.896, |
| "grad_norm": 19.25, |
| "learning_rate": 1.1180000000000001e-05, |
| "loss": 11.1858, |
| "mean_token_accuracy": 0.8117571648210287, |
| "num_tokens": 15932179.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 0.689027976989746, |
| "epoch": 0.912, |
| "grad_norm": 19.0, |
| "learning_rate": 1.138e-05, |
| "loss": 11.0305, |
| "mean_token_accuracy": 0.8121056731790304, |
| "num_tokens": 16219842.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 0.6829117186367512, |
| "epoch": 0.928, |
| "grad_norm": 17.75, |
| "learning_rate": 1.1580000000000001e-05, |
| "loss": 10.8991, |
| "mean_token_accuracy": 0.814700061455369, |
| "num_tokens": 16499842.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 0.6976756127551198, |
| "epoch": 0.944, |
| "grad_norm": 33.5, |
| "learning_rate": 1.178e-05, |
| "loss": 11.1931, |
| "mean_token_accuracy": 0.8115118339657783, |
| "num_tokens": 16781882.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 0.7033443799242377, |
| "epoch": 0.96, |
| "grad_norm": 16.75, |
| "learning_rate": 1.198e-05, |
| "loss": 11.1514, |
| "mean_token_accuracy": 0.8120843637734652, |
| "num_tokens": 17067407.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.96, |
| "eval_biology_entropy": 1.186503161907196, |
| "eval_biology_loss": 1.2035058736801147, |
| "eval_biology_mean_token_accuracy": 0.6964959187507629, |
| "eval_biology_num_tokens": 17067407.0, |
| "eval_biology_runtime": 20.0132, |
| "eval_biology_samples_per_second": 24.983, |
| "eval_biology_steps_per_second": 6.246, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.96, |
| "eval_math_entropy": 0.7170893518924714, |
| "eval_math_loss": 0.7009586095809937, |
| "eval_math_mean_token_accuracy": 0.8111450595855713, |
| "eval_math_num_tokens": 17067407.0, |
| "eval_math_runtime": 24.6137, |
| "eval_math_samples_per_second": 20.314, |
| "eval_math_steps_per_second": 5.078, |
| "step": 600 |
| }, |
| { |
| "entropy": 0.6821664813905954, |
| "epoch": 0.976, |
| "grad_norm": 22.0, |
| "learning_rate": 1.218e-05, |
| "loss": 10.9331, |
| "mean_token_accuracy": 0.8155146226286888, |
| "num_tokens": 17350994.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 0.7061214720830321, |
| "epoch": 0.992, |
| "grad_norm": 21.375, |
| "learning_rate": 1.2380000000000002e-05, |
| "loss": 11.2138, |
| "mean_token_accuracy": 0.8097808599472046, |
| "num_tokens": 17637514.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 0.688976364955306, |
| "epoch": 1.008, |
| "grad_norm": 18.75, |
| "learning_rate": 1.2580000000000002e-05, |
| "loss": 10.9803, |
| "mean_token_accuracy": 0.8123320799320937, |
| "num_tokens": 17926570.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 0.66466862000525, |
| "epoch": 1.024, |
| "grad_norm": 17.625, |
| "learning_rate": 1.2780000000000001e-05, |
| "loss": 10.6644, |
| "mean_token_accuracy": 0.8173153955489397, |
| "num_tokens": 18207652.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 0.6729175634682178, |
| "epoch": 1.04, |
| "grad_norm": 18.5, |
| "learning_rate": 1.2980000000000001e-05, |
| "loss": 10.5877, |
| "mean_token_accuracy": 0.8187220424413681, |
| "num_tokens": 18484931.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 0.6688125738874078, |
| "epoch": 1.056, |
| "grad_norm": 30.0, |
| "learning_rate": 1.3180000000000001e-05, |
| "loss": 10.7513, |
| "mean_token_accuracy": 0.8168547667562962, |
| "num_tokens": 18773457.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 0.677242561429739, |
| "epoch": 1.072, |
| "grad_norm": 21.0, |
| "learning_rate": 1.3380000000000002e-05, |
| "loss": 10.7823, |
| "mean_token_accuracy": 0.8151846762746573, |
| "num_tokens": 19055365.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 0.6669268727302551, |
| "epoch": 1.088, |
| "grad_norm": 18.5, |
| "learning_rate": 1.3580000000000002e-05, |
| "loss": 10.6585, |
| "mean_token_accuracy": 0.8173960983753205, |
| "num_tokens": 19345730.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 0.6672801608219743, |
| "epoch": 1.104, |
| "grad_norm": 16.5, |
| "learning_rate": 1.378e-05, |
| "loss": 10.6607, |
| "mean_token_accuracy": 0.8167301990091801, |
| "num_tokens": 19637390.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 0.6486887495964766, |
| "epoch": 1.12, |
| "grad_norm": 17.375, |
| "learning_rate": 1.398e-05, |
| "loss": 10.3743, |
| "mean_token_accuracy": 0.8213667117059231, |
| "num_tokens": 19923914.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.12, |
| "eval_biology_entropy": 1.1550931658744812, |
| "eval_biology_loss": 1.2134206295013428, |
| "eval_biology_mean_token_accuracy": 0.6951225929260254, |
| "eval_biology_num_tokens": 19923914.0, |
| "eval_biology_runtime": 19.789, |
| "eval_biology_samples_per_second": 25.267, |
| "eval_biology_steps_per_second": 6.317, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.12, |
| "eval_math_entropy": 0.6779097893238067, |
| "eval_math_loss": 0.687623143196106, |
| "eval_math_mean_token_accuracy": 0.8132782921791076, |
| "eval_math_num_tokens": 19923914.0, |
| "eval_math_runtime": 24.5984, |
| "eval_math_samples_per_second": 20.327, |
| "eval_math_steps_per_second": 5.082, |
| "step": 700 |
| }, |
| { |
| "entropy": 0.6534717444330453, |
| "epoch": 1.1360000000000001, |
| "grad_norm": 18.625, |
| "learning_rate": 1.418e-05, |
| "loss": 10.4612, |
| "mean_token_accuracy": 0.8200784765183926, |
| "num_tokens": 20201892.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 0.6626596201211215, |
| "epoch": 1.152, |
| "grad_norm": 17.375, |
| "learning_rate": 1.4380000000000001e-05, |
| "loss": 10.6065, |
| "mean_token_accuracy": 0.8195517498999834, |
| "num_tokens": 20490282.0, |
| "step": 720 |
| }, |
| { |
| "entropy": 0.6553794769570231, |
| "epoch": 1.168, |
| "grad_norm": 16.875, |
| "learning_rate": 1.4580000000000001e-05, |
| "loss": 10.4507, |
| "mean_token_accuracy": 0.8203363090753555, |
| "num_tokens": 20785786.0, |
| "step": 730 |
| }, |
| { |
| "entropy": 0.6596008328720927, |
| "epoch": 1.184, |
| "grad_norm": 16.75, |
| "learning_rate": 1.478e-05, |
| "loss": 10.4924, |
| "mean_token_accuracy": 0.8198904592543841, |
| "num_tokens": 21074205.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 0.6442235017195344, |
| "epoch": 1.2, |
| "grad_norm": 17.75, |
| "learning_rate": 1.498e-05, |
| "loss": 10.3329, |
| "mean_token_accuracy": 0.8222478657960892, |
| "num_tokens": 21369159.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 0.6584161130711437, |
| "epoch": 1.216, |
| "grad_norm": 17.75, |
| "learning_rate": 1.5180000000000002e-05, |
| "loss": 10.4952, |
| "mean_token_accuracy": 0.8194109592586756, |
| "num_tokens": 21649178.0, |
| "step": 760 |
| }, |
| { |
| "entropy": 0.6401060940697789, |
| "epoch": 1.232, |
| "grad_norm": 17.875, |
| "learning_rate": 1.5380000000000002e-05, |
| "loss": 10.2162, |
| "mean_token_accuracy": 0.8237158339470625, |
| "num_tokens": 21930239.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 0.6497831366956234, |
| "epoch": 1.248, |
| "grad_norm": 17.375, |
| "learning_rate": 1.5580000000000003e-05, |
| "loss": 10.3038, |
| "mean_token_accuracy": 0.8218781109899282, |
| "num_tokens": 22216387.0, |
| "step": 780 |
| }, |
| { |
| "entropy": 0.6619962759315967, |
| "epoch": 1.264, |
| "grad_norm": 17.5, |
| "learning_rate": 1.578e-05, |
| "loss": 10.6292, |
| "mean_token_accuracy": 0.8176151167601347, |
| "num_tokens": 22501002.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 0.6531268676742912, |
| "epoch": 1.28, |
| "grad_norm": 17.0, |
| "learning_rate": 1.5980000000000003e-05, |
| "loss": 10.4508, |
| "mean_token_accuracy": 0.8208171010017395, |
| "num_tokens": 22779682.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.28, |
| "eval_biology_entropy": 1.195213288784027, |
| "eval_biology_loss": 1.2141798734664917, |
| "eval_biology_mean_token_accuracy": 0.6942030134201049, |
| "eval_biology_num_tokens": 22779682.0, |
| "eval_biology_runtime": 19.7104, |
| "eval_biology_samples_per_second": 25.367, |
| "eval_biology_steps_per_second": 6.342, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.28, |
| "eval_math_entropy": 0.6831141312122345, |
| "eval_math_loss": 0.6743567585945129, |
| "eval_math_mean_token_accuracy": 0.8159358091354371, |
| "eval_math_num_tokens": 22779682.0, |
| "eval_math_runtime": 24.5582, |
| "eval_math_samples_per_second": 20.36, |
| "eval_math_steps_per_second": 5.09, |
| "step": 800 |
| }, |
| { |
| "entropy": 0.6429138701409102, |
| "epoch": 1.296, |
| "grad_norm": 17.125, |
| "learning_rate": 1.618e-05, |
| "loss": 10.2614, |
| "mean_token_accuracy": 0.8221869930624962, |
| "num_tokens": 23057744.0, |
| "step": 810 |
| }, |
| { |
| "entropy": 0.6462734818458558, |
| "epoch": 1.312, |
| "grad_norm": 20.75, |
| "learning_rate": 1.638e-05, |
| "loss": 10.4226, |
| "mean_token_accuracy": 0.8203614544123411, |
| "num_tokens": 23344644.0, |
| "step": 820 |
| }, |
| { |
| "entropy": 0.6614464454352855, |
| "epoch": 1.328, |
| "grad_norm": 17.75, |
| "learning_rate": 1.658e-05, |
| "loss": 10.5671, |
| "mean_token_accuracy": 0.8176218140870333, |
| "num_tokens": 23622405.0, |
| "step": 830 |
| }, |
| { |
| "entropy": 0.6431225946173071, |
| "epoch": 1.3439999999999999, |
| "grad_norm": 15.25, |
| "learning_rate": 1.6780000000000002e-05, |
| "loss": 10.2817, |
| "mean_token_accuracy": 0.8216937210410833, |
| "num_tokens": 23899771.0, |
| "step": 840 |
| }, |
| { |
| "entropy": 0.6422285752370953, |
| "epoch": 1.3599999999999999, |
| "grad_norm": 15.9375, |
| "learning_rate": 1.698e-05, |
| "loss": 10.2688, |
| "mean_token_accuracy": 0.8234549313783646, |
| "num_tokens": 24187023.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 0.6847162164747715, |
| "epoch": 1.376, |
| "grad_norm": 17.25, |
| "learning_rate": 1.718e-05, |
| "loss": 10.9876, |
| "mean_token_accuracy": 0.81151960529387, |
| "num_tokens": 24466132.0, |
| "step": 860 |
| }, |
| { |
| "entropy": 0.6464430714026094, |
| "epoch": 1.392, |
| "grad_norm": 16.25, |
| "learning_rate": 1.7380000000000003e-05, |
| "loss": 10.3124, |
| "mean_token_accuracy": 0.821755214035511, |
| "num_tokens": 24748043.0, |
| "step": 870 |
| }, |
| { |
| "entropy": 0.6374656381085515, |
| "epoch": 1.408, |
| "grad_norm": 17.125, |
| "learning_rate": 1.758e-05, |
| "loss": 10.1759, |
| "mean_token_accuracy": 0.823258052393794, |
| "num_tokens": 25036674.0, |
| "step": 880 |
| }, |
| { |
| "entropy": 0.6316773502156139, |
| "epoch": 1.424, |
| "grad_norm": 16.375, |
| "learning_rate": 1.7780000000000003e-05, |
| "loss": 10.1508, |
| "mean_token_accuracy": 0.8241602942347527, |
| "num_tokens": 25324579.0, |
| "step": 890 |
| }, |
| { |
| "entropy": 0.6475198846310377, |
| "epoch": 1.44, |
| "grad_norm": 18.125, |
| "learning_rate": 1.798e-05, |
| "loss": 10.3668, |
| "mean_token_accuracy": 0.8207426533102989, |
| "num_tokens": 25606824.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.44, |
| "eval_biology_entropy": 1.1267103943824768, |
| "eval_biology_loss": 1.223482370376587, |
| "eval_biology_mean_token_accuracy": 0.6937152419090271, |
| "eval_biology_num_tokens": 25606824.0, |
| "eval_biology_runtime": 19.7672, |
| "eval_biology_samples_per_second": 25.294, |
| "eval_biology_steps_per_second": 6.324, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.44, |
| "eval_math_entropy": 0.65911474609375, |
| "eval_math_loss": 0.6649472713470459, |
| "eval_math_mean_token_accuracy": 0.8173848538398742, |
| "eval_math_num_tokens": 25606824.0, |
| "eval_math_runtime": 24.6125, |
| "eval_math_samples_per_second": 20.315, |
| "eval_math_steps_per_second": 5.079, |
| "step": 900 |
| }, |
| { |
| "entropy": 0.6390923649072647, |
| "epoch": 1.456, |
| "grad_norm": 16.375, |
| "learning_rate": 1.8180000000000002e-05, |
| "loss": 10.2031, |
| "mean_token_accuracy": 0.8223831083625555, |
| "num_tokens": 25886396.0, |
| "step": 910 |
| }, |
| { |
| "entropy": 0.641916100680828, |
| "epoch": 1.472, |
| "grad_norm": 15.4375, |
| "learning_rate": 1.8380000000000004e-05, |
| "loss": 10.3424, |
| "mean_token_accuracy": 0.8202576618641615, |
| "num_tokens": 26163618.0, |
| "step": 920 |
| }, |
| { |
| "entropy": 0.662255228124559, |
| "epoch": 1.488, |
| "grad_norm": 17.5, |
| "learning_rate": 1.858e-05, |
| "loss": 10.5833, |
| "mean_token_accuracy": 0.8174156688153744, |
| "num_tokens": 26438338.0, |
| "step": 930 |
| }, |
| { |
| "entropy": 0.6319910818710923, |
| "epoch": 1.504, |
| "grad_norm": 17.5, |
| "learning_rate": 1.878e-05, |
| "loss": 10.1189, |
| "mean_token_accuracy": 0.8249218709766865, |
| "num_tokens": 26729255.0, |
| "step": 940 |
| }, |
| { |
| "entropy": 0.6524576544761658, |
| "epoch": 1.52, |
| "grad_norm": 15.6875, |
| "learning_rate": 1.898e-05, |
| "loss": 10.3845, |
| "mean_token_accuracy": 0.8204564996063709, |
| "num_tokens": 27017935.0, |
| "step": 950 |
| }, |
| { |
| "entropy": 0.6368671843782068, |
| "epoch": 1.536, |
| "grad_norm": 15.3125, |
| "learning_rate": 1.918e-05, |
| "loss": 10.2034, |
| "mean_token_accuracy": 0.8228708405047656, |
| "num_tokens": 27306339.0, |
| "step": 960 |
| }, |
| { |
| "entropy": 0.6371303182095289, |
| "epoch": 1.552, |
| "grad_norm": 16.5, |
| "learning_rate": 1.938e-05, |
| "loss": 10.1556, |
| "mean_token_accuracy": 0.8237581226974726, |
| "num_tokens": 27591959.0, |
| "step": 970 |
| }, |
| { |
| "entropy": 0.6189220814034343, |
| "epoch": 1.568, |
| "grad_norm": 15.4375, |
| "learning_rate": 1.9580000000000002e-05, |
| "loss": 9.9336, |
| "mean_token_accuracy": 0.8277939360588789, |
| "num_tokens": 27884398.0, |
| "step": 980 |
| }, |
| { |
| "entropy": 0.6375723648816347, |
| "epoch": 1.584, |
| "grad_norm": 15.3125, |
| "learning_rate": 1.978e-05, |
| "loss": 10.116, |
| "mean_token_accuracy": 0.824637695401907, |
| "num_tokens": 28171274.0, |
| "step": 990 |
| }, |
| { |
| "entropy": 0.631741807423532, |
| "epoch": 1.6, |
| "grad_norm": 16.125, |
| "learning_rate": 1.9980000000000002e-05, |
| "loss": 10.1091, |
| "mean_token_accuracy": 0.8227341767400503, |
| "num_tokens": 28457624.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.6, |
| "eval_biology_entropy": 1.133832766532898, |
| "eval_biology_loss": 1.2268259525299072, |
| "eval_biology_mean_token_accuracy": 0.6920726819038391, |
| "eval_biology_num_tokens": 28457624.0, |
| "eval_biology_runtime": 19.738, |
| "eval_biology_samples_per_second": 25.332, |
| "eval_biology_steps_per_second": 6.333, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.6, |
| "eval_math_entropy": 0.6353513326644897, |
| "eval_math_loss": 0.6570390462875366, |
| "eval_math_mean_token_accuracy": 0.8185423817634583, |
| "eval_math_num_tokens": 28457624.0, |
| "eval_math_runtime": 24.6162, |
| "eval_math_samples_per_second": 20.312, |
| "eval_math_steps_per_second": 5.078, |
| "step": 1000 |
| }, |
| { |
| "entropy": 0.6374903971329331, |
| "epoch": 1.616, |
| "grad_norm": 16.125, |
| "learning_rate": 1.9980000000000002e-05, |
| "loss": 10.264, |
| "mean_token_accuracy": 0.8219246376305819, |
| "num_tokens": 28743099.0, |
| "step": 1010 |
| }, |
| { |
| "entropy": 0.6523007312789559, |
| "epoch": 1.6320000000000001, |
| "grad_norm": 15.375, |
| "learning_rate": 1.995777777777778e-05, |
| "loss": 10.326, |
| "mean_token_accuracy": 0.8201606553047895, |
| "num_tokens": 29017297.0, |
| "step": 1020 |
| }, |
| { |
| "entropy": 0.629386986978352, |
| "epoch": 1.6480000000000001, |
| "grad_norm": 14.6875, |
| "learning_rate": 1.9935555555555557e-05, |
| "loss": 10.0254, |
| "mean_token_accuracy": 0.8264750462025404, |
| "num_tokens": 29303707.0, |
| "step": 1030 |
| }, |
| { |
| "entropy": 0.6322049422189593, |
| "epoch": 1.6640000000000001, |
| "grad_norm": 16.625, |
| "learning_rate": 1.9913333333333335e-05, |
| "loss": 10.1151, |
| "mean_token_accuracy": 0.8231775060296058, |
| "num_tokens": 29597156.0, |
| "step": 1040 |
| }, |
| { |
| "entropy": 0.6406657313928008, |
| "epoch": 1.6800000000000002, |
| "grad_norm": 14.0, |
| "learning_rate": 1.9891111111111112e-05, |
| "loss": 10.2285, |
| "mean_token_accuracy": 0.8236899144947529, |
| "num_tokens": 29883879.0, |
| "step": 1050 |
| }, |
| { |
| "entropy": 0.6394492890685797, |
| "epoch": 1.696, |
| "grad_norm": 16.5, |
| "learning_rate": 1.986888888888889e-05, |
| "loss": 10.2443, |
| "mean_token_accuracy": 0.8218765918165445, |
| "num_tokens": 30165760.0, |
| "step": 1060 |
| }, |
| { |
| "entropy": 0.6265557751059532, |
| "epoch": 1.712, |
| "grad_norm": 13.6875, |
| "learning_rate": 1.9846666666666668e-05, |
| "loss": 10.0545, |
| "mean_token_accuracy": 0.82537433616817, |
| "num_tokens": 30460367.0, |
| "step": 1070 |
| }, |
| { |
| "entropy": 0.6222736675292253, |
| "epoch": 1.728, |
| "grad_norm": 16.375, |
| "learning_rate": 1.9824444444444445e-05, |
| "loss": 9.8823, |
| "mean_token_accuracy": 0.8277810603380203, |
| "num_tokens": 30739137.0, |
| "step": 1080 |
| }, |
| { |
| "entropy": 0.6292094394564629, |
| "epoch": 1.744, |
| "grad_norm": 15.375, |
| "learning_rate": 1.9802222222222226e-05, |
| "loss": 10.0169, |
| "mean_token_accuracy": 0.8262683913111687, |
| "num_tokens": 31022663.0, |
| "step": 1090 |
| }, |
| { |
| "entropy": 0.6284451805055141, |
| "epoch": 1.76, |
| "grad_norm": 15.9375, |
| "learning_rate": 1.978e-05, |
| "loss": 10.033, |
| "mean_token_accuracy": 0.8236148204654455, |
| "num_tokens": 31306494.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.76, |
| "eval_biology_entropy": 1.1596141772270203, |
| "eval_biology_loss": 1.2293517589569092, |
| "eval_biology_mean_token_accuracy": 0.6917372670173645, |
| "eval_biology_num_tokens": 31306494.0, |
| "eval_biology_runtime": 19.7887, |
| "eval_biology_samples_per_second": 25.267, |
| "eval_biology_steps_per_second": 6.317, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.76, |
| "eval_math_entropy": 0.6379002649784088, |
| "eval_math_loss": 0.6466652154922485, |
| "eval_math_mean_token_accuracy": 0.8209902768135071, |
| "eval_math_num_tokens": 31306494.0, |
| "eval_math_runtime": 24.6327, |
| "eval_math_samples_per_second": 20.298, |
| "eval_math_steps_per_second": 5.075, |
| "step": 1100 |
| }, |
| { |
| "entropy": 0.625352057442069, |
| "epoch": 1.776, |
| "grad_norm": 14.25, |
| "learning_rate": 1.975777777777778e-05, |
| "loss": 10.0185, |
| "mean_token_accuracy": 0.8257606349885463, |
| "num_tokens": 31595542.0, |
| "step": 1110 |
| }, |
| { |
| "entropy": 0.6339781129732728, |
| "epoch": 1.792, |
| "grad_norm": 15.0625, |
| "learning_rate": 1.9735555555555556e-05, |
| "loss": 10.206, |
| "mean_token_accuracy": 0.823058757558465, |
| "num_tokens": 31881189.0, |
| "step": 1120 |
| }, |
| { |
| "entropy": 0.6271994180977345, |
| "epoch": 1.808, |
| "grad_norm": 14.0625, |
| "learning_rate": 1.9713333333333337e-05, |
| "loss": 10.0454, |
| "mean_token_accuracy": 0.824696258828044, |
| "num_tokens": 32164196.0, |
| "step": 1130 |
| }, |
| { |
| "entropy": 0.6018361985683441, |
| "epoch": 1.8239999999999998, |
| "grad_norm": 15.25, |
| "learning_rate": 1.969111111111111e-05, |
| "loss": 9.6036, |
| "mean_token_accuracy": 0.8306465744972229, |
| "num_tokens": 32441530.0, |
| "step": 1140 |
| }, |
| { |
| "entropy": 0.6176456701010465, |
| "epoch": 1.8399999999999999, |
| "grad_norm": 14.75, |
| "learning_rate": 1.9668888888888892e-05, |
| "loss": 9.9561, |
| "mean_token_accuracy": 0.82696249820292, |
| "num_tokens": 32723145.0, |
| "step": 1150 |
| }, |
| { |
| "entropy": 0.6277465337887407, |
| "epoch": 1.8559999999999999, |
| "grad_norm": 15.1875, |
| "learning_rate": 1.9646666666666666e-05, |
| "loss": 10.0615, |
| "mean_token_accuracy": 0.8258139468729496, |
| "num_tokens": 33011263.0, |
| "step": 1160 |
| }, |
| { |
| "entropy": 0.6127156307920814, |
| "epoch": 1.8719999999999999, |
| "grad_norm": 15.75, |
| "learning_rate": 1.9624444444444447e-05, |
| "loss": 9.8454, |
| "mean_token_accuracy": 0.8283385183662176, |
| "num_tokens": 33298921.0, |
| "step": 1170 |
| }, |
| { |
| "entropy": 0.626422967761755, |
| "epoch": 1.888, |
| "grad_norm": 16.25, |
| "learning_rate": 1.9602222222222225e-05, |
| "loss": 10.0059, |
| "mean_token_accuracy": 0.8256520442664623, |
| "num_tokens": 33576243.0, |
| "step": 1180 |
| }, |
| { |
| "entropy": 0.6264065893366932, |
| "epoch": 1.904, |
| "grad_norm": 15.4375, |
| "learning_rate": 1.9580000000000002e-05, |
| "loss": 9.9977, |
| "mean_token_accuracy": 0.8253488805145025, |
| "num_tokens": 33850968.0, |
| "step": 1190 |
| }, |
| { |
| "entropy": 0.6095137868076563, |
| "epoch": 1.92, |
| "grad_norm": 14.9375, |
| "learning_rate": 1.955777777777778e-05, |
| "loss": 9.7316, |
| "mean_token_accuracy": 0.8285220514982938, |
| "num_tokens": 34128558.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.92, |
| "eval_biology_entropy": 1.182666036605835, |
| "eval_biology_loss": 1.2319380044937134, |
| "eval_biology_mean_token_accuracy": 0.6909053907394409, |
| "eval_biology_num_tokens": 34128558.0, |
| "eval_biology_runtime": 19.7849, |
| "eval_biology_samples_per_second": 25.272, |
| "eval_biology_steps_per_second": 6.318, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.92, |
| "eval_math_entropy": 0.649978009223938, |
| "eval_math_loss": 0.63917076587677, |
| "eval_math_mean_token_accuracy": 0.8233149046897889, |
| "eval_math_num_tokens": 34128558.0, |
| "eval_math_runtime": 24.6175, |
| "eval_math_samples_per_second": 20.311, |
| "eval_math_steps_per_second": 5.078, |
| "step": 1200 |
| }, |
| { |
| "entropy": 0.6194202324375511, |
| "epoch": 1.936, |
| "grad_norm": 16.25, |
| "learning_rate": 1.9535555555555557e-05, |
| "loss": 9.9383, |
| "mean_token_accuracy": 0.8252742733806372, |
| "num_tokens": 34408056.0, |
| "step": 1210 |
| }, |
| { |
| "entropy": 0.6192464983090759, |
| "epoch": 1.952, |
| "grad_norm": 14.625, |
| "learning_rate": 1.9513333333333335e-05, |
| "loss": 9.7913, |
| "mean_token_accuracy": 0.8282815985381603, |
| "num_tokens": 34684679.0, |
| "step": 1220 |
| }, |
| { |
| "entropy": 0.6408920273184776, |
| "epoch": 1.968, |
| "grad_norm": 16.875, |
| "learning_rate": 1.9491111111111113e-05, |
| "loss": 10.1921, |
| "mean_token_accuracy": 0.8224945243448019, |
| "num_tokens": 34971038.0, |
| "step": 1230 |
| }, |
| { |
| "entropy": 0.6087088288739324, |
| "epoch": 1.984, |
| "grad_norm": 14.25, |
| "learning_rate": 1.946888888888889e-05, |
| "loss": 9.7506, |
| "mean_token_accuracy": 0.8292552776634693, |
| "num_tokens": 35262281.0, |
| "step": 1240 |
| }, |
| { |
| "entropy": 0.6189011264592409, |
| "epoch": 2.0, |
| "grad_norm": 14.5625, |
| "learning_rate": 1.9446666666666668e-05, |
| "loss": 9.8391, |
| "mean_token_accuracy": 0.8280998166650534, |
| "num_tokens": 35560864.0, |
| "step": 1250 |
| }, |
| { |
| "entropy": 0.5718940345570445, |
| "epoch": 2.016, |
| "grad_norm": 17.875, |
| "learning_rate": 1.9424444444444446e-05, |
| "loss": 8.9755, |
| "mean_token_accuracy": 0.8381088264286518, |
| "num_tokens": 35846704.0, |
| "step": 1260 |
| }, |
| { |
| "entropy": 0.5528001293540001, |
| "epoch": 2.032, |
| "grad_norm": 17.375, |
| "learning_rate": 1.9402222222222223e-05, |
| "loss": 8.7959, |
| "mean_token_accuracy": 0.8400239538401365, |
| "num_tokens": 36128775.0, |
| "step": 1270 |
| }, |
| { |
| "entropy": 0.5431115614250303, |
| "epoch": 2.048, |
| "grad_norm": 16.875, |
| "learning_rate": 1.938e-05, |
| "loss": 8.6342, |
| "mean_token_accuracy": 0.8440989479422569, |
| "num_tokens": 36419504.0, |
| "step": 1280 |
| }, |
| { |
| "entropy": 0.5422856478020549, |
| "epoch": 2.064, |
| "grad_norm": 16.125, |
| "learning_rate": 1.935777777777778e-05, |
| "loss": 8.6381, |
| "mean_token_accuracy": 0.8434138212352991, |
| "num_tokens": 36706816.0, |
| "step": 1290 |
| }, |
| { |
| "entropy": 0.5466266760602594, |
| "epoch": 2.08, |
| "grad_norm": 17.25, |
| "learning_rate": 1.9335555555555556e-05, |
| "loss": 8.6792, |
| "mean_token_accuracy": 0.8412496495991946, |
| "num_tokens": 36988475.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.08, |
| "eval_biology_entropy": 0.9598336253166199, |
| "eval_biology_loss": 1.2842097282409668, |
| "eval_biology_mean_token_accuracy": 0.6866690034866333, |
| "eval_biology_num_tokens": 36988475.0, |
| "eval_biology_runtime": 19.8089, |
| "eval_biology_samples_per_second": 25.241, |
| "eval_biology_steps_per_second": 6.31, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.08, |
| "eval_math_entropy": 0.5634605071544647, |
| "eval_math_loss": 0.6450303792953491, |
| "eval_math_mean_token_accuracy": 0.823544692993164, |
| "eval_math_num_tokens": 36988475.0, |
| "eval_math_runtime": 24.6182, |
| "eval_math_samples_per_second": 20.31, |
| "eval_math_steps_per_second": 5.078, |
| "step": 1300 |
| }, |
| { |
| "entropy": 0.5461967477574945, |
| "epoch": 2.096, |
| "grad_norm": 16.25, |
| "learning_rate": 1.9313333333333334e-05, |
| "loss": 8.7832, |
| "mean_token_accuracy": 0.8421301823109388, |
| "num_tokens": 37270131.0, |
| "step": 1310 |
| }, |
| { |
| "entropy": 0.5379752703011036, |
| "epoch": 2.112, |
| "grad_norm": 17.125, |
| "learning_rate": 1.9291111111111115e-05, |
| "loss": 8.6125, |
| "mean_token_accuracy": 0.8422962158918381, |
| "num_tokens": 37563537.0, |
| "step": 1320 |
| }, |
| { |
| "entropy": 0.5374840356409549, |
| "epoch": 2.128, |
| "grad_norm": 16.75, |
| "learning_rate": 1.926888888888889e-05, |
| "loss": 8.4982, |
| "mean_token_accuracy": 0.8446350190788507, |
| "num_tokens": 37843959.0, |
| "step": 1330 |
| }, |
| { |
| "entropy": 0.5455164171755313, |
| "epoch": 2.144, |
| "grad_norm": 16.375, |
| "learning_rate": 1.924666666666667e-05, |
| "loss": 8.6663, |
| "mean_token_accuracy": 0.842664523050189, |
| "num_tokens": 38133092.0, |
| "step": 1340 |
| }, |
| { |
| "entropy": 0.5403652492910623, |
| "epoch": 2.16, |
| "grad_norm": 15.0625, |
| "learning_rate": 1.9224444444444444e-05, |
| "loss": 8.6681, |
| "mean_token_accuracy": 0.8432158157229424, |
| "num_tokens": 38421229.0, |
| "step": 1350 |
| }, |
| { |
| "entropy": 0.5242220051586628, |
| "epoch": 2.176, |
| "grad_norm": 16.875, |
| "learning_rate": 1.9202222222222225e-05, |
| "loss": 8.3559, |
| "mean_token_accuracy": 0.8481345418840647, |
| "num_tokens": 38708043.0, |
| "step": 1360 |
| }, |
| { |
| "entropy": 0.552289474569261, |
| "epoch": 2.192, |
| "grad_norm": 16.25, |
| "learning_rate": 1.918e-05, |
| "loss": 8.8236, |
| "mean_token_accuracy": 0.8398358784615993, |
| "num_tokens": 38996930.0, |
| "step": 1370 |
| }, |
| { |
| "entropy": 0.5456716753542423, |
| "epoch": 2.208, |
| "grad_norm": 18.625, |
| "learning_rate": 1.915777777777778e-05, |
| "loss": 8.7515, |
| "mean_token_accuracy": 0.8416260961443186, |
| "num_tokens": 39279481.0, |
| "step": 1380 |
| }, |
| { |
| "entropy": 0.5409996012225747, |
| "epoch": 2.224, |
| "grad_norm": 17.5, |
| "learning_rate": 1.9135555555555555e-05, |
| "loss": 8.6161, |
| "mean_token_accuracy": 0.843621500954032, |
| "num_tokens": 39569030.0, |
| "step": 1390 |
| }, |
| { |
| "entropy": 0.5462250377982855, |
| "epoch": 2.24, |
| "grad_norm": 16.5, |
| "learning_rate": 1.9113333333333336e-05, |
| "loss": 8.7545, |
| "mean_token_accuracy": 0.8413930989801883, |
| "num_tokens": 39854873.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.24, |
| "eval_biology_entropy": 0.9318458199501037, |
| "eval_biology_loss": 1.3069241046905518, |
| "eval_biology_mean_token_accuracy": 0.6855153131484986, |
| "eval_biology_num_tokens": 39854873.0, |
| "eval_biology_runtime": 19.7633, |
| "eval_biology_samples_per_second": 25.299, |
| "eval_biology_steps_per_second": 6.325, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.24, |
| "eval_math_entropy": 0.5731087529659271, |
| "eval_math_loss": 0.6433758735656738, |
| "eval_math_mean_token_accuracy": 0.8230452270507812, |
| "eval_math_num_tokens": 39854873.0, |
| "eval_math_runtime": 24.8539, |
| "eval_math_samples_per_second": 20.118, |
| "eval_math_steps_per_second": 5.029, |
| "step": 1400 |
| }, |
| { |
| "entropy": 0.543942479789257, |
| "epoch": 2.2560000000000002, |
| "grad_norm": 16.875, |
| "learning_rate": 1.9091111111111113e-05, |
| "loss": 8.6955, |
| "mean_token_accuracy": 0.8418730091303587, |
| "num_tokens": 40141190.0, |
| "step": 1410 |
| }, |
| { |
| "entropy": 0.5582456098869443, |
| "epoch": 2.2720000000000002, |
| "grad_norm": 19.625, |
| "learning_rate": 1.906888888888889e-05, |
| "loss": 8.8471, |
| "mean_token_accuracy": 0.8396125495433807, |
| "num_tokens": 40415203.0, |
| "step": 1420 |
| }, |
| { |
| "entropy": 0.5530563285574317, |
| "epoch": 2.288, |
| "grad_norm": 17.875, |
| "learning_rate": 1.904666666666667e-05, |
| "loss": 8.823, |
| "mean_token_accuracy": 0.8394552428275347, |
| "num_tokens": 40702393.0, |
| "step": 1430 |
| }, |
| { |
| "entropy": 0.5304178670048714, |
| "epoch": 2.304, |
| "grad_norm": 18.0, |
| "learning_rate": 1.9024444444444446e-05, |
| "loss": 8.4621, |
| "mean_token_accuracy": 0.8458537045866251, |
| "num_tokens": 40982775.0, |
| "step": 1440 |
| }, |
| { |
| "entropy": 0.5507002430036664, |
| "epoch": 2.32, |
| "grad_norm": 17.25, |
| "learning_rate": 1.9002222222222224e-05, |
| "loss": 8.778, |
| "mean_token_accuracy": 0.8414905358105897, |
| "num_tokens": 41263356.0, |
| "step": 1450 |
| }, |
| { |
| "entropy": 0.5461155388504266, |
| "epoch": 2.336, |
| "grad_norm": 16.375, |
| "learning_rate": 1.898e-05, |
| "loss": 8.7082, |
| "mean_token_accuracy": 0.8419744338840246, |
| "num_tokens": 41545235.0, |
| "step": 1460 |
| }, |
| { |
| "entropy": 0.5644198174588382, |
| "epoch": 2.352, |
| "grad_norm": 17.0, |
| "learning_rate": 1.895777777777778e-05, |
| "loss": 9.0111, |
| "mean_token_accuracy": 0.837278475239873, |
| "num_tokens": 41833417.0, |
| "step": 1470 |
| }, |
| { |
| "entropy": 0.5465062925592065, |
| "epoch": 2.368, |
| "grad_norm": 16.625, |
| "learning_rate": 1.8935555555555556e-05, |
| "loss": 8.7822, |
| "mean_token_accuracy": 0.8409049317240715, |
| "num_tokens": 42117030.0, |
| "step": 1480 |
| }, |
| { |
| "entropy": 0.5634627625346184, |
| "epoch": 2.384, |
| "grad_norm": 17.0, |
| "learning_rate": 1.8913333333333334e-05, |
| "loss": 8.9984, |
| "mean_token_accuracy": 0.8382249467074872, |
| "num_tokens": 42410990.0, |
| "step": 1490 |
| }, |
| { |
| "entropy": 0.5541804760694504, |
| "epoch": 2.4, |
| "grad_norm": 19.125, |
| "learning_rate": 1.8891111111111115e-05, |
| "loss": 8.8381, |
| "mean_token_accuracy": 0.8405785549432039, |
| "num_tokens": 42691890.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.4, |
| "eval_biology_entropy": 0.9389902620315552, |
| "eval_biology_loss": 1.3136844635009766, |
| "eval_biology_mean_token_accuracy": 0.6836875596046448, |
| "eval_biology_num_tokens": 42691890.0, |
| "eval_biology_runtime": 19.8044, |
| "eval_biology_samples_per_second": 25.247, |
| "eval_biology_steps_per_second": 6.312, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.4, |
| "eval_math_entropy": 0.5745205206871032, |
| "eval_math_loss": 0.6360605955123901, |
| "eval_math_mean_token_accuracy": 0.824403573513031, |
| "eval_math_num_tokens": 42691890.0, |
| "eval_math_runtime": 24.8829, |
| "eval_math_samples_per_second": 20.094, |
| "eval_math_steps_per_second": 5.024, |
| "step": 1500 |
| }, |
| { |
| "entropy": 0.5681238017976284, |
| "epoch": 2.416, |
| "grad_norm": 17.375, |
| "learning_rate": 1.886888888888889e-05, |
| "loss": 9.0358, |
| "mean_token_accuracy": 0.8371844127774238, |
| "num_tokens": 42971588.0, |
| "step": 1510 |
| }, |
| { |
| "entropy": 0.5386728642508387, |
| "epoch": 2.432, |
| "grad_norm": 17.875, |
| "learning_rate": 1.884666666666667e-05, |
| "loss": 8.6841, |
| "mean_token_accuracy": 0.8422587804496289, |
| "num_tokens": 43253821.0, |
| "step": 1520 |
| }, |
| { |
| "entropy": 0.5443267293274403, |
| "epoch": 2.448, |
| "grad_norm": 18.125, |
| "learning_rate": 1.8824444444444445e-05, |
| "loss": 8.67, |
| "mean_token_accuracy": 0.8435158774256706, |
| "num_tokens": 43550902.0, |
| "step": 1530 |
| }, |
| { |
| "entropy": 0.5498035730794072, |
| "epoch": 2.464, |
| "grad_norm": 18.0, |
| "learning_rate": 1.8802222222222226e-05, |
| "loss": 8.7914, |
| "mean_token_accuracy": 0.8404768038541078, |
| "num_tokens": 43844259.0, |
| "step": 1540 |
| }, |
| { |
| "entropy": 0.5709992805495858, |
| "epoch": 2.48, |
| "grad_norm": 17.5, |
| "learning_rate": 1.878e-05, |
| "loss": 9.1217, |
| "mean_token_accuracy": 0.8360334102064371, |
| "num_tokens": 44115701.0, |
| "step": 1550 |
| }, |
| { |
| "entropy": 0.5559496510773897, |
| "epoch": 2.496, |
| "grad_norm": 17.125, |
| "learning_rate": 1.875777777777778e-05, |
| "loss": 8.8877, |
| "mean_token_accuracy": 0.8398744653910398, |
| "num_tokens": 44405520.0, |
| "step": 1560 |
| }, |
| { |
| "entropy": 0.5509569091722369, |
| "epoch": 2.512, |
| "grad_norm": 16.875, |
| "learning_rate": 1.873555555555556e-05, |
| "loss": 8.7783, |
| "mean_token_accuracy": 0.8410256687551737, |
| "num_tokens": 44686477.0, |
| "step": 1570 |
| }, |
| { |
| "entropy": 0.5495854092761874, |
| "epoch": 2.528, |
| "grad_norm": 16.0, |
| "learning_rate": 1.8713333333333336e-05, |
| "loss": 8.7681, |
| "mean_token_accuracy": 0.8412394899874925, |
| "num_tokens": 44969760.0, |
| "step": 1580 |
| }, |
| { |
| "entropy": 0.5426954831928015, |
| "epoch": 2.544, |
| "grad_norm": 17.375, |
| "learning_rate": 1.8691111111111114e-05, |
| "loss": 8.7142, |
| "mean_token_accuracy": 0.8419138621538877, |
| "num_tokens": 45255326.0, |
| "step": 1590 |
| }, |
| { |
| "entropy": 0.5427656076848507, |
| "epoch": 2.56, |
| "grad_norm": 16.75, |
| "learning_rate": 1.866888888888889e-05, |
| "loss": 8.6495, |
| "mean_token_accuracy": 0.8414557803422212, |
| "num_tokens": 45532525.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.56, |
| "eval_biology_entropy": 0.9223047132492066, |
| "eval_biology_loss": 1.3209964036941528, |
| "eval_biology_mean_token_accuracy": 0.6842733683586121, |
| "eval_biology_num_tokens": 45532525.0, |
| "eval_biology_runtime": 19.7451, |
| "eval_biology_samples_per_second": 25.323, |
| "eval_biology_steps_per_second": 6.331, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.56, |
| "eval_math_entropy": 0.5608251221179962, |
| "eval_math_loss": 0.6343366503715515, |
| "eval_math_mean_token_accuracy": 0.8256231875419616, |
| "eval_math_num_tokens": 45532525.0, |
| "eval_math_runtime": 24.5926, |
| "eval_math_samples_per_second": 20.331, |
| "eval_math_steps_per_second": 5.083, |
| "step": 1600 |
| }, |
| { |
| "entropy": 0.5462278285995126, |
| "epoch": 2.576, |
| "grad_norm": 17.375, |
| "learning_rate": 1.864666666666667e-05, |
| "loss": 8.7304, |
| "mean_token_accuracy": 0.8419133082032204, |
| "num_tokens": 45817478.0, |
| "step": 1610 |
| }, |
| { |
| "entropy": 0.5377364344894886, |
| "epoch": 2.592, |
| "grad_norm": 16.125, |
| "learning_rate": 1.8624444444444446e-05, |
| "loss": 8.6128, |
| "mean_token_accuracy": 0.8431659761816264, |
| "num_tokens": 46109575.0, |
| "step": 1620 |
| }, |
| { |
| "entropy": 0.5603145483881236, |
| "epoch": 2.608, |
| "grad_norm": 17.625, |
| "learning_rate": 1.8602222222222224e-05, |
| "loss": 8.926, |
| "mean_token_accuracy": 0.8384825445711612, |
| "num_tokens": 46391461.0, |
| "step": 1630 |
| }, |
| { |
| "entropy": 0.5244756257161498, |
| "epoch": 2.624, |
| "grad_norm": 16.0, |
| "learning_rate": 1.858e-05, |
| "loss": 8.3548, |
| "mean_token_accuracy": 0.8485719878226519, |
| "num_tokens": 46683117.0, |
| "step": 1640 |
| }, |
| { |
| "entropy": 0.5536964586004615, |
| "epoch": 2.64, |
| "grad_norm": 17.375, |
| "learning_rate": 1.855777777777778e-05, |
| "loss": 8.8373, |
| "mean_token_accuracy": 0.839028225839138, |
| "num_tokens": 46965534.0, |
| "step": 1650 |
| }, |
| { |
| "entropy": 0.5332709014415741, |
| "epoch": 2.656, |
| "grad_norm": 16.75, |
| "learning_rate": 1.8535555555555557e-05, |
| "loss": 8.4657, |
| "mean_token_accuracy": 0.844694945588708, |
| "num_tokens": 47245852.0, |
| "step": 1660 |
| }, |
| { |
| "entropy": 0.5504178514704108, |
| "epoch": 2.672, |
| "grad_norm": 18.0, |
| "learning_rate": 1.8513333333333335e-05, |
| "loss": 8.8108, |
| "mean_token_accuracy": 0.8397566247731447, |
| "num_tokens": 47524916.0, |
| "step": 1670 |
| }, |
| { |
| "entropy": 0.5476151436567307, |
| "epoch": 2.6879999999999997, |
| "grad_norm": 15.5, |
| "learning_rate": 1.8491111111111112e-05, |
| "loss": 8.7263, |
| "mean_token_accuracy": 0.8412932168692351, |
| "num_tokens": 47807131.0, |
| "step": 1680 |
| }, |
| { |
| "entropy": 0.5529261413961649, |
| "epoch": 2.7039999999999997, |
| "grad_norm": 16.75, |
| "learning_rate": 1.846888888888889e-05, |
| "loss": 8.8394, |
| "mean_token_accuracy": 0.838797665014863, |
| "num_tokens": 48099654.0, |
| "step": 1690 |
| }, |
| { |
| "entropy": 0.5456200305372476, |
| "epoch": 2.7199999999999998, |
| "grad_norm": 17.625, |
| "learning_rate": 1.8446666666666667e-05, |
| "loss": 8.6682, |
| "mean_token_accuracy": 0.841873237863183, |
| "num_tokens": 48375019.0, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.7199999999999998, |
| "eval_biology_entropy": 0.9218110795021057, |
| "eval_biology_loss": 1.3180720806121826, |
| "eval_biology_mean_token_accuracy": 0.684489251613617, |
| "eval_biology_num_tokens": 48375019.0, |
| "eval_biology_runtime": 19.7671, |
| "eval_biology_samples_per_second": 25.295, |
| "eval_biology_steps_per_second": 6.324, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.7199999999999998, |
| "eval_math_entropy": 0.5653176684379577, |
| "eval_math_loss": 0.6293387413024902, |
| "eval_math_mean_token_accuracy": 0.8265204019546509, |
| "eval_math_num_tokens": 48375019.0, |
| "eval_math_runtime": 24.6242, |
| "eval_math_samples_per_second": 20.305, |
| "eval_math_steps_per_second": 5.076, |
| "step": 1700 |
| }, |
| { |
| "entropy": 0.5463435992598533, |
| "epoch": 2.7359999999999998, |
| "grad_norm": 17.875, |
| "learning_rate": 1.842444444444445e-05, |
| "loss": 8.7467, |
| "mean_token_accuracy": 0.8409269347786903, |
| "num_tokens": 48659284.0, |
| "step": 1710 |
| }, |
| { |
| "entropy": 0.5540915697813034, |
| "epoch": 2.752, |
| "grad_norm": 16.625, |
| "learning_rate": 1.8402222222222223e-05, |
| "loss": 8.8179, |
| "mean_token_accuracy": 0.8404988449066877, |
| "num_tokens": 48943804.0, |
| "step": 1720 |
| }, |
| { |
| "entropy": 0.5476498136296868, |
| "epoch": 2.768, |
| "grad_norm": 15.75, |
| "learning_rate": 1.8380000000000004e-05, |
| "loss": 8.8006, |
| "mean_token_accuracy": 0.841009271889925, |
| "num_tokens": 49230939.0, |
| "step": 1730 |
| }, |
| { |
| "entropy": 0.5640784077346325, |
| "epoch": 2.784, |
| "grad_norm": 17.0, |
| "learning_rate": 1.8357777777777778e-05, |
| "loss": 9.005, |
| "mean_token_accuracy": 0.8373467523604632, |
| "num_tokens": 49504425.0, |
| "step": 1740 |
| }, |
| { |
| "entropy": 0.5430868171155453, |
| "epoch": 2.8, |
| "grad_norm": 17.25, |
| "learning_rate": 1.833555555555556e-05, |
| "loss": 8.6415, |
| "mean_token_accuracy": 0.8431323904544115, |
| "num_tokens": 49782661.0, |
| "step": 1750 |
| }, |
| { |
| "entropy": 0.5524541085585952, |
| "epoch": 2.816, |
| "grad_norm": 17.0, |
| "learning_rate": 1.8313333333333333e-05, |
| "loss": 8.7568, |
| "mean_token_accuracy": 0.8402639802545309, |
| "num_tokens": 50073632.0, |
| "step": 1760 |
| }, |
| { |
| "entropy": 0.5354580119252205, |
| "epoch": 2.832, |
| "grad_norm": 17.875, |
| "learning_rate": 1.8291111111111114e-05, |
| "loss": 8.632, |
| "mean_token_accuracy": 0.8425567515194416, |
| "num_tokens": 50356964.0, |
| "step": 1770 |
| }, |
| { |
| "entropy": 0.5553580898791551, |
| "epoch": 2.848, |
| "grad_norm": 15.625, |
| "learning_rate": 1.8268888888888888e-05, |
| "loss": 8.8708, |
| "mean_token_accuracy": 0.8403212446719408, |
| "num_tokens": 50644535.0, |
| "step": 1780 |
| }, |
| { |
| "entropy": 0.5478905290365219, |
| "epoch": 2.864, |
| "grad_norm": 18.375, |
| "learning_rate": 1.824666666666667e-05, |
| "loss": 8.7886, |
| "mean_token_accuracy": 0.841050173342228, |
| "num_tokens": 50925653.0, |
| "step": 1790 |
| }, |
| { |
| "entropy": 0.5450881006196141, |
| "epoch": 2.88, |
| "grad_norm": 16.625, |
| "learning_rate": 1.8224444444444447e-05, |
| "loss": 8.6642, |
| "mean_token_accuracy": 0.84115383438766, |
| "num_tokens": 51204374.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 2.88, |
| "eval_biology_entropy": 0.9217254042625427, |
| "eval_biology_loss": 1.3202892541885376, |
| "eval_biology_mean_token_accuracy": 0.6841764874458313, |
| "eval_biology_num_tokens": 51204374.0, |
| "eval_biology_runtime": 19.813, |
| "eval_biology_samples_per_second": 25.236, |
| "eval_biology_steps_per_second": 6.309, |
| "step": 1800 |
| }, |
| { |
| "epoch": 2.88, |
| "eval_math_entropy": 0.5681756961345673, |
| "eval_math_loss": 0.6243875026702881, |
| "eval_math_mean_token_accuracy": 0.8276074986457824, |
| "eval_math_num_tokens": 51204374.0, |
| "eval_math_runtime": 24.6389, |
| "eval_math_samples_per_second": 20.293, |
| "eval_math_steps_per_second": 5.073, |
| "step": 1800 |
| }, |
| { |
| "entropy": 0.5379522321745753, |
| "epoch": 2.896, |
| "grad_norm": 17.75, |
| "learning_rate": 1.8202222222222225e-05, |
| "loss": 8.6271, |
| "mean_token_accuracy": 0.8437154974788428, |
| "num_tokens": 51483944.0, |
| "step": 1810 |
| }, |
| { |
| "entropy": 0.5471471425145864, |
| "epoch": 2.912, |
| "grad_norm": 17.0, |
| "learning_rate": 1.8180000000000002e-05, |
| "loss": 8.6769, |
| "mean_token_accuracy": 0.8418111637234688, |
| "num_tokens": 51765755.0, |
| "step": 1820 |
| }, |
| { |
| "entropy": 0.5508731028065086, |
| "epoch": 2.928, |
| "grad_norm": 17.0, |
| "learning_rate": 1.815777777777778e-05, |
| "loss": 8.7862, |
| "mean_token_accuracy": 0.8419726848602295, |
| "num_tokens": 52056379.0, |
| "step": 1830 |
| }, |
| { |
| "entropy": 0.5354436157271266, |
| "epoch": 2.944, |
| "grad_norm": 16.375, |
| "learning_rate": 1.8135555555555557e-05, |
| "loss": 8.5928, |
| "mean_token_accuracy": 0.8430151861160994, |
| "num_tokens": 52346232.0, |
| "step": 1840 |
| }, |
| { |
| "entropy": 0.5504492402076722, |
| "epoch": 2.96, |
| "grad_norm": 17.5, |
| "learning_rate": 1.8113333333333335e-05, |
| "loss": 8.7501, |
| "mean_token_accuracy": 0.8406570095568895, |
| "num_tokens": 52633789.0, |
| "step": 1850 |
| }, |
| { |
| "entropy": 0.5475983273237943, |
| "epoch": 2.976, |
| "grad_norm": 15.25, |
| "learning_rate": 1.8091111111111113e-05, |
| "loss": 8.714, |
| "mean_token_accuracy": 0.8408999726176262, |
| "num_tokens": 52911755.0, |
| "step": 1860 |
| }, |
| { |
| "entropy": 0.5406377092003822, |
| "epoch": 2.992, |
| "grad_norm": 16.0, |
| "learning_rate": 1.806888888888889e-05, |
| "loss": 8.6328, |
| "mean_token_accuracy": 0.8425012800842524, |
| "num_tokens": 53198176.0, |
| "step": 1870 |
| }, |
| { |
| "entropy": 0.5113964939489961, |
| "epoch": 3.008, |
| "grad_norm": 22.875, |
| "learning_rate": 1.8046666666666668e-05, |
| "loss": 7.822, |
| "mean_token_accuracy": 0.8548128705471754, |
| "num_tokens": 53481893.0, |
| "step": 1880 |
| }, |
| { |
| "entropy": 0.4152779897674918, |
| "epoch": 3.024, |
| "grad_norm": 22.0, |
| "learning_rate": 1.8024444444444445e-05, |
| "loss": 6.7546, |
| "mean_token_accuracy": 0.8709000959992409, |
| "num_tokens": 53771717.0, |
| "step": 1890 |
| }, |
| { |
| "entropy": 0.40918179890140893, |
| "epoch": 3.04, |
| "grad_norm": 22.75, |
| "learning_rate": 1.8002222222222223e-05, |
| "loss": 6.5585, |
| "mean_token_accuracy": 0.8754206687211991, |
| "num_tokens": 54058045.0, |
| "step": 1900 |
| }, |
| { |
| "epoch": 3.04, |
| "eval_biology_entropy": 0.6926028978824615, |
| "eval_biology_loss": 1.5856647491455078, |
| "eval_biology_mean_token_accuracy": 0.6686906161308288, |
| "eval_biology_num_tokens": 54058045.0, |
| "eval_biology_runtime": 19.7886, |
| "eval_biology_samples_per_second": 25.267, |
| "eval_biology_steps_per_second": 6.317, |
| "step": 1900 |
| }, |
| { |
| "epoch": 3.04, |
| "eval_math_entropy": 0.46198054814338685, |
| "eval_math_loss": 0.6858065724372864, |
| "eval_math_mean_token_accuracy": 0.822696931362152, |
| "eval_math_num_tokens": 54058045.0, |
| "eval_math_runtime": 24.6254, |
| "eval_math_samples_per_second": 20.304, |
| "eval_math_steps_per_second": 5.076, |
| "step": 1900 |
| }, |
| { |
| "entropy": 0.40489907208830117, |
| "epoch": 3.056, |
| "grad_norm": 25.25, |
| "learning_rate": 1.798e-05, |
| "loss": 6.3683, |
| "mean_token_accuracy": 0.8761902552098035, |
| "num_tokens": 54334332.0, |
| "step": 1910 |
| }, |
| { |
| "entropy": 0.40934212449938057, |
| "epoch": 3.072, |
| "grad_norm": 23.5, |
| "learning_rate": 1.7957777777777778e-05, |
| "loss": 6.4642, |
| "mean_token_accuracy": 0.8751021821051836, |
| "num_tokens": 54624543.0, |
| "step": 1920 |
| }, |
| { |
| "entropy": 0.39203624669462445, |
| "epoch": 3.088, |
| "grad_norm": 25.375, |
| "learning_rate": 1.7935555555555556e-05, |
| "loss": 6.2116, |
| "mean_token_accuracy": 0.8796712458133698, |
| "num_tokens": 54907550.0, |
| "step": 1930 |
| }, |
| { |
| "entropy": 0.40762526309117675, |
| "epoch": 3.104, |
| "grad_norm": 23.375, |
| "learning_rate": 1.7913333333333337e-05, |
| "loss": 6.4546, |
| "mean_token_accuracy": 0.8756711948662996, |
| "num_tokens": 55190959.0, |
| "step": 1940 |
| }, |
| { |
| "entropy": 0.39818487148731946, |
| "epoch": 3.12, |
| "grad_norm": 24.0, |
| "learning_rate": 1.789111111111111e-05, |
| "loss": 6.3256, |
| "mean_token_accuracy": 0.8780492424964905, |
| "num_tokens": 55481635.0, |
| "step": 1950 |
| }, |
| { |
| "entropy": 0.4032851942814887, |
| "epoch": 3.136, |
| "grad_norm": 24.5, |
| "learning_rate": 1.7868888888888892e-05, |
| "loss": 6.4233, |
| "mean_token_accuracy": 0.8760235741734504, |
| "num_tokens": 55769010.0, |
| "step": 1960 |
| }, |
| { |
| "entropy": 0.40782611249014733, |
| "epoch": 3.152, |
| "grad_norm": 23.25, |
| "learning_rate": 1.7846666666666666e-05, |
| "loss": 6.4685, |
| "mean_token_accuracy": 0.8753455895930529, |
| "num_tokens": 56053160.0, |
| "step": 1970 |
| }, |
| { |
| "entropy": 0.41834324020892383, |
| "epoch": 3.168, |
| "grad_norm": 24.75, |
| "learning_rate": 1.7824444444444447e-05, |
| "loss": 6.6597, |
| "mean_token_accuracy": 0.8723560575395822, |
| "num_tokens": 56337066.0, |
| "step": 1980 |
| }, |
| { |
| "entropy": 0.4165022653527558, |
| "epoch": 3.184, |
| "grad_norm": 27.375, |
| "learning_rate": 1.780222222222222e-05, |
| "loss": 6.5528, |
| "mean_token_accuracy": 0.8731590420007705, |
| "num_tokens": 56618899.0, |
| "step": 1990 |
| }, |
| { |
| "entropy": 0.40481978207826613, |
| "epoch": 3.2, |
| "grad_norm": 22.625, |
| "learning_rate": 1.7780000000000003e-05, |
| "loss": 6.4003, |
| "mean_token_accuracy": 0.8771062396466732, |
| "num_tokens": 56910071.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 3.2, |
| "eval_biology_entropy": 0.6016733210086822, |
| "eval_biology_loss": 1.7224782705307007, |
| "eval_biology_mean_token_accuracy": 0.6672822990417481, |
| "eval_biology_num_tokens": 56910071.0, |
| "eval_biology_runtime": 19.7656, |
| "eval_biology_samples_per_second": 25.297, |
| "eval_biology_steps_per_second": 6.324, |
| "step": 2000 |
| }, |
| { |
| "epoch": 3.2, |
| "eval_math_entropy": 0.4420904459953308, |
| "eval_math_loss": 0.6964770555496216, |
| "eval_math_mean_token_accuracy": 0.8225251660346985, |
| "eval_math_num_tokens": 56910071.0, |
| "eval_math_runtime": 24.578, |
| "eval_math_samples_per_second": 20.343, |
| "eval_math_steps_per_second": 5.086, |
| "step": 2000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 10000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 16, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.791878293573609e+18, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|