{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.8, "eval_steps": 100, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.142920307815075, "epoch": 0.016, "grad_norm": 290.0, "learning_rate": 6.000000000000001e-07, "loss": 42.6658, "mean_token_accuracy": 0.5620782226324081, "num_tokens": 195524.0, "step": 10 }, { "entropy": 1.148210159689188, "epoch": 0.032, "grad_norm": 239.0, "learning_rate": 1.2666666666666669e-06, "loss": 41.9984, "mean_token_accuracy": 0.5613080382347106, "num_tokens": 390903.0, "step": 20 }, { "entropy": 1.1933260083198547, "epoch": 0.048, "grad_norm": 249.0, "learning_rate": 1.9333333333333336e-06, "loss": 40.6208, "mean_token_accuracy": 0.5657517908141017, "num_tokens": 589868.0, "step": 30 }, { "entropy": 1.2957281917333603, "epoch": 0.064, "grad_norm": 139.0, "learning_rate": 2.6e-06, "loss": 37.9032, "mean_token_accuracy": 0.5714796105399728, "num_tokens": 791190.0, "step": 40 }, { "entropy": 1.5075685508549213, "epoch": 0.08, "grad_norm": 94.0, "learning_rate": 3.266666666666667e-06, "loss": 35.7561, "mean_token_accuracy": 0.5766569443047047, "num_tokens": 989860.0, "step": 50 }, { "entropy": 1.7984249681234359, "epoch": 0.096, "grad_norm": 50.75, "learning_rate": 3.9333333333333335e-06, "loss": 33.4379, "mean_token_accuracy": 0.5814697606489062, "num_tokens": 1181777.0, "step": 60 }, { "entropy": 1.8387351341545581, "epoch": 0.112, "grad_norm": 43.0, "learning_rate": 4.600000000000001e-06, "loss": 30.4219, "mean_token_accuracy": 0.5971228444948793, "num_tokens": 1385513.0, "step": 70 }, { "entropy": 1.7275233700871468, "epoch": 0.128, "grad_norm": 33.5, "learning_rate": 5.2666666666666665e-06, "loss": 28.4703, "mean_token_accuracy": 0.6095364252105355, "num_tokens": 1582368.0, "step": 80 }, { "entropy": 1.7214979872107505, "epoch": 0.144, "grad_norm": 27.0, "learning_rate": 5.933333333333335e-06, "loss": 26.677, "mean_token_accuracy": 0.6243448719382286, "num_tokens": 1773764.0, "step": 90 }, { "entropy": 1.6311134904623033, "epoch": 0.16, "grad_norm": 22.0, "learning_rate": 6.600000000000001e-06, "loss": 25.7683, "mean_token_accuracy": 0.6301404371857643, "num_tokens": 1970077.0, "step": 100 }, { "epoch": 0.16, "eval_biology_entropy": 1.5580159120559693, "eval_biology_loss": 1.5081593990325928, "eval_biology_mean_token_accuracy": 0.6457349667549134, "eval_biology_num_tokens": 1970077.0, "eval_biology_runtime": 48.7413, "eval_biology_samples_per_second": 10.258, "eval_biology_steps_per_second": 2.565, "step": 100 }, { "epoch": 0.16, "eval_chemistry_entropy": 1.206756212234497, "eval_chemistry_loss": 1.1218774318695068, "eval_chemistry_mean_token_accuracy": 0.7205783066749573, "eval_chemistry_num_tokens": 1970077.0, "eval_chemistry_runtime": 60.3159, "eval_chemistry_samples_per_second": 8.29, "eval_chemistry_steps_per_second": 2.072, "step": 100 }, { "epoch": 0.16, "eval_math_entropy": 0.9672308325767517, "eval_math_loss": 1.159799337387085, "eval_math_mean_token_accuracy": 0.7189845342636109, "eval_math_num_tokens": 1970077.0, "eval_math_runtime": 61.8237, "eval_math_samples_per_second": 8.088, "eval_math_steps_per_second": 2.022, "step": 100 }, { "epoch": 0.16, "eval_physics_entropy": 1.1670387201309205, "eval_physics_loss": 1.1291608810424805, "eval_physics_mean_token_accuracy": 0.7211072521209717, "eval_physics_num_tokens": 1970077.0, "eval_physics_runtime": 70.4586, "eval_physics_samples_per_second": 7.096, "eval_physics_steps_per_second": 1.774, "step": 100 }, { "entropy": 1.5482715763151647, "epoch": 0.176, "grad_norm": 21.125, "learning_rate": 7.266666666666668e-06, "loss": 24.5868, "mean_token_accuracy": 0.6385629490017891, "num_tokens": 2168354.0, "step": 110 }, { "entropy": 1.5266574397683144, "epoch": 0.192, "grad_norm": 22.875, "learning_rate": 7.933333333333334e-06, "loss": 24.2707, "mean_token_accuracy": 0.6432460084557533, "num_tokens": 2365822.0, "step": 120 }, { "entropy": 1.5192069873213767, "epoch": 0.208, "grad_norm": 20.875, "learning_rate": 8.6e-06, "loss": 24.1355, "mean_token_accuracy": 0.6436416517943144, "num_tokens": 2558762.0, "step": 130 }, { "entropy": 1.4698147468268872, "epoch": 0.224, "grad_norm": 20.125, "learning_rate": 9.266666666666667e-06, "loss": 23.5154, "mean_token_accuracy": 0.6499760080128908, "num_tokens": 2755347.0, "step": 140 }, { "entropy": 1.4506230603903532, "epoch": 0.24, "grad_norm": 19.625, "learning_rate": 9.933333333333334e-06, "loss": 23.2013, "mean_token_accuracy": 0.6523264441639185, "num_tokens": 2947346.0, "step": 150 }, { "entropy": 1.4590953961014748, "epoch": 0.256, "grad_norm": 18.5, "learning_rate": 1.0600000000000002e-05, "loss": 23.3227, "mean_token_accuracy": 0.6508617259562015, "num_tokens": 3139957.0, "step": 160 }, { "entropy": 1.419396448880434, "epoch": 0.272, "grad_norm": 19.75, "learning_rate": 1.1266666666666668e-05, "loss": 22.7352, "mean_token_accuracy": 0.6572458431124687, "num_tokens": 3335951.0, "step": 170 }, { "entropy": 1.4005608204752207, "epoch": 0.288, "grad_norm": 19.75, "learning_rate": 1.1933333333333335e-05, "loss": 22.3969, "mean_token_accuracy": 0.6585959013551473, "num_tokens": 3539731.0, "step": 180 }, { "entropy": 1.391934547200799, "epoch": 0.304, "grad_norm": 18.75, "learning_rate": 1.2600000000000001e-05, "loss": 22.31, "mean_token_accuracy": 0.6621056370437145, "num_tokens": 3733488.0, "step": 190 }, { "entropy": 1.4028674490749835, "epoch": 0.32, "grad_norm": 22.25, "learning_rate": 1.3266666666666668e-05, "loss": 22.5559, "mean_token_accuracy": 0.6576981086283922, "num_tokens": 3920545.0, "step": 200 }, { "epoch": 0.32, "eval_biology_entropy": 1.3209806289672852, "eval_biology_loss": 1.338399887084961, "eval_biology_mean_token_accuracy": 0.6720403518676757, "eval_biology_num_tokens": 3920545.0, "eval_biology_runtime": 48.5853, "eval_biology_samples_per_second": 10.291, "eval_biology_steps_per_second": 2.573, "step": 200 }, { "epoch": 0.32, "eval_chemistry_entropy": 1.0033348879814148, "eval_chemistry_loss": 0.9935092926025391, "eval_chemistry_mean_token_accuracy": 0.7448974308967591, "eval_chemistry_num_tokens": 3920545.0, "eval_chemistry_runtime": 60.24, "eval_chemistry_samples_per_second": 8.3, "eval_chemistry_steps_per_second": 2.075, "step": 200 }, { "epoch": 0.32, "eval_math_entropy": 0.8341804294586181, "eval_math_loss": 1.0635857582092285, "eval_math_mean_token_accuracy": 0.7432106451988221, "eval_math_num_tokens": 3920545.0, "eval_math_runtime": 61.8174, "eval_math_samples_per_second": 8.088, "eval_math_steps_per_second": 2.022, "step": 200 }, { "epoch": 0.32, "eval_physics_entropy": 0.9652358031272888, "eval_physics_loss": 0.9950281977653503, "eval_physics_mean_token_accuracy": 0.7510108857154846, "eval_physics_num_tokens": 3920545.0, "eval_physics_runtime": 70.411, "eval_physics_samples_per_second": 7.101, "eval_physics_steps_per_second": 1.775, "step": 200 }, { "entropy": 1.3548175282776356, "epoch": 0.336, "grad_norm": 19.625, "learning_rate": 1.3933333333333334e-05, "loss": 21.7763, "mean_token_accuracy": 0.6656343434005976, "num_tokens": 4114077.0, "step": 210 }, { "entropy": 1.3656601022928954, "epoch": 0.352, "grad_norm": 20.625, "learning_rate": 1.46e-05, "loss": 22.0972, "mean_token_accuracy": 0.6638848338276148, "num_tokens": 4306949.0, "step": 220 }, { "entropy": 1.3525194190442562, "epoch": 0.368, "grad_norm": 18.125, "learning_rate": 1.5266666666666667e-05, "loss": 21.7293, "mean_token_accuracy": 0.6680811226367951, "num_tokens": 4504001.0, "step": 230 }, { "entropy": 1.3454820621758699, "epoch": 0.384, "grad_norm": 21.25, "learning_rate": 1.5933333333333336e-05, "loss": 21.7032, "mean_token_accuracy": 0.6671383358538151, "num_tokens": 4693812.0, "step": 240 }, { "entropy": 1.3525703553110362, "epoch": 0.4, "grad_norm": 17.5, "learning_rate": 1.66e-05, "loss": 21.7856, "mean_token_accuracy": 0.666401931643486, "num_tokens": 4887094.0, "step": 250 }, { "entropy": 1.351718918606639, "epoch": 0.416, "grad_norm": 19.0, "learning_rate": 1.726666666666667e-05, "loss": 21.9058, "mean_token_accuracy": 0.6651136819273233, "num_tokens": 5085369.0, "step": 260 }, { "entropy": 1.3526419658213853, "epoch": 0.432, "grad_norm": 20.875, "learning_rate": 1.7933333333333333e-05, "loss": 21.7813, "mean_token_accuracy": 0.6668458927422762, "num_tokens": 5271275.0, "step": 270 }, { "entropy": 1.3480545241385697, "epoch": 0.448, "grad_norm": 22.875, "learning_rate": 1.86e-05, "loss": 21.627, "mean_token_accuracy": 0.6677324704825878, "num_tokens": 5460559.0, "step": 280 }, { "entropy": 1.301166184991598, "epoch": 0.464, "grad_norm": 21.25, "learning_rate": 1.926666666666667e-05, "loss": 20.889, "mean_token_accuracy": 0.676617132872343, "num_tokens": 5653809.0, "step": 290 }, { "entropy": 1.318466317281127, "epoch": 0.48, "grad_norm": 17.125, "learning_rate": 1.9933333333333334e-05, "loss": 21.2936, "mean_token_accuracy": 0.6712827417999506, "num_tokens": 5850176.0, "step": 300 }, { "epoch": 0.48, "eval_biology_entropy": 1.2827796216011047, "eval_biology_loss": 1.275201678276062, "eval_biology_mean_token_accuracy": 0.6830832781791687, "eval_biology_num_tokens": 5850176.0, "eval_biology_runtime": 48.4915, "eval_biology_samples_per_second": 10.311, "eval_biology_steps_per_second": 2.578, "step": 300 }, { "epoch": 0.48, "eval_chemistry_entropy": 0.983495129108429, "eval_chemistry_loss": 0.9488818645477295, "eval_chemistry_mean_token_accuracy": 0.7523409638404847, "eval_chemistry_num_tokens": 5850176.0, "eval_chemistry_runtime": 60.1707, "eval_chemistry_samples_per_second": 8.31, "eval_chemistry_steps_per_second": 2.077, "step": 300 }, { "epoch": 0.48, "eval_math_entropy": 0.8216862387657166, "eval_math_loss": 1.0297818183898926, "eval_math_mean_token_accuracy": 0.7488151121139527, "eval_math_num_tokens": 5850176.0, "eval_math_runtime": 61.6905, "eval_math_samples_per_second": 8.105, "eval_math_steps_per_second": 2.026, "step": 300 }, { "epoch": 0.48, "eval_physics_entropy": 0.9433758721351624, "eval_physics_loss": 0.9520999193191528, "eval_physics_mean_token_accuracy": 0.7585058889389038, "eval_physics_num_tokens": 5850176.0, "eval_physics_runtime": 70.301, "eval_physics_samples_per_second": 7.112, "eval_physics_steps_per_second": 1.778, "step": 300 }, { "entropy": 1.2579400472342968, "epoch": 0.496, "grad_norm": 17.75, "learning_rate": 1.9933333333333334e-05, "loss": 20.2011, "mean_token_accuracy": 0.6842056062072516, "num_tokens": 6046503.0, "step": 310 }, { "entropy": 1.3082518883049488, "epoch": 0.512, "grad_norm": 18.125, "learning_rate": 1.985925925925926e-05, "loss": 21.0658, "mean_token_accuracy": 0.6749501373618841, "num_tokens": 6240456.0, "step": 320 }, { "entropy": 1.3003981616348028, "epoch": 0.528, "grad_norm": 18.125, "learning_rate": 1.9785185185185187e-05, "loss": 20.9809, "mean_token_accuracy": 0.6757604543119669, "num_tokens": 6430555.0, "step": 330 }, { "entropy": 1.2986273631453513, "epoch": 0.544, "grad_norm": 17.0, "learning_rate": 1.971111111111111e-05, "loss": 20.8809, "mean_token_accuracy": 0.6782271713018417, "num_tokens": 6626006.0, "step": 340 }, { "entropy": 1.284830729290843, "epoch": 0.56, "grad_norm": 17.25, "learning_rate": 1.963703703703704e-05, "loss": 20.8197, "mean_token_accuracy": 0.6767117112874985, "num_tokens": 6820754.0, "step": 350 }, { "entropy": 1.2683125745505095, "epoch": 0.576, "grad_norm": 17.0, "learning_rate": 1.9562962962962964e-05, "loss": 20.4541, "mean_token_accuracy": 0.6809794403612613, "num_tokens": 7021844.0, "step": 360 }, { "entropy": 1.2863252360373736, "epoch": 0.592, "grad_norm": 18.875, "learning_rate": 1.948888888888889e-05, "loss": 20.8043, "mean_token_accuracy": 0.676701345667243, "num_tokens": 7213951.0, "step": 370 }, { "entropy": 1.2630502216517925, "epoch": 0.608, "grad_norm": 18.75, "learning_rate": 1.9414814814814817e-05, "loss": 20.4041, "mean_token_accuracy": 0.6803740747272968, "num_tokens": 7416773.0, "step": 380 }, { "entropy": 1.2804703898727894, "epoch": 0.624, "grad_norm": 19.25, "learning_rate": 1.9340740740740743e-05, "loss": 20.6218, "mean_token_accuracy": 0.6788272958248853, "num_tokens": 7612843.0, "step": 390 }, { "entropy": 1.2843346055597067, "epoch": 0.64, "grad_norm": 18.0, "learning_rate": 1.926666666666667e-05, "loss": 20.7171, "mean_token_accuracy": 0.6782444745302201, "num_tokens": 7801633.0, "step": 400 }, { "epoch": 0.64, "eval_biology_entropy": 1.226506398677826, "eval_biology_loss": 1.2382104396820068, "eval_biology_mean_token_accuracy": 0.6894095778465271, "eval_biology_num_tokens": 7801633.0, "eval_biology_runtime": 48.5507, "eval_biology_samples_per_second": 10.299, "eval_biology_steps_per_second": 2.575, "step": 400 }, { "epoch": 0.64, "eval_chemistry_entropy": 0.9317227191925049, "eval_chemistry_loss": 0.9207452535629272, "eval_chemistry_mean_token_accuracy": 0.7581370029449462, "eval_chemistry_num_tokens": 7801633.0, "eval_chemistry_runtime": 60.2113, "eval_chemistry_samples_per_second": 8.304, "eval_chemistry_steps_per_second": 2.076, "step": 400 }, { "epoch": 0.64, "eval_math_entropy": 0.7863595089912414, "eval_math_loss": 1.010460376739502, "eval_math_mean_token_accuracy": 0.7535392093658447, "eval_math_num_tokens": 7801633.0, "eval_math_runtime": 61.807, "eval_math_samples_per_second": 8.09, "eval_math_steps_per_second": 2.022, "step": 400 }, { "epoch": 0.64, "eval_physics_entropy": 0.8958085932731629, "eval_physics_loss": 0.9257401823997498, "eval_physics_mean_token_accuracy": 0.7637984156608582, "eval_physics_num_tokens": 7801633.0, "eval_physics_runtime": 70.3663, "eval_physics_samples_per_second": 7.106, "eval_physics_steps_per_second": 1.776, "step": 400 }, { "entropy": 1.278659427165985, "epoch": 0.656, "grad_norm": 18.25, "learning_rate": 1.9192592592592593e-05, "loss": 20.6682, "mean_token_accuracy": 0.6772829819470644, "num_tokens": 7995843.0, "step": 410 }, { "entropy": 1.2931427203118802, "epoch": 0.672, "grad_norm": 18.625, "learning_rate": 1.911851851851852e-05, "loss": 20.8656, "mean_token_accuracy": 0.6753748003393412, "num_tokens": 8183103.0, "step": 420 }, { "entropy": 1.2739692747592926, "epoch": 0.688, "grad_norm": 16.75, "learning_rate": 1.9044444444444446e-05, "loss": 20.5407, "mean_token_accuracy": 0.6812681049108505, "num_tokens": 8385976.0, "step": 430 }, { "entropy": 1.2659825466573238, "epoch": 0.704, "grad_norm": 16.25, "learning_rate": 1.8970370370370372e-05, "loss": 20.4243, "mean_token_accuracy": 0.6820976916700602, "num_tokens": 8578431.0, "step": 440 }, { "entropy": 1.220404140278697, "epoch": 0.72, "grad_norm": 16.75, "learning_rate": 1.8896296296296295e-05, "loss": 19.6546, "mean_token_accuracy": 0.6908745598047972, "num_tokens": 8781342.0, "step": 450 }, { "entropy": 1.2406103231012822, "epoch": 0.736, "grad_norm": 16.75, "learning_rate": 1.8822222222222225e-05, "loss": 19.9745, "mean_token_accuracy": 0.6853331789374352, "num_tokens": 8977918.0, "step": 460 }, { "entropy": 1.2618801843374967, "epoch": 0.752, "grad_norm": 17.125, "learning_rate": 1.874814814814815e-05, "loss": 20.4041, "mean_token_accuracy": 0.6825968738645315, "num_tokens": 9169322.0, "step": 470 }, { "entropy": 1.2232345014810562, "epoch": 0.768, "grad_norm": 19.25, "learning_rate": 1.8674074074074075e-05, "loss": 19.7045, "mean_token_accuracy": 0.6888250291347504, "num_tokens": 9368141.0, "step": 480 }, { "entropy": 1.25159954726696, "epoch": 0.784, "grad_norm": 18.25, "learning_rate": 1.86e-05, "loss": 20.2036, "mean_token_accuracy": 0.6849453710019588, "num_tokens": 9565236.0, "step": 490 }, { "entropy": 1.264250884205103, "epoch": 0.8, "grad_norm": 19.25, "learning_rate": 1.8525925925925928e-05, "loss": 20.5299, "mean_token_accuracy": 0.6811827480792999, "num_tokens": 9761227.0, "step": 500 }, { "epoch": 0.8, "eval_biology_entropy": 1.2163097896575927, "eval_biology_loss": 1.2177292108535767, "eval_biology_mean_token_accuracy": 0.6932459664344788, "eval_biology_num_tokens": 9761227.0, "eval_biology_runtime": 48.5438, "eval_biology_samples_per_second": 10.3, "eval_biology_steps_per_second": 2.575, "step": 500 }, { "epoch": 0.8, "eval_chemistry_entropy": 0.9239063205718994, "eval_chemistry_loss": 0.9047155380249023, "eval_chemistry_mean_token_accuracy": 0.761792631149292, "eval_chemistry_num_tokens": 9761227.0, "eval_chemistry_runtime": 59.9546, "eval_chemistry_samples_per_second": 8.34, "eval_chemistry_steps_per_second": 2.085, "step": 500 }, { "epoch": 0.8, "eval_math_entropy": 0.7864464523792267, "eval_math_loss": 0.9939978122711182, "eval_math_mean_token_accuracy": 0.7574145245552063, "eval_math_num_tokens": 9761227.0, "eval_math_runtime": 61.7812, "eval_math_samples_per_second": 8.093, "eval_math_steps_per_second": 2.023, "step": 500 }, { "epoch": 0.8, "eval_physics_entropy": 0.889360978603363, "eval_physics_loss": 0.9096766710281372, "eval_physics_mean_token_accuracy": 0.7674052910804748, "eval_physics_num_tokens": 9761227.0, "eval_physics_runtime": 70.5356, "eval_physics_samples_per_second": 7.089, "eval_physics_steps_per_second": 1.772, "step": 500 }, { "entropy": 1.2362793002277612, "epoch": 0.816, "grad_norm": 19.625, "learning_rate": 1.8451851851851855e-05, "loss": 19.8133, "mean_token_accuracy": 0.6863504596054554, "num_tokens": 9958727.0, "step": 510 }, { "entropy": 1.2254926670342683, "epoch": 0.832, "grad_norm": 17.0, "learning_rate": 1.8377777777777778e-05, "loss": 19.8307, "mean_token_accuracy": 0.6866675779223442, "num_tokens": 10155771.0, "step": 520 }, { "entropy": 1.2238412775099277, "epoch": 0.848, "grad_norm": 18.75, "learning_rate": 1.8303703703703704e-05, "loss": 19.687, "mean_token_accuracy": 0.6897137116640806, "num_tokens": 10357721.0, "step": 530 }, { "entropy": 1.2536957442760468, "epoch": 0.864, "grad_norm": 17.125, "learning_rate": 1.822962962962963e-05, "loss": 20.1565, "mean_token_accuracy": 0.6850291140377521, "num_tokens": 10552495.0, "step": 540 }, { "entropy": 1.231699001789093, "epoch": 0.88, "grad_norm": 18.25, "learning_rate": 1.8155555555555557e-05, "loss": 19.8536, "mean_token_accuracy": 0.6891282081604004, "num_tokens": 10748749.0, "step": 550 }, { "entropy": 1.2470501396805047, "epoch": 0.896, "grad_norm": 18.0, "learning_rate": 1.8081481481481484e-05, "loss": 20.1706, "mean_token_accuracy": 0.6856059569865465, "num_tokens": 10943319.0, "step": 560 }, { "entropy": 1.2307742841541767, "epoch": 0.912, "grad_norm": 18.25, "learning_rate": 1.800740740740741e-05, "loss": 19.9062, "mean_token_accuracy": 0.6885740786790848, "num_tokens": 11136935.0, "step": 570 }, { "entropy": 1.2445739306509496, "epoch": 0.928, "grad_norm": 18.0, "learning_rate": 1.7933333333333333e-05, "loss": 20.0979, "mean_token_accuracy": 0.6851089850068093, "num_tokens": 11331098.0, "step": 580 }, { "entropy": 1.2021468229591847, "epoch": 0.944, "grad_norm": 15.875, "learning_rate": 1.785925925925926e-05, "loss": 19.4077, "mean_token_accuracy": 0.6915138956159353, "num_tokens": 11530550.0, "step": 590 }, { "entropy": 1.226809823140502, "epoch": 0.96, "grad_norm": 19.625, "learning_rate": 1.7785185185185186e-05, "loss": 19.8062, "mean_token_accuracy": 0.6897286407649517, "num_tokens": 11729645.0, "step": 600 }, { "epoch": 0.96, "eval_biology_entropy": 1.1845180039405823, "eval_biology_loss": 1.203829050064087, "eval_biology_mean_token_accuracy": 0.6961685500144958, "eval_biology_num_tokens": 11729645.0, "eval_biology_runtime": 48.6169, "eval_biology_samples_per_second": 10.284, "eval_biology_steps_per_second": 2.571, "step": 600 }, { "epoch": 0.96, "eval_chemistry_entropy": 0.90015394115448, "eval_chemistry_loss": 0.8946329355239868, "eval_chemistry_mean_token_accuracy": 0.7635614371299744, "eval_chemistry_num_tokens": 11729645.0, "eval_chemistry_runtime": 60.2919, "eval_chemistry_samples_per_second": 8.293, "eval_chemistry_steps_per_second": 2.073, "step": 600 }, { "epoch": 0.96, "eval_math_entropy": 0.7684455904960632, "eval_math_loss": 0.9900413751602173, "eval_math_mean_token_accuracy": 0.7588200316429138, "eval_math_num_tokens": 11729645.0, "eval_math_runtime": 61.8301, "eval_math_samples_per_second": 8.087, "eval_math_steps_per_second": 2.022, "step": 600 }, { "epoch": 0.96, "eval_physics_entropy": 0.8686938014030456, "eval_physics_loss": 0.9008635878562927, "eval_physics_mean_token_accuracy": 0.7692377109527588, "eval_physics_num_tokens": 11729645.0, "eval_physics_runtime": 70.4349, "eval_physics_samples_per_second": 7.099, "eval_physics_steps_per_second": 1.775, "step": 600 }, { "entropy": 1.2100978799164295, "epoch": 0.976, "grad_norm": 18.0, "learning_rate": 1.7711111111111113e-05, "loss": 19.47, "mean_token_accuracy": 0.6918121088296175, "num_tokens": 11924644.0, "step": 610 }, { "entropy": 1.2226450834423304, "epoch": 0.992, "grad_norm": 16.625, "learning_rate": 1.763703703703704e-05, "loss": 19.8416, "mean_token_accuracy": 0.688375661149621, "num_tokens": 12123059.0, "step": 620 }, { "entropy": 1.2316548496484756, "epoch": 1.008, "grad_norm": 16.875, "learning_rate": 1.7562962962962962e-05, "loss": 19.6116, "mean_token_accuracy": 0.6919524800032377, "num_tokens": 12319366.0, "step": 630 }, { "entropy": 1.1779099617153406, "epoch": 1.024, "grad_norm": 19.0, "learning_rate": 1.7488888888888892e-05, "loss": 18.9763, "mean_token_accuracy": 0.6978646669536829, "num_tokens": 12524183.0, "step": 640 }, { "entropy": 1.2152834441512823, "epoch": 1.04, "grad_norm": 17.75, "learning_rate": 1.7414814814814815e-05, "loss": 19.6247, "mean_token_accuracy": 0.6903412740677595, "num_tokens": 12718593.0, "step": 650 }, { "entropy": 1.1799768891185523, "epoch": 1.056, "grad_norm": 18.625, "learning_rate": 1.7340740740740742e-05, "loss": 19.0432, "mean_token_accuracy": 0.6986244544386864, "num_tokens": 12917803.0, "step": 660 }, { "entropy": 1.2108702428638936, "epoch": 1.072, "grad_norm": 19.125, "learning_rate": 1.726666666666667e-05, "loss": 19.4166, "mean_token_accuracy": 0.6927186574786901, "num_tokens": 13105826.0, "step": 670 }, { "entropy": 1.1979756511747837, "epoch": 1.088, "grad_norm": 18.875, "learning_rate": 1.7192592592592595e-05, "loss": 19.2605, "mean_token_accuracy": 0.6957505799829959, "num_tokens": 13298619.0, "step": 680 }, { "entropy": 1.192365935444832, "epoch": 1.104, "grad_norm": 17.875, "learning_rate": 1.711851851851852e-05, "loss": 19.2461, "mean_token_accuracy": 0.695810866355896, "num_tokens": 13491486.0, "step": 690 }, { "entropy": 1.212946466356516, "epoch": 1.12, "grad_norm": 19.25, "learning_rate": 1.7044444444444445e-05, "loss": 19.5004, "mean_token_accuracy": 0.692466252297163, "num_tokens": 13674663.0, "step": 700 }, { "epoch": 1.12, "eval_biology_entropy": 1.1571769905090332, "eval_biology_loss": 1.1946450471878052, "eval_biology_mean_token_accuracy": 0.6972691407203674, "eval_biology_num_tokens": 13674663.0, "eval_biology_runtime": 48.6729, "eval_biology_samples_per_second": 10.273, "eval_biology_steps_per_second": 2.568, "step": 700 }, { "epoch": 1.12, "eval_chemistry_entropy": 0.8766862626075744, "eval_chemistry_loss": 0.8891168236732483, "eval_chemistry_mean_token_accuracy": 0.7645404329299926, "eval_chemistry_num_tokens": 13674663.0, "eval_chemistry_runtime": 60.3334, "eval_chemistry_samples_per_second": 8.287, "eval_chemistry_steps_per_second": 2.072, "step": 700 }, { "epoch": 1.12, "eval_math_entropy": 0.7603865313529968, "eval_math_loss": 0.9834137558937073, "eval_math_mean_token_accuracy": 0.7596666264533997, "eval_math_num_tokens": 13674663.0, "eval_math_runtime": 61.8146, "eval_math_samples_per_second": 8.089, "eval_math_steps_per_second": 2.022, "step": 700 }, { "epoch": 1.12, "eval_physics_entropy": 0.850571988105774, "eval_physics_loss": 0.8937918543815613, "eval_physics_mean_token_accuracy": 0.7703958468437195, "eval_physics_num_tokens": 13674663.0, "eval_physics_runtime": 70.4674, "eval_physics_samples_per_second": 7.095, "eval_physics_steps_per_second": 1.774, "step": 700 }, { "entropy": 1.1909121543169021, "epoch": 1.1360000000000001, "grad_norm": 18.875, "learning_rate": 1.697037037037037e-05, "loss": 19.2462, "mean_token_accuracy": 0.6955781776458025, "num_tokens": 13869134.0, "step": 710 }, { "entropy": 1.1682380847632885, "epoch": 1.152, "grad_norm": 16.75, "learning_rate": 1.6896296296296298e-05, "loss": 18.8229, "mean_token_accuracy": 0.6991135813295841, "num_tokens": 14078365.0, "step": 720 }, { "entropy": 1.1939557407051324, "epoch": 1.168, "grad_norm": 16.875, "learning_rate": 1.6822222222222224e-05, "loss": 19.1346, "mean_token_accuracy": 0.6960698150098323, "num_tokens": 14266831.0, "step": 730 }, { "entropy": 1.180869185552001, "epoch": 1.184, "grad_norm": 18.25, "learning_rate": 1.6748148148148147e-05, "loss": 19.2654, "mean_token_accuracy": 0.6941378649324179, "num_tokens": 14465660.0, "step": 740 }, { "entropy": 1.1937656667083503, "epoch": 1.2, "grad_norm": 18.25, "learning_rate": 1.6674074074074077e-05, "loss": 19.0305, "mean_token_accuracy": 0.6964295905083417, "num_tokens": 14653228.0, "step": 750 }, { "entropy": 1.1589823190122843, "epoch": 1.216, "grad_norm": 17.875, "learning_rate": 1.66e-05, "loss": 18.6048, "mean_token_accuracy": 0.7018654596060514, "num_tokens": 14857782.0, "step": 760 }, { "entropy": 1.1703605465590954, "epoch": 1.232, "grad_norm": 17.375, "learning_rate": 1.6525925925925927e-05, "loss": 18.8831, "mean_token_accuracy": 0.7015001580119133, "num_tokens": 15047356.0, "step": 770 }, { "entropy": 1.1772115517407655, "epoch": 1.248, "grad_norm": 17.875, "learning_rate": 1.6451851851851853e-05, "loss": 19.0432, "mean_token_accuracy": 0.6959997840225697, "num_tokens": 15241098.0, "step": 780 }, { "entropy": 1.196473068371415, "epoch": 1.264, "grad_norm": 16.375, "learning_rate": 1.637777777777778e-05, "loss": 19.1591, "mean_token_accuracy": 0.6967897292226553, "num_tokens": 15437657.0, "step": 790 }, { "entropy": 1.2014197081327438, "epoch": 1.28, "grad_norm": 19.125, "learning_rate": 1.6303703703703706e-05, "loss": 19.4409, "mean_token_accuracy": 0.6926549930125475, "num_tokens": 15630795.0, "step": 800 }, { "epoch": 1.28, "eval_biology_entropy": 1.134603425502777, "eval_biology_loss": 1.1884372234344482, "eval_biology_mean_token_accuracy": 0.6986491298675537, "eval_biology_num_tokens": 15630795.0, "eval_biology_runtime": 48.6306, "eval_biology_samples_per_second": 10.282, "eval_biology_steps_per_second": 2.57, "step": 800 }, { "epoch": 1.28, "eval_chemistry_entropy": 0.8623910093307495, "eval_chemistry_loss": 0.885444700717926, "eval_chemistry_mean_token_accuracy": 0.7653528556823731, "eval_chemistry_num_tokens": 15630795.0, "eval_chemistry_runtime": 60.3508, "eval_chemistry_samples_per_second": 8.285, "eval_chemistry_steps_per_second": 2.071, "step": 800 }, { "epoch": 1.28, "eval_math_entropy": 0.7589023416042328, "eval_math_loss": 0.983073353767395, "eval_math_mean_token_accuracy": 0.7593517408370972, "eval_math_num_tokens": 15630795.0, "eval_math_runtime": 61.9026, "eval_math_samples_per_second": 8.077, "eval_math_steps_per_second": 2.019, "step": 800 }, { "epoch": 1.28, "eval_physics_entropy": 0.84130739736557, "eval_physics_loss": 0.8907755613327026, "eval_physics_mean_token_accuracy": 0.771062777519226, "eval_physics_num_tokens": 15630795.0, "eval_physics_runtime": 70.5226, "eval_physics_samples_per_second": 7.09, "eval_physics_steps_per_second": 1.772, "step": 800 }, { "entropy": 1.2085642520338298, "epoch": 1.296, "grad_norm": 17.375, "learning_rate": 1.622962962962963e-05, "loss": 19.3831, "mean_token_accuracy": 0.6933640763163567, "num_tokens": 15827105.0, "step": 810 }, { "entropy": 1.1861349143087865, "epoch": 1.312, "grad_norm": 18.75, "learning_rate": 1.6155555555555556e-05, "loss": 19.3103, "mean_token_accuracy": 0.694928414747119, "num_tokens": 16019645.0, "step": 820 }, { "entropy": 1.195632776618004, "epoch": 1.328, "grad_norm": 18.5, "learning_rate": 1.6081481481481482e-05, "loss": 19.3068, "mean_token_accuracy": 0.6934712298214436, "num_tokens": 16221726.0, "step": 830 }, { "entropy": 1.1725192748010158, "epoch": 1.3439999999999999, "grad_norm": 17.0, "learning_rate": 1.600740740740741e-05, "loss": 18.7963, "mean_token_accuracy": 0.700145885720849, "num_tokens": 16427594.0, "step": 840 }, { "entropy": 1.179823150858283, "epoch": 1.3599999999999999, "grad_norm": 18.75, "learning_rate": 1.5933333333333336e-05, "loss": 19.1154, "mean_token_accuracy": 0.6961398232728243, "num_tokens": 16621605.0, "step": 850 }, { "entropy": 1.2228495314717294, "epoch": 1.376, "grad_norm": 19.5, "learning_rate": 1.5859259259259262e-05, "loss": 19.6627, "mean_token_accuracy": 0.6894211061298847, "num_tokens": 16813444.0, "step": 860 }, { "entropy": 1.19021125882864, "epoch": 1.392, "grad_norm": 17.5, "learning_rate": 1.5785185185185185e-05, "loss": 19.2411, "mean_token_accuracy": 0.6957099426537752, "num_tokens": 17006509.0, "step": 870 }, { "entropy": 1.184871331602335, "epoch": 1.408, "grad_norm": 15.4375, "learning_rate": 1.571111111111111e-05, "loss": 18.9785, "mean_token_accuracy": 0.6971315786242485, "num_tokens": 17197870.0, "step": 880 }, { "entropy": 1.1884775411337614, "epoch": 1.424, "grad_norm": 17.25, "learning_rate": 1.5637037037037038e-05, "loss": 19.1289, "mean_token_accuracy": 0.697301234304905, "num_tokens": 17394390.0, "step": 890 }, { "entropy": 1.1825116220861673, "epoch": 1.44, "grad_norm": 20.75, "learning_rate": 1.5562962962962965e-05, "loss": 19.1266, "mean_token_accuracy": 0.6968252252787351, "num_tokens": 17587777.0, "step": 900 }, { "epoch": 1.44, "eval_biology_entropy": 1.1672434105873108, "eval_biology_loss": 1.1815813779830933, "eval_biology_mean_token_accuracy": 0.7003293070793152, "eval_biology_num_tokens": 17587777.0, "eval_biology_runtime": 48.6205, "eval_biology_samples_per_second": 10.284, "eval_biology_steps_per_second": 2.571, "step": 900 }, { "epoch": 1.44, "eval_chemistry_entropy": 0.8869817838668823, "eval_chemistry_loss": 0.8829970955848694, "eval_chemistry_mean_token_accuracy": 0.7656374487876892, "eval_chemistry_num_tokens": 17587777.0, "eval_chemistry_runtime": 60.3339, "eval_chemistry_samples_per_second": 8.287, "eval_chemistry_steps_per_second": 2.072, "step": 900 }, { "epoch": 1.44, "eval_math_entropy": 0.7674483435153961, "eval_math_loss": 0.9792445302009583, "eval_math_mean_token_accuracy": 0.7597224740982056, "eval_math_num_tokens": 17587777.0, "eval_math_runtime": 61.8239, "eval_math_samples_per_second": 8.087, "eval_math_steps_per_second": 2.022, "step": 900 }, { "epoch": 1.44, "eval_physics_entropy": 0.8621900615692139, "eval_physics_loss": 0.8884776830673218, "eval_physics_mean_token_accuracy": 0.7715646696090698, "eval_physics_num_tokens": 17587777.0, "eval_physics_runtime": 70.446, "eval_physics_samples_per_second": 7.098, "eval_physics_steps_per_second": 1.774, "step": 900 }, { "entropy": 1.1818725422024727, "epoch": 1.456, "grad_norm": 18.25, "learning_rate": 1.548888888888889e-05, "loss": 19.032, "mean_token_accuracy": 0.698200449720025, "num_tokens": 17788456.0, "step": 910 }, { "entropy": 1.1769807077944279, "epoch": 1.472, "grad_norm": 16.0, "learning_rate": 1.5414814814814814e-05, "loss": 18.8791, "mean_token_accuracy": 0.7008779179304838, "num_tokens": 17984063.0, "step": 920 }, { "entropy": 1.1641013238579034, "epoch": 1.488, "grad_norm": 18.5, "learning_rate": 1.5340740740740744e-05, "loss": 18.9913, "mean_token_accuracy": 0.6998269848525525, "num_tokens": 18175640.0, "step": 930 }, { "entropy": 1.1960251219570637, "epoch": 1.504, "grad_norm": 17.0, "learning_rate": 1.5266666666666667e-05, "loss": 19.2076, "mean_token_accuracy": 0.696322177350521, "num_tokens": 18367857.0, "step": 940 }, { "entropy": 1.1745740845799446, "epoch": 1.52, "grad_norm": 16.875, "learning_rate": 1.5192592592592594e-05, "loss": 19.0307, "mean_token_accuracy": 0.6969408400356769, "num_tokens": 18569146.0, "step": 950 }, { "entropy": 1.2008745949715376, "epoch": 1.536, "grad_norm": 18.75, "learning_rate": 1.5118518518518519e-05, "loss": 19.2895, "mean_token_accuracy": 0.6946466054767371, "num_tokens": 18755079.0, "step": 960 }, { "entropy": 1.1710849691182375, "epoch": 1.552, "grad_norm": 19.375, "learning_rate": 1.5044444444444445e-05, "loss": 18.9073, "mean_token_accuracy": 0.699705482646823, "num_tokens": 18956248.0, "step": 970 }, { "entropy": 1.163971472159028, "epoch": 1.568, "grad_norm": 18.5, "learning_rate": 1.497037037037037e-05, "loss": 18.7379, "mean_token_accuracy": 0.7023108277469874, "num_tokens": 19150315.0, "step": 980 }, { "entropy": 1.1755164857953786, "epoch": 1.584, "grad_norm": 17.75, "learning_rate": 1.4896296296296298e-05, "loss": 18.8826, "mean_token_accuracy": 0.6997408363968134, "num_tokens": 19344260.0, "step": 990 }, { "entropy": 1.2020615819841622, "epoch": 1.6, "grad_norm": 17.125, "learning_rate": 1.4822222222222225e-05, "loss": 19.3858, "mean_token_accuracy": 0.6933612376451492, "num_tokens": 19532552.0, "step": 1000 }, { "epoch": 1.6, "eval_biology_entropy": 1.1451181559562682, "eval_biology_loss": 1.1767185926437378, "eval_biology_mean_token_accuracy": 0.7013368840217591, "eval_biology_num_tokens": 19532552.0, "eval_biology_runtime": 48.6261, "eval_biology_samples_per_second": 10.283, "eval_biology_steps_per_second": 2.571, "step": 1000 }, { "epoch": 1.6, "eval_chemistry_entropy": 0.8642943887710571, "eval_chemistry_loss": 0.8798553347587585, "eval_chemistry_mean_token_accuracy": 0.7664505195617676, "eval_chemistry_num_tokens": 19532552.0, "eval_chemistry_runtime": 59.8839, "eval_chemistry_samples_per_second": 8.349, "eval_chemistry_steps_per_second": 2.087, "step": 1000 }, { "epoch": 1.6, "eval_math_entropy": 0.7490488801002503, "eval_math_loss": 0.9804874062538147, "eval_math_mean_token_accuracy": 0.7602896738052368, "eval_math_num_tokens": 19532552.0, "eval_math_runtime": 61.7317, "eval_math_samples_per_second": 8.1, "eval_math_steps_per_second": 2.025, "step": 1000 }, { "epoch": 1.6, "eval_physics_entropy": 0.8397411880493164, "eval_physics_loss": 0.885618269443512, "eval_physics_mean_token_accuracy": 0.7722613711357117, "eval_physics_num_tokens": 19532552.0, "eval_physics_runtime": 70.4574, "eval_physics_samples_per_second": 7.096, "eval_physics_steps_per_second": 1.774, "step": 1000 }, { "entropy": 1.1645312760025264, "epoch": 1.616, "grad_norm": 17.125, "learning_rate": 1.474814814814815e-05, "loss": 18.8318, "mean_token_accuracy": 0.7000049009919167, "num_tokens": 19732719.0, "step": 1010 }, { "entropy": 1.2018159918487072, "epoch": 1.6320000000000001, "grad_norm": 19.625, "learning_rate": 1.4674074074074076e-05, "loss": 19.3741, "mean_token_accuracy": 0.6942509710788727, "num_tokens": 19926830.0, "step": 1020 }, { "entropy": 1.1848741736263038, "epoch": 1.6480000000000001, "grad_norm": 16.25, "learning_rate": 1.46e-05, "loss": 19.0931, "mean_token_accuracy": 0.6962833561003208, "num_tokens": 20118800.0, "step": 1030 }, { "entropy": 1.1461675189435483, "epoch": 1.6640000000000001, "grad_norm": 16.5, "learning_rate": 1.4525925925925927e-05, "loss": 18.5384, "mean_token_accuracy": 0.7037994157522917, "num_tokens": 20320511.0, "step": 1040 }, { "entropy": 1.1853893544524907, "epoch": 1.6800000000000002, "grad_norm": 18.375, "learning_rate": 1.4451851851851852e-05, "loss": 18.9769, "mean_token_accuracy": 0.6986714884638786, "num_tokens": 20513393.0, "step": 1050 }, { "entropy": 1.1721675164997578, "epoch": 1.696, "grad_norm": 17.5, "learning_rate": 1.4377777777777779e-05, "loss": 18.9508, "mean_token_accuracy": 0.7003318756818772, "num_tokens": 20707237.0, "step": 1060 }, { "entropy": 1.1738170266151429, "epoch": 1.712, "grad_norm": 19.5, "learning_rate": 1.4303703703703703e-05, "loss": 18.9752, "mean_token_accuracy": 0.6993441980332136, "num_tokens": 20910419.0, "step": 1070 }, { "entropy": 1.190333865955472, "epoch": 1.728, "grad_norm": 18.5, "learning_rate": 1.4229629629629632e-05, "loss": 19.1838, "mean_token_accuracy": 0.6956166718155146, "num_tokens": 21107498.0, "step": 1080 }, { "entropy": 1.1574083410203457, "epoch": 1.744, "grad_norm": 18.25, "learning_rate": 1.4155555555555556e-05, "loss": 18.5378, "mean_token_accuracy": 0.7021385233849287, "num_tokens": 21303955.0, "step": 1090 }, { "entropy": 1.1666433937847613, "epoch": 1.76, "grad_norm": 18.75, "learning_rate": 1.4081481481481483e-05, "loss": 18.9266, "mean_token_accuracy": 0.7011374026536942, "num_tokens": 21499572.0, "step": 1100 }, { "epoch": 1.76, "eval_biology_entropy": 1.1345319437980652, "eval_biology_loss": 1.1725776195526123, "eval_biology_mean_token_accuracy": 0.7024298944473266, "eval_biology_num_tokens": 21499572.0, "eval_biology_runtime": 48.5727, "eval_biology_samples_per_second": 10.294, "eval_biology_steps_per_second": 2.573, "step": 1100 }, { "epoch": 1.76, "eval_chemistry_entropy": 0.8619537029266358, "eval_chemistry_loss": 0.8766760230064392, "eval_chemistry_mean_token_accuracy": 0.7667445015907287, "eval_chemistry_num_tokens": 21499572.0, "eval_chemistry_runtime": 60.4564, "eval_chemistry_samples_per_second": 8.27, "eval_chemistry_steps_per_second": 2.068, "step": 1100 }, { "epoch": 1.76, "eval_math_entropy": 0.7560438480377197, "eval_math_loss": 0.9768902063369751, "eval_math_mean_token_accuracy": 0.7610512175559998, "eval_math_num_tokens": 21499572.0, "eval_math_runtime": 61.7554, "eval_math_samples_per_second": 8.096, "eval_math_steps_per_second": 2.024, "step": 1100 }, { "epoch": 1.76, "eval_physics_entropy": 0.840686586856842, "eval_physics_loss": 0.8825888633728027, "eval_physics_mean_token_accuracy": 0.7726316556930543, "eval_physics_num_tokens": 21499572.0, "eval_physics_runtime": 70.3673, "eval_physics_samples_per_second": 7.106, "eval_physics_steps_per_second": 1.776, "step": 1100 }, { "entropy": 1.177320409566164, "epoch": 1.776, "grad_norm": 17.875, "learning_rate": 1.400740740740741e-05, "loss": 18.9676, "mean_token_accuracy": 0.6986722864210606, "num_tokens": 21692804.0, "step": 1110 }, { "entropy": 1.148191200569272, "epoch": 1.792, "grad_norm": 19.875, "learning_rate": 1.3933333333333334e-05, "loss": 18.5112, "mean_token_accuracy": 0.7040422059595585, "num_tokens": 21894218.0, "step": 1120 }, { "entropy": 1.1709640648216009, "epoch": 1.808, "grad_norm": 19.5, "learning_rate": 1.385925925925926e-05, "loss": 18.8424, "mean_token_accuracy": 0.7009096905589104, "num_tokens": 22082522.0, "step": 1130 }, { "entropy": 1.1934551119804382, "epoch": 1.8239999999999998, "grad_norm": 19.375, "learning_rate": 1.3785185185185186e-05, "loss": 19.3396, "mean_token_accuracy": 0.693663826212287, "num_tokens": 22278933.0, "step": 1140 }, { "entropy": 1.2042058877646924, "epoch": 1.8399999999999999, "grad_norm": 19.375, "learning_rate": 1.3711111111111112e-05, "loss": 19.393, "mean_token_accuracy": 0.6937405589967967, "num_tokens": 22473801.0, "step": 1150 }, { "entropy": 1.1648973379284144, "epoch": 1.8559999999999999, "grad_norm": 18.125, "learning_rate": 1.3637037037037037e-05, "loss": 18.7484, "mean_token_accuracy": 0.6991217479109764, "num_tokens": 22677853.0, "step": 1160 }, { "entropy": 1.1554684847593308, "epoch": 1.8719999999999999, "grad_norm": 17.375, "learning_rate": 1.3562962962962965e-05, "loss": 18.5305, "mean_token_accuracy": 0.7038368381559849, "num_tokens": 22874965.0, "step": 1170 }, { "entropy": 1.1899018451571464, "epoch": 1.888, "grad_norm": 20.75, "learning_rate": 1.3488888888888888e-05, "loss": 19.2428, "mean_token_accuracy": 0.6950660139322281, "num_tokens": 23068892.0, "step": 1180 }, { "entropy": 1.1851763129234314, "epoch": 1.904, "grad_norm": 18.375, "learning_rate": 1.3414814814814817e-05, "loss": 19.126, "mean_token_accuracy": 0.6976218212395906, "num_tokens": 23263827.0, "step": 1190 }, { "entropy": 1.1869227845221757, "epoch": 1.92, "grad_norm": 18.375, "learning_rate": 1.3340740740740741e-05, "loss": 19.135, "mean_token_accuracy": 0.6959322843700647, "num_tokens": 23463627.0, "step": 1200 }, { "epoch": 1.92, "eval_biology_entropy": 1.1337207446098327, "eval_biology_loss": 1.168716311454773, "eval_biology_mean_token_accuracy": 0.7030426645278931, "eval_biology_num_tokens": 23463627.0, "eval_biology_runtime": 48.5387, "eval_biology_samples_per_second": 10.301, "eval_biology_steps_per_second": 2.575, "step": 1200 }, { "epoch": 1.92, "eval_chemistry_entropy": 0.8601148505210876, "eval_chemistry_loss": 0.8744351267814636, "eval_chemistry_mean_token_accuracy": 0.767286482334137, "eval_chemistry_num_tokens": 23463627.0, "eval_chemistry_runtime": 60.2148, "eval_chemistry_samples_per_second": 8.304, "eval_chemistry_steps_per_second": 2.076, "step": 1200 }, { "epoch": 1.92, "eval_math_entropy": 0.7519172282218933, "eval_math_loss": 0.9753768444061279, "eval_math_mean_token_accuracy": 0.7615019774436951, "eval_math_num_tokens": 23463627.0, "eval_math_runtime": 61.7169, "eval_math_samples_per_second": 8.102, "eval_math_steps_per_second": 2.025, "step": 1200 }, { "epoch": 1.92, "eval_physics_entropy": 0.836491331577301, "eval_physics_loss": 0.8801184296607971, "eval_physics_mean_token_accuracy": 0.7731836423873901, "eval_physics_num_tokens": 23463627.0, "eval_physics_runtime": 70.3444, "eval_physics_samples_per_second": 7.108, "eval_physics_steps_per_second": 1.777, "step": 1200 }, { "entropy": 1.1517000958323478, "epoch": 1.936, "grad_norm": 18.875, "learning_rate": 1.3266666666666668e-05, "loss": 18.5112, "mean_token_accuracy": 0.7040315445512533, "num_tokens": 23660418.0, "step": 1210 }, { "entropy": 1.172454860061407, "epoch": 1.952, "grad_norm": 18.75, "learning_rate": 1.3192592592592594e-05, "loss": 18.8708, "mean_token_accuracy": 0.6998139064759016, "num_tokens": 23858145.0, "step": 1220 }, { "entropy": 1.18152665682137, "epoch": 1.968, "grad_norm": 18.5, "learning_rate": 1.311851851851852e-05, "loss": 19.0753, "mean_token_accuracy": 0.6981570664793253, "num_tokens": 24053364.0, "step": 1230 }, { "entropy": 1.1768125779926777, "epoch": 1.984, "grad_norm": 17.625, "learning_rate": 1.3044444444444446e-05, "loss": 18.9463, "mean_token_accuracy": 0.6988612022250891, "num_tokens": 24249465.0, "step": 1240 }, { "entropy": 1.180902672186494, "epoch": 2.0, "grad_norm": 19.375, "learning_rate": 1.297037037037037e-05, "loss": 18.999, "mean_token_accuracy": 0.7006669268012047, "num_tokens": 24442582.0, "step": 1250 }, { "entropy": 1.1616276282817126, "epoch": 2.016, "grad_norm": 19.75, "learning_rate": 1.2896296296296299e-05, "loss": 18.592, "mean_token_accuracy": 0.7041712146252394, "num_tokens": 24632353.0, "step": 1260 }, { "entropy": 1.1689807120710611, "epoch": 2.032, "grad_norm": 18.625, "learning_rate": 1.2822222222222222e-05, "loss": 18.9039, "mean_token_accuracy": 0.7014766734093427, "num_tokens": 24822715.0, "step": 1270 }, { "entropy": 1.1456513587385415, "epoch": 2.048, "grad_norm": 20.0, "learning_rate": 1.274814814814815e-05, "loss": 18.4096, "mean_token_accuracy": 0.7051771484315396, "num_tokens": 25023118.0, "step": 1280 }, { "entropy": 1.1587952699512243, "epoch": 2.064, "grad_norm": 18.625, "learning_rate": 1.2674074074074075e-05, "loss": 18.6378, "mean_token_accuracy": 0.7034870360046626, "num_tokens": 25217414.0, "step": 1290 }, { "entropy": 1.1504007514566184, "epoch": 2.08, "grad_norm": 18.0, "learning_rate": 1.2600000000000001e-05, "loss": 18.4703, "mean_token_accuracy": 0.7041630525141954, "num_tokens": 25408961.0, "step": 1300 }, { "epoch": 2.08, "eval_biology_entropy": 1.1164145894050599, "eval_biology_loss": 1.1683411598205566, "eval_biology_mean_token_accuracy": 0.7027085943222046, "eval_biology_num_tokens": 25408961.0, "eval_biology_runtime": 48.5837, "eval_biology_samples_per_second": 10.292, "eval_biology_steps_per_second": 2.573, "step": 1300 }, { "epoch": 2.08, "eval_chemistry_entropy": 0.8446620798110962, "eval_chemistry_loss": 0.8756071925163269, "eval_chemistry_mean_token_accuracy": 0.7672800846099853, "eval_chemistry_num_tokens": 25408961.0, "eval_chemistry_runtime": 60.224, "eval_chemistry_samples_per_second": 8.302, "eval_chemistry_steps_per_second": 2.076, "step": 1300 }, { "epoch": 2.08, "eval_math_entropy": 0.7416602709293365, "eval_math_loss": 0.9767736792564392, "eval_math_mean_token_accuracy": 0.7612695918083191, "eval_math_num_tokens": 25408961.0, "eval_math_runtime": 61.7354, "eval_math_samples_per_second": 8.099, "eval_math_steps_per_second": 2.025, "step": 1300 }, { "epoch": 2.08, "eval_physics_entropy": 0.822511604309082, "eval_physics_loss": 0.8807807564735413, "eval_physics_mean_token_accuracy": 0.7734324297904969, "eval_physics_num_tokens": 25408961.0, "eval_physics_runtime": 70.3722, "eval_physics_samples_per_second": 7.105, "eval_physics_steps_per_second": 1.776, "step": 1300 }, { "entropy": 1.145626274123788, "epoch": 2.096, "grad_norm": 20.75, "learning_rate": 1.2525925925925928e-05, "loss": 18.4669, "mean_token_accuracy": 0.7053351275622844, "num_tokens": 25600511.0, "step": 1310 }, { "entropy": 1.1261590894311666, "epoch": 2.112, "grad_norm": 19.375, "learning_rate": 1.2451851851851853e-05, "loss": 18.0421, "mean_token_accuracy": 0.7101508729159832, "num_tokens": 25796565.0, "step": 1320 }, { "entropy": 1.1319866240024568, "epoch": 2.128, "grad_norm": 18.375, "learning_rate": 1.237777777777778e-05, "loss": 18.2326, "mean_token_accuracy": 0.7063661482185125, "num_tokens": 25991156.0, "step": 1330 }, { "entropy": 1.12694109082222, "epoch": 2.144, "grad_norm": 21.625, "learning_rate": 1.2303703703703704e-05, "loss": 18.27, "mean_token_accuracy": 0.7078616585582495, "num_tokens": 26193237.0, "step": 1340 }, { "entropy": 1.1670064296573401, "epoch": 2.16, "grad_norm": 20.0, "learning_rate": 1.222962962962963e-05, "loss": 18.6321, "mean_token_accuracy": 0.704596522077918, "num_tokens": 26387993.0, "step": 1350 }, { "entropy": 1.1611683428287507, "epoch": 2.176, "grad_norm": 19.75, "learning_rate": 1.2155555555555555e-05, "loss": 18.8084, "mean_token_accuracy": 0.7007863517850638, "num_tokens": 26585269.0, "step": 1360 }, { "entropy": 1.1334992978721856, "epoch": 2.192, "grad_norm": 17.0, "learning_rate": 1.2081481481481484e-05, "loss": 18.1002, "mean_token_accuracy": 0.7102227192372084, "num_tokens": 26776318.0, "step": 1370 }, { "entropy": 1.142113695293665, "epoch": 2.208, "grad_norm": 18.25, "learning_rate": 1.2007407407407408e-05, "loss": 18.4288, "mean_token_accuracy": 0.7056139782071114, "num_tokens": 26974420.0, "step": 1380 }, { "entropy": 1.165258849412203, "epoch": 2.224, "grad_norm": 18.375, "learning_rate": 1.1933333333333335e-05, "loss": 18.8072, "mean_token_accuracy": 0.7021366007626056, "num_tokens": 27167577.0, "step": 1390 }, { "entropy": 1.1230817057192326, "epoch": 2.24, "grad_norm": 18.0, "learning_rate": 1.185925925925926e-05, "loss": 18.0154, "mean_token_accuracy": 0.7094326011836529, "num_tokens": 27364189.0, "step": 1400 }, { "epoch": 2.24, "eval_biology_entropy": 1.107084683418274, "eval_biology_loss": 1.166341781616211, "eval_biology_mean_token_accuracy": 0.7033038935661315, "eval_biology_num_tokens": 27364189.0, "eval_biology_runtime": 48.5794, "eval_biology_samples_per_second": 10.292, "eval_biology_steps_per_second": 2.573, "step": 1400 }, { "epoch": 2.24, "eval_chemistry_entropy": 0.8436218018531799, "eval_chemistry_loss": 0.8749056458473206, "eval_chemistry_mean_token_accuracy": 0.7673236474990844, "eval_chemistry_num_tokens": 27364189.0, "eval_chemistry_runtime": 60.2275, "eval_chemistry_samples_per_second": 8.302, "eval_chemistry_steps_per_second": 2.075, "step": 1400 }, { "epoch": 2.24, "eval_math_entropy": 0.7436552357673645, "eval_math_loss": 0.9768530130386353, "eval_math_mean_token_accuracy": 0.7615942449569703, "eval_math_num_tokens": 27364189.0, "eval_math_runtime": 61.7353, "eval_math_samples_per_second": 8.099, "eval_math_steps_per_second": 2.025, "step": 1400 }, { "epoch": 2.24, "eval_physics_entropy": 0.8238044924736023, "eval_physics_loss": 0.880571186542511, "eval_physics_mean_token_accuracy": 0.7732309465408325, "eval_physics_num_tokens": 27364189.0, "eval_physics_runtime": 70.3425, "eval_physics_samples_per_second": 7.108, "eval_physics_steps_per_second": 1.777, "step": 1400 }, { "entropy": 1.1227998584508896, "epoch": 2.2560000000000002, "grad_norm": 17.875, "learning_rate": 1.1785185185185186e-05, "loss": 17.9995, "mean_token_accuracy": 0.71092384532094, "num_tokens": 27557387.0, "step": 1410 }, { "entropy": 1.1192971892654895, "epoch": 2.2720000000000002, "grad_norm": 19.5, "learning_rate": 1.1711111111111113e-05, "loss": 18.0703, "mean_token_accuracy": 0.7088606022298336, "num_tokens": 27755725.0, "step": 1420 }, { "entropy": 1.157552171498537, "epoch": 2.288, "grad_norm": 18.75, "learning_rate": 1.1637037037037037e-05, "loss": 18.6818, "mean_token_accuracy": 0.7010403741151094, "num_tokens": 27950694.0, "step": 1430 }, { "entropy": 1.1524705573916436, "epoch": 2.304, "grad_norm": 17.75, "learning_rate": 1.1562962962962964e-05, "loss": 18.6601, "mean_token_accuracy": 0.7030924465507269, "num_tokens": 28150719.0, "step": 1440 }, { "entropy": 1.1215086288750171, "epoch": 2.32, "grad_norm": 17.5, "learning_rate": 1.1488888888888889e-05, "loss": 17.9268, "mean_token_accuracy": 0.7104179698973894, "num_tokens": 28348652.0, "step": 1450 }, { "entropy": 1.1350885152816772, "epoch": 2.336, "grad_norm": 18.375, "learning_rate": 1.1414814814814817e-05, "loss": 18.3565, "mean_token_accuracy": 0.7067790202796459, "num_tokens": 28542945.0, "step": 1460 }, { "entropy": 1.156265541538596, "epoch": 2.352, "grad_norm": 19.0, "learning_rate": 1.1340740740740742e-05, "loss": 18.6992, "mean_token_accuracy": 0.7013768840581178, "num_tokens": 28731927.0, "step": 1470 }, { "entropy": 1.1396565582603215, "epoch": 2.368, "grad_norm": 18.75, "learning_rate": 1.1266666666666668e-05, "loss": 18.2949, "mean_token_accuracy": 0.7059750188142061, "num_tokens": 28929298.0, "step": 1480 }, { "entropy": 1.1448044694960118, "epoch": 2.384, "grad_norm": 20.625, "learning_rate": 1.1192592592592593e-05, "loss": 18.526, "mean_token_accuracy": 0.703592960909009, "num_tokens": 29121142.0, "step": 1490 }, { "entropy": 1.140185246989131, "epoch": 2.4, "grad_norm": 18.0, "learning_rate": 1.111851851851852e-05, "loss": 18.3273, "mean_token_accuracy": 0.706566022336483, "num_tokens": 29317919.0, "step": 1500 }, { "epoch": 2.4, "eval_biology_entropy": 1.1156310276985169, "eval_biology_loss": 1.1647558212280273, "eval_biology_mean_token_accuracy": 0.7032402672767639, "eval_biology_num_tokens": 29317919.0, "eval_biology_runtime": 48.611, "eval_biology_samples_per_second": 10.286, "eval_biology_steps_per_second": 2.571, "step": 1500 }, { "epoch": 2.4, "eval_chemistry_entropy": 0.8463425951004029, "eval_chemistry_loss": 0.8742334246635437, "eval_chemistry_mean_token_accuracy": 0.767361388683319, "eval_chemistry_num_tokens": 29317919.0, "eval_chemistry_runtime": 59.9689, "eval_chemistry_samples_per_second": 8.338, "eval_chemistry_steps_per_second": 2.084, "step": 1500 }, { "epoch": 2.4, "eval_math_entropy": 0.7472673971652984, "eval_math_loss": 0.9761422872543335, "eval_math_mean_token_accuracy": 0.7615765709877014, "eval_math_num_tokens": 29317919.0, "eval_math_runtime": 61.7442, "eval_math_samples_per_second": 8.098, "eval_math_steps_per_second": 2.024, "step": 1500 }, { "epoch": 2.4, "eval_physics_entropy": 0.8270321841239929, "eval_physics_loss": 0.8801943063735962, "eval_physics_mean_token_accuracy": 0.7734404511451721, "eval_physics_num_tokens": 29317919.0, "eval_physics_runtime": 70.5165, "eval_physics_samples_per_second": 7.091, "eval_physics_steps_per_second": 1.773, "step": 1500 }, { "entropy": 1.1317574352025985, "epoch": 2.416, "grad_norm": 19.625, "learning_rate": 1.1044444444444444e-05, "loss": 18.3109, "mean_token_accuracy": 0.7062701795250177, "num_tokens": 29518541.0, "step": 1510 }, { "entropy": 1.130976415425539, "epoch": 2.432, "grad_norm": 20.25, "learning_rate": 1.0970370370370371e-05, "loss": 18.1642, "mean_token_accuracy": 0.7086035583168269, "num_tokens": 29720768.0, "step": 1520 }, { "entropy": 1.1605936624109745, "epoch": 2.448, "grad_norm": 20.5, "learning_rate": 1.0896296296296298e-05, "loss": 18.7972, "mean_token_accuracy": 0.7010477486997843, "num_tokens": 29916619.0, "step": 1530 }, { "entropy": 1.1125331491231918, "epoch": 2.464, "grad_norm": 20.375, "learning_rate": 1.0822222222222222e-05, "loss": 17.8199, "mean_token_accuracy": 0.7112086053937674, "num_tokens": 30121198.0, "step": 1540 }, { "entropy": 1.1271852746605873, "epoch": 2.48, "grad_norm": 18.5, "learning_rate": 1.074814814814815e-05, "loss": 18.1447, "mean_token_accuracy": 0.707981801405549, "num_tokens": 30325508.0, "step": 1550 }, { "entropy": 1.1350488025695085, "epoch": 2.496, "grad_norm": 17.875, "learning_rate": 1.0674074074074074e-05, "loss": 18.3331, "mean_token_accuracy": 0.706438259780407, "num_tokens": 30517445.0, "step": 1560 }, { "entropy": 1.149303700402379, "epoch": 2.512, "grad_norm": 21.25, "learning_rate": 1.0600000000000002e-05, "loss": 18.5415, "mean_token_accuracy": 0.7031121108680963, "num_tokens": 30714049.0, "step": 1570 }, { "entropy": 1.134909725189209, "epoch": 2.528, "grad_norm": 19.0, "learning_rate": 1.0525925925925927e-05, "loss": 18.3422, "mean_token_accuracy": 0.7059885617345572, "num_tokens": 30911317.0, "step": 1580 }, { "entropy": 1.1456572752445937, "epoch": 2.544, "grad_norm": 17.875, "learning_rate": 1.0451851851851853e-05, "loss": 18.3318, "mean_token_accuracy": 0.70671008490026, "num_tokens": 31108983.0, "step": 1590 }, { "entropy": 1.1408224642276763, "epoch": 2.56, "grad_norm": 18.875, "learning_rate": 1.0377777777777778e-05, "loss": 18.441, "mean_token_accuracy": 0.705613837391138, "num_tokens": 31298994.0, "step": 1600 }, { "epoch": 2.56, "eval_biology_entropy": 1.1095367937088012, "eval_biology_loss": 1.1631091833114624, "eval_biology_mean_token_accuracy": 0.7037216110229492, "eval_biology_num_tokens": 31298994.0, "eval_biology_runtime": 48.5574, "eval_biology_samples_per_second": 10.297, "eval_biology_steps_per_second": 2.574, "step": 1600 }, { "epoch": 2.56, "eval_chemistry_entropy": 0.842681580543518, "eval_chemistry_loss": 0.8729196786880493, "eval_chemistry_mean_token_accuracy": 0.7677588958740235, "eval_chemistry_num_tokens": 31298994.0, "eval_chemistry_runtime": 60.3125, "eval_chemistry_samples_per_second": 8.29, "eval_chemistry_steps_per_second": 2.073, "step": 1600 }, { "epoch": 2.56, "eval_math_entropy": 0.7447464742660522, "eval_math_loss": 0.9752342700958252, "eval_math_mean_token_accuracy": 0.7617981548309326, "eval_math_num_tokens": 31298994.0, "eval_math_runtime": 61.7884, "eval_math_samples_per_second": 8.092, "eval_math_steps_per_second": 2.023, "step": 1600 }, { "epoch": 2.56, "eval_physics_entropy": 0.8225784077644348, "eval_physics_loss": 0.8789661526679993, "eval_physics_mean_token_accuracy": 0.7737646398544311, "eval_physics_num_tokens": 31298994.0, "eval_physics_runtime": 70.3931, "eval_physics_samples_per_second": 7.103, "eval_physics_steps_per_second": 1.776, "step": 1600 }, { "entropy": 1.152071548998356, "epoch": 2.576, "grad_norm": 18.5, "learning_rate": 1.0303703703703705e-05, "loss": 18.5693, "mean_token_accuracy": 0.7044297493994236, "num_tokens": 31488512.0, "step": 1610 }, { "entropy": 1.1696349333971738, "epoch": 2.592, "grad_norm": 19.0, "learning_rate": 1.0229629629629631e-05, "loss": 18.8544, "mean_token_accuracy": 0.7005049493163824, "num_tokens": 31684651.0, "step": 1620 }, { "entropy": 1.1627265084534883, "epoch": 2.608, "grad_norm": 19.0, "learning_rate": 1.0155555555555556e-05, "loss": 18.6882, "mean_token_accuracy": 0.7010684039443731, "num_tokens": 31876008.0, "step": 1630 }, { "entropy": 1.130651018768549, "epoch": 2.624, "grad_norm": 19.375, "learning_rate": 1.0081481481481484e-05, "loss": 18.1415, "mean_token_accuracy": 0.7068800464272499, "num_tokens": 32069617.0, "step": 1640 }, { "entropy": 1.1576575651764869, "epoch": 2.64, "grad_norm": 18.25, "learning_rate": 1.0007407407407407e-05, "loss": 18.7397, "mean_token_accuracy": 0.700667466595769, "num_tokens": 32262932.0, "step": 1650 }, { "entropy": 1.1208720214664936, "epoch": 2.656, "grad_norm": 18.625, "learning_rate": 9.933333333333334e-06, "loss": 17.9602, "mean_token_accuracy": 0.7097393788397313, "num_tokens": 32465495.0, "step": 1660 }, { "entropy": 1.1169210582971574, "epoch": 2.672, "grad_norm": 18.5, "learning_rate": 9.85925925925926e-06, "loss": 18.0691, "mean_token_accuracy": 0.7105248533189297, "num_tokens": 32665458.0, "step": 1670 }, { "entropy": 1.1241317071020602, "epoch": 2.6879999999999997, "grad_norm": 18.25, "learning_rate": 9.785185185185187e-06, "loss": 18.1474, "mean_token_accuracy": 0.7082856688648462, "num_tokens": 32867135.0, "step": 1680 }, { "entropy": 1.1339986488223075, "epoch": 2.7039999999999997, "grad_norm": 18.75, "learning_rate": 9.711111111111111e-06, "loss": 18.2237, "mean_token_accuracy": 0.7075312845408916, "num_tokens": 33070366.0, "step": 1690 }, { "entropy": 1.1257793482393026, "epoch": 2.7199999999999998, "grad_norm": 21.0, "learning_rate": 9.637037037037038e-06, "loss": 18.1537, "mean_token_accuracy": 0.707587756216526, "num_tokens": 33269122.0, "step": 1700 }, { "epoch": 2.7199999999999998, "eval_biology_entropy": 1.1073571362495422, "eval_biology_loss": 1.161886215209961, "eval_biology_mean_token_accuracy": 0.7041741323471069, "eval_biology_num_tokens": 33269122.0, "eval_biology_runtime": 48.635, "eval_biology_samples_per_second": 10.281, "eval_biology_steps_per_second": 2.57, "step": 1700 }, { "epoch": 2.7199999999999998, "eval_chemistry_entropy": 0.8408446173667907, "eval_chemistry_loss": 0.8714523911476135, "eval_chemistry_mean_token_accuracy": 0.7681687192916871, "eval_chemistry_num_tokens": 33269122.0, "eval_chemistry_runtime": 60.3526, "eval_chemistry_samples_per_second": 8.285, "eval_chemistry_steps_per_second": 2.071, "step": 1700 }, { "epoch": 2.7199999999999998, "eval_math_entropy": 0.74438600897789, "eval_math_loss": 0.9744483828544617, "eval_math_mean_token_accuracy": 0.7620029444694519, "eval_math_num_tokens": 33269122.0, "eval_math_runtime": 61.8511, "eval_math_samples_per_second": 8.084, "eval_math_steps_per_second": 2.021, "step": 1700 }, { "epoch": 2.7199999999999998, "eval_physics_entropy": 0.8209939393997192, "eval_physics_loss": 0.8775029182434082, "eval_physics_mean_token_accuracy": 0.7741135511398315, "eval_physics_num_tokens": 33269122.0, "eval_physics_runtime": 70.4692, "eval_physics_samples_per_second": 7.095, "eval_physics_steps_per_second": 1.774, "step": 1700 }, { "entropy": 1.1649049088358878, "epoch": 2.7359999999999998, "grad_norm": 18.0, "learning_rate": 9.562962962962965e-06, "loss": 18.7293, "mean_token_accuracy": 0.7025999147444963, "num_tokens": 33456592.0, "step": 1710 }, { "entropy": 1.15311808437109, "epoch": 2.752, "grad_norm": 19.25, "learning_rate": 9.48888888888889e-06, "loss": 18.4325, "mean_token_accuracy": 0.7036401994526387, "num_tokens": 33645862.0, "step": 1720 }, { "entropy": 1.1674226205796003, "epoch": 2.768, "grad_norm": 18.75, "learning_rate": 9.414814814814816e-06, "loss": 18.8481, "mean_token_accuracy": 0.699421489983797, "num_tokens": 33835740.0, "step": 1730 }, { "entropy": 1.1250699553638697, "epoch": 2.784, "grad_norm": 18.0, "learning_rate": 9.34074074074074e-06, "loss": 18.1108, "mean_token_accuracy": 0.7099443785846233, "num_tokens": 34034702.0, "step": 1740 }, { "entropy": 1.1213359594345094, "epoch": 2.8, "grad_norm": 17.875, "learning_rate": 9.266666666666667e-06, "loss": 18.1123, "mean_token_accuracy": 0.7095074690878391, "num_tokens": 34235603.0, "step": 1750 }, { "entropy": 1.1317555967718362, "epoch": 2.816, "grad_norm": 19.625, "learning_rate": 9.192592592592594e-06, "loss": 18.2156, "mean_token_accuracy": 0.7085574407130479, "num_tokens": 34434833.0, "step": 1760 }, { "entropy": 1.1643431086093188, "epoch": 2.832, "grad_norm": 19.875, "learning_rate": 9.118518518518518e-06, "loss": 18.7866, "mean_token_accuracy": 0.702312757447362, "num_tokens": 34625581.0, "step": 1770 }, { "entropy": 1.1494974169880152, "epoch": 2.848, "grad_norm": 20.375, "learning_rate": 9.044444444444445e-06, "loss": 18.5235, "mean_token_accuracy": 0.705839891731739, "num_tokens": 34816521.0, "step": 1780 }, { "entropy": 1.1289043568074704, "epoch": 2.864, "grad_norm": 19.125, "learning_rate": 8.970370370370372e-06, "loss": 18.2474, "mean_token_accuracy": 0.7077045034617185, "num_tokens": 35013631.0, "step": 1790 }, { "entropy": 1.1608365170657635, "epoch": 2.88, "grad_norm": 20.75, "learning_rate": 8.896296296296298e-06, "loss": 18.651, "mean_token_accuracy": 0.7021035224199295, "num_tokens": 35207800.0, "step": 1800 }, { "epoch": 2.88, "eval_biology_entropy": 1.1227384514808654, "eval_biology_loss": 1.1606189012527466, "eval_biology_mean_token_accuracy": 0.7041417050361634, "eval_biology_num_tokens": 35207800.0, "eval_biology_runtime": 48.5521, "eval_biology_samples_per_second": 10.298, "eval_biology_steps_per_second": 2.575, "step": 1800 }, { "epoch": 2.88, "eval_chemistry_entropy": 0.8532550582885742, "eval_chemistry_loss": 0.8718345761299133, "eval_chemistry_mean_token_accuracy": 0.7678817505836487, "eval_chemistry_num_tokens": 35207800.0, "eval_chemistry_runtime": 60.2048, "eval_chemistry_samples_per_second": 8.305, "eval_chemistry_steps_per_second": 2.076, "step": 1800 }, { "epoch": 2.88, "eval_math_entropy": 0.7495931763648986, "eval_math_loss": 0.9741966724395752, "eval_math_mean_token_accuracy": 0.7619234776496887, "eval_math_num_tokens": 35207800.0, "eval_math_runtime": 61.7637, "eval_math_samples_per_second": 8.095, "eval_math_steps_per_second": 2.024, "step": 1800 }, { "epoch": 2.88, "eval_physics_entropy": 0.8320713305473327, "eval_physics_loss": 0.8780032992362976, "eval_physics_mean_token_accuracy": 0.7738002591133117, "eval_physics_num_tokens": 35207800.0, "eval_physics_runtime": 70.3744, "eval_physics_samples_per_second": 7.105, "eval_physics_steps_per_second": 1.776, "step": 1800 }, { "entropy": 1.1618830259889363, "epoch": 2.896, "grad_norm": 19.0, "learning_rate": 8.822222222222223e-06, "loss": 18.4927, "mean_token_accuracy": 0.7027845978736877, "num_tokens": 35402995.0, "step": 1810 }, { "entropy": 1.153843991830945, "epoch": 2.912, "grad_norm": 19.375, "learning_rate": 8.74814814814815e-06, "loss": 18.7586, "mean_token_accuracy": 0.7019136741757392, "num_tokens": 35595456.0, "step": 1820 }, { "entropy": 1.1577350933104753, "epoch": 2.928, "grad_norm": 19.5, "learning_rate": 8.674074074074074e-06, "loss": 18.625, "mean_token_accuracy": 0.703600461781025, "num_tokens": 35787309.0, "step": 1830 }, { "entropy": 1.140915045887232, "epoch": 2.944, "grad_norm": 17.875, "learning_rate": 8.6e-06, "loss": 18.3324, "mean_token_accuracy": 0.7066537465900182, "num_tokens": 35982686.0, "step": 1840 }, { "entropy": 1.1303145423531533, "epoch": 2.96, "grad_norm": 19.5, "learning_rate": 8.525925925925927e-06, "loss": 18.2798, "mean_token_accuracy": 0.7072657477110624, "num_tokens": 36179520.0, "step": 1850 }, { "entropy": 1.1355241533368825, "epoch": 2.976, "grad_norm": 18.625, "learning_rate": 8.451851851851852e-06, "loss": 18.3476, "mean_token_accuracy": 0.7064431738108397, "num_tokens": 36373689.0, "step": 1860 }, { "entropy": 1.1398762241005898, "epoch": 2.992, "grad_norm": 22.375, "learning_rate": 8.377777777777779e-06, "loss": 18.4292, "mean_token_accuracy": 0.7060572128742933, "num_tokens": 36569612.0, "step": 1870 }, { "entropy": 1.1687027130275964, "epoch": 3.008, "grad_norm": 19.875, "learning_rate": 8.303703703703705e-06, "loss": 18.6648, "mean_token_accuracy": 0.7008682768791914, "num_tokens": 36759194.0, "step": 1880 }, { "entropy": 1.1632583592087031, "epoch": 3.024, "grad_norm": 18.625, "learning_rate": 8.229629629629632e-06, "loss": 18.7022, "mean_token_accuracy": 0.7013984300196171, "num_tokens": 36947293.0, "step": 1890 }, { "entropy": 1.1147421635687351, "epoch": 3.04, "grad_norm": 19.75, "learning_rate": 8.155555555555556e-06, "loss": 18.0582, "mean_token_accuracy": 0.7098672483116388, "num_tokens": 37136901.0, "step": 1900 }, { "epoch": 3.04, "eval_biology_entropy": 1.1026909346580505, "eval_biology_loss": 1.1605921983718872, "eval_biology_mean_token_accuracy": 0.7041417622566223, "eval_biology_num_tokens": 37136901.0, "eval_biology_runtime": 48.5111, "eval_biology_samples_per_second": 10.307, "eval_biology_steps_per_second": 2.577, "step": 1900 }, { "epoch": 3.04, "eval_chemistry_entropy": 0.835658191204071, "eval_chemistry_loss": 0.8722280859947205, "eval_chemistry_mean_token_accuracy": 0.7677701048851013, "eval_chemistry_num_tokens": 37136901.0, "eval_chemistry_runtime": 60.1707, "eval_chemistry_samples_per_second": 8.31, "eval_chemistry_steps_per_second": 2.077, "step": 1900 }, { "epoch": 3.04, "eval_math_entropy": 0.7388259909152984, "eval_math_loss": 0.9761671423912048, "eval_math_mean_token_accuracy": 0.7617930121421814, "eval_math_num_tokens": 37136901.0, "eval_math_runtime": 61.9396, "eval_math_samples_per_second": 8.072, "eval_math_steps_per_second": 2.018, "step": 1900 }, { "epoch": 3.04, "eval_physics_entropy": 0.8157856950759887, "eval_physics_loss": 0.8783439993858337, "eval_physics_mean_token_accuracy": 0.7741703715324402, "eval_physics_num_tokens": 37136901.0, "eval_physics_runtime": 70.3241, "eval_physics_samples_per_second": 7.11, "eval_physics_steps_per_second": 1.777, "step": 1900 }, { "entropy": 1.1322670388966798, "epoch": 3.056, "grad_norm": 18.875, "learning_rate": 8.081481481481483e-06, "loss": 18.1169, "mean_token_accuracy": 0.7071005918085576, "num_tokens": 37331588.0, "step": 1910 }, { "entropy": 1.1061217069625855, "epoch": 3.072, "grad_norm": 19.875, "learning_rate": 8.007407407407408e-06, "loss": 17.7951, "mean_token_accuracy": 0.71182203553617, "num_tokens": 37521211.0, "step": 1920 }, { "entropy": 1.1269909385591745, "epoch": 3.088, "grad_norm": 19.125, "learning_rate": 7.933333333333334e-06, "loss": 18.2156, "mean_token_accuracy": 0.7076956331729889, "num_tokens": 37713328.0, "step": 1930 }, { "entropy": 1.1100518554449081, "epoch": 3.104, "grad_norm": 18.625, "learning_rate": 7.859259259259259e-06, "loss": 17.7995, "mean_token_accuracy": 0.7128304049372673, "num_tokens": 37913470.0, "step": 1940 }, { "entropy": 1.1166743770241738, "epoch": 3.12, "grad_norm": 18.5, "learning_rate": 7.785185185185185e-06, "loss": 18.061, "mean_token_accuracy": 0.7102392159402371, "num_tokens": 38106718.0, "step": 1950 }, { "entropy": 1.1333389516919852, "epoch": 3.136, "grad_norm": 20.625, "learning_rate": 7.711111111111112e-06, "loss": 18.1058, "mean_token_accuracy": 0.7095236502587795, "num_tokens": 38295701.0, "step": 1960 }, { "entropy": 1.1190204188227653, "epoch": 3.152, "grad_norm": 19.25, "learning_rate": 7.637037037037037e-06, "loss": 17.9389, "mean_token_accuracy": 0.7105425789952278, "num_tokens": 38493951.0, "step": 1970 }, { "entropy": 1.146292532607913, "epoch": 3.168, "grad_norm": 20.125, "learning_rate": 7.562962962962963e-06, "loss": 18.4401, "mean_token_accuracy": 0.7044953163713217, "num_tokens": 38688390.0, "step": 1980 }, { "entropy": 1.1336342521011828, "epoch": 3.184, "grad_norm": 17.875, "learning_rate": 7.48888888888889e-06, "loss": 18.2461, "mean_token_accuracy": 0.7075372830033302, "num_tokens": 38880157.0, "step": 1990 }, { "entropy": 1.110951554775238, "epoch": 3.2, "grad_norm": 18.5, "learning_rate": 7.4148148148148155e-06, "loss": 18.0452, "mean_token_accuracy": 0.7110834132879973, "num_tokens": 39081782.0, "step": 2000 }, { "epoch": 3.2, "eval_biology_entropy": 1.0987086420059204, "eval_biology_loss": 1.1604441404342651, "eval_biology_mean_token_accuracy": 0.7043456192016602, "eval_biology_num_tokens": 39081782.0, "eval_biology_runtime": 48.5741, "eval_biology_samples_per_second": 10.294, "eval_biology_steps_per_second": 2.573, "step": 2000 }, { "epoch": 3.2, "eval_chemistry_entropy": 0.8334956364631653, "eval_chemistry_loss": 0.8719582557678223, "eval_chemistry_mean_token_accuracy": 0.7679322199821472, "eval_chemistry_num_tokens": 39081782.0, "eval_chemistry_runtime": 60.6063, "eval_chemistry_samples_per_second": 8.25, "eval_chemistry_steps_per_second": 2.062, "step": 2000 }, { "epoch": 3.2, "eval_math_entropy": 0.7380490090847015, "eval_math_loss": 0.9763364195823669, "eval_math_mean_token_accuracy": 0.7618815884590149, "eval_math_num_tokens": 39081782.0, "eval_math_runtime": 62.6076, "eval_math_samples_per_second": 7.986, "eval_math_steps_per_second": 1.997, "step": 2000 }, { "epoch": 3.2, "eval_physics_entropy": 0.814041579246521, "eval_physics_loss": 0.8782714605331421, "eval_physics_mean_token_accuracy": 0.7738414001464844, "eval_physics_num_tokens": 39081782.0, "eval_physics_runtime": 71.4094, "eval_physics_samples_per_second": 7.002, "eval_physics_steps_per_second": 1.75, "step": 2000 }, { "entropy": 1.1354887392371893, "epoch": 3.216, "grad_norm": 21.125, "learning_rate": 7.340740740740742e-06, "loss": 18.2102, "mean_token_accuracy": 0.708154209703207, "num_tokens": 39275055.0, "step": 2010 }, { "entropy": 1.12113347761333, "epoch": 3.232, "grad_norm": 20.5, "learning_rate": 7.266666666666668e-06, "loss": 18.05, "mean_token_accuracy": 0.7111630469560624, "num_tokens": 39474717.0, "step": 2020 }, { "entropy": 1.1566194161772727, "epoch": 3.248, "grad_norm": 22.5, "learning_rate": 7.192592592592593e-06, "loss": 18.5836, "mean_token_accuracy": 0.7029663059860468, "num_tokens": 39660543.0, "step": 2030 }, { "entropy": 1.1064893435686827, "epoch": 3.2640000000000002, "grad_norm": 19.875, "learning_rate": 7.118518518518519e-06, "loss": 17.9265, "mean_token_accuracy": 0.7116453271359205, "num_tokens": 39866885.0, "step": 2040 }, { "entropy": 1.117007714137435, "epoch": 3.2800000000000002, "grad_norm": 20.125, "learning_rate": 7.044444444444445e-06, "loss": 17.9832, "mean_token_accuracy": 0.7106686752289534, "num_tokens": 40060390.0, "step": 2050 }, { "entropy": 1.133465865254402, "epoch": 3.296, "grad_norm": 20.625, "learning_rate": 6.97037037037037e-06, "loss": 18.1734, "mean_token_accuracy": 0.7074764121323824, "num_tokens": 40253053.0, "step": 2060 }, { "entropy": 1.1380175232887269, "epoch": 3.312, "grad_norm": 20.25, "learning_rate": 6.896296296296297e-06, "loss": 18.404, "mean_token_accuracy": 0.7049634169787169, "num_tokens": 40444210.0, "step": 2070 }, { "entropy": 1.1117772050201893, "epoch": 3.328, "grad_norm": 19.25, "learning_rate": 6.8222222222222225e-06, "loss": 17.8341, "mean_token_accuracy": 0.7136213395744562, "num_tokens": 40644156.0, "step": 2080 }, { "entropy": 1.120354413986206, "epoch": 3.344, "grad_norm": 20.375, "learning_rate": 6.748148148148149e-06, "loss": 18.1816, "mean_token_accuracy": 0.7078599747270345, "num_tokens": 40837974.0, "step": 2090 }, { "entropy": 1.1193263031542302, "epoch": 3.36, "grad_norm": 18.375, "learning_rate": 6.674074074074075e-06, "loss": 17.9094, "mean_token_accuracy": 0.7105781909078359, "num_tokens": 41037512.0, "step": 2100 }, { "epoch": 3.36, "eval_biology_entropy": 1.0968467388153076, "eval_biology_loss": 1.1601123809814453, "eval_biology_mean_token_accuracy": 0.7041891474723816, "eval_biology_num_tokens": 41037512.0, "eval_biology_runtime": 49.3769, "eval_biology_samples_per_second": 10.126, "eval_biology_steps_per_second": 2.532, "step": 2100 }, { "epoch": 3.36, "eval_chemistry_entropy": 0.8338381066322327, "eval_chemistry_loss": 0.8716986775398254, "eval_chemistry_mean_token_accuracy": 0.7677229156494141, "eval_chemistry_num_tokens": 41037512.0, "eval_chemistry_runtime": 61.2003, "eval_chemistry_samples_per_second": 8.17, "eval_chemistry_steps_per_second": 2.042, "step": 2100 }, { "epoch": 3.36, "eval_math_entropy": 0.739958809375763, "eval_math_loss": 0.976417601108551, "eval_math_mean_token_accuracy": 0.7617873783111573, "eval_math_num_tokens": 41037512.0, "eval_math_runtime": 62.7508, "eval_math_samples_per_second": 7.968, "eval_math_steps_per_second": 1.992, "step": 2100 }, { "epoch": 3.36, "eval_physics_entropy": 0.8157489862442017, "eval_physics_loss": 0.8782733082771301, "eval_physics_mean_token_accuracy": 0.7740161881446839, "eval_physics_num_tokens": 41037512.0, "eval_physics_runtime": 71.4573, "eval_physics_samples_per_second": 6.997, "eval_physics_steps_per_second": 1.749, "step": 2100 }, { "entropy": 1.1283354975283146, "epoch": 3.376, "grad_norm": 18.375, "learning_rate": 6.600000000000001e-06, "loss": 18.2687, "mean_token_accuracy": 0.7075107876211405, "num_tokens": 41234279.0, "step": 2110 }, { "entropy": 1.164820646867156, "epoch": 3.392, "grad_norm": 20.125, "learning_rate": 6.525925925925927e-06, "loss": 18.6247, "mean_token_accuracy": 0.7037689939141274, "num_tokens": 41427328.0, "step": 2120 }, { "entropy": 1.1609703712165356, "epoch": 3.408, "grad_norm": 18.5, "learning_rate": 6.4518518518518525e-06, "loss": 18.6601, "mean_token_accuracy": 0.7022816762328148, "num_tokens": 41613930.0, "step": 2130 }, { "entropy": 1.118883929029107, "epoch": 3.424, "grad_norm": 17.375, "learning_rate": 6.377777777777778e-06, "loss": 18.1224, "mean_token_accuracy": 0.7102984309196472, "num_tokens": 41809259.0, "step": 2140 }, { "entropy": 1.1378046832978725, "epoch": 3.44, "grad_norm": 20.25, "learning_rate": 6.303703703703704e-06, "loss": 18.2796, "mean_token_accuracy": 0.7064081773161888, "num_tokens": 42008278.0, "step": 2150 }, { "entropy": 1.1136955060064793, "epoch": 3.456, "grad_norm": 18.625, "learning_rate": 6.2296296296296295e-06, "loss": 17.9429, "mean_token_accuracy": 0.7102738797664643, "num_tokens": 42210283.0, "step": 2160 }, { "entropy": 1.1351352456957102, "epoch": 3.472, "grad_norm": 19.5, "learning_rate": 6.155555555555556e-06, "loss": 18.2105, "mean_token_accuracy": 0.7089967802166939, "num_tokens": 42405493.0, "step": 2170 }, { "entropy": 1.0994913596659899, "epoch": 3.488, "grad_norm": 18.75, "learning_rate": 6.081481481481482e-06, "loss": 17.6731, "mean_token_accuracy": 0.7132742658257485, "num_tokens": 42601837.0, "step": 2180 }, { "entropy": 1.1243377164006234, "epoch": 3.504, "grad_norm": 19.5, "learning_rate": 6.007407407407407e-06, "loss": 18.1375, "mean_token_accuracy": 0.7097833503037692, "num_tokens": 42798533.0, "step": 2190 }, { "entropy": 1.1014703746885062, "epoch": 3.52, "grad_norm": 20.75, "learning_rate": 5.933333333333335e-06, "loss": 17.8044, "mean_token_accuracy": 0.712919196113944, "num_tokens": 43001779.0, "step": 2200 }, { "epoch": 3.52, "eval_biology_entropy": 1.0952841796875, "eval_biology_loss": 1.1598260402679443, "eval_biology_mean_token_accuracy": 0.7040795273780823, "eval_biology_num_tokens": 43001779.0, "eval_biology_runtime": 48.6822, "eval_biology_samples_per_second": 10.271, "eval_biology_steps_per_second": 2.568, "step": 2200 }, { "epoch": 3.52, "eval_chemistry_entropy": 0.8339877910614014, "eval_chemistry_loss": 0.8721098899841309, "eval_chemistry_mean_token_accuracy": 0.7680032277107238, "eval_chemistry_num_tokens": 43001779.0, "eval_chemistry_runtime": 60.3472, "eval_chemistry_samples_per_second": 8.285, "eval_chemistry_steps_per_second": 2.071, "step": 2200 }, { "epoch": 3.52, "eval_math_entropy": 0.7390543642044067, "eval_math_loss": 0.9761691093444824, "eval_math_mean_token_accuracy": 0.7619256863594055, "eval_math_num_tokens": 43001779.0, "eval_math_runtime": 61.856, "eval_math_samples_per_second": 8.083, "eval_math_steps_per_second": 2.021, "step": 2200 }, { "epoch": 3.52, "eval_physics_entropy": 0.8150221276283264, "eval_physics_loss": 0.8782007098197937, "eval_physics_mean_token_accuracy": 0.773835651397705, "eval_physics_num_tokens": 43001779.0, "eval_physics_runtime": 70.5185, "eval_physics_samples_per_second": 7.09, "eval_physics_steps_per_second": 1.773, "step": 2200 }, { "entropy": 1.131494064256549, "epoch": 3.536, "grad_norm": 18.75, "learning_rate": 5.85925925925926e-06, "loss": 18.2698, "mean_token_accuracy": 0.7066725388169288, "num_tokens": 43192229.0, "step": 2210 }, { "entropy": 1.1356515496969224, "epoch": 3.552, "grad_norm": 19.75, "learning_rate": 5.785185185185186e-06, "loss": 18.2761, "mean_token_accuracy": 0.7078566204756498, "num_tokens": 43386416.0, "step": 2220 }, { "entropy": 1.1160131219774485, "epoch": 3.568, "grad_norm": 19.375, "learning_rate": 5.711111111111112e-06, "loss": 17.9301, "mean_token_accuracy": 0.711183400452137, "num_tokens": 43586122.0, "step": 2230 }, { "entropy": 1.1139442063868046, "epoch": 3.584, "grad_norm": 19.125, "learning_rate": 5.637037037037037e-06, "loss": 17.9737, "mean_token_accuracy": 0.7113449327647686, "num_tokens": 43783769.0, "step": 2240 }, { "entropy": 1.1195767130702734, "epoch": 3.6, "grad_norm": 21.125, "learning_rate": 5.562962962962963e-06, "loss": 17.9187, "mean_token_accuracy": 0.7108651768416167, "num_tokens": 43973235.0, "step": 2250 }, { "entropy": 1.0934316322207451, "epoch": 3.616, "grad_norm": 19.125, "learning_rate": 5.4888888888888895e-06, "loss": 17.6835, "mean_token_accuracy": 0.713237265124917, "num_tokens": 44176594.0, "step": 2260 }, { "entropy": 1.1332040429115295, "epoch": 3.632, "grad_norm": 21.375, "learning_rate": 5.414814814814815e-06, "loss": 18.1103, "mean_token_accuracy": 0.7089979019016027, "num_tokens": 44377465.0, "step": 2270 }, { "entropy": 1.1584312468767166, "epoch": 3.648, "grad_norm": 19.875, "learning_rate": 5.340740740740741e-06, "loss": 18.7558, "mean_token_accuracy": 0.7008867420256137, "num_tokens": 44569130.0, "step": 2280 }, { "entropy": 1.1106038503348827, "epoch": 3.664, "grad_norm": 18.875, "learning_rate": 5.2666666666666665e-06, "loss": 17.7824, "mean_token_accuracy": 0.7132147330790758, "num_tokens": 44772500.0, "step": 2290 }, { "entropy": 1.115388607978821, "epoch": 3.68, "grad_norm": 19.0, "learning_rate": 5.192592592592594e-06, "loss": 18.0662, "mean_token_accuracy": 0.7111902508884669, "num_tokens": 44972326.0, "step": 2300 }, { "epoch": 3.68, "eval_biology_entropy": 1.0993176627159118, "eval_biology_loss": 1.1593457460403442, "eval_biology_mean_token_accuracy": 0.7043005504608154, "eval_biology_num_tokens": 44972326.0, "eval_biology_runtime": 48.6262, "eval_biology_samples_per_second": 10.283, "eval_biology_steps_per_second": 2.571, "step": 2300 }, { "epoch": 3.68, "eval_chemistry_entropy": 0.8339422068595886, "eval_chemistry_loss": 0.8719797134399414, "eval_chemistry_mean_token_accuracy": 0.7679404969215393, "eval_chemistry_num_tokens": 44972326.0, "eval_chemistry_runtime": 60.2942, "eval_chemistry_samples_per_second": 8.293, "eval_chemistry_steps_per_second": 2.073, "step": 2300 }, { "epoch": 3.68, "eval_math_entropy": 0.7411710631847381, "eval_math_loss": 0.9758427143096924, "eval_math_mean_token_accuracy": 0.7618683638572693, "eval_math_num_tokens": 44972326.0, "eval_math_runtime": 61.768, "eval_math_samples_per_second": 8.095, "eval_math_steps_per_second": 2.024, "step": 2300 }, { "epoch": 3.68, "eval_physics_entropy": 0.8151098203659057, "eval_physics_loss": 0.8783143162727356, "eval_physics_mean_token_accuracy": 0.7738341612815857, "eval_physics_num_tokens": 44972326.0, "eval_physics_runtime": 70.3991, "eval_physics_samples_per_second": 7.102, "eval_physics_steps_per_second": 1.776, "step": 2300 }, { "entropy": 1.1294588424265384, "epoch": 3.6959999999999997, "grad_norm": 20.125, "learning_rate": 5.1185185185185195e-06, "loss": 18.2591, "mean_token_accuracy": 0.7078940033912658, "num_tokens": 45171386.0, "step": 2310 }, { "entropy": 1.1489933133125305, "epoch": 3.7119999999999997, "grad_norm": 18.625, "learning_rate": 5.044444444444445e-06, "loss": 18.4725, "mean_token_accuracy": 0.705705888569355, "num_tokens": 45361017.0, "step": 2320 }, { "entropy": 1.1155077636241912, "epoch": 3.7279999999999998, "grad_norm": 19.375, "learning_rate": 4.970370370370371e-06, "loss": 17.9265, "mean_token_accuracy": 0.7115162432193756, "num_tokens": 45561773.0, "step": 2330 }, { "entropy": 1.1495999217033386, "epoch": 3.7439999999999998, "grad_norm": 19.0, "learning_rate": 4.8962962962962965e-06, "loss": 18.516, "mean_token_accuracy": 0.7043994337320327, "num_tokens": 45753014.0, "step": 2340 }, { "entropy": 1.13122322447598, "epoch": 3.76, "grad_norm": 20.75, "learning_rate": 4.822222222222222e-06, "loss": 18.2202, "mean_token_accuracy": 0.7084156259894371, "num_tokens": 45947397.0, "step": 2350 }, { "entropy": 1.122436200082302, "epoch": 3.776, "grad_norm": 18.125, "learning_rate": 4.748148148148149e-06, "loss": 18.0499, "mean_token_accuracy": 0.7104008805006743, "num_tokens": 46143253.0, "step": 2360 }, { "entropy": 1.1514490522444247, "epoch": 3.792, "grad_norm": 18.625, "learning_rate": 4.674074074074074e-06, "loss": 18.549, "mean_token_accuracy": 0.7031971588730812, "num_tokens": 46337744.0, "step": 2370 }, { "entropy": 1.119861488416791, "epoch": 3.808, "grad_norm": 19.875, "learning_rate": 4.600000000000001e-06, "loss": 18.087, "mean_token_accuracy": 0.7108966052532196, "num_tokens": 46528272.0, "step": 2380 }, { "entropy": 1.135647664964199, "epoch": 3.824, "grad_norm": 19.125, "learning_rate": 4.5259259259259265e-06, "loss": 18.3009, "mean_token_accuracy": 0.7064112696796656, "num_tokens": 46719274.0, "step": 2390 }, { "entropy": 1.114129998907447, "epoch": 3.84, "grad_norm": 19.125, "learning_rate": 4.451851851851852e-06, "loss": 17.9589, "mean_token_accuracy": 0.7134663391858339, "num_tokens": 46916027.0, "step": 2400 }, { "epoch": 3.84, "eval_biology_entropy": 1.1034115777015685, "eval_biology_loss": 1.1590811014175415, "eval_biology_mean_token_accuracy": 0.7044289937019348, "eval_biology_num_tokens": 46916027.0, "eval_biology_runtime": 48.5938, "eval_biology_samples_per_second": 10.289, "eval_biology_steps_per_second": 2.572, "step": 2400 }, { "epoch": 3.84, "eval_chemistry_entropy": 0.8372816433906555, "eval_chemistry_loss": 0.8715822696685791, "eval_chemistry_mean_token_accuracy": 0.7678695392608642, "eval_chemistry_num_tokens": 46916027.0, "eval_chemistry_runtime": 60.2605, "eval_chemistry_samples_per_second": 8.297, "eval_chemistry_steps_per_second": 2.074, "step": 2400 }, { "epoch": 3.84, "eval_math_entropy": 0.7421645526885986, "eval_math_loss": 0.9759746193885803, "eval_math_mean_token_accuracy": 0.7618973565101623, "eval_math_num_tokens": 46916027.0, "eval_math_runtime": 61.9787, "eval_math_samples_per_second": 8.067, "eval_math_steps_per_second": 2.017, "step": 2400 }, { "epoch": 3.84, "eval_physics_entropy": 0.8176065077781677, "eval_physics_loss": 0.8779301643371582, "eval_physics_mean_token_accuracy": 0.774030951499939, "eval_physics_num_tokens": 46916027.0, "eval_physics_runtime": 70.3314, "eval_physics_samples_per_second": 7.109, "eval_physics_steps_per_second": 1.777, "step": 2400 }, { "entropy": 1.1277068488299846, "epoch": 3.856, "grad_norm": 20.125, "learning_rate": 4.377777777777778e-06, "loss": 18.0574, "mean_token_accuracy": 0.7107046756893396, "num_tokens": 47114880.0, "step": 2410 }, { "entropy": 1.144086579605937, "epoch": 3.872, "grad_norm": 19.125, "learning_rate": 4.3037037037037035e-06, "loss": 18.3726, "mean_token_accuracy": 0.7060531999915838, "num_tokens": 47311977.0, "step": 2420 }, { "entropy": 1.119268636032939, "epoch": 3.888, "grad_norm": 19.875, "learning_rate": 4.22962962962963e-06, "loss": 18.0165, "mean_token_accuracy": 0.7108395133167505, "num_tokens": 47509800.0, "step": 2430 }, { "entropy": 1.124232827872038, "epoch": 3.904, "grad_norm": 19.25, "learning_rate": 4.155555555555556e-06, "loss": 18.1055, "mean_token_accuracy": 0.7098960153758526, "num_tokens": 47704018.0, "step": 2440 }, { "entropy": 1.1140838485211133, "epoch": 3.92, "grad_norm": 18.875, "learning_rate": 4.081481481481482e-06, "loss": 17.8969, "mean_token_accuracy": 0.7108140826225281, "num_tokens": 47903745.0, "step": 2450 }, { "entropy": 1.1396939534693957, "epoch": 3.936, "grad_norm": 20.25, "learning_rate": 4.007407407407408e-06, "loss": 18.3584, "mean_token_accuracy": 0.7066603232175112, "num_tokens": 48102312.0, "step": 2460 }, { "entropy": 1.1124013729393483, "epoch": 3.952, "grad_norm": 18.75, "learning_rate": 3.9333333333333335e-06, "loss": 17.9989, "mean_token_accuracy": 0.7095886286348104, "num_tokens": 48299684.0, "step": 2470 }, { "entropy": 1.144656204432249, "epoch": 3.968, "grad_norm": 21.5, "learning_rate": 3.85925925925926e-06, "loss": 18.405, "mean_token_accuracy": 0.7056023754179478, "num_tokens": 48496831.0, "step": 2480 }, { "entropy": 1.126984755322337, "epoch": 3.984, "grad_norm": 18.625, "learning_rate": 3.7851851851851857e-06, "loss": 18.1717, "mean_token_accuracy": 0.7091844279319048, "num_tokens": 48694070.0, "step": 2490 }, { "entropy": 1.1470630817115306, "epoch": 4.0, "grad_norm": 20.5, "learning_rate": 3.7111111111111113e-06, "loss": 18.6232, "mean_token_accuracy": 0.7034741207957268, "num_tokens": 48885164.0, "step": 2500 }, { "epoch": 4.0, "eval_biology_entropy": 1.100130069732666, "eval_biology_loss": 1.1589951515197754, "eval_biology_mean_token_accuracy": 0.7041674499511719, "eval_biology_num_tokens": 48885164.0, "eval_biology_runtime": 48.5982, "eval_biology_samples_per_second": 10.288, "eval_biology_steps_per_second": 2.572, "step": 2500 }, { "epoch": 4.0, "eval_chemistry_entropy": 0.8349951648712158, "eval_chemistry_loss": 0.8717024326324463, "eval_chemistry_mean_token_accuracy": 0.7677888073921204, "eval_chemistry_num_tokens": 48885164.0, "eval_chemistry_runtime": 60.793, "eval_chemistry_samples_per_second": 8.225, "eval_chemistry_steps_per_second": 2.056, "step": 2500 }, { "epoch": 4.0, "eval_math_entropy": 0.7401801931858063, "eval_math_loss": 0.9758692383766174, "eval_math_mean_token_accuracy": 0.7619253330230713, "eval_math_num_tokens": 48885164.0, "eval_math_runtime": 62.6315, "eval_math_samples_per_second": 7.983, "eval_math_steps_per_second": 1.996, "step": 2500 }, { "epoch": 4.0, "eval_physics_entropy": 0.815434151172638, "eval_physics_loss": 0.8780592679977417, "eval_physics_mean_token_accuracy": 0.7740482540130615, "eval_physics_num_tokens": 48885164.0, "eval_physics_runtime": 70.9878, "eval_physics_samples_per_second": 7.043, "eval_physics_steps_per_second": 1.761, "step": 2500 }, { "entropy": 1.1160879634320735, "epoch": 4.016, "grad_norm": 18.875, "learning_rate": 3.6370370370370374e-06, "loss": 17.8992, "mean_token_accuracy": 0.7137045960873365, "num_tokens": 49083717.0, "step": 2510 }, { "entropy": 1.1370587714016438, "epoch": 4.032, "grad_norm": 18.875, "learning_rate": 3.562962962962963e-06, "loss": 18.2029, "mean_token_accuracy": 0.7086552571505308, "num_tokens": 49282689.0, "step": 2520 }, { "entropy": 1.1187588647007942, "epoch": 4.048, "grad_norm": 21.75, "learning_rate": 3.4888888888888896e-06, "loss": 17.936, "mean_token_accuracy": 0.7125615429133176, "num_tokens": 49480172.0, "step": 2530 }, { "entropy": 1.1416249305009842, "epoch": 4.064, "grad_norm": 21.25, "learning_rate": 3.4148148148148153e-06, "loss": 18.4587, "mean_token_accuracy": 0.7071565296500921, "num_tokens": 49670167.0, "step": 2540 }, { "entropy": 1.1168063767254353, "epoch": 4.08, "grad_norm": 19.625, "learning_rate": 3.340740740740741e-06, "loss": 17.962, "mean_token_accuracy": 0.7106139473617077, "num_tokens": 49866326.0, "step": 2550 }, { "entropy": 1.1387323811650276, "epoch": 4.096, "grad_norm": 17.625, "learning_rate": 3.266666666666667e-06, "loss": 18.2881, "mean_token_accuracy": 0.7052336398512125, "num_tokens": 50056950.0, "step": 2560 }, { "entropy": 1.1328555908054114, "epoch": 4.112, "grad_norm": 19.75, "learning_rate": 3.1925925925925927e-06, "loss": 18.169, "mean_token_accuracy": 0.7100179735571146, "num_tokens": 50251768.0, "step": 2570 }, { "entropy": 1.1460201136767865, "epoch": 4.128, "grad_norm": 20.25, "learning_rate": 3.1185185185185183e-06, "loss": 18.5196, "mean_token_accuracy": 0.7043800208717584, "num_tokens": 50444319.0, "step": 2580 }, { "entropy": 1.1398494437336921, "epoch": 4.144, "grad_norm": 19.75, "learning_rate": 3.044444444444445e-06, "loss": 18.2388, "mean_token_accuracy": 0.7064049437642097, "num_tokens": 50631631.0, "step": 2590 }, { "entropy": 1.141350831091404, "epoch": 4.16, "grad_norm": 19.125, "learning_rate": 2.9703703703703705e-06, "loss": 18.3667, "mean_token_accuracy": 0.7068084985017776, "num_tokens": 50826513.0, "step": 2600 }, { "epoch": 4.16, "eval_biology_entropy": 1.0955388507843018, "eval_biology_loss": 1.1590566635131836, "eval_biology_mean_token_accuracy": 0.7045985751152039, "eval_biology_num_tokens": 50826513.0, "eval_biology_runtime": 48.7104, "eval_biology_samples_per_second": 10.265, "eval_biology_steps_per_second": 2.566, "step": 2600 }, { "epoch": 4.16, "eval_chemistry_entropy": 0.8316172590255737, "eval_chemistry_loss": 0.8717626333236694, "eval_chemistry_mean_token_accuracy": 0.767913815498352, "eval_chemistry_num_tokens": 50826513.0, "eval_chemistry_runtime": 60.7653, "eval_chemistry_samples_per_second": 8.228, "eval_chemistry_steps_per_second": 2.057, "step": 2600 }, { "epoch": 4.16, "eval_math_entropy": 0.7378519124984741, "eval_math_loss": 0.9764444828033447, "eval_math_mean_token_accuracy": 0.7619533925056458, "eval_math_num_tokens": 50826513.0, "eval_math_runtime": 61.9671, "eval_math_samples_per_second": 8.069, "eval_math_steps_per_second": 2.017, "step": 2600 }, { "epoch": 4.16, "eval_physics_entropy": 0.8122129874229431, "eval_physics_loss": 0.878105878829956, "eval_physics_mean_token_accuracy": 0.7741021389961242, "eval_physics_num_tokens": 50826513.0, "eval_physics_runtime": 70.6183, "eval_physics_samples_per_second": 7.08, "eval_physics_steps_per_second": 1.77, "step": 2600 }, { "entropy": 1.1222807440906764, "epoch": 4.176, "grad_norm": 19.125, "learning_rate": 2.8962962962962966e-06, "loss": 18.0717, "mean_token_accuracy": 0.7091497462242842, "num_tokens": 51026177.0, "step": 2610 }, { "entropy": 1.1358438849449157, "epoch": 4.192, "grad_norm": 20.125, "learning_rate": 2.8222222222222223e-06, "loss": 18.2249, "mean_token_accuracy": 0.7061910640448332, "num_tokens": 51218570.0, "step": 2620 }, { "entropy": 1.108224703371525, "epoch": 4.208, "grad_norm": 18.125, "learning_rate": 2.748148148148148e-06, "loss": 17.8847, "mean_token_accuracy": 0.7117452036589385, "num_tokens": 51417937.0, "step": 2630 }, { "entropy": 1.121630134433508, "epoch": 4.224, "grad_norm": 18.625, "learning_rate": 2.6740740740740744e-06, "loss": 18.071, "mean_token_accuracy": 0.7106337703764438, "num_tokens": 51615267.0, "step": 2640 }, { "entropy": 1.0866072356700898, "epoch": 4.24, "grad_norm": 19.5, "learning_rate": 2.6e-06, "loss": 17.4428, "mean_token_accuracy": 0.716962655633688, "num_tokens": 51815136.0, "step": 2650 }, { "entropy": 1.0989352997392416, "epoch": 4.256, "grad_norm": 18.375, "learning_rate": 2.525925925925926e-06, "loss": 17.5111, "mean_token_accuracy": 0.7172319073230028, "num_tokens": 52018293.0, "step": 2660 }, { "entropy": 1.1121397718787194, "epoch": 4.272, "grad_norm": 18.75, "learning_rate": 2.451851851851852e-06, "loss": 17.8373, "mean_token_accuracy": 0.713592042773962, "num_tokens": 52214165.0, "step": 2670 }, { "entropy": 1.0949925631284714, "epoch": 4.288, "grad_norm": 18.875, "learning_rate": 2.377777777777778e-06, "loss": 17.6726, "mean_token_accuracy": 0.714476003870368, "num_tokens": 52422210.0, "step": 2680 }, { "entropy": 1.1069289091974497, "epoch": 4.304, "grad_norm": 19.125, "learning_rate": 2.303703703703704e-06, "loss": 17.7715, "mean_token_accuracy": 0.7128037516027689, "num_tokens": 52617031.0, "step": 2690 }, { "entropy": 1.1163511652499438, "epoch": 4.32, "grad_norm": 18.375, "learning_rate": 2.22962962962963e-06, "loss": 18.0913, "mean_token_accuracy": 0.7107409708201885, "num_tokens": 52814574.0, "step": 2700 }, { "epoch": 4.32, "eval_biology_entropy": 1.0932833123207093, "eval_biology_loss": 1.1590940952301025, "eval_biology_mean_token_accuracy": 0.7046565957069397, "eval_biology_num_tokens": 52814574.0, "eval_biology_runtime": 49.2743, "eval_biology_samples_per_second": 10.147, "eval_biology_steps_per_second": 2.537, "step": 2700 }, { "epoch": 4.32, "eval_chemistry_entropy": 0.8303835077285766, "eval_chemistry_loss": 0.87211012840271, "eval_chemistry_mean_token_accuracy": 0.7680432667732239, "eval_chemistry_num_tokens": 52814574.0, "eval_chemistry_runtime": 60.8922, "eval_chemistry_samples_per_second": 8.211, "eval_chemistry_steps_per_second": 2.053, "step": 2700 }, { "epoch": 4.32, "eval_math_entropy": 0.7375843653678894, "eval_math_loss": 0.9766404032707214, "eval_math_mean_token_accuracy": 0.7619228887557984, "eval_math_num_tokens": 52814574.0, "eval_math_runtime": 62.4061, "eval_math_samples_per_second": 8.012, "eval_math_steps_per_second": 2.003, "step": 2700 }, { "epoch": 4.32, "eval_physics_entropy": 0.8115231275558472, "eval_physics_loss": 0.8783358335494995, "eval_physics_mean_token_accuracy": 0.7739383320808411, "eval_physics_num_tokens": 52814574.0, "eval_physics_runtime": 71.1956, "eval_physics_samples_per_second": 7.023, "eval_physics_steps_per_second": 1.756, "step": 2700 }, { "entropy": 1.1100974697619677, "epoch": 4.336, "grad_norm": 20.5, "learning_rate": 2.1555555555555558e-06, "loss": 17.9798, "mean_token_accuracy": 0.7109350692480803, "num_tokens": 53013899.0, "step": 2710 }, { "entropy": 1.1317887622863054, "epoch": 4.352, "grad_norm": 18.625, "learning_rate": 2.0814814814814814e-06, "loss": 18.2432, "mean_token_accuracy": 0.7078307528048754, "num_tokens": 53209262.0, "step": 2720 }, { "entropy": 1.12730851508677, "epoch": 4.368, "grad_norm": 19.875, "learning_rate": 2.0074074074074075e-06, "loss": 18.0828, "mean_token_accuracy": 0.7095625881105662, "num_tokens": 53404971.0, "step": 2730 }, { "entropy": 1.134312851727009, "epoch": 4.384, "grad_norm": 19.375, "learning_rate": 1.9333333333333336e-06, "loss": 18.2957, "mean_token_accuracy": 0.7060767561197281, "num_tokens": 53594766.0, "step": 2740 }, { "entropy": 1.1393331296741962, "epoch": 4.4, "grad_norm": 21.0, "learning_rate": 1.8592592592592595e-06, "loss": 18.418, "mean_token_accuracy": 0.7064639564603568, "num_tokens": 53787876.0, "step": 2750 }, { "entropy": 1.1089225117117167, "epoch": 4.416, "grad_norm": 19.375, "learning_rate": 1.7851851851851853e-06, "loss": 17.8242, "mean_token_accuracy": 0.7097496975213289, "num_tokens": 53991437.0, "step": 2760 }, { "entropy": 1.1256129287183285, "epoch": 4.432, "grad_norm": 20.0, "learning_rate": 1.7111111111111112e-06, "loss": 18.0238, "mean_token_accuracy": 0.7105293430387973, "num_tokens": 54191547.0, "step": 2770 }, { "entropy": 1.113245976343751, "epoch": 4.448, "grad_norm": 20.375, "learning_rate": 1.6370370370370373e-06, "loss": 17.7909, "mean_token_accuracy": 0.7116924650967121, "num_tokens": 54385194.0, "step": 2780 }, { "entropy": 1.124070692807436, "epoch": 4.464, "grad_norm": 19.0, "learning_rate": 1.562962962962963e-06, "loss": 18.0812, "mean_token_accuracy": 0.7098761692643165, "num_tokens": 54577099.0, "step": 2790 }, { "entropy": 1.1426789626479148, "epoch": 4.48, "grad_norm": 20.375, "learning_rate": 1.4888888888888888e-06, "loss": 18.4486, "mean_token_accuracy": 0.705568815022707, "num_tokens": 54769022.0, "step": 2800 }, { "epoch": 4.48, "eval_biology_entropy": 1.0955566415786744, "eval_biology_loss": 1.1589276790618896, "eval_biology_mean_token_accuracy": 0.7042924590110778, "eval_biology_num_tokens": 54769022.0, "eval_biology_runtime": 48.5409, "eval_biology_samples_per_second": 10.301, "eval_biology_steps_per_second": 2.575, "step": 2800 }, { "epoch": 4.48, "eval_chemistry_entropy": 0.831970660686493, "eval_chemistry_loss": 0.871811032295227, "eval_chemistry_mean_token_accuracy": 0.7678974647521972, "eval_chemistry_num_tokens": 54769022.0, "eval_chemistry_runtime": 60.2241, "eval_chemistry_samples_per_second": 8.302, "eval_chemistry_steps_per_second": 2.076, "step": 2800 }, { "epoch": 4.48, "eval_math_entropy": 0.7386114675998687, "eval_math_loss": 0.9762167930603027, "eval_math_mean_token_accuracy": 0.7621296954154968, "eval_math_num_tokens": 54769022.0, "eval_math_runtime": 61.8181, "eval_math_samples_per_second": 8.088, "eval_math_steps_per_second": 2.022, "step": 2800 }, { "epoch": 4.48, "eval_physics_entropy": 0.8128011884689331, "eval_physics_loss": 0.8782095909118652, "eval_physics_mean_token_accuracy": 0.7740145058631897, "eval_physics_num_tokens": 54769022.0, "eval_physics_runtime": 70.385, "eval_physics_samples_per_second": 7.104, "eval_physics_steps_per_second": 1.776, "step": 2800 }, { "entropy": 1.1157229054719209, "epoch": 4.496, "grad_norm": 18.0, "learning_rate": 1.414814814814815e-06, "loss": 17.9071, "mean_token_accuracy": 0.7103788953274488, "num_tokens": 54968338.0, "step": 2810 }, { "entropy": 1.1294743590056897, "epoch": 4.5120000000000005, "grad_norm": 19.875, "learning_rate": 1.3407407407407408e-06, "loss": 18.2126, "mean_token_accuracy": 0.7076791927218438, "num_tokens": 55167039.0, "step": 2820 }, { "entropy": 1.1057243023067713, "epoch": 4.5280000000000005, "grad_norm": 18.625, "learning_rate": 1.2666666666666669e-06, "loss": 17.7167, "mean_token_accuracy": 0.7155276846140624, "num_tokens": 55369598.0, "step": 2830 }, { "entropy": 1.1095443222671748, "epoch": 4.5440000000000005, "grad_norm": 20.125, "learning_rate": 1.1925925925925928e-06, "loss": 17.8819, "mean_token_accuracy": 0.7097273614257574, "num_tokens": 55563629.0, "step": 2840 }, { "entropy": 1.1140538066625596, "epoch": 4.5600000000000005, "grad_norm": 20.125, "learning_rate": 1.1185185185185186e-06, "loss": 17.9501, "mean_token_accuracy": 0.7119776252657175, "num_tokens": 55759240.0, "step": 2850 }, { "entropy": 1.1115111865103244, "epoch": 4.576, "grad_norm": 20.375, "learning_rate": 1.0444444444444445e-06, "loss": 17.9192, "mean_token_accuracy": 0.711128744110465, "num_tokens": 55953631.0, "step": 2860 }, { "entropy": 1.1233138255774975, "epoch": 4.592, "grad_norm": 21.0, "learning_rate": 9.703703703703704e-07, "loss": 18.1225, "mean_token_accuracy": 0.7096393074840307, "num_tokens": 56150202.0, "step": 2870 }, { "entropy": 1.1561065390706062, "epoch": 4.608, "grad_norm": 21.25, "learning_rate": 8.962962962962964e-07, "loss": 18.6791, "mean_token_accuracy": 0.7017291557043791, "num_tokens": 56336358.0, "step": 2880 }, { "entropy": 1.140657200664282, "epoch": 4.624, "grad_norm": 18.5, "learning_rate": 8.222222222222223e-07, "loss": 18.4252, "mean_token_accuracy": 0.7071378316730261, "num_tokens": 56532634.0, "step": 2890 }, { "entropy": 1.117747975513339, "epoch": 4.64, "grad_norm": 20.625, "learning_rate": 7.481481481481482e-07, "loss": 18.0552, "mean_token_accuracy": 0.7107968173921109, "num_tokens": 56721597.0, "step": 2900 }, { "epoch": 4.64, "eval_biology_entropy": 1.095109058856964, "eval_biology_loss": 1.1590646505355835, "eval_biology_mean_token_accuracy": 0.7044678544998169, "eval_biology_num_tokens": 56721597.0, "eval_biology_runtime": 48.5811, "eval_biology_samples_per_second": 10.292, "eval_biology_steps_per_second": 2.573, "step": 2900 }, { "epoch": 4.64, "eval_chemistry_entropy": 0.8311039981842041, "eval_chemistry_loss": 0.8717870712280273, "eval_chemistry_mean_token_accuracy": 0.7681379013061523, "eval_chemistry_num_tokens": 56721597.0, "eval_chemistry_runtime": 60.3714, "eval_chemistry_samples_per_second": 8.282, "eval_chemistry_steps_per_second": 2.071, "step": 2900 }, { "epoch": 4.64, "eval_math_entropy": 0.7375130612850189, "eval_math_loss": 0.9765008687973022, "eval_math_mean_token_accuracy": 0.7620035552978516, "eval_math_num_tokens": 56721597.0, "eval_math_runtime": 61.7727, "eval_math_samples_per_second": 8.094, "eval_math_steps_per_second": 2.024, "step": 2900 }, { "epoch": 4.64, "eval_physics_entropy": 0.8118382081985474, "eval_physics_loss": 0.8782839775085449, "eval_physics_mean_token_accuracy": 0.7739185023307801, "eval_physics_num_tokens": 56721597.0, "eval_physics_runtime": 70.3938, "eval_physics_samples_per_second": 7.103, "eval_physics_steps_per_second": 1.776, "step": 2900 }, { "entropy": 1.1298798985779286, "epoch": 4.656, "grad_norm": 19.5, "learning_rate": 6.740740740740741e-07, "loss": 18.1305, "mean_token_accuracy": 0.708253213763237, "num_tokens": 56915763.0, "step": 2910 }, { "entropy": 1.1272273954004048, "epoch": 4.672, "grad_norm": 17.625, "learning_rate": 6.000000000000001e-07, "loss": 18.2545, "mean_token_accuracy": 0.7086662597954273, "num_tokens": 57106674.0, "step": 2920 }, { "entropy": 1.1215174172073603, "epoch": 4.688, "grad_norm": 20.0, "learning_rate": 5.25925925925926e-07, "loss": 18.2098, "mean_token_accuracy": 0.7087991438806057, "num_tokens": 57308644.0, "step": 2930 }, { "entropy": 1.1302482053637504, "epoch": 4.704, "grad_norm": 19.875, "learning_rate": 4.518518518518519e-07, "loss": 18.1629, "mean_token_accuracy": 0.7095564223825932, "num_tokens": 57503295.0, "step": 2940 }, { "entropy": 1.1331190101802349, "epoch": 4.72, "grad_norm": 19.5, "learning_rate": 3.777777777777778e-07, "loss": 18.3343, "mean_token_accuracy": 0.7055216923356056, "num_tokens": 57694635.0, "step": 2950 }, { "entropy": 1.1156794786453248, "epoch": 4.736, "grad_norm": 18.125, "learning_rate": 3.0370370370370374e-07, "loss": 17.967, "mean_token_accuracy": 0.7111107666045428, "num_tokens": 57898052.0, "step": 2960 }, { "entropy": 1.122471484914422, "epoch": 4.752, "grad_norm": 18.75, "learning_rate": 2.2962962962962964e-07, "loss": 18.0766, "mean_token_accuracy": 0.7092397723346948, "num_tokens": 58091828.0, "step": 2970 }, { "entropy": 1.0995518658310175, "epoch": 4.768, "grad_norm": 18.375, "learning_rate": 1.5555555555555556e-07, "loss": 17.5004, "mean_token_accuracy": 0.7156328197568655, "num_tokens": 58290608.0, "step": 2980 }, { "entropy": 1.1175754133611917, "epoch": 4.784, "grad_norm": 21.25, "learning_rate": 8.148148148148148e-08, "loss": 18.0567, "mean_token_accuracy": 0.7084891442209482, "num_tokens": 58491415.0, "step": 2990 }, { "entropy": 1.134732787311077, "epoch": 4.8, "grad_norm": 21.0, "learning_rate": 7.407407407407408e-09, "loss": 18.1648, "mean_token_accuracy": 0.7078682146966457, "num_tokens": 58681639.0, "step": 3000 }, { "epoch": 4.8, "eval_biology_entropy": 1.095814537525177, "eval_biology_loss": 1.1591075658798218, "eval_biology_mean_token_accuracy": 0.7045488510131835, "eval_biology_num_tokens": 58681639.0, "eval_biology_runtime": 49.2299, "eval_biology_samples_per_second": 10.156, "eval_biology_steps_per_second": 2.539, "step": 3000 }, { "epoch": 4.8, "eval_chemistry_entropy": 0.8320227708816529, "eval_chemistry_loss": 0.8718997836112976, "eval_chemistry_mean_token_accuracy": 0.7678845291137695, "eval_chemistry_num_tokens": 58681639.0, "eval_chemistry_runtime": 60.1789, "eval_chemistry_samples_per_second": 8.309, "eval_chemistry_steps_per_second": 2.077, "step": 3000 }, { "epoch": 4.8, "eval_math_entropy": 0.7383916871547699, "eval_math_loss": 0.9763190746307373, "eval_math_mean_token_accuracy": 0.7621074786186218, "eval_math_num_tokens": 58681639.0, "eval_math_runtime": 62.0839, "eval_math_samples_per_second": 8.054, "eval_math_steps_per_second": 2.013, "step": 3000 }, { "epoch": 4.8, "eval_physics_entropy": 0.8127228660583496, "eval_physics_loss": 0.878301739692688, "eval_physics_mean_token_accuracy": 0.7740593042373657, "eval_physics_num_tokens": 58681639.0, "eval_physics_runtime": 70.7249, "eval_physics_samples_per_second": 7.07, "eval_physics_steps_per_second": 1.767, "step": 3000 } ], "logging_steps": 10, "max_steps": 3000, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.622995707307729e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }