{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.4, "eval_steps": 100, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.142920307815075, "epoch": 0.016, "grad_norm": 290.0, "learning_rate": 6.000000000000001e-07, "loss": 42.6658, "mean_token_accuracy": 0.5620782226324081, "num_tokens": 195524.0, "step": 10 }, { "entropy": 1.148210159689188, "epoch": 0.032, "grad_norm": 239.0, "learning_rate": 1.2666666666666669e-06, "loss": 41.9984, "mean_token_accuracy": 0.5613080382347106, "num_tokens": 390903.0, "step": 20 }, { "entropy": 1.1933260083198547, "epoch": 0.048, "grad_norm": 249.0, "learning_rate": 1.9333333333333336e-06, "loss": 40.6208, "mean_token_accuracy": 0.5657517908141017, "num_tokens": 589868.0, "step": 30 }, { "entropy": 1.2957281917333603, "epoch": 0.064, "grad_norm": 139.0, "learning_rate": 2.6e-06, "loss": 37.9032, "mean_token_accuracy": 0.5714796105399728, "num_tokens": 791190.0, "step": 40 }, { "entropy": 1.5075685508549213, "epoch": 0.08, "grad_norm": 94.0, "learning_rate": 3.266666666666667e-06, "loss": 35.7561, "mean_token_accuracy": 0.5766569443047047, "num_tokens": 989860.0, "step": 50 }, { "entropy": 1.7984249681234359, "epoch": 0.096, "grad_norm": 50.75, "learning_rate": 3.9333333333333335e-06, "loss": 33.4379, "mean_token_accuracy": 0.5814697606489062, "num_tokens": 1181777.0, "step": 60 }, { "entropy": 1.8387351341545581, "epoch": 0.112, "grad_norm": 43.0, "learning_rate": 4.600000000000001e-06, "loss": 30.4219, "mean_token_accuracy": 0.5971228444948793, "num_tokens": 1385513.0, "step": 70 }, { "entropy": 1.7275233700871468, "epoch": 0.128, "grad_norm": 33.5, "learning_rate": 5.2666666666666665e-06, "loss": 28.4703, "mean_token_accuracy": 0.6095364252105355, "num_tokens": 1582368.0, "step": 80 }, { "entropy": 1.7214979872107505, "epoch": 0.144, "grad_norm": 27.0, "learning_rate": 5.933333333333335e-06, "loss": 26.677, "mean_token_accuracy": 0.6243448719382286, "num_tokens": 1773764.0, "step": 90 }, { "entropy": 1.6311134904623033, "epoch": 0.16, "grad_norm": 22.0, "learning_rate": 6.600000000000001e-06, "loss": 25.7683, "mean_token_accuracy": 0.6301404371857643, "num_tokens": 1970077.0, "step": 100 }, { "epoch": 0.16, "eval_biology_entropy": 1.5580159120559693, "eval_biology_loss": 1.5081593990325928, "eval_biology_mean_token_accuracy": 0.6457349667549134, "eval_biology_num_tokens": 1970077.0, "eval_biology_runtime": 48.7413, "eval_biology_samples_per_second": 10.258, "eval_biology_steps_per_second": 2.565, "step": 100 }, { "epoch": 0.16, "eval_chemistry_entropy": 1.206756212234497, "eval_chemistry_loss": 1.1218774318695068, "eval_chemistry_mean_token_accuracy": 0.7205783066749573, "eval_chemistry_num_tokens": 1970077.0, "eval_chemistry_runtime": 60.3159, "eval_chemistry_samples_per_second": 8.29, "eval_chemistry_steps_per_second": 2.072, "step": 100 }, { "epoch": 0.16, "eval_math_entropy": 0.9672308325767517, "eval_math_loss": 1.159799337387085, "eval_math_mean_token_accuracy": 0.7189845342636109, "eval_math_num_tokens": 1970077.0, "eval_math_runtime": 61.8237, "eval_math_samples_per_second": 8.088, "eval_math_steps_per_second": 2.022, "step": 100 }, { "epoch": 0.16, "eval_physics_entropy": 1.1670387201309205, "eval_physics_loss": 1.1291608810424805, "eval_physics_mean_token_accuracy": 0.7211072521209717, "eval_physics_num_tokens": 1970077.0, "eval_physics_runtime": 70.4586, "eval_physics_samples_per_second": 7.096, "eval_physics_steps_per_second": 1.774, "step": 100 }, { "entropy": 1.5482715763151647, "epoch": 0.176, "grad_norm": 21.125, "learning_rate": 7.266666666666668e-06, "loss": 24.5868, "mean_token_accuracy": 0.6385629490017891, "num_tokens": 2168354.0, "step": 110 }, { "entropy": 1.5266574397683144, "epoch": 0.192, "grad_norm": 22.875, "learning_rate": 7.933333333333334e-06, "loss": 24.2707, "mean_token_accuracy": 0.6432460084557533, "num_tokens": 2365822.0, "step": 120 }, { "entropy": 1.5192069873213767, "epoch": 0.208, "grad_norm": 20.875, "learning_rate": 8.6e-06, "loss": 24.1355, "mean_token_accuracy": 0.6436416517943144, "num_tokens": 2558762.0, "step": 130 }, { "entropy": 1.4698147468268872, "epoch": 0.224, "grad_norm": 20.125, "learning_rate": 9.266666666666667e-06, "loss": 23.5154, "mean_token_accuracy": 0.6499760080128908, "num_tokens": 2755347.0, "step": 140 }, { "entropy": 1.4506230603903532, "epoch": 0.24, "grad_norm": 19.625, "learning_rate": 9.933333333333334e-06, "loss": 23.2013, "mean_token_accuracy": 0.6523264441639185, "num_tokens": 2947346.0, "step": 150 }, { "entropy": 1.4590953961014748, "epoch": 0.256, "grad_norm": 18.5, "learning_rate": 1.0600000000000002e-05, "loss": 23.3227, "mean_token_accuracy": 0.6508617259562015, "num_tokens": 3139957.0, "step": 160 }, { "entropy": 1.419396448880434, "epoch": 0.272, "grad_norm": 19.75, "learning_rate": 1.1266666666666668e-05, "loss": 22.7352, "mean_token_accuracy": 0.6572458431124687, "num_tokens": 3335951.0, "step": 170 }, { "entropy": 1.4005608204752207, "epoch": 0.288, "grad_norm": 19.75, "learning_rate": 1.1933333333333335e-05, "loss": 22.3969, "mean_token_accuracy": 0.6585959013551473, "num_tokens": 3539731.0, "step": 180 }, { "entropy": 1.391934547200799, "epoch": 0.304, "grad_norm": 18.75, "learning_rate": 1.2600000000000001e-05, "loss": 22.31, "mean_token_accuracy": 0.6621056370437145, "num_tokens": 3733488.0, "step": 190 }, { "entropy": 1.4028674490749835, "epoch": 0.32, "grad_norm": 22.25, "learning_rate": 1.3266666666666668e-05, "loss": 22.5559, "mean_token_accuracy": 0.6576981086283922, "num_tokens": 3920545.0, "step": 200 }, { "epoch": 0.32, "eval_biology_entropy": 1.3209806289672852, "eval_biology_loss": 1.338399887084961, "eval_biology_mean_token_accuracy": 0.6720403518676757, "eval_biology_num_tokens": 3920545.0, "eval_biology_runtime": 48.5853, "eval_biology_samples_per_second": 10.291, "eval_biology_steps_per_second": 2.573, "step": 200 }, { "epoch": 0.32, "eval_chemistry_entropy": 1.0033348879814148, "eval_chemistry_loss": 0.9935092926025391, "eval_chemistry_mean_token_accuracy": 0.7448974308967591, "eval_chemistry_num_tokens": 3920545.0, "eval_chemistry_runtime": 60.24, "eval_chemistry_samples_per_second": 8.3, "eval_chemistry_steps_per_second": 2.075, "step": 200 }, { "epoch": 0.32, "eval_math_entropy": 0.8341804294586181, "eval_math_loss": 1.0635857582092285, "eval_math_mean_token_accuracy": 0.7432106451988221, "eval_math_num_tokens": 3920545.0, "eval_math_runtime": 61.8174, "eval_math_samples_per_second": 8.088, "eval_math_steps_per_second": 2.022, "step": 200 }, { "epoch": 0.32, "eval_physics_entropy": 0.9652358031272888, "eval_physics_loss": 0.9950281977653503, "eval_physics_mean_token_accuracy": 0.7510108857154846, "eval_physics_num_tokens": 3920545.0, "eval_physics_runtime": 70.411, "eval_physics_samples_per_second": 7.101, "eval_physics_steps_per_second": 1.775, "step": 200 }, { "entropy": 1.3548175282776356, "epoch": 0.336, "grad_norm": 19.625, "learning_rate": 1.3933333333333334e-05, "loss": 21.7763, "mean_token_accuracy": 0.6656343434005976, "num_tokens": 4114077.0, "step": 210 }, { "entropy": 1.3656601022928954, "epoch": 0.352, "grad_norm": 20.625, "learning_rate": 1.46e-05, "loss": 22.0972, "mean_token_accuracy": 0.6638848338276148, "num_tokens": 4306949.0, "step": 220 }, { "entropy": 1.3525194190442562, "epoch": 0.368, "grad_norm": 18.125, "learning_rate": 1.5266666666666667e-05, "loss": 21.7293, "mean_token_accuracy": 0.6680811226367951, "num_tokens": 4504001.0, "step": 230 }, { "entropy": 1.3454820621758699, "epoch": 0.384, "grad_norm": 21.25, "learning_rate": 1.5933333333333336e-05, "loss": 21.7032, "mean_token_accuracy": 0.6671383358538151, "num_tokens": 4693812.0, "step": 240 }, { "entropy": 1.3525703553110362, "epoch": 0.4, "grad_norm": 17.5, "learning_rate": 1.66e-05, "loss": 21.7856, "mean_token_accuracy": 0.666401931643486, "num_tokens": 4887094.0, "step": 250 }, { "entropy": 1.351718918606639, "epoch": 0.416, "grad_norm": 19.0, "learning_rate": 1.726666666666667e-05, "loss": 21.9058, "mean_token_accuracy": 0.6651136819273233, "num_tokens": 5085369.0, "step": 260 }, { "entropy": 1.3526419658213853, "epoch": 0.432, "grad_norm": 20.875, "learning_rate": 1.7933333333333333e-05, "loss": 21.7813, "mean_token_accuracy": 0.6668458927422762, "num_tokens": 5271275.0, "step": 270 }, { "entropy": 1.3480545241385697, "epoch": 0.448, "grad_norm": 22.875, "learning_rate": 1.86e-05, "loss": 21.627, "mean_token_accuracy": 0.6677324704825878, "num_tokens": 5460559.0, "step": 280 }, { "entropy": 1.301166184991598, "epoch": 0.464, "grad_norm": 21.25, "learning_rate": 1.926666666666667e-05, "loss": 20.889, "mean_token_accuracy": 0.676617132872343, "num_tokens": 5653809.0, "step": 290 }, { "entropy": 1.318466317281127, "epoch": 0.48, "grad_norm": 17.125, "learning_rate": 1.9933333333333334e-05, "loss": 21.2936, "mean_token_accuracy": 0.6712827417999506, "num_tokens": 5850176.0, "step": 300 }, { "epoch": 0.48, "eval_biology_entropy": 1.2827796216011047, "eval_biology_loss": 1.275201678276062, "eval_biology_mean_token_accuracy": 0.6830832781791687, "eval_biology_num_tokens": 5850176.0, "eval_biology_runtime": 48.4915, "eval_biology_samples_per_second": 10.311, "eval_biology_steps_per_second": 2.578, "step": 300 }, { "epoch": 0.48, "eval_chemistry_entropy": 0.983495129108429, "eval_chemistry_loss": 0.9488818645477295, "eval_chemistry_mean_token_accuracy": 0.7523409638404847, "eval_chemistry_num_tokens": 5850176.0, "eval_chemistry_runtime": 60.1707, "eval_chemistry_samples_per_second": 8.31, "eval_chemistry_steps_per_second": 2.077, "step": 300 }, { "epoch": 0.48, "eval_math_entropy": 0.8216862387657166, "eval_math_loss": 1.0297818183898926, "eval_math_mean_token_accuracy": 0.7488151121139527, "eval_math_num_tokens": 5850176.0, "eval_math_runtime": 61.6905, "eval_math_samples_per_second": 8.105, "eval_math_steps_per_second": 2.026, "step": 300 }, { "epoch": 0.48, "eval_physics_entropy": 0.9433758721351624, "eval_physics_loss": 0.9520999193191528, "eval_physics_mean_token_accuracy": 0.7585058889389038, "eval_physics_num_tokens": 5850176.0, "eval_physics_runtime": 70.301, "eval_physics_samples_per_second": 7.112, "eval_physics_steps_per_second": 1.778, "step": 300 }, { "entropy": 1.2579400472342968, "epoch": 0.496, "grad_norm": 17.75, "learning_rate": 1.9933333333333334e-05, "loss": 20.2011, "mean_token_accuracy": 0.6842056062072516, "num_tokens": 6046503.0, "step": 310 }, { "entropy": 1.3082518883049488, "epoch": 0.512, "grad_norm": 18.125, "learning_rate": 1.985925925925926e-05, "loss": 21.0658, "mean_token_accuracy": 0.6749501373618841, "num_tokens": 6240456.0, "step": 320 }, { "entropy": 1.3003981616348028, "epoch": 0.528, "grad_norm": 18.125, "learning_rate": 1.9785185185185187e-05, "loss": 20.9809, "mean_token_accuracy": 0.6757604543119669, "num_tokens": 6430555.0, "step": 330 }, { "entropy": 1.2986273631453513, "epoch": 0.544, "grad_norm": 17.0, "learning_rate": 1.971111111111111e-05, "loss": 20.8809, "mean_token_accuracy": 0.6782271713018417, "num_tokens": 6626006.0, "step": 340 }, { "entropy": 1.284830729290843, "epoch": 0.56, "grad_norm": 17.25, "learning_rate": 1.963703703703704e-05, "loss": 20.8197, "mean_token_accuracy": 0.6767117112874985, "num_tokens": 6820754.0, "step": 350 }, { "entropy": 1.2683125745505095, "epoch": 0.576, "grad_norm": 17.0, "learning_rate": 1.9562962962962964e-05, "loss": 20.4541, "mean_token_accuracy": 0.6809794403612613, "num_tokens": 7021844.0, "step": 360 }, { "entropy": 1.2863252360373736, "epoch": 0.592, "grad_norm": 18.875, "learning_rate": 1.948888888888889e-05, "loss": 20.8043, "mean_token_accuracy": 0.676701345667243, "num_tokens": 7213951.0, "step": 370 }, { "entropy": 1.2630502216517925, "epoch": 0.608, "grad_norm": 18.75, "learning_rate": 1.9414814814814817e-05, "loss": 20.4041, "mean_token_accuracy": 0.6803740747272968, "num_tokens": 7416773.0, "step": 380 }, { "entropy": 1.2804703898727894, "epoch": 0.624, "grad_norm": 19.25, "learning_rate": 1.9340740740740743e-05, "loss": 20.6218, "mean_token_accuracy": 0.6788272958248853, "num_tokens": 7612843.0, "step": 390 }, { "entropy": 1.2843346055597067, "epoch": 0.64, "grad_norm": 18.0, "learning_rate": 1.926666666666667e-05, "loss": 20.7171, "mean_token_accuracy": 0.6782444745302201, "num_tokens": 7801633.0, "step": 400 }, { "epoch": 0.64, "eval_biology_entropy": 1.226506398677826, "eval_biology_loss": 1.2382104396820068, "eval_biology_mean_token_accuracy": 0.6894095778465271, "eval_biology_num_tokens": 7801633.0, "eval_biology_runtime": 48.5507, "eval_biology_samples_per_second": 10.299, "eval_biology_steps_per_second": 2.575, "step": 400 }, { "epoch": 0.64, "eval_chemistry_entropy": 0.9317227191925049, "eval_chemistry_loss": 0.9207452535629272, "eval_chemistry_mean_token_accuracy": 0.7581370029449462, "eval_chemistry_num_tokens": 7801633.0, "eval_chemistry_runtime": 60.2113, "eval_chemistry_samples_per_second": 8.304, "eval_chemistry_steps_per_second": 2.076, "step": 400 }, { "epoch": 0.64, "eval_math_entropy": 0.7863595089912414, "eval_math_loss": 1.010460376739502, "eval_math_mean_token_accuracy": 0.7535392093658447, "eval_math_num_tokens": 7801633.0, "eval_math_runtime": 61.807, "eval_math_samples_per_second": 8.09, "eval_math_steps_per_second": 2.022, "step": 400 }, { "epoch": 0.64, "eval_physics_entropy": 0.8958085932731629, "eval_physics_loss": 0.9257401823997498, "eval_physics_mean_token_accuracy": 0.7637984156608582, "eval_physics_num_tokens": 7801633.0, "eval_physics_runtime": 70.3663, "eval_physics_samples_per_second": 7.106, "eval_physics_steps_per_second": 1.776, "step": 400 }, { "entropy": 1.278659427165985, "epoch": 0.656, "grad_norm": 18.25, "learning_rate": 1.9192592592592593e-05, "loss": 20.6682, "mean_token_accuracy": 0.6772829819470644, "num_tokens": 7995843.0, "step": 410 }, { "entropy": 1.2931427203118802, "epoch": 0.672, "grad_norm": 18.625, "learning_rate": 1.911851851851852e-05, "loss": 20.8656, "mean_token_accuracy": 0.6753748003393412, "num_tokens": 8183103.0, "step": 420 }, { "entropy": 1.2739692747592926, "epoch": 0.688, "grad_norm": 16.75, "learning_rate": 1.9044444444444446e-05, "loss": 20.5407, "mean_token_accuracy": 0.6812681049108505, "num_tokens": 8385976.0, "step": 430 }, { "entropy": 1.2659825466573238, "epoch": 0.704, "grad_norm": 16.25, "learning_rate": 1.8970370370370372e-05, "loss": 20.4243, "mean_token_accuracy": 0.6820976916700602, "num_tokens": 8578431.0, "step": 440 }, { "entropy": 1.220404140278697, "epoch": 0.72, "grad_norm": 16.75, "learning_rate": 1.8896296296296295e-05, "loss": 19.6546, "mean_token_accuracy": 0.6908745598047972, "num_tokens": 8781342.0, "step": 450 }, { "entropy": 1.2406103231012822, "epoch": 0.736, "grad_norm": 16.75, "learning_rate": 1.8822222222222225e-05, "loss": 19.9745, "mean_token_accuracy": 0.6853331789374352, "num_tokens": 8977918.0, "step": 460 }, { "entropy": 1.2618801843374967, "epoch": 0.752, "grad_norm": 17.125, "learning_rate": 1.874814814814815e-05, "loss": 20.4041, "mean_token_accuracy": 0.6825968738645315, "num_tokens": 9169322.0, "step": 470 }, { "entropy": 1.2232345014810562, "epoch": 0.768, "grad_norm": 19.25, "learning_rate": 1.8674074074074075e-05, "loss": 19.7045, "mean_token_accuracy": 0.6888250291347504, "num_tokens": 9368141.0, "step": 480 }, { "entropy": 1.25159954726696, "epoch": 0.784, "grad_norm": 18.25, "learning_rate": 1.86e-05, "loss": 20.2036, "mean_token_accuracy": 0.6849453710019588, "num_tokens": 9565236.0, "step": 490 }, { "entropy": 1.264250884205103, "epoch": 0.8, "grad_norm": 19.25, "learning_rate": 1.8525925925925928e-05, "loss": 20.5299, "mean_token_accuracy": 0.6811827480792999, "num_tokens": 9761227.0, "step": 500 }, { "epoch": 0.8, "eval_biology_entropy": 1.2163097896575927, "eval_biology_loss": 1.2177292108535767, "eval_biology_mean_token_accuracy": 0.6932459664344788, "eval_biology_num_tokens": 9761227.0, "eval_biology_runtime": 48.5438, "eval_biology_samples_per_second": 10.3, "eval_biology_steps_per_second": 2.575, "step": 500 }, { "epoch": 0.8, "eval_chemistry_entropy": 0.9239063205718994, "eval_chemistry_loss": 0.9047155380249023, "eval_chemistry_mean_token_accuracy": 0.761792631149292, "eval_chemistry_num_tokens": 9761227.0, "eval_chemistry_runtime": 59.9546, "eval_chemistry_samples_per_second": 8.34, "eval_chemistry_steps_per_second": 2.085, "step": 500 }, { "epoch": 0.8, "eval_math_entropy": 0.7864464523792267, "eval_math_loss": 0.9939978122711182, "eval_math_mean_token_accuracy": 0.7574145245552063, "eval_math_num_tokens": 9761227.0, "eval_math_runtime": 61.7812, "eval_math_samples_per_second": 8.093, "eval_math_steps_per_second": 2.023, "step": 500 }, { "epoch": 0.8, "eval_physics_entropy": 0.889360978603363, "eval_physics_loss": 0.9096766710281372, "eval_physics_mean_token_accuracy": 0.7674052910804748, "eval_physics_num_tokens": 9761227.0, "eval_physics_runtime": 70.5356, "eval_physics_samples_per_second": 7.089, "eval_physics_steps_per_second": 1.772, "step": 500 }, { "entropy": 1.2362793002277612, "epoch": 0.816, "grad_norm": 19.625, "learning_rate": 1.8451851851851855e-05, "loss": 19.8133, "mean_token_accuracy": 0.6863504596054554, "num_tokens": 9958727.0, "step": 510 }, { "entropy": 1.2254926670342683, "epoch": 0.832, "grad_norm": 17.0, "learning_rate": 1.8377777777777778e-05, "loss": 19.8307, "mean_token_accuracy": 0.6866675779223442, "num_tokens": 10155771.0, "step": 520 }, { "entropy": 1.2238412775099277, "epoch": 0.848, "grad_norm": 18.75, "learning_rate": 1.8303703703703704e-05, "loss": 19.687, "mean_token_accuracy": 0.6897137116640806, "num_tokens": 10357721.0, "step": 530 }, { "entropy": 1.2536957442760468, "epoch": 0.864, "grad_norm": 17.125, "learning_rate": 1.822962962962963e-05, "loss": 20.1565, "mean_token_accuracy": 0.6850291140377521, "num_tokens": 10552495.0, "step": 540 }, { "entropy": 1.231699001789093, "epoch": 0.88, "grad_norm": 18.25, "learning_rate": 1.8155555555555557e-05, "loss": 19.8536, "mean_token_accuracy": 0.6891282081604004, "num_tokens": 10748749.0, "step": 550 }, { "entropy": 1.2470501396805047, "epoch": 0.896, "grad_norm": 18.0, "learning_rate": 1.8081481481481484e-05, "loss": 20.1706, "mean_token_accuracy": 0.6856059569865465, "num_tokens": 10943319.0, "step": 560 }, { "entropy": 1.2307742841541767, "epoch": 0.912, "grad_norm": 18.25, "learning_rate": 1.800740740740741e-05, "loss": 19.9062, "mean_token_accuracy": 0.6885740786790848, "num_tokens": 11136935.0, "step": 570 }, { "entropy": 1.2445739306509496, "epoch": 0.928, "grad_norm": 18.0, "learning_rate": 1.7933333333333333e-05, "loss": 20.0979, "mean_token_accuracy": 0.6851089850068093, "num_tokens": 11331098.0, "step": 580 }, { "entropy": 1.2021468229591847, "epoch": 0.944, "grad_norm": 15.875, "learning_rate": 1.785925925925926e-05, "loss": 19.4077, "mean_token_accuracy": 0.6915138956159353, "num_tokens": 11530550.0, "step": 590 }, { "entropy": 1.226809823140502, "epoch": 0.96, "grad_norm": 19.625, "learning_rate": 1.7785185185185186e-05, "loss": 19.8062, "mean_token_accuracy": 0.6897286407649517, "num_tokens": 11729645.0, "step": 600 }, { "epoch": 0.96, "eval_biology_entropy": 1.1845180039405823, "eval_biology_loss": 1.203829050064087, "eval_biology_mean_token_accuracy": 0.6961685500144958, "eval_biology_num_tokens": 11729645.0, "eval_biology_runtime": 48.6169, "eval_biology_samples_per_second": 10.284, "eval_biology_steps_per_second": 2.571, "step": 600 }, { "epoch": 0.96, "eval_chemistry_entropy": 0.90015394115448, "eval_chemistry_loss": 0.8946329355239868, "eval_chemistry_mean_token_accuracy": 0.7635614371299744, "eval_chemistry_num_tokens": 11729645.0, "eval_chemistry_runtime": 60.2919, "eval_chemistry_samples_per_second": 8.293, "eval_chemistry_steps_per_second": 2.073, "step": 600 }, { "epoch": 0.96, "eval_math_entropy": 0.7684455904960632, "eval_math_loss": 0.9900413751602173, "eval_math_mean_token_accuracy": 0.7588200316429138, "eval_math_num_tokens": 11729645.0, "eval_math_runtime": 61.8301, "eval_math_samples_per_second": 8.087, "eval_math_steps_per_second": 2.022, "step": 600 }, { "epoch": 0.96, "eval_physics_entropy": 0.8686938014030456, "eval_physics_loss": 0.9008635878562927, "eval_physics_mean_token_accuracy": 0.7692377109527588, "eval_physics_num_tokens": 11729645.0, "eval_physics_runtime": 70.4349, "eval_physics_samples_per_second": 7.099, "eval_physics_steps_per_second": 1.775, "step": 600 }, { "entropy": 1.2100978799164295, "epoch": 0.976, "grad_norm": 18.0, "learning_rate": 1.7711111111111113e-05, "loss": 19.47, "mean_token_accuracy": 0.6918121088296175, "num_tokens": 11924644.0, "step": 610 }, { "entropy": 1.2226450834423304, "epoch": 0.992, "grad_norm": 16.625, "learning_rate": 1.763703703703704e-05, "loss": 19.8416, "mean_token_accuracy": 0.688375661149621, "num_tokens": 12123059.0, "step": 620 }, { "entropy": 1.2316548496484756, "epoch": 1.008, "grad_norm": 16.875, "learning_rate": 1.7562962962962962e-05, "loss": 19.6116, "mean_token_accuracy": 0.6919524800032377, "num_tokens": 12319366.0, "step": 630 }, { "entropy": 1.1779099617153406, "epoch": 1.024, "grad_norm": 19.0, "learning_rate": 1.7488888888888892e-05, "loss": 18.9763, "mean_token_accuracy": 0.6978646669536829, "num_tokens": 12524183.0, "step": 640 }, { "entropy": 1.2152834441512823, "epoch": 1.04, "grad_norm": 17.75, "learning_rate": 1.7414814814814815e-05, "loss": 19.6247, "mean_token_accuracy": 0.6903412740677595, "num_tokens": 12718593.0, "step": 650 }, { "entropy": 1.1799768891185523, "epoch": 1.056, "grad_norm": 18.625, "learning_rate": 1.7340740740740742e-05, "loss": 19.0432, "mean_token_accuracy": 0.6986244544386864, "num_tokens": 12917803.0, "step": 660 }, { "entropy": 1.2108702428638936, "epoch": 1.072, "grad_norm": 19.125, "learning_rate": 1.726666666666667e-05, "loss": 19.4166, "mean_token_accuracy": 0.6927186574786901, "num_tokens": 13105826.0, "step": 670 }, { "entropy": 1.1979756511747837, "epoch": 1.088, "grad_norm": 18.875, "learning_rate": 1.7192592592592595e-05, "loss": 19.2605, "mean_token_accuracy": 0.6957505799829959, "num_tokens": 13298619.0, "step": 680 }, { "entropy": 1.192365935444832, "epoch": 1.104, "grad_norm": 17.875, "learning_rate": 1.711851851851852e-05, "loss": 19.2461, "mean_token_accuracy": 0.695810866355896, "num_tokens": 13491486.0, "step": 690 }, { "entropy": 1.212946466356516, "epoch": 1.12, "grad_norm": 19.25, "learning_rate": 1.7044444444444445e-05, "loss": 19.5004, "mean_token_accuracy": 0.692466252297163, "num_tokens": 13674663.0, "step": 700 }, { "epoch": 1.12, "eval_biology_entropy": 1.1571769905090332, "eval_biology_loss": 1.1946450471878052, "eval_biology_mean_token_accuracy": 0.6972691407203674, "eval_biology_num_tokens": 13674663.0, "eval_biology_runtime": 48.6729, "eval_biology_samples_per_second": 10.273, "eval_biology_steps_per_second": 2.568, "step": 700 }, { "epoch": 1.12, "eval_chemistry_entropy": 0.8766862626075744, "eval_chemistry_loss": 0.8891168236732483, "eval_chemistry_mean_token_accuracy": 0.7645404329299926, "eval_chemistry_num_tokens": 13674663.0, "eval_chemistry_runtime": 60.3334, "eval_chemistry_samples_per_second": 8.287, "eval_chemistry_steps_per_second": 2.072, "step": 700 }, { "epoch": 1.12, "eval_math_entropy": 0.7603865313529968, "eval_math_loss": 0.9834137558937073, "eval_math_mean_token_accuracy": 0.7596666264533997, "eval_math_num_tokens": 13674663.0, "eval_math_runtime": 61.8146, "eval_math_samples_per_second": 8.089, "eval_math_steps_per_second": 2.022, "step": 700 }, { "epoch": 1.12, "eval_physics_entropy": 0.850571988105774, "eval_physics_loss": 0.8937918543815613, "eval_physics_mean_token_accuracy": 0.7703958468437195, "eval_physics_num_tokens": 13674663.0, "eval_physics_runtime": 70.4674, "eval_physics_samples_per_second": 7.095, "eval_physics_steps_per_second": 1.774, "step": 700 }, { "entropy": 1.1909121543169021, "epoch": 1.1360000000000001, "grad_norm": 18.875, "learning_rate": 1.697037037037037e-05, "loss": 19.2462, "mean_token_accuracy": 0.6955781776458025, "num_tokens": 13869134.0, "step": 710 }, { "entropy": 1.1682380847632885, "epoch": 1.152, "grad_norm": 16.75, "learning_rate": 1.6896296296296298e-05, "loss": 18.8229, "mean_token_accuracy": 0.6991135813295841, "num_tokens": 14078365.0, "step": 720 }, { "entropy": 1.1939557407051324, "epoch": 1.168, "grad_norm": 16.875, "learning_rate": 1.6822222222222224e-05, "loss": 19.1346, "mean_token_accuracy": 0.6960698150098323, "num_tokens": 14266831.0, "step": 730 }, { "entropy": 1.180869185552001, "epoch": 1.184, "grad_norm": 18.25, "learning_rate": 1.6748148148148147e-05, "loss": 19.2654, "mean_token_accuracy": 0.6941378649324179, "num_tokens": 14465660.0, "step": 740 }, { "entropy": 1.1937656667083503, "epoch": 1.2, "grad_norm": 18.25, "learning_rate": 1.6674074074074077e-05, "loss": 19.0305, "mean_token_accuracy": 0.6964295905083417, "num_tokens": 14653228.0, "step": 750 }, { "entropy": 1.1589823190122843, "epoch": 1.216, "grad_norm": 17.875, "learning_rate": 1.66e-05, "loss": 18.6048, "mean_token_accuracy": 0.7018654596060514, "num_tokens": 14857782.0, "step": 760 }, { "entropy": 1.1703605465590954, "epoch": 1.232, "grad_norm": 17.375, "learning_rate": 1.6525925925925927e-05, "loss": 18.8831, "mean_token_accuracy": 0.7015001580119133, "num_tokens": 15047356.0, "step": 770 }, { "entropy": 1.1772115517407655, "epoch": 1.248, "grad_norm": 17.875, "learning_rate": 1.6451851851851853e-05, "loss": 19.0432, "mean_token_accuracy": 0.6959997840225697, "num_tokens": 15241098.0, "step": 780 }, { "entropy": 1.196473068371415, "epoch": 1.264, "grad_norm": 16.375, "learning_rate": 1.637777777777778e-05, "loss": 19.1591, "mean_token_accuracy": 0.6967897292226553, "num_tokens": 15437657.0, "step": 790 }, { "entropy": 1.2014197081327438, "epoch": 1.28, "grad_norm": 19.125, "learning_rate": 1.6303703703703706e-05, "loss": 19.4409, "mean_token_accuracy": 0.6926549930125475, "num_tokens": 15630795.0, "step": 800 }, { "epoch": 1.28, "eval_biology_entropy": 1.134603425502777, "eval_biology_loss": 1.1884372234344482, "eval_biology_mean_token_accuracy": 0.6986491298675537, "eval_biology_num_tokens": 15630795.0, "eval_biology_runtime": 48.6306, "eval_biology_samples_per_second": 10.282, "eval_biology_steps_per_second": 2.57, "step": 800 }, { "epoch": 1.28, "eval_chemistry_entropy": 0.8623910093307495, "eval_chemistry_loss": 0.885444700717926, "eval_chemistry_mean_token_accuracy": 0.7653528556823731, "eval_chemistry_num_tokens": 15630795.0, "eval_chemistry_runtime": 60.3508, "eval_chemistry_samples_per_second": 8.285, "eval_chemistry_steps_per_second": 2.071, "step": 800 }, { "epoch": 1.28, "eval_math_entropy": 0.7589023416042328, "eval_math_loss": 0.983073353767395, "eval_math_mean_token_accuracy": 0.7593517408370972, "eval_math_num_tokens": 15630795.0, "eval_math_runtime": 61.9026, "eval_math_samples_per_second": 8.077, "eval_math_steps_per_second": 2.019, "step": 800 }, { "epoch": 1.28, "eval_physics_entropy": 0.84130739736557, "eval_physics_loss": 0.8907755613327026, "eval_physics_mean_token_accuracy": 0.771062777519226, "eval_physics_num_tokens": 15630795.0, "eval_physics_runtime": 70.5226, "eval_physics_samples_per_second": 7.09, "eval_physics_steps_per_second": 1.772, "step": 800 }, { "entropy": 1.2085642520338298, "epoch": 1.296, "grad_norm": 17.375, "learning_rate": 1.622962962962963e-05, "loss": 19.3831, "mean_token_accuracy": 0.6933640763163567, "num_tokens": 15827105.0, "step": 810 }, { "entropy": 1.1861349143087865, "epoch": 1.312, "grad_norm": 18.75, "learning_rate": 1.6155555555555556e-05, "loss": 19.3103, "mean_token_accuracy": 0.694928414747119, "num_tokens": 16019645.0, "step": 820 }, { "entropy": 1.195632776618004, "epoch": 1.328, "grad_norm": 18.5, "learning_rate": 1.6081481481481482e-05, "loss": 19.3068, "mean_token_accuracy": 0.6934712298214436, "num_tokens": 16221726.0, "step": 830 }, { "entropy": 1.1725192748010158, "epoch": 1.3439999999999999, "grad_norm": 17.0, "learning_rate": 1.600740740740741e-05, "loss": 18.7963, "mean_token_accuracy": 0.700145885720849, "num_tokens": 16427594.0, "step": 840 }, { "entropy": 1.179823150858283, "epoch": 1.3599999999999999, "grad_norm": 18.75, "learning_rate": 1.5933333333333336e-05, "loss": 19.1154, "mean_token_accuracy": 0.6961398232728243, "num_tokens": 16621605.0, "step": 850 }, { "entropy": 1.2228495314717294, "epoch": 1.376, "grad_norm": 19.5, "learning_rate": 1.5859259259259262e-05, "loss": 19.6627, "mean_token_accuracy": 0.6894211061298847, "num_tokens": 16813444.0, "step": 860 }, { "entropy": 1.19021125882864, "epoch": 1.392, "grad_norm": 17.5, "learning_rate": 1.5785185185185185e-05, "loss": 19.2411, "mean_token_accuracy": 0.6957099426537752, "num_tokens": 17006509.0, "step": 870 }, { "entropy": 1.184871331602335, "epoch": 1.408, "grad_norm": 15.4375, "learning_rate": 1.571111111111111e-05, "loss": 18.9785, "mean_token_accuracy": 0.6971315786242485, "num_tokens": 17197870.0, "step": 880 }, { "entropy": 1.1884775411337614, "epoch": 1.424, "grad_norm": 17.25, "learning_rate": 1.5637037037037038e-05, "loss": 19.1289, "mean_token_accuracy": 0.697301234304905, "num_tokens": 17394390.0, "step": 890 }, { "entropy": 1.1825116220861673, "epoch": 1.44, "grad_norm": 20.75, "learning_rate": 1.5562962962962965e-05, "loss": 19.1266, "mean_token_accuracy": 0.6968252252787351, "num_tokens": 17587777.0, "step": 900 }, { "epoch": 1.44, "eval_biology_entropy": 1.1672434105873108, "eval_biology_loss": 1.1815813779830933, "eval_biology_mean_token_accuracy": 0.7003293070793152, "eval_biology_num_tokens": 17587777.0, "eval_biology_runtime": 48.6205, "eval_biology_samples_per_second": 10.284, "eval_biology_steps_per_second": 2.571, "step": 900 }, { "epoch": 1.44, "eval_chemistry_entropy": 0.8869817838668823, "eval_chemistry_loss": 0.8829970955848694, "eval_chemistry_mean_token_accuracy": 0.7656374487876892, "eval_chemistry_num_tokens": 17587777.0, "eval_chemistry_runtime": 60.3339, "eval_chemistry_samples_per_second": 8.287, "eval_chemistry_steps_per_second": 2.072, "step": 900 }, { "epoch": 1.44, "eval_math_entropy": 0.7674483435153961, "eval_math_loss": 0.9792445302009583, "eval_math_mean_token_accuracy": 0.7597224740982056, "eval_math_num_tokens": 17587777.0, "eval_math_runtime": 61.8239, "eval_math_samples_per_second": 8.087, "eval_math_steps_per_second": 2.022, "step": 900 }, { "epoch": 1.44, "eval_physics_entropy": 0.8621900615692139, "eval_physics_loss": 0.8884776830673218, "eval_physics_mean_token_accuracy": 0.7715646696090698, "eval_physics_num_tokens": 17587777.0, "eval_physics_runtime": 70.446, "eval_physics_samples_per_second": 7.098, "eval_physics_steps_per_second": 1.774, "step": 900 }, { "entropy": 1.1818725422024727, "epoch": 1.456, "grad_norm": 18.25, "learning_rate": 1.548888888888889e-05, "loss": 19.032, "mean_token_accuracy": 0.698200449720025, "num_tokens": 17788456.0, "step": 910 }, { "entropy": 1.1769807077944279, "epoch": 1.472, "grad_norm": 16.0, "learning_rate": 1.5414814814814814e-05, "loss": 18.8791, "mean_token_accuracy": 0.7008779179304838, "num_tokens": 17984063.0, "step": 920 }, { "entropy": 1.1641013238579034, "epoch": 1.488, "grad_norm": 18.5, "learning_rate": 1.5340740740740744e-05, "loss": 18.9913, "mean_token_accuracy": 0.6998269848525525, "num_tokens": 18175640.0, "step": 930 }, { "entropy": 1.1960251219570637, "epoch": 1.504, "grad_norm": 17.0, "learning_rate": 1.5266666666666667e-05, "loss": 19.2076, "mean_token_accuracy": 0.696322177350521, "num_tokens": 18367857.0, "step": 940 }, { "entropy": 1.1745740845799446, "epoch": 1.52, "grad_norm": 16.875, "learning_rate": 1.5192592592592594e-05, "loss": 19.0307, "mean_token_accuracy": 0.6969408400356769, "num_tokens": 18569146.0, "step": 950 }, { "entropy": 1.2008745949715376, "epoch": 1.536, "grad_norm": 18.75, "learning_rate": 1.5118518518518519e-05, "loss": 19.2895, "mean_token_accuracy": 0.6946466054767371, "num_tokens": 18755079.0, "step": 960 }, { "entropy": 1.1710849691182375, "epoch": 1.552, "grad_norm": 19.375, "learning_rate": 1.5044444444444445e-05, "loss": 18.9073, "mean_token_accuracy": 0.699705482646823, "num_tokens": 18956248.0, "step": 970 }, { "entropy": 1.163971472159028, "epoch": 1.568, "grad_norm": 18.5, "learning_rate": 1.497037037037037e-05, "loss": 18.7379, "mean_token_accuracy": 0.7023108277469874, "num_tokens": 19150315.0, "step": 980 }, { "entropy": 1.1755164857953786, "epoch": 1.584, "grad_norm": 17.75, "learning_rate": 1.4896296296296298e-05, "loss": 18.8826, "mean_token_accuracy": 0.6997408363968134, "num_tokens": 19344260.0, "step": 990 }, { "entropy": 1.2020615819841622, "epoch": 1.6, "grad_norm": 17.125, "learning_rate": 1.4822222222222225e-05, "loss": 19.3858, "mean_token_accuracy": 0.6933612376451492, "num_tokens": 19532552.0, "step": 1000 }, { "epoch": 1.6, "eval_biology_entropy": 1.1451181559562682, "eval_biology_loss": 1.1767185926437378, "eval_biology_mean_token_accuracy": 0.7013368840217591, "eval_biology_num_tokens": 19532552.0, "eval_biology_runtime": 48.6261, "eval_biology_samples_per_second": 10.283, "eval_biology_steps_per_second": 2.571, "step": 1000 }, { "epoch": 1.6, "eval_chemistry_entropy": 0.8642943887710571, "eval_chemistry_loss": 0.8798553347587585, "eval_chemistry_mean_token_accuracy": 0.7664505195617676, "eval_chemistry_num_tokens": 19532552.0, "eval_chemistry_runtime": 59.8839, "eval_chemistry_samples_per_second": 8.349, "eval_chemistry_steps_per_second": 2.087, "step": 1000 }, { "epoch": 1.6, "eval_math_entropy": 0.7490488801002503, "eval_math_loss": 0.9804874062538147, "eval_math_mean_token_accuracy": 0.7602896738052368, "eval_math_num_tokens": 19532552.0, "eval_math_runtime": 61.7317, "eval_math_samples_per_second": 8.1, "eval_math_steps_per_second": 2.025, "step": 1000 }, { "epoch": 1.6, "eval_physics_entropy": 0.8397411880493164, "eval_physics_loss": 0.885618269443512, "eval_physics_mean_token_accuracy": 0.7722613711357117, "eval_physics_num_tokens": 19532552.0, "eval_physics_runtime": 70.4574, "eval_physics_samples_per_second": 7.096, "eval_physics_steps_per_second": 1.774, "step": 1000 }, { "entropy": 1.1645312760025264, "epoch": 1.616, "grad_norm": 17.125, "learning_rate": 1.474814814814815e-05, "loss": 18.8318, "mean_token_accuracy": 0.7000049009919167, "num_tokens": 19732719.0, "step": 1010 }, { "entropy": 1.2018159918487072, "epoch": 1.6320000000000001, "grad_norm": 19.625, "learning_rate": 1.4674074074074076e-05, "loss": 19.3741, "mean_token_accuracy": 0.6942509710788727, "num_tokens": 19926830.0, "step": 1020 }, { "entropy": 1.1848741736263038, "epoch": 1.6480000000000001, "grad_norm": 16.25, "learning_rate": 1.46e-05, "loss": 19.0931, "mean_token_accuracy": 0.6962833561003208, "num_tokens": 20118800.0, "step": 1030 }, { "entropy": 1.1461675189435483, "epoch": 1.6640000000000001, "grad_norm": 16.5, "learning_rate": 1.4525925925925927e-05, "loss": 18.5384, "mean_token_accuracy": 0.7037994157522917, "num_tokens": 20320511.0, "step": 1040 }, { "entropy": 1.1853893544524907, "epoch": 1.6800000000000002, "grad_norm": 18.375, "learning_rate": 1.4451851851851852e-05, "loss": 18.9769, "mean_token_accuracy": 0.6986714884638786, "num_tokens": 20513393.0, "step": 1050 }, { "entropy": 1.1721675164997578, "epoch": 1.696, "grad_norm": 17.5, "learning_rate": 1.4377777777777779e-05, "loss": 18.9508, "mean_token_accuracy": 0.7003318756818772, "num_tokens": 20707237.0, "step": 1060 }, { "entropy": 1.1738170266151429, "epoch": 1.712, "grad_norm": 19.5, "learning_rate": 1.4303703703703703e-05, "loss": 18.9752, "mean_token_accuracy": 0.6993441980332136, "num_tokens": 20910419.0, "step": 1070 }, { "entropy": 1.190333865955472, "epoch": 1.728, "grad_norm": 18.5, "learning_rate": 1.4229629629629632e-05, "loss": 19.1838, "mean_token_accuracy": 0.6956166718155146, "num_tokens": 21107498.0, "step": 1080 }, { "entropy": 1.1574083410203457, "epoch": 1.744, "grad_norm": 18.25, "learning_rate": 1.4155555555555556e-05, "loss": 18.5378, "mean_token_accuracy": 0.7021385233849287, "num_tokens": 21303955.0, "step": 1090 }, { "entropy": 1.1666433937847613, "epoch": 1.76, "grad_norm": 18.75, "learning_rate": 1.4081481481481483e-05, "loss": 18.9266, "mean_token_accuracy": 0.7011374026536942, "num_tokens": 21499572.0, "step": 1100 }, { "epoch": 1.76, "eval_biology_entropy": 1.1345319437980652, "eval_biology_loss": 1.1725776195526123, "eval_biology_mean_token_accuracy": 0.7024298944473266, "eval_biology_num_tokens": 21499572.0, "eval_biology_runtime": 48.5727, "eval_biology_samples_per_second": 10.294, "eval_biology_steps_per_second": 2.573, "step": 1100 }, { "epoch": 1.76, "eval_chemistry_entropy": 0.8619537029266358, "eval_chemistry_loss": 0.8766760230064392, "eval_chemistry_mean_token_accuracy": 0.7667445015907287, "eval_chemistry_num_tokens": 21499572.0, "eval_chemistry_runtime": 60.4564, "eval_chemistry_samples_per_second": 8.27, "eval_chemistry_steps_per_second": 2.068, "step": 1100 }, { "epoch": 1.76, "eval_math_entropy": 0.7560438480377197, "eval_math_loss": 0.9768902063369751, "eval_math_mean_token_accuracy": 0.7610512175559998, "eval_math_num_tokens": 21499572.0, "eval_math_runtime": 61.7554, "eval_math_samples_per_second": 8.096, "eval_math_steps_per_second": 2.024, "step": 1100 }, { "epoch": 1.76, "eval_physics_entropy": 0.840686586856842, "eval_physics_loss": 0.8825888633728027, "eval_physics_mean_token_accuracy": 0.7726316556930543, "eval_physics_num_tokens": 21499572.0, "eval_physics_runtime": 70.3673, "eval_physics_samples_per_second": 7.106, "eval_physics_steps_per_second": 1.776, "step": 1100 }, { "entropy": 1.177320409566164, "epoch": 1.776, "grad_norm": 17.875, "learning_rate": 1.400740740740741e-05, "loss": 18.9676, "mean_token_accuracy": 0.6986722864210606, "num_tokens": 21692804.0, "step": 1110 }, { "entropy": 1.148191200569272, "epoch": 1.792, "grad_norm": 19.875, "learning_rate": 1.3933333333333334e-05, "loss": 18.5112, "mean_token_accuracy": 0.7040422059595585, "num_tokens": 21894218.0, "step": 1120 }, { "entropy": 1.1709640648216009, "epoch": 1.808, "grad_norm": 19.5, "learning_rate": 1.385925925925926e-05, "loss": 18.8424, "mean_token_accuracy": 0.7009096905589104, "num_tokens": 22082522.0, "step": 1130 }, { "entropy": 1.1934551119804382, "epoch": 1.8239999999999998, "grad_norm": 19.375, "learning_rate": 1.3785185185185186e-05, "loss": 19.3396, "mean_token_accuracy": 0.693663826212287, "num_tokens": 22278933.0, "step": 1140 }, { "entropy": 1.2042058877646924, "epoch": 1.8399999999999999, "grad_norm": 19.375, "learning_rate": 1.3711111111111112e-05, "loss": 19.393, "mean_token_accuracy": 0.6937405589967967, "num_tokens": 22473801.0, "step": 1150 }, { "entropy": 1.1648973379284144, "epoch": 1.8559999999999999, "grad_norm": 18.125, "learning_rate": 1.3637037037037037e-05, "loss": 18.7484, "mean_token_accuracy": 0.6991217479109764, "num_tokens": 22677853.0, "step": 1160 }, { "entropy": 1.1554684847593308, "epoch": 1.8719999999999999, "grad_norm": 17.375, "learning_rate": 1.3562962962962965e-05, "loss": 18.5305, "mean_token_accuracy": 0.7038368381559849, "num_tokens": 22874965.0, "step": 1170 }, { "entropy": 1.1899018451571464, "epoch": 1.888, "grad_norm": 20.75, "learning_rate": 1.3488888888888888e-05, "loss": 19.2428, "mean_token_accuracy": 0.6950660139322281, "num_tokens": 23068892.0, "step": 1180 }, { "entropy": 1.1851763129234314, "epoch": 1.904, "grad_norm": 18.375, "learning_rate": 1.3414814814814817e-05, "loss": 19.126, "mean_token_accuracy": 0.6976218212395906, "num_tokens": 23263827.0, "step": 1190 }, { "entropy": 1.1869227845221757, "epoch": 1.92, "grad_norm": 18.375, "learning_rate": 1.3340740740740741e-05, "loss": 19.135, "mean_token_accuracy": 0.6959322843700647, "num_tokens": 23463627.0, "step": 1200 }, { "epoch": 1.92, "eval_biology_entropy": 1.1337207446098327, "eval_biology_loss": 1.168716311454773, "eval_biology_mean_token_accuracy": 0.7030426645278931, "eval_biology_num_tokens": 23463627.0, "eval_biology_runtime": 48.5387, "eval_biology_samples_per_second": 10.301, "eval_biology_steps_per_second": 2.575, "step": 1200 }, { "epoch": 1.92, "eval_chemistry_entropy": 0.8601148505210876, "eval_chemistry_loss": 0.8744351267814636, "eval_chemistry_mean_token_accuracy": 0.767286482334137, "eval_chemistry_num_tokens": 23463627.0, "eval_chemistry_runtime": 60.2148, "eval_chemistry_samples_per_second": 8.304, "eval_chemistry_steps_per_second": 2.076, "step": 1200 }, { "epoch": 1.92, "eval_math_entropy": 0.7519172282218933, "eval_math_loss": 0.9753768444061279, "eval_math_mean_token_accuracy": 0.7615019774436951, "eval_math_num_tokens": 23463627.0, "eval_math_runtime": 61.7169, "eval_math_samples_per_second": 8.102, "eval_math_steps_per_second": 2.025, "step": 1200 }, { "epoch": 1.92, "eval_physics_entropy": 0.836491331577301, "eval_physics_loss": 0.8801184296607971, "eval_physics_mean_token_accuracy": 0.7731836423873901, "eval_physics_num_tokens": 23463627.0, "eval_physics_runtime": 70.3444, "eval_physics_samples_per_second": 7.108, "eval_physics_steps_per_second": 1.777, "step": 1200 }, { "entropy": 1.1517000958323478, "epoch": 1.936, "grad_norm": 18.875, "learning_rate": 1.3266666666666668e-05, "loss": 18.5112, "mean_token_accuracy": 0.7040315445512533, "num_tokens": 23660418.0, "step": 1210 }, { "entropy": 1.172454860061407, "epoch": 1.952, "grad_norm": 18.75, "learning_rate": 1.3192592592592594e-05, "loss": 18.8708, "mean_token_accuracy": 0.6998139064759016, "num_tokens": 23858145.0, "step": 1220 }, { "entropy": 1.18152665682137, "epoch": 1.968, "grad_norm": 18.5, "learning_rate": 1.311851851851852e-05, "loss": 19.0753, "mean_token_accuracy": 0.6981570664793253, "num_tokens": 24053364.0, "step": 1230 }, { "entropy": 1.1768125779926777, "epoch": 1.984, "grad_norm": 17.625, "learning_rate": 1.3044444444444446e-05, "loss": 18.9463, "mean_token_accuracy": 0.6988612022250891, "num_tokens": 24249465.0, "step": 1240 }, { "entropy": 1.180902672186494, "epoch": 2.0, "grad_norm": 19.375, "learning_rate": 1.297037037037037e-05, "loss": 18.999, "mean_token_accuracy": 0.7006669268012047, "num_tokens": 24442582.0, "step": 1250 }, { "entropy": 1.1616276282817126, "epoch": 2.016, "grad_norm": 19.75, "learning_rate": 1.2896296296296299e-05, "loss": 18.592, "mean_token_accuracy": 0.7041712146252394, "num_tokens": 24632353.0, "step": 1260 }, { "entropy": 1.1689807120710611, "epoch": 2.032, "grad_norm": 18.625, "learning_rate": 1.2822222222222222e-05, "loss": 18.9039, "mean_token_accuracy": 0.7014766734093427, "num_tokens": 24822715.0, "step": 1270 }, { "entropy": 1.1456513587385415, "epoch": 2.048, "grad_norm": 20.0, "learning_rate": 1.274814814814815e-05, "loss": 18.4096, "mean_token_accuracy": 0.7051771484315396, "num_tokens": 25023118.0, "step": 1280 }, { "entropy": 1.1587952699512243, "epoch": 2.064, "grad_norm": 18.625, "learning_rate": 1.2674074074074075e-05, "loss": 18.6378, "mean_token_accuracy": 0.7034870360046626, "num_tokens": 25217414.0, "step": 1290 }, { "entropy": 1.1504007514566184, "epoch": 2.08, "grad_norm": 18.0, "learning_rate": 1.2600000000000001e-05, "loss": 18.4703, "mean_token_accuracy": 0.7041630525141954, "num_tokens": 25408961.0, "step": 1300 }, { "epoch": 2.08, "eval_biology_entropy": 1.1164145894050599, "eval_biology_loss": 1.1683411598205566, "eval_biology_mean_token_accuracy": 0.7027085943222046, "eval_biology_num_tokens": 25408961.0, "eval_biology_runtime": 48.5837, "eval_biology_samples_per_second": 10.292, "eval_biology_steps_per_second": 2.573, "step": 1300 }, { "epoch": 2.08, "eval_chemistry_entropy": 0.8446620798110962, "eval_chemistry_loss": 0.8756071925163269, "eval_chemistry_mean_token_accuracy": 0.7672800846099853, "eval_chemistry_num_tokens": 25408961.0, "eval_chemistry_runtime": 60.224, "eval_chemistry_samples_per_second": 8.302, "eval_chemistry_steps_per_second": 2.076, "step": 1300 }, { "epoch": 2.08, "eval_math_entropy": 0.7416602709293365, "eval_math_loss": 0.9767736792564392, "eval_math_mean_token_accuracy": 0.7612695918083191, "eval_math_num_tokens": 25408961.0, "eval_math_runtime": 61.7354, "eval_math_samples_per_second": 8.099, "eval_math_steps_per_second": 2.025, "step": 1300 }, { "epoch": 2.08, "eval_physics_entropy": 0.822511604309082, "eval_physics_loss": 0.8807807564735413, "eval_physics_mean_token_accuracy": 0.7734324297904969, "eval_physics_num_tokens": 25408961.0, "eval_physics_runtime": 70.3722, "eval_physics_samples_per_second": 7.105, "eval_physics_steps_per_second": 1.776, "step": 1300 }, { "entropy": 1.145626274123788, "epoch": 2.096, "grad_norm": 20.75, "learning_rate": 1.2525925925925928e-05, "loss": 18.4669, "mean_token_accuracy": 0.7053351275622844, "num_tokens": 25600511.0, "step": 1310 }, { "entropy": 1.1261590894311666, "epoch": 2.112, "grad_norm": 19.375, "learning_rate": 1.2451851851851853e-05, "loss": 18.0421, "mean_token_accuracy": 0.7101508729159832, "num_tokens": 25796565.0, "step": 1320 }, { "entropy": 1.1319866240024568, "epoch": 2.128, "grad_norm": 18.375, "learning_rate": 1.237777777777778e-05, "loss": 18.2326, "mean_token_accuracy": 0.7063661482185125, "num_tokens": 25991156.0, "step": 1330 }, { "entropy": 1.12694109082222, "epoch": 2.144, "grad_norm": 21.625, "learning_rate": 1.2303703703703704e-05, "loss": 18.27, "mean_token_accuracy": 0.7078616585582495, "num_tokens": 26193237.0, "step": 1340 }, { "entropy": 1.1670064296573401, "epoch": 2.16, "grad_norm": 20.0, "learning_rate": 1.222962962962963e-05, "loss": 18.6321, "mean_token_accuracy": 0.704596522077918, "num_tokens": 26387993.0, "step": 1350 }, { "entropy": 1.1611683428287507, "epoch": 2.176, "grad_norm": 19.75, "learning_rate": 1.2155555555555555e-05, "loss": 18.8084, "mean_token_accuracy": 0.7007863517850638, "num_tokens": 26585269.0, "step": 1360 }, { "entropy": 1.1334992978721856, "epoch": 2.192, "grad_norm": 17.0, "learning_rate": 1.2081481481481484e-05, "loss": 18.1002, "mean_token_accuracy": 0.7102227192372084, "num_tokens": 26776318.0, "step": 1370 }, { "entropy": 1.142113695293665, "epoch": 2.208, "grad_norm": 18.25, "learning_rate": 1.2007407407407408e-05, "loss": 18.4288, "mean_token_accuracy": 0.7056139782071114, "num_tokens": 26974420.0, "step": 1380 }, { "entropy": 1.165258849412203, "epoch": 2.224, "grad_norm": 18.375, "learning_rate": 1.1933333333333335e-05, "loss": 18.8072, "mean_token_accuracy": 0.7021366007626056, "num_tokens": 27167577.0, "step": 1390 }, { "entropy": 1.1230817057192326, "epoch": 2.24, "grad_norm": 18.0, "learning_rate": 1.185925925925926e-05, "loss": 18.0154, "mean_token_accuracy": 0.7094326011836529, "num_tokens": 27364189.0, "step": 1400 }, { "epoch": 2.24, "eval_biology_entropy": 1.107084683418274, "eval_biology_loss": 1.166341781616211, "eval_biology_mean_token_accuracy": 0.7033038935661315, "eval_biology_num_tokens": 27364189.0, "eval_biology_runtime": 48.5794, "eval_biology_samples_per_second": 10.292, "eval_biology_steps_per_second": 2.573, "step": 1400 }, { "epoch": 2.24, "eval_chemistry_entropy": 0.8436218018531799, "eval_chemistry_loss": 0.8749056458473206, "eval_chemistry_mean_token_accuracy": 0.7673236474990844, "eval_chemistry_num_tokens": 27364189.0, "eval_chemistry_runtime": 60.2275, "eval_chemistry_samples_per_second": 8.302, "eval_chemistry_steps_per_second": 2.075, "step": 1400 }, { "epoch": 2.24, "eval_math_entropy": 0.7436552357673645, "eval_math_loss": 0.9768530130386353, "eval_math_mean_token_accuracy": 0.7615942449569703, "eval_math_num_tokens": 27364189.0, "eval_math_runtime": 61.7353, "eval_math_samples_per_second": 8.099, "eval_math_steps_per_second": 2.025, "step": 1400 }, { "epoch": 2.24, "eval_physics_entropy": 0.8238044924736023, "eval_physics_loss": 0.880571186542511, "eval_physics_mean_token_accuracy": 0.7732309465408325, "eval_physics_num_tokens": 27364189.0, "eval_physics_runtime": 70.3425, "eval_physics_samples_per_second": 7.108, "eval_physics_steps_per_second": 1.777, "step": 1400 }, { "entropy": 1.1227998584508896, "epoch": 2.2560000000000002, "grad_norm": 17.875, "learning_rate": 1.1785185185185186e-05, "loss": 17.9995, "mean_token_accuracy": 0.71092384532094, "num_tokens": 27557387.0, "step": 1410 }, { "entropy": 1.1192971892654895, "epoch": 2.2720000000000002, "grad_norm": 19.5, "learning_rate": 1.1711111111111113e-05, "loss": 18.0703, "mean_token_accuracy": 0.7088606022298336, "num_tokens": 27755725.0, "step": 1420 }, { "entropy": 1.157552171498537, "epoch": 2.288, "grad_norm": 18.75, "learning_rate": 1.1637037037037037e-05, "loss": 18.6818, "mean_token_accuracy": 0.7010403741151094, "num_tokens": 27950694.0, "step": 1430 }, { "entropy": 1.1524705573916436, "epoch": 2.304, "grad_norm": 17.75, "learning_rate": 1.1562962962962964e-05, "loss": 18.6601, "mean_token_accuracy": 0.7030924465507269, "num_tokens": 28150719.0, "step": 1440 }, { "entropy": 1.1215086288750171, "epoch": 2.32, "grad_norm": 17.5, "learning_rate": 1.1488888888888889e-05, "loss": 17.9268, "mean_token_accuracy": 0.7104179698973894, "num_tokens": 28348652.0, "step": 1450 }, { "entropy": 1.1350885152816772, "epoch": 2.336, "grad_norm": 18.375, "learning_rate": 1.1414814814814817e-05, "loss": 18.3565, "mean_token_accuracy": 0.7067790202796459, "num_tokens": 28542945.0, "step": 1460 }, { "entropy": 1.156265541538596, "epoch": 2.352, "grad_norm": 19.0, "learning_rate": 1.1340740740740742e-05, "loss": 18.6992, "mean_token_accuracy": 0.7013768840581178, "num_tokens": 28731927.0, "step": 1470 }, { "entropy": 1.1396565582603215, "epoch": 2.368, "grad_norm": 18.75, "learning_rate": 1.1266666666666668e-05, "loss": 18.2949, "mean_token_accuracy": 0.7059750188142061, "num_tokens": 28929298.0, "step": 1480 }, { "entropy": 1.1448044694960118, "epoch": 2.384, "grad_norm": 20.625, "learning_rate": 1.1192592592592593e-05, "loss": 18.526, "mean_token_accuracy": 0.703592960909009, "num_tokens": 29121142.0, "step": 1490 }, { "entropy": 1.140185246989131, "epoch": 2.4, "grad_norm": 18.0, "learning_rate": 1.111851851851852e-05, "loss": 18.3273, "mean_token_accuracy": 0.706566022336483, "num_tokens": 29317919.0, "step": 1500 }, { "epoch": 2.4, "eval_biology_entropy": 1.1156310276985169, "eval_biology_loss": 1.1647558212280273, "eval_biology_mean_token_accuracy": 0.7032402672767639, "eval_biology_num_tokens": 29317919.0, "eval_biology_runtime": 48.611, "eval_biology_samples_per_second": 10.286, "eval_biology_steps_per_second": 2.571, "step": 1500 }, { "epoch": 2.4, "eval_chemistry_entropy": 0.8463425951004029, "eval_chemistry_loss": 0.8742334246635437, "eval_chemistry_mean_token_accuracy": 0.767361388683319, "eval_chemistry_num_tokens": 29317919.0, "eval_chemistry_runtime": 59.9689, "eval_chemistry_samples_per_second": 8.338, "eval_chemistry_steps_per_second": 2.084, "step": 1500 }, { "epoch": 2.4, "eval_math_entropy": 0.7472673971652984, "eval_math_loss": 0.9761422872543335, "eval_math_mean_token_accuracy": 0.7615765709877014, "eval_math_num_tokens": 29317919.0, "eval_math_runtime": 61.7442, "eval_math_samples_per_second": 8.098, "eval_math_steps_per_second": 2.024, "step": 1500 }, { "epoch": 2.4, "eval_physics_entropy": 0.8270321841239929, "eval_physics_loss": 0.8801943063735962, "eval_physics_mean_token_accuracy": 0.7734404511451721, "eval_physics_num_tokens": 29317919.0, "eval_physics_runtime": 70.5165, "eval_physics_samples_per_second": 7.091, "eval_physics_steps_per_second": 1.773, "step": 1500 } ], "logging_steps": 10, "max_steps": 3000, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.305483613425587e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }