diff --git "a/checkpoint-5000/trainer_state.json" "b/checkpoint-5000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-5000/trainer_state.json" @@ -0,0 +1,7234 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.2252267289823626, + "eval_steps": 100, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 0.7626848379150033, + "epoch": 0.004451121126133645, + "grad_norm": 1888.0, + "learning_rate": 1.8e-07, + "loss": 36.6657, + "mean_token_accuracy": 0.6676603280007839, + "num_tokens": 63133.0, + "step": 10 + }, + { + "entropy": 0.81443009134382, + "epoch": 0.00890224225226729, + "grad_norm": 1792.0, + "learning_rate": 3.8e-07, + "loss": 36.4468, + "mean_token_accuracy": 0.6505163595080375, + "num_tokens": 128968.0, + "step": 20 + }, + { + "entropy": 0.8291782144457102, + "epoch": 0.013353363378400934, + "grad_norm": 968.0, + "learning_rate": 5.800000000000001e-07, + "loss": 34.7077, + "mean_token_accuracy": 0.6580854222178459, + "num_tokens": 193314.0, + "step": 30 + }, + { + "entropy": 0.9378003980964422, + "epoch": 0.01780448450453458, + "grad_norm": 656.0, + "learning_rate": 7.8e-07, + "loss": 33.9052, + "mean_token_accuracy": 0.6638370925560594, + "num_tokens": 257189.0, + "step": 40 + }, + { + "entropy": 0.9725235970690846, + "epoch": 0.022255605630668224, + "grad_norm": 724.0, + "learning_rate": 9.800000000000001e-07, + "loss": 29.8492, + "mean_token_accuracy": 0.673487476631999, + "num_tokens": 324224.0, + "step": 50 + }, + { + "entropy": 1.0304172853007914, + "epoch": 0.026706726756801868, + "grad_norm": 488.0, + "learning_rate": 1.1800000000000001e-06, + "loss": 28.3082, + "mean_token_accuracy": 0.6809290068224072, + "num_tokens": 394654.0, + "step": 60 + }, + { + "entropy": 1.000378421880305, + "epoch": 0.031157847882935515, + "grad_norm": 420.0, + "learning_rate": 1.3800000000000001e-06, + "loss": 26.217, + "mean_token_accuracy": 0.6916999943554402, + "num_tokens": 461658.0, + "step": 70 + }, + { + "entropy": 1.0228427596390248, + "epoch": 0.03560896900906916, + "grad_norm": 258.0, + "learning_rate": 1.5800000000000001e-06, + "loss": 23.6386, + "mean_token_accuracy": 0.7020359886810184, + "num_tokens": 528285.0, + "step": 80 + }, + { + "entropy": 1.0894161500036716, + "epoch": 0.04006009013520281, + "grad_norm": 218.0, + "learning_rate": 1.7800000000000001e-06, + "loss": 23.2798, + "mean_token_accuracy": 0.6986296704038978, + "num_tokens": 595336.0, + "step": 90 + }, + { + "entropy": 1.0631676118820905, + "epoch": 0.04451121126133645, + "grad_norm": 246.0, + "learning_rate": 1.98e-06, + "loss": 21.4229, + "mean_token_accuracy": 0.7073360819369554, + "num_tokens": 663126.0, + "step": 100 + }, + { + "epoch": 0.04451121126133645, + "eval_biology_entropy": 1.48638627243042, + "eval_biology_loss": 1.9182748794555664, + "eval_biology_mean_token_accuracy": 0.6060935441255569, + "eval_biology_num_tokens": 663126.0, + "eval_biology_runtime": 49.1778, + "eval_biology_samples_per_second": 10.167, + "eval_biology_steps_per_second": 10.167, + "step": 100 + }, + { + "epoch": 0.04451121126133645, + "eval_chemistry_entropy": 1.0556317766308785, + "eval_chemistry_loss": 1.2898850440979004, + "eval_chemistry_mean_token_accuracy": 0.7124251598715782, + "eval_chemistry_num_tokens": 663126.0, + "eval_chemistry_runtime": 57.0461, + "eval_chemistry_samples_per_second": 8.765, + "eval_chemistry_steps_per_second": 8.765, + "step": 100 + }, + { + "epoch": 0.04451121126133645, + "eval_math_entropy": 0.8456157124042512, + "eval_math_loss": 1.1942517757415771, + "eval_math_mean_token_accuracy": 0.7334114202260971, + "eval_math_num_tokens": 663126.0, + "eval_math_runtime": 59.0482, + "eval_math_samples_per_second": 8.468, + "eval_math_steps_per_second": 8.468, + "step": 100 + }, + { + "epoch": 0.04451121126133645, + "eval_physics_entropy": 0.9410946148037911, + "eval_physics_loss": 1.1514631509780884, + "eval_physics_mean_token_accuracy": 0.7352130191326142, + "eval_physics_num_tokens": 663126.0, + "eval_physics_runtime": 65.895, + "eval_physics_samples_per_second": 7.588, + "eval_physics_steps_per_second": 7.588, + "step": 100 + }, + { + "entropy": 1.0484314041212202, + "epoch": 0.048962332387470095, + "grad_norm": 197.0, + "learning_rate": 2.1800000000000003e-06, + "loss": 19.9524, + "mean_token_accuracy": 0.7201540337875485, + "num_tokens": 731060.0, + "step": 110 + }, + { + "entropy": 1.012692185677588, + "epoch": 0.053413453513603736, + "grad_norm": 178.0, + "learning_rate": 2.38e-06, + "loss": 18.8663, + "mean_token_accuracy": 0.7315327662974596, + "num_tokens": 801635.0, + "step": 120 + }, + { + "entropy": 1.0016614170745015, + "epoch": 0.05786457463973738, + "grad_norm": 150.0, + "learning_rate": 2.5800000000000003e-06, + "loss": 19.0949, + "mean_token_accuracy": 0.7297663483768702, + "num_tokens": 867260.0, + "step": 130 + }, + { + "entropy": 1.007587356492877, + "epoch": 0.06231569576587103, + "grad_norm": 131.0, + "learning_rate": 2.7800000000000005e-06, + "loss": 18.9083, + "mean_token_accuracy": 0.7291399639099836, + "num_tokens": 931344.0, + "step": 140 + }, + { + "entropy": 1.0287242146208881, + "epoch": 0.06676681689200467, + "grad_norm": 179.0, + "learning_rate": 2.9800000000000003e-06, + "loss": 17.5655, + "mean_token_accuracy": 0.7393418021500111, + "num_tokens": 993586.0, + "step": 150 + }, + { + "entropy": 1.0563909145072103, + "epoch": 0.07121793801813832, + "grad_norm": 126.0, + "learning_rate": 3.1800000000000005e-06, + "loss": 16.9203, + "mean_token_accuracy": 0.7381794668734074, + "num_tokens": 1060812.0, + "step": 160 + }, + { + "entropy": 1.002509175427258, + "epoch": 0.07566905914427197, + "grad_norm": 127.0, + "learning_rate": 3.3800000000000007e-06, + "loss": 15.5704, + "mean_token_accuracy": 0.7520427949726581, + "num_tokens": 1131832.0, + "step": 170 + }, + { + "entropy": 0.9855028538033366, + "epoch": 0.08012018027040561, + "grad_norm": 101.5, + "learning_rate": 3.58e-06, + "loss": 15.2418, + "mean_token_accuracy": 0.7483469128608704, + "num_tokens": 1200650.0, + "step": 180 + }, + { + "entropy": 0.953622367605567, + "epoch": 0.08457130139653925, + "grad_norm": 117.0, + "learning_rate": 3.7800000000000002e-06, + "loss": 14.9841, + "mean_token_accuracy": 0.7532309770584107, + "num_tokens": 1265107.0, + "step": 190 + }, + { + "entropy": 0.9146013628691435, + "epoch": 0.0890224225226729, + "grad_norm": 123.0, + "learning_rate": 3.980000000000001e-06, + "loss": 14.4834, + "mean_token_accuracy": 0.7538760716095567, + "num_tokens": 1327380.0, + "step": 200 + }, + { + "epoch": 0.0890224225226729, + "eval_biology_entropy": 1.2566380687952041, + "eval_biology_loss": 1.4376745223999023, + "eval_biology_mean_token_accuracy": 0.6541494448781013, + "eval_biology_num_tokens": 1327380.0, + "eval_biology_runtime": 49.1875, + "eval_biology_samples_per_second": 10.165, + "eval_biology_steps_per_second": 10.165, + "step": 200 + }, + { + "epoch": 0.0890224225226729, + "eval_chemistry_entropy": 0.8981590569019318, + "eval_chemistry_loss": 0.9085330963134766, + "eval_chemistry_mean_token_accuracy": 0.7548674891591072, + "eval_chemistry_num_tokens": 1327380.0, + "eval_chemistry_runtime": 57.0026, + "eval_chemistry_samples_per_second": 8.772, + "eval_chemistry_steps_per_second": 8.772, + "step": 200 + }, + { + "epoch": 0.0890224225226729, + "eval_math_entropy": 0.7623042597174644, + "eval_math_loss": 0.8974745869636536, + "eval_math_mean_token_accuracy": 0.7647860664129257, + "eval_math_num_tokens": 1327380.0, + "eval_math_runtime": 58.8532, + "eval_math_samples_per_second": 8.496, + "eval_math_steps_per_second": 8.496, + "step": 200 + }, + { + "epoch": 0.0890224225226729, + "eval_physics_entropy": 0.8175762248635292, + "eval_physics_loss": 0.8456214666366577, + "eval_physics_mean_token_accuracy": 0.771473237156868, + "eval_physics_num_tokens": 1327380.0, + "eval_physics_runtime": 65.7934, + "eval_physics_samples_per_second": 7.6, + "eval_physics_steps_per_second": 7.6, + "step": 200 + }, + { + "entropy": 0.9297651303932071, + "epoch": 0.09347354364880654, + "grad_norm": 121.0, + "learning_rate": 4.18e-06, + "loss": 15.1938, + "mean_token_accuracy": 0.7461945816874505, + "num_tokens": 1393379.0, + "step": 210 + }, + { + "entropy": 0.8862789057195186, + "epoch": 0.09792466477494019, + "grad_norm": 98.0, + "learning_rate": 4.38e-06, + "loss": 14.1582, + "mean_token_accuracy": 0.759770842641592, + "num_tokens": 1460130.0, + "step": 220 + }, + { + "entropy": 0.8502754125744104, + "epoch": 0.10237578590107384, + "grad_norm": 93.5, + "learning_rate": 4.58e-06, + "loss": 13.6798, + "mean_token_accuracy": 0.7648626536130905, + "num_tokens": 1529183.0, + "step": 230 + }, + { + "entropy": 0.8967906672507524, + "epoch": 0.10682690702720747, + "grad_norm": 117.0, + "learning_rate": 4.78e-06, + "loss": 14.5446, + "mean_token_accuracy": 0.7548742420971394, + "num_tokens": 1597405.0, + "step": 240 + }, + { + "entropy": 0.8926160346716643, + "epoch": 0.11127802815334112, + "grad_norm": 97.0, + "learning_rate": 4.980000000000001e-06, + "loss": 14.4754, + "mean_token_accuracy": 0.7543840739876032, + "num_tokens": 1661767.0, + "step": 250 + }, + { + "entropy": 0.906158303655684, + "epoch": 0.11572914927947477, + "grad_norm": 81.5, + "learning_rate": 5.18e-06, + "loss": 14.5882, + "mean_token_accuracy": 0.7560893476009369, + "num_tokens": 1728207.0, + "step": 260 + }, + { + "entropy": 0.8557273065671325, + "epoch": 0.12018027040560841, + "grad_norm": 98.5, + "learning_rate": 5.380000000000001e-06, + "loss": 13.7256, + "mean_token_accuracy": 0.7653402671217918, + "num_tokens": 1796055.0, + "step": 270 + }, + { + "entropy": 0.824279030226171, + "epoch": 0.12463139153174206, + "grad_norm": 97.0, + "learning_rate": 5.580000000000001e-06, + "loss": 13.2294, + "mean_token_accuracy": 0.7715799886733293, + "num_tokens": 1859684.0, + "step": 280 + }, + { + "entropy": 0.885771127231419, + "epoch": 0.1290825126578757, + "grad_norm": 102.5, + "learning_rate": 5.78e-06, + "loss": 14.3879, + "mean_token_accuracy": 0.7568217847496271, + "num_tokens": 1921704.0, + "step": 290 + }, + { + "entropy": 0.868380057439208, + "epoch": 0.13353363378400934, + "grad_norm": 87.5, + "learning_rate": 5.98e-06, + "loss": 13.9363, + "mean_token_accuracy": 0.7626302808523178, + "num_tokens": 1985766.0, + "step": 300 + }, + { + "epoch": 0.13353363378400934, + "eval_biology_entropy": 1.2238210166096688, + "eval_biology_loss": 1.366097331047058, + "eval_biology_mean_token_accuracy": 0.6672504586577416, + "eval_biology_num_tokens": 1985766.0, + "eval_biology_runtime": 49.1235, + "eval_biology_samples_per_second": 10.178, + "eval_biology_steps_per_second": 10.178, + "step": 300 + }, + { + "epoch": 0.13353363378400934, + "eval_chemistry_entropy": 0.8314956972002983, + "eval_chemistry_loss": 0.8379236459732056, + "eval_chemistry_mean_token_accuracy": 0.7699049721360207, + "eval_chemistry_num_tokens": 1985766.0, + "eval_chemistry_runtime": 57.0049, + "eval_chemistry_samples_per_second": 8.771, + "eval_chemistry_steps_per_second": 8.771, + "step": 300 + }, + { + "epoch": 0.13353363378400934, + "eval_math_entropy": 0.714144405901432, + "eval_math_loss": 0.874640703201294, + "eval_math_mean_token_accuracy": 0.7714004491567612, + "eval_math_num_tokens": 1985766.0, + "eval_math_runtime": 58.9383, + "eval_math_samples_per_second": 8.483, + "eval_math_steps_per_second": 8.483, + "step": 300 + }, + { + "epoch": 0.13353363378400934, + "eval_physics_entropy": 0.7712625939249992, + "eval_physics_loss": 0.787721574306488, + "eval_physics_mean_token_accuracy": 0.7829854927659035, + "eval_physics_num_tokens": 1985766.0, + "eval_physics_runtime": 65.8905, + "eval_physics_samples_per_second": 7.588, + "eval_physics_steps_per_second": 7.588, + "step": 300 + }, + { + "entropy": 0.8396226148121059, + "epoch": 0.137984754910143, + "grad_norm": 96.5, + "learning_rate": 6.18e-06, + "loss": 13.612, + "mean_token_accuracy": 0.7638351432979107, + "num_tokens": 2049041.0, + "step": 310 + }, + { + "entropy": 0.8020199347287417, + "epoch": 0.14243587603627664, + "grad_norm": 81.0, + "learning_rate": 6.380000000000001e-06, + "loss": 12.8945, + "mean_token_accuracy": 0.776324712112546, + "num_tokens": 2118068.0, + "step": 320 + }, + { + "entropy": 0.8072531137615442, + "epoch": 0.14688699716241027, + "grad_norm": 107.5, + "learning_rate": 6.5800000000000005e-06, + "loss": 12.776, + "mean_token_accuracy": 0.7789770424365997, + "num_tokens": 2186387.0, + "step": 330 + }, + { + "entropy": 0.8215415397658944, + "epoch": 0.15133811828854393, + "grad_norm": 84.5, + "learning_rate": 6.780000000000001e-06, + "loss": 13.3957, + "mean_token_accuracy": 0.7686271335929632, + "num_tokens": 2252650.0, + "step": 340 + }, + { + "entropy": 0.7919685224071145, + "epoch": 0.15578923941467757, + "grad_norm": 87.0, + "learning_rate": 6.98e-06, + "loss": 12.7862, + "mean_token_accuracy": 0.7796416375786066, + "num_tokens": 2320358.0, + "step": 350 + }, + { + "entropy": 0.7888439664617181, + "epoch": 0.16024036054081123, + "grad_norm": 90.5, + "learning_rate": 7.180000000000001e-06, + "loss": 12.6082, + "mean_token_accuracy": 0.7811494890600443, + "num_tokens": 2388824.0, + "step": 360 + }, + { + "entropy": 0.7693003987893462, + "epoch": 0.16469148166694486, + "grad_norm": 82.0, + "learning_rate": 7.3800000000000005e-06, + "loss": 12.3808, + "mean_token_accuracy": 0.7856796395033598, + "num_tokens": 2456144.0, + "step": 370 + }, + { + "entropy": 0.8541094154119492, + "epoch": 0.1691426027930785, + "grad_norm": 80.0, + "learning_rate": 7.58e-06, + "loss": 13.8583, + "mean_token_accuracy": 0.7654584795236588, + "num_tokens": 2515325.0, + "step": 380 + }, + { + "entropy": 0.7904249761253596, + "epoch": 0.17359372391921216, + "grad_norm": 68.5, + "learning_rate": 7.78e-06, + "loss": 12.659, + "mean_token_accuracy": 0.7788146499544382, + "num_tokens": 2580490.0, + "step": 390 + }, + { + "entropy": 0.7941560545936227, + "epoch": 0.1780448450453458, + "grad_norm": 72.5, + "learning_rate": 7.980000000000002e-06, + "loss": 12.8107, + "mean_token_accuracy": 0.7767765510827302, + "num_tokens": 2644330.0, + "step": 400 + }, + { + "epoch": 0.1780448450453458, + "eval_biology_entropy": 1.2128318169116974, + "eval_biology_loss": 1.3349602222442627, + "eval_biology_mean_token_accuracy": 0.6734810082316398, + "eval_biology_num_tokens": 2644330.0, + "eval_biology_runtime": 49.0571, + "eval_biology_samples_per_second": 10.192, + "eval_biology_steps_per_second": 10.192, + "step": 400 + }, + { + "epoch": 0.1780448450453458, + "eval_chemistry_entropy": 0.7846001248955726, + "eval_chemistry_loss": 0.798675537109375, + "eval_chemistry_mean_token_accuracy": 0.7786763918995857, + "eval_chemistry_num_tokens": 2644330.0, + "eval_chemistry_runtime": 56.9232, + "eval_chemistry_samples_per_second": 8.784, + "eval_chemistry_steps_per_second": 8.784, + "step": 400 + }, + { + "epoch": 0.1780448450453458, + "eval_math_entropy": 0.6919120717644691, + "eval_math_loss": 0.8597798943519592, + "eval_math_mean_token_accuracy": 0.7751281379461289, + "eval_math_num_tokens": 2644330.0, + "eval_math_runtime": 58.7801, + "eval_math_samples_per_second": 8.506, + "eval_math_steps_per_second": 8.506, + "step": 400 + }, + { + "epoch": 0.1780448450453458, + "eval_physics_entropy": 0.7369731608629226, + "eval_physics_loss": 0.7621525526046753, + "eval_physics_mean_token_accuracy": 0.7884504457712174, + "eval_physics_num_tokens": 2644330.0, + "eval_physics_runtime": 65.7104, + "eval_physics_samples_per_second": 7.609, + "eval_physics_steps_per_second": 7.609, + "step": 400 + }, + { + "entropy": 0.7377481091767549, + "epoch": 0.18249596617147945, + "grad_norm": 73.0, + "learning_rate": 8.18e-06, + "loss": 11.7926, + "mean_token_accuracy": 0.7937501568347216, + "num_tokens": 2713234.0, + "step": 410 + }, + { + "entropy": 0.7830979386344552, + "epoch": 0.18694708729761308, + "grad_norm": 76.5, + "learning_rate": 8.380000000000001e-06, + "loss": 12.9301, + "mean_token_accuracy": 0.7784280318766832, + "num_tokens": 2783261.0, + "step": 420 + }, + { + "entropy": 0.7945666501298547, + "epoch": 0.19139820842374672, + "grad_norm": 61.5, + "learning_rate": 8.580000000000001e-06, + "loss": 12.6515, + "mean_token_accuracy": 0.7819751728326082, + "num_tokens": 2850170.0, + "step": 430 + }, + { + "entropy": 0.7770357062108815, + "epoch": 0.19584932954988038, + "grad_norm": 75.0, + "learning_rate": 8.78e-06, + "loss": 12.5072, + "mean_token_accuracy": 0.7829508323222398, + "num_tokens": 2913700.0, + "step": 440 + }, + { + "entropy": 0.7575128111988306, + "epoch": 0.200300450676014, + "grad_norm": 81.0, + "learning_rate": 8.98e-06, + "loss": 12.4714, + "mean_token_accuracy": 0.7839657858014106, + "num_tokens": 2981212.0, + "step": 450 + }, + { + "entropy": 0.7656491708010436, + "epoch": 0.20475157180214767, + "grad_norm": 85.0, + "learning_rate": 9.180000000000002e-06, + "loss": 12.0974, + "mean_token_accuracy": 0.7915447618812322, + "num_tokens": 3048973.0, + "step": 460 + }, + { + "entropy": 0.7621381992474199, + "epoch": 0.2092026929282813, + "grad_norm": 61.75, + "learning_rate": 9.38e-06, + "loss": 12.5808, + "mean_token_accuracy": 0.7818816823884844, + "num_tokens": 3117033.0, + "step": 470 + }, + { + "entropy": 0.7816813634708524, + "epoch": 0.21365381405441494, + "grad_norm": 73.5, + "learning_rate": 9.58e-06, + "loss": 12.3819, + "mean_token_accuracy": 0.7846902538090944, + "num_tokens": 3185255.0, + "step": 480 + }, + { + "entropy": 0.7647983279079199, + "epoch": 0.2181049351805486, + "grad_norm": 76.5, + "learning_rate": 9.780000000000001e-06, + "loss": 12.4249, + "mean_token_accuracy": 0.7847896091639995, + "num_tokens": 3253121.0, + "step": 490 + }, + { + "entropy": 0.7214844634756445, + "epoch": 0.22255605630668224, + "grad_norm": 75.5, + "learning_rate": 9.980000000000001e-06, + "loss": 11.4214, + "mean_token_accuracy": 0.7984224483370781, + "num_tokens": 3322823.0, + "step": 500 + }, + { + "epoch": 0.22255605630668224, + "eval_biology_entropy": 1.1607659912109376, + "eval_biology_loss": 1.328081488609314, + "eval_biology_mean_token_accuracy": 0.6758214622139931, + "eval_biology_num_tokens": 3322823.0, + "eval_biology_runtime": 49.1342, + "eval_biology_samples_per_second": 10.176, + "eval_biology_steps_per_second": 10.176, + "step": 500 + }, + { + "epoch": 0.22255605630668224, + "eval_chemistry_entropy": 0.7407792928814888, + "eval_chemistry_loss": 0.775718629360199, + "eval_chemistry_mean_token_accuracy": 0.7834744991064072, + "eval_chemistry_num_tokens": 3322823.0, + "eval_chemistry_runtime": 57.1618, + "eval_chemistry_samples_per_second": 8.747, + "eval_chemistry_steps_per_second": 8.747, + "step": 500 + }, + { + "epoch": 0.22255605630668224, + "eval_math_entropy": 0.6776470097601414, + "eval_math_loss": 0.8585710525512695, + "eval_math_mean_token_accuracy": 0.7761784045696258, + "eval_math_num_tokens": 3322823.0, + "eval_math_runtime": 58.8725, + "eval_math_samples_per_second": 8.493, + "eval_math_steps_per_second": 8.493, + "step": 500 + }, + { + "epoch": 0.22255605630668224, + "eval_physics_entropy": 0.7148050211071968, + "eval_physics_loss": 0.7503296732902527, + "eval_physics_mean_token_accuracy": 0.7919680835604668, + "eval_physics_num_tokens": 3322823.0, + "eval_physics_runtime": 65.8508, + "eval_physics_samples_per_second": 7.593, + "eval_physics_steps_per_second": 7.593, + "step": 500 + }, + { + "entropy": 0.7285958690568805, + "epoch": 0.2270071774328159, + "grad_norm": 72.5, + "learning_rate": 1.018e-05, + "loss": 11.8973, + "mean_token_accuracy": 0.7914360746741295, + "num_tokens": 3393393.0, + "step": 510 + }, + { + "entropy": 0.8255035754293203, + "epoch": 0.23145829855894953, + "grad_norm": 80.0, + "learning_rate": 1.038e-05, + "loss": 13.0946, + "mean_token_accuracy": 0.774183202162385, + "num_tokens": 3454750.0, + "step": 520 + }, + { + "entropy": 0.708361669909209, + "epoch": 0.2359094196850832, + "grad_norm": 71.0, + "learning_rate": 1.0580000000000002e-05, + "loss": 11.7446, + "mean_token_accuracy": 0.7961884457617998, + "num_tokens": 3526914.0, + "step": 530 + }, + { + "entropy": 0.8215967276133597, + "epoch": 0.24036054081121683, + "grad_norm": 79.0, + "learning_rate": 1.0780000000000002e-05, + "loss": 13.0207, + "mean_token_accuracy": 0.7765944451093674, + "num_tokens": 3589449.0, + "step": 540 + }, + { + "entropy": 0.7564663768745958, + "epoch": 0.24481166193735046, + "grad_norm": 82.0, + "learning_rate": 1.0980000000000002e-05, + "loss": 12.2873, + "mean_token_accuracy": 0.7854228101670742, + "num_tokens": 3655092.0, + "step": 550 + }, + { + "entropy": 0.7720718337222934, + "epoch": 0.24926278306348412, + "grad_norm": 90.5, + "learning_rate": 1.1180000000000001e-05, + "loss": 12.4637, + "mean_token_accuracy": 0.7846879370510578, + "num_tokens": 3719113.0, + "step": 560 + }, + { + "entropy": 0.7583298075944185, + "epoch": 0.25371390418961776, + "grad_norm": 66.5, + "learning_rate": 1.138e-05, + "loss": 12.4348, + "mean_token_accuracy": 0.781843975931406, + "num_tokens": 3785282.0, + "step": 570 + }, + { + "entropy": 0.7643377220258116, + "epoch": 0.2581650253157514, + "grad_norm": 63.75, + "learning_rate": 1.1580000000000001e-05, + "loss": 12.4092, + "mean_token_accuracy": 0.7843703482300043, + "num_tokens": 3852276.0, + "step": 580 + }, + { + "entropy": 0.7012410337105394, + "epoch": 0.262616146441885, + "grad_norm": 67.0, + "learning_rate": 1.178e-05, + "loss": 11.3623, + "mean_token_accuracy": 0.7991111058741808, + "num_tokens": 3925649.0, + "step": 590 + }, + { + "entropy": 0.7401338685303926, + "epoch": 0.2670672675680187, + "grad_norm": 71.5, + "learning_rate": 1.198e-05, + "loss": 11.9064, + "mean_token_accuracy": 0.7897865924984216, + "num_tokens": 3990505.0, + "step": 600 + }, + { + "epoch": 0.2670672675680187, + "eval_biology_entropy": 1.216272303700447, + "eval_biology_loss": 1.3142751455307007, + "eval_biology_mean_token_accuracy": 0.6766086940765381, + "eval_biology_num_tokens": 3990505.0, + "eval_biology_runtime": 49.3541, + "eval_biology_samples_per_second": 10.131, + "eval_biology_steps_per_second": 10.131, + "step": 600 + }, + { + "epoch": 0.2670672675680187, + "eval_chemistry_entropy": 0.7538464320898056, + "eval_chemistry_loss": 0.7621961832046509, + "eval_chemistry_mean_token_accuracy": 0.7865861940383911, + "eval_chemistry_num_tokens": 3990505.0, + "eval_chemistry_runtime": 57.3005, + "eval_chemistry_samples_per_second": 8.726, + "eval_chemistry_steps_per_second": 8.726, + "step": 600 + }, + { + "epoch": 0.2670672675680187, + "eval_math_entropy": 0.6806064413189888, + "eval_math_loss": 0.8638136982917786, + "eval_math_mean_token_accuracy": 0.7749927558898926, + "eval_math_num_tokens": 3990505.0, + "eval_math_runtime": 59.1764, + "eval_math_samples_per_second": 8.449, + "eval_math_steps_per_second": 8.449, + "step": 600 + }, + { + "epoch": 0.2670672675680187, + "eval_physics_entropy": 0.7255966661572456, + "eval_physics_loss": 0.7440791726112366, + "eval_physics_mean_token_accuracy": 0.7921827912330628, + "eval_physics_num_tokens": 3990505.0, + "eval_physics_runtime": 66.0704, + "eval_physics_samples_per_second": 7.568, + "eval_physics_steps_per_second": 7.568, + "step": 600 + }, + { + "entropy": 0.738010767288506, + "epoch": 0.27151838869415235, + "grad_norm": 64.0, + "learning_rate": 1.218e-05, + "loss": 11.9409, + "mean_token_accuracy": 0.7896174006164074, + "num_tokens": 4060657.0, + "step": 610 + }, + { + "entropy": 0.7203703055158257, + "epoch": 0.275969509820286, + "grad_norm": 67.0, + "learning_rate": 1.2380000000000002e-05, + "loss": 11.5655, + "mean_token_accuracy": 0.7945824809372425, + "num_tokens": 4133119.0, + "step": 620 + }, + { + "entropy": 0.743073652498424, + "epoch": 0.2804206309464196, + "grad_norm": 69.5, + "learning_rate": 1.2580000000000002e-05, + "loss": 12.0073, + "mean_token_accuracy": 0.7911369562149048, + "num_tokens": 4200051.0, + "step": 630 + }, + { + "entropy": 0.7967112486250698, + "epoch": 0.2848717520725533, + "grad_norm": 72.5, + "learning_rate": 1.2780000000000001e-05, + "loss": 12.7974, + "mean_token_accuracy": 0.7836818125098943, + "num_tokens": 4267403.0, + "step": 640 + }, + { + "entropy": 0.7334000766277313, + "epoch": 0.28932287319868694, + "grad_norm": 71.5, + "learning_rate": 1.2980000000000001e-05, + "loss": 11.8952, + "mean_token_accuracy": 0.7923904918134212, + "num_tokens": 4333268.0, + "step": 650 + }, + { + "entropy": 0.7248436002992094, + "epoch": 0.29377399432482054, + "grad_norm": 79.5, + "learning_rate": 1.3180000000000001e-05, + "loss": 11.7473, + "mean_token_accuracy": 0.7948706120252609, + "num_tokens": 4400242.0, + "step": 660 + }, + { + "entropy": 0.7555086806416511, + "epoch": 0.2982251154509542, + "grad_norm": 77.0, + "learning_rate": 1.3380000000000002e-05, + "loss": 12.0493, + "mean_token_accuracy": 0.7891406118869781, + "num_tokens": 4464714.0, + "step": 670 + }, + { + "entropy": 0.7099162889644504, + "epoch": 0.30267623657708786, + "grad_norm": 62.75, + "learning_rate": 1.3580000000000002e-05, + "loss": 11.5969, + "mean_token_accuracy": 0.7962878335267305, + "num_tokens": 4535458.0, + "step": 680 + }, + { + "entropy": 0.7450342487543822, + "epoch": 0.3071273577032215, + "grad_norm": 66.5, + "learning_rate": 1.378e-05, + "loss": 11.9777, + "mean_token_accuracy": 0.7921688258647919, + "num_tokens": 4599099.0, + "step": 690 + }, + { + "entropy": 0.7137214630842209, + "epoch": 0.31157847882935513, + "grad_norm": 56.5, + "learning_rate": 1.398e-05, + "loss": 11.5022, + "mean_token_accuracy": 0.7959922354668378, + "num_tokens": 4662903.0, + "step": 700 + }, + { + "epoch": 0.31157847882935513, + "eval_biology_entropy": 1.21318653935194, + "eval_biology_loss": 1.3126111030578613, + "eval_biology_mean_token_accuracy": 0.6781674832701683, + "eval_biology_num_tokens": 4662903.0, + "eval_biology_runtime": 49.2021, + "eval_biology_samples_per_second": 10.162, + "eval_biology_steps_per_second": 10.162, + "step": 700 + }, + { + "epoch": 0.31157847882935513, + "eval_chemistry_entropy": 0.7448744902908802, + "eval_chemistry_loss": 0.7531416416168213, + "eval_chemistry_mean_token_accuracy": 0.788379067659378, + "eval_chemistry_num_tokens": 4662903.0, + "eval_chemistry_runtime": 57.0266, + "eval_chemistry_samples_per_second": 8.768, + "eval_chemistry_steps_per_second": 8.768, + "step": 700 + }, + { + "epoch": 0.31157847882935513, + "eval_math_entropy": 0.6775659238696098, + "eval_math_loss": 0.8588793277740479, + "eval_math_mean_token_accuracy": 0.7769290524721145, + "eval_math_num_tokens": 4662903.0, + "eval_math_runtime": 58.9903, + "eval_math_samples_per_second": 8.476, + "eval_math_steps_per_second": 8.476, + "step": 700 + }, + { + "epoch": 0.31157847882935513, + "eval_physics_entropy": 0.7208585093617439, + "eval_physics_loss": 0.7438266277313232, + "eval_physics_mean_token_accuracy": 0.792436719596386, + "eval_physics_num_tokens": 4662903.0, + "eval_physics_runtime": 65.9428, + "eval_physics_samples_per_second": 7.582, + "eval_physics_steps_per_second": 7.582, + "step": 700 + }, + { + "entropy": 0.7532984343357384, + "epoch": 0.3160295999554888, + "grad_norm": 82.5, + "learning_rate": 1.418e-05, + "loss": 12.3646, + "mean_token_accuracy": 0.7847790695726872, + "num_tokens": 4729166.0, + "step": 710 + }, + { + "entropy": 0.7946592880412936, + "epoch": 0.32048072108162245, + "grad_norm": 79.0, + "learning_rate": 1.4380000000000001e-05, + "loss": 12.7491, + "mean_token_accuracy": 0.7800281438976526, + "num_tokens": 4796815.0, + "step": 720 + }, + { + "entropy": 0.6913739880546927, + "epoch": 0.32493184220775606, + "grad_norm": 68.0, + "learning_rate": 1.4580000000000001e-05, + "loss": 11.0843, + "mean_token_accuracy": 0.803905238583684, + "num_tokens": 4865172.0, + "step": 730 + }, + { + "entropy": 0.7447662945836783, + "epoch": 0.3293829633338897, + "grad_norm": 65.5, + "learning_rate": 1.478e-05, + "loss": 12.1577, + "mean_token_accuracy": 0.7854402553290128, + "num_tokens": 4932671.0, + "step": 740 + }, + { + "entropy": 0.7749863206408918, + "epoch": 0.3338340844600234, + "grad_norm": 64.0, + "learning_rate": 1.498e-05, + "loss": 12.4497, + "mean_token_accuracy": 0.7847358033061027, + "num_tokens": 4998710.0, + "step": 750 + }, + { + "entropy": 0.7126937594264746, + "epoch": 0.338285205586157, + "grad_norm": 65.5, + "learning_rate": 1.5180000000000002e-05, + "loss": 11.549, + "mean_token_accuracy": 0.7943205136805773, + "num_tokens": 5066948.0, + "step": 760 + }, + { + "entropy": 0.7463799838908016, + "epoch": 0.34273632671229065, + "grad_norm": 66.5, + "learning_rate": 1.5380000000000002e-05, + "loss": 12.052, + "mean_token_accuracy": 0.7893575474619865, + "num_tokens": 5129950.0, + "step": 770 + }, + { + "entropy": 0.7669129572808743, + "epoch": 0.3471874478384243, + "grad_norm": 64.5, + "learning_rate": 1.5580000000000003e-05, + "loss": 12.4587, + "mean_token_accuracy": 0.7811328556388617, + "num_tokens": 5193519.0, + "step": 780 + }, + { + "entropy": 0.7404427804984153, + "epoch": 0.351638568964558, + "grad_norm": 87.0, + "learning_rate": 1.578e-05, + "loss": 11.9383, + "mean_token_accuracy": 0.7870475023984909, + "num_tokens": 5256503.0, + "step": 790 + }, + { + "entropy": 0.8177574999630451, + "epoch": 0.3560896900906916, + "grad_norm": 69.0, + "learning_rate": 1.5980000000000003e-05, + "loss": 13.1167, + "mean_token_accuracy": 0.7771423246711493, + "num_tokens": 5324751.0, + "step": 800 + }, + { + "epoch": 0.3560896900906916, + "eval_biology_entropy": 1.2992450638413429, + "eval_biology_loss": 1.3091635704040527, + "eval_biology_mean_token_accuracy": 0.6765963944792748, + "eval_biology_num_tokens": 5324751.0, + "eval_biology_runtime": 49.4045, + "eval_biology_samples_per_second": 10.121, + "eval_biology_steps_per_second": 10.121, + "step": 800 + }, + { + "epoch": 0.3560896900906916, + "eval_chemistry_entropy": 0.7858416083455085, + "eval_chemistry_loss": 0.755605936050415, + "eval_chemistry_mean_token_accuracy": 0.7878280827403068, + "eval_chemistry_num_tokens": 5324751.0, + "eval_chemistry_runtime": 57.2096, + "eval_chemistry_samples_per_second": 8.74, + "eval_chemistry_steps_per_second": 8.74, + "step": 800 + }, + { + "epoch": 0.3560896900906916, + "eval_math_entropy": 0.7085552672445774, + "eval_math_loss": 0.8628776669502258, + "eval_math_mean_token_accuracy": 0.7751141096353531, + "eval_math_num_tokens": 5324751.0, + "eval_math_runtime": 58.9925, + "eval_math_samples_per_second": 8.476, + "eval_math_steps_per_second": 8.476, + "step": 800 + }, + { + "epoch": 0.3560896900906916, + "eval_physics_entropy": 0.7641420513391495, + "eval_physics_loss": 0.7484508156776428, + "eval_physics_mean_token_accuracy": 0.7914790090918541, + "eval_physics_num_tokens": 5324751.0, + "eval_physics_runtime": 65.8581, + "eval_physics_samples_per_second": 7.592, + "eval_physics_steps_per_second": 7.592, + "step": 800 + }, + { + "entropy": 0.6714803985320031, + "epoch": 0.36054081121682524, + "grad_norm": 65.5, + "learning_rate": 1.618e-05, + "loss": 11.0959, + "mean_token_accuracy": 0.8037038501352072, + "num_tokens": 5391664.0, + "step": 810 + }, + { + "entropy": 0.7128949659876526, + "epoch": 0.3649919323429589, + "grad_norm": 71.5, + "learning_rate": 1.638e-05, + "loss": 11.223, + "mean_token_accuracy": 0.7998184654861689, + "num_tokens": 5458519.0, + "step": 820 + }, + { + "entropy": 0.7470979234203696, + "epoch": 0.3694430534690925, + "grad_norm": 64.5, + "learning_rate": 1.658e-05, + "loss": 12.1501, + "mean_token_accuracy": 0.7894150543957948, + "num_tokens": 5523391.0, + "step": 830 + }, + { + "entropy": 0.7144336312077939, + "epoch": 0.37389417459522617, + "grad_norm": 61.25, + "learning_rate": 1.6780000000000002e-05, + "loss": 11.736, + "mean_token_accuracy": 0.7935146864503622, + "num_tokens": 5585508.0, + "step": 840 + }, + { + "entropy": 0.7430889600887894, + "epoch": 0.37834529572135983, + "grad_norm": 72.0, + "learning_rate": 1.698e-05, + "loss": 11.8763, + "mean_token_accuracy": 0.7954710725694895, + "num_tokens": 5650755.0, + "step": 850 + }, + { + "entropy": 0.7431833678856492, + "epoch": 0.38279641684749344, + "grad_norm": 62.25, + "learning_rate": 1.718e-05, + "loss": 11.9703, + "mean_token_accuracy": 0.7893174722790718, + "num_tokens": 5716321.0, + "step": 860 + }, + { + "entropy": 0.7719974908977747, + "epoch": 0.3872475379736271, + "grad_norm": 66.5, + "learning_rate": 1.7380000000000003e-05, + "loss": 12.4773, + "mean_token_accuracy": 0.7817879624664783, + "num_tokens": 5782698.0, + "step": 870 + }, + { + "entropy": 0.7235654108226299, + "epoch": 0.39169865909976076, + "grad_norm": 53.75, + "learning_rate": 1.758e-05, + "loss": 11.6881, + "mean_token_accuracy": 0.7943595319986343, + "num_tokens": 5848889.0, + "step": 880 + }, + { + "entropy": 0.7637507017701864, + "epoch": 0.3961497802258944, + "grad_norm": 64.0, + "learning_rate": 1.7780000000000003e-05, + "loss": 12.5431, + "mean_token_accuracy": 0.7863847129046917, + "num_tokens": 5912919.0, + "step": 890 + }, + { + "entropy": 0.742395118996501, + "epoch": 0.400600901352028, + "grad_norm": 57.5, + "learning_rate": 1.798e-05, + "loss": 11.9098, + "mean_token_accuracy": 0.79401213824749, + "num_tokens": 5980229.0, + "step": 900 + }, + { + "epoch": 0.400600901352028, + "eval_biology_entropy": 1.2397134986519813, + "eval_biology_loss": 1.3142763376235962, + "eval_biology_mean_token_accuracy": 0.6771629434227944, + "eval_biology_num_tokens": 5980229.0, + "eval_biology_runtime": 49.4582, + "eval_biology_samples_per_second": 10.11, + "eval_biology_steps_per_second": 10.11, + "step": 900 + }, + { + "epoch": 0.400600901352028, + "eval_chemistry_entropy": 0.748387647986412, + "eval_chemistry_loss": 0.7568163275718689, + "eval_chemistry_mean_token_accuracy": 0.7869879400134087, + "eval_chemistry_num_tokens": 5980229.0, + "eval_chemistry_runtime": 57.0636, + "eval_chemistry_samples_per_second": 8.762, + "eval_chemistry_steps_per_second": 8.762, + "step": 900 + }, + { + "epoch": 0.400600901352028, + "eval_math_entropy": 0.6898269492387772, + "eval_math_loss": 0.866584062576294, + "eval_math_mean_token_accuracy": 0.7752813557386399, + "eval_math_num_tokens": 5980229.0, + "eval_math_runtime": 59.1998, + "eval_math_samples_per_second": 8.446, + "eval_math_steps_per_second": 8.446, + "step": 900 + }, + { + "epoch": 0.400600901352028, + "eval_physics_entropy": 0.7360889357328415, + "eval_physics_loss": 0.7502807378768921, + "eval_physics_mean_token_accuracy": 0.7913857514262199, + "eval_physics_num_tokens": 5980229.0, + "eval_physics_runtime": 66.2752, + "eval_physics_samples_per_second": 7.544, + "eval_physics_steps_per_second": 7.544, + "step": 900 + }, + { + "entropy": 0.6976769424974918, + "epoch": 0.4050520224781617, + "grad_norm": 57.75, + "learning_rate": 1.8180000000000002e-05, + "loss": 11.373, + "mean_token_accuracy": 0.7986949652433395, + "num_tokens": 6048508.0, + "step": 910 + }, + { + "entropy": 0.7293819307349623, + "epoch": 0.40950314360429535, + "grad_norm": 59.75, + "learning_rate": 1.8380000000000004e-05, + "loss": 11.7158, + "mean_token_accuracy": 0.7940746705979109, + "num_tokens": 6114855.0, + "step": 920 + }, + { + "entropy": 0.7529061837121844, + "epoch": 0.41395426473042896, + "grad_norm": 58.5, + "learning_rate": 1.858e-05, + "loss": 12.1233, + "mean_token_accuracy": 0.7879564348608256, + "num_tokens": 6179050.0, + "step": 930 + }, + { + "entropy": 0.7420403499156236, + "epoch": 0.4184053858565626, + "grad_norm": 60.25, + "learning_rate": 1.878e-05, + "loss": 12.11, + "mean_token_accuracy": 0.7902424175292253, + "num_tokens": 6244987.0, + "step": 940 + }, + { + "entropy": 0.7153761054389178, + "epoch": 0.4228565069826963, + "grad_norm": 65.0, + "learning_rate": 1.898e-05, + "loss": 11.5745, + "mean_token_accuracy": 0.7952297408133745, + "num_tokens": 6310799.0, + "step": 950 + }, + { + "entropy": 0.7087640034034848, + "epoch": 0.4273076281088299, + "grad_norm": 62.5, + "learning_rate": 1.918e-05, + "loss": 11.5358, + "mean_token_accuracy": 0.7965508222579956, + "num_tokens": 6378152.0, + "step": 960 + }, + { + "entropy": 0.7380615974776447, + "epoch": 0.43175874923496355, + "grad_norm": 52.25, + "learning_rate": 1.938e-05, + "loss": 11.7537, + "mean_token_accuracy": 0.7940621051937342, + "num_tokens": 6442491.0, + "step": 970 + }, + { + "entropy": 0.7700119714252651, + "epoch": 0.4362098703610972, + "grad_norm": 55.75, + "learning_rate": 1.9580000000000002e-05, + "loss": 12.5586, + "mean_token_accuracy": 0.7843215424567461, + "num_tokens": 6510231.0, + "step": 980 + }, + { + "entropy": 0.7417776238173246, + "epoch": 0.44066099148723087, + "grad_norm": 57.25, + "learning_rate": 1.978e-05, + "loss": 12.0337, + "mean_token_accuracy": 0.7908143695443869, + "num_tokens": 6572858.0, + "step": 990 + }, + { + "entropy": 0.7601414297707378, + "epoch": 0.4451121126133645, + "grad_norm": 49.75, + "learning_rate": 1.9980000000000002e-05, + "loss": 12.3686, + "mean_token_accuracy": 0.7845167253166437, + "num_tokens": 6637273.0, + "step": 1000 + }, + { + "epoch": 0.4451121126133645, + "eval_biology_entropy": 1.2479438411593438, + "eval_biology_loss": 1.3289152383804321, + "eval_biology_mean_token_accuracy": 0.67418065315485, + "eval_biology_num_tokens": 6637273.0, + "eval_biology_runtime": 49.5469, + "eval_biology_samples_per_second": 10.091, + "eval_biology_steps_per_second": 10.091, + "step": 1000 + }, + { + "epoch": 0.4451121126133645, + "eval_chemistry_entropy": 0.7556402538418769, + "eval_chemistry_loss": 0.7549638748168945, + "eval_chemistry_mean_token_accuracy": 0.7876601304411888, + "eval_chemistry_num_tokens": 6637273.0, + "eval_chemistry_runtime": 57.6175, + "eval_chemistry_samples_per_second": 8.678, + "eval_chemistry_steps_per_second": 8.678, + "step": 1000 + }, + { + "epoch": 0.4451121126133645, + "eval_math_entropy": 0.7018868820667267, + "eval_math_loss": 0.8697984218597412, + "eval_math_mean_token_accuracy": 0.7734663159847259, + "eval_math_num_tokens": 6637273.0, + "eval_math_runtime": 59.4977, + "eval_math_samples_per_second": 8.404, + "eval_math_steps_per_second": 8.404, + "step": 1000 + }, + { + "epoch": 0.4451121126133645, + "eval_physics_entropy": 0.7586731649637223, + "eval_physics_loss": 0.7596898078918457, + "eval_physics_mean_token_accuracy": 0.7887503932714462, + "eval_physics_num_tokens": 6637273.0, + "eval_physics_runtime": 66.4638, + "eval_physics_samples_per_second": 7.523, + "eval_physics_steps_per_second": 7.523, + "step": 1000 + }, + { + "entropy": 0.7277510199695826, + "epoch": 0.44956323373949814, + "grad_norm": 60.75, + "learning_rate": 1.9980000000000002e-05, + "loss": 11.7227, + "mean_token_accuracy": 0.7909994043409825, + "num_tokens": 6700799.0, + "step": 1010 + }, + { + "entropy": 0.7427934074774385, + "epoch": 0.4540143548656318, + "grad_norm": 58.25, + "learning_rate": 1.995777777777778e-05, + "loss": 12.1597, + "mean_token_accuracy": 0.7892370050773024, + "num_tokens": 6765798.0, + "step": 1020 + }, + { + "entropy": 0.6983263311907649, + "epoch": 0.4584654759917654, + "grad_norm": 57.25, + "learning_rate": 1.9935555555555557e-05, + "loss": 11.3167, + "mean_token_accuracy": 0.7985999789088964, + "num_tokens": 6832774.0, + "step": 1030 + }, + { + "entropy": 0.7731405256316066, + "epoch": 0.46291659711789906, + "grad_norm": 57.5, + "learning_rate": 1.9913333333333335e-05, + "loss": 12.4454, + "mean_token_accuracy": 0.7837167579680682, + "num_tokens": 6896684.0, + "step": 1040 + }, + { + "entropy": 0.7266025519929826, + "epoch": 0.4673677182440327, + "grad_norm": 58.0, + "learning_rate": 1.9891111111111112e-05, + "loss": 11.7153, + "mean_token_accuracy": 0.7996914375573396, + "num_tokens": 6964423.0, + "step": 1050 + }, + { + "entropy": 0.7238340148702264, + "epoch": 0.4718188393701664, + "grad_norm": 53.75, + "learning_rate": 1.986888888888889e-05, + "loss": 11.8798, + "mean_token_accuracy": 0.7931064553558826, + "num_tokens": 7033417.0, + "step": 1060 + }, + { + "entropy": 0.7728124929592013, + "epoch": 0.4762699604963, + "grad_norm": 55.75, + "learning_rate": 1.9846666666666668e-05, + "loss": 12.3099, + "mean_token_accuracy": 0.7855284217745065, + "num_tokens": 7101274.0, + "step": 1070 + }, + { + "entropy": 0.7387059103697539, + "epoch": 0.48072108162243365, + "grad_norm": 50.0, + "learning_rate": 1.9824444444444445e-05, + "loss": 12.1706, + "mean_token_accuracy": 0.7893157167360186, + "num_tokens": 7166608.0, + "step": 1080 + }, + { + "entropy": 0.7172827901318669, + "epoch": 0.4851722027485673, + "grad_norm": 71.0, + "learning_rate": 1.9802222222222226e-05, + "loss": 11.5067, + "mean_token_accuracy": 0.801088172197342, + "num_tokens": 7240958.0, + "step": 1090 + }, + { + "entropy": 0.7408099344000221, + "epoch": 0.4896233238747009, + "grad_norm": 57.5, + "learning_rate": 1.978e-05, + "loss": 11.9898, + "mean_token_accuracy": 0.789643631502986, + "num_tokens": 7311968.0, + "step": 1100 + }, + { + "epoch": 0.4896233238747009, + "eval_biology_entropy": 1.2769847421050071, + "eval_biology_loss": 1.3426203727722168, + "eval_biology_mean_token_accuracy": 0.6723933680057526, + "eval_biology_num_tokens": 7311968.0, + "eval_biology_runtime": 49.9477, + "eval_biology_samples_per_second": 10.01, + "eval_biology_steps_per_second": 10.01, + "step": 1100 + }, + { + "epoch": 0.4896233238747009, + "eval_chemistry_entropy": 0.7427506719231606, + "eval_chemistry_loss": 0.7505869269371033, + "eval_chemistry_mean_token_accuracy": 0.78900011998415, + "eval_chemistry_num_tokens": 7311968.0, + "eval_chemistry_runtime": 57.6815, + "eval_chemistry_samples_per_second": 8.668, + "eval_chemistry_steps_per_second": 8.668, + "step": 1100 + }, + { + "epoch": 0.4896233238747009, + "eval_math_entropy": 0.6985864902734756, + "eval_math_loss": 0.873950183391571, + "eval_math_mean_token_accuracy": 0.7733350949287414, + "eval_math_num_tokens": 7311968.0, + "eval_math_runtime": 59.3201, + "eval_math_samples_per_second": 8.429, + "eval_math_steps_per_second": 8.429, + "step": 1100 + }, + { + "epoch": 0.4896233238747009, + "eval_physics_entropy": 0.7379600911140441, + "eval_physics_loss": 0.7551804780960083, + "eval_physics_mean_token_accuracy": 0.7902662333846092, + "eval_physics_num_tokens": 7311968.0, + "eval_physics_runtime": 66.0636, + "eval_physics_samples_per_second": 7.568, + "eval_physics_steps_per_second": 7.568, + "step": 1100 + }, + { + "entropy": 0.7308115581981838, + "epoch": 0.4940744450008346, + "grad_norm": 57.5, + "learning_rate": 1.975777777777778e-05, + "loss": 11.7632, + "mean_token_accuracy": 0.7919334191828966, + "num_tokens": 7378961.0, + "step": 1110 + }, + { + "entropy": 0.7051068279892206, + "epoch": 0.49852556612696824, + "grad_norm": 62.5, + "learning_rate": 1.9735555555555556e-05, + "loss": 11.4412, + "mean_token_accuracy": 0.7976060153916478, + "num_tokens": 7444923.0, + "step": 1120 + }, + { + "entropy": 0.7438309504650533, + "epoch": 0.5029766872531019, + "grad_norm": 55.5, + "learning_rate": 1.9713333333333337e-05, + "loss": 11.8086, + "mean_token_accuracy": 0.7935861334204674, + "num_tokens": 7509885.0, + "step": 1130 + }, + { + "entropy": 0.7525124587118626, + "epoch": 0.5074278083792355, + "grad_norm": 55.0, + "learning_rate": 1.969111111111111e-05, + "loss": 12.4112, + "mean_token_accuracy": 0.7843238320201635, + "num_tokens": 7576456.0, + "step": 1140 + }, + { + "entropy": 0.7765682274475694, + "epoch": 0.5118789295053692, + "grad_norm": 56.5, + "learning_rate": 1.9668888888888892e-05, + "loss": 12.3668, + "mean_token_accuracy": 0.7860391337424517, + "num_tokens": 7641821.0, + "step": 1150 + }, + { + "entropy": 0.7054132025688886, + "epoch": 0.5163300506315028, + "grad_norm": 54.75, + "learning_rate": 1.9646666666666666e-05, + "loss": 11.5281, + "mean_token_accuracy": 0.7944953251630068, + "num_tokens": 7706502.0, + "step": 1160 + }, + { + "entropy": 0.7991299530491233, + "epoch": 0.5207811717576365, + "grad_norm": 61.5, + "learning_rate": 1.9624444444444447e-05, + "loss": 12.8341, + "mean_token_accuracy": 0.782982062548399, + "num_tokens": 7772353.0, + "step": 1170 + }, + { + "entropy": 0.7132468383759261, + "epoch": 0.52523229288377, + "grad_norm": 49.75, + "learning_rate": 1.9602222222222225e-05, + "loss": 11.7127, + "mean_token_accuracy": 0.796727203950286, + "num_tokens": 7838516.0, + "step": 1180 + }, + { + "entropy": 0.7221027395687998, + "epoch": 0.5296834140099037, + "grad_norm": 61.5, + "learning_rate": 1.9580000000000002e-05, + "loss": 11.4862, + "mean_token_accuracy": 0.7964579328894615, + "num_tokens": 7902568.0, + "step": 1190 + }, + { + "entropy": 0.7554050230421125, + "epoch": 0.5341345351360374, + "grad_norm": 55.0, + "learning_rate": 1.955777777777778e-05, + "loss": 12.018, + "mean_token_accuracy": 0.7877018984407187, + "num_tokens": 7969704.0, + "step": 1200 + }, + { + "epoch": 0.5341345351360374, + "eval_biology_entropy": 1.2183623771071435, + "eval_biology_loss": 1.3340805768966675, + "eval_biology_mean_token_accuracy": 0.6738465265035629, + "eval_biology_num_tokens": 7969704.0, + "eval_biology_runtime": 49.7, + "eval_biology_samples_per_second": 10.06, + "eval_biology_steps_per_second": 10.06, + "step": 1200 + }, + { + "epoch": 0.5341345351360374, + "eval_chemistry_entropy": 0.7287724848091602, + "eval_chemistry_loss": 0.7458793520927429, + "eval_chemistry_mean_token_accuracy": 0.7902151345610618, + "eval_chemistry_num_tokens": 7969704.0, + "eval_chemistry_runtime": 57.2957, + "eval_chemistry_samples_per_second": 8.727, + "eval_chemistry_steps_per_second": 8.727, + "step": 1200 + }, + { + "epoch": 0.5341345351360374, + "eval_math_entropy": 0.6740069650709629, + "eval_math_loss": 0.8847513794898987, + "eval_math_mean_token_accuracy": 0.7725249216556549, + "eval_math_num_tokens": 7969704.0, + "eval_math_runtime": 59.1059, + "eval_math_samples_per_second": 8.459, + "eval_math_steps_per_second": 8.459, + "step": 1200 + }, + { + "epoch": 0.5341345351360374, + "eval_physics_entropy": 0.7253319441080094, + "eval_physics_loss": 0.7566389441490173, + "eval_physics_mean_token_accuracy": 0.790060887992382, + "eval_physics_num_tokens": 7969704.0, + "eval_physics_runtime": 66.1619, + "eval_physics_samples_per_second": 7.557, + "eval_physics_steps_per_second": 7.557, + "step": 1200 + }, + { + "entropy": 0.692601692955941, + "epoch": 0.538585656262171, + "grad_norm": 47.25, + "learning_rate": 1.9535555555555557e-05, + "loss": 11.4509, + "mean_token_accuracy": 0.7953360680490732, + "num_tokens": 8034765.0, + "step": 1210 + }, + { + "entropy": 0.7510794518515468, + "epoch": 0.5430367773883047, + "grad_norm": 45.5, + "learning_rate": 1.9513333333333335e-05, + "loss": 11.9891, + "mean_token_accuracy": 0.7911988774314522, + "num_tokens": 8103869.0, + "step": 1220 + }, + { + "entropy": 0.6859279408119618, + "epoch": 0.5474878985144384, + "grad_norm": 44.25, + "learning_rate": 1.9491111111111113e-05, + "loss": 11.092, + "mean_token_accuracy": 0.8032916888594628, + "num_tokens": 8173386.0, + "step": 1230 + }, + { + "entropy": 0.7368258788250387, + "epoch": 0.551939019640572, + "grad_norm": 51.75, + "learning_rate": 1.946888888888889e-05, + "loss": 11.8628, + "mean_token_accuracy": 0.7892861522734165, + "num_tokens": 8242162.0, + "step": 1240 + }, + { + "entropy": 0.7227346986532212, + "epoch": 0.5563901407667056, + "grad_norm": 58.5, + "learning_rate": 1.9446666666666668e-05, + "loss": 11.6799, + "mean_token_accuracy": 0.7937116343528032, + "num_tokens": 8308919.0, + "step": 1250 + }, + { + "entropy": 0.7348461709916592, + "epoch": 0.5608412618928392, + "grad_norm": 65.5, + "learning_rate": 1.9424444444444446e-05, + "loss": 11.985, + "mean_token_accuracy": 0.7889482360333204, + "num_tokens": 8365617.0, + "step": 1260 + }, + { + "entropy": 0.7478212734684349, + "epoch": 0.5652923830189729, + "grad_norm": 49.0, + "learning_rate": 1.9402222222222223e-05, + "loss": 11.9453, + "mean_token_accuracy": 0.7913831114768982, + "num_tokens": 8430728.0, + "step": 1270 + }, + { + "entropy": 0.7140952883288264, + "epoch": 0.5697435041451065, + "grad_norm": 49.0, + "learning_rate": 1.938e-05, + "loss": 11.5919, + "mean_token_accuracy": 0.7973519455641508, + "num_tokens": 8497852.0, + "step": 1280 + }, + { + "entropy": 0.7445681392215192, + "epoch": 0.5741946252712402, + "grad_norm": 49.0, + "learning_rate": 1.935777777777778e-05, + "loss": 12.1553, + "mean_token_accuracy": 0.78705313205719, + "num_tokens": 8558952.0, + "step": 1290 + }, + { + "entropy": 0.739489266090095, + "epoch": 0.5786457463973739, + "grad_norm": 58.25, + "learning_rate": 1.9335555555555556e-05, + "loss": 11.8581, + "mean_token_accuracy": 0.796079633757472, + "num_tokens": 8622868.0, + "step": 1300 + }, + { + "epoch": 0.5786457463973739, + "eval_biology_entropy": 1.2557496595382691, + "eval_biology_loss": 1.3392294645309448, + "eval_biology_mean_token_accuracy": 0.673343878209591, + "eval_biology_num_tokens": 8622868.0, + "eval_biology_runtime": 49.5632, + "eval_biology_samples_per_second": 10.088, + "eval_biology_steps_per_second": 10.088, + "step": 1300 + }, + { + "epoch": 0.5786457463973739, + "eval_chemistry_entropy": 0.720954676181078, + "eval_chemistry_loss": 0.7364406585693359, + "eval_chemistry_mean_token_accuracy": 0.7934513345956803, + "eval_chemistry_num_tokens": 8622868.0, + "eval_chemistry_runtime": 57.2523, + "eval_chemistry_samples_per_second": 8.733, + "eval_chemistry_steps_per_second": 8.733, + "step": 1300 + }, + { + "epoch": 0.5786457463973739, + "eval_math_entropy": 0.6704416258633137, + "eval_math_loss": 0.892277717590332, + "eval_math_mean_token_accuracy": 0.7730021580457688, + "eval_math_num_tokens": 8622868.0, + "eval_math_runtime": 59.0108, + "eval_math_samples_per_second": 8.473, + "eval_math_steps_per_second": 8.473, + "step": 1300 + }, + { + "epoch": 0.5786457463973739, + "eval_physics_entropy": 0.7133055707812309, + "eval_physics_loss": 0.7568183541297913, + "eval_physics_mean_token_accuracy": 0.7910581494569778, + "eval_physics_num_tokens": 8622868.0, + "eval_physics_runtime": 66.0807, + "eval_physics_samples_per_second": 7.567, + "eval_physics_steps_per_second": 7.567, + "step": 1300 + }, + { + "entropy": 0.6994675314053893, + "epoch": 0.5830968675235075, + "grad_norm": 56.0, + "learning_rate": 1.9313333333333334e-05, + "loss": 11.4251, + "mean_token_accuracy": 0.7967592880129815, + "num_tokens": 8694510.0, + "step": 1310 + }, + { + "entropy": 0.78238091673702, + "epoch": 0.5875479886496411, + "grad_norm": 65.0, + "learning_rate": 1.9291111111111115e-05, + "loss": 12.6434, + "mean_token_accuracy": 0.7801247961819172, + "num_tokens": 8757753.0, + "step": 1320 + }, + { + "entropy": 0.7391151077114045, + "epoch": 0.5919991097757747, + "grad_norm": 50.75, + "learning_rate": 1.926888888888889e-05, + "loss": 11.9013, + "mean_token_accuracy": 0.7954815953969956, + "num_tokens": 8823153.0, + "step": 1330 + }, + { + "entropy": 0.7482655656524002, + "epoch": 0.5964502309019084, + "grad_norm": 53.0, + "learning_rate": 1.924666666666667e-05, + "loss": 12.1372, + "mean_token_accuracy": 0.7891814015805722, + "num_tokens": 8886401.0, + "step": 1340 + }, + { + "entropy": 0.7140636395663023, + "epoch": 0.6009013520280421, + "grad_norm": 46.75, + "learning_rate": 1.9224444444444444e-05, + "loss": 11.4324, + "mean_token_accuracy": 0.7953898053616285, + "num_tokens": 8958530.0, + "step": 1350 + }, + { + "entropy": 0.7349427663721144, + "epoch": 0.6053524731541757, + "grad_norm": 55.0, + "learning_rate": 1.9202222222222225e-05, + "loss": 11.8847, + "mean_token_accuracy": 0.7926598964259028, + "num_tokens": 9024677.0, + "step": 1360 + }, + { + "entropy": 0.7534153172746301, + "epoch": 0.6098035942803094, + "grad_norm": 52.0, + "learning_rate": 1.918e-05, + "loss": 12.0276, + "mean_token_accuracy": 0.7890890792012215, + "num_tokens": 9090280.0, + "step": 1370 + }, + { + "entropy": 0.7084095226600766, + "epoch": 0.614254715406443, + "grad_norm": 52.75, + "learning_rate": 1.915777777777778e-05, + "loss": 11.5771, + "mean_token_accuracy": 0.7964484915137291, + "num_tokens": 9153771.0, + "step": 1380 + }, + { + "entropy": 0.6750353675335645, + "epoch": 0.6187058365325766, + "grad_norm": 48.25, + "learning_rate": 1.9135555555555555e-05, + "loss": 10.7449, + "mean_token_accuracy": 0.8097869891673326, + "num_tokens": 9227117.0, + "step": 1390 + }, + { + "entropy": 0.7293720198795199, + "epoch": 0.6231569576587103, + "grad_norm": 49.0, + "learning_rate": 1.9113333333333336e-05, + "loss": 11.9594, + "mean_token_accuracy": 0.7891727603971959, + "num_tokens": 9291760.0, + "step": 1400 + }, + { + "epoch": 0.6231569576587103, + "eval_biology_entropy": 1.2729821098446845, + "eval_biology_loss": 1.3309696912765503, + "eval_biology_mean_token_accuracy": 0.6723712916374207, + "eval_biology_num_tokens": 9291760.0, + "eval_biology_runtime": 49.2684, + "eval_biology_samples_per_second": 10.148, + "eval_biology_steps_per_second": 10.148, + "step": 1400 + }, + { + "epoch": 0.6231569576587103, + "eval_chemistry_entropy": 0.7357699049711227, + "eval_chemistry_loss": 0.7299637794494629, + "eval_chemistry_mean_token_accuracy": 0.793709151148796, + "eval_chemistry_num_tokens": 9291760.0, + "eval_chemistry_runtime": 57.0749, + "eval_chemistry_samples_per_second": 8.76, + "eval_chemistry_steps_per_second": 8.76, + "step": 1400 + }, + { + "epoch": 0.6231569576587103, + "eval_math_entropy": 0.713969606757164, + "eval_math_loss": 0.8790645599365234, + "eval_math_mean_token_accuracy": 0.7724634169340133, + "eval_math_num_tokens": 9291760.0, + "eval_math_runtime": 58.9543, + "eval_math_samples_per_second": 8.481, + "eval_math_steps_per_second": 8.481, + "step": 1400 + }, + { + "epoch": 0.6231569576587103, + "eval_physics_entropy": 0.738197058737278, + "eval_physics_loss": 0.7536621689796448, + "eval_physics_mean_token_accuracy": 0.7910983339548111, + "eval_physics_num_tokens": 9291760.0, + "eval_physics_runtime": 65.823, + "eval_physics_samples_per_second": 7.596, + "eval_physics_steps_per_second": 7.596, + "step": 1400 + }, + { + "entropy": 0.7626627525314689, + "epoch": 0.6276080787848439, + "grad_norm": 51.25, + "learning_rate": 1.9091111111111113e-05, + "loss": 12.1176, + "mean_token_accuracy": 0.7862323332577944, + "num_tokens": 9357561.0, + "step": 1410 + }, + { + "entropy": 0.7020683256909251, + "epoch": 0.6320591999109776, + "grad_norm": 55.0, + "learning_rate": 1.906888888888889e-05, + "loss": 11.5299, + "mean_token_accuracy": 0.7983165990561247, + "num_tokens": 9425022.0, + "step": 1420 + }, + { + "entropy": 0.7129457925446332, + "epoch": 0.6365103210371112, + "grad_norm": 55.5, + "learning_rate": 1.904666666666667e-05, + "loss": 11.363, + "mean_token_accuracy": 0.8003849845379591, + "num_tokens": 9492028.0, + "step": 1430 + }, + { + "entropy": 0.682979597337544, + "epoch": 0.6409614421632449, + "grad_norm": 53.5, + "learning_rate": 1.9024444444444446e-05, + "loss": 11.0672, + "mean_token_accuracy": 0.8002727191895247, + "num_tokens": 9561091.0, + "step": 1440 + }, + { + "entropy": 0.7097643257118762, + "epoch": 0.6454125632893785, + "grad_norm": 52.5, + "learning_rate": 1.9002222222222224e-05, + "loss": 11.4105, + "mean_token_accuracy": 0.798115537315607, + "num_tokens": 9628383.0, + "step": 1450 + }, + { + "entropy": 0.6520757641643286, + "epoch": 0.6498636844155121, + "grad_norm": 45.0, + "learning_rate": 1.898e-05, + "loss": 10.6294, + "mean_token_accuracy": 0.8115652769804, + "num_tokens": 9695723.0, + "step": 1460 + }, + { + "entropy": 0.6995091023854911, + "epoch": 0.6543148055416458, + "grad_norm": 48.0, + "learning_rate": 1.895777777777778e-05, + "loss": 11.2764, + "mean_token_accuracy": 0.8028371874243021, + "num_tokens": 9763393.0, + "step": 1470 + }, + { + "entropy": 0.736479183845222, + "epoch": 0.6587659266677794, + "grad_norm": 46.75, + "learning_rate": 1.8935555555555556e-05, + "loss": 11.7277, + "mean_token_accuracy": 0.7955611757934093, + "num_tokens": 9827272.0, + "step": 1480 + }, + { + "entropy": 0.6883244249969721, + "epoch": 0.6632170477939131, + "grad_norm": 48.75, + "learning_rate": 1.8913333333333334e-05, + "loss": 11.1978, + "mean_token_accuracy": 0.8004064936190843, + "num_tokens": 9892418.0, + "step": 1490 + }, + { + "entropy": 0.6904169335030019, + "epoch": 0.6676681689200468, + "grad_norm": 48.25, + "learning_rate": 1.8891111111111115e-05, + "loss": 11.1417, + "mean_token_accuracy": 0.8022403538227081, + "num_tokens": 9955369.0, + "step": 1500 + }, + { + "epoch": 0.6676681689200468, + "eval_biology_entropy": 1.1994027597904204, + "eval_biology_loss": 1.3322917222976685, + "eval_biology_mean_token_accuracy": 0.6736352816224098, + "eval_biology_num_tokens": 9955369.0, + "eval_biology_runtime": 49.2971, + "eval_biology_samples_per_second": 10.143, + "eval_biology_steps_per_second": 10.143, + "step": 1500 + }, + { + "epoch": 0.6676681689200468, + "eval_chemistry_entropy": 0.6897619203329086, + "eval_chemistry_loss": 0.7175793051719666, + "eval_chemistry_mean_token_accuracy": 0.7974075863361358, + "eval_chemistry_num_tokens": 9955369.0, + "eval_chemistry_runtime": 57.473, + "eval_chemistry_samples_per_second": 8.7, + "eval_chemistry_steps_per_second": 8.7, + "step": 1500 + }, + { + "epoch": 0.6676681689200468, + "eval_math_entropy": 0.659498262912035, + "eval_math_loss": 0.8881840705871582, + "eval_math_mean_token_accuracy": 0.773587989807129, + "eval_math_num_tokens": 9955369.0, + "eval_math_runtime": 59.1515, + "eval_math_samples_per_second": 8.453, + "eval_math_steps_per_second": 8.453, + "step": 1500 + }, + { + "epoch": 0.6676681689200468, + "eval_physics_entropy": 0.6934876502454281, + "eval_physics_loss": 0.747431755065918, + "eval_physics_mean_token_accuracy": 0.7924305193424225, + "eval_physics_num_tokens": 9955369.0, + "eval_physics_runtime": 66.2089, + "eval_physics_samples_per_second": 7.552, + "eval_physics_steps_per_second": 7.552, + "step": 1500 + }, + { + "entropy": 0.7013076378963887, + "epoch": 0.6721192900461804, + "grad_norm": 60.75, + "learning_rate": 1.886888888888889e-05, + "loss": 11.3597, + "mean_token_accuracy": 0.7976187769323587, + "num_tokens": 10026298.0, + "step": 1510 + }, + { + "entropy": 0.66996502391994, + "epoch": 0.676570411172314, + "grad_norm": 48.25, + "learning_rate": 1.884666666666667e-05, + "loss": 10.6869, + "mean_token_accuracy": 0.809822041168809, + "num_tokens": 10096065.0, + "step": 1520 + }, + { + "entropy": 0.6915224540047348, + "epoch": 0.6810215322984476, + "grad_norm": 52.25, + "learning_rate": 1.8824444444444445e-05, + "loss": 11.3488, + "mean_token_accuracy": 0.8012049272656441, + "num_tokens": 10160745.0, + "step": 1530 + }, + { + "entropy": 0.6686725497245789, + "epoch": 0.6854726534245813, + "grad_norm": 46.75, + "learning_rate": 1.8802222222222226e-05, + "loss": 10.6806, + "mean_token_accuracy": 0.8083007030189038, + "num_tokens": 10233806.0, + "step": 1540 + }, + { + "entropy": 0.6853563466109336, + "epoch": 0.689923774550715, + "grad_norm": 56.25, + "learning_rate": 1.878e-05, + "loss": 11.2147, + "mean_token_accuracy": 0.802401140332222, + "num_tokens": 10298845.0, + "step": 1550 + }, + { + "entropy": 0.6936365003697574, + "epoch": 0.6943748956768486, + "grad_norm": 49.25, + "learning_rate": 1.875777777777778e-05, + "loss": 11.0701, + "mean_token_accuracy": 0.8034875515848399, + "num_tokens": 10364601.0, + "step": 1560 + }, + { + "entropy": 0.6916884422302246, + "epoch": 0.6988260168029823, + "grad_norm": 53.0, + "learning_rate": 1.873555555555556e-05, + "loss": 11.2846, + "mean_token_accuracy": 0.7983762890100479, + "num_tokens": 10434433.0, + "step": 1570 + }, + { + "entropy": 0.7143267430365086, + "epoch": 0.703277137929116, + "grad_norm": 56.5, + "learning_rate": 1.8713333333333336e-05, + "loss": 11.4464, + "mean_token_accuracy": 0.7968503847718239, + "num_tokens": 10500891.0, + "step": 1580 + }, + { + "entropy": 0.7026416527107358, + "epoch": 0.7077282590552495, + "grad_norm": 48.5, + "learning_rate": 1.8691111111111114e-05, + "loss": 11.4511, + "mean_token_accuracy": 0.8001922458410263, + "num_tokens": 10567784.0, + "step": 1590 + }, + { + "entropy": 0.7117853639647365, + "epoch": 0.7121793801813832, + "grad_norm": 47.25, + "learning_rate": 1.866888888888889e-05, + "loss": 11.2339, + "mean_token_accuracy": 0.8018446419388056, + "num_tokens": 10633325.0, + "step": 1600 + }, + { + "epoch": 0.7121793801813832, + "eval_biology_entropy": 1.2444143397212029, + "eval_biology_loss": 1.3356362581253052, + "eval_biology_mean_token_accuracy": 0.6730564608573913, + "eval_biology_num_tokens": 10633325.0, + "eval_biology_runtime": 72.0334, + "eval_biology_samples_per_second": 6.941, + "eval_biology_steps_per_second": 6.941, + "step": 1600 + }, + { + "epoch": 0.7121793801813832, + "eval_chemistry_entropy": 0.7002458871603012, + "eval_chemistry_loss": 0.7100893259048462, + "eval_chemistry_mean_token_accuracy": 0.7978058751821518, + "eval_chemistry_num_tokens": 10633325.0, + "eval_chemistry_runtime": 80.5635, + "eval_chemistry_samples_per_second": 6.206, + "eval_chemistry_steps_per_second": 6.206, + "step": 1600 + }, + { + "epoch": 0.7121793801813832, + "eval_math_entropy": 0.6884930176436901, + "eval_math_loss": 0.8825717568397522, + "eval_math_mean_token_accuracy": 0.7724320909976959, + "eval_math_num_tokens": 10633325.0, + "eval_math_runtime": 62.848, + "eval_math_samples_per_second": 7.956, + "eval_math_steps_per_second": 7.956, + "step": 1600 + }, + { + "epoch": 0.7121793801813832, + "eval_physics_entropy": 0.703965005338192, + "eval_physics_loss": 0.7453010082244873, + "eval_physics_mean_token_accuracy": 0.7937779233455659, + "eval_physics_num_tokens": 10633325.0, + "eval_physics_runtime": 65.8361, + "eval_physics_samples_per_second": 7.595, + "eval_physics_steps_per_second": 7.595, + "step": 1600 + }, + { + "entropy": 0.653254197910428, + "epoch": 0.7166305013075168, + "grad_norm": 47.75, + "learning_rate": 1.864666666666667e-05, + "loss": 10.7312, + "mean_token_accuracy": 0.8095974924042821, + "num_tokens": 10699737.0, + "step": 1610 + }, + { + "entropy": 0.6189540217630565, + "epoch": 0.7210816224336505, + "grad_norm": 52.25, + "learning_rate": 1.8624444444444446e-05, + "loss": 9.9286, + "mean_token_accuracy": 0.8196552351117135, + "num_tokens": 10764319.0, + "step": 1620 + }, + { + "entropy": 0.6991719406098127, + "epoch": 0.7255327435597841, + "grad_norm": 48.75, + "learning_rate": 1.8602222222222224e-05, + "loss": 11.3232, + "mean_token_accuracy": 0.7975562997162342, + "num_tokens": 10831617.0, + "step": 1630 + }, + { + "entropy": 0.7137979302555323, + "epoch": 0.7299838646859178, + "grad_norm": 58.5, + "learning_rate": 1.858e-05, + "loss": 11.3908, + "mean_token_accuracy": 0.8014465879648924, + "num_tokens": 10897916.0, + "step": 1640 + }, + { + "entropy": 0.6850169265642763, + "epoch": 0.7344349858120514, + "grad_norm": 50.75, + "learning_rate": 1.855777777777778e-05, + "loss": 11.1551, + "mean_token_accuracy": 0.8014529883861542, + "num_tokens": 10965483.0, + "step": 1650 + }, + { + "entropy": 0.7198417543433606, + "epoch": 0.738886106938185, + "grad_norm": 47.0, + "learning_rate": 1.8535555555555557e-05, + "loss": 11.6312, + "mean_token_accuracy": 0.7964452419430017, + "num_tokens": 11030485.0, + "step": 1660 + }, + { + "entropy": 0.6864631313830614, + "epoch": 0.7433372280643187, + "grad_norm": 50.0, + "learning_rate": 1.8513333333333335e-05, + "loss": 11.0248, + "mean_token_accuracy": 0.804034722596407, + "num_tokens": 11097108.0, + "step": 1670 + }, + { + "entropy": 0.6339416711591184, + "epoch": 0.7477883491904523, + "grad_norm": 48.75, + "learning_rate": 1.8491111111111112e-05, + "loss": 10.4331, + "mean_token_accuracy": 0.8120405212044716, + "num_tokens": 11165356.0, + "step": 1680 + }, + { + "entropy": 0.7042925103567541, + "epoch": 0.752239470316586, + "grad_norm": 46.25, + "learning_rate": 1.846888888888889e-05, + "loss": 11.2078, + "mean_token_accuracy": 0.8007234908640385, + "num_tokens": 11234429.0, + "step": 1690 + }, + { + "entropy": 0.6936139498837293, + "epoch": 0.7566905914427197, + "grad_norm": 50.0, + "learning_rate": 1.8446666666666667e-05, + "loss": 11.1976, + "mean_token_accuracy": 0.8022552561014891, + "num_tokens": 11299223.0, + "step": 1700 + }, + { + "epoch": 0.7566905914427197, + "eval_biology_entropy": 1.2853857824206352, + "eval_biology_loss": 1.333956003189087, + "eval_biology_mean_token_accuracy": 0.6733258009552956, + "eval_biology_num_tokens": 11299223.0, + "eval_biology_runtime": 49.3568, + "eval_biology_samples_per_second": 10.13, + "eval_biology_steps_per_second": 10.13, + "step": 1700 + }, + { + "epoch": 0.7566905914427197, + "eval_chemistry_entropy": 0.7026637124717235, + "eval_chemistry_loss": 0.7078750133514404, + "eval_chemistry_mean_token_accuracy": 0.7986145396232605, + "eval_chemistry_num_tokens": 11299223.0, + "eval_chemistry_runtime": 57.1452, + "eval_chemistry_samples_per_second": 8.75, + "eval_chemistry_steps_per_second": 8.75, + "step": 1700 + }, + { + "epoch": 0.7566905914427197, + "eval_math_entropy": 0.7062955890595913, + "eval_math_loss": 0.8821802735328674, + "eval_math_mean_token_accuracy": 0.7737128890752792, + "eval_math_num_tokens": 11299223.0, + "eval_math_runtime": 61.1933, + "eval_math_samples_per_second": 8.171, + "eval_math_steps_per_second": 8.171, + "step": 1700 + }, + { + "epoch": 0.7566905914427197, + "eval_physics_entropy": 0.7158967233300209, + "eval_physics_loss": 0.7414941191673279, + "eval_physics_mean_token_accuracy": 0.7944492185711861, + "eval_physics_num_tokens": 11299223.0, + "eval_physics_runtime": 66.3539, + "eval_physics_samples_per_second": 7.535, + "eval_physics_steps_per_second": 7.535, + "step": 1700 + }, + { + "entropy": 0.6807508809491992, + "epoch": 0.7611417125688533, + "grad_norm": 44.0, + "learning_rate": 1.842444444444445e-05, + "loss": 10.8976, + "mean_token_accuracy": 0.8057308234274387, + "num_tokens": 11371200.0, + "step": 1710 + }, + { + "entropy": 0.683816609159112, + "epoch": 0.7655928336949869, + "grad_norm": 48.75, + "learning_rate": 1.8402222222222223e-05, + "loss": 11.2848, + "mean_token_accuracy": 0.7998971730470658, + "num_tokens": 11436799.0, + "step": 1720 + }, + { + "entropy": 0.682188394293189, + "epoch": 0.7700439548211205, + "grad_norm": 57.25, + "learning_rate": 1.8380000000000004e-05, + "loss": 10.8721, + "mean_token_accuracy": 0.8079916540533304, + "num_tokens": 11502965.0, + "step": 1730 + }, + { + "entropy": 0.7102242357097566, + "epoch": 0.7744950759472542, + "grad_norm": 46.25, + "learning_rate": 1.8357777777777778e-05, + "loss": 11.5854, + "mean_token_accuracy": 0.7963233698159456, + "num_tokens": 11567422.0, + "step": 1740 + }, + { + "entropy": 0.6699508598074317, + "epoch": 0.7789461970733879, + "grad_norm": 38.75, + "learning_rate": 1.833555555555556e-05, + "loss": 10.8756, + "mean_token_accuracy": 0.8071661192923785, + "num_tokens": 11637076.0, + "step": 1750 + }, + { + "entropy": 0.7298749316483736, + "epoch": 0.7833973181995215, + "grad_norm": 48.25, + "learning_rate": 1.8313333333333333e-05, + "loss": 11.7861, + "mean_token_accuracy": 0.7899679750204086, + "num_tokens": 11703496.0, + "step": 1760 + }, + { + "entropy": 0.6433588310144842, + "epoch": 0.7878484393256552, + "grad_norm": 36.25, + "learning_rate": 1.8291111111111114e-05, + "loss": 10.2179, + "mean_token_accuracy": 0.8177038058638573, + "num_tokens": 11772773.0, + "step": 1770 + }, + { + "entropy": 0.6560773782432079, + "epoch": 0.7922995604517888, + "grad_norm": 47.0, + "learning_rate": 1.8268888888888888e-05, + "loss": 10.7873, + "mean_token_accuracy": 0.8082022242248058, + "num_tokens": 11840317.0, + "step": 1780 + }, + { + "entropy": 0.7407286252826453, + "epoch": 0.7967506815779224, + "grad_norm": 55.0, + "learning_rate": 1.824666666666667e-05, + "loss": 11.9908, + "mean_token_accuracy": 0.793482593819499, + "num_tokens": 11903376.0, + "step": 1790 + }, + { + "entropy": 0.6884403178468347, + "epoch": 0.801201802704056, + "grad_norm": 52.5, + "learning_rate": 1.8224444444444447e-05, + "loss": 10.9702, + "mean_token_accuracy": 0.8052472107112407, + "num_tokens": 11965530.0, + "step": 1800 + }, + { + "epoch": 0.801201802704056, + "eval_biology_entropy": 1.2536135312318801, + "eval_biology_loss": 1.3246384859085083, + "eval_biology_mean_token_accuracy": 0.6739821583032608, + "eval_biology_num_tokens": 11965530.0, + "eval_biology_runtime": 49.5291, + "eval_biology_samples_per_second": 10.095, + "eval_biology_steps_per_second": 10.095, + "step": 1800 + }, + { + "epoch": 0.801201802704056, + "eval_chemistry_entropy": 0.6877463189661502, + "eval_chemistry_loss": 0.699435293674469, + "eval_chemistry_mean_token_accuracy": 0.8004006836414337, + "eval_chemistry_num_tokens": 11965530.0, + "eval_chemistry_runtime": 57.2961, + "eval_chemistry_samples_per_second": 8.727, + "eval_chemistry_steps_per_second": 8.727, + "step": 1800 + }, + { + "epoch": 0.801201802704056, + "eval_math_entropy": 0.6771485269367695, + "eval_math_loss": 0.8832710385322571, + "eval_math_mean_token_accuracy": 0.7728882786035538, + "eval_math_num_tokens": 11965530.0, + "eval_math_runtime": 59.0619, + "eval_math_samples_per_second": 8.466, + "eval_math_steps_per_second": 8.466, + "step": 1800 + }, + { + "epoch": 0.801201802704056, + "eval_physics_entropy": 0.6993200024366378, + "eval_physics_loss": 0.7383862733840942, + "eval_physics_mean_token_accuracy": 0.794426214993, + "eval_physics_num_tokens": 11965530.0, + "eval_physics_runtime": 66.0, + "eval_physics_samples_per_second": 7.576, + "eval_physics_steps_per_second": 7.576, + "step": 1800 + }, + { + "entropy": 0.6854008860886097, + "epoch": 0.8056529238301897, + "grad_norm": 54.0, + "learning_rate": 1.8202222222222225e-05, + "loss": 11.1961, + "mean_token_accuracy": 0.80209940392524, + "num_tokens": 12031493.0, + "step": 1810 + }, + { + "entropy": 0.6895728969946504, + "epoch": 0.8101040449563234, + "grad_norm": 46.25, + "learning_rate": 1.8180000000000002e-05, + "loss": 11.1827, + "mean_token_accuracy": 0.8035332810133695, + "num_tokens": 12097917.0, + "step": 1820 + }, + { + "entropy": 0.6885084038600325, + "epoch": 0.814555166082457, + "grad_norm": 46.5, + "learning_rate": 1.815777777777778e-05, + "loss": 11.0952, + "mean_token_accuracy": 0.8038906361907721, + "num_tokens": 12163029.0, + "step": 1830 + }, + { + "entropy": 0.6959913271479309, + "epoch": 0.8190062872085907, + "grad_norm": 58.5, + "learning_rate": 1.8135555555555557e-05, + "loss": 11.3646, + "mean_token_accuracy": 0.7995086185634136, + "num_tokens": 12224427.0, + "step": 1840 + }, + { + "entropy": 0.7170273121446371, + "epoch": 0.8234574083347244, + "grad_norm": 43.0, + "learning_rate": 1.8113333333333335e-05, + "loss": 11.4108, + "mean_token_accuracy": 0.80226712487638, + "num_tokens": 12291917.0, + "step": 1850 + }, + { + "entropy": 0.6658903407864273, + "epoch": 0.8279085294608579, + "grad_norm": 46.25, + "learning_rate": 1.8091111111111113e-05, + "loss": 10.7704, + "mean_token_accuracy": 0.8066752411425113, + "num_tokens": 12365332.0, + "step": 1860 + }, + { + "entropy": 0.6656836275942624, + "epoch": 0.8323596505869916, + "grad_norm": 40.25, + "learning_rate": 1.806888888888889e-05, + "loss": 10.6917, + "mean_token_accuracy": 0.8068058449774981, + "num_tokens": 12441423.0, + "step": 1870 + }, + { + "entropy": 0.6506184342317283, + "epoch": 0.8368107717131252, + "grad_norm": 50.25, + "learning_rate": 1.8046666666666668e-05, + "loss": 10.4962, + "mean_token_accuracy": 0.8128324717283248, + "num_tokens": 12509124.0, + "step": 1880 + }, + { + "entropy": 0.626772437337786, + "epoch": 0.8412618928392589, + "grad_norm": 43.25, + "learning_rate": 1.8024444444444445e-05, + "loss": 10.2576, + "mean_token_accuracy": 0.8113849848508835, + "num_tokens": 12575355.0, + "step": 1890 + }, + { + "entropy": 0.6942961663939059, + "epoch": 0.8457130139653926, + "grad_norm": 57.25, + "learning_rate": 1.8002222222222223e-05, + "loss": 11.3533, + "mean_token_accuracy": 0.7994141079485416, + "num_tokens": 12642651.0, + "step": 1900 + }, + { + "epoch": 0.8457130139653926, + "eval_biology_entropy": 1.2530685477256776, + "eval_biology_loss": 1.3307456970214844, + "eval_biology_mean_token_accuracy": 0.6736879838109017, + "eval_biology_num_tokens": 12642651.0, + "eval_biology_runtime": 49.3602, + "eval_biology_samples_per_second": 10.13, + "eval_biology_steps_per_second": 10.13, + "step": 1900 + }, + { + "epoch": 0.8457130139653926, + "eval_chemistry_entropy": 0.6917615284323693, + "eval_chemistry_loss": 0.6964050531387329, + "eval_chemistry_mean_token_accuracy": 0.8021229248046875, + "eval_chemistry_num_tokens": 12642651.0, + "eval_chemistry_runtime": 57.0518, + "eval_chemistry_samples_per_second": 8.764, + "eval_chemistry_steps_per_second": 8.764, + "step": 1900 + }, + { + "epoch": 0.8457130139653926, + "eval_math_entropy": 0.675072967261076, + "eval_math_loss": 0.8877173066139221, + "eval_math_mean_token_accuracy": 0.7728915640115738, + "eval_math_num_tokens": 12642651.0, + "eval_math_runtime": 58.9152, + "eval_math_samples_per_second": 8.487, + "eval_math_steps_per_second": 8.487, + "step": 1900 + }, + { + "epoch": 0.8457130139653926, + "eval_physics_entropy": 0.7041290953755379, + "eval_physics_loss": 0.7368381023406982, + "eval_physics_mean_token_accuracy": 0.7941438639163971, + "eval_physics_num_tokens": 12642651.0, + "eval_physics_runtime": 65.8671, + "eval_physics_samples_per_second": 7.591, + "eval_physics_steps_per_second": 7.591, + "step": 1900 + }, + { + "entropy": 0.651003266312182, + "epoch": 0.8501641350915262, + "grad_norm": 42.75, + "learning_rate": 1.798e-05, + "loss": 10.3017, + "mean_token_accuracy": 0.8126345485448837, + "num_tokens": 12710065.0, + "step": 1910 + }, + { + "entropy": 0.6822556433267891, + "epoch": 0.8546152562176598, + "grad_norm": 49.5, + "learning_rate": 1.7957777777777778e-05, + "loss": 11.0203, + "mean_token_accuracy": 0.8035612374544143, + "num_tokens": 12778408.0, + "step": 1920 + }, + { + "entropy": 0.6388307875953615, + "epoch": 0.8590663773437934, + "grad_norm": 54.25, + "learning_rate": 1.7935555555555556e-05, + "loss": 10.413, + "mean_token_accuracy": 0.8124310929328203, + "num_tokens": 12845949.0, + "step": 1930 + }, + { + "entropy": 0.6870338559150696, + "epoch": 0.8635174984699271, + "grad_norm": 52.75, + "learning_rate": 1.7913333333333337e-05, + "loss": 10.9443, + "mean_token_accuracy": 0.8076328281313181, + "num_tokens": 12910630.0, + "step": 1940 + }, + { + "entropy": 0.6719606230035424, + "epoch": 0.8679686195960608, + "grad_norm": 46.5, + "learning_rate": 1.789111111111111e-05, + "loss": 11.1521, + "mean_token_accuracy": 0.8018901061266661, + "num_tokens": 12975379.0, + "step": 1950 + }, + { + "entropy": 0.6635585248470306, + "epoch": 0.8724197407221944, + "grad_norm": 40.75, + "learning_rate": 1.7868888888888892e-05, + "loss": 10.5294, + "mean_token_accuracy": 0.8090925574302673, + "num_tokens": 13046473.0, + "step": 1960 + }, + { + "entropy": 0.7126090485602617, + "epoch": 0.8768708618483281, + "grad_norm": 44.5, + "learning_rate": 1.7846666666666666e-05, + "loss": 11.7059, + "mean_token_accuracy": 0.794832269102335, + "num_tokens": 13105019.0, + "step": 1970 + }, + { + "entropy": 0.7243582944385707, + "epoch": 0.8813219829744617, + "grad_norm": 47.75, + "learning_rate": 1.7824444444444447e-05, + "loss": 11.4291, + "mean_token_accuracy": 0.8015620414167642, + "num_tokens": 13167768.0, + "step": 1980 + }, + { + "entropy": 0.6495856091380119, + "epoch": 0.8857731041005953, + "grad_norm": 45.25, + "learning_rate": 1.780222222222222e-05, + "loss": 10.6768, + "mean_token_accuracy": 0.8102217994630336, + "num_tokens": 13235899.0, + "step": 1990 + }, + { + "entropy": 0.6717848453670741, + "epoch": 0.890224225226729, + "grad_norm": 41.25, + "learning_rate": 1.7780000000000003e-05, + "loss": 10.7358, + "mean_token_accuracy": 0.8080738458782435, + "num_tokens": 13301659.0, + "step": 2000 + }, + { + "epoch": 0.890224225226729, + "eval_biology_entropy": 1.2462715034484864, + "eval_biology_loss": 1.323424220085144, + "eval_biology_mean_token_accuracy": 0.6743068833947181, + "eval_biology_num_tokens": 13301659.0, + "eval_biology_runtime": 49.4062, + "eval_biology_samples_per_second": 10.12, + "eval_biology_steps_per_second": 10.12, + "step": 2000 + }, + { + "epoch": 0.890224225226729, + "eval_chemistry_entropy": 0.6809069735705853, + "eval_chemistry_loss": 0.6882286667823792, + "eval_chemistry_mean_token_accuracy": 0.8031812709569931, + "eval_chemistry_num_tokens": 13301659.0, + "eval_chemistry_runtime": 58.7929, + "eval_chemistry_samples_per_second": 8.504, + "eval_chemistry_steps_per_second": 8.504, + "step": 2000 + }, + { + "epoch": 0.890224225226729, + "eval_math_entropy": 0.6781798366010189, + "eval_math_loss": 0.8778423070907593, + "eval_math_mean_token_accuracy": 0.7743719705343246, + "eval_math_num_tokens": 13301659.0, + "eval_math_runtime": 58.9039, + "eval_math_samples_per_second": 8.488, + "eval_math_steps_per_second": 8.488, + "step": 2000 + }, + { + "epoch": 0.890224225226729, + "eval_physics_entropy": 0.6972870816588401, + "eval_physics_loss": 0.7357947826385498, + "eval_physics_mean_token_accuracy": 0.7952804889678955, + "eval_physics_num_tokens": 13301659.0, + "eval_physics_runtime": 65.8896, + "eval_physics_samples_per_second": 7.588, + "eval_physics_steps_per_second": 7.588, + "step": 2000 + }, + { + "entropy": 0.6560850510373711, + "epoch": 0.8946753463528626, + "grad_norm": 47.0, + "learning_rate": 1.7757777777777777e-05, + "loss": 10.655, + "mean_token_accuracy": 0.8084572266787291, + "num_tokens": 13368880.0, + "step": 2010 + }, + { + "entropy": 0.7044681856408715, + "epoch": 0.8991264674789963, + "grad_norm": 51.75, + "learning_rate": 1.7735555555555558e-05, + "loss": 11.2687, + "mean_token_accuracy": 0.8017475705593824, + "num_tokens": 13434364.0, + "step": 2020 + }, + { + "entropy": 0.6643494661897421, + "epoch": 0.9035775886051299, + "grad_norm": 46.5, + "learning_rate": 1.7713333333333335e-05, + "loss": 10.8443, + "mean_token_accuracy": 0.8093426391482353, + "num_tokens": 13499871.0, + "step": 2030 + }, + { + "entropy": 0.6345715833827853, + "epoch": 0.9080287097312636, + "grad_norm": 43.75, + "learning_rate": 1.7691111111111113e-05, + "loss": 10.2789, + "mean_token_accuracy": 0.817326781526208, + "num_tokens": 13570986.0, + "step": 2040 + }, + { + "entropy": 0.6034220636822283, + "epoch": 0.9124798308573973, + "grad_norm": 42.5, + "learning_rate": 1.766888888888889e-05, + "loss": 9.6924, + "mean_token_accuracy": 0.8247087761759758, + "num_tokens": 13639400.0, + "step": 2050 + }, + { + "entropy": 0.7053790994919836, + "epoch": 0.9169309519835308, + "grad_norm": 44.0, + "learning_rate": 1.7646666666666668e-05, + "loss": 11.4691, + "mean_token_accuracy": 0.8013266988098622, + "num_tokens": 13706343.0, + "step": 2060 + }, + { + "entropy": 0.7296744499355554, + "epoch": 0.9213820731096645, + "grad_norm": 50.5, + "learning_rate": 1.7624444444444446e-05, + "loss": 11.7297, + "mean_token_accuracy": 0.7966894403100013, + "num_tokens": 13772145.0, + "step": 2070 + }, + { + "entropy": 0.6874000053852797, + "epoch": 0.9258331942357981, + "grad_norm": 39.25, + "learning_rate": 1.7602222222222223e-05, + "loss": 11.2303, + "mean_token_accuracy": 0.8028161101043224, + "num_tokens": 13833901.0, + "step": 2080 + }, + { + "entropy": 0.6875790210440755, + "epoch": 0.9302843153619318, + "grad_norm": 56.5, + "learning_rate": 1.758e-05, + "loss": 11.0777, + "mean_token_accuracy": 0.803833607584238, + "num_tokens": 13890922.0, + "step": 2090 + }, + { + "entropy": 0.6549088167957962, + "epoch": 0.9347354364880655, + "grad_norm": 49.5, + "learning_rate": 1.755777777777778e-05, + "loss": 10.731, + "mean_token_accuracy": 0.8080365601927042, + "num_tokens": 13954258.0, + "step": 2100 + }, + { + "epoch": 0.9347354364880655, + "eval_biology_entropy": 1.2299529240131377, + "eval_biology_loss": 1.3241015672683716, + "eval_biology_mean_token_accuracy": 0.6739259087443352, + "eval_biology_num_tokens": 13954258.0, + "eval_biology_runtime": 49.606, + "eval_biology_samples_per_second": 10.079, + "eval_biology_steps_per_second": 10.079, + "step": 2100 + }, + { + "epoch": 0.9347354364880655, + "eval_chemistry_entropy": 0.6708778413832188, + "eval_chemistry_loss": 0.686123788356781, + "eval_chemistry_mean_token_accuracy": 0.8042934964895249, + "eval_chemistry_num_tokens": 13954258.0, + "eval_chemistry_runtime": 57.3379, + "eval_chemistry_samples_per_second": 8.72, + "eval_chemistry_steps_per_second": 8.72, + "step": 2100 + }, + { + "epoch": 0.9347354364880655, + "eval_math_entropy": 0.6867099131047726, + "eval_math_loss": 0.8771218061447144, + "eval_math_mean_token_accuracy": 0.7741661154031754, + "eval_math_num_tokens": 13954258.0, + "eval_math_runtime": 58.9912, + "eval_math_samples_per_second": 8.476, + "eval_math_steps_per_second": 8.476, + "step": 2100 + }, + { + "epoch": 0.9347354364880655, + "eval_physics_entropy": 0.7040616157650947, + "eval_physics_loss": 0.7341793179512024, + "eval_physics_mean_token_accuracy": 0.7954913071393966, + "eval_physics_num_tokens": 13954258.0, + "eval_physics_runtime": 67.9163, + "eval_physics_samples_per_second": 7.362, + "eval_physics_steps_per_second": 7.362, + "step": 2100 + }, + { + "entropy": 0.6549541524611413, + "epoch": 0.9391865576141991, + "grad_norm": 48.5, + "learning_rate": 1.7535555555555556e-05, + "loss": 10.3866, + "mean_token_accuracy": 0.8154989194124938, + "num_tokens": 14024422.0, + "step": 2110 + }, + { + "entropy": 0.6461752643808722, + "epoch": 0.9436376787403328, + "grad_norm": 52.75, + "learning_rate": 1.7513333333333334e-05, + "loss": 10.5039, + "mean_token_accuracy": 0.8102493099868298, + "num_tokens": 14093888.0, + "step": 2120 + }, + { + "entropy": 0.6772083146497607, + "epoch": 0.9480887998664663, + "grad_norm": 50.75, + "learning_rate": 1.749111111111111e-05, + "loss": 11.0251, + "mean_token_accuracy": 0.8028841782361269, + "num_tokens": 14160983.0, + "step": 2130 + }, + { + "entropy": 0.6704742998816073, + "epoch": 0.9525399209926, + "grad_norm": 50.75, + "learning_rate": 1.746888888888889e-05, + "loss": 10.741, + "mean_token_accuracy": 0.8090329814702273, + "num_tokens": 14222626.0, + "step": 2140 + }, + { + "entropy": 0.6773571631871164, + "epoch": 0.9569910421187336, + "grad_norm": 46.75, + "learning_rate": 1.7446666666666667e-05, + "loss": 11.0197, + "mean_token_accuracy": 0.80166573561728, + "num_tokens": 14287730.0, + "step": 2150 + }, + { + "entropy": 0.6920426723547279, + "epoch": 0.9614421632448673, + "grad_norm": 47.25, + "learning_rate": 1.7424444444444444e-05, + "loss": 11.0965, + "mean_token_accuracy": 0.8056489668786526, + "num_tokens": 14354918.0, + "step": 2160 + }, + { + "entropy": 0.6500336158089339, + "epoch": 0.965893284371001, + "grad_norm": 54.0, + "learning_rate": 1.7402222222222222e-05, + "loss": 10.5981, + "mean_token_accuracy": 0.809846056997776, + "num_tokens": 14427338.0, + "step": 2170 + }, + { + "entropy": 0.6648889834061265, + "epoch": 0.9703444054971346, + "grad_norm": 43.5, + "learning_rate": 1.7380000000000003e-05, + "loss": 10.7817, + "mean_token_accuracy": 0.805622360855341, + "num_tokens": 14493349.0, + "step": 2180 + }, + { + "entropy": 0.6773046649992466, + "epoch": 0.9747955266232682, + "grad_norm": 46.5, + "learning_rate": 1.735777777777778e-05, + "loss": 10.8144, + "mean_token_accuracy": 0.8050124768167735, + "num_tokens": 14555091.0, + "step": 2190 + }, + { + "entropy": 0.6381578643806278, + "epoch": 0.9792466477494018, + "grad_norm": 43.25, + "learning_rate": 1.7335555555555558e-05, + "loss": 10.2993, + "mean_token_accuracy": 0.8164823170751333, + "num_tokens": 14620244.0, + "step": 2200 + }, + { + "epoch": 0.9792466477494018, + "eval_biology_entropy": 1.2420614485144614, + "eval_biology_loss": 1.3244640827178955, + "eval_biology_mean_token_accuracy": 0.6752117122411728, + "eval_biology_num_tokens": 14620244.0, + "eval_biology_runtime": 49.4504, + "eval_biology_samples_per_second": 10.111, + "eval_biology_steps_per_second": 10.111, + "step": 2200 + }, + { + "epoch": 0.9792466477494018, + "eval_chemistry_entropy": 0.6719401916861534, + "eval_chemistry_loss": 0.6793832182884216, + "eval_chemistry_mean_token_accuracy": 0.8049565172195434, + "eval_chemistry_num_tokens": 14620244.0, + "eval_chemistry_runtime": 57.2054, + "eval_chemistry_samples_per_second": 8.74, + "eval_chemistry_steps_per_second": 8.74, + "step": 2200 + }, + { + "epoch": 0.9792466477494018, + "eval_math_entropy": 0.6749896105825901, + "eval_math_loss": 0.8789539933204651, + "eval_math_mean_token_accuracy": 0.7742650161385536, + "eval_math_num_tokens": 14620244.0, + "eval_math_runtime": 59.0818, + "eval_math_samples_per_second": 8.463, + "eval_math_steps_per_second": 8.463, + "step": 2200 + }, + { + "epoch": 0.9792466477494018, + "eval_physics_entropy": 0.7016914016604423, + "eval_physics_loss": 0.7301677465438843, + "eval_physics_mean_token_accuracy": 0.7967698615789414, + "eval_physics_num_tokens": 14620244.0, + "eval_physics_runtime": 65.8419, + "eval_physics_samples_per_second": 7.594, + "eval_physics_steps_per_second": 7.594, + "step": 2200 + }, + { + "entropy": 0.6407156278379261, + "epoch": 0.9836977688755355, + "grad_norm": 42.25, + "learning_rate": 1.7313333333333336e-05, + "loss": 10.2834, + "mean_token_accuracy": 0.8151691749691963, + "num_tokens": 14688111.0, + "step": 2210 + }, + { + "entropy": 0.6430140177719295, + "epoch": 0.9881488900016692, + "grad_norm": 42.25, + "learning_rate": 1.7291111111111113e-05, + "loss": 10.3826, + "mean_token_accuracy": 0.8134169917553663, + "num_tokens": 14760951.0, + "step": 2220 + }, + { + "entropy": 0.7000418317504227, + "epoch": 0.9926000111278028, + "grad_norm": 46.25, + "learning_rate": 1.726888888888889e-05, + "loss": 11.3631, + "mean_token_accuracy": 0.8008828196674586, + "num_tokens": 14823141.0, + "step": 2230 + }, + { + "entropy": 0.6652011920697987, + "epoch": 0.9970511322539365, + "grad_norm": 46.25, + "learning_rate": 1.724666666666667e-05, + "loss": 10.7503, + "mean_token_accuracy": 0.807745348662138, + "num_tokens": 14886593.0, + "step": 2240 + }, + { + "entropy": 0.616963776481616, + "epoch": 1.00133533633784, + "grad_norm": 45.5, + "learning_rate": 1.7224444444444446e-05, + "loss": 9.2513, + "mean_token_accuracy": 0.8238951962489587, + "num_tokens": 14951602.0, + "step": 2250 + }, + { + "entropy": 0.4407861434854567, + "epoch": 1.0057864574639737, + "grad_norm": 47.25, + "learning_rate": 1.7202222222222224e-05, + "loss": 7.2508, + "mean_token_accuracy": 0.8669742338359356, + "num_tokens": 15016941.0, + "step": 2260 + }, + { + "entropy": 0.44640444628894327, + "epoch": 1.0102375785901074, + "grad_norm": 45.0, + "learning_rate": 1.718e-05, + "loss": 7.1412, + "mean_token_accuracy": 0.8671533763408661, + "num_tokens": 15083858.0, + "step": 2270 + }, + { + "entropy": 0.418984080851078, + "epoch": 1.014688699716241, + "grad_norm": 42.25, + "learning_rate": 1.715777777777778e-05, + "loss": 6.6222, + "mean_token_accuracy": 0.8744408570230007, + "num_tokens": 15152865.0, + "step": 2280 + }, + { + "entropy": 0.4360108694061637, + "epoch": 1.0191398208423748, + "grad_norm": 46.0, + "learning_rate": 1.7135555555555557e-05, + "loss": 7.1434, + "mean_token_accuracy": 0.8651991344988346, + "num_tokens": 15218564.0, + "step": 2290 + }, + { + "entropy": 0.4562030893750489, + "epoch": 1.0235909419685083, + "grad_norm": 54.0, + "learning_rate": 1.7113333333333334e-05, + "loss": 7.238, + "mean_token_accuracy": 0.8644272200763226, + "num_tokens": 15281217.0, + "step": 2300 + }, + { + "epoch": 1.0235909419685083, + "eval_biology_entropy": 1.008221236884594, + "eval_biology_loss": 1.368038296699524, + "eval_biology_mean_token_accuracy": 0.6735649871826171, + "eval_biology_num_tokens": 15281217.0, + "eval_biology_runtime": 49.3422, + "eval_biology_samples_per_second": 10.133, + "eval_biology_steps_per_second": 10.133, + "step": 2300 + }, + { + "epoch": 1.0235909419685083, + "eval_chemistry_entropy": 0.5389311257600784, + "eval_chemistry_loss": 0.7006461024284363, + "eval_chemistry_mean_token_accuracy": 0.8036246404647828, + "eval_chemistry_num_tokens": 15281217.0, + "eval_chemistry_runtime": 57.146, + "eval_chemistry_samples_per_second": 8.75, + "eval_chemistry_steps_per_second": 8.75, + "step": 2300 + }, + { + "epoch": 1.0235909419685083, + "eval_math_entropy": 0.609027997970581, + "eval_math_loss": 0.8887434005737305, + "eval_math_mean_token_accuracy": 0.7749794082641601, + "eval_math_num_tokens": 15281217.0, + "eval_math_runtime": 58.9834, + "eval_math_samples_per_second": 8.477, + "eval_math_steps_per_second": 8.477, + "step": 2300 + }, + { + "epoch": 1.0235909419685083, + "eval_physics_entropy": 0.5886749680042267, + "eval_physics_loss": 0.7461948394775391, + "eval_physics_mean_token_accuracy": 0.795639050245285, + "eval_physics_num_tokens": 15281217.0, + "eval_physics_runtime": 65.8428, + "eval_physics_samples_per_second": 7.594, + "eval_physics_steps_per_second": 7.594, + "step": 2300 + }, + { + "entropy": 0.4447737319394946, + "epoch": 1.0280420630946419, + "grad_norm": 42.0, + "learning_rate": 1.7091111111111112e-05, + "loss": 7.2028, + "mean_token_accuracy": 0.8641991697251796, + "num_tokens": 15346128.0, + "step": 2310 + }, + { + "entropy": 0.4506528583355248, + "epoch": 1.0324931842207756, + "grad_norm": 40.25, + "learning_rate": 1.706888888888889e-05, + "loss": 7.4779, + "mean_token_accuracy": 0.860355892404914, + "num_tokens": 15412606.0, + "step": 2320 + }, + { + "entropy": 0.44774985490366814, + "epoch": 1.0369443053469092, + "grad_norm": 47.0, + "learning_rate": 1.704666666666667e-05, + "loss": 7.0083, + "mean_token_accuracy": 0.8667535860091448, + "num_tokens": 15484636.0, + "step": 2330 + }, + { + "entropy": 0.41303274463862183, + "epoch": 1.041395426473043, + "grad_norm": 51.0, + "learning_rate": 1.7024444444444445e-05, + "loss": 6.7097, + "mean_token_accuracy": 0.8727950230240822, + "num_tokens": 15556398.0, + "step": 2340 + }, + { + "entropy": 0.4270808838773519, + "epoch": 1.0458465475991765, + "grad_norm": 57.0, + "learning_rate": 1.7002222222222226e-05, + "loss": 6.9922, + "mean_token_accuracy": 0.8695517361164093, + "num_tokens": 15624248.0, + "step": 2350 + }, + { + "entropy": 0.4641818036325276, + "epoch": 1.0502976687253103, + "grad_norm": 45.0, + "learning_rate": 1.698e-05, + "loss": 7.3255, + "mean_token_accuracy": 0.8635765865445137, + "num_tokens": 15689184.0, + "step": 2360 + }, + { + "entropy": 0.4517363480292261, + "epoch": 1.0547487898514438, + "grad_norm": 53.25, + "learning_rate": 1.695777777777778e-05, + "loss": 7.4565, + "mean_token_accuracy": 0.8605622339993715, + "num_tokens": 15754370.0, + "step": 2370 + }, + { + "entropy": 0.4486183371394873, + "epoch": 1.0591999109775774, + "grad_norm": 46.25, + "learning_rate": 1.6935555555555555e-05, + "loss": 7.1355, + "mean_token_accuracy": 0.8669181287288665, + "num_tokens": 15824975.0, + "step": 2380 + }, + { + "entropy": 0.4484099537134171, + "epoch": 1.0636510321037111, + "grad_norm": 47.5, + "learning_rate": 1.6913333333333336e-05, + "loss": 7.3633, + "mean_token_accuracy": 0.8630270641297102, + "num_tokens": 15886149.0, + "step": 2390 + }, + { + "entropy": 0.44232180174440144, + "epoch": 1.0681021532298447, + "grad_norm": 43.0, + "learning_rate": 1.689111111111111e-05, + "loss": 7.1218, + "mean_token_accuracy": 0.8661713477224111, + "num_tokens": 15956560.0, + "step": 2400 + }, + { + "epoch": 1.0681021532298447, + "eval_biology_entropy": 1.0578035997748374, + "eval_biology_loss": 1.3536245822906494, + "eval_biology_mean_token_accuracy": 0.6739300854206085, + "eval_biology_num_tokens": 15956560.0, + "eval_biology_runtime": 49.2382, + "eval_biology_samples_per_second": 10.155, + "eval_biology_steps_per_second": 10.155, + "step": 2400 + }, + { + "epoch": 1.0681021532298447, + "eval_chemistry_entropy": 0.5646019796133042, + "eval_chemistry_loss": 0.688908576965332, + "eval_chemistry_mean_token_accuracy": 0.8057490674257278, + "eval_chemistry_num_tokens": 15956560.0, + "eval_chemistry_runtime": 57.0342, + "eval_chemistry_samples_per_second": 8.767, + "eval_chemistry_steps_per_second": 8.767, + "step": 2400 + }, + { + "epoch": 1.0681021532298447, + "eval_math_entropy": 0.6327130054831505, + "eval_math_loss": 0.8823345303535461, + "eval_math_mean_token_accuracy": 0.7737685534954071, + "eval_math_num_tokens": 15956560.0, + "eval_math_runtime": 58.8387, + "eval_math_samples_per_second": 8.498, + "eval_math_steps_per_second": 8.498, + "step": 2400 + }, + { + "epoch": 1.0681021532298447, + "eval_physics_entropy": 0.6062736743688584, + "eval_physics_loss": 0.7395496964454651, + "eval_physics_mean_token_accuracy": 0.7973635075688362, + "eval_physics_num_tokens": 15956560.0, + "eval_physics_runtime": 65.7545, + "eval_physics_samples_per_second": 7.604, + "eval_physics_steps_per_second": 7.604, + "step": 2400 + }, + { + "entropy": 0.46211078222841023, + "epoch": 1.0725532743559785, + "grad_norm": 45.5, + "learning_rate": 1.686888888888889e-05, + "loss": 7.4331, + "mean_token_accuracy": 0.8602133870124817, + "num_tokens": 16025633.0, + "step": 2410 + }, + { + "entropy": 0.4457336406223476, + "epoch": 1.077004395482112, + "grad_norm": 56.75, + "learning_rate": 1.684666666666667e-05, + "loss": 7.1661, + "mean_token_accuracy": 0.8661088775843382, + "num_tokens": 16094535.0, + "step": 2420 + }, + { + "entropy": 0.4561259967274964, + "epoch": 1.0814555166082458, + "grad_norm": 48.5, + "learning_rate": 1.6824444444444447e-05, + "loss": 7.339, + "mean_token_accuracy": 0.8607300788164138, + "num_tokens": 16159781.0, + "step": 2430 + }, + { + "entropy": 0.4475363499484956, + "epoch": 1.0859066377343793, + "grad_norm": 45.5, + "learning_rate": 1.6802222222222224e-05, + "loss": 7.2157, + "mean_token_accuracy": 0.8648816615343093, + "num_tokens": 16227695.0, + "step": 2440 + }, + { + "entropy": 0.44123266637325287, + "epoch": 1.090357758860513, + "grad_norm": 57.25, + "learning_rate": 1.6780000000000002e-05, + "loss": 7.0983, + "mean_token_accuracy": 0.8653297737240792, + "num_tokens": 16293211.0, + "step": 2450 + }, + { + "entropy": 0.44780588448047637, + "epoch": 1.0948088799866467, + "grad_norm": 48.75, + "learning_rate": 1.675777777777778e-05, + "loss": 7.0788, + "mean_token_accuracy": 0.8699019614607095, + "num_tokens": 16361111.0, + "step": 2460 + }, + { + "entropy": 0.4603701994754374, + "epoch": 1.0992600011127802, + "grad_norm": 44.0, + "learning_rate": 1.6735555555555557e-05, + "loss": 7.4261, + "mean_token_accuracy": 0.8626238785684108, + "num_tokens": 16424587.0, + "step": 2470 + }, + { + "entropy": 0.4399394809268415, + "epoch": 1.103711122238914, + "grad_norm": 38.25, + "learning_rate": 1.6713333333333335e-05, + "loss": 7.1414, + "mean_token_accuracy": 0.8677175424993038, + "num_tokens": 16495082.0, + "step": 2480 + }, + { + "entropy": 0.43159899283200504, + "epoch": 1.1081622433650475, + "grad_norm": 55.0, + "learning_rate": 1.6691111111111112e-05, + "loss": 7.0192, + "mean_token_accuracy": 0.8687852907925844, + "num_tokens": 16557596.0, + "step": 2490 + }, + { + "entropy": 0.4613020237535238, + "epoch": 1.1126133644911813, + "grad_norm": 40.75, + "learning_rate": 1.666888888888889e-05, + "loss": 7.2844, + "mean_token_accuracy": 0.8621427550911903, + "num_tokens": 16625999.0, + "step": 2500 + }, + { + "epoch": 1.1126133644911813, + "eval_biology_entropy": 1.0604398164749145, + "eval_biology_loss": 1.3490709066390991, + "eval_biology_mean_token_accuracy": 0.6743446952104568, + "eval_biology_num_tokens": 16625999.0, + "eval_biology_runtime": 49.2078, + "eval_biology_samples_per_second": 10.161, + "eval_biology_steps_per_second": 10.161, + "step": 2500 + }, + { + "epoch": 1.1126133644911813, + "eval_chemistry_entropy": 0.5752839610278606, + "eval_chemistry_loss": 0.6835747361183167, + "eval_chemistry_mean_token_accuracy": 0.8070742944478989, + "eval_chemistry_num_tokens": 16625999.0, + "eval_chemistry_runtime": 57.321, + "eval_chemistry_samples_per_second": 8.723, + "eval_chemistry_steps_per_second": 8.723, + "step": 2500 + }, + { + "epoch": 1.1126133644911813, + "eval_math_entropy": 0.6476987217962742, + "eval_math_loss": 0.8821178078651428, + "eval_math_mean_token_accuracy": 0.7737806038856506, + "eval_math_num_tokens": 16625999.0, + "eval_math_runtime": 59.0463, + "eval_math_samples_per_second": 8.468, + "eval_math_steps_per_second": 8.468, + "step": 2500 + }, + { + "epoch": 1.1126133644911813, + "eval_physics_entropy": 0.6317173306643963, + "eval_physics_loss": 0.736090898513794, + "eval_physics_mean_token_accuracy": 0.7971423674821854, + "eval_physics_num_tokens": 16625999.0, + "eval_physics_runtime": 65.9978, + "eval_physics_samples_per_second": 7.576, + "eval_physics_steps_per_second": 7.576, + "step": 2500 + }, + { + "entropy": 0.4300693230703473, + "epoch": 1.1170644856173149, + "grad_norm": 47.75, + "learning_rate": 1.6646666666666668e-05, + "loss": 7.1968, + "mean_token_accuracy": 0.8626983627676964, + "num_tokens": 16697896.0, + "step": 2510 + }, + { + "entropy": 0.5308609900064767, + "epoch": 1.1215156067434484, + "grad_norm": 38.5, + "learning_rate": 1.6624444444444445e-05, + "loss": 8.3002, + "mean_token_accuracy": 0.8506600432097912, + "num_tokens": 16763539.0, + "step": 2520 + }, + { + "entropy": 0.43189331707544626, + "epoch": 1.1259667278695822, + "grad_norm": 51.75, + "learning_rate": 1.6602222222222223e-05, + "loss": 7.1178, + "mean_token_accuracy": 0.8644890647381545, + "num_tokens": 16829702.0, + "step": 2530 + }, + { + "entropy": 0.4360230270773172, + "epoch": 1.1304178489957157, + "grad_norm": 39.25, + "learning_rate": 1.658e-05, + "loss": 6.8611, + "mean_token_accuracy": 0.8705176323652267, + "num_tokens": 16899120.0, + "step": 2540 + }, + { + "entropy": 0.42720533413812517, + "epoch": 1.1348689701218495, + "grad_norm": 44.0, + "learning_rate": 1.6557777777777778e-05, + "loss": 6.8917, + "mean_token_accuracy": 0.8710293497890234, + "num_tokens": 16964702.0, + "step": 2550 + }, + { + "entropy": 0.44110968001186845, + "epoch": 1.139320091247983, + "grad_norm": 57.0, + "learning_rate": 1.6535555555555556e-05, + "loss": 7.2176, + "mean_token_accuracy": 0.8638679474592209, + "num_tokens": 17028582.0, + "step": 2560 + }, + { + "entropy": 0.4430508129298687, + "epoch": 1.1437712123741166, + "grad_norm": 43.25, + "learning_rate": 1.6513333333333333e-05, + "loss": 6.9898, + "mean_token_accuracy": 0.8678263735026122, + "num_tokens": 17094349.0, + "step": 2570 + }, + { + "entropy": 0.4693911409936845, + "epoch": 1.1482223335002504, + "grad_norm": 57.0, + "learning_rate": 1.6491111111111114e-05, + "loss": 7.5441, + "mean_token_accuracy": 0.8590914167463779, + "num_tokens": 17161099.0, + "step": 2580 + }, + { + "entropy": 0.4485397749580443, + "epoch": 1.152673454626384, + "grad_norm": 46.75, + "learning_rate": 1.646888888888889e-05, + "loss": 7.169, + "mean_token_accuracy": 0.8659718453884124, + "num_tokens": 17226592.0, + "step": 2590 + }, + { + "entropy": 0.4522705724928528, + "epoch": 1.1571245757525177, + "grad_norm": 48.0, + "learning_rate": 1.644666666666667e-05, + "loss": 7.3441, + "mean_token_accuracy": 0.8593382403254509, + "num_tokens": 17293076.0, + "step": 2600 + }, + { + "epoch": 1.1571245757525177, + "eval_biology_entropy": 1.0611050905287265, + "eval_biology_loss": 1.355592966079712, + "eval_biology_mean_token_accuracy": 0.6738189800977706, + "eval_biology_num_tokens": 17293076.0, + "eval_biology_runtime": 49.4441, + "eval_biology_samples_per_second": 10.112, + "eval_biology_steps_per_second": 10.112, + "step": 2600 + }, + { + "epoch": 1.1571245757525177, + "eval_chemistry_entropy": 0.5737494812905788, + "eval_chemistry_loss": 0.6839864253997803, + "eval_chemistry_mean_token_accuracy": 0.8067270002365112, + "eval_chemistry_num_tokens": 17293076.0, + "eval_chemistry_runtime": 57.1981, + "eval_chemistry_samples_per_second": 8.742, + "eval_chemistry_steps_per_second": 8.742, + "step": 2600 + }, + { + "epoch": 1.1571245757525177, + "eval_math_entropy": 0.6371037636697292, + "eval_math_loss": 0.8860217332839966, + "eval_math_mean_token_accuracy": 0.7737574310302734, + "eval_math_num_tokens": 17293076.0, + "eval_math_runtime": 59.0287, + "eval_math_samples_per_second": 8.47, + "eval_math_steps_per_second": 8.47, + "step": 2600 + }, + { + "epoch": 1.1571245757525177, + "eval_physics_entropy": 0.6235325867533684, + "eval_physics_loss": 0.738845705986023, + "eval_physics_mean_token_accuracy": 0.7969796572327614, + "eval_physics_num_tokens": 17293076.0, + "eval_physics_runtime": 65.954, + "eval_physics_samples_per_second": 7.581, + "eval_physics_steps_per_second": 7.581, + "step": 2600 + }, + { + "entropy": 0.46481462884694336, + "epoch": 1.1615756968786513, + "grad_norm": 50.75, + "learning_rate": 1.6424444444444444e-05, + "loss": 7.5857, + "mean_token_accuracy": 0.8580488007515669, + "num_tokens": 17360241.0, + "step": 2610 + }, + { + "entropy": 0.508445237018168, + "epoch": 1.166026818004785, + "grad_norm": 57.75, + "learning_rate": 1.6402222222222225e-05, + "loss": 8.0312, + "mean_token_accuracy": 0.8495650395751, + "num_tokens": 17424840.0, + "step": 2620 + }, + { + "entropy": 0.44586200537160037, + "epoch": 1.1704779391309186, + "grad_norm": 49.75, + "learning_rate": 1.638e-05, + "loss": 7.2128, + "mean_token_accuracy": 0.8640360705554485, + "num_tokens": 17493054.0, + "step": 2630 + }, + { + "entropy": 0.4696395605336875, + "epoch": 1.1749290602570523, + "grad_norm": 56.5, + "learning_rate": 1.635777777777778e-05, + "loss": 7.4301, + "mean_token_accuracy": 0.8624280441552401, + "num_tokens": 17557844.0, + "step": 2640 + }, + { + "entropy": 0.46180193265900016, + "epoch": 1.179380181383186, + "grad_norm": 52.0, + "learning_rate": 1.6335555555555558e-05, + "loss": 7.487, + "mean_token_accuracy": 0.860778896510601, + "num_tokens": 17627019.0, + "step": 2650 + }, + { + "entropy": 0.48290405622683463, + "epoch": 1.1838313025093194, + "grad_norm": 42.5, + "learning_rate": 1.6313333333333335e-05, + "loss": 7.8112, + "mean_token_accuracy": 0.8533384174108505, + "num_tokens": 17692635.0, + "step": 2660 + }, + { + "entropy": 0.45608944771811366, + "epoch": 1.1882824236354532, + "grad_norm": 37.25, + "learning_rate": 1.6291111111111113e-05, + "loss": 7.2588, + "mean_token_accuracy": 0.8633730322122574, + "num_tokens": 17760815.0, + "step": 2670 + }, + { + "entropy": 0.4498800415545702, + "epoch": 1.1927335447615868, + "grad_norm": 50.75, + "learning_rate": 1.626888888888889e-05, + "loss": 7.4378, + "mean_token_accuracy": 0.8621528469026088, + "num_tokens": 17826556.0, + "step": 2680 + }, + { + "entropy": 0.4677328685298562, + "epoch": 1.1971846658877205, + "grad_norm": 46.5, + "learning_rate": 1.6246666666666668e-05, + "loss": 7.4865, + "mean_token_accuracy": 0.8602121725678444, + "num_tokens": 17890493.0, + "step": 2690 + }, + { + "entropy": 0.4351489793509245, + "epoch": 1.201635787013854, + "grad_norm": 51.0, + "learning_rate": 1.6224444444444446e-05, + "loss": 6.8985, + "mean_token_accuracy": 0.8673448100686073, + "num_tokens": 17955924.0, + "step": 2700 + }, + { + "epoch": 1.201635787013854, + "eval_biology_entropy": 1.0872275706529617, + "eval_biology_loss": 1.3522777557373047, + "eval_biology_mean_token_accuracy": 0.6728494735360145, + "eval_biology_num_tokens": 17955924.0, + "eval_biology_runtime": 49.4587, + "eval_biology_samples_per_second": 10.109, + "eval_biology_steps_per_second": 10.109, + "step": 2700 + }, + { + "epoch": 1.201635787013854, + "eval_chemistry_entropy": 0.5667885291278363, + "eval_chemistry_loss": 0.6790626049041748, + "eval_chemistry_mean_token_accuracy": 0.8083000086545944, + "eval_chemistry_num_tokens": 17955924.0, + "eval_chemistry_runtime": 57.1776, + "eval_chemistry_samples_per_second": 8.745, + "eval_chemistry_steps_per_second": 8.745, + "step": 2700 + }, + { + "epoch": 1.201635787013854, + "eval_math_entropy": 0.6420919281840325, + "eval_math_loss": 0.8805245161056519, + "eval_math_mean_token_accuracy": 0.7750180523395538, + "eval_math_num_tokens": 17955924.0, + "eval_math_runtime": 59.0158, + "eval_math_samples_per_second": 8.472, + "eval_math_steps_per_second": 8.472, + "step": 2700 + }, + { + "epoch": 1.201635787013854, + "eval_physics_entropy": 0.6224361670017242, + "eval_physics_loss": 0.7381331920623779, + "eval_physics_mean_token_accuracy": 0.797513249874115, + "eval_physics_num_tokens": 17955924.0, + "eval_physics_runtime": 65.9492, + "eval_physics_samples_per_second": 7.582, + "eval_physics_steps_per_second": 7.582, + "step": 2700 + }, + { + "entropy": 0.4229597695637494, + "epoch": 1.2060869081399876, + "grad_norm": 53.25, + "learning_rate": 1.6202222222222223e-05, + "loss": 6.7818, + "mean_token_accuracy": 0.8693600654602051, + "num_tokens": 18024547.0, + "step": 2710 + }, + { + "entropy": 0.4426684442907572, + "epoch": 1.2105380292661214, + "grad_norm": 54.75, + "learning_rate": 1.618e-05, + "loss": 7.2144, + "mean_token_accuracy": 0.8655797932296991, + "num_tokens": 18093886.0, + "step": 2720 + }, + { + "entropy": 0.4442020413465798, + "epoch": 1.214989150392255, + "grad_norm": 56.5, + "learning_rate": 1.615777777777778e-05, + "loss": 7.1444, + "mean_token_accuracy": 0.8636309675872326, + "num_tokens": 18166010.0, + "step": 2730 + }, + { + "entropy": 0.4571035811677575, + "epoch": 1.2194402715183887, + "grad_norm": 52.25, + "learning_rate": 1.6135555555555556e-05, + "loss": 7.246, + "mean_token_accuracy": 0.8641397166997195, + "num_tokens": 18236214.0, + "step": 2740 + }, + { + "entropy": 0.43060594173148276, + "epoch": 1.2238913926445223, + "grad_norm": 43.0, + "learning_rate": 1.6113333333333334e-05, + "loss": 7.045, + "mean_token_accuracy": 0.8672611359506845, + "num_tokens": 18300911.0, + "step": 2750 + }, + { + "entropy": 0.4837562100961804, + "epoch": 1.228342513770656, + "grad_norm": 56.75, + "learning_rate": 1.609111111111111e-05, + "loss": 7.7934, + "mean_token_accuracy": 0.8567057065665722, + "num_tokens": 18361378.0, + "step": 2760 + }, + { + "entropy": 0.4756465101614594, + "epoch": 1.2327936348967896, + "grad_norm": 48.75, + "learning_rate": 1.606888888888889e-05, + "loss": 7.7264, + "mean_token_accuracy": 0.8567628230899572, + "num_tokens": 18426528.0, + "step": 2770 + }, + { + "entropy": 0.41422095978632567, + "epoch": 1.2372447560229234, + "grad_norm": 51.25, + "learning_rate": 1.6046666666666667e-05, + "loss": 6.584, + "mean_token_accuracy": 0.8728754710406065, + "num_tokens": 18495853.0, + "step": 2780 + }, + { + "entropy": 0.4424589385278523, + "epoch": 1.241695877149057, + "grad_norm": 49.25, + "learning_rate": 1.6024444444444444e-05, + "loss": 7.1162, + "mean_token_accuracy": 0.8664431758224964, + "num_tokens": 18561871.0, + "step": 2790 + }, + { + "entropy": 0.4754915161058307, + "epoch": 1.2461469982751905, + "grad_norm": 51.5, + "learning_rate": 1.6002222222222222e-05, + "loss": 7.7951, + "mean_token_accuracy": 0.8558566756546497, + "num_tokens": 18632874.0, + "step": 2800 + }, + { + "epoch": 1.2461469982751905, + "eval_biology_entropy": 1.130611531972885, + "eval_biology_loss": 1.351530909538269, + "eval_biology_mean_token_accuracy": 0.6723179424405098, + "eval_biology_num_tokens": 18632874.0, + "eval_biology_runtime": 49.3883, + "eval_biology_samples_per_second": 10.124, + "eval_biology_steps_per_second": 10.124, + "step": 2800 + }, + { + "epoch": 1.2461469982751905, + "eval_chemistry_entropy": 0.5990455005764961, + "eval_chemistry_loss": 0.6771162152290344, + "eval_chemistry_mean_token_accuracy": 0.8073492966890335, + "eval_chemistry_num_tokens": 18632874.0, + "eval_chemistry_runtime": 57.202, + "eval_chemistry_samples_per_second": 8.741, + "eval_chemistry_steps_per_second": 8.741, + "step": 2800 + }, + { + "epoch": 1.2461469982751905, + "eval_math_entropy": 0.6585764658153057, + "eval_math_loss": 0.8815036416053772, + "eval_math_mean_token_accuracy": 0.774179373383522, + "eval_math_num_tokens": 18632874.0, + "eval_math_runtime": 59.0194, + "eval_math_samples_per_second": 8.472, + "eval_math_steps_per_second": 8.472, + "step": 2800 + }, + { + "epoch": 1.2461469982751905, + "eval_physics_entropy": 0.6383759444653988, + "eval_physics_loss": 0.7322890758514404, + "eval_physics_mean_token_accuracy": 0.79879817289114, + "eval_physics_num_tokens": 18632874.0, + "eval_physics_runtime": 65.8683, + "eval_physics_samples_per_second": 7.591, + "eval_physics_steps_per_second": 7.591, + "step": 2800 + }, + { + "entropy": 0.44392181765288113, + "epoch": 1.2505981194013243, + "grad_norm": 50.0, + "learning_rate": 1.5980000000000003e-05, + "loss": 7.1925, + "mean_token_accuracy": 0.8625312387943268, + "num_tokens": 18705033.0, + "step": 2810 + }, + { + "entropy": 0.49188855281099675, + "epoch": 1.2550492405274578, + "grad_norm": 42.25, + "learning_rate": 1.5957777777777777e-05, + "loss": 7.7007, + "mean_token_accuracy": 0.8566019531339407, + "num_tokens": 18775027.0, + "step": 2820 + }, + { + "entropy": 0.4274396298453212, + "epoch": 1.2595003616535916, + "grad_norm": 61.5, + "learning_rate": 1.5935555555555558e-05, + "loss": 6.9418, + "mean_token_accuracy": 0.8678123977035284, + "num_tokens": 18840707.0, + "step": 2830 + }, + { + "entropy": 0.42932912511751053, + "epoch": 1.2639514827797251, + "grad_norm": 47.5, + "learning_rate": 1.5913333333333332e-05, + "loss": 6.9849, + "mean_token_accuracy": 0.8674451969563961, + "num_tokens": 18907324.0, + "step": 2840 + }, + { + "entropy": 0.4437895940616727, + "epoch": 1.2684026039058587, + "grad_norm": 47.25, + "learning_rate": 1.5891111111111113e-05, + "loss": 7.2269, + "mean_token_accuracy": 0.8640030194073915, + "num_tokens": 18980315.0, + "step": 2850 + }, + { + "entropy": 0.4900133884511888, + "epoch": 1.2728537250319925, + "grad_norm": 46.75, + "learning_rate": 1.5868888888888888e-05, + "loss": 7.8413, + "mean_token_accuracy": 0.8534307833760977, + "num_tokens": 19047920.0, + "step": 2860 + }, + { + "entropy": 0.4579563525505364, + "epoch": 1.277304846158126, + "grad_norm": 42.75, + "learning_rate": 1.584666666666667e-05, + "loss": 7.415, + "mean_token_accuracy": 0.8628888107836247, + "num_tokens": 19116973.0, + "step": 2870 + }, + { + "entropy": 0.4479681572876871, + "epoch": 1.2817559672842598, + "grad_norm": 45.25, + "learning_rate": 1.5824444444444446e-05, + "loss": 6.9909, + "mean_token_accuracy": 0.86840063855052, + "num_tokens": 19180813.0, + "step": 2880 + }, + { + "entropy": 0.43391375951468947, + "epoch": 1.2862070884103933, + "grad_norm": 46.5, + "learning_rate": 1.5802222222222224e-05, + "loss": 7.2092, + "mean_token_accuracy": 0.8637670233845711, + "num_tokens": 19257345.0, + "step": 2890 + }, + { + "entropy": 0.4650261805392802, + "epoch": 1.2906582095365269, + "grad_norm": 53.0, + "learning_rate": 1.578e-05, + "loss": 7.4577, + "mean_token_accuracy": 0.8622964296489954, + "num_tokens": 19324641.0, + "step": 2900 + }, + { + "epoch": 1.2906582095365269, + "eval_biology_entropy": 1.0379975486993789, + "eval_biology_loss": 1.3569517135620117, + "eval_biology_mean_token_accuracy": 0.6738772737383842, + "eval_biology_num_tokens": 19324641.0, + "eval_biology_runtime": 49.3114, + "eval_biology_samples_per_second": 10.14, + "eval_biology_steps_per_second": 10.14, + "step": 2900 + }, + { + "epoch": 1.2906582095365269, + "eval_chemistry_entropy": 0.558593369692564, + "eval_chemistry_loss": 0.6772850155830383, + "eval_chemistry_mean_token_accuracy": 0.8087174137830734, + "eval_chemistry_num_tokens": 19324641.0, + "eval_chemistry_runtime": 57.0403, + "eval_chemistry_samples_per_second": 8.766, + "eval_chemistry_steps_per_second": 8.766, + "step": 2900 + }, + { + "epoch": 1.2906582095365269, + "eval_math_entropy": 0.6302942368090153, + "eval_math_loss": 0.8803268074989319, + "eval_math_mean_token_accuracy": 0.7754354075193405, + "eval_math_num_tokens": 19324641.0, + "eval_math_runtime": 58.8403, + "eval_math_samples_per_second": 8.498, + "eval_math_steps_per_second": 8.498, + "step": 2900 + }, + { + "epoch": 1.2906582095365269, + "eval_physics_entropy": 0.6058774860501289, + "eval_physics_loss": 0.7356924414634705, + "eval_physics_mean_token_accuracy": 0.7990164247751236, + "eval_physics_num_tokens": 19324641.0, + "eval_physics_runtime": 65.7933, + "eval_physics_samples_per_second": 7.6, + "eval_physics_steps_per_second": 7.6, + "step": 2900 + }, + { + "entropy": 0.4442966864909977, + "epoch": 1.2951093306626607, + "grad_norm": 51.75, + "learning_rate": 1.575777777777778e-05, + "loss": 7.1428, + "mean_token_accuracy": 0.866588406637311, + "num_tokens": 19391043.0, + "step": 2910 + }, + { + "entropy": 0.41584208998829125, + "epoch": 1.2995604517887944, + "grad_norm": 47.5, + "learning_rate": 1.5735555555555557e-05, + "loss": 6.6871, + "mean_token_accuracy": 0.8722489140927792, + "num_tokens": 19461093.0, + "step": 2920 + }, + { + "entropy": 0.41565433200448754, + "epoch": 1.304011572914928, + "grad_norm": 52.5, + "learning_rate": 1.5713333333333334e-05, + "loss": 6.8207, + "mean_token_accuracy": 0.8697727452963591, + "num_tokens": 19527736.0, + "step": 2930 + }, + { + "entropy": 0.4530805963091552, + "epoch": 1.3084626940410615, + "grad_norm": 45.75, + "learning_rate": 1.5691111111111112e-05, + "loss": 7.2089, + "mean_token_accuracy": 0.8639964748173952, + "num_tokens": 19595645.0, + "step": 2940 + }, + { + "entropy": 0.4512560985982418, + "epoch": 1.3129138151671953, + "grad_norm": 39.5, + "learning_rate": 1.5668888888888893e-05, + "loss": 7.3374, + "mean_token_accuracy": 0.8624167494475842, + "num_tokens": 19660322.0, + "step": 2950 + }, + { + "entropy": 0.47328136414289473, + "epoch": 1.3173649362933288, + "grad_norm": 51.0, + "learning_rate": 1.5646666666666667e-05, + "loss": 7.5667, + "mean_token_accuracy": 0.8593833021819591, + "num_tokens": 19725399.0, + "step": 2960 + }, + { + "entropy": 0.45125425988808276, + "epoch": 1.3218160574194626, + "grad_norm": 58.0, + "learning_rate": 1.5624444444444448e-05, + "loss": 7.3002, + "mean_token_accuracy": 0.8622968144714832, + "num_tokens": 19791328.0, + "step": 2970 + }, + { + "entropy": 0.43172686779871583, + "epoch": 1.3262671785455962, + "grad_norm": 49.25, + "learning_rate": 1.5602222222222222e-05, + "loss": 6.869, + "mean_token_accuracy": 0.869724229350686, + "num_tokens": 19857430.0, + "step": 2980 + }, + { + "entropy": 0.4200130053795874, + "epoch": 1.3307182996717297, + "grad_norm": 53.75, + "learning_rate": 1.5580000000000003e-05, + "loss": 6.7219, + "mean_token_accuracy": 0.8698449391871691, + "num_tokens": 19922864.0, + "step": 2990 + }, + { + "entropy": 0.45354582378640773, + "epoch": 1.3351694207978635, + "grad_norm": 48.75, + "learning_rate": 1.5557777777777778e-05, + "loss": 7.3908, + "mean_token_accuracy": 0.8611706964671612, + "num_tokens": 19988673.0, + "step": 3000 + }, + { + "epoch": 1.3351694207978635, + "eval_biology_entropy": 1.0367796572446824, + "eval_biology_loss": 1.3580524921417236, + "eval_biology_mean_token_accuracy": 0.6729938799142837, + "eval_biology_num_tokens": 19988673.0, + "eval_biology_runtime": 49.1633, + "eval_biology_samples_per_second": 10.17, + "eval_biology_steps_per_second": 10.17, + "step": 3000 + }, + { + "epoch": 1.3351694207978635, + "eval_chemistry_entropy": 0.5578776714205742, + "eval_chemistry_loss": 0.676606297492981, + "eval_chemistry_mean_token_accuracy": 0.8088430635929108, + "eval_chemistry_num_tokens": 19988673.0, + "eval_chemistry_runtime": 57.2977, + "eval_chemistry_samples_per_second": 8.726, + "eval_chemistry_steps_per_second": 8.726, + "step": 3000 + }, + { + "epoch": 1.3351694207978635, + "eval_math_entropy": 0.6245537775158883, + "eval_math_loss": 0.8893261551856995, + "eval_math_mean_token_accuracy": 0.7745395843982696, + "eval_math_num_tokens": 19988673.0, + "eval_math_runtime": 58.9904, + "eval_math_samples_per_second": 8.476, + "eval_math_steps_per_second": 8.476, + "step": 3000 + }, + { + "epoch": 1.3351694207978635, + "eval_physics_entropy": 0.6184971186220646, + "eval_physics_loss": 0.7320802807807922, + "eval_physics_mean_token_accuracy": 0.7992791814804077, + "eval_physics_num_tokens": 19988673.0, + "eval_physics_runtime": 65.9691, + "eval_physics_samples_per_second": 7.579, + "eval_physics_steps_per_second": 7.579, + "step": 3000 + }, + { + "entropy": 0.4390603884123266, + "epoch": 1.339620541923997, + "grad_norm": 50.5, + "learning_rate": 1.553555555555556e-05, + "loss": 7.2469, + "mean_token_accuracy": 0.8639258954674005, + "num_tokens": 20054087.0, + "step": 3010 + }, + { + "entropy": 0.44685489051043986, + "epoch": 1.3440716630501308, + "grad_norm": 51.5, + "learning_rate": 1.5513333333333333e-05, + "loss": 6.9631, + "mean_token_accuracy": 0.8673676140606403, + "num_tokens": 20117738.0, + "step": 3020 + }, + { + "entropy": 0.4509993907995522, + "epoch": 1.3485227841762644, + "grad_norm": 38.75, + "learning_rate": 1.5491111111111114e-05, + "loss": 7.4131, + "mean_token_accuracy": 0.861151909828186, + "num_tokens": 20180919.0, + "step": 3030 + }, + { + "entropy": 0.4641651880927384, + "epoch": 1.352973905302398, + "grad_norm": 53.75, + "learning_rate": 1.546888888888889e-05, + "loss": 7.2877, + "mean_token_accuracy": 0.8633020594716072, + "num_tokens": 20248890.0, + "step": 3040 + }, + { + "entropy": 0.4600936593487859, + "epoch": 1.3574250264285317, + "grad_norm": 61.75, + "learning_rate": 1.544666666666667e-05, + "loss": 7.6556, + "mean_token_accuracy": 0.858083164319396, + "num_tokens": 20312039.0, + "step": 3050 + }, + { + "entropy": 0.4317381651140749, + "epoch": 1.3618761475546655, + "grad_norm": 47.0, + "learning_rate": 1.5424444444444447e-05, + "loss": 6.8012, + "mean_token_accuracy": 0.871273398399353, + "num_tokens": 20379865.0, + "step": 3060 + }, + { + "entropy": 0.44113692445680497, + "epoch": 1.366327268680799, + "grad_norm": 46.25, + "learning_rate": 1.5402222222222224e-05, + "loss": 7.205, + "mean_token_accuracy": 0.8653013337403536, + "num_tokens": 20450098.0, + "step": 3070 + }, + { + "entropy": 0.45343676628544927, + "epoch": 1.3707783898069326, + "grad_norm": 51.25, + "learning_rate": 1.5380000000000002e-05, + "loss": 7.2772, + "mean_token_accuracy": 0.8633841559290886, + "num_tokens": 20515713.0, + "step": 3080 + }, + { + "entropy": 0.5017632408998907, + "epoch": 1.3752295109330663, + "grad_norm": 47.25, + "learning_rate": 1.535777777777778e-05, + "loss": 8.1881, + "mean_token_accuracy": 0.849872462078929, + "num_tokens": 20577338.0, + "step": 3090 + }, + { + "entropy": 0.43909767055884, + "epoch": 1.3796806320591999, + "grad_norm": 44.0, + "learning_rate": 1.5335555555555557e-05, + "loss": 6.9575, + "mean_token_accuracy": 0.8682761292904615, + "num_tokens": 20640374.0, + "step": 3100 + }, + { + "epoch": 1.3796806320591999, + "eval_biology_entropy": 1.0970244548320771, + "eval_biology_loss": 1.344295859336853, + "eval_biology_mean_token_accuracy": 0.6746094572544098, + "eval_biology_num_tokens": 20640374.0, + "eval_biology_runtime": 49.3497, + "eval_biology_samples_per_second": 10.132, + "eval_biology_steps_per_second": 10.132, + "step": 3100 + }, + { + "epoch": 1.3796806320591999, + "eval_chemistry_entropy": 0.5688635316491127, + "eval_chemistry_loss": 0.6705365180969238, + "eval_chemistry_mean_token_accuracy": 0.8096451599597931, + "eval_chemistry_num_tokens": 20640374.0, + "eval_chemistry_runtime": 57.2027, + "eval_chemistry_samples_per_second": 8.741, + "eval_chemistry_steps_per_second": 8.741, + "step": 3100 + }, + { + "epoch": 1.3796806320591999, + "eval_math_entropy": 0.6324037388563156, + "eval_math_loss": 0.88166344165802, + "eval_math_mean_token_accuracy": 0.7753028560876847, + "eval_math_num_tokens": 20640374.0, + "eval_math_runtime": 59.048, + "eval_math_samples_per_second": 8.468, + "eval_math_steps_per_second": 8.468, + "step": 3100 + }, + { + "epoch": 1.3796806320591999, + "eval_physics_entropy": 0.6208124775588513, + "eval_physics_loss": 0.7312092781066895, + "eval_physics_mean_token_accuracy": 0.7999175149798393, + "eval_physics_num_tokens": 20640374.0, + "eval_physics_runtime": 65.9619, + "eval_physics_samples_per_second": 7.58, + "eval_physics_steps_per_second": 7.58, + "step": 3100 + }, + { + "entropy": 0.4564077785238624, + "epoch": 1.3841317531853337, + "grad_norm": 48.25, + "learning_rate": 1.5313333333333335e-05, + "loss": 7.3493, + "mean_token_accuracy": 0.8634169407188892, + "num_tokens": 20701599.0, + "step": 3110 + }, + { + "entropy": 0.45823419177904723, + "epoch": 1.3885828743114672, + "grad_norm": 49.75, + "learning_rate": 1.5291111111111112e-05, + "loss": 7.4618, + "mean_token_accuracy": 0.8625175200402737, + "num_tokens": 20769790.0, + "step": 3120 + }, + { + "entropy": 0.4572055713273585, + "epoch": 1.3930339954376008, + "grad_norm": 38.25, + "learning_rate": 1.526888888888889e-05, + "loss": 7.3255, + "mean_token_accuracy": 0.8633602283895015, + "num_tokens": 20837901.0, + "step": 3130 + }, + { + "entropy": 0.4625745963305235, + "epoch": 1.3974851165637345, + "grad_norm": 50.0, + "learning_rate": 1.5246666666666668e-05, + "loss": 7.5241, + "mean_token_accuracy": 0.8608714625239372, + "num_tokens": 20900772.0, + "step": 3140 + }, + { + "entropy": 0.4768506956286728, + "epoch": 1.401936237689868, + "grad_norm": 49.0, + "learning_rate": 1.5224444444444447e-05, + "loss": 7.678, + "mean_token_accuracy": 0.8553702417761088, + "num_tokens": 20964368.0, + "step": 3150 + }, + { + "entropy": 0.4599393155425787, + "epoch": 1.4063873588160019, + "grad_norm": 42.75, + "learning_rate": 1.5202222222222223e-05, + "loss": 7.4504, + "mean_token_accuracy": 0.8620400402694941, + "num_tokens": 21032357.0, + "step": 3160 + }, + { + "entropy": 0.4775816229172051, + "epoch": 1.4108384799421354, + "grad_norm": 48.0, + "learning_rate": 1.5180000000000002e-05, + "loss": 7.7545, + "mean_token_accuracy": 0.8548209808766842, + "num_tokens": 21097884.0, + "step": 3170 + }, + { + "entropy": 0.4543197107501328, + "epoch": 1.415289601068269, + "grad_norm": 54.0, + "learning_rate": 1.5157777777777778e-05, + "loss": 7.3566, + "mean_token_accuracy": 0.8643424637615681, + "num_tokens": 21162515.0, + "step": 3180 + }, + { + "entropy": 0.4249213400296867, + "epoch": 1.4197407221944027, + "grad_norm": 59.0, + "learning_rate": 1.5135555555555557e-05, + "loss": 6.9137, + "mean_token_accuracy": 0.8678638692945242, + "num_tokens": 21230408.0, + "step": 3190 + }, + { + "entropy": 0.47805986092425884, + "epoch": 1.4241918433205363, + "grad_norm": 48.5, + "learning_rate": 1.5113333333333335e-05, + "loss": 7.5895, + "mean_token_accuracy": 0.8564241603016853, + "num_tokens": 21299050.0, + "step": 3200 + }, + { + "epoch": 1.4241918433205363, + "eval_biology_entropy": 1.121574101269245, + "eval_biology_loss": 1.3434149026870728, + "eval_biology_mean_token_accuracy": 0.6744625233411788, + "eval_biology_num_tokens": 21299050.0, + "eval_biology_runtime": 49.2984, + "eval_biology_samples_per_second": 10.142, + "eval_biology_steps_per_second": 10.142, + "step": 3200 + }, + { + "epoch": 1.4241918433205363, + "eval_chemistry_entropy": 0.5754444530308247, + "eval_chemistry_loss": 0.6670053601264954, + "eval_chemistry_mean_token_accuracy": 0.8101741118431092, + "eval_chemistry_num_tokens": 21299050.0, + "eval_chemistry_runtime": 57.1135, + "eval_chemistry_samples_per_second": 8.754, + "eval_chemistry_steps_per_second": 8.754, + "step": 3200 + }, + { + "epoch": 1.4241918433205363, + "eval_math_entropy": 0.6417503325939179, + "eval_math_loss": 0.8890306353569031, + "eval_math_mean_token_accuracy": 0.7736804057359695, + "eval_math_num_tokens": 21299050.0, + "eval_math_runtime": 58.9702, + "eval_math_samples_per_second": 8.479, + "eval_math_steps_per_second": 8.479, + "step": 3200 + }, + { + "epoch": 1.4241918433205363, + "eval_physics_entropy": 0.6271489924490452, + "eval_physics_loss": 0.7273263931274414, + "eval_physics_mean_token_accuracy": 0.7991984650492668, + "eval_physics_num_tokens": 21299050.0, + "eval_physics_runtime": 65.9545, + "eval_physics_samples_per_second": 7.581, + "eval_physics_steps_per_second": 7.581, + "step": 3200 + }, + { + "entropy": 0.4390040896832943, + "epoch": 1.42864296444667, + "grad_norm": 60.0, + "learning_rate": 1.5091111111111113e-05, + "loss": 7.0589, + "mean_token_accuracy": 0.8643136177212, + "num_tokens": 21366237.0, + "step": 3210 + }, + { + "entropy": 0.4670333099551499, + "epoch": 1.4330940855728036, + "grad_norm": 41.25, + "learning_rate": 1.506888888888889e-05, + "loss": 7.5381, + "mean_token_accuracy": 0.8599710401147604, + "num_tokens": 21436423.0, + "step": 3220 + }, + { + "entropy": 0.4512263203039765, + "epoch": 1.4375452066989374, + "grad_norm": 44.5, + "learning_rate": 1.5046666666666668e-05, + "loss": 7.2272, + "mean_token_accuracy": 0.8637539122253657, + "num_tokens": 21504658.0, + "step": 3230 + }, + { + "entropy": 0.43940321505069735, + "epoch": 1.441996327825071, + "grad_norm": 47.75, + "learning_rate": 1.5024444444444445e-05, + "loss": 7.2682, + "mean_token_accuracy": 0.8627861823886633, + "num_tokens": 21569748.0, + "step": 3240 + }, + { + "entropy": 0.4890416743699461, + "epoch": 1.4464474489512047, + "grad_norm": 48.0, + "learning_rate": 1.5002222222222223e-05, + "loss": 7.6296, + "mean_token_accuracy": 0.8581738580018282, + "num_tokens": 21630649.0, + "step": 3250 + }, + { + "entropy": 0.460271833371371, + "epoch": 1.4508985700773382, + "grad_norm": 58.5, + "learning_rate": 1.498e-05, + "loss": 7.5093, + "mean_token_accuracy": 0.8598960891366005, + "num_tokens": 21694065.0, + "step": 3260 + }, + { + "entropy": 0.43111922573298217, + "epoch": 1.4553496912034718, + "grad_norm": 48.5, + "learning_rate": 1.495777777777778e-05, + "loss": 7.1384, + "mean_token_accuracy": 0.8690983727574348, + "num_tokens": 21758494.0, + "step": 3270 + }, + { + "entropy": 0.46071252766996623, + "epoch": 1.4598008123296056, + "grad_norm": 61.25, + "learning_rate": 1.4935555555555556e-05, + "loss": 7.1884, + "mean_token_accuracy": 0.8645727276802063, + "num_tokens": 21822217.0, + "step": 3280 + }, + { + "entropy": 0.42421281514689324, + "epoch": 1.4642519334557391, + "grad_norm": 60.5, + "learning_rate": 1.4913333333333335e-05, + "loss": 7.0755, + "mean_token_accuracy": 0.8672281835228205, + "num_tokens": 21887417.0, + "step": 3290 + }, + { + "entropy": 0.44167528823018076, + "epoch": 1.468703054581873, + "grad_norm": 44.0, + "learning_rate": 1.4891111111111111e-05, + "loss": 7.0082, + "mean_token_accuracy": 0.8677619837224484, + "num_tokens": 21954784.0, + "step": 3300 + }, + { + "epoch": 1.468703054581873, + "eval_biology_entropy": 1.087524308115244, + "eval_biology_loss": 1.3437690734863281, + "eval_biology_mean_token_accuracy": 0.6751979528069496, + "eval_biology_num_tokens": 21954784.0, + "eval_biology_runtime": 49.2492, + "eval_biology_samples_per_second": 10.152, + "eval_biology_steps_per_second": 10.152, + "step": 3300 + }, + { + "epoch": 1.468703054581873, + "eval_chemistry_entropy": 0.5626610437631607, + "eval_chemistry_loss": 0.6688293814659119, + "eval_chemistry_mean_token_accuracy": 0.8103534108400345, + "eval_chemistry_num_tokens": 21954784.0, + "eval_chemistry_runtime": 57.0658, + "eval_chemistry_samples_per_second": 8.762, + "eval_chemistry_steps_per_second": 8.762, + "step": 3300 + }, + { + "epoch": 1.468703054581873, + "eval_math_entropy": 0.634619223922491, + "eval_math_loss": 0.8863106369972229, + "eval_math_mean_token_accuracy": 0.7748042297363281, + "eval_math_num_tokens": 21954784.0, + "eval_math_runtime": 58.9702, + "eval_math_samples_per_second": 8.479, + "eval_math_steps_per_second": 8.479, + "step": 3300 + }, + { + "epoch": 1.468703054581873, + "eval_physics_entropy": 0.6217128495275974, + "eval_physics_loss": 0.7265288233757019, + "eval_physics_mean_token_accuracy": 0.7997407680749893, + "eval_physics_num_tokens": 21954784.0, + "eval_physics_runtime": 65.942, + "eval_physics_samples_per_second": 7.582, + "eval_physics_steps_per_second": 7.582, + "step": 3300 + }, + { + "entropy": 0.46098333611153064, + "epoch": 1.4731541757080064, + "grad_norm": 48.0, + "learning_rate": 1.486888888888889e-05, + "loss": 7.5837, + "mean_token_accuracy": 0.8587041411548852, + "num_tokens": 22022454.0, + "step": 3310 + }, + { + "entropy": 0.46257833037525414, + "epoch": 1.47760529683414, + "grad_norm": 43.0, + "learning_rate": 1.4846666666666666e-05, + "loss": 7.2984, + "mean_token_accuracy": 0.863133342191577, + "num_tokens": 22090334.0, + "step": 3320 + }, + { + "entropy": 0.47077880781143905, + "epoch": 1.4820564179602738, + "grad_norm": 55.0, + "learning_rate": 1.4824444444444446e-05, + "loss": 7.7317, + "mean_token_accuracy": 0.8551351387053728, + "num_tokens": 22157686.0, + "step": 3330 + }, + { + "entropy": 0.4685546739026904, + "epoch": 1.4865075390864073, + "grad_norm": 48.25, + "learning_rate": 1.4802222222222225e-05, + "loss": 7.4103, + "mean_token_accuracy": 0.8611365500837564, + "num_tokens": 22223473.0, + "step": 3340 + }, + { + "entropy": 0.43017310397699476, + "epoch": 1.490958660212541, + "grad_norm": 45.75, + "learning_rate": 1.478e-05, + "loss": 7.0414, + "mean_token_accuracy": 0.8673346545547247, + "num_tokens": 22289965.0, + "step": 3350 + }, + { + "entropy": 0.4838591874577105, + "epoch": 1.4954097813386746, + "grad_norm": 51.5, + "learning_rate": 1.475777777777778e-05, + "loss": 7.6861, + "mean_token_accuracy": 0.8565518528223037, + "num_tokens": 22357538.0, + "step": 3360 + }, + { + "entropy": 0.46223199758678674, + "epoch": 1.4998609024648082, + "grad_norm": 52.75, + "learning_rate": 1.4735555555555556e-05, + "loss": 7.5521, + "mean_token_accuracy": 0.8585102248936891, + "num_tokens": 22425667.0, + "step": 3370 + }, + { + "entropy": 0.46012824261561036, + "epoch": 1.504312023590942, + "grad_norm": 39.75, + "learning_rate": 1.4713333333333335e-05, + "loss": 7.3463, + "mean_token_accuracy": 0.8621176246553659, + "num_tokens": 22493127.0, + "step": 3380 + }, + { + "entropy": 0.4519760748371482, + "epoch": 1.5087631447170757, + "grad_norm": 54.75, + "learning_rate": 1.4691111111111111e-05, + "loss": 7.4064, + "mean_token_accuracy": 0.8610552717000246, + "num_tokens": 22556146.0, + "step": 3390 + }, + { + "entropy": 0.4374730283860117, + "epoch": 1.5132142658432093, + "grad_norm": 64.0, + "learning_rate": 1.466888888888889e-05, + "loss": 7.0335, + "mean_token_accuracy": 0.8672068931162358, + "num_tokens": 22620726.0, + "step": 3400 + }, + { + "epoch": 1.5132142658432093, + "eval_biology_entropy": 1.0820362836122512, + "eval_biology_loss": 1.3420637845993042, + "eval_biology_mean_token_accuracy": 0.6756702532172203, + "eval_biology_num_tokens": 22620726.0, + "eval_biology_runtime": 49.1528, + "eval_biology_samples_per_second": 10.172, + "eval_biology_steps_per_second": 10.172, + "step": 3400 + }, + { + "epoch": 1.5132142658432093, + "eval_chemistry_entropy": 0.5567150608897209, + "eval_chemistry_loss": 0.664932370185852, + "eval_chemistry_mean_token_accuracy": 0.8107878102064132, + "eval_chemistry_num_tokens": 22620726.0, + "eval_chemistry_runtime": 56.9927, + "eval_chemistry_samples_per_second": 8.773, + "eval_chemistry_steps_per_second": 8.773, + "step": 3400 + }, + { + "epoch": 1.5132142658432093, + "eval_math_entropy": 0.6528812423944473, + "eval_math_loss": 0.8748553395271301, + "eval_math_mean_token_accuracy": 0.7760986275672913, + "eval_math_num_tokens": 22620726.0, + "eval_math_runtime": 58.8248, + "eval_math_samples_per_second": 8.5, + "eval_math_steps_per_second": 8.5, + "step": 3400 + }, + { + "epoch": 1.5132142658432093, + "eval_physics_entropy": 0.6191621573865413, + "eval_physics_loss": 0.7281864285469055, + "eval_physics_mean_token_accuracy": 0.7994781628251075, + "eval_physics_num_tokens": 22620726.0, + "eval_physics_runtime": 65.7555, + "eval_physics_samples_per_second": 7.604, + "eval_physics_steps_per_second": 7.604, + "step": 3400 + }, + { + "entropy": 0.45250841323286295, + "epoch": 1.5176653869693428, + "grad_norm": 40.25, + "learning_rate": 1.4646666666666666e-05, + "loss": 7.4106, + "mean_token_accuracy": 0.8627288337796927, + "num_tokens": 22692897.0, + "step": 3410 + }, + { + "entropy": 0.44010208174586296, + "epoch": 1.5221165080954764, + "grad_norm": 52.75, + "learning_rate": 1.4624444444444446e-05, + "loss": 7.0083, + "mean_token_accuracy": 0.867039955034852, + "num_tokens": 22757148.0, + "step": 3420 + }, + { + "entropy": 0.4613129423931241, + "epoch": 1.5265676292216102, + "grad_norm": 50.0, + "learning_rate": 1.4602222222222225e-05, + "loss": 7.5173, + "mean_token_accuracy": 0.8584544464945794, + "num_tokens": 22821150.0, + "step": 3430 + }, + { + "entropy": 0.4295078462921083, + "epoch": 1.531018750347744, + "grad_norm": 46.0, + "learning_rate": 1.4580000000000001e-05, + "loss": 6.839, + "mean_token_accuracy": 0.8710437446832657, + "num_tokens": 22895186.0, + "step": 3440 + }, + { + "entropy": 0.4099935442209244, + "epoch": 1.5354698714738775, + "grad_norm": 55.75, + "learning_rate": 1.455777777777778e-05, + "loss": 6.5808, + "mean_token_accuracy": 0.8745924565941096, + "num_tokens": 22963108.0, + "step": 3450 + }, + { + "entropy": 0.4301177678629756, + "epoch": 1.539920992600011, + "grad_norm": 43.5, + "learning_rate": 1.4535555555555556e-05, + "loss": 6.9409, + "mean_token_accuracy": 0.8693449333310127, + "num_tokens": 23020006.0, + "step": 3460 + }, + { + "entropy": 0.4379517641849816, + "epoch": 1.5443721137261448, + "grad_norm": 51.0, + "learning_rate": 1.4513333333333336e-05, + "loss": 7.0743, + "mean_token_accuracy": 0.8658050034195185, + "num_tokens": 23084950.0, + "step": 3470 + }, + { + "entropy": 0.4456146590411663, + "epoch": 1.5488232348522786, + "grad_norm": 48.0, + "learning_rate": 1.4491111111111111e-05, + "loss": 7.098, + "mean_token_accuracy": 0.865884667634964, + "num_tokens": 23145883.0, + "step": 3480 + }, + { + "entropy": 0.4186052186880261, + "epoch": 1.5532743559784121, + "grad_norm": 49.25, + "learning_rate": 1.446888888888889e-05, + "loss": 6.9524, + "mean_token_accuracy": 0.870098390430212, + "num_tokens": 23208622.0, + "step": 3490 + }, + { + "entropy": 0.44615869587287305, + "epoch": 1.5577254771045457, + "grad_norm": 57.5, + "learning_rate": 1.4446666666666668e-05, + "loss": 7.0196, + "mean_token_accuracy": 0.8686704605817794, + "num_tokens": 23274053.0, + "step": 3500 + }, + { + "epoch": 1.5577254771045457, + "eval_biology_entropy": 1.0597223438620567, + "eval_biology_loss": 1.349181056022644, + "eval_biology_mean_token_accuracy": 0.6754039505720139, + "eval_biology_num_tokens": 23274053.0, + "eval_biology_runtime": 49.4416, + "eval_biology_samples_per_second": 10.113, + "eval_biology_steps_per_second": 10.113, + "step": 3500 + }, + { + "epoch": 1.5577254771045457, + "eval_chemistry_entropy": 0.5542172521352768, + "eval_chemistry_loss": 0.6662623286247253, + "eval_chemistry_mean_token_accuracy": 0.8111978172063827, + "eval_chemistry_num_tokens": 23274053.0, + "eval_chemistry_runtime": 57.4648, + "eval_chemistry_samples_per_second": 8.701, + "eval_chemistry_steps_per_second": 8.701, + "step": 3500 + }, + { + "epoch": 1.5577254771045457, + "eval_math_entropy": 0.6495824805498123, + "eval_math_loss": 0.8766149878501892, + "eval_math_mean_token_accuracy": 0.7756204127073288, + "eval_math_num_tokens": 23274053.0, + "eval_math_runtime": 59.0092, + "eval_math_samples_per_second": 8.473, + "eval_math_steps_per_second": 8.473, + "step": 3500 + }, + { + "epoch": 1.5577254771045457, + "eval_physics_entropy": 0.6131590060293675, + "eval_physics_loss": 0.7274580597877502, + "eval_physics_mean_token_accuracy": 0.7998755808472633, + "eval_physics_num_tokens": 23274053.0, + "eval_physics_runtime": 65.8851, + "eval_physics_samples_per_second": 7.589, + "eval_physics_steps_per_second": 7.589, + "step": 3500 + }, + { + "entropy": 0.4539166674949229, + "epoch": 1.5621765982306792, + "grad_norm": 52.0, + "learning_rate": 1.4424444444444446e-05, + "loss": 7.5275, + "mean_token_accuracy": 0.8589690785855055, + "num_tokens": 23341632.0, + "step": 3510 + }, + { + "entropy": 0.4353353668469936, + "epoch": 1.566627719356813, + "grad_norm": 48.0, + "learning_rate": 1.4402222222222224e-05, + "loss": 6.7517, + "mean_token_accuracy": 0.8713302955031395, + "num_tokens": 23410761.0, + "step": 3520 + }, + { + "entropy": 0.43715719487518073, + "epoch": 1.5710788404829468, + "grad_norm": 68.5, + "learning_rate": 1.4380000000000001e-05, + "loss": 7.1149, + "mean_token_accuracy": 0.8645731471478939, + "num_tokens": 23474045.0, + "step": 3530 + }, + { + "entropy": 0.45210328227840363, + "epoch": 1.5755299616090803, + "grad_norm": 46.25, + "learning_rate": 1.4357777777777779e-05, + "loss": 7.3584, + "mean_token_accuracy": 0.862363849952817, + "num_tokens": 23541078.0, + "step": 3540 + }, + { + "entropy": 0.44945355141535404, + "epoch": 1.5799810827352139, + "grad_norm": 58.75, + "learning_rate": 1.4335555555555556e-05, + "loss": 7.266, + "mean_token_accuracy": 0.8623629752546549, + "num_tokens": 23609956.0, + "step": 3550 + }, + { + "entropy": 0.45151492971926926, + "epoch": 1.5844322038613474, + "grad_norm": 40.5, + "learning_rate": 1.4313333333333334e-05, + "loss": 7.2954, + "mean_token_accuracy": 0.8639614138752222, + "num_tokens": 23679603.0, + "step": 3560 + }, + { + "entropy": 0.47792316814884545, + "epoch": 1.5888833249874812, + "grad_norm": 44.5, + "learning_rate": 1.4291111111111113e-05, + "loss": 7.6957, + "mean_token_accuracy": 0.8571067977696657, + "num_tokens": 23744554.0, + "step": 3570 + }, + { + "entropy": 0.41747193234041335, + "epoch": 1.593334446113615, + "grad_norm": 44.25, + "learning_rate": 1.426888888888889e-05, + "loss": 6.7004, + "mean_token_accuracy": 0.8705373715609313, + "num_tokens": 23813464.0, + "step": 3580 + }, + { + "entropy": 0.4723031269852072, + "epoch": 1.5977855672397485, + "grad_norm": 54.0, + "learning_rate": 1.4246666666666669e-05, + "loss": 7.6821, + "mean_token_accuracy": 0.858668788895011, + "num_tokens": 23875868.0, + "step": 3590 + }, + { + "entropy": 0.46631492786109446, + "epoch": 1.602236688365882, + "grad_norm": 51.25, + "learning_rate": 1.4224444444444445e-05, + "loss": 7.3527, + "mean_token_accuracy": 0.8617794819176197, + "num_tokens": 23940396.0, + "step": 3600 + }, + { + "epoch": 1.602236688365882, + "eval_biology_entropy": 1.0909454934895038, + "eval_biology_loss": 1.3392319679260254, + "eval_biology_mean_token_accuracy": 0.6753941929340362, + "eval_biology_num_tokens": 23940396.0, + "eval_biology_runtime": 49.3998, + "eval_biology_samples_per_second": 10.122, + "eval_biology_steps_per_second": 10.122, + "step": 3600 + }, + { + "epoch": 1.602236688365882, + "eval_chemistry_entropy": 0.561623074799776, + "eval_chemistry_loss": 0.6604354381561279, + "eval_chemistry_mean_token_accuracy": 0.8123695262670517, + "eval_chemistry_num_tokens": 23940396.0, + "eval_chemistry_runtime": 57.1407, + "eval_chemistry_samples_per_second": 8.75, + "eval_chemistry_steps_per_second": 8.75, + "step": 3600 + }, + { + "epoch": 1.602236688365882, + "eval_math_entropy": 0.633789344638586, + "eval_math_loss": 0.881838858127594, + "eval_math_mean_token_accuracy": 0.7748601027727127, + "eval_math_num_tokens": 23940396.0, + "eval_math_runtime": 58.9729, + "eval_math_samples_per_second": 8.478, + "eval_math_steps_per_second": 8.478, + "step": 3600 + }, + { + "epoch": 1.602236688365882, + "eval_physics_entropy": 0.6112656152546406, + "eval_physics_loss": 0.7239125370979309, + "eval_physics_mean_token_accuracy": 0.8010450528264046, + "eval_physics_num_tokens": 23940396.0, + "eval_physics_runtime": 65.8568, + "eval_physics_samples_per_second": 7.592, + "eval_physics_steps_per_second": 7.592, + "step": 3600 + }, + { + "entropy": 0.43275422137230635, + "epoch": 1.6066878094920158, + "grad_norm": 46.0, + "learning_rate": 1.4202222222222224e-05, + "loss": 7.1698, + "mean_token_accuracy": 0.8656113926321268, + "num_tokens": 24005993.0, + "step": 3610 + }, + { + "entropy": 0.411208738386631, + "epoch": 1.6111389306181496, + "grad_norm": 44.0, + "learning_rate": 1.418e-05, + "loss": 6.5681, + "mean_token_accuracy": 0.8749731313437223, + "num_tokens": 24076192.0, + "step": 3620 + }, + { + "entropy": 0.43890127297490833, + "epoch": 1.6155900517442832, + "grad_norm": 47.0, + "learning_rate": 1.4157777777777779e-05, + "loss": 7.0493, + "mean_token_accuracy": 0.8670356266200543, + "num_tokens": 24147204.0, + "step": 3630 + }, + { + "entropy": 0.43870157171040775, + "epoch": 1.6200411728704167, + "grad_norm": 54.5, + "learning_rate": 1.4135555555555555e-05, + "loss": 7.1209, + "mean_token_accuracy": 0.8640866741538048, + "num_tokens": 24215449.0, + "step": 3640 + }, + { + "entropy": 0.4449730785563588, + "epoch": 1.6244922939965503, + "grad_norm": 63.0, + "learning_rate": 1.4113333333333334e-05, + "loss": 7.1004, + "mean_token_accuracy": 0.8666038513183594, + "num_tokens": 24278738.0, + "step": 3650 + }, + { + "entropy": 0.4421264092437923, + "epoch": 1.628943415122684, + "grad_norm": 48.0, + "learning_rate": 1.4091111111111114e-05, + "loss": 7.3031, + "mean_token_accuracy": 0.8612215649336576, + "num_tokens": 24343555.0, + "step": 3660 + }, + { + "entropy": 0.4467802966013551, + "epoch": 1.6333945362488178, + "grad_norm": 53.25, + "learning_rate": 1.406888888888889e-05, + "loss": 7.0218, + "mean_token_accuracy": 0.86554811373353, + "num_tokens": 24407294.0, + "step": 3670 + }, + { + "entropy": 0.44207426472567024, + "epoch": 1.6378456573749514, + "grad_norm": 46.0, + "learning_rate": 1.4046666666666669e-05, + "loss": 7.1055, + "mean_token_accuracy": 0.867503160983324, + "num_tokens": 24468795.0, + "step": 3680 + }, + { + "entropy": 0.41677775532007216, + "epoch": 1.642296778501085, + "grad_norm": 50.5, + "learning_rate": 1.4024444444444445e-05, + "loss": 6.8026, + "mean_token_accuracy": 0.8700127672404051, + "num_tokens": 24537515.0, + "step": 3690 + }, + { + "entropy": 0.450100605096668, + "epoch": 1.6467478996272185, + "grad_norm": 47.5, + "learning_rate": 1.4002222222222224e-05, + "loss": 7.1443, + "mean_token_accuracy": 0.8666038464754819, + "num_tokens": 24604118.0, + "step": 3700 + }, + { + "epoch": 1.6467478996272185, + "eval_biology_entropy": 1.1004000494480133, + "eval_biology_loss": 1.3429887294769287, + "eval_biology_mean_token_accuracy": 0.6752147419452668, + "eval_biology_num_tokens": 24604118.0, + "eval_biology_runtime": 49.4546, + "eval_biology_samples_per_second": 10.11, + "eval_biology_steps_per_second": 10.11, + "step": 3700 + }, + { + "epoch": 1.6467478996272185, + "eval_chemistry_entropy": 0.5701398371756077, + "eval_chemistry_loss": 0.6584111452102661, + "eval_chemistry_mean_token_accuracy": 0.8127154748439789, + "eval_chemistry_num_tokens": 24604118.0, + "eval_chemistry_runtime": 57.1903, + "eval_chemistry_samples_per_second": 8.743, + "eval_chemistry_steps_per_second": 8.743, + "step": 3700 + }, + { + "epoch": 1.6467478996272185, + "eval_math_entropy": 0.6637147405147552, + "eval_math_loss": 0.8762027621269226, + "eval_math_mean_token_accuracy": 0.7752429494857788, + "eval_math_num_tokens": 24604118.0, + "eval_math_runtime": 58.9266, + "eval_math_samples_per_second": 8.485, + "eval_math_steps_per_second": 8.485, + "step": 3700 + }, + { + "epoch": 1.6467478996272185, + "eval_physics_entropy": 0.6301841248869896, + "eval_physics_loss": 0.7230107188224792, + "eval_physics_mean_token_accuracy": 0.8007421780228615, + "eval_physics_num_tokens": 24604118.0, + "eval_physics_runtime": 65.826, + "eval_physics_samples_per_second": 7.596, + "eval_physics_steps_per_second": 7.596, + "step": 3700 + }, + { + "entropy": 0.40455705067142844, + "epoch": 1.6511990207533522, + "grad_norm": 53.0, + "learning_rate": 1.398e-05, + "loss": 6.5556, + "mean_token_accuracy": 0.8739966202527285, + "num_tokens": 24672806.0, + "step": 3710 + }, + { + "entropy": 0.415603784378618, + "epoch": 1.655650141879486, + "grad_norm": 41.75, + "learning_rate": 1.395777777777778e-05, + "loss": 6.7722, + "mean_token_accuracy": 0.8737395346164704, + "num_tokens": 24741914.0, + "step": 3720 + }, + { + "entropy": 0.4317036953754723, + "epoch": 1.6601012630056196, + "grad_norm": 46.0, + "learning_rate": 1.3935555555555557e-05, + "loss": 6.8955, + "mean_token_accuracy": 0.8708718940615654, + "num_tokens": 24809566.0, + "step": 3730 + }, + { + "entropy": 0.4604647663421929, + "epoch": 1.664552384131753, + "grad_norm": 46.75, + "learning_rate": 1.3913333333333335e-05, + "loss": 7.3296, + "mean_token_accuracy": 0.8622036151587963, + "num_tokens": 24876091.0, + "step": 3740 + }, + { + "entropy": 0.43308454575017097, + "epoch": 1.6690035052578869, + "grad_norm": 46.75, + "learning_rate": 1.3891111111111114e-05, + "loss": 7.0219, + "mean_token_accuracy": 0.8661958049982786, + "num_tokens": 24940642.0, + "step": 3750 + }, + { + "entropy": 0.42334359381347897, + "epoch": 1.6734546263840204, + "grad_norm": 49.5, + "learning_rate": 1.386888888888889e-05, + "loss": 6.8346, + "mean_token_accuracy": 0.8696243058890104, + "num_tokens": 25009086.0, + "step": 3760 + }, + { + "entropy": 0.48545486414805056, + "epoch": 1.6779057475101542, + "grad_norm": 41.25, + "learning_rate": 1.3846666666666669e-05, + "loss": 7.8143, + "mean_token_accuracy": 0.8556718792766332, + "num_tokens": 25075466.0, + "step": 3770 + }, + { + "entropy": 0.4933716295287013, + "epoch": 1.6823568686362878, + "grad_norm": 51.5, + "learning_rate": 1.3824444444444445e-05, + "loss": 8.0215, + "mean_token_accuracy": 0.8518200509250164, + "num_tokens": 25140113.0, + "step": 3780 + }, + { + "entropy": 0.4256497736554593, + "epoch": 1.6868079897624213, + "grad_norm": 55.5, + "learning_rate": 1.3802222222222224e-05, + "loss": 6.6601, + "mean_token_accuracy": 0.8717565663158894, + "num_tokens": 25204256.0, + "step": 3790 + }, + { + "entropy": 0.4225488518364727, + "epoch": 1.691259110888555, + "grad_norm": 48.75, + "learning_rate": 1.378e-05, + "loss": 6.9692, + "mean_token_accuracy": 0.8682578578591347, + "num_tokens": 25273806.0, + "step": 3800 + }, + { + "epoch": 1.691259110888555, + "eval_biology_entropy": 1.105823097229004, + "eval_biology_loss": 1.3366401195526123, + "eval_biology_mean_token_accuracy": 0.6761100863814354, + "eval_biology_num_tokens": 25273806.0, + "eval_biology_runtime": 49.3211, + "eval_biology_samples_per_second": 10.138, + "eval_biology_steps_per_second": 10.138, + "step": 3800 + }, + { + "epoch": 1.691259110888555, + "eval_chemistry_entropy": 0.5636824496090412, + "eval_chemistry_loss": 0.657598614692688, + "eval_chemistry_mean_token_accuracy": 0.8127364320755005, + "eval_chemistry_num_tokens": 25273806.0, + "eval_chemistry_runtime": 57.1246, + "eval_chemistry_samples_per_second": 8.753, + "eval_chemistry_steps_per_second": 8.753, + "step": 3800 + }, + { + "epoch": 1.691259110888555, + "eval_math_entropy": 0.6347656193673611, + "eval_math_loss": 0.8819147944450378, + "eval_math_mean_token_accuracy": 0.7749658533334732, + "eval_math_num_tokens": 25273806.0, + "eval_math_runtime": 58.937, + "eval_math_samples_per_second": 8.484, + "eval_math_steps_per_second": 8.484, + "step": 3800 + }, + { + "epoch": 1.691259110888555, + "eval_physics_entropy": 0.6153148064613342, + "eval_physics_loss": 0.7215734720230103, + "eval_physics_mean_token_accuracy": 0.8008127774000168, + "eval_physics_num_tokens": 25273806.0, + "eval_physics_runtime": 65.8943, + "eval_physics_samples_per_second": 7.588, + "eval_physics_steps_per_second": 7.588, + "step": 3800 + }, + { + "entropy": 0.4421769532375038, + "epoch": 1.6957102320146888, + "grad_norm": 47.0, + "learning_rate": 1.375777777777778e-05, + "loss": 6.9012, + "mean_token_accuracy": 0.8678170140832663, + "num_tokens": 25339482.0, + "step": 3810 + }, + { + "entropy": 0.44144861390814183, + "epoch": 1.7001613531408224, + "grad_norm": 46.25, + "learning_rate": 1.3735555555555557e-05, + "loss": 7.2175, + "mean_token_accuracy": 0.8648808564990759, + "num_tokens": 25408287.0, + "step": 3820 + }, + { + "entropy": 0.449921297095716, + "epoch": 1.704612474266956, + "grad_norm": 49.5, + "learning_rate": 1.3713333333333335e-05, + "loss": 7.2069, + "mean_token_accuracy": 0.8668343242257833, + "num_tokens": 25468817.0, + "step": 3830 + }, + { + "entropy": 0.43548805005848407, + "epoch": 1.7090635953930895, + "grad_norm": 51.25, + "learning_rate": 1.3691111111111112e-05, + "loss": 7.1884, + "mean_token_accuracy": 0.8652145210653543, + "num_tokens": 25532863.0, + "step": 3840 + }, + { + "entropy": 0.46991006275638936, + "epoch": 1.7135147165192233, + "grad_norm": 56.75, + "learning_rate": 1.366888888888889e-05, + "loss": 7.5559, + "mean_token_accuracy": 0.8593213982880116, + "num_tokens": 25599547.0, + "step": 3850 + }, + { + "entropy": 0.45514998873695733, + "epoch": 1.717965837645357, + "grad_norm": 55.25, + "learning_rate": 1.3646666666666668e-05, + "loss": 7.4007, + "mean_token_accuracy": 0.8621774800121784, + "num_tokens": 25664044.0, + "step": 3860 + }, + { + "entropy": 0.4406496840529144, + "epoch": 1.7224169587714906, + "grad_norm": 48.5, + "learning_rate": 1.3624444444444445e-05, + "loss": 7.0221, + "mean_token_accuracy": 0.8669323220849037, + "num_tokens": 25729236.0, + "step": 3870 + }, + { + "entropy": 0.44532603975385426, + "epoch": 1.7268680798976241, + "grad_norm": 47.0, + "learning_rate": 1.3602222222222223e-05, + "loss": 7.2687, + "mean_token_accuracy": 0.8641342371702194, + "num_tokens": 25795212.0, + "step": 3880 + }, + { + "entropy": 0.46422761185094713, + "epoch": 1.731319201023758, + "grad_norm": 52.25, + "learning_rate": 1.3580000000000002e-05, + "loss": 7.4198, + "mean_token_accuracy": 0.8605056870728731, + "num_tokens": 25862976.0, + "step": 3890 + }, + { + "entropy": 0.42144476203247905, + "epoch": 1.7357703221498915, + "grad_norm": 48.75, + "learning_rate": 1.3557777777777778e-05, + "loss": 6.8112, + "mean_token_accuracy": 0.86972092166543, + "num_tokens": 25932472.0, + "step": 3900 + }, + { + "epoch": 1.7357703221498915, + "eval_biology_entropy": 1.0987317026257515, + "eval_biology_loss": 1.3376166820526123, + "eval_biology_mean_token_accuracy": 0.6762716981172562, + "eval_biology_num_tokens": 25932472.0, + "eval_biology_runtime": 49.2272, + "eval_biology_samples_per_second": 10.157, + "eval_biology_steps_per_second": 10.157, + "step": 3900 + }, + { + "epoch": 1.7357703221498915, + "eval_chemistry_entropy": 0.5616450413167476, + "eval_chemistry_loss": 0.6555033326148987, + "eval_chemistry_mean_token_accuracy": 0.8131518055200577, + "eval_chemistry_num_tokens": 25932472.0, + "eval_chemistry_runtime": 57.0658, + "eval_chemistry_samples_per_second": 8.762, + "eval_chemistry_steps_per_second": 8.762, + "step": 3900 + }, + { + "epoch": 1.7357703221498915, + "eval_math_entropy": 0.6449382023215294, + "eval_math_loss": 0.8736368417739868, + "eval_math_mean_token_accuracy": 0.776440544128418, + "eval_math_num_tokens": 25932472.0, + "eval_math_runtime": 58.9466, + "eval_math_samples_per_second": 8.482, + "eval_math_steps_per_second": 8.482, + "step": 3900 + }, + { + "epoch": 1.7357703221498915, + "eval_physics_entropy": 0.6214759688675403, + "eval_physics_loss": 0.7196757793426514, + "eval_physics_mean_token_accuracy": 0.80060239726305, + "eval_physics_num_tokens": 25932472.0, + "eval_physics_runtime": 65.848, + "eval_physics_samples_per_second": 7.593, + "eval_physics_steps_per_second": 7.593, + "step": 3900 + }, + { + "entropy": 0.44556578346528114, + "epoch": 1.7402214432760252, + "grad_norm": 68.5, + "learning_rate": 1.3535555555555557e-05, + "loss": 7.1872, + "mean_token_accuracy": 0.8666312236338853, + "num_tokens": 26000782.0, + "step": 3910 + }, + { + "entropy": 0.40507533866912127, + "epoch": 1.7446725644021588, + "grad_norm": 41.75, + "learning_rate": 1.3513333333333333e-05, + "loss": 6.4373, + "mean_token_accuracy": 0.8755326095968485, + "num_tokens": 26066017.0, + "step": 3920 + }, + { + "entropy": 0.4400949960574508, + "epoch": 1.7491236855282923, + "grad_norm": 60.0, + "learning_rate": 1.3491111111111113e-05, + "loss": 7.1952, + "mean_token_accuracy": 0.8644619658589363, + "num_tokens": 26135900.0, + "step": 3930 + }, + { + "entropy": 0.43258765107020736, + "epoch": 1.753574806654426, + "grad_norm": 47.75, + "learning_rate": 1.3468888888888888e-05, + "loss": 6.8319, + "mean_token_accuracy": 0.8694869946688414, + "num_tokens": 26202278.0, + "step": 3940 + }, + { + "entropy": 0.44916954915970564, + "epoch": 1.7580259277805599, + "grad_norm": 42.75, + "learning_rate": 1.3446666666666668e-05, + "loss": 7.2249, + "mean_token_accuracy": 0.8641918078064919, + "num_tokens": 26267299.0, + "step": 3950 + }, + { + "entropy": 0.3934618907049298, + "epoch": 1.7624770489066934, + "grad_norm": 43.0, + "learning_rate": 1.3424444444444447e-05, + "loss": 6.3593, + "mean_token_accuracy": 0.8768466982990504, + "num_tokens": 26335089.0, + "step": 3960 + }, + { + "entropy": 0.43172463523224, + "epoch": 1.766928170032827, + "grad_norm": 51.0, + "learning_rate": 1.3402222222222223e-05, + "loss": 7.0227, + "mean_token_accuracy": 0.8651743248105049, + "num_tokens": 26401675.0, + "step": 3970 + }, + { + "entropy": 0.4284566377289593, + "epoch": 1.7713792911589605, + "grad_norm": 44.5, + "learning_rate": 1.3380000000000002e-05, + "loss": 6.9603, + "mean_token_accuracy": 0.8693511422723532, + "num_tokens": 26468664.0, + "step": 3980 + }, + { + "entropy": 0.41403404623270035, + "epoch": 1.7758304122850943, + "grad_norm": 59.25, + "learning_rate": 1.3357777777777778e-05, + "loss": 6.6291, + "mean_token_accuracy": 0.8749957829713821, + "num_tokens": 26533513.0, + "step": 3990 + }, + { + "entropy": 0.4295045448467135, + "epoch": 1.780281533411228, + "grad_norm": 40.75, + "learning_rate": 1.3335555555555558e-05, + "loss": 6.9156, + "mean_token_accuracy": 0.8678974959999323, + "num_tokens": 26599191.0, + "step": 4000 + }, + { + "epoch": 1.780281533411228, + "eval_biology_entropy": 1.09288672375679, + "eval_biology_loss": 1.3388818502426147, + "eval_biology_mean_token_accuracy": 0.6759003084897995, + "eval_biology_num_tokens": 26599191.0, + "eval_biology_runtime": 49.5834, + "eval_biology_samples_per_second": 10.084, + "eval_biology_steps_per_second": 10.084, + "step": 4000 + }, + { + "epoch": 1.780281533411228, + "eval_chemistry_entropy": 0.5640489151477813, + "eval_chemistry_loss": 0.6533928513526917, + "eval_chemistry_mean_token_accuracy": 0.8133883930444717, + "eval_chemistry_num_tokens": 26599191.0, + "eval_chemistry_runtime": 57.2336, + "eval_chemistry_samples_per_second": 8.736, + "eval_chemistry_steps_per_second": 8.736, + "step": 4000 + }, + { + "epoch": 1.780281533411228, + "eval_math_entropy": 0.6384000761508941, + "eval_math_loss": 0.8826572299003601, + "eval_math_mean_token_accuracy": 0.7747851693630219, + "eval_math_num_tokens": 26599191.0, + "eval_math_runtime": 59.06, + "eval_math_samples_per_second": 8.466, + "eval_math_steps_per_second": 8.466, + "step": 4000 + }, + { + "epoch": 1.780281533411228, + "eval_physics_entropy": 0.6182807017564773, + "eval_physics_loss": 0.7193024754524231, + "eval_physics_mean_token_accuracy": 0.8011252279281617, + "eval_physics_num_tokens": 26599191.0, + "eval_physics_runtime": 66.0007, + "eval_physics_samples_per_second": 7.576, + "eval_physics_steps_per_second": 7.576, + "step": 4000 + }, + { + "entropy": 0.4455654317513108, + "epoch": 1.7847326545373616, + "grad_norm": 50.0, + "learning_rate": 1.3313333333333333e-05, + "loss": 7.1189, + "mean_token_accuracy": 0.8670381877571345, + "num_tokens": 26662061.0, + "step": 4010 + }, + { + "entropy": 0.42714932542294265, + "epoch": 1.7891837756634952, + "grad_norm": 49.5, + "learning_rate": 1.3291111111111113e-05, + "loss": 7.1059, + "mean_token_accuracy": 0.865508059039712, + "num_tokens": 26728401.0, + "step": 4020 + }, + { + "entropy": 0.4686863644979894, + "epoch": 1.7936348967896287, + "grad_norm": 53.25, + "learning_rate": 1.3268888888888889e-05, + "loss": 7.453, + "mean_token_accuracy": 0.8607470963150263, + "num_tokens": 26795849.0, + "step": 4030 + }, + { + "entropy": 0.44081603535450997, + "epoch": 1.7980860179157625, + "grad_norm": 56.75, + "learning_rate": 1.3246666666666668e-05, + "loss": 7.094, + "mean_token_accuracy": 0.8661636434495449, + "num_tokens": 26863285.0, + "step": 4040 + }, + { + "entropy": 0.44031122056767347, + "epoch": 1.8025371390418963, + "grad_norm": 50.75, + "learning_rate": 1.3224444444444446e-05, + "loss": 7.0613, + "mean_token_accuracy": 0.8678509399294854, + "num_tokens": 26928387.0, + "step": 4050 + }, + { + "entropy": 0.4223677352070808, + "epoch": 1.8069882601680298, + "grad_norm": 44.75, + "learning_rate": 1.3202222222222223e-05, + "loss": 6.8073, + "mean_token_accuracy": 0.8709012702107429, + "num_tokens": 26993073.0, + "step": 4060 + }, + { + "entropy": 0.4358017733320594, + "epoch": 1.8114393812941634, + "grad_norm": 46.75, + "learning_rate": 1.3180000000000001e-05, + "loss": 7.0844, + "mean_token_accuracy": 0.8656642254441976, + "num_tokens": 27060885.0, + "step": 4070 + }, + { + "entropy": 0.48112288750708104, + "epoch": 1.8158905024202971, + "grad_norm": 51.75, + "learning_rate": 1.3157777777777778e-05, + "loss": 7.5967, + "mean_token_accuracy": 0.8570263028144837, + "num_tokens": 27128899.0, + "step": 4080 + }, + { + "entropy": 0.44178040148690345, + "epoch": 1.820341623546431, + "grad_norm": 45.25, + "learning_rate": 1.3135555555555558e-05, + "loss": 7.2925, + "mean_token_accuracy": 0.865721695870161, + "num_tokens": 27194121.0, + "step": 4090 + }, + { + "entropy": 0.43020233828574417, + "epoch": 1.8247927446725645, + "grad_norm": 52.0, + "learning_rate": 1.3113333333333334e-05, + "loss": 6.908, + "mean_token_accuracy": 0.8722872164100408, + "num_tokens": 27262707.0, + "step": 4100 + }, + { + "epoch": 1.8247927446725645, + "eval_biology_entropy": 1.0930933583974838, + "eval_biology_loss": 1.3323205709457397, + "eval_biology_mean_token_accuracy": 0.6766302047371864, + "eval_biology_num_tokens": 27262707.0, + "eval_biology_runtime": 49.6487, + "eval_biology_samples_per_second": 10.071, + "eval_biology_steps_per_second": 10.071, + "step": 4100 + }, + { + "epoch": 1.8247927446725645, + "eval_chemistry_entropy": 0.5624111870229245, + "eval_chemistry_loss": 0.6492710113525391, + "eval_chemistry_mean_token_accuracy": 0.813504365324974, + "eval_chemistry_num_tokens": 27262707.0, + "eval_chemistry_runtime": 57.4944, + "eval_chemistry_samples_per_second": 8.697, + "eval_chemistry_steps_per_second": 8.697, + "step": 4100 + }, + { + "epoch": 1.8247927446725645, + "eval_math_entropy": 0.6331602872908115, + "eval_math_loss": 0.8782398104667664, + "eval_math_mean_token_accuracy": 0.7759555462598801, + "eval_math_num_tokens": 27262707.0, + "eval_math_runtime": 59.0012, + "eval_math_samples_per_second": 8.474, + "eval_math_steps_per_second": 8.474, + "step": 4100 + }, + { + "epoch": 1.8247927446725645, + "eval_physics_entropy": 0.6101286012232303, + "eval_physics_loss": 0.7175766825675964, + "eval_physics_mean_token_accuracy": 0.802008852660656, + "eval_physics_num_tokens": 27262707.0, + "eval_physics_runtime": 65.9383, + "eval_physics_samples_per_second": 7.583, + "eval_physics_steps_per_second": 7.583, + "step": 4100 + }, + { + "entropy": 0.4529592189006507, + "epoch": 1.829243865798698, + "grad_norm": 55.0, + "learning_rate": 1.3091111111111113e-05, + "loss": 7.2454, + "mean_token_accuracy": 0.8652162194252014, + "num_tokens": 27330038.0, + "step": 4110 + }, + { + "entropy": 0.3973214037716389, + "epoch": 1.8336949869248316, + "grad_norm": 49.75, + "learning_rate": 1.306888888888889e-05, + "loss": 6.376, + "mean_token_accuracy": 0.8772102475166321, + "num_tokens": 27397555.0, + "step": 4120 + }, + { + "entropy": 0.44568534260615705, + "epoch": 1.8381461080509653, + "grad_norm": 48.5, + "learning_rate": 1.3046666666666668e-05, + "loss": 7.2989, + "mean_token_accuracy": 0.864243046939373, + "num_tokens": 27459777.0, + "step": 4130 + }, + { + "entropy": 0.4502247759606689, + "epoch": 1.8425972291770991, + "grad_norm": 46.5, + "learning_rate": 1.3024444444444446e-05, + "loss": 7.1174, + "mean_token_accuracy": 0.8657589253038168, + "num_tokens": 27526783.0, + "step": 4140 + }, + { + "entropy": 0.44870172487571836, + "epoch": 1.8470483503032327, + "grad_norm": 47.75, + "learning_rate": 1.3002222222222223e-05, + "loss": 7.3172, + "mean_token_accuracy": 0.8628721322864294, + "num_tokens": 27589476.0, + "step": 4150 + }, + { + "entropy": 0.4119010436348617, + "epoch": 1.8514994714293662, + "grad_norm": 45.0, + "learning_rate": 1.2980000000000001e-05, + "loss": 6.5902, + "mean_token_accuracy": 0.8741217296570539, + "num_tokens": 27663920.0, + "step": 4160 + }, + { + "entropy": 0.43196423682384194, + "epoch": 1.8559505925554998, + "grad_norm": 44.75, + "learning_rate": 1.2957777777777779e-05, + "loss": 6.9974, + "mean_token_accuracy": 0.869346110522747, + "num_tokens": 27732593.0, + "step": 4170 + }, + { + "entropy": 0.4634049771353602, + "epoch": 1.8604017136816335, + "grad_norm": 48.75, + "learning_rate": 1.2935555555555556e-05, + "loss": 7.554, + "mean_token_accuracy": 0.860065794736147, + "num_tokens": 27796282.0, + "step": 4180 + }, + { + "entropy": 0.41463155914098027, + "epoch": 1.8648528348077673, + "grad_norm": 42.0, + "learning_rate": 1.2913333333333336e-05, + "loss": 6.717, + "mean_token_accuracy": 0.8722549807280302, + "num_tokens": 27865723.0, + "step": 4190 + }, + { + "entropy": 0.44301305185072126, + "epoch": 1.8693039559339009, + "grad_norm": 53.25, + "learning_rate": 1.2891111111111112e-05, + "loss": 7.0622, + "mean_token_accuracy": 0.8678176179528236, + "num_tokens": 27934203.0, + "step": 4200 + }, + { + "epoch": 1.8693039559339009, + "eval_biology_entropy": 1.0975995972454549, + "eval_biology_loss": 1.3328328132629395, + "eval_biology_mean_token_accuracy": 0.6766401550769806, + "eval_biology_num_tokens": 27934203.0, + "eval_biology_runtime": 49.3821, + "eval_biology_samples_per_second": 10.125, + "eval_biology_steps_per_second": 10.125, + "step": 4200 + }, + { + "epoch": 1.8693039559339009, + "eval_chemistry_entropy": 0.5558113542199135, + "eval_chemistry_loss": 0.6481916308403015, + "eval_chemistry_mean_token_accuracy": 0.8148145881891251, + "eval_chemistry_num_tokens": 27934203.0, + "eval_chemistry_runtime": 57.1399, + "eval_chemistry_samples_per_second": 8.75, + "eval_chemistry_steps_per_second": 8.75, + "step": 4200 + }, + { + "epoch": 1.8693039559339009, + "eval_math_entropy": 0.6300465607643128, + "eval_math_loss": 0.8733142614364624, + "eval_math_mean_token_accuracy": 0.7774354614019394, + "eval_math_num_tokens": 27934203.0, + "eval_math_runtime": 58.9318, + "eval_math_samples_per_second": 8.484, + "eval_math_steps_per_second": 8.484, + "step": 4200 + }, + { + "epoch": 1.8693039559339009, + "eval_physics_entropy": 0.616854034692049, + "eval_physics_loss": 0.7168420553207397, + "eval_physics_mean_token_accuracy": 0.8017676858305931, + "eval_physics_num_tokens": 27934203.0, + "eval_physics_runtime": 65.9196, + "eval_physics_samples_per_second": 7.585, + "eval_physics_steps_per_second": 7.585, + "step": 4200 + }, + { + "entropy": 0.4502222963608801, + "epoch": 1.8737550770600344, + "grad_norm": 44.75, + "learning_rate": 1.2868888888888891e-05, + "loss": 7.2244, + "mean_token_accuracy": 0.8673382241278886, + "num_tokens": 27997740.0, + "step": 4210 + }, + { + "entropy": 0.4199290908873081, + "epoch": 1.8782061981861682, + "grad_norm": 41.5, + "learning_rate": 1.2846666666666667e-05, + "loss": 6.8114, + "mean_token_accuracy": 0.8723087403923273, + "num_tokens": 28066779.0, + "step": 4220 + }, + { + "entropy": 0.45287727387622, + "epoch": 1.8826573193123017, + "grad_norm": 51.0, + "learning_rate": 1.2824444444444446e-05, + "loss": 7.1901, + "mean_token_accuracy": 0.8633170835673809, + "num_tokens": 28130442.0, + "step": 4230 + }, + { + "entropy": 0.4136773616075516, + "epoch": 1.8871084404384355, + "grad_norm": 54.0, + "learning_rate": 1.2802222222222222e-05, + "loss": 6.7887, + "mean_token_accuracy": 0.870445353537798, + "num_tokens": 28198525.0, + "step": 4240 + }, + { + "entropy": 0.461670626886189, + "epoch": 1.891559561564569, + "grad_norm": 49.75, + "learning_rate": 1.2780000000000001e-05, + "loss": 7.545, + "mean_token_accuracy": 0.8588891945779323, + "num_tokens": 28264658.0, + "step": 4250 + }, + { + "entropy": 0.42892079018056395, + "epoch": 1.8960106826907026, + "grad_norm": 47.25, + "learning_rate": 1.2757777777777777e-05, + "loss": 6.5925, + "mean_token_accuracy": 0.874987380951643, + "num_tokens": 28330085.0, + "step": 4260 + }, + { + "entropy": 0.43946673478931186, + "epoch": 1.9004618038168364, + "grad_norm": 47.5, + "learning_rate": 1.2735555555555557e-05, + "loss": 7.458, + "mean_token_accuracy": 0.86157296448946, + "num_tokens": 28394063.0, + "step": 4270 + }, + { + "entropy": 0.43590318793430927, + "epoch": 1.9049129249429702, + "grad_norm": 50.25, + "learning_rate": 1.2713333333333336e-05, + "loss": 6.8379, + "mean_token_accuracy": 0.8706722978502512, + "num_tokens": 28463459.0, + "step": 4280 + }, + { + "entropy": 0.42629417181015017, + "epoch": 1.9093640460691037, + "grad_norm": 39.5, + "learning_rate": 1.2691111111111112e-05, + "loss": 6.8626, + "mean_token_accuracy": 0.87035539932549, + "num_tokens": 28535715.0, + "step": 4290 + }, + { + "entropy": 0.43369567352347077, + "epoch": 1.9138151671952373, + "grad_norm": 65.0, + "learning_rate": 1.2668888888888891e-05, + "loss": 6.9519, + "mean_token_accuracy": 0.869885740429163, + "num_tokens": 28597936.0, + "step": 4300 + }, + { + "epoch": 1.9138151671952373, + "eval_biology_entropy": 1.0911401687264441, + "eval_biology_loss": 1.336218237876892, + "eval_biology_mean_token_accuracy": 0.6764044223427772, + "eval_biology_num_tokens": 28597936.0, + "eval_biology_runtime": 49.2631, + "eval_biology_samples_per_second": 10.15, + "eval_biology_steps_per_second": 10.15, + "step": 4300 + }, + { + "epoch": 1.9138151671952373, + "eval_chemistry_entropy": 0.5497490223050118, + "eval_chemistry_loss": 0.6479013562202454, + "eval_chemistry_mean_token_accuracy": 0.8146616325378419, + "eval_chemistry_num_tokens": 28597936.0, + "eval_chemistry_runtime": 57.129, + "eval_chemistry_samples_per_second": 8.752, + "eval_chemistry_steps_per_second": 8.752, + "step": 4300 + }, + { + "epoch": 1.9138151671952373, + "eval_math_entropy": 0.6199660334885121, + "eval_math_loss": 0.8770778775215149, + "eval_math_mean_token_accuracy": 0.7772095297574997, + "eval_math_num_tokens": 28597936.0, + "eval_math_runtime": 58.9734, + "eval_math_samples_per_second": 8.478, + "eval_math_steps_per_second": 8.478, + "step": 4300 + }, + { + "epoch": 1.9138151671952373, + "eval_physics_entropy": 0.607051764011383, + "eval_physics_loss": 0.7166539430618286, + "eval_physics_mean_token_accuracy": 0.8021855513453484, + "eval_physics_num_tokens": 28597936.0, + "eval_physics_runtime": 65.8705, + "eval_physics_samples_per_second": 7.591, + "eval_physics_steps_per_second": 7.591, + "step": 4300 + }, + { + "entropy": 0.4368853101972491, + "epoch": 1.9182662883213708, + "grad_norm": 43.5, + "learning_rate": 1.2646666666666667e-05, + "loss": 7.1696, + "mean_token_accuracy": 0.8649825479835271, + "num_tokens": 28663350.0, + "step": 4310 + }, + { + "entropy": 0.46570923617109655, + "epoch": 1.9227174094475046, + "grad_norm": 58.25, + "learning_rate": 1.2624444444444446e-05, + "loss": 7.3494, + "mean_token_accuracy": 0.8613995850086212, + "num_tokens": 28729242.0, + "step": 4320 + }, + { + "entropy": 0.43485080236569046, + "epoch": 1.9271685305736383, + "grad_norm": 50.5, + "learning_rate": 1.2602222222222222e-05, + "loss": 7.1815, + "mean_token_accuracy": 0.8654624871909619, + "num_tokens": 28795007.0, + "step": 4330 + }, + { + "entropy": 0.4194305106531829, + "epoch": 1.931619651699772, + "grad_norm": 50.5, + "learning_rate": 1.2580000000000002e-05, + "loss": 6.7272, + "mean_token_accuracy": 0.8717147808521986, + "num_tokens": 28858616.0, + "step": 4340 + }, + { + "entropy": 0.4218201217241585, + "epoch": 1.9360707728259055, + "grad_norm": 56.5, + "learning_rate": 1.2557777777777779e-05, + "loss": 6.8258, + "mean_token_accuracy": 0.8718546938151122, + "num_tokens": 28922780.0, + "step": 4350 + }, + { + "entropy": 0.44504558080807327, + "epoch": 1.9405218939520392, + "grad_norm": 58.0, + "learning_rate": 1.2535555555555557e-05, + "loss": 7.1731, + "mean_token_accuracy": 0.8650299612432718, + "num_tokens": 28984371.0, + "step": 4360 + }, + { + "entropy": 0.4435701411217451, + "epoch": 1.9449730150781728, + "grad_norm": 50.5, + "learning_rate": 1.2513333333333334e-05, + "loss": 7.1749, + "mean_token_accuracy": 0.8663116294890643, + "num_tokens": 29047768.0, + "step": 4370 + }, + { + "entropy": 0.46321847471408545, + "epoch": 1.9494241362043065, + "grad_norm": 47.0, + "learning_rate": 1.2491111111111112e-05, + "loss": 7.3453, + "mean_token_accuracy": 0.8616611532866955, + "num_tokens": 29109707.0, + "step": 4380 + }, + { + "entropy": 0.42653880920261145, + "epoch": 1.95387525733044, + "grad_norm": 46.75, + "learning_rate": 1.246888888888889e-05, + "loss": 6.927, + "mean_token_accuracy": 0.8675042137503624, + "num_tokens": 29176703.0, + "step": 4390 + }, + { + "entropy": 0.41801422322168946, + "epoch": 1.9583263784565736, + "grad_norm": 47.75, + "learning_rate": 1.2446666666666667e-05, + "loss": 6.7931, + "mean_token_accuracy": 0.8719802606850863, + "num_tokens": 29242605.0, + "step": 4400 + }, + { + "epoch": 1.9583263784565736, + "eval_biology_entropy": 1.1034128618240358, + "eval_biology_loss": 1.3319411277770996, + "eval_biology_mean_token_accuracy": 0.6773669331669807, + "eval_biology_num_tokens": 29242605.0, + "eval_biology_runtime": 49.1144, + "eval_biology_samples_per_second": 10.18, + "eval_biology_steps_per_second": 10.18, + "step": 4400 + }, + { + "epoch": 1.9583263784565736, + "eval_chemistry_entropy": 0.5503038721978665, + "eval_chemistry_loss": 0.6460519433021545, + "eval_chemistry_mean_token_accuracy": 0.8153215191364288, + "eval_chemistry_num_tokens": 29242605.0, + "eval_chemistry_runtime": 57.0082, + "eval_chemistry_samples_per_second": 8.771, + "eval_chemistry_steps_per_second": 8.771, + "step": 4400 + }, + { + "epoch": 1.9583263784565736, + "eval_math_entropy": 0.6240969137847423, + "eval_math_loss": 0.8772608637809753, + "eval_math_mean_token_accuracy": 0.7771303664445877, + "eval_math_num_tokens": 29242605.0, + "eval_math_runtime": 58.9077, + "eval_math_samples_per_second": 8.488, + "eval_math_steps_per_second": 8.488, + "step": 4400 + }, + { + "epoch": 1.9583263784565736, + "eval_physics_entropy": 0.6126399195194244, + "eval_physics_loss": 0.7160037755966187, + "eval_physics_mean_token_accuracy": 0.8026845437884331, + "eval_physics_num_tokens": 29242605.0, + "eval_physics_runtime": 66.2455, + "eval_physics_samples_per_second": 7.548, + "eval_physics_steps_per_second": 7.548, + "step": 4400 + }, + { + "entropy": 0.4765114335343242, + "epoch": 1.9627774995827074, + "grad_norm": 46.25, + "learning_rate": 1.2424444444444445e-05, + "loss": 7.8286, + "mean_token_accuracy": 0.8540210161358118, + "num_tokens": 29308501.0, + "step": 4410 + }, + { + "entropy": 0.457128447573632, + "epoch": 1.9672286207088412, + "grad_norm": 49.25, + "learning_rate": 1.2402222222222222e-05, + "loss": 7.2695, + "mean_token_accuracy": 0.8650869440287352, + "num_tokens": 29373957.0, + "step": 4420 + }, + { + "entropy": 0.40455796252936127, + "epoch": 1.9716797418349747, + "grad_norm": 47.25, + "learning_rate": 1.2380000000000002e-05, + "loss": 6.3152, + "mean_token_accuracy": 0.8781236205250025, + "num_tokens": 29441252.0, + "step": 4430 + }, + { + "entropy": 0.4310204340610653, + "epoch": 1.9761308629611083, + "grad_norm": 53.25, + "learning_rate": 1.235777777777778e-05, + "loss": 7.0859, + "mean_token_accuracy": 0.8696885108947754, + "num_tokens": 29507294.0, + "step": 4440 + }, + { + "entropy": 0.41719874208793045, + "epoch": 1.9805819840872418, + "grad_norm": 44.25, + "learning_rate": 1.2335555555555557e-05, + "loss": 6.7107, + "mean_token_accuracy": 0.8724507693201303, + "num_tokens": 29577424.0, + "step": 4450 + }, + { + "entropy": 0.4336270928382874, + "epoch": 1.9850331052133756, + "grad_norm": 49.25, + "learning_rate": 1.2313333333333335e-05, + "loss": 6.9792, + "mean_token_accuracy": 0.8691455457359553, + "num_tokens": 29640508.0, + "step": 4460 + }, + { + "entropy": 0.44689270658418534, + "epoch": 1.9894842263395094, + "grad_norm": 50.25, + "learning_rate": 1.2291111111111112e-05, + "loss": 7.0793, + "mean_token_accuracy": 0.8650856353342533, + "num_tokens": 29706922.0, + "step": 4470 + }, + { + "entropy": 0.42449051737785337, + "epoch": 1.993935347465643, + "grad_norm": 44.0, + "learning_rate": 1.226888888888889e-05, + "loss": 6.9685, + "mean_token_accuracy": 0.8679084531962872, + "num_tokens": 29774375.0, + "step": 4480 + }, + { + "entropy": 0.432995151495561, + "epoch": 1.9983864685917765, + "grad_norm": 51.75, + "learning_rate": 1.2246666666666667e-05, + "loss": 6.9232, + "mean_token_accuracy": 0.866554967686534, + "num_tokens": 29842122.0, + "step": 4490 + }, + { + "entropy": 0.34638444324592493, + "epoch": 2.00267067267568, + "grad_norm": 39.5, + "learning_rate": 1.2224444444444445e-05, + "loss": 4.6958, + "mean_token_accuracy": 0.9096444585880676, + "num_tokens": 29902979.0, + "step": 4500 + }, + { + "epoch": 2.00267067267568, + "eval_biology_entropy": 0.933539985626936, + "eval_biology_loss": 1.3889635801315308, + "eval_biology_mean_token_accuracy": 0.6750703183412552, + "eval_biology_num_tokens": 29902979.0, + "eval_biology_runtime": 49.296, + "eval_biology_samples_per_second": 10.143, + "eval_biology_steps_per_second": 10.143, + "step": 4500 + }, + { + "epoch": 2.00267067267568, + "eval_chemistry_entropy": 0.44678295238316057, + "eval_chemistry_loss": 0.6827961206436157, + "eval_chemistry_mean_token_accuracy": 0.8145005642175674, + "eval_chemistry_num_tokens": 29902979.0, + "eval_chemistry_runtime": 57.4756, + "eval_chemistry_samples_per_second": 8.699, + "eval_chemistry_steps_per_second": 8.699, + "step": 4500 + }, + { + "epoch": 2.00267067267568, + "eval_math_entropy": 0.577753365367651, + "eval_math_loss": 0.8987858295440674, + "eval_math_mean_token_accuracy": 0.7763468166589736, + "eval_math_num_tokens": 29902979.0, + "eval_math_runtime": 59.1791, + "eval_math_samples_per_second": 8.449, + "eval_math_steps_per_second": 8.449, + "step": 4500 + }, + { + "epoch": 2.00267067267568, + "eval_physics_entropy": 0.5244043172299861, + "eval_physics_loss": 0.7431147694587708, + "eval_physics_mean_token_accuracy": 0.8012209991812707, + "eval_physics_num_tokens": 29902979.0, + "eval_physics_runtime": 66.0886, + "eval_physics_samples_per_second": 7.566, + "eval_physics_steps_per_second": 7.566, + "step": 4500 + }, + { + "entropy": 0.19432784689124674, + "epoch": 2.0071217938018138, + "grad_norm": 55.5, + "learning_rate": 1.2202222222222224e-05, + "loss": 3.4961, + "mean_token_accuracy": 0.9332890458405018, + "num_tokens": 29968860.0, + "step": 4510 + }, + { + "entropy": 0.210560632776469, + "epoch": 2.0115729149279473, + "grad_norm": 44.75, + "learning_rate": 1.218e-05, + "loss": 3.2894, + "mean_token_accuracy": 0.9351368475705385, + "num_tokens": 30039953.0, + "step": 4520 + }, + { + "entropy": 0.21663165464997292, + "epoch": 2.0160240360540813, + "grad_norm": 53.25, + "learning_rate": 1.215777777777778e-05, + "loss": 3.2707, + "mean_token_accuracy": 0.9385374564677477, + "num_tokens": 30104430.0, + "step": 4530 + }, + { + "entropy": 0.19315633978694677, + "epoch": 2.020475157180215, + "grad_norm": 48.25, + "learning_rate": 1.2135555555555556e-05, + "loss": 3.1409, + "mean_token_accuracy": 0.9388028878718615, + "num_tokens": 30170256.0, + "step": 4540 + }, + { + "entropy": 0.19228276158683003, + "epoch": 2.0249262783063484, + "grad_norm": 43.5, + "learning_rate": 1.2113333333333335e-05, + "loss": 3.1154, + "mean_token_accuracy": 0.9400146439671516, + "num_tokens": 30232990.0, + "step": 4550 + }, + { + "entropy": 0.190460999449715, + "epoch": 2.029377399432482, + "grad_norm": 61.25, + "learning_rate": 1.209111111111111e-05, + "loss": 3.0449, + "mean_token_accuracy": 0.9401529118418693, + "num_tokens": 30299075.0, + "step": 4560 + }, + { + "entropy": 0.21243180367164313, + "epoch": 2.0338285205586155, + "grad_norm": 48.5, + "learning_rate": 1.206888888888889e-05, + "loss": 3.3112, + "mean_token_accuracy": 0.9372529473155737, + "num_tokens": 30364016.0, + "step": 4570 + }, + { + "entropy": 0.19936151900328697, + "epoch": 2.0382796416847495, + "grad_norm": 49.75, + "learning_rate": 1.204666666666667e-05, + "loss": 3.1755, + "mean_token_accuracy": 0.9386512443423272, + "num_tokens": 30434991.0, + "step": 4580 + }, + { + "entropy": 0.2004031420685351, + "epoch": 2.042730762810883, + "grad_norm": 49.75, + "learning_rate": 1.2024444444444445e-05, + "loss": 3.2127, + "mean_token_accuracy": 0.9368518743664026, + "num_tokens": 30500960.0, + "step": 4590 + }, + { + "entropy": 0.20335377231240273, + "epoch": 2.0471818839370166, + "grad_norm": 43.75, + "learning_rate": 1.2002222222222225e-05, + "loss": 3.0969, + "mean_token_accuracy": 0.9389855105429887, + "num_tokens": 30565610.0, + "step": 4600 + }, + { + "epoch": 2.0471818839370166, + "eval_biology_entropy": 0.8037147823572158, + "eval_biology_loss": 1.4953559637069702, + "eval_biology_mean_token_accuracy": 0.670681405365467, + "eval_biology_num_tokens": 30565610.0, + "eval_biology_runtime": 49.3276, + "eval_biology_samples_per_second": 10.136, + "eval_biology_steps_per_second": 10.136, + "step": 4600 + }, + { + "epoch": 2.0471818839370166, + "eval_chemistry_entropy": 0.4112813184261322, + "eval_chemistry_loss": 0.7231889367103577, + "eval_chemistry_mean_token_accuracy": 0.8123716901540756, + "eval_chemistry_num_tokens": 30565610.0, + "eval_chemistry_runtime": 57.1099, + "eval_chemistry_samples_per_second": 8.755, + "eval_chemistry_steps_per_second": 8.755, + "step": 4600 + }, + { + "epoch": 2.0471818839370166, + "eval_math_entropy": 0.5299806647598744, + "eval_math_loss": 0.9405803680419922, + "eval_math_mean_token_accuracy": 0.7738288252353668, + "eval_math_num_tokens": 30565610.0, + "eval_math_runtime": 58.949, + "eval_math_samples_per_second": 8.482, + "eval_math_steps_per_second": 8.482, + "step": 4600 + }, + { + "epoch": 2.0471818839370166, + "eval_physics_entropy": 0.4806697656214237, + "eval_physics_loss": 0.7875670194625854, + "eval_physics_mean_token_accuracy": 0.798054132759571, + "eval_physics_num_tokens": 30565610.0, + "eval_physics_runtime": 65.9011, + "eval_physics_samples_per_second": 7.587, + "eval_physics_steps_per_second": 7.587, + "step": 4600 + }, + { + "entropy": 0.1939547288697213, + "epoch": 2.05163300506315, + "grad_norm": 57.5, + "learning_rate": 1.198e-05, + "loss": 3.1436, + "mean_token_accuracy": 0.9384047366678715, + "num_tokens": 30635592.0, + "step": 4610 + }, + { + "entropy": 0.19414176987484097, + "epoch": 2.0560841261892837, + "grad_norm": 50.5, + "learning_rate": 1.195777777777778e-05, + "loss": 3.1203, + "mean_token_accuracy": 0.9376082208007575, + "num_tokens": 30705180.0, + "step": 4620 + }, + { + "entropy": 0.2032523064641282, + "epoch": 2.0605352473154177, + "grad_norm": 47.5, + "learning_rate": 1.1935555555555556e-05, + "loss": 3.333, + "mean_token_accuracy": 0.9355457253754139, + "num_tokens": 30771970.0, + "step": 4630 + }, + { + "entropy": 0.2035067826975137, + "epoch": 2.0649863684415513, + "grad_norm": 66.0, + "learning_rate": 1.1913333333333335e-05, + "loss": 3.1289, + "mean_token_accuracy": 0.9392033904790879, + "num_tokens": 30833966.0, + "step": 4640 + }, + { + "entropy": 0.20246283044107258, + "epoch": 2.069437489567685, + "grad_norm": 52.0, + "learning_rate": 1.1891111111111111e-05, + "loss": 3.1576, + "mean_token_accuracy": 0.9374252818524837, + "num_tokens": 30897567.0, + "step": 4650 + }, + { + "entropy": 0.19708745954558254, + "epoch": 2.0738886106938184, + "grad_norm": 60.25, + "learning_rate": 1.186888888888889e-05, + "loss": 3.2889, + "mean_token_accuracy": 0.9368193861097097, + "num_tokens": 30961259.0, + "step": 4660 + }, + { + "entropy": 0.205085727898404, + "epoch": 2.0783397318199524, + "grad_norm": 57.25, + "learning_rate": 1.1846666666666668e-05, + "loss": 3.1576, + "mean_token_accuracy": 0.9377356734126806, + "num_tokens": 31024828.0, + "step": 4670 + }, + { + "entropy": 0.21182234776206316, + "epoch": 2.082790852946086, + "grad_norm": 53.75, + "learning_rate": 1.1824444444444445e-05, + "loss": 3.385, + "mean_token_accuracy": 0.933294901996851, + "num_tokens": 31089521.0, + "step": 4680 + }, + { + "entropy": 0.192715611634776, + "epoch": 2.0872419740722195, + "grad_norm": 46.75, + "learning_rate": 1.1802222222222223e-05, + "loss": 3.1177, + "mean_token_accuracy": 0.9370861582458019, + "num_tokens": 31155009.0, + "step": 4690 + }, + { + "entropy": 0.21394032575190067, + "epoch": 2.091693095198353, + "grad_norm": 48.5, + "learning_rate": 1.178e-05, + "loss": 3.3538, + "mean_token_accuracy": 0.9343659084290266, + "num_tokens": 31223043.0, + "step": 4700 + }, + { + "epoch": 2.091693095198353, + "eval_biology_entropy": 0.77990083065629, + "eval_biology_loss": 1.5102664232254028, + "eval_biology_mean_token_accuracy": 0.6704159413576126, + "eval_biology_num_tokens": 31223043.0, + "eval_biology_runtime": 51.4183, + "eval_biology_samples_per_second": 9.724, + "eval_biology_steps_per_second": 9.724, + "step": 4700 + }, + { + "epoch": 2.091693095198353, + "eval_chemistry_entropy": 0.4077444053888321, + "eval_chemistry_loss": 0.7248498201370239, + "eval_chemistry_mean_token_accuracy": 0.8115575251579284, + "eval_chemistry_num_tokens": 31223043.0, + "eval_chemistry_runtime": 59.1981, + "eval_chemistry_samples_per_second": 8.446, + "eval_chemistry_steps_per_second": 8.446, + "step": 4700 + }, + { + "epoch": 2.091693095198353, + "eval_math_entropy": 0.5065389047116041, + "eval_math_loss": 0.9510311484336853, + "eval_math_mean_token_accuracy": 0.7742284239530564, + "eval_math_num_tokens": 31223043.0, + "eval_math_runtime": 60.9897, + "eval_math_samples_per_second": 8.198, + "eval_math_steps_per_second": 8.198, + "step": 4700 + }, + { + "epoch": 2.091693095198353, + "eval_physics_entropy": 0.47027663865685465, + "eval_physics_loss": 0.7912967801094055, + "eval_physics_mean_token_accuracy": 0.7982778403162957, + "eval_physics_num_tokens": 31223043.0, + "eval_physics_runtime": 66.3139, + "eval_physics_samples_per_second": 7.54, + "eval_physics_steps_per_second": 7.54, + "step": 4700 + }, + { + "entropy": 0.2011160397902131, + "epoch": 2.0961442163244866, + "grad_norm": 53.0, + "learning_rate": 1.1757777777777778e-05, + "loss": 3.2785, + "mean_token_accuracy": 0.9362703409045935, + "num_tokens": 31286436.0, + "step": 4710 + }, + { + "entropy": 0.20832633413374424, + "epoch": 2.1005953374506205, + "grad_norm": 47.75, + "learning_rate": 1.1735555555555556e-05, + "loss": 3.3158, + "mean_token_accuracy": 0.9346338223665953, + "num_tokens": 31353811.0, + "step": 4720 + }, + { + "entropy": 0.20000891415402294, + "epoch": 2.105046458576754, + "grad_norm": 44.25, + "learning_rate": 1.1713333333333334e-05, + "loss": 3.1554, + "mean_token_accuracy": 0.9379078581929207, + "num_tokens": 31418157.0, + "step": 4730 + }, + { + "entropy": 0.20441038249991833, + "epoch": 2.1094975797028876, + "grad_norm": 54.5, + "learning_rate": 1.1691111111111113e-05, + "loss": 3.2712, + "mean_token_accuracy": 0.9364386297762394, + "num_tokens": 31487616.0, + "step": 4740 + }, + { + "entropy": 0.20344883445650339, + "epoch": 2.113948700829021, + "grad_norm": 51.75, + "learning_rate": 1.1668888888888889e-05, + "loss": 3.1951, + "mean_token_accuracy": 0.9377407234162092, + "num_tokens": 31553528.0, + "step": 4750 + }, + { + "entropy": 0.20886037438176572, + "epoch": 2.1183998219551547, + "grad_norm": 40.5, + "learning_rate": 1.1646666666666668e-05, + "loss": 3.4693, + "mean_token_accuracy": 0.9323958426713943, + "num_tokens": 31621760.0, + "step": 4760 + }, + { + "entropy": 0.21293632830493153, + "epoch": 2.1228509430812887, + "grad_norm": 49.75, + "learning_rate": 1.1624444444444446e-05, + "loss": 3.3064, + "mean_token_accuracy": 0.9356177769601345, + "num_tokens": 31688476.0, + "step": 4770 + }, + { + "entropy": 0.18729747808538377, + "epoch": 2.1273020642074223, + "grad_norm": 44.75, + "learning_rate": 1.1602222222222223e-05, + "loss": 2.9512, + "mean_token_accuracy": 0.9413186211138964, + "num_tokens": 31761188.0, + "step": 4780 + }, + { + "entropy": 0.19371092831715941, + "epoch": 2.131753185333556, + "grad_norm": 58.75, + "learning_rate": 1.1580000000000001e-05, + "loss": 3.1121, + "mean_token_accuracy": 0.938572907820344, + "num_tokens": 31821272.0, + "step": 4790 + }, + { + "entropy": 0.21055902824737133, + "epoch": 2.1362043064596894, + "grad_norm": 52.75, + "learning_rate": 1.1557777777777779e-05, + "loss": 3.4191, + "mean_token_accuracy": 0.93368777371943, + "num_tokens": 31888569.0, + "step": 4800 + }, + { + "epoch": 2.1362043064596894, + "eval_biology_entropy": 0.7709494482576847, + "eval_biology_loss": 1.519779086112976, + "eval_biology_mean_token_accuracy": 0.6703167478442192, + "eval_biology_num_tokens": 31888569.0, + "eval_biology_runtime": 49.4584, + "eval_biology_samples_per_second": 10.11, + "eval_biology_steps_per_second": 10.11, + "step": 4800 + }, + { + "epoch": 2.1362043064596894, + "eval_chemistry_entropy": 0.4099642983525991, + "eval_chemistry_loss": 0.7269572615623474, + "eval_chemistry_mean_token_accuracy": 0.8109228508472442, + "eval_chemistry_num_tokens": 31888569.0, + "eval_chemistry_runtime": 57.289, + "eval_chemistry_samples_per_second": 8.728, + "eval_chemistry_steps_per_second": 8.728, + "step": 4800 + }, + { + "epoch": 2.1362043064596894, + "eval_math_entropy": 0.509802019789815, + "eval_math_loss": 0.9596483111381531, + "eval_math_mean_token_accuracy": 0.7724687232971191, + "eval_math_num_tokens": 31888569.0, + "eval_math_runtime": 60.3339, + "eval_math_samples_per_second": 8.287, + "eval_math_steps_per_second": 8.287, + "step": 4800 + }, + { + "epoch": 2.1362043064596894, + "eval_physics_entropy": 0.47433996251225474, + "eval_physics_loss": 0.7937348484992981, + "eval_physics_mean_token_accuracy": 0.7973666390180588, + "eval_physics_num_tokens": 31888569.0, + "eval_physics_runtime": 66.3696, + "eval_physics_samples_per_second": 7.534, + "eval_physics_steps_per_second": 7.534, + "step": 4800 + }, + { + "entropy": 0.2084138799458742, + "epoch": 2.1406554275858234, + "grad_norm": 57.0, + "learning_rate": 1.1535555555555556e-05, + "loss": 3.2409, + "mean_token_accuracy": 0.937066400423646, + "num_tokens": 31956128.0, + "step": 4810 + }, + { + "entropy": 0.2134676464833319, + "epoch": 2.145106548711957, + "grad_norm": 51.5, + "learning_rate": 1.1513333333333334e-05, + "loss": 3.3475, + "mean_token_accuracy": 0.9349153436720371, + "num_tokens": 32022951.0, + "step": 4820 + }, + { + "entropy": 0.1977827858645469, + "epoch": 2.1495576698380905, + "grad_norm": 46.5, + "learning_rate": 1.1491111111111113e-05, + "loss": 3.2339, + "mean_token_accuracy": 0.937416598200798, + "num_tokens": 32088718.0, + "step": 4830 + }, + { + "entropy": 0.1914580906741321, + "epoch": 2.154008790964224, + "grad_norm": 48.0, + "learning_rate": 1.1468888888888889e-05, + "loss": 3.0191, + "mean_token_accuracy": 0.9405469063669443, + "num_tokens": 32159154.0, + "step": 4840 + }, + { + "entropy": 0.19496469043660908, + "epoch": 2.1584599120903576, + "grad_norm": 49.0, + "learning_rate": 1.1446666666666668e-05, + "loss": 3.0273, + "mean_token_accuracy": 0.9392817266285419, + "num_tokens": 32224580.0, + "step": 4850 + }, + { + "entropy": 0.20958372931927444, + "epoch": 2.1629110332164916, + "grad_norm": 52.5, + "learning_rate": 1.1424444444444444e-05, + "loss": 3.3584, + "mean_token_accuracy": 0.9336409136652947, + "num_tokens": 32290909.0, + "step": 4860 + }, + { + "entropy": 0.19971465929411353, + "epoch": 2.167362154342625, + "grad_norm": 52.75, + "learning_rate": 1.1402222222222224e-05, + "loss": 3.177, + "mean_token_accuracy": 0.9379275768995285, + "num_tokens": 32358720.0, + "step": 4870 + }, + { + "entropy": 0.1877427525119856, + "epoch": 2.1718132754687587, + "grad_norm": 52.0, + "learning_rate": 1.138e-05, + "loss": 2.9581, + "mean_token_accuracy": 0.9419840890914202, + "num_tokens": 32423128.0, + "step": 4880 + }, + { + "entropy": 0.20662877401337026, + "epoch": 2.1762643965948922, + "grad_norm": 55.25, + "learning_rate": 1.1357777777777779e-05, + "loss": 3.4253, + "mean_token_accuracy": 0.9336972165852785, + "num_tokens": 32490563.0, + "step": 4890 + }, + { + "entropy": 0.2113244437612593, + "epoch": 2.180715517721026, + "grad_norm": 58.25, + "learning_rate": 1.1335555555555558e-05, + "loss": 3.2783, + "mean_token_accuracy": 0.9364552021026611, + "num_tokens": 32556621.0, + "step": 4900 + }, + { + "epoch": 2.180715517721026, + "eval_biology_entropy": 0.7821404223144054, + "eval_biology_loss": 1.5180275440216064, + "eval_biology_mean_token_accuracy": 0.6699725322723389, + "eval_biology_num_tokens": 32556621.0, + "eval_biology_runtime": 49.4433, + "eval_biology_samples_per_second": 10.113, + "eval_biology_steps_per_second": 10.113, + "step": 4900 + }, + { + "epoch": 2.180715517721026, + "eval_chemistry_entropy": 0.41868991650640963, + "eval_chemistry_loss": 0.7234057784080505, + "eval_chemistry_mean_token_accuracy": 0.8110000754594803, + "eval_chemistry_num_tokens": 32556621.0, + "eval_chemistry_runtime": 57.4906, + "eval_chemistry_samples_per_second": 8.697, + "eval_chemistry_steps_per_second": 8.697, + "step": 4900 + }, + { + "epoch": 2.180715517721026, + "eval_math_entropy": 0.5210996385663748, + "eval_math_loss": 0.9522091150283813, + "eval_math_mean_token_accuracy": 0.7724902110099793, + "eval_math_num_tokens": 32556621.0, + "eval_math_runtime": 59.0456, + "eval_math_samples_per_second": 8.468, + "eval_math_steps_per_second": 8.468, + "step": 4900 + }, + { + "epoch": 2.180715517721026, + "eval_physics_entropy": 0.4867556933164597, + "eval_physics_loss": 0.7898615598678589, + "eval_physics_mean_token_accuracy": 0.7972035143375397, + "eval_physics_num_tokens": 32556621.0, + "eval_physics_runtime": 66.0459, + "eval_physics_samples_per_second": 7.57, + "eval_physics_steps_per_second": 7.57, + "step": 4900 + }, + { + "entropy": 0.1940465967170894, + "epoch": 2.18516663884716, + "grad_norm": 60.5, + "learning_rate": 1.1313333333333334e-05, + "loss": 3.1301, + "mean_token_accuracy": 0.938922145217657, + "num_tokens": 32622663.0, + "step": 4910 + }, + { + "entropy": 0.19738675863482058, + "epoch": 2.1896177599732933, + "grad_norm": 56.0, + "learning_rate": 1.1291111111111113e-05, + "loss": 3.1346, + "mean_token_accuracy": 0.9398286856710911, + "num_tokens": 32687470.0, + "step": 4920 + }, + { + "entropy": 0.19675758515950292, + "epoch": 2.194068881099427, + "grad_norm": 70.0, + "learning_rate": 1.126888888888889e-05, + "loss": 3.1289, + "mean_token_accuracy": 0.9401233237236738, + "num_tokens": 32747359.0, + "step": 4930 + }, + { + "entropy": 0.20007577040232719, + "epoch": 2.1985200022255604, + "grad_norm": 53.5, + "learning_rate": 1.1246666666666669e-05, + "loss": 3.2218, + "mean_token_accuracy": 0.9384327601641417, + "num_tokens": 32807906.0, + "step": 4940 + }, + { + "entropy": 0.20421422524377703, + "epoch": 2.202971123351694, + "grad_norm": 46.0, + "learning_rate": 1.1224444444444444e-05, + "loss": 3.2217, + "mean_token_accuracy": 0.937329813092947, + "num_tokens": 32876024.0, + "step": 4950 + }, + { + "entropy": 0.2059897052589804, + "epoch": 2.207422244477828, + "grad_norm": 50.0, + "learning_rate": 1.1202222222222224e-05, + "loss": 3.2772, + "mean_token_accuracy": 0.9354364857077598, + "num_tokens": 32944507.0, + "step": 4960 + }, + { + "entropy": 0.20454190894961358, + "epoch": 2.2118733656039615, + "grad_norm": 54.75, + "learning_rate": 1.1180000000000001e-05, + "loss": 3.2596, + "mean_token_accuracy": 0.9356468468904495, + "num_tokens": 33012235.0, + "step": 4970 + }, + { + "entropy": 0.19853306193836034, + "epoch": 2.216324486730095, + "grad_norm": 50.75, + "learning_rate": 1.1157777777777779e-05, + "loss": 3.1264, + "mean_token_accuracy": 0.9382979366928339, + "num_tokens": 33073581.0, + "step": 4980 + }, + { + "entropy": 0.2050587079487741, + "epoch": 2.2207756078562286, + "grad_norm": 66.0, + "learning_rate": 1.1135555555555557e-05, + "loss": 3.2496, + "mean_token_accuracy": 0.9360437963157893, + "num_tokens": 33140617.0, + "step": 4990 + }, + { + "entropy": 0.20523946965113282, + "epoch": 2.2252267289823626, + "grad_norm": 49.75, + "learning_rate": 1.1113333333333334e-05, + "loss": 3.2632, + "mean_token_accuracy": 0.9362727515399456, + "num_tokens": 33208296.0, + "step": 5000 + }, + { + "epoch": 2.2252267289823626, + "eval_biology_entropy": 0.7457333102524281, + "eval_biology_loss": 1.5549827814102173, + "eval_biology_mean_token_accuracy": 0.6689565183520317, + "eval_biology_num_tokens": 33208296.0, + "eval_biology_runtime": 49.1995, + "eval_biology_samples_per_second": 10.163, + "eval_biology_steps_per_second": 10.163, + "step": 5000 + }, + { + "epoch": 2.2252267289823626, + "eval_chemistry_entropy": 0.4065342933088541, + "eval_chemistry_loss": 0.7320985198020935, + "eval_chemistry_mean_token_accuracy": 0.8111300177574158, + "eval_chemistry_num_tokens": 33208296.0, + "eval_chemistry_runtime": 57.2588, + "eval_chemistry_samples_per_second": 8.732, + "eval_chemistry_steps_per_second": 8.732, + "step": 5000 + }, + { + "epoch": 2.2252267289823626, + "eval_math_entropy": 0.5086690319776535, + "eval_math_loss": 0.9592500925064087, + "eval_math_mean_token_accuracy": 0.7730341997146607, + "eval_math_num_tokens": 33208296.0, + "eval_math_runtime": 58.9591, + "eval_math_samples_per_second": 8.48, + "eval_math_steps_per_second": 8.48, + "step": 5000 + }, + { + "epoch": 2.2252267289823626, + "eval_physics_entropy": 0.47483887845277784, + "eval_physics_loss": 0.7995008826255798, + "eval_physics_mean_token_accuracy": 0.7965781780481338, + "eval_physics_num_tokens": 33208296.0, + "eval_physics_runtime": 65.9973, + "eval_physics_samples_per_second": 7.576, + "eval_physics_steps_per_second": 7.576, + "step": 5000 + } + ], + "logging_steps": 10, + "max_steps": 10000, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.226761246440989e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}