{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.5590297095805052, "eval_steps": 100, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.7850796665996314, "epoch": 0.017803493935684877, "grad_norm": 4.125, "learning_rate": 1.8e-07, "loss": 0.9868, "mean_token_accuracy": 0.7752813071012497, "num_tokens": 259048.0, "step": 10 }, { "entropy": 0.7656301472336053, "epoch": 0.035606987871369754, "grad_norm": 3.984375, "learning_rate": 3.8e-07, "loss": 0.9514, "mean_token_accuracy": 0.778770961612463, "num_tokens": 532019.0, "step": 20 }, { "entropy": 0.7800006279721856, "epoch": 0.053410481807054634, "grad_norm": 3.546875, "learning_rate": 5.800000000000001e-07, "loss": 0.9645, "mean_token_accuracy": 0.777244720607996, "num_tokens": 807220.0, "step": 30 }, { "entropy": 0.8148440960794687, "epoch": 0.07121397574273951, "grad_norm": 3.578125, "learning_rate": 7.8e-07, "loss": 0.9845, "mean_token_accuracy": 0.7725338555872441, "num_tokens": 1068429.0, "step": 40 }, { "entropy": 0.7980734214186669, "epoch": 0.08901746967842439, "grad_norm": 3.328125, "learning_rate": 9.800000000000001e-07, "loss": 0.9502, "mean_token_accuracy": 0.7774997223168612, "num_tokens": 1336698.0, "step": 50 }, { "entropy": 0.827414046600461, "epoch": 0.10682096361410927, "grad_norm": 2.65625, "learning_rate": 1.1800000000000001e-06, "loss": 0.969, "mean_token_accuracy": 0.7773337867110968, "num_tokens": 1608574.0, "step": 60 }, { "entropy": 0.8639046084135771, "epoch": 0.12462445754979415, "grad_norm": 2.21875, "learning_rate": 1.3800000000000001e-06, "loss": 0.9913, "mean_token_accuracy": 0.7705458387732506, "num_tokens": 1872747.0, "step": 70 }, { "entropy": 0.8748010536655784, "epoch": 0.14242795148547902, "grad_norm": 2.234375, "learning_rate": 1.5800000000000001e-06, "loss": 0.9868, "mean_token_accuracy": 0.7721451628953219, "num_tokens": 2133173.0, "step": 80 }, { "entropy": 0.8339586811140179, "epoch": 0.1602314454211639, "grad_norm": 2.546875, "learning_rate": 1.7800000000000001e-06, "loss": 0.9473, "mean_token_accuracy": 0.782527657970786, "num_tokens": 2406145.0, "step": 90 }, { "entropy": 0.8617484034970403, "epoch": 0.17803493935684878, "grad_norm": 2.671875, "learning_rate": 1.98e-06, "loss": 0.9705, "mean_token_accuracy": 0.7755408465862275, "num_tokens": 2663709.0, "step": 100 }, { "epoch": 0.17803493935684878, "eval_chemistry_entropy": 0.8530604543685913, "eval_chemistry_loss": 0.9879733920097351, "eval_chemistry_mean_token_accuracy": 0.7785031309127808, "eval_chemistry_num_tokens": 2663709.0, "eval_chemistry_runtime": 34.6174, "eval_chemistry_samples_per_second": 14.444, "eval_chemistry_steps_per_second": 3.611, "step": 100 }, { "epoch": 0.17803493935684878, "eval_physics_entropy": 0.6429356033802033, "eval_physics_loss": 0.7288391590118408, "eval_physics_mean_token_accuracy": 0.8237355589866638, "eval_physics_num_tokens": 2663709.0, "eval_physics_runtime": 41.2328, "eval_physics_samples_per_second": 12.126, "eval_physics_steps_per_second": 3.032, "step": 100 }, { "entropy": 0.8313142383471132, "epoch": 0.19583843329253367, "grad_norm": 2.3125, "learning_rate": 2.1800000000000003e-06, "loss": 0.931, "mean_token_accuracy": 0.7826357394456863, "num_tokens": 2934980.0, "step": 110 }, { "entropy": 0.8383694937452674, "epoch": 0.21364192722821854, "grad_norm": 2.140625, "learning_rate": 2.38e-06, "loss": 0.9256, "mean_token_accuracy": 0.7815470688045025, "num_tokens": 3208702.0, "step": 120 }, { "entropy": 0.8249719947576523, "epoch": 0.2314454211639034, "grad_norm": 2.3125, "learning_rate": 2.5800000000000003e-06, "loss": 0.9319, "mean_token_accuracy": 0.7833116251975298, "num_tokens": 3480119.0, "step": 130 }, { "entropy": 0.8376072259619832, "epoch": 0.2492489150995883, "grad_norm": 2.171875, "learning_rate": 2.7800000000000005e-06, "loss": 0.9339, "mean_token_accuracy": 0.7807847507297992, "num_tokens": 3746392.0, "step": 140 }, { "entropy": 0.8562109092250466, "epoch": 0.26705240903527316, "grad_norm": 2.125, "learning_rate": 2.9800000000000003e-06, "loss": 0.945, "mean_token_accuracy": 0.7766421258449554, "num_tokens": 4019487.0, "step": 150 }, { "entropy": 0.8114860905334353, "epoch": 0.28485590297095803, "grad_norm": 2.3125, "learning_rate": 3.1800000000000005e-06, "loss": 0.9108, "mean_token_accuracy": 0.7848536405712366, "num_tokens": 4298100.0, "step": 160 }, { "entropy": 0.8167605808004736, "epoch": 0.30265939690664295, "grad_norm": 1.984375, "learning_rate": 3.3800000000000007e-06, "loss": 0.896, "mean_token_accuracy": 0.7858319569379091, "num_tokens": 4568116.0, "step": 170 }, { "entropy": 0.8315373951569199, "epoch": 0.3204628908423278, "grad_norm": 2.359375, "learning_rate": 3.58e-06, "loss": 0.9106, "mean_token_accuracy": 0.7813973911106586, "num_tokens": 4831334.0, "step": 180 }, { "entropy": 0.8178198751062155, "epoch": 0.3382663847780127, "grad_norm": 2.015625, "learning_rate": 3.7800000000000002e-06, "loss": 0.881, "mean_token_accuracy": 0.7869020272046328, "num_tokens": 5103301.0, "step": 190 }, { "entropy": 0.8403754932805896, "epoch": 0.35606987871369755, "grad_norm": 1.9921875, "learning_rate": 3.980000000000001e-06, "loss": 0.9115, "mean_token_accuracy": 0.7783690184354782, "num_tokens": 5362915.0, "step": 200 }, { "epoch": 0.35606987871369755, "eval_chemistry_entropy": 0.8292876372337341, "eval_chemistry_loss": 0.9164837598800659, "eval_chemistry_mean_token_accuracy": 0.7834693474769592, "eval_chemistry_num_tokens": 5362915.0, "eval_chemistry_runtime": 34.57, "eval_chemistry_samples_per_second": 14.463, "eval_chemistry_steps_per_second": 3.616, "step": 200 }, { "epoch": 0.35606987871369755, "eval_physics_entropy": 0.6445197880268096, "eval_physics_loss": 0.6974144577980042, "eval_physics_mean_token_accuracy": 0.8239417452812194, "eval_physics_num_tokens": 5362915.0, "eval_physics_runtime": 41.2388, "eval_physics_samples_per_second": 12.124, "eval_physics_steps_per_second": 3.031, "step": 200 }, { "entropy": 0.8109689498320222, "epoch": 0.3738733726493824, "grad_norm": 2.078125, "learning_rate": 4.18e-06, "loss": 0.856, "mean_token_accuracy": 0.788726344704628, "num_tokens": 5625455.0, "step": 210 }, { "entropy": 0.8134721567854285, "epoch": 0.39167686658506734, "grad_norm": 2.0, "learning_rate": 4.38e-06, "loss": 0.8568, "mean_token_accuracy": 0.7867508839815855, "num_tokens": 5890806.0, "step": 220 }, { "entropy": 0.8066105756908655, "epoch": 0.4094803605207522, "grad_norm": 2.203125, "learning_rate": 4.58e-06, "loss": 0.8275, "mean_token_accuracy": 0.7900486279278993, "num_tokens": 6158751.0, "step": 230 }, { "entropy": 0.8255353083834052, "epoch": 0.4272838544564371, "grad_norm": 2.125, "learning_rate": 4.78e-06, "loss": 0.8244, "mean_token_accuracy": 0.7881908807903528, "num_tokens": 6423747.0, "step": 240 }, { "entropy": 0.8402099519968033, "epoch": 0.44508734839212194, "grad_norm": 1.9609375, "learning_rate": 4.980000000000001e-06, "loss": 0.8386, "mean_token_accuracy": 0.7834979113191366, "num_tokens": 6684517.0, "step": 250 }, { "entropy": 0.8010248441249133, "epoch": 0.4628908423278068, "grad_norm": 2.0, "learning_rate": 5.18e-06, "loss": 0.7988, "mean_token_accuracy": 0.7915425561368465, "num_tokens": 6945659.0, "step": 260 }, { "entropy": 0.814286389760673, "epoch": 0.48069433626349173, "grad_norm": 2.28125, "learning_rate": 5.380000000000001e-06, "loss": 0.8029, "mean_token_accuracy": 0.7904389422386885, "num_tokens": 7217797.0, "step": 270 }, { "entropy": 0.785382822342217, "epoch": 0.4984978301991766, "grad_norm": 2.0, "learning_rate": 5.580000000000001e-06, "loss": 0.76, "mean_token_accuracy": 0.7953414976596832, "num_tokens": 7498182.0, "step": 280 }, { "entropy": 0.8195194514468312, "epoch": 0.5163013241348615, "grad_norm": 1.90625, "learning_rate": 5.78e-06, "loss": 0.7916, "mean_token_accuracy": 0.78877494931221, "num_tokens": 7761686.0, "step": 290 }, { "entropy": 0.8369470547884703, "epoch": 0.5341048180705463, "grad_norm": 2.375, "learning_rate": 5.98e-06, "loss": 0.8066, "mean_token_accuracy": 0.786370538175106, "num_tokens": 8026923.0, "step": 300 }, { "epoch": 0.5341048180705463, "eval_chemistry_entropy": 0.818053927898407, "eval_chemistry_loss": 0.8182278871536255, "eval_chemistry_mean_token_accuracy": 0.7884931750297547, "eval_chemistry_num_tokens": 8026923.0, "eval_chemistry_runtime": 34.5992, "eval_chemistry_samples_per_second": 14.451, "eval_chemistry_steps_per_second": 3.613, "step": 300 }, { "epoch": 0.5341048180705463, "eval_physics_entropy": 0.6585752744674682, "eval_physics_loss": 0.6424084305763245, "eval_physics_mean_token_accuracy": 0.8238574590682983, "eval_physics_num_tokens": 8026923.0, "eval_physics_runtime": 41.2472, "eval_physics_samples_per_second": 12.122, "eval_physics_steps_per_second": 3.031, "step": 300 }, { "entropy": 0.8153910035267472, "epoch": 0.5519083120062312, "grad_norm": 2.6875, "learning_rate": 6.18e-06, "loss": 0.792, "mean_token_accuracy": 0.7901755437254906, "num_tokens": 8301506.0, "step": 310 }, { "entropy": 0.8126581776887178, "epoch": 0.5697118059419161, "grad_norm": 1.96875, "learning_rate": 6.380000000000001e-06, "loss": 0.7938, "mean_token_accuracy": 0.7923727743327618, "num_tokens": 8559126.0, "step": 320 }, { "entropy": 0.8185926586389541, "epoch": 0.587515299877601, "grad_norm": 2.375, "learning_rate": 6.5800000000000005e-06, "loss": 0.7873, "mean_token_accuracy": 0.7911712639033794, "num_tokens": 8821088.0, "step": 330 }, { "entropy": 0.7809942720457912, "epoch": 0.6053187938132859, "grad_norm": 1.8046875, "learning_rate": 6.780000000000001e-06, "loss": 0.75, "mean_token_accuracy": 0.7993631161749363, "num_tokens": 9089866.0, "step": 340 }, { "entropy": 0.790563733316958, "epoch": 0.6231222877489707, "grad_norm": 2.03125, "learning_rate": 6.98e-06, "loss": 0.7666, "mean_token_accuracy": 0.79404122158885, "num_tokens": 9358866.0, "step": 350 }, { "entropy": 0.7880068564787507, "epoch": 0.6409257816846556, "grad_norm": 1.8203125, "learning_rate": 7.180000000000001e-06, "loss": 0.7467, "mean_token_accuracy": 0.7980206806212664, "num_tokens": 9629977.0, "step": 360 }, { "entropy": 0.7765095194801688, "epoch": 0.6587292756203404, "grad_norm": 1.921875, "learning_rate": 7.3800000000000005e-06, "loss": 0.7403, "mean_token_accuracy": 0.799276452511549, "num_tokens": 9897757.0, "step": 370 }, { "entropy": 0.7657832968980074, "epoch": 0.6765327695560254, "grad_norm": 1.96875, "learning_rate": 7.58e-06, "loss": 0.7391, "mean_token_accuracy": 0.8021788071841002, "num_tokens": 10168367.0, "step": 380 }, { "entropy": 0.7575604457408189, "epoch": 0.6943362634917103, "grad_norm": 2.03125, "learning_rate": 7.78e-06, "loss": 0.7274, "mean_token_accuracy": 0.803612269833684, "num_tokens": 10439039.0, "step": 390 }, { "entropy": 0.771031835861504, "epoch": 0.7121397574273951, "grad_norm": 1.890625, "learning_rate": 7.980000000000002e-06, "loss": 0.7388, "mean_token_accuracy": 0.8006934400647878, "num_tokens": 10709549.0, "step": 400 }, { "epoch": 0.7121397574273951, "eval_chemistry_entropy": 0.8015456414222717, "eval_chemistry_loss": 0.782933235168457, "eval_chemistry_mean_token_accuracy": 0.7955212960243225, "eval_chemistry_num_tokens": 10709549.0, "eval_chemistry_runtime": 34.5635, "eval_chemistry_samples_per_second": 14.466, "eval_chemistry_steps_per_second": 3.617, "step": 400 }, { "epoch": 0.7121397574273951, "eval_physics_entropy": 0.6612502746582031, "eval_physics_loss": 0.6346089839935303, "eval_physics_mean_token_accuracy": 0.8259402804374695, "eval_physics_num_tokens": 10709549.0, "eval_physics_runtime": 41.2249, "eval_physics_samples_per_second": 12.129, "eval_physics_steps_per_second": 3.032, "step": 400 }, { "entropy": 0.7536357572302222, "epoch": 0.72994325136308, "grad_norm": 2.03125, "learning_rate": 8.18e-06, "loss": 0.7226, "mean_token_accuracy": 0.8046278320252895, "num_tokens": 10976011.0, "step": 410 }, { "entropy": 0.7753360515460372, "epoch": 0.7477467452987648, "grad_norm": 1.8203125, "learning_rate": 8.380000000000001e-06, "loss": 0.7386, "mean_token_accuracy": 0.8000223852694035, "num_tokens": 11245251.0, "step": 420 }, { "entropy": 0.7881368083879352, "epoch": 0.7655502392344498, "grad_norm": 2.140625, "learning_rate": 8.580000000000001e-06, "loss": 0.749, "mean_token_accuracy": 0.7968206465244293, "num_tokens": 11518514.0, "step": 430 }, { "entropy": 0.7809985620900989, "epoch": 0.7833537331701347, "grad_norm": 2.140625, "learning_rate": 8.78e-06, "loss": 0.7485, "mean_token_accuracy": 0.7970930390059948, "num_tokens": 11787117.0, "step": 440 }, { "entropy": 0.7746961690485478, "epoch": 0.8011572271058195, "grad_norm": 2.109375, "learning_rate": 8.98e-06, "loss": 0.7393, "mean_token_accuracy": 0.8000770397484303, "num_tokens": 12050993.0, "step": 450 }, { "entropy": 0.7727582400664688, "epoch": 0.8189607210415044, "grad_norm": 2.046875, "learning_rate": 9.180000000000002e-06, "loss": 0.7349, "mean_token_accuracy": 0.7991833999752999, "num_tokens": 12311688.0, "step": 460 }, { "entropy": 0.7381363824009896, "epoch": 0.8367642149771892, "grad_norm": 1.96875, "learning_rate": 9.38e-06, "loss": 0.7018, "mean_token_accuracy": 0.8056569200009107, "num_tokens": 12598408.0, "step": 470 }, { "entropy": 0.7458288084715605, "epoch": 0.8545677089128741, "grad_norm": 1.765625, "learning_rate": 9.58e-06, "loss": 0.7163, "mean_token_accuracy": 0.8052155710756779, "num_tokens": 12869642.0, "step": 480 }, { "entropy": 0.7388540361076593, "epoch": 0.8723712028485591, "grad_norm": 1.7578125, "learning_rate": 9.780000000000001e-06, "loss": 0.702, "mean_token_accuracy": 0.8063301011919976, "num_tokens": 13139555.0, "step": 490 }, { "entropy": 0.7744078584015369, "epoch": 0.8901746967842439, "grad_norm": 1.921875, "learning_rate": 9.980000000000001e-06, "loss": 0.7401, "mean_token_accuracy": 0.7977699730545282, "num_tokens": 13396545.0, "step": 500 }, { "epoch": 0.8901746967842439, "eval_chemistry_entropy": 0.7673972373008728, "eval_chemistry_loss": 0.7539037466049194, "eval_chemistry_mean_token_accuracy": 0.7999647192955017, "eval_chemistry_num_tokens": 13396545.0, "eval_chemistry_runtime": 34.5998, "eval_chemistry_samples_per_second": 14.451, "eval_chemistry_steps_per_second": 3.613, "step": 500 }, { "epoch": 0.8901746967842439, "eval_physics_entropy": 0.6430810136795044, "eval_physics_loss": 0.6303399205207825, "eval_physics_mean_token_accuracy": 0.8259163227081299, "eval_physics_num_tokens": 13396545.0, "eval_physics_runtime": 41.284, "eval_physics_samples_per_second": 12.111, "eval_physics_steps_per_second": 3.028, "step": 500 }, { "entropy": 0.7333443723618984, "epoch": 0.9079781907199288, "grad_norm": 1.625, "learning_rate": 1.018e-05, "loss": 0.6952, "mean_token_accuracy": 0.8094376768916846, "num_tokens": 13667857.0, "step": 510 }, { "entropy": 0.7600304771214723, "epoch": 0.9257816846556136, "grad_norm": 1.921875, "learning_rate": 1.038e-05, "loss": 0.7179, "mean_token_accuracy": 0.8037680003792047, "num_tokens": 13932771.0, "step": 520 }, { "entropy": 0.7434984143823385, "epoch": 0.9435851785912985, "grad_norm": 1.7578125, "learning_rate": 1.0580000000000002e-05, "loss": 0.7058, "mean_token_accuracy": 0.8062188472598791, "num_tokens": 14194632.0, "step": 530 }, { "entropy": 0.7611551772803068, "epoch": 0.9613886725269835, "grad_norm": 1.8203125, "learning_rate": 1.0780000000000002e-05, "loss": 0.7268, "mean_token_accuracy": 0.8000180393457412, "num_tokens": 14457650.0, "step": 540 }, { "entropy": 0.7252845833078027, "epoch": 0.9791921664626683, "grad_norm": 1.796875, "learning_rate": 1.0980000000000002e-05, "loss": 0.6969, "mean_token_accuracy": 0.8071364350616932, "num_tokens": 14724967.0, "step": 550 }, { "entropy": 0.7578858522698283, "epoch": 0.9969956603983532, "grad_norm": 1.8671875, "learning_rate": 1.1180000000000001e-05, "loss": 0.7274, "mean_token_accuracy": 0.8026934862136841, "num_tokens": 14993272.0, "step": 560 }, { "entropy": 0.6899186813062237, "epoch": 1.014242795148548, "grad_norm": 1.6484375, "learning_rate": 1.138e-05, "loss": 0.6595, "mean_token_accuracy": 0.8154077602970985, "num_tokens": 15254936.0, "step": 570 }, { "entropy": 0.7271509811282157, "epoch": 1.0320462890842328, "grad_norm": 2.03125, "learning_rate": 1.1580000000000001e-05, "loss": 0.6948, "mean_token_accuracy": 0.804929868131876, "num_tokens": 15515007.0, "step": 580 }, { "entropy": 0.6891195146366954, "epoch": 1.0498497830199176, "grad_norm": 1.875, "learning_rate": 1.178e-05, "loss": 0.6619, "mean_token_accuracy": 0.8155979864299298, "num_tokens": 15794203.0, "step": 590 }, { "entropy": 0.7109181759878993, "epoch": 1.0676532769556026, "grad_norm": 1.828125, "learning_rate": 1.198e-05, "loss": 0.6773, "mean_token_accuracy": 0.8111642371863127, "num_tokens": 16063664.0, "step": 600 }, { "epoch": 1.0676532769556026, "eval_chemistry_entropy": 0.7275329558849335, "eval_chemistry_loss": 0.7331510186195374, "eval_chemistry_mean_token_accuracy": 0.8037505540847778, "eval_chemistry_num_tokens": 16063664.0, "eval_chemistry_runtime": 34.6124, "eval_chemistry_samples_per_second": 14.446, "eval_chemistry_steps_per_second": 3.611, "step": 600 }, { "epoch": 1.0676532769556026, "eval_physics_entropy": 0.6278075368404389, "eval_physics_loss": 0.6282901763916016, "eval_physics_mean_token_accuracy": 0.825420051574707, "eval_physics_num_tokens": 16063664.0, "eval_physics_runtime": 41.2841, "eval_physics_samples_per_second": 12.111, "eval_physics_steps_per_second": 3.028, "step": 600 }, { "entropy": 0.6886052008718252, "epoch": 1.0854567708912874, "grad_norm": 1.9296875, "learning_rate": 1.218e-05, "loss": 0.664, "mean_token_accuracy": 0.8155276246368885, "num_tokens": 16338303.0, "step": 610 }, { "entropy": 0.7301610294729471, "epoch": 1.1032602648269723, "grad_norm": 2.03125, "learning_rate": 1.2380000000000002e-05, "loss": 0.69, "mean_token_accuracy": 0.8072479218244553, "num_tokens": 16606302.0, "step": 620 }, { "entropy": 0.7009955205023288, "epoch": 1.121063758762657, "grad_norm": 1.921875, "learning_rate": 1.2580000000000002e-05, "loss": 0.6662, "mean_token_accuracy": 0.8098525639623404, "num_tokens": 16877071.0, "step": 630 }, { "entropy": 0.6957393456250429, "epoch": 1.138867252698342, "grad_norm": 1.7421875, "learning_rate": 1.2780000000000001e-05, "loss": 0.6634, "mean_token_accuracy": 0.8121555853635073, "num_tokens": 17145775.0, "step": 640 }, { "entropy": 0.7081588903442025, "epoch": 1.156670746634027, "grad_norm": 1.828125, "learning_rate": 1.2980000000000001e-05, "loss": 0.6716, "mean_token_accuracy": 0.8104519549757242, "num_tokens": 17410775.0, "step": 650 }, { "entropy": 0.7330411653965712, "epoch": 1.1744742405697117, "grad_norm": 2.015625, "learning_rate": 1.3180000000000001e-05, "loss": 0.6949, "mean_token_accuracy": 0.8045349054038524, "num_tokens": 17677791.0, "step": 660 }, { "entropy": 0.7073222378268837, "epoch": 1.1922777345053968, "grad_norm": 1.8046875, "learning_rate": 1.3380000000000002e-05, "loss": 0.666, "mean_token_accuracy": 0.8106517169624567, "num_tokens": 17949441.0, "step": 670 }, { "entropy": 0.697087805531919, "epoch": 1.2100812284410816, "grad_norm": 1.8203125, "learning_rate": 1.3580000000000002e-05, "loss": 0.6644, "mean_token_accuracy": 0.8117371387779713, "num_tokens": 18217742.0, "step": 680 }, { "entropy": 0.7058945974335075, "epoch": 1.2278847223767664, "grad_norm": 1.9765625, "learning_rate": 1.378e-05, "loss": 0.6693, "mean_token_accuracy": 0.8107333112508058, "num_tokens": 18487769.0, "step": 690 }, { "entropy": 0.6924632461741567, "epoch": 1.2456882163124514, "grad_norm": 1.859375, "learning_rate": 1.398e-05, "loss": 0.6628, "mean_token_accuracy": 0.811711684986949, "num_tokens": 18760296.0, "step": 700 }, { "epoch": 1.2456882163124514, "eval_chemistry_entropy": 0.7344678246974945, "eval_chemistry_loss": 0.7194931507110596, "eval_chemistry_mean_token_accuracy": 0.8056631999015809, "eval_chemistry_num_tokens": 18760296.0, "eval_chemistry_runtime": 34.6183, "eval_chemistry_samples_per_second": 14.443, "eval_chemistry_steps_per_second": 3.611, "step": 700 }, { "epoch": 1.2456882163124514, "eval_physics_entropy": 0.6411541821956634, "eval_physics_loss": 0.6283924579620361, "eval_physics_mean_token_accuracy": 0.8252067227363586, "eval_physics_num_tokens": 18760296.0, "eval_physics_runtime": 41.2883, "eval_physics_samples_per_second": 12.11, "eval_physics_steps_per_second": 3.027, "step": 700 }, { "entropy": 0.6927230576053262, "epoch": 1.2634917102481362, "grad_norm": 1.6796875, "learning_rate": 1.418e-05, "loss": 0.6577, "mean_token_accuracy": 0.8144037745893001, "num_tokens": 19035733.0, "step": 710 }, { "entropy": 0.7126594386994839, "epoch": 1.281295204183821, "grad_norm": 1.78125, "learning_rate": 1.4380000000000001e-05, "loss": 0.681, "mean_token_accuracy": 0.80786834359169, "num_tokens": 19311432.0, "step": 720 }, { "entropy": 0.6811089560389518, "epoch": 1.299098698119506, "grad_norm": 1.6640625, "learning_rate": 1.4580000000000001e-05, "loss": 0.65, "mean_token_accuracy": 0.8166949838399887, "num_tokens": 19593016.0, "step": 730 }, { "entropy": 0.6871400082483887, "epoch": 1.3169021920551909, "grad_norm": 1.8359375, "learning_rate": 1.478e-05, "loss": 0.6507, "mean_token_accuracy": 0.8143899407237768, "num_tokens": 19860476.0, "step": 740 }, { "entropy": 0.6881214780732989, "epoch": 1.3347056859908757, "grad_norm": 1.9375, "learning_rate": 1.498e-05, "loss": 0.6517, "mean_token_accuracy": 0.8132891681045293, "num_tokens": 20124436.0, "step": 750 }, { "entropy": 0.6930879963561892, "epoch": 1.3525091799265607, "grad_norm": 1.7734375, "learning_rate": 1.5180000000000002e-05, "loss": 0.6664, "mean_token_accuracy": 0.8128124658018351, "num_tokens": 20388027.0, "step": 760 }, { "entropy": 0.6938662787899375, "epoch": 1.3703126738622455, "grad_norm": 1.8828125, "learning_rate": 1.5380000000000002e-05, "loss": 0.6592, "mean_token_accuracy": 0.812091863155365, "num_tokens": 20656076.0, "step": 770 }, { "entropy": 0.7249738791957497, "epoch": 1.3881161677979303, "grad_norm": 1.875, "learning_rate": 1.5580000000000003e-05, "loss": 0.6947, "mean_token_accuracy": 0.8059011038392783, "num_tokens": 20911633.0, "step": 780 }, { "entropy": 0.7112141275778413, "epoch": 1.4059196617336152, "grad_norm": 1.6640625, "learning_rate": 1.578e-05, "loss": 0.6839, "mean_token_accuracy": 0.8064889151602983, "num_tokens": 21176227.0, "step": 790 }, { "entropy": 0.6923007164150476, "epoch": 1.4237231556693, "grad_norm": 1.8828125, "learning_rate": 1.5980000000000003e-05, "loss": 0.6619, "mean_token_accuracy": 0.8126598935574293, "num_tokens": 21445242.0, "step": 800 }, { "epoch": 1.4237231556693, "eval_chemistry_entropy": 0.7049469864368438, "eval_chemistry_loss": 0.7038090229034424, "eval_chemistry_mean_token_accuracy": 0.8079238352775574, "eval_chemistry_num_tokens": 21445242.0, "eval_chemistry_runtime": 34.5889, "eval_chemistry_samples_per_second": 14.456, "eval_chemistry_steps_per_second": 3.614, "step": 800 }, { "epoch": 1.4237231556693, "eval_physics_entropy": 0.6302921390533447, "eval_physics_loss": 0.6273708343505859, "eval_physics_mean_token_accuracy": 0.824629822731018, "eval_physics_num_tokens": 21445242.0, "eval_physics_runtime": 41.2241, "eval_physics_samples_per_second": 12.129, "eval_physics_steps_per_second": 3.032, "step": 800 }, { "entropy": 0.6938302194699645, "epoch": 1.441526649604985, "grad_norm": 1.9765625, "learning_rate": 1.618e-05, "loss": 0.6643, "mean_token_accuracy": 0.8114070001989603, "num_tokens": 21718455.0, "step": 810 }, { "entropy": 0.6958711674436927, "epoch": 1.4593301435406698, "grad_norm": 1.8203125, "learning_rate": 1.638e-05, "loss": 0.6619, "mean_token_accuracy": 0.8139785166829825, "num_tokens": 21974076.0, "step": 820 }, { "entropy": 0.6980619359761476, "epoch": 1.4771336374763546, "grad_norm": 1.828125, "learning_rate": 1.658e-05, "loss": 0.6687, "mean_token_accuracy": 0.8110807090997696, "num_tokens": 22241599.0, "step": 830 }, { "entropy": 0.6977157359942794, "epoch": 1.4949371314120397, "grad_norm": 1.9921875, "learning_rate": 1.6780000000000002e-05, "loss": 0.6706, "mean_token_accuracy": 0.8098580442368984, "num_tokens": 22510725.0, "step": 840 }, { "entropy": 0.6765838697552681, "epoch": 1.5127406253477245, "grad_norm": 1.640625, "learning_rate": 1.698e-05, "loss": 0.6549, "mean_token_accuracy": 0.8153976205736398, "num_tokens": 22778514.0, "step": 850 }, { "entropy": 0.6760151350870729, "epoch": 1.5305441192834093, "grad_norm": 1.859375, "learning_rate": 1.718e-05, "loss": 0.6375, "mean_token_accuracy": 0.8156527172774076, "num_tokens": 23051230.0, "step": 860 }, { "entropy": 0.6908003646880388, "epoch": 1.5483476132190943, "grad_norm": 1.78125, "learning_rate": 1.7380000000000003e-05, "loss": 0.658, "mean_token_accuracy": 0.8122621558606624, "num_tokens": 23306327.0, "step": 870 }, { "entropy": 0.6964091071859002, "epoch": 1.5661511071547791, "grad_norm": 1.7578125, "learning_rate": 1.758e-05, "loss": 0.6597, "mean_token_accuracy": 0.811262060701847, "num_tokens": 23573127.0, "step": 880 }, { "entropy": 0.6847964959219098, "epoch": 1.583954601090464, "grad_norm": 1.5703125, "learning_rate": 1.7780000000000003e-05, "loss": 0.6532, "mean_token_accuracy": 0.8122437499463558, "num_tokens": 23841914.0, "step": 890 }, { "entropy": 0.6890485780313611, "epoch": 1.601758095026149, "grad_norm": 1.828125, "learning_rate": 1.798e-05, "loss": 0.6587, "mean_token_accuracy": 0.8129192098975182, "num_tokens": 24105114.0, "step": 900 }, { "epoch": 1.601758095026149, "eval_chemistry_entropy": 0.6863252129554749, "eval_chemistry_loss": 0.6930021643638611, "eval_chemistry_mean_token_accuracy": 0.8097150478363037, "eval_chemistry_num_tokens": 24105114.0, "eval_chemistry_runtime": 34.6005, "eval_chemistry_samples_per_second": 14.451, "eval_chemistry_steps_per_second": 3.613, "step": 900 }, { "epoch": 1.601758095026149, "eval_physics_entropy": 0.6244586195945739, "eval_physics_loss": 0.6287086606025696, "eval_physics_mean_token_accuracy": 0.8245933861732483, "eval_physics_num_tokens": 24105114.0, "eval_physics_runtime": 41.2389, "eval_physics_samples_per_second": 12.124, "eval_physics_steps_per_second": 3.031, "step": 900 }, { "entropy": 0.6585973870009184, "epoch": 1.6195615889618338, "grad_norm": 1.5703125, "learning_rate": 1.8180000000000002e-05, "loss": 0.6354, "mean_token_accuracy": 0.8186014011502266, "num_tokens": 24384008.0, "step": 910 }, { "entropy": 0.693634419888258, "epoch": 1.6373650828975186, "grad_norm": 1.78125, "learning_rate": 1.8380000000000004e-05, "loss": 0.6526, "mean_token_accuracy": 0.812073552981019, "num_tokens": 24638205.0, "step": 920 }, { "entropy": 0.6443359967321157, "epoch": 1.6551685768332036, "grad_norm": 1.640625, "learning_rate": 1.858e-05, "loss": 0.6138, "mean_token_accuracy": 0.8221632152795791, "num_tokens": 24910104.0, "step": 930 }, { "entropy": 0.6681255014613271, "epoch": 1.6729720707688884, "grad_norm": 1.8828125, "learning_rate": 1.878e-05, "loss": 0.6349, "mean_token_accuracy": 0.8162749476730824, "num_tokens": 25181306.0, "step": 940 }, { "entropy": 0.676227905228734, "epoch": 1.6907755647045732, "grad_norm": 1.75, "learning_rate": 1.898e-05, "loss": 0.6452, "mean_token_accuracy": 0.8150841273367405, "num_tokens": 25449138.0, "step": 950 }, { "entropy": 0.6844975538551807, "epoch": 1.7085790586402583, "grad_norm": 1.703125, "learning_rate": 1.918e-05, "loss": 0.6493, "mean_token_accuracy": 0.8142464257776737, "num_tokens": 25709030.0, "step": 960 }, { "entropy": 0.684933065995574, "epoch": 1.7263825525759429, "grad_norm": 1.828125, "learning_rate": 1.938e-05, "loss": 0.6539, "mean_token_accuracy": 0.8128440115600825, "num_tokens": 25972807.0, "step": 970 }, { "entropy": 0.6664420425891876, "epoch": 1.744186046511628, "grad_norm": 1.5234375, "learning_rate": 1.9580000000000002e-05, "loss": 0.632, "mean_token_accuracy": 0.8175933144986629, "num_tokens": 26244955.0, "step": 980 }, { "entropy": 0.639144916087389, "epoch": 1.761989540447313, "grad_norm": 1.515625, "learning_rate": 1.978e-05, "loss": 0.6137, "mean_token_accuracy": 0.8227979384362698, "num_tokens": 26515548.0, "step": 990 }, { "entropy": 0.6666794860735535, "epoch": 1.7797930343829975, "grad_norm": 1.578125, "learning_rate": 1.9980000000000002e-05, "loss": 0.6268, "mean_token_accuracy": 0.8173129990696907, "num_tokens": 26782376.0, "step": 1000 }, { "epoch": 1.7797930343829975, "eval_chemistry_entropy": 0.6736977963447571, "eval_chemistry_loss": 0.6848514676094055, "eval_chemistry_mean_token_accuracy": 0.8112705006599427, "eval_chemistry_num_tokens": 26782376.0, "eval_chemistry_runtime": 34.6023, "eval_chemistry_samples_per_second": 14.45, "eval_chemistry_steps_per_second": 3.612, "step": 1000 }, { "epoch": 1.7797930343829975, "eval_physics_entropy": 0.6228616139888763, "eval_physics_loss": 0.6303527355194092, "eval_physics_mean_token_accuracy": 0.8230352792739868, "eval_physics_num_tokens": 26782376.0, "eval_physics_runtime": 40.9503, "eval_physics_samples_per_second": 12.21, "eval_physics_steps_per_second": 3.052, "step": 1000 }, { "entropy": 0.6698539135977626, "epoch": 1.7975965283186826, "grad_norm": 1.6796875, "learning_rate": 1.9980000000000002e-05, "loss": 0.64, "mean_token_accuracy": 0.815592122823, "num_tokens": 27049648.0, "step": 1010 }, { "entropy": 0.6951070323586463, "epoch": 1.8154000222543676, "grad_norm": 1.6796875, "learning_rate": 1.995777777777778e-05, "loss": 0.6636, "mean_token_accuracy": 0.810206351801753, "num_tokens": 27315061.0, "step": 1020 }, { "entropy": 0.6561093555763364, "epoch": 1.8332035161900522, "grad_norm": 1.65625, "learning_rate": 1.9935555555555557e-05, "loss": 0.6193, "mean_token_accuracy": 0.8209650807082653, "num_tokens": 27587650.0, "step": 1030 }, { "entropy": 0.673011284135282, "epoch": 1.8510070101257372, "grad_norm": 1.4609375, "learning_rate": 1.9913333333333335e-05, "loss": 0.6476, "mean_token_accuracy": 0.8167991150170565, "num_tokens": 27855458.0, "step": 1040 }, { "entropy": 0.6599440133199096, "epoch": 1.868810504061422, "grad_norm": 1.578125, "learning_rate": 1.9891111111111112e-05, "loss": 0.6309, "mean_token_accuracy": 0.82068553827703, "num_tokens": 28127703.0, "step": 1050 }, { "entropy": 0.6387615172192455, "epoch": 1.8866139979971068, "grad_norm": 1.375, "learning_rate": 1.986888888888889e-05, "loss": 0.6027, "mean_token_accuracy": 0.8241243101656437, "num_tokens": 28395128.0, "step": 1060 }, { "entropy": 0.661963646300137, "epoch": 1.9044174919327919, "grad_norm": 1.5703125, "learning_rate": 1.9846666666666668e-05, "loss": 0.6311, "mean_token_accuracy": 0.817503708600998, "num_tokens": 28662344.0, "step": 1070 }, { "entropy": 0.6643290909007191, "epoch": 1.9222209858684767, "grad_norm": 1.6953125, "learning_rate": 1.9824444444444445e-05, "loss": 0.6355, "mean_token_accuracy": 0.8175244953483343, "num_tokens": 28928769.0, "step": 1080 }, { "entropy": 0.663678290694952, "epoch": 1.9400244798041615, "grad_norm": 1.8359375, "learning_rate": 1.9802222222222226e-05, "loss": 0.6318, "mean_token_accuracy": 0.8165966872125864, "num_tokens": 29186214.0, "step": 1090 }, { "entropy": 0.6729184232652188, "epoch": 1.9578279737398465, "grad_norm": 1.5078125, "learning_rate": 1.978e-05, "loss": 0.6437, "mean_token_accuracy": 0.8148789752274752, "num_tokens": 29446445.0, "step": 1100 }, { "epoch": 1.9578279737398465, "eval_chemistry_entropy": 0.667749724149704, "eval_chemistry_loss": 0.670352041721344, "eval_chemistry_mean_token_accuracy": 0.8144329328536988, "eval_chemistry_num_tokens": 29446445.0, "eval_chemistry_runtime": 34.5967, "eval_chemistry_samples_per_second": 14.452, "eval_chemistry_steps_per_second": 3.613, "step": 1100 }, { "epoch": 1.9578279737398465, "eval_physics_entropy": 0.6239387395381928, "eval_physics_loss": 0.6269068121910095, "eval_physics_mean_token_accuracy": 0.8240941405296326, "eval_physics_num_tokens": 29446445.0, "eval_physics_runtime": 41.2535, "eval_physics_samples_per_second": 12.12, "eval_physics_steps_per_second": 3.03, "step": 1100 }, { "entropy": 0.6582659302279353, "epoch": 1.9756314676755313, "grad_norm": 1.5234375, "learning_rate": 1.975777777777778e-05, "loss": 0.6207, "mean_token_accuracy": 0.8203308090567589, "num_tokens": 29711771.0, "step": 1110 }, { "entropy": 0.6604506926611066, "epoch": 1.9934349616112161, "grad_norm": 1.65625, "learning_rate": 1.9735555555555556e-05, "loss": 0.6317, "mean_token_accuracy": 0.8193308029323816, "num_tokens": 29979994.0, "step": 1120 }, { "entropy": 0.604119231431715, "epoch": 2.010682096361411, "grad_norm": 1.6953125, "learning_rate": 1.9713333333333337e-05, "loss": 0.5701, "mean_token_accuracy": 0.83187814950943, "num_tokens": 30240793.0, "step": 1130 }, { "entropy": 0.5444270031526685, "epoch": 2.028485590297096, "grad_norm": 1.671875, "learning_rate": 1.969111111111111e-05, "loss": 0.5188, "mean_token_accuracy": 0.8410136397928, "num_tokens": 30503988.0, "step": 1140 }, { "entropy": 0.5468037134036422, "epoch": 2.0462890842327806, "grad_norm": 1.8125, "learning_rate": 1.9668888888888892e-05, "loss": 0.5163, "mean_token_accuracy": 0.8412827380001545, "num_tokens": 30771841.0, "step": 1150 }, { "entropy": 0.5360686406493187, "epoch": 2.0640925781684656, "grad_norm": 1.8203125, "learning_rate": 1.9646666666666666e-05, "loss": 0.5083, "mean_token_accuracy": 0.8440209306776524, "num_tokens": 31045037.0, "step": 1160 }, { "entropy": 0.5755055965855718, "epoch": 2.0818960721041506, "grad_norm": 1.7890625, "learning_rate": 1.9624444444444447e-05, "loss": 0.5422, "mean_token_accuracy": 0.8334469784051179, "num_tokens": 31299575.0, "step": 1170 }, { "entropy": 0.5716140467673541, "epoch": 2.099699566039835, "grad_norm": 1.8671875, "learning_rate": 1.9602222222222225e-05, "loss": 0.5423, "mean_token_accuracy": 0.8336410760879517, "num_tokens": 31565963.0, "step": 1180 }, { "entropy": 0.5382599098607898, "epoch": 2.1175030599755202, "grad_norm": 1.6484375, "learning_rate": 1.9580000000000002e-05, "loss": 0.5168, "mean_token_accuracy": 0.8428799293935298, "num_tokens": 31834212.0, "step": 1190 }, { "entropy": 0.5634422322735191, "epoch": 2.1353065539112053, "grad_norm": 1.6953125, "learning_rate": 1.955777777777778e-05, "loss": 0.5327, "mean_token_accuracy": 0.8359647855162621, "num_tokens": 32103356.0, "step": 1200 }, { "epoch": 2.1353065539112053, "eval_chemistry_entropy": 0.5813326196670532, "eval_chemistry_loss": 0.6811516284942627, "eval_chemistry_mean_token_accuracy": 0.8141688752174377, "eval_chemistry_num_tokens": 32103356.0, "eval_chemistry_runtime": 34.6182, "eval_chemistry_samples_per_second": 14.443, "eval_chemistry_steps_per_second": 3.611, "step": 1200 }, { "epoch": 2.1353065539112053, "eval_physics_entropy": 0.5421837236881256, "eval_physics_loss": 0.6482248306274414, "eval_physics_mean_token_accuracy": 0.8212747263908386, "eval_physics_num_tokens": 32103356.0, "eval_physics_runtime": 41.2474, "eval_physics_samples_per_second": 12.122, "eval_physics_steps_per_second": 3.03, "step": 1200 }, { "entropy": 0.5539132442325354, "epoch": 2.15311004784689, "grad_norm": 1.6171875, "learning_rate": 1.9535555555555557e-05, "loss": 0.519, "mean_token_accuracy": 0.8395270679146052, "num_tokens": 32374830.0, "step": 1210 }, { "entropy": 0.5340574728325009, "epoch": 2.170913541782575, "grad_norm": 1.671875, "learning_rate": 1.9513333333333335e-05, "loss": 0.5083, "mean_token_accuracy": 0.8442022494971753, "num_tokens": 32643479.0, "step": 1220 }, { "entropy": 0.540911185927689, "epoch": 2.18871703571826, "grad_norm": 1.6953125, "learning_rate": 1.9491111111111113e-05, "loss": 0.5126, "mean_token_accuracy": 0.8427833508700132, "num_tokens": 32908761.0, "step": 1230 }, { "entropy": 0.5624004138633609, "epoch": 2.2065205296539445, "grad_norm": 1.5625, "learning_rate": 1.946888888888889e-05, "loss": 0.5288, "mean_token_accuracy": 0.8391110297292471, "num_tokens": 33166413.0, "step": 1240 }, { "entropy": 0.5559011250734329, "epoch": 2.2243240235896296, "grad_norm": 1.53125, "learning_rate": 1.9446666666666668e-05, "loss": 0.5314, "mean_token_accuracy": 0.8375788435339928, "num_tokens": 33432686.0, "step": 1250 }, { "entropy": 0.5524005191400647, "epoch": 2.242127517525314, "grad_norm": 1.8671875, "learning_rate": 1.9424444444444446e-05, "loss": 0.5252, "mean_token_accuracy": 0.8396644528955222, "num_tokens": 33693242.0, "step": 1260 }, { "entropy": 0.5266111794859171, "epoch": 2.259931011460999, "grad_norm": 1.6953125, "learning_rate": 1.9402222222222223e-05, "loss": 0.4966, "mean_token_accuracy": 0.847085589542985, "num_tokens": 33970820.0, "step": 1270 }, { "entropy": 0.5435060489922762, "epoch": 2.277734505396684, "grad_norm": 1.75, "learning_rate": 1.938e-05, "loss": 0.5191, "mean_token_accuracy": 0.8409085746854543, "num_tokens": 34243042.0, "step": 1280 }, { "entropy": 0.5497656201943755, "epoch": 2.295537999332369, "grad_norm": 1.6875, "learning_rate": 1.935777777777778e-05, "loss": 0.5245, "mean_token_accuracy": 0.8400060147047043, "num_tokens": 34515236.0, "step": 1290 }, { "entropy": 0.5251918986439705, "epoch": 2.313341493268054, "grad_norm": 1.6484375, "learning_rate": 1.9335555555555556e-05, "loss": 0.4979, "mean_token_accuracy": 0.848014684766531, "num_tokens": 34785068.0, "step": 1300 }, { "epoch": 2.313341493268054, "eval_chemistry_entropy": 0.5687024412155152, "eval_chemistry_loss": 0.6814095377922058, "eval_chemistry_mean_token_accuracy": 0.8144371585845948, "eval_chemistry_num_tokens": 34785068.0, "eval_chemistry_runtime": 34.6075, "eval_chemistry_samples_per_second": 14.448, "eval_chemistry_steps_per_second": 3.612, "step": 1300 }, { "epoch": 2.313341493268054, "eval_physics_entropy": 0.5373544907569885, "eval_physics_loss": 0.6523818373680115, "eval_physics_mean_token_accuracy": 0.8211935892105102, "eval_physics_num_tokens": 34785068.0, "eval_physics_runtime": 41.2549, "eval_physics_samples_per_second": 12.12, "eval_physics_steps_per_second": 3.03, "step": 1300 }, { "entropy": 0.5557622062042356, "epoch": 2.331144987203739, "grad_norm": 1.4921875, "learning_rate": 1.9313333333333334e-05, "loss": 0.5275, "mean_token_accuracy": 0.8374897003173828, "num_tokens": 35055845.0, "step": 1310 }, { "entropy": 0.5468069275841116, "epoch": 2.3489484811394234, "grad_norm": 1.6015625, "learning_rate": 1.9291111111111115e-05, "loss": 0.5189, "mean_token_accuracy": 0.8414008297026158, "num_tokens": 35327481.0, "step": 1320 }, { "entropy": 0.5578970231115818, "epoch": 2.3667519750751085, "grad_norm": 1.65625, "learning_rate": 1.926888888888889e-05, "loss": 0.5327, "mean_token_accuracy": 0.837207018584013, "num_tokens": 35596362.0, "step": 1330 }, { "entropy": 0.5461308034136891, "epoch": 2.3845554690107935, "grad_norm": 1.6328125, "learning_rate": 1.924666666666667e-05, "loss": 0.5144, "mean_token_accuracy": 0.8420327302068472, "num_tokens": 35869585.0, "step": 1340 }, { "entropy": 0.5518748119473458, "epoch": 2.402358962946478, "grad_norm": 1.765625, "learning_rate": 1.9224444444444444e-05, "loss": 0.5271, "mean_token_accuracy": 0.8386039260774851, "num_tokens": 36134072.0, "step": 1350 }, { "entropy": 0.5443382592871785, "epoch": 2.420162456882163, "grad_norm": 1.6015625, "learning_rate": 1.9202222222222225e-05, "loss": 0.5177, "mean_token_accuracy": 0.8425142876803875, "num_tokens": 36406263.0, "step": 1360 }, { "entropy": 0.5503683429211378, "epoch": 2.437965950817848, "grad_norm": 1.71875, "learning_rate": 1.918e-05, "loss": 0.5193, "mean_token_accuracy": 0.8414091795682908, "num_tokens": 36676822.0, "step": 1370 }, { "entropy": 0.5444278137758374, "epoch": 2.4557694447535328, "grad_norm": 1.8515625, "learning_rate": 1.915777777777778e-05, "loss": 0.5187, "mean_token_accuracy": 0.841403117403388, "num_tokens": 36942837.0, "step": 1380 }, { "entropy": 0.5489616293460131, "epoch": 2.473572938689218, "grad_norm": 1.8046875, "learning_rate": 1.9135555555555555e-05, "loss": 0.523, "mean_token_accuracy": 0.8393812999129295, "num_tokens": 37210791.0, "step": 1390 }, { "entropy": 0.5314275240525603, "epoch": 2.491376432624903, "grad_norm": 1.7265625, "learning_rate": 1.9113333333333336e-05, "loss": 0.5119, "mean_token_accuracy": 0.8442874882370234, "num_tokens": 37485822.0, "step": 1400 }, { "epoch": 2.491376432624903, "eval_chemistry_entropy": 0.5652910959720612, "eval_chemistry_loss": 0.6705389618873596, "eval_chemistry_mean_token_accuracy": 0.8175577726364136, "eval_chemistry_num_tokens": 37485822.0, "eval_chemistry_runtime": 34.6247, "eval_chemistry_samples_per_second": 14.441, "eval_chemistry_steps_per_second": 3.61, "step": 1400 }, { "epoch": 2.491376432624903, "eval_physics_entropy": 0.5328873710632325, "eval_physics_loss": 0.6484245657920837, "eval_physics_mean_token_accuracy": 0.8219989495277404, "eval_physics_num_tokens": 37485822.0, "eval_physics_runtime": 41.2499, "eval_physics_samples_per_second": 12.121, "eval_physics_steps_per_second": 3.03, "step": 1400 }, { "entropy": 0.5436741337180138, "epoch": 2.5091799265605874, "grad_norm": 1.65625, "learning_rate": 1.9091111111111113e-05, "loss": 0.5135, "mean_token_accuracy": 0.8424118865281344, "num_tokens": 37746150.0, "step": 1410 }, { "entropy": 0.5462065914645791, "epoch": 2.5269834204962724, "grad_norm": 1.609375, "learning_rate": 1.906888888888889e-05, "loss": 0.5158, "mean_token_accuracy": 0.8402331713587046, "num_tokens": 38016863.0, "step": 1420 }, { "entropy": 0.56711411960423, "epoch": 2.544786914431957, "grad_norm": 1.9765625, "learning_rate": 1.904666666666667e-05, "loss": 0.5349, "mean_token_accuracy": 0.8344442717730999, "num_tokens": 38277754.0, "step": 1430 }, { "entropy": 0.5571039380505681, "epoch": 2.562590408367642, "grad_norm": 1.734375, "learning_rate": 1.9024444444444446e-05, "loss": 0.529, "mean_token_accuracy": 0.8378622565418482, "num_tokens": 38542761.0, "step": 1440 }, { "entropy": 0.5426763394847512, "epoch": 2.580393902303327, "grad_norm": 1.796875, "learning_rate": 1.9002222222222224e-05, "loss": 0.5159, "mean_token_accuracy": 0.8411751441657543, "num_tokens": 38820999.0, "step": 1450 }, { "entropy": 0.5453944187611341, "epoch": 2.598197396239012, "grad_norm": 1.6953125, "learning_rate": 1.898e-05, "loss": 0.5238, "mean_token_accuracy": 0.8403888408094644, "num_tokens": 39080016.0, "step": 1460 }, { "entropy": 0.5451987128704786, "epoch": 2.6160008901746967, "grad_norm": 1.7734375, "learning_rate": 1.895777777777778e-05, "loss": 0.5123, "mean_token_accuracy": 0.8426350578665733, "num_tokens": 39345434.0, "step": 1470 }, { "entropy": 0.5478348048403859, "epoch": 2.6338043841103818, "grad_norm": 1.78125, "learning_rate": 1.8935555555555556e-05, "loss": 0.5212, "mean_token_accuracy": 0.8402178969234229, "num_tokens": 39610041.0, "step": 1480 }, { "entropy": 0.5249082477763295, "epoch": 2.6516078780460663, "grad_norm": 1.7890625, "learning_rate": 1.8913333333333334e-05, "loss": 0.4993, "mean_token_accuracy": 0.8461574167013168, "num_tokens": 39884185.0, "step": 1490 }, { "entropy": 0.5606990542262793, "epoch": 2.6694113719817514, "grad_norm": 1.953125, "learning_rate": 1.8891111111111115e-05, "loss": 0.533, "mean_token_accuracy": 0.8361463028937578, "num_tokens": 40143543.0, "step": 1500 }, { "epoch": 2.6694113719817514, "eval_chemistry_entropy": 0.5683894844055176, "eval_chemistry_loss": 0.6641016602516174, "eval_chemistry_mean_token_accuracy": 0.817899118423462, "eval_chemistry_num_tokens": 40143543.0, "eval_chemistry_runtime": 34.613, "eval_chemistry_samples_per_second": 14.445, "eval_chemistry_steps_per_second": 3.611, "step": 1500 }, { "epoch": 2.6694113719817514, "eval_physics_entropy": 0.5443970229625702, "eval_physics_loss": 0.6480051279067993, "eval_physics_mean_token_accuracy": 0.8217208013534546, "eval_physics_num_tokens": 40143543.0, "eval_physics_runtime": 40.9767, "eval_physics_samples_per_second": 12.202, "eval_physics_steps_per_second": 3.051, "step": 1500 }, { "entropy": 0.5430334324017168, "epoch": 2.6872148659174364, "grad_norm": 1.7421875, "learning_rate": 1.886888888888889e-05, "loss": 0.5152, "mean_token_accuracy": 0.8425797671079636, "num_tokens": 40406812.0, "step": 1510 }, { "entropy": 0.5591320911422372, "epoch": 2.7050183598531214, "grad_norm": 1.8359375, "learning_rate": 1.884666666666667e-05, "loss": 0.5348, "mean_token_accuracy": 0.8362568471580744, "num_tokens": 40668543.0, "step": 1520 }, { "entropy": 0.5530628427863121, "epoch": 2.722821853788806, "grad_norm": 1.6953125, "learning_rate": 1.8824444444444445e-05, "loss": 0.518, "mean_token_accuracy": 0.8411050379276276, "num_tokens": 40942603.0, "step": 1530 }, { "entropy": 0.556857710890472, "epoch": 2.740625347724491, "grad_norm": 1.6796875, "learning_rate": 1.8802222222222226e-05, "loss": 0.5243, "mean_token_accuracy": 0.838795681297779, "num_tokens": 41208093.0, "step": 1540 }, { "entropy": 0.5422420864924788, "epoch": 2.7584288416601757, "grad_norm": 1.609375, "learning_rate": 1.878e-05, "loss": 0.5204, "mean_token_accuracy": 0.8388970494270325, "num_tokens": 41477562.0, "step": 1550 }, { "entropy": 0.5319694015197456, "epoch": 2.7762323355958607, "grad_norm": 1.8359375, "learning_rate": 1.875777777777778e-05, "loss": 0.5078, "mean_token_accuracy": 0.8438673295080662, "num_tokens": 41746571.0, "step": 1560 }, { "entropy": 0.5427202990278601, "epoch": 2.7940358295315457, "grad_norm": 1.703125, "learning_rate": 1.873555555555556e-05, "loss": 0.5141, "mean_token_accuracy": 0.842416912689805, "num_tokens": 42011633.0, "step": 1570 }, { "entropy": 0.5518008042126894, "epoch": 2.8118393234672303, "grad_norm": 1.546875, "learning_rate": 1.8713333333333336e-05, "loss": 0.5238, "mean_token_accuracy": 0.8391041114926339, "num_tokens": 42283087.0, "step": 1580 }, { "entropy": 0.5269314305856824, "epoch": 2.8296428174029153, "grad_norm": 1.4375, "learning_rate": 1.8691111111111114e-05, "loss": 0.5, "mean_token_accuracy": 0.8462934419512749, "num_tokens": 42561534.0, "step": 1590 }, { "entropy": 0.54100974611938, "epoch": 2.8474463113386, "grad_norm": 1.6328125, "learning_rate": 1.866888888888889e-05, "loss": 0.5206, "mean_token_accuracy": 0.8429192952811718, "num_tokens": 42834266.0, "step": 1600 }, { "epoch": 2.8474463113386, "eval_chemistry_entropy": 0.5678292090892791, "eval_chemistry_loss": 0.6582387089729309, "eval_chemistry_mean_token_accuracy": 0.8187746982574463, "eval_chemistry_num_tokens": 42834266.0, "eval_chemistry_runtime": 34.6036, "eval_chemistry_samples_per_second": 14.449, "eval_chemistry_steps_per_second": 3.612, "step": 1600 }, { "epoch": 2.8474463113386, "eval_physics_entropy": 0.5397046103477477, "eval_physics_loss": 0.6487213969230652, "eval_physics_mean_token_accuracy": 0.8217604222297669, "eval_physics_num_tokens": 42834266.0, "eval_physics_runtime": 41.2474, "eval_physics_samples_per_second": 12.122, "eval_physics_steps_per_second": 3.03, "step": 1600 }, { "entropy": 0.5642041997984052, "epoch": 2.865249805274285, "grad_norm": 1.796875, "learning_rate": 1.864666666666667e-05, "loss": 0.5371, "mean_token_accuracy": 0.8362252287566662, "num_tokens": 43100513.0, "step": 1610 }, { "entropy": 0.553196731954813, "epoch": 2.88305329920997, "grad_norm": 1.65625, "learning_rate": 1.8624444444444446e-05, "loss": 0.5214, "mean_token_accuracy": 0.8403059497475625, "num_tokens": 43375632.0, "step": 1620 }, { "entropy": 0.5369223712012172, "epoch": 2.900856793145655, "grad_norm": 1.640625, "learning_rate": 1.8602222222222224e-05, "loss": 0.5167, "mean_token_accuracy": 0.8430781438946724, "num_tokens": 43644341.0, "step": 1630 }, { "entropy": 0.5357730442658066, "epoch": 2.9186602870813396, "grad_norm": 1.78125, "learning_rate": 1.858e-05, "loss": 0.5061, "mean_token_accuracy": 0.8430858284235001, "num_tokens": 43917580.0, "step": 1640 }, { "entropy": 0.5374868368729949, "epoch": 2.9364637810170247, "grad_norm": 1.7578125, "learning_rate": 1.855777777777778e-05, "loss": 0.5116, "mean_token_accuracy": 0.8447067823261023, "num_tokens": 44177325.0, "step": 1650 }, { "entropy": 0.5514863247051836, "epoch": 2.9542672749527092, "grad_norm": 1.6953125, "learning_rate": 1.8535555555555557e-05, "loss": 0.5211, "mean_token_accuracy": 0.8395764529705048, "num_tokens": 44442947.0, "step": 1660 }, { "entropy": 0.5531215896829963, "epoch": 2.9720707688883943, "grad_norm": 1.8515625, "learning_rate": 1.8513333333333335e-05, "loss": 0.5287, "mean_token_accuracy": 0.8382051270455122, "num_tokens": 44700601.0, "step": 1670 }, { "entropy": 0.5407275201752781, "epoch": 2.9898742628240793, "grad_norm": 1.59375, "learning_rate": 1.8491111111111112e-05, "loss": 0.51, "mean_token_accuracy": 0.844194658845663, "num_tokens": 44971479.0, "step": 1680 }, { "entropy": 0.49610343229386117, "epoch": 3.007121397574274, "grad_norm": 1.65625, "learning_rate": 1.846888888888889e-05, "loss": 0.4379, "mean_token_accuracy": 0.8622554202233591, "num_tokens": 45231796.0, "step": 1690 }, { "entropy": 0.3639660466462374, "epoch": 3.0249248915099587, "grad_norm": 2.171875, "learning_rate": 1.8446666666666667e-05, "loss": 0.3489, "mean_token_accuracy": 0.8842393532395363, "num_tokens": 45500317.0, "step": 1700 }, { "epoch": 3.0249248915099587, "eval_chemistry_entropy": 0.40514632713794707, "eval_chemistry_loss": 0.756369411945343, "eval_chemistry_mean_token_accuracy": 0.815176130771637, "eval_chemistry_num_tokens": 45500317.0, "eval_chemistry_runtime": 34.6159, "eval_chemistry_samples_per_second": 14.444, "eval_chemistry_steps_per_second": 3.611, "step": 1700 }, { "epoch": 3.0249248915099587, "eval_physics_entropy": 0.4021349341869354, "eval_physics_loss": 0.7458917498588562, "eval_physics_mean_token_accuracy": 0.8158441076278686, "eval_physics_num_tokens": 45500317.0, "eval_physics_runtime": 41.2473, "eval_physics_samples_per_second": 12.122, "eval_physics_steps_per_second": 3.031, "step": 1700 }, { "entropy": 0.35829943269491193, "epoch": 3.0427283854456437, "grad_norm": 2.125, "learning_rate": 1.842444444444445e-05, "loss": 0.3155, "mean_token_accuracy": 0.8941227950155735, "num_tokens": 45762680.0, "step": 1710 }, { "entropy": 0.3386535131372511, "epoch": 3.0605318793813288, "grad_norm": 2.296875, "learning_rate": 1.8402222222222223e-05, "loss": 0.3128, "mean_token_accuracy": 0.8955158289521933, "num_tokens": 46029923.0, "step": 1720 }, { "entropy": 0.3458150915801525, "epoch": 3.0783353733170133, "grad_norm": 2.078125, "learning_rate": 1.8380000000000004e-05, "loss": 0.3192, "mean_token_accuracy": 0.8931533485651016, "num_tokens": 46305156.0, "step": 1730 }, { "entropy": 0.3377646486274898, "epoch": 3.0961388672526984, "grad_norm": 2.1875, "learning_rate": 1.8357777777777778e-05, "loss": 0.3127, "mean_token_accuracy": 0.8952449705451727, "num_tokens": 46582874.0, "step": 1740 }, { "entropy": 0.35565020963549615, "epoch": 3.1139423611883834, "grad_norm": 2.59375, "learning_rate": 1.833555555555556e-05, "loss": 0.327, "mean_token_accuracy": 0.8901556842029095, "num_tokens": 46850175.0, "step": 1750 }, { "entropy": 0.3427689325064421, "epoch": 3.131745855124068, "grad_norm": 2.0625, "learning_rate": 1.8313333333333333e-05, "loss": 0.3146, "mean_token_accuracy": 0.8948938645422458, "num_tokens": 47125899.0, "step": 1760 }, { "entropy": 0.345877396594733, "epoch": 3.149549349059753, "grad_norm": 2.421875, "learning_rate": 1.8291111111111114e-05, "loss": 0.3207, "mean_token_accuracy": 0.8928927041590213, "num_tokens": 47390194.0, "step": 1770 }, { "entropy": 0.3544696198776364, "epoch": 3.167352842995438, "grad_norm": 2.203125, "learning_rate": 1.8268888888888888e-05, "loss": 0.324, "mean_token_accuracy": 0.8916583750396967, "num_tokens": 47654784.0, "step": 1780 }, { "entropy": 0.345364089962095, "epoch": 3.1851563369311227, "grad_norm": 2.25, "learning_rate": 1.824666666666667e-05, "loss": 0.32, "mean_token_accuracy": 0.8935606256127357, "num_tokens": 47919400.0, "step": 1790 }, { "entropy": 0.3512003905139863, "epoch": 3.2029598308668077, "grad_norm": 2.28125, "learning_rate": 1.8224444444444447e-05, "loss": 0.3262, "mean_token_accuracy": 0.8913596641272307, "num_tokens": 48187796.0, "step": 1800 }, { "epoch": 3.2029598308668077, "eval_chemistry_entropy": 0.39868619096279145, "eval_chemistry_loss": 0.7814938426017761, "eval_chemistry_mean_token_accuracy": 0.8135026731491088, "eval_chemistry_num_tokens": 48187796.0, "eval_chemistry_runtime": 34.5848, "eval_chemistry_samples_per_second": 14.457, "eval_chemistry_steps_per_second": 3.614, "step": 1800 }, { "epoch": 3.2029598308668077, "eval_physics_entropy": 0.38585546863079073, "eval_physics_loss": 0.785114049911499, "eval_physics_mean_token_accuracy": 0.8136380562782287, "eval_physics_num_tokens": 48187796.0, "eval_physics_runtime": 41.2383, "eval_physics_samples_per_second": 12.125, "eval_physics_steps_per_second": 3.031, "step": 1800 }, { "entropy": 0.34991011349484324, "epoch": 3.2207633248024923, "grad_norm": 2.21875, "learning_rate": 1.8202222222222225e-05, "loss": 0.3223, "mean_token_accuracy": 0.8920448690652847, "num_tokens": 48457428.0, "step": 1810 }, { "entropy": 0.3759195095859468, "epoch": 3.2385668187381773, "grad_norm": 2.359375, "learning_rate": 1.8180000000000002e-05, "loss": 0.341, "mean_token_accuracy": 0.8847057934850454, "num_tokens": 48716417.0, "step": 1820 }, { "entropy": 0.3503100727684796, "epoch": 3.2563703126738623, "grad_norm": 2.234375, "learning_rate": 1.815777777777778e-05, "loss": 0.3211, "mean_token_accuracy": 0.8925649128854275, "num_tokens": 48977306.0, "step": 1830 }, { "entropy": 0.34978028303012254, "epoch": 3.274173806609547, "grad_norm": 2.09375, "learning_rate": 1.8135555555555557e-05, "loss": 0.3206, "mean_token_accuracy": 0.8920259241014719, "num_tokens": 49249667.0, "step": 1840 }, { "entropy": 0.35568869626149535, "epoch": 3.291977300545232, "grad_norm": 2.109375, "learning_rate": 1.8113333333333335e-05, "loss": 0.3283, "mean_token_accuracy": 0.8901085384190083, "num_tokens": 49520900.0, "step": 1850 }, { "entropy": 0.35994688216596843, "epoch": 3.309780794480917, "grad_norm": 2.34375, "learning_rate": 1.8091111111111113e-05, "loss": 0.3309, "mean_token_accuracy": 0.8887050859630108, "num_tokens": 49786411.0, "step": 1860 }, { "entropy": 0.34956529131159186, "epoch": 3.3275842884166016, "grad_norm": 2.140625, "learning_rate": 1.806888888888889e-05, "loss": 0.3194, "mean_token_accuracy": 0.8924664918333292, "num_tokens": 50056207.0, "step": 1870 }, { "entropy": 0.35508423103019593, "epoch": 3.3453877823522866, "grad_norm": 2.078125, "learning_rate": 1.8046666666666668e-05, "loss": 0.3273, "mean_token_accuracy": 0.8904457587748766, "num_tokens": 50330876.0, "step": 1880 }, { "entropy": 0.35782372783869504, "epoch": 3.3631912762879717, "grad_norm": 2.4375, "learning_rate": 1.8024444444444445e-05, "loss": 0.3313, "mean_token_accuracy": 0.8893947552889585, "num_tokens": 50602004.0, "step": 1890 }, { "entropy": 0.36222805520519613, "epoch": 3.3809947702236562, "grad_norm": 2.203125, "learning_rate": 1.8002222222222223e-05, "loss": 0.3328, "mean_token_accuracy": 0.8882043663412332, "num_tokens": 50873444.0, "step": 1900 }, { "epoch": 3.3809947702236562, "eval_chemistry_entropy": 0.3968628585338593, "eval_chemistry_loss": 0.7823912501335144, "eval_chemistry_mean_token_accuracy": 0.8138084063529968, "eval_chemistry_num_tokens": 50873444.0, "eval_chemistry_runtime": 34.5801, "eval_chemistry_samples_per_second": 14.459, "eval_chemistry_steps_per_second": 3.615, "step": 1900 }, { "epoch": 3.3809947702236562, "eval_physics_entropy": 0.3825342948436737, "eval_physics_loss": 0.789954423904419, "eval_physics_mean_token_accuracy": 0.8131929535865784, "eval_physics_num_tokens": 50873444.0, "eval_physics_runtime": 41.2311, "eval_physics_samples_per_second": 12.127, "eval_physics_steps_per_second": 3.032, "step": 1900 }, { "entropy": 0.3620292558334768, "epoch": 3.3987982641593413, "grad_norm": 2.28125, "learning_rate": 1.798e-05, "loss": 0.3342, "mean_token_accuracy": 0.8878430653363466, "num_tokens": 51141077.0, "step": 1910 }, { "entropy": 0.3582372093573213, "epoch": 3.4166017580950263, "grad_norm": 2.375, "learning_rate": 1.7957777777777778e-05, "loss": 0.3297, "mean_token_accuracy": 0.8890867009758949, "num_tokens": 51406483.0, "step": 1920 }, { "entropy": 0.3618839686736465, "epoch": 3.434405252030711, "grad_norm": 2.4375, "learning_rate": 1.7935555555555556e-05, "loss": 0.3301, "mean_token_accuracy": 0.8890988543629647, "num_tokens": 51668458.0, "step": 1930 }, { "entropy": 0.36373381447046993, "epoch": 3.452208745966396, "grad_norm": 2.484375, "learning_rate": 1.7913333333333337e-05, "loss": 0.3361, "mean_token_accuracy": 0.8878721829503775, "num_tokens": 51928199.0, "step": 1940 }, { "entropy": 0.35968201477080586, "epoch": 3.470012239902081, "grad_norm": 2.21875, "learning_rate": 1.789111111111111e-05, "loss": 0.3309, "mean_token_accuracy": 0.8883917711675167, "num_tokens": 52199244.0, "step": 1950 }, { "entropy": 0.35602816781029106, "epoch": 3.4878157338377656, "grad_norm": 2.375, "learning_rate": 1.7868888888888892e-05, "loss": 0.3274, "mean_token_accuracy": 0.8896724134683609, "num_tokens": 52473476.0, "step": 1960 }, { "entropy": 0.3581710479222238, "epoch": 3.5056192277734506, "grad_norm": 1.9921875, "learning_rate": 1.7846666666666666e-05, "loss": 0.3307, "mean_token_accuracy": 0.8886904548853636, "num_tokens": 52743275.0, "step": 1970 }, { "entropy": 0.3580263523384929, "epoch": 3.523422721709135, "grad_norm": 2.171875, "learning_rate": 1.7824444444444447e-05, "loss": 0.328, "mean_token_accuracy": 0.8894063163548708, "num_tokens": 53014827.0, "step": 1980 }, { "entropy": 0.3583196923136711, "epoch": 3.54122621564482, "grad_norm": 2.46875, "learning_rate": 1.780222222222222e-05, "loss": 0.3328, "mean_token_accuracy": 0.8892181769013405, "num_tokens": 53285577.0, "step": 1990 }, { "entropy": 0.35019520940259097, "epoch": 3.5590297095805052, "grad_norm": 2.359375, "learning_rate": 1.7780000000000003e-05, "loss": 0.3253, "mean_token_accuracy": 0.8909101441502572, "num_tokens": 53553391.0, "step": 2000 }, { "epoch": 3.5590297095805052, "eval_chemistry_entropy": 0.40147681057453155, "eval_chemistry_loss": 0.778447151184082, "eval_chemistry_mean_token_accuracy": 0.8141673741340637, "eval_chemistry_num_tokens": 53553391.0, "eval_chemistry_runtime": 34.6216, "eval_chemistry_samples_per_second": 14.442, "eval_chemistry_steps_per_second": 3.61, "step": 2000 }, { "epoch": 3.5590297095805052, "eval_physics_entropy": 0.3869015793800354, "eval_physics_loss": 0.7880970239639282, "eval_physics_mean_token_accuracy": 0.813429744720459, "eval_physics_num_tokens": 53553391.0, "eval_physics_runtime": 40.9274, "eval_physics_samples_per_second": 12.217, "eval_physics_steps_per_second": 3.054, "step": 2000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 18, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.16464083921255e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }