{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 100, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.6430254079401493, "epoch": 0.008, "grad_norm": 4.53125, "learning_rate": 6.000000000000001e-07, "loss": 1.8435, "mean_token_accuracy": 0.6683766476809978, "num_tokens": 133939.0, "step": 10 }, { "entropy": 0.6358832061290741, "epoch": 0.016, "grad_norm": 4.3125, "learning_rate": 1.2666666666666669e-06, "loss": 1.8392, "mean_token_accuracy": 0.6651193369179964, "num_tokens": 273600.0, "step": 20 }, { "entropy": 0.6754261270165444, "epoch": 0.024, "grad_norm": 4.9375, "learning_rate": 1.9333333333333336e-06, "loss": 1.9101, "mean_token_accuracy": 0.6521823685616255, "num_tokens": 408034.0, "step": 30 }, { "entropy": 0.6558344909921289, "epoch": 0.032, "grad_norm": 4.28125, "learning_rate": 2.6e-06, "loss": 1.8472, "mean_token_accuracy": 0.6629668604582548, "num_tokens": 542836.0, "step": 40 }, { "entropy": 0.6494104014709592, "epoch": 0.04, "grad_norm": 3.8125, "learning_rate": 3.266666666666667e-06, "loss": 1.8027, "mean_token_accuracy": 0.6676773574203253, "num_tokens": 678419.0, "step": 50 }, { "entropy": 0.662838158942759, "epoch": 0.048, "grad_norm": 4.03125, "learning_rate": 3.9333333333333335e-06, "loss": 1.8213, "mean_token_accuracy": 0.6647731238976121, "num_tokens": 813510.0, "step": 60 }, { "entropy": 0.6506876813247799, "epoch": 0.056, "grad_norm": 3.71875, "learning_rate": 4.600000000000001e-06, "loss": 1.7955, "mean_token_accuracy": 0.6665123915299773, "num_tokens": 954682.0, "step": 70 }, { "entropy": 0.6252049600705505, "epoch": 0.064, "grad_norm": 3.65625, "learning_rate": 5.2666666666666665e-06, "loss": 1.7861, "mean_token_accuracy": 0.6728611093014478, "num_tokens": 1096071.0, "step": 80 }, { "entropy": 0.6236795110628008, "epoch": 0.072, "grad_norm": 3.171875, "learning_rate": 5.933333333333335e-06, "loss": 1.6458, "mean_token_accuracy": 0.6831523377448321, "num_tokens": 1239910.0, "step": 90 }, { "entropy": 0.6565364092588425, "epoch": 0.08, "grad_norm": 3.125, "learning_rate": 6.600000000000001e-06, "loss": 1.6948, "mean_token_accuracy": 0.6728412076830864, "num_tokens": 1376045.0, "step": 100 }, { "epoch": 0.08, "eval_biology_entropy": 1.821127786874771, "eval_biology_loss": 4.8697123527526855, "eval_biology_mean_token_accuracy": 0.3407035154700279, "eval_biology_num_tokens": 1376045.0, "eval_biology_runtime": 26.9612, "eval_biology_samples_per_second": 18.545, "eval_biology_steps_per_second": 9.273, "step": 100 }, { "epoch": 0.08, "eval_chemistry_entropy": 1.1214371861219405, "eval_chemistry_loss": 2.9583740234375, "eval_chemistry_mean_token_accuracy": 0.5230503298044205, "eval_chemistry_num_tokens": 1376045.0, "eval_chemistry_runtime": 34.4743, "eval_chemistry_samples_per_second": 14.504, "eval_chemistry_steps_per_second": 7.252, "step": 100 }, { "epoch": 0.08, "eval_math_entropy": 0.6455623766183853, "eval_math_loss": 1.6610009670257568, "eval_math_mean_token_accuracy": 0.6816041300296783, "eval_math_num_tokens": 1376045.0, "eval_math_runtime": 34.9863, "eval_math_samples_per_second": 14.291, "eval_math_steps_per_second": 7.146, "step": 100 }, { "epoch": 0.08, "eval_physics_entropy": 0.8894976880550385, "eval_physics_loss": 2.340852975845337, "eval_physics_mean_token_accuracy": 0.5970065190792083, "eval_physics_num_tokens": 1376045.0, "eval_physics_runtime": 40.5871, "eval_physics_samples_per_second": 12.319, "eval_physics_steps_per_second": 6.16, "step": 100 }, { "entropy": 0.6108825646340847, "epoch": 0.088, "grad_norm": 3.0625, "learning_rate": 7.266666666666668e-06, "loss": 1.534, "mean_token_accuracy": 0.6979989364743233, "num_tokens": 1518257.0, "step": 110 }, { "entropy": 0.6626681972295045, "epoch": 0.096, "grad_norm": 3.0, "learning_rate": 7.933333333333334e-06, "loss": 1.4858, "mean_token_accuracy": 0.6921830836683511, "num_tokens": 1656215.0, "step": 120 }, { "entropy": 0.6807309044525027, "epoch": 0.104, "grad_norm": 2.4375, "learning_rate": 8.6e-06, "loss": 1.3897, "mean_token_accuracy": 0.7023331839591265, "num_tokens": 1791507.0, "step": 130 }, { "entropy": 0.7359636599197984, "epoch": 0.112, "grad_norm": 1.9765625, "learning_rate": 9.266666666666667e-06, "loss": 1.3356, "mean_token_accuracy": 0.7036307867616415, "num_tokens": 1928153.0, "step": 140 }, { "entropy": 0.8070579996332526, "epoch": 0.12, "grad_norm": 1.7265625, "learning_rate": 9.933333333333334e-06, "loss": 1.2682, "mean_token_accuracy": 0.712115453183651, "num_tokens": 2071008.0, "step": 150 }, { "entropy": 0.874446533434093, "epoch": 0.128, "grad_norm": 1.2734375, "learning_rate": 1.0600000000000002e-05, "loss": 1.1443, "mean_token_accuracy": 0.7246435590088367, "num_tokens": 2216042.0, "step": 160 }, { "entropy": 0.9639016635715961, "epoch": 0.136, "grad_norm": 1.125, "learning_rate": 1.1266666666666668e-05, "loss": 1.1005, "mean_token_accuracy": 0.730505557358265, "num_tokens": 2350589.0, "step": 170 }, { "entropy": 1.0533128878101707, "epoch": 0.144, "grad_norm": 0.90234375, "learning_rate": 1.1933333333333335e-05, "loss": 1.0494, "mean_token_accuracy": 0.7364416709169745, "num_tokens": 2483116.0, "step": 180 }, { "entropy": 1.0085792429745197, "epoch": 0.152, "grad_norm": 0.96484375, "learning_rate": 1.2600000000000001e-05, "loss": 0.9875, "mean_token_accuracy": 0.749194710329175, "num_tokens": 2617958.0, "step": 190 }, { "entropy": 1.000020370259881, "epoch": 0.16, "grad_norm": 0.90234375, "learning_rate": 1.3266666666666668e-05, "loss": 0.9952, "mean_token_accuracy": 0.7486346989870072, "num_tokens": 2752258.0, "step": 200 }, { "epoch": 0.16, "eval_biology_entropy": 3.2312401638031005, "eval_biology_loss": 3.4984476566314697, "eval_biology_mean_token_accuracy": 0.4106140378713608, "eval_biology_num_tokens": 2752258.0, "eval_biology_runtime": 26.9494, "eval_biology_samples_per_second": 18.553, "eval_biology_steps_per_second": 9.277, "step": 200 }, { "epoch": 0.16, "eval_chemistry_entropy": 1.9263038992881776, "eval_chemistry_loss": 1.9608160257339478, "eval_chemistry_mean_token_accuracy": 0.5982974277734756, "eval_chemistry_num_tokens": 2752258.0, "eval_chemistry_runtime": 34.4173, "eval_chemistry_samples_per_second": 14.528, "eval_chemistry_steps_per_second": 7.264, "step": 200 }, { "epoch": 0.16, "eval_math_entropy": 1.0257348996400832, "eval_math_loss": 1.029754638671875, "eval_math_mean_token_accuracy": 0.7436505243778229, "eval_math_num_tokens": 2752258.0, "eval_math_runtime": 34.9579, "eval_math_samples_per_second": 14.303, "eval_math_steps_per_second": 7.151, "step": 200 }, { "epoch": 0.16, "eval_physics_entropy": 1.5005105144977569, "eval_physics_loss": 1.5203113555908203, "eval_physics_mean_token_accuracy": 0.6669992462396622, "eval_physics_num_tokens": 2752258.0, "eval_physics_runtime": 40.5453, "eval_physics_samples_per_second": 12.332, "eval_physics_steps_per_second": 6.166, "step": 200 }, { "entropy": 1.0170325668528677, "epoch": 0.168, "grad_norm": 0.80859375, "learning_rate": 1.3933333333333334e-05, "loss": 1.0032, "mean_token_accuracy": 0.7469347029924392, "num_tokens": 2889249.0, "step": 210 }, { "entropy": 0.9709451261907815, "epoch": 0.176, "grad_norm": 0.7734375, "learning_rate": 1.46e-05, "loss": 0.9299, "mean_token_accuracy": 0.7556545663625002, "num_tokens": 3030305.0, "step": 220 }, { "entropy": 1.0022083457559348, "epoch": 0.184, "grad_norm": 0.90234375, "learning_rate": 1.5266666666666667e-05, "loss": 0.9815, "mean_token_accuracy": 0.7520246226340532, "num_tokens": 3167096.0, "step": 230 }, { "entropy": 0.9930278725922108, "epoch": 0.192, "grad_norm": 0.85546875, "learning_rate": 1.5933333333333336e-05, "loss": 0.9637, "mean_token_accuracy": 0.7547307416796685, "num_tokens": 3306293.0, "step": 240 }, { "entropy": 0.9768950846046209, "epoch": 0.2, "grad_norm": 0.7734375, "learning_rate": 1.66e-05, "loss": 0.9373, "mean_token_accuracy": 0.7594478782266378, "num_tokens": 3441895.0, "step": 250 }, { "entropy": 0.9833385257050395, "epoch": 0.208, "grad_norm": 0.8359375, "learning_rate": 1.726666666666667e-05, "loss": 0.9384, "mean_token_accuracy": 0.7573206882923842, "num_tokens": 3578349.0, "step": 260 }, { "entropy": 0.9361576547846198, "epoch": 0.216, "grad_norm": 0.74609375, "learning_rate": 1.7933333333333333e-05, "loss": 0.9004, "mean_token_accuracy": 0.7651511203497648, "num_tokens": 3718484.0, "step": 270 }, { "entropy": 0.9637048680335283, "epoch": 0.224, "grad_norm": 0.9140625, "learning_rate": 1.86e-05, "loss": 0.9349, "mean_token_accuracy": 0.7605184197425843, "num_tokens": 3851369.0, "step": 280 }, { "entropy": 0.9367688067257405, "epoch": 0.232, "grad_norm": 0.74609375, "learning_rate": 1.926666666666667e-05, "loss": 0.8962, "mean_token_accuracy": 0.7659803122282028, "num_tokens": 3992977.0, "step": 290 }, { "entropy": 0.9229113388806581, "epoch": 0.24, "grad_norm": 0.7890625, "learning_rate": 1.9933333333333334e-05, "loss": 0.9024, "mean_token_accuracy": 0.77176120467484, "num_tokens": 4133723.0, "step": 300 }, { "epoch": 0.24, "eval_biology_entropy": 3.018038348674774, "eval_biology_loss": 3.2969858646392822, "eval_biology_mean_token_accuracy": 0.43762873542308806, "eval_biology_num_tokens": 4133723.0, "eval_biology_runtime": 27.1593, "eval_biology_samples_per_second": 18.41, "eval_biology_steps_per_second": 9.205, "step": 300 }, { "epoch": 0.24, "eval_chemistry_entropy": 1.778470279932022, "eval_chemistry_loss": 1.8064701557159424, "eval_chemistry_mean_token_accuracy": 0.6220316555500031, "eval_chemistry_num_tokens": 4133723.0, "eval_chemistry_runtime": 34.6422, "eval_chemistry_samples_per_second": 14.433, "eval_chemistry_steps_per_second": 7.217, "step": 300 }, { "epoch": 0.24, "eval_math_entropy": 0.9541566734313964, "eval_math_loss": 0.9326547980308533, "eval_math_mean_token_accuracy": 0.7629979259967804, "eval_math_num_tokens": 4133723.0, "eval_math_runtime": 35.1904, "eval_math_samples_per_second": 14.208, "eval_math_steps_per_second": 7.104, "step": 300 }, { "epoch": 0.24, "eval_physics_entropy": 1.387200428724289, "eval_physics_loss": 1.3932181596755981, "eval_physics_mean_token_accuracy": 0.6886798795461655, "eval_physics_num_tokens": 4133723.0, "eval_physics_runtime": 40.7999, "eval_physics_samples_per_second": 12.255, "eval_physics_steps_per_second": 6.127, "step": 300 }, { "entropy": 0.9078424336388707, "epoch": 0.248, "grad_norm": 0.75, "learning_rate": 1.9933333333333334e-05, "loss": 0.8542, "mean_token_accuracy": 0.7745326109230518, "num_tokens": 4273949.0, "step": 310 }, { "entropy": 0.9262684918940067, "epoch": 0.256, "grad_norm": 0.84765625, "learning_rate": 1.985925925925926e-05, "loss": 0.8761, "mean_token_accuracy": 0.7701829686760903, "num_tokens": 4414102.0, "step": 320 }, { "entropy": 0.9303955681622028, "epoch": 0.264, "grad_norm": 0.78515625, "learning_rate": 1.9785185185185187e-05, "loss": 0.8765, "mean_token_accuracy": 0.7714239094406367, "num_tokens": 4548402.0, "step": 330 }, { "entropy": 0.9207985673099757, "epoch": 0.272, "grad_norm": 0.7109375, "learning_rate": 1.971111111111111e-05, "loss": 0.8728, "mean_token_accuracy": 0.7721738632768392, "num_tokens": 4685739.0, "step": 340 }, { "entropy": 0.9172879874706268, "epoch": 0.28, "grad_norm": 0.80078125, "learning_rate": 1.963703703703704e-05, "loss": 0.869, "mean_token_accuracy": 0.7719811659306288, "num_tokens": 4827264.0, "step": 350 }, { "entropy": 0.8906141923740506, "epoch": 0.288, "grad_norm": 0.76171875, "learning_rate": 1.9562962962962964e-05, "loss": 0.8676, "mean_token_accuracy": 0.779513431340456, "num_tokens": 4963900.0, "step": 360 }, { "entropy": 0.9126916229724884, "epoch": 0.296, "grad_norm": 0.671875, "learning_rate": 1.948888888888889e-05, "loss": 0.8727, "mean_token_accuracy": 0.7722058985382318, "num_tokens": 5106308.0, "step": 370 }, { "entropy": 0.9109785668551922, "epoch": 0.304, "grad_norm": 0.76171875, "learning_rate": 1.9414814814814817e-05, "loss": 0.8562, "mean_token_accuracy": 0.7752636637538671, "num_tokens": 5246290.0, "step": 380 }, { "entropy": 0.9183575309813022, "epoch": 0.312, "grad_norm": 0.71875, "learning_rate": 1.9340740740740743e-05, "loss": 0.8621, "mean_token_accuracy": 0.7713671930134296, "num_tokens": 5389824.0, "step": 390 }, { "entropy": 0.8591190708801151, "epoch": 0.32, "grad_norm": 0.82421875, "learning_rate": 1.926666666666667e-05, "loss": 0.8065, "mean_token_accuracy": 0.7856244947761297, "num_tokens": 5531902.0, "step": 400 }, { "epoch": 0.32, "eval_biology_entropy": 2.974372909069061, "eval_biology_loss": 3.2137467861175537, "eval_biology_mean_token_accuracy": 0.44984077060222627, "eval_biology_num_tokens": 5531902.0, "eval_biology_runtime": 26.9787, "eval_biology_samples_per_second": 18.533, "eval_biology_steps_per_second": 9.267, "step": 400 }, { "epoch": 0.32, "eval_chemistry_entropy": 1.7318408770561218, "eval_chemistry_loss": 1.746923804283142, "eval_chemistry_mean_token_accuracy": 0.632696654677391, "eval_chemistry_num_tokens": 5531902.0, "eval_chemistry_runtime": 34.5046, "eval_chemistry_samples_per_second": 14.491, "eval_chemistry_steps_per_second": 7.245, "step": 400 }, { "epoch": 0.32, "eval_math_entropy": 0.9170646283626557, "eval_math_loss": 0.8942311406135559, "eval_math_mean_token_accuracy": 0.7709064717292786, "eval_math_num_tokens": 5531902.0, "eval_math_runtime": 35.0734, "eval_math_samples_per_second": 14.256, "eval_math_steps_per_second": 7.128, "step": 400 }, { "epoch": 0.32, "eval_physics_entropy": 1.349606383562088, "eval_physics_loss": 1.343300223350525, "eval_physics_mean_token_accuracy": 0.6978710038661957, "eval_physics_num_tokens": 5531902.0, "eval_physics_runtime": 40.7692, "eval_physics_samples_per_second": 12.264, "eval_physics_steps_per_second": 6.132, "step": 400 }, { "entropy": 0.9086290217936039, "epoch": 0.328, "grad_norm": 0.796875, "learning_rate": 1.9192592592592593e-05, "loss": 0.847, "mean_token_accuracy": 0.7745547294616699, "num_tokens": 5669047.0, "step": 410 }, { "entropy": 0.9156009530648589, "epoch": 0.336, "grad_norm": 0.7578125, "learning_rate": 1.911851851851852e-05, "loss": 0.8769, "mean_token_accuracy": 0.7692474499344826, "num_tokens": 5806889.0, "step": 420 }, { "entropy": 0.8934111742302775, "epoch": 0.344, "grad_norm": 0.67578125, "learning_rate": 1.9044444444444446e-05, "loss": 0.8462, "mean_token_accuracy": 0.7796071924269199, "num_tokens": 5947019.0, "step": 430 }, { "entropy": 0.8856402298435568, "epoch": 0.352, "grad_norm": 0.7578125, "learning_rate": 1.8970370370370372e-05, "loss": 0.8362, "mean_token_accuracy": 0.7777645040303469, "num_tokens": 6083443.0, "step": 440 }, { "entropy": 0.8964365001767873, "epoch": 0.36, "grad_norm": 0.7109375, "learning_rate": 1.8896296296296295e-05, "loss": 0.8406, "mean_token_accuracy": 0.776063571870327, "num_tokens": 6215195.0, "step": 450 }, { "entropy": 0.8810398764908314, "epoch": 0.368, "grad_norm": 0.69921875, "learning_rate": 1.8822222222222225e-05, "loss": 0.8247, "mean_token_accuracy": 0.7789717759937048, "num_tokens": 6357305.0, "step": 460 }, { "entropy": 0.8900227572768926, "epoch": 0.376, "grad_norm": 0.6875, "learning_rate": 1.874814814814815e-05, "loss": 0.8424, "mean_token_accuracy": 0.7792754102498293, "num_tokens": 6498815.0, "step": 470 }, { "entropy": 0.8727197473868727, "epoch": 0.384, "grad_norm": 0.68359375, "learning_rate": 1.8674074074074075e-05, "loss": 0.8209, "mean_token_accuracy": 0.7820516306906938, "num_tokens": 6635429.0, "step": 480 }, { "entropy": 0.8815433232113719, "epoch": 0.392, "grad_norm": 0.73828125, "learning_rate": 1.86e-05, "loss": 0.8274, "mean_token_accuracy": 0.7792554065585137, "num_tokens": 6774408.0, "step": 490 }, { "entropy": 0.9402453400194645, "epoch": 0.4, "grad_norm": 0.78515625, "learning_rate": 1.8525925925925928e-05, "loss": 0.9268, "mean_token_accuracy": 0.7666260946542025, "num_tokens": 6914489.0, "step": 500 }, { "epoch": 0.4, "eval_biology_entropy": 2.984537806510925, "eval_biology_loss": 3.180291175842285, "eval_biology_mean_token_accuracy": 0.4551255764961243, "eval_biology_num_tokens": 6914489.0, "eval_biology_runtime": 26.9382, "eval_biology_samples_per_second": 18.561, "eval_biology_steps_per_second": 9.281, "step": 500 }, { "epoch": 0.4, "eval_chemistry_entropy": 1.735370470046997, "eval_chemistry_loss": 1.7186591625213623, "eval_chemistry_mean_token_accuracy": 0.6382190730571747, "eval_chemistry_num_tokens": 6914489.0, "eval_chemistry_runtime": 34.4276, "eval_chemistry_samples_per_second": 14.523, "eval_chemistry_steps_per_second": 7.262, "step": 500 }, { "epoch": 0.4, "eval_math_entropy": 0.9168894953727722, "eval_math_loss": 0.8749539852142334, "eval_math_mean_token_accuracy": 0.7746825203895569, "eval_math_num_tokens": 6914489.0, "eval_math_runtime": 34.9026, "eval_math_samples_per_second": 14.326, "eval_math_steps_per_second": 7.163, "step": 500 }, { "epoch": 0.4, "eval_physics_entropy": 1.34889164853096, "eval_physics_loss": 1.3155016899108887, "eval_physics_mean_token_accuracy": 0.7029694278240204, "eval_physics_num_tokens": 6914489.0, "eval_physics_runtime": 40.5298, "eval_physics_samples_per_second": 12.337, "eval_physics_steps_per_second": 6.168, "step": 500 }, { "entropy": 0.8826847270131111, "epoch": 0.408, "grad_norm": 0.6328125, "learning_rate": 1.8451851851851855e-05, "loss": 0.8191, "mean_token_accuracy": 0.7815356869250536, "num_tokens": 7053065.0, "step": 510 }, { "entropy": 0.888327713869512, "epoch": 0.416, "grad_norm": 0.67578125, "learning_rate": 1.8377777777777778e-05, "loss": 0.8458, "mean_token_accuracy": 0.7794966764748097, "num_tokens": 7187783.0, "step": 520 }, { "entropy": 0.8952456833794713, "epoch": 0.424, "grad_norm": 0.66015625, "learning_rate": 1.8303703703703704e-05, "loss": 0.8535, "mean_token_accuracy": 0.7754339158535004, "num_tokens": 7320375.0, "step": 530 }, { "entropy": 0.8508788656443358, "epoch": 0.432, "grad_norm": 0.65625, "learning_rate": 1.822962962962963e-05, "loss": 0.7973, "mean_token_accuracy": 0.7881547212600708, "num_tokens": 7454401.0, "step": 540 }, { "entropy": 0.8842971453443169, "epoch": 0.44, "grad_norm": 0.71484375, "learning_rate": 1.8155555555555557e-05, "loss": 0.8373, "mean_token_accuracy": 0.7774394288659096, "num_tokens": 7593764.0, "step": 550 }, { "entropy": 0.8545080715790391, "epoch": 0.448, "grad_norm": 0.75, "learning_rate": 1.8081481481481484e-05, "loss": 0.8067, "mean_token_accuracy": 0.7855051398277283, "num_tokens": 7732710.0, "step": 560 }, { "entropy": 0.8548783052712678, "epoch": 0.456, "grad_norm": 0.70703125, "learning_rate": 1.800740740740741e-05, "loss": 0.8214, "mean_token_accuracy": 0.7835484184324741, "num_tokens": 7873721.0, "step": 570 }, { "entropy": 0.8870714059099555, "epoch": 0.464, "grad_norm": 0.5859375, "learning_rate": 1.7933333333333333e-05, "loss": 0.8303, "mean_token_accuracy": 0.7804129023104907, "num_tokens": 8012478.0, "step": 580 }, { "entropy": 0.8966930942609906, "epoch": 0.472, "grad_norm": 0.64453125, "learning_rate": 1.785925925925926e-05, "loss": 0.8442, "mean_token_accuracy": 0.7768670186400414, "num_tokens": 8153429.0, "step": 590 }, { "entropy": 0.8990984376519918, "epoch": 0.48, "grad_norm": 0.7265625, "learning_rate": 1.7785185185185186e-05, "loss": 0.8363, "mean_token_accuracy": 0.7791902955621481, "num_tokens": 8288086.0, "step": 600 }, { "epoch": 0.48, "eval_biology_entropy": 3.026648554325104, "eval_biology_loss": 3.1774065494537354, "eval_biology_mean_token_accuracy": 0.45753026235103605, "eval_biology_num_tokens": 8288086.0, "eval_biology_runtime": 26.9914, "eval_biology_samples_per_second": 18.524, "eval_biology_steps_per_second": 9.262, "step": 600 }, { "epoch": 0.48, "eval_chemistry_entropy": 1.7211642112731933, "eval_chemistry_loss": 1.7037526369094849, "eval_chemistry_mean_token_accuracy": 0.6409431176185608, "eval_chemistry_num_tokens": 8288086.0, "eval_chemistry_runtime": 34.4519, "eval_chemistry_samples_per_second": 14.513, "eval_chemistry_steps_per_second": 7.256, "step": 600 }, { "epoch": 0.48, "eval_math_entropy": 0.8927879617214203, "eval_math_loss": 0.863281786441803, "eval_math_mean_token_accuracy": 0.7770743372440339, "eval_math_num_tokens": 8288086.0, "eval_math_runtime": 34.9704, "eval_math_samples_per_second": 14.298, "eval_math_steps_per_second": 7.149, "step": 600 }, { "epoch": 0.48, "eval_physics_entropy": 1.3300827517509461, "eval_physics_loss": 1.3015602827072144, "eval_physics_mean_token_accuracy": 0.7059740360975265, "eval_physics_num_tokens": 8288086.0, "eval_physics_runtime": 40.5426, "eval_physics_samples_per_second": 12.333, "eval_physics_steps_per_second": 6.166, "step": 600 }, { "entropy": 0.8605974804610014, "epoch": 0.488, "grad_norm": 0.65234375, "learning_rate": 1.7711111111111113e-05, "loss": 0.8123, "mean_token_accuracy": 0.7824984133243561, "num_tokens": 8424958.0, "step": 610 }, { "entropy": 0.8556231519207358, "epoch": 0.496, "grad_norm": 0.73828125, "learning_rate": 1.763703703703704e-05, "loss": 0.8073, "mean_token_accuracy": 0.7834980070590973, "num_tokens": 8562656.0, "step": 620 }, { "entropy": 0.8707238981500268, "epoch": 0.504, "grad_norm": 0.63671875, "learning_rate": 1.7562962962962962e-05, "loss": 0.8323, "mean_token_accuracy": 0.7818343795835971, "num_tokens": 8706097.0, "step": 630 }, { "entropy": 0.8535389142110944, "epoch": 0.512, "grad_norm": 0.6953125, "learning_rate": 1.7488888888888892e-05, "loss": 0.7829, "mean_token_accuracy": 0.7894029818475247, "num_tokens": 8844762.0, "step": 640 }, { "entropy": 0.8502879558131099, "epoch": 0.52, "grad_norm": 0.73828125, "learning_rate": 1.7414814814814815e-05, "loss": 0.8016, "mean_token_accuracy": 0.789192034304142, "num_tokens": 8983268.0, "step": 650 }, { "entropy": 0.841147867590189, "epoch": 0.528, "grad_norm": 0.76171875, "learning_rate": 1.7340740740740742e-05, "loss": 0.7936, "mean_token_accuracy": 0.7880374405533075, "num_tokens": 9132725.0, "step": 660 }, { "entropy": 0.8636940447613597, "epoch": 0.536, "grad_norm": 0.796875, "learning_rate": 1.726666666666667e-05, "loss": 0.841, "mean_token_accuracy": 0.7832605082541704, "num_tokens": 9271490.0, "step": 670 }, { "entropy": 0.8671527728438377, "epoch": 0.544, "grad_norm": 0.78125, "learning_rate": 1.7192592592592595e-05, "loss": 0.8081, "mean_token_accuracy": 0.7864414434880018, "num_tokens": 9412327.0, "step": 680 }, { "entropy": 0.9130071861669421, "epoch": 0.552, "grad_norm": 0.85546875, "learning_rate": 1.711851851851852e-05, "loss": 0.8653, "mean_token_accuracy": 0.7723087813705206, "num_tokens": 9541196.0, "step": 690 }, { "entropy": 0.8585997836664319, "epoch": 0.56, "grad_norm": 0.7265625, "learning_rate": 1.7044444444444445e-05, "loss": 0.794, "mean_token_accuracy": 0.7848813243210315, "num_tokens": 9676595.0, "step": 700 }, { "epoch": 0.56, "eval_biology_entropy": 2.9588126630783083, "eval_biology_loss": 3.134213447570801, "eval_biology_mean_token_accuracy": 0.46100987315177916, "eval_biology_num_tokens": 9676595.0, "eval_biology_runtime": 27.0044, "eval_biology_samples_per_second": 18.516, "eval_biology_steps_per_second": 9.258, "step": 700 }, { "epoch": 0.56, "eval_chemistry_entropy": 1.6786610894203187, "eval_chemistry_loss": 1.6839625835418701, "eval_chemistry_mean_token_accuracy": 0.6436767561435699, "eval_chemistry_num_tokens": 9676595.0, "eval_chemistry_runtime": 34.4136, "eval_chemistry_samples_per_second": 14.529, "eval_chemistry_steps_per_second": 7.265, "step": 700 }, { "epoch": 0.56, "eval_math_entropy": 0.8769606277942658, "eval_math_loss": 0.8541995882987976, "eval_math_mean_token_accuracy": 0.7790154383182526, "eval_math_num_tokens": 9676595.0, "eval_math_runtime": 34.9482, "eval_math_samples_per_second": 14.307, "eval_math_steps_per_second": 7.153, "step": 700 }, { "epoch": 0.56, "eval_physics_entropy": 1.302037154197693, "eval_physics_loss": 1.289175271987915, "eval_physics_mean_token_accuracy": 0.7078491554260254, "eval_physics_num_tokens": 9676595.0, "eval_physics_runtime": 40.5408, "eval_physics_samples_per_second": 12.333, "eval_physics_steps_per_second": 6.167, "step": 700 }, { "entropy": 0.8488044740632177, "epoch": 0.568, "grad_norm": 0.65625, "learning_rate": 1.697037037037037e-05, "loss": 0.8027, "mean_token_accuracy": 0.7859589237719774, "num_tokens": 9812856.0, "step": 710 }, { "entropy": 0.8703772563487291, "epoch": 0.576, "grad_norm": 0.76171875, "learning_rate": 1.6896296296296298e-05, "loss": 0.8196, "mean_token_accuracy": 0.7833330363035202, "num_tokens": 9953227.0, "step": 720 }, { "entropy": 0.8656591676175595, "epoch": 0.584, "grad_norm": 0.7109375, "learning_rate": 1.6822222222222224e-05, "loss": 0.8193, "mean_token_accuracy": 0.7825856961309909, "num_tokens": 10089276.0, "step": 730 }, { "entropy": 0.8539155850186944, "epoch": 0.592, "grad_norm": 0.62109375, "learning_rate": 1.6748148148148147e-05, "loss": 0.8051, "mean_token_accuracy": 0.7846140246838331, "num_tokens": 10234220.0, "step": 740 }, { "entropy": 0.8651883415877819, "epoch": 0.6, "grad_norm": 0.62109375, "learning_rate": 1.6674074074074077e-05, "loss": 0.8268, "mean_token_accuracy": 0.7814744919538498, "num_tokens": 10370667.0, "step": 750 }, { "entropy": 0.863803020492196, "epoch": 0.608, "grad_norm": 0.76171875, "learning_rate": 1.66e-05, "loss": 0.8229, "mean_token_accuracy": 0.783318106085062, "num_tokens": 10515760.0, "step": 760 }, { "entropy": 0.8657625541090965, "epoch": 0.616, "grad_norm": 0.609375, "learning_rate": 1.6525925925925927e-05, "loss": 0.8201, "mean_token_accuracy": 0.781252521276474, "num_tokens": 10660578.0, "step": 770 }, { "entropy": 0.8430647728964686, "epoch": 0.624, "grad_norm": 0.73828125, "learning_rate": 1.6451851851851853e-05, "loss": 0.786, "mean_token_accuracy": 0.7895114913582801, "num_tokens": 10797566.0, "step": 780 }, { "entropy": 0.8240785686299205, "epoch": 0.632, "grad_norm": 0.63671875, "learning_rate": 1.637777777777778e-05, "loss": 0.7805, "mean_token_accuracy": 0.7922219831496478, "num_tokens": 10938456.0, "step": 790 }, { "entropy": 0.8485349236056209, "epoch": 0.64, "grad_norm": 0.65625, "learning_rate": 1.6303703703703706e-05, "loss": 0.7989, "mean_token_accuracy": 0.7866497144103051, "num_tokens": 11079348.0, "step": 800 }, { "epoch": 0.64, "eval_biology_entropy": 2.932964876174927, "eval_biology_loss": 3.1179301738739014, "eval_biology_mean_token_accuracy": 0.46290264546871185, "eval_biology_num_tokens": 11079348.0, "eval_biology_runtime": 27.0006, "eval_biology_samples_per_second": 18.518, "eval_biology_steps_per_second": 9.259, "step": 800 }, { "epoch": 0.64, "eval_chemistry_entropy": 1.6625523691177369, "eval_chemistry_loss": 1.6735429763793945, "eval_chemistry_mean_token_accuracy": 0.645549393773079, "eval_chemistry_num_tokens": 11079348.0, "eval_chemistry_runtime": 34.504, "eval_chemistry_samples_per_second": 14.491, "eval_chemistry_steps_per_second": 7.246, "step": 800 }, { "epoch": 0.64, "eval_math_entropy": 0.8715307763814926, "eval_math_loss": 0.8475318551063538, "eval_math_mean_token_accuracy": 0.780082686662674, "eval_math_num_tokens": 11079348.0, "eval_math_runtime": 34.9793, "eval_math_samples_per_second": 14.294, "eval_math_steps_per_second": 7.147, "step": 800 }, { "epoch": 0.64, "eval_physics_entropy": 1.292870044708252, "eval_physics_loss": 1.2783890962600708, "eval_physics_mean_token_accuracy": 0.7100210604667664, "eval_physics_num_tokens": 11079348.0, "eval_physics_runtime": 40.5422, "eval_physics_samples_per_second": 12.333, "eval_physics_steps_per_second": 6.166, "step": 800 }, { "entropy": 0.8196414019912481, "epoch": 0.648, "grad_norm": 0.6484375, "learning_rate": 1.622962962962963e-05, "loss": 0.7785, "mean_token_accuracy": 0.7903594326227903, "num_tokens": 11221757.0, "step": 810 }, { "entropy": 0.8491896335035563, "epoch": 0.656, "grad_norm": 0.66796875, "learning_rate": 1.6155555555555556e-05, "loss": 0.8188, "mean_token_accuracy": 0.7861174423247576, "num_tokens": 11360239.0, "step": 820 }, { "entropy": 0.8492498191073536, "epoch": 0.664, "grad_norm": 0.83984375, "learning_rate": 1.6081481481481482e-05, "loss": 0.7993, "mean_token_accuracy": 0.7853016339242458, "num_tokens": 11498677.0, "step": 830 }, { "entropy": 0.8525434667244554, "epoch": 0.672, "grad_norm": 0.69140625, "learning_rate": 1.600740740740741e-05, "loss": 0.8167, "mean_token_accuracy": 0.7853171724826098, "num_tokens": 11644674.0, "step": 840 }, { "entropy": 0.8380661699920893, "epoch": 0.68, "grad_norm": 0.73046875, "learning_rate": 1.5933333333333336e-05, "loss": 0.7928, "mean_token_accuracy": 0.788429806381464, "num_tokens": 11783080.0, "step": 850 }, { "entropy": 0.8381451951339841, "epoch": 0.688, "grad_norm": 0.62109375, "learning_rate": 1.5859259259259262e-05, "loss": 0.7921, "mean_token_accuracy": 0.7877590708434582, "num_tokens": 11917713.0, "step": 860 }, { "entropy": 0.8567008957266807, "epoch": 0.696, "grad_norm": 0.6640625, "learning_rate": 1.5785185185185185e-05, "loss": 0.7966, "mean_token_accuracy": 0.7848114792257548, "num_tokens": 12058193.0, "step": 870 }, { "entropy": 0.8353428209200502, "epoch": 0.704, "grad_norm": 0.65234375, "learning_rate": 1.571111111111111e-05, "loss": 0.7868, "mean_token_accuracy": 0.7881882593035698, "num_tokens": 12199067.0, "step": 880 }, { "entropy": 0.8710033852607012, "epoch": 0.712, "grad_norm": 0.671875, "learning_rate": 1.5637037037037038e-05, "loss": 0.8088, "mean_token_accuracy": 0.7816738404333592, "num_tokens": 12335740.0, "step": 890 }, { "entropy": 0.8482802568003536, "epoch": 0.72, "grad_norm": 0.8671875, "learning_rate": 1.5562962962962965e-05, "loss": 0.8002, "mean_token_accuracy": 0.7863489333540201, "num_tokens": 12469017.0, "step": 900 }, { "epoch": 0.72, "eval_biology_entropy": 2.9198746318817137, "eval_biology_loss": 3.095550775527954, "eval_biology_mean_token_accuracy": 0.4650931693315506, "eval_biology_num_tokens": 12469017.0, "eval_biology_runtime": 26.9459, "eval_biology_samples_per_second": 18.556, "eval_biology_steps_per_second": 9.278, "step": 900 }, { "epoch": 0.72, "eval_chemistry_entropy": 1.654184314250946, "eval_chemistry_loss": 1.6625065803527832, "eval_chemistry_mean_token_accuracy": 0.6471669510602951, "eval_chemistry_num_tokens": 12469017.0, "eval_chemistry_runtime": 34.4235, "eval_chemistry_samples_per_second": 14.525, "eval_chemistry_steps_per_second": 7.262, "step": 900 }, { "epoch": 0.72, "eval_math_entropy": 0.8690718128681183, "eval_math_loss": 0.8421436548233032, "eval_math_mean_token_accuracy": 0.7811065847873688, "eval_math_num_tokens": 12469017.0, "eval_math_runtime": 34.9224, "eval_math_samples_per_second": 14.317, "eval_math_steps_per_second": 7.159, "step": 900 }, { "epoch": 0.72, "eval_physics_entropy": 1.2865873792171478, "eval_physics_loss": 1.2697285413742065, "eval_physics_mean_token_accuracy": 0.7113078062534333, "eval_physics_num_tokens": 12469017.0, "eval_physics_runtime": 40.5051, "eval_physics_samples_per_second": 12.344, "eval_physics_steps_per_second": 6.172, "step": 900 }, { "entropy": 0.8559863271191717, "epoch": 0.728, "grad_norm": 0.859375, "learning_rate": 1.548888888888889e-05, "loss": 0.8087, "mean_token_accuracy": 0.7872065585106611, "num_tokens": 12604524.0, "step": 910 }, { "entropy": 0.8351603880524635, "epoch": 0.736, "grad_norm": 0.69921875, "learning_rate": 1.5414814814814814e-05, "loss": 0.7815, "mean_token_accuracy": 0.7907860331237316, "num_tokens": 12746145.0, "step": 920 }, { "entropy": 0.8385633688420058, "epoch": 0.744, "grad_norm": 0.68359375, "learning_rate": 1.5340740740740744e-05, "loss": 0.7849, "mean_token_accuracy": 0.7884344782680273, "num_tokens": 12882140.0, "step": 930 }, { "entropy": 0.8265942361205816, "epoch": 0.752, "grad_norm": 0.671875, "learning_rate": 1.5266666666666667e-05, "loss": 0.7683, "mean_token_accuracy": 0.7920810837298632, "num_tokens": 13019785.0, "step": 940 }, { "entropy": 0.801921004243195, "epoch": 0.76, "grad_norm": 0.64453125, "learning_rate": 1.5192592592592594e-05, "loss": 0.7567, "mean_token_accuracy": 0.7981402266770601, "num_tokens": 13167975.0, "step": 950 }, { "entropy": 0.8200628351420164, "epoch": 0.768, "grad_norm": 0.6015625, "learning_rate": 1.5118518518518519e-05, "loss": 0.7712, "mean_token_accuracy": 0.7937319375574589, "num_tokens": 13307071.0, "step": 960 }, { "entropy": 0.844639228284359, "epoch": 0.776, "grad_norm": 0.68359375, "learning_rate": 1.5044444444444445e-05, "loss": 0.8066, "mean_token_accuracy": 0.7887587692588568, "num_tokens": 13444206.0, "step": 970 }, { "entropy": 0.8560901025310159, "epoch": 0.784, "grad_norm": 0.7109375, "learning_rate": 1.497037037037037e-05, "loss": 0.8159, "mean_token_accuracy": 0.7839560698717832, "num_tokens": 13586074.0, "step": 980 }, { "entropy": 0.8473082648590207, "epoch": 0.792, "grad_norm": 0.60546875, "learning_rate": 1.4896296296296298e-05, "loss": 0.8055, "mean_token_accuracy": 0.785613476485014, "num_tokens": 13727642.0, "step": 990 }, { "entropy": 0.8641716981306672, "epoch": 0.8, "grad_norm": 0.73828125, "learning_rate": 1.4822222222222225e-05, "loss": 0.8161, "mean_token_accuracy": 0.7843089539557695, "num_tokens": 13869252.0, "step": 1000 }, { "epoch": 0.8, "eval_biology_entropy": 2.927406606197357, "eval_biology_loss": 3.0816211700439453, "eval_biology_mean_token_accuracy": 0.46697849237918854, "eval_biology_num_tokens": 13869252.0, "eval_biology_runtime": 26.9151, "eval_biology_samples_per_second": 18.577, "eval_biology_steps_per_second": 9.288, "step": 1000 }, { "epoch": 0.8, "eval_chemistry_entropy": 1.6538746666908264, "eval_chemistry_loss": 1.6552778482437134, "eval_chemistry_mean_token_accuracy": 0.6485219527482986, "eval_chemistry_num_tokens": 13869252.0, "eval_chemistry_runtime": 34.4246, "eval_chemistry_samples_per_second": 14.525, "eval_chemistry_steps_per_second": 7.262, "step": 1000 }, { "epoch": 0.8, "eval_math_entropy": 0.8677517844438553, "eval_math_loss": 0.8380001187324524, "eval_math_mean_token_accuracy": 0.7821021761894226, "eval_math_num_tokens": 13869252.0, "eval_math_runtime": 34.92, "eval_math_samples_per_second": 14.318, "eval_math_steps_per_second": 7.159, "step": 1000 }, { "epoch": 0.8, "eval_physics_entropy": 1.2842487235069275, "eval_physics_loss": 1.2633119821548462, "eval_physics_mean_token_accuracy": 0.7128965456485749, "eval_physics_num_tokens": 13869252.0, "eval_physics_runtime": 40.4838, "eval_physics_samples_per_second": 12.351, "eval_physics_steps_per_second": 6.175, "step": 1000 }, { "entropy": 0.826583449728787, "epoch": 0.808, "grad_norm": 0.66015625, "learning_rate": 1.474814814814815e-05, "loss": 0.7747, "mean_token_accuracy": 0.7898658849298954, "num_tokens": 14008931.0, "step": 1010 }, { "entropy": 0.8473596401512623, "epoch": 0.816, "grad_norm": 0.69140625, "learning_rate": 1.4674074074074076e-05, "loss": 0.7943, "mean_token_accuracy": 0.7863069783896208, "num_tokens": 14151374.0, "step": 1020 }, { "entropy": 0.8361653342843056, "epoch": 0.824, "grad_norm": 0.71484375, "learning_rate": 1.46e-05, "loss": 0.7867, "mean_token_accuracy": 0.7892423421144485, "num_tokens": 14291862.0, "step": 1030 }, { "entropy": 0.832403457723558, "epoch": 0.832, "grad_norm": 0.7578125, "learning_rate": 1.4525925925925927e-05, "loss": 0.7757, "mean_token_accuracy": 0.792064693197608, "num_tokens": 14429535.0, "step": 1040 }, { "entropy": 0.8536718141287565, "epoch": 0.84, "grad_norm": 0.8359375, "learning_rate": 1.4451851851851852e-05, "loss": 0.8129, "mean_token_accuracy": 0.7852799411863088, "num_tokens": 14565836.0, "step": 1050 }, { "entropy": 0.8711680036038161, "epoch": 0.848, "grad_norm": 0.64453125, "learning_rate": 1.4377777777777779e-05, "loss": 0.822, "mean_token_accuracy": 0.7821739461272955, "num_tokens": 14703912.0, "step": 1060 }, { "entropy": 0.8762293307110667, "epoch": 0.856, "grad_norm": 0.70703125, "learning_rate": 1.4303703703703703e-05, "loss": 0.8157, "mean_token_accuracy": 0.7823412474244833, "num_tokens": 14843747.0, "step": 1070 }, { "entropy": 0.8367005018517375, "epoch": 0.864, "grad_norm": 0.67578125, "learning_rate": 1.4229629629629632e-05, "loss": 0.7889, "mean_token_accuracy": 0.7886287022382021, "num_tokens": 14981626.0, "step": 1080 }, { "entropy": 0.8624969441443682, "epoch": 0.872, "grad_norm": 0.66015625, "learning_rate": 1.4155555555555556e-05, "loss": 0.8014, "mean_token_accuracy": 0.7859421294182539, "num_tokens": 15119687.0, "step": 1090 }, { "entropy": 0.8763546861708165, "epoch": 0.88, "grad_norm": 0.88671875, "learning_rate": 1.4081481481481483e-05, "loss": 0.8178, "mean_token_accuracy": 0.778026782348752, "num_tokens": 15256318.0, "step": 1100 }, { "epoch": 0.88, "eval_biology_entropy": 2.896754152297974, "eval_biology_loss": 3.063400983810425, "eval_biology_mean_token_accuracy": 0.46827907478809355, "eval_biology_num_tokens": 15256318.0, "eval_biology_runtime": 27.0374, "eval_biology_samples_per_second": 18.493, "eval_biology_steps_per_second": 9.246, "step": 1100 }, { "epoch": 0.88, "eval_chemistry_entropy": 1.638319587945938, "eval_chemistry_loss": 1.6441051959991455, "eval_chemistry_mean_token_accuracy": 0.6500709130764007, "eval_chemistry_num_tokens": 15256318.0, "eval_chemistry_runtime": 34.5158, "eval_chemistry_samples_per_second": 14.486, "eval_chemistry_steps_per_second": 7.243, "step": 1100 }, { "epoch": 0.88, "eval_math_entropy": 0.862964648604393, "eval_math_loss": 0.8344796299934387, "eval_math_mean_token_accuracy": 0.7827803816795349, "eval_math_num_tokens": 15256318.0, "eval_math_runtime": 34.9946, "eval_math_samples_per_second": 14.288, "eval_math_steps_per_second": 7.144, "step": 1100 }, { "epoch": 0.88, "eval_physics_entropy": 1.2776771280765533, "eval_physics_loss": 1.2576273679733276, "eval_physics_mean_token_accuracy": 0.7135676975250245, "eval_physics_num_tokens": 15256318.0, "eval_physics_runtime": 40.5631, "eval_physics_samples_per_second": 12.326, "eval_physics_steps_per_second": 6.163, "step": 1100 }, { "entropy": 0.8450109241530299, "epoch": 0.888, "grad_norm": 0.703125, "learning_rate": 1.400740740740741e-05, "loss": 0.7945, "mean_token_accuracy": 0.7892760831862689, "num_tokens": 15390095.0, "step": 1110 }, { "entropy": 0.8675164245069027, "epoch": 0.896, "grad_norm": 0.71484375, "learning_rate": 1.3933333333333334e-05, "loss": 0.8145, "mean_token_accuracy": 0.7832493595778942, "num_tokens": 15527679.0, "step": 1120 }, { "entropy": 0.8140978695824742, "epoch": 0.904, "grad_norm": 0.66796875, "learning_rate": 1.385925925925926e-05, "loss": 0.7663, "mean_token_accuracy": 0.7915640484541655, "num_tokens": 15667231.0, "step": 1130 }, { "entropy": 0.8478411814197898, "epoch": 0.912, "grad_norm": 0.6796875, "learning_rate": 1.3785185185185186e-05, "loss": 0.7968, "mean_token_accuracy": 0.7854505475610495, "num_tokens": 15808515.0, "step": 1140 }, { "entropy": 0.8326777325943112, "epoch": 0.92, "grad_norm": 0.671875, "learning_rate": 1.3711111111111112e-05, "loss": 0.7785, "mean_token_accuracy": 0.7895378533750772, "num_tokens": 15944670.0, "step": 1150 }, { "entropy": 0.8248712478205562, "epoch": 0.928, "grad_norm": 0.6015625, "learning_rate": 1.3637037037037037e-05, "loss": 0.7752, "mean_token_accuracy": 0.7912526894360781, "num_tokens": 16081416.0, "step": 1160 }, { "entropy": 0.8408484322950244, "epoch": 0.936, "grad_norm": 0.6796875, "learning_rate": 1.3562962962962965e-05, "loss": 0.7991, "mean_token_accuracy": 0.7868186816573143, "num_tokens": 16218228.0, "step": 1170 }, { "entropy": 0.8680851750075818, "epoch": 0.944, "grad_norm": 0.82421875, "learning_rate": 1.3488888888888888e-05, "loss": 0.827, "mean_token_accuracy": 0.7814383570104837, "num_tokens": 16356073.0, "step": 1180 }, { "entropy": 0.8465319074690342, "epoch": 0.952, "grad_norm": 0.72265625, "learning_rate": 1.3414814814814817e-05, "loss": 0.7988, "mean_token_accuracy": 0.7877550296485424, "num_tokens": 16492413.0, "step": 1190 }, { "entropy": 0.8646586099639535, "epoch": 0.96, "grad_norm": 0.60546875, "learning_rate": 1.3340740740740741e-05, "loss": 0.8061, "mean_token_accuracy": 0.7851971492171288, "num_tokens": 16632796.0, "step": 1200 }, { "epoch": 0.96, "eval_biology_entropy": 2.908616391181946, "eval_biology_loss": 3.0608139038085938, "eval_biology_mean_token_accuracy": 0.4685148627758026, "eval_biology_num_tokens": 16632796.0, "eval_biology_runtime": 26.9626, "eval_biology_samples_per_second": 18.544, "eval_biology_steps_per_second": 9.272, "step": 1200 }, { "epoch": 0.96, "eval_chemistry_entropy": 1.6397918918132781, "eval_chemistry_loss": 1.640189290046692, "eval_chemistry_mean_token_accuracy": 0.6506748970746994, "eval_chemistry_num_tokens": 16632796.0, "eval_chemistry_runtime": 34.4834, "eval_chemistry_samples_per_second": 14.5, "eval_chemistry_steps_per_second": 7.25, "step": 1200 }, { "epoch": 0.96, "eval_math_entropy": 0.8619930381774903, "eval_math_loss": 0.8314722776412964, "eval_math_mean_token_accuracy": 0.783630312204361, "eval_math_num_tokens": 16632796.0, "eval_math_runtime": 35.3561, "eval_math_samples_per_second": 14.142, "eval_math_steps_per_second": 7.071, "step": 1200 }, { "epoch": 0.96, "eval_physics_entropy": 1.2789247527122498, "eval_physics_loss": 1.2552392482757568, "eval_physics_mean_token_accuracy": 0.7142456164360046, "eval_physics_num_tokens": 16632796.0, "eval_physics_runtime": 40.5336, "eval_physics_samples_per_second": 12.335, "eval_physics_steps_per_second": 6.168, "step": 1200 }, { "entropy": 0.8285112351179122, "epoch": 0.968, "grad_norm": 0.75390625, "learning_rate": 1.3266666666666668e-05, "loss": 0.776, "mean_token_accuracy": 0.7918854866176843, "num_tokens": 16773370.0, "step": 1210 }, { "entropy": 0.8358210870996118, "epoch": 0.976, "grad_norm": 0.6875, "learning_rate": 1.3192592592592594e-05, "loss": 0.7953, "mean_token_accuracy": 0.7874758914113045, "num_tokens": 16908991.0, "step": 1220 }, { "entropy": 0.8504814989864826, "epoch": 0.984, "grad_norm": 0.71875, "learning_rate": 1.311851851851852e-05, "loss": 0.8009, "mean_token_accuracy": 0.7851403091102839, "num_tokens": 17048278.0, "step": 1230 }, { "entropy": 0.8459016673266888, "epoch": 0.992, "grad_norm": 0.68359375, "learning_rate": 1.3044444444444446e-05, "loss": 0.7909, "mean_token_accuracy": 0.7847922585904599, "num_tokens": 17189644.0, "step": 1240 }, { "entropy": 0.848821627907455, "epoch": 1.0, "grad_norm": 0.60546875, "learning_rate": 1.297037037037037e-05, "loss": 0.8135, "mean_token_accuracy": 0.7836502768099308, "num_tokens": 17329320.0, "step": 1250 }, { "entropy": 0.8311884285882115, "epoch": 1.008, "grad_norm": 0.63671875, "learning_rate": 1.2896296296296299e-05, "loss": 0.7746, "mean_token_accuracy": 0.7915325038135052, "num_tokens": 17471675.0, "step": 1260 }, { "entropy": 0.8471982046961785, "epoch": 1.016, "grad_norm": 0.77734375, "learning_rate": 1.2822222222222222e-05, "loss": 0.7847, "mean_token_accuracy": 0.7903359431773425, "num_tokens": 17611177.0, "step": 1270 }, { "entropy": 0.8569712620228529, "epoch": 1.024, "grad_norm": 0.6484375, "learning_rate": 1.274814814814815e-05, "loss": 0.793, "mean_token_accuracy": 0.7870935752987862, "num_tokens": 17744248.0, "step": 1280 }, { "entropy": 0.8451526958495379, "epoch": 1.032, "grad_norm": 0.6875, "learning_rate": 1.2674074074074075e-05, "loss": 0.7979, "mean_token_accuracy": 0.7867367029190063, "num_tokens": 17881760.0, "step": 1290 }, { "entropy": 0.8108762178570033, "epoch": 1.04, "grad_norm": 0.63671875, "learning_rate": 1.2600000000000001e-05, "loss": 0.7625, "mean_token_accuracy": 0.7972131386399269, "num_tokens": 18014485.0, "step": 1300 }, { "epoch": 1.04, "eval_biology_entropy": 2.861083526134491, "eval_biology_loss": 3.054857015609741, "eval_biology_mean_token_accuracy": 0.469379000544548, "eval_biology_num_tokens": 18014485.0, "eval_biology_runtime": 26.9793, "eval_biology_samples_per_second": 18.533, "eval_biology_steps_per_second": 9.266, "step": 1300 }, { "epoch": 1.04, "eval_chemistry_entropy": 1.614394629240036, "eval_chemistry_loss": 1.6372945308685303, "eval_chemistry_mean_token_accuracy": 0.6513354105949402, "eval_chemistry_num_tokens": 18014485.0, "eval_chemistry_runtime": 34.4802, "eval_chemistry_samples_per_second": 14.501, "eval_chemistry_steps_per_second": 7.251, "step": 1300 }, { "epoch": 1.04, "eval_math_entropy": 0.8521397960186005, "eval_math_loss": 0.8296064734458923, "eval_math_mean_token_accuracy": 0.7838501288890839, "eval_math_num_tokens": 18014485.0, "eval_math_runtime": 34.9703, "eval_math_samples_per_second": 14.298, "eval_math_steps_per_second": 7.149, "step": 1300 }, { "epoch": 1.04, "eval_physics_entropy": 1.2607037408351898, "eval_physics_loss": 1.2520027160644531, "eval_physics_mean_token_accuracy": 0.7147621221542358, "eval_physics_num_tokens": 18014485.0, "eval_physics_runtime": 40.5706, "eval_physics_samples_per_second": 12.324, "eval_physics_steps_per_second": 6.162, "step": 1300 }, { "entropy": 0.8444511197507382, "epoch": 1.048, "grad_norm": 0.71875, "learning_rate": 1.2525925925925928e-05, "loss": 0.797, "mean_token_accuracy": 0.7875816393643618, "num_tokens": 18156077.0, "step": 1310 }, { "entropy": 0.8128441723063589, "epoch": 1.056, "grad_norm": 0.65625, "learning_rate": 1.2451851851851853e-05, "loss": 0.7764, "mean_token_accuracy": 0.7942416854202747, "num_tokens": 18294650.0, "step": 1320 }, { "entropy": 0.8420778449624777, "epoch": 1.064, "grad_norm": 0.68359375, "learning_rate": 1.237777777777778e-05, "loss": 0.7963, "mean_token_accuracy": 0.7890487045049668, "num_tokens": 18435862.0, "step": 1330 }, { "entropy": 0.8193851249292493, "epoch": 1.072, "grad_norm": 0.66015625, "learning_rate": 1.2303703703703704e-05, "loss": 0.7754, "mean_token_accuracy": 0.7921654149889946, "num_tokens": 18568936.0, "step": 1340 }, { "entropy": 0.841069664247334, "epoch": 1.08, "grad_norm": 0.68359375, "learning_rate": 1.222962962962963e-05, "loss": 0.7797, "mean_token_accuracy": 0.7857704412192106, "num_tokens": 18712013.0, "step": 1350 }, { "entropy": 0.8188380904495716, "epoch": 1.088, "grad_norm": 0.59765625, "learning_rate": 1.2155555555555555e-05, "loss": 0.7684, "mean_token_accuracy": 0.7954085689038038, "num_tokens": 18852497.0, "step": 1360 }, { "entropy": 0.8550961302593351, "epoch": 1.096, "grad_norm": 0.671875, "learning_rate": 1.2081481481481484e-05, "loss": 0.7868, "mean_token_accuracy": 0.7869294688105584, "num_tokens": 18997208.0, "step": 1370 }, { "entropy": 0.8430958347395062, "epoch": 1.104, "grad_norm": 0.65625, "learning_rate": 1.2007407407407408e-05, "loss": 0.7933, "mean_token_accuracy": 0.787052009999752, "num_tokens": 19136485.0, "step": 1380 }, { "entropy": 0.818677469342947, "epoch": 1.112, "grad_norm": 0.60546875, "learning_rate": 1.1933333333333335e-05, "loss": 0.7707, "mean_token_accuracy": 0.7958467341959476, "num_tokens": 19272237.0, "step": 1390 }, { "entropy": 0.8190919890999794, "epoch": 1.12, "grad_norm": 0.62890625, "learning_rate": 1.185925925925926e-05, "loss": 0.7635, "mean_token_accuracy": 0.793834337219596, "num_tokens": 19415146.0, "step": 1400 }, { "epoch": 1.12, "eval_biology_entropy": 2.8546162362098695, "eval_biology_loss": 3.046806573867798, "eval_biology_mean_token_accuracy": 0.47026353430747986, "eval_biology_num_tokens": 19415146.0, "eval_biology_runtime": 26.9248, "eval_biology_samples_per_second": 18.57, "eval_biology_steps_per_second": 9.285, "step": 1400 }, { "epoch": 1.12, "eval_chemistry_entropy": 1.6122739651203155, "eval_chemistry_loss": 1.6334017515182495, "eval_chemistry_mean_token_accuracy": 0.6514999225139618, "eval_chemistry_num_tokens": 19415146.0, "eval_chemistry_runtime": 34.4486, "eval_chemistry_samples_per_second": 14.514, "eval_chemistry_steps_per_second": 7.257, "step": 1400 }, { "epoch": 1.12, "eval_math_entropy": 0.8521307862997055, "eval_math_loss": 0.827960193157196, "eval_math_mean_token_accuracy": 0.7839585332870483, "eval_math_num_tokens": 19415146.0, "eval_math_runtime": 34.9467, "eval_math_samples_per_second": 14.308, "eval_math_steps_per_second": 7.154, "step": 1400 }, { "epoch": 1.12, "eval_physics_entropy": 1.2608284051418304, "eval_physics_loss": 1.2490959167480469, "eval_physics_mean_token_accuracy": 0.7151757082939147, "eval_physics_num_tokens": 19415146.0, "eval_physics_runtime": 40.537, "eval_physics_samples_per_second": 12.334, "eval_physics_steps_per_second": 6.167, "step": 1400 }, { "entropy": 0.8294604141265154, "epoch": 1.1280000000000001, "grad_norm": 0.7734375, "learning_rate": 1.1785185185185186e-05, "loss": 0.7807, "mean_token_accuracy": 0.7920530281960965, "num_tokens": 19550300.0, "step": 1410 }, { "entropy": 0.8140227854251861, "epoch": 1.1360000000000001, "grad_norm": 0.671875, "learning_rate": 1.1711111111111113e-05, "loss": 0.7705, "mean_token_accuracy": 0.7926124900579452, "num_tokens": 19687362.0, "step": 1420 }, { "entropy": 0.8327476343140006, "epoch": 1.144, "grad_norm": 0.6328125, "learning_rate": 1.1637037037037037e-05, "loss": 0.7873, "mean_token_accuracy": 0.7886258989572525, "num_tokens": 19825647.0, "step": 1430 }, { "entropy": 0.8256687164306641, "epoch": 1.152, "grad_norm": 0.70703125, "learning_rate": 1.1562962962962964e-05, "loss": 0.7785, "mean_token_accuracy": 0.7936491280794143, "num_tokens": 19968311.0, "step": 1440 }, { "entropy": 0.8199941594153642, "epoch": 1.16, "grad_norm": 0.64453125, "learning_rate": 1.1488888888888889e-05, "loss": 0.7707, "mean_token_accuracy": 0.7947168540209532, "num_tokens": 20115101.0, "step": 1450 }, { "entropy": 0.8253829684108496, "epoch": 1.168, "grad_norm": 0.65625, "learning_rate": 1.1414814814814817e-05, "loss": 0.7826, "mean_token_accuracy": 0.7904348153620958, "num_tokens": 20254700.0, "step": 1460 }, { "entropy": 0.8298774238675832, "epoch": 1.176, "grad_norm": 0.66015625, "learning_rate": 1.1340740740740742e-05, "loss": 0.7714, "mean_token_accuracy": 0.7898557730019092, "num_tokens": 20394054.0, "step": 1470 }, { "entropy": 0.8195257894694805, "epoch": 1.184, "grad_norm": 0.63671875, "learning_rate": 1.1266666666666668e-05, "loss": 0.7762, "mean_token_accuracy": 0.7923125520348548, "num_tokens": 20535739.0, "step": 1480 }, { "entropy": 0.8128214536234737, "epoch": 1.192, "grad_norm": 0.70703125, "learning_rate": 1.1192592592592593e-05, "loss": 0.7665, "mean_token_accuracy": 0.7968613497912884, "num_tokens": 20679391.0, "step": 1490 }, { "entropy": 0.8249955477192998, "epoch": 1.2, "grad_norm": 0.6640625, "learning_rate": 1.111851851851852e-05, "loss": 0.7804, "mean_token_accuracy": 0.789951428771019, "num_tokens": 20822188.0, "step": 1500 }, { "epoch": 1.2, "eval_biology_entropy": 2.8529665331840515, "eval_biology_loss": 3.043259620666504, "eval_biology_mean_token_accuracy": 0.4708596212863922, "eval_biology_num_tokens": 20822188.0, "eval_biology_runtime": 26.932, "eval_biology_samples_per_second": 18.565, "eval_biology_steps_per_second": 9.283, "step": 1500 }, { "epoch": 1.2, "eval_chemistry_entropy": 1.6120401091575622, "eval_chemistry_loss": 1.6317973136901855, "eval_chemistry_mean_token_accuracy": 0.6518439574241638, "eval_chemistry_num_tokens": 20822188.0, "eval_chemistry_runtime": 34.4788, "eval_chemistry_samples_per_second": 14.502, "eval_chemistry_steps_per_second": 7.251, "step": 1500 }, { "epoch": 1.2, "eval_math_entropy": 0.8526870446205139, "eval_math_loss": 0.8266505599021912, "eval_math_mean_token_accuracy": 0.7845923402309418, "eval_math_num_tokens": 20822188.0, "eval_math_runtime": 34.9055, "eval_math_samples_per_second": 14.324, "eval_math_steps_per_second": 7.162, "step": 1500 }, { "epoch": 1.2, "eval_physics_entropy": 1.2596520464420318, "eval_physics_loss": 1.2472213506698608, "eval_physics_mean_token_accuracy": 0.715502037525177, "eval_physics_num_tokens": 20822188.0, "eval_physics_runtime": 40.5487, "eval_physics_samples_per_second": 12.331, "eval_physics_steps_per_second": 6.165, "step": 1500 }, { "entropy": 0.8487134363502264, "epoch": 1.208, "grad_norm": 0.6484375, "learning_rate": 1.1044444444444444e-05, "loss": 0.7823, "mean_token_accuracy": 0.788084252178669, "num_tokens": 20960481.0, "step": 1510 }, { "entropy": 0.8314280182123184, "epoch": 1.216, "grad_norm": 0.66796875, "learning_rate": 1.0970370370370371e-05, "loss": 0.7697, "mean_token_accuracy": 0.7901332624256611, "num_tokens": 21095959.0, "step": 1520 }, { "entropy": 0.8034210726618767, "epoch": 1.224, "grad_norm": 0.65234375, "learning_rate": 1.0896296296296298e-05, "loss": 0.7583, "mean_token_accuracy": 0.7961955599486827, "num_tokens": 21234422.0, "step": 1530 }, { "entropy": 0.8255144201219082, "epoch": 1.232, "grad_norm": 0.70703125, "learning_rate": 1.0822222222222222e-05, "loss": 0.7632, "mean_token_accuracy": 0.7938656162470579, "num_tokens": 21370052.0, "step": 1540 }, { "entropy": 0.8085383821278811, "epoch": 1.24, "grad_norm": 0.6640625, "learning_rate": 1.074814814814815e-05, "loss": 0.7552, "mean_token_accuracy": 0.79493248462677, "num_tokens": 21511515.0, "step": 1550 }, { "entropy": 0.824030945263803, "epoch": 1.248, "grad_norm": 0.6171875, "learning_rate": 1.0674074074074074e-05, "loss": 0.77, "mean_token_accuracy": 0.792547807842493, "num_tokens": 21648728.0, "step": 1560 }, { "entropy": 0.8271475210785866, "epoch": 1.256, "grad_norm": 0.6796875, "learning_rate": 1.0600000000000002e-05, "loss": 0.7797, "mean_token_accuracy": 0.7907101519405841, "num_tokens": 21789454.0, "step": 1570 }, { "entropy": 0.8608066206797957, "epoch": 1.264, "grad_norm": 0.6953125, "learning_rate": 1.0525925925925927e-05, "loss": 0.7857, "mean_token_accuracy": 0.7850348219275475, "num_tokens": 21927034.0, "step": 1580 }, { "entropy": 0.8113653404638171, "epoch": 1.272, "grad_norm": 0.66015625, "learning_rate": 1.0451851851851853e-05, "loss": 0.7717, "mean_token_accuracy": 0.794109757617116, "num_tokens": 22057647.0, "step": 1590 }, { "entropy": 0.8274726118892431, "epoch": 1.28, "grad_norm": 0.62890625, "learning_rate": 1.0377777777777778e-05, "loss": 0.7794, "mean_token_accuracy": 0.790321846306324, "num_tokens": 22199229.0, "step": 1600 }, { "epoch": 1.28, "eval_biology_entropy": 2.8388440871238707, "eval_biology_loss": 3.0385401248931885, "eval_biology_mean_token_accuracy": 0.47107076036930084, "eval_biology_num_tokens": 22199229.0, "eval_biology_runtime": 26.9767, "eval_biology_samples_per_second": 18.535, "eval_biology_steps_per_second": 9.267, "step": 1600 }, { "epoch": 1.28, "eval_chemistry_entropy": 1.6038966157436372, "eval_chemistry_loss": 1.629550576210022, "eval_chemistry_mean_token_accuracy": 0.6523506959676743, "eval_chemistry_num_tokens": 22199229.0, "eval_chemistry_runtime": 34.5012, "eval_chemistry_samples_per_second": 14.492, "eval_chemistry_steps_per_second": 7.246, "step": 1600 }, { "epoch": 1.28, "eval_math_entropy": 0.8486473338603974, "eval_math_loss": 0.8255683183670044, "eval_math_mean_token_accuracy": 0.7848021316528321, "eval_math_num_tokens": 22199229.0, "eval_math_runtime": 34.9828, "eval_math_samples_per_second": 14.293, "eval_math_steps_per_second": 7.146, "step": 1600 }, { "epoch": 1.28, "eval_physics_entropy": 1.2539866659641266, "eval_physics_loss": 1.2459049224853516, "eval_physics_mean_token_accuracy": 0.7157720692157745, "eval_physics_num_tokens": 22199229.0, "eval_physics_runtime": 40.5655, "eval_physics_samples_per_second": 12.326, "eval_physics_steps_per_second": 6.163, "step": 1600 }, { "entropy": 0.8169244319200516, "epoch": 1.288, "grad_norm": 0.68359375, "learning_rate": 1.0303703703703705e-05, "loss": 0.7704, "mean_token_accuracy": 0.7924644846469164, "num_tokens": 22335695.0, "step": 1610 }, { "entropy": 0.8122199434787035, "epoch": 1.296, "grad_norm": 0.65625, "learning_rate": 1.0229629629629631e-05, "loss": 0.7639, "mean_token_accuracy": 0.7955255527049303, "num_tokens": 22470502.0, "step": 1620 }, { "entropy": 0.8375904524698854, "epoch": 1.304, "grad_norm": 0.62109375, "learning_rate": 1.0155555555555556e-05, "loss": 0.7728, "mean_token_accuracy": 0.7911740705370903, "num_tokens": 22610780.0, "step": 1630 }, { "entropy": 0.8467503670603037, "epoch": 1.312, "grad_norm": 0.68359375, "learning_rate": 1.0081481481481484e-05, "loss": 0.8097, "mean_token_accuracy": 0.7875058270990849, "num_tokens": 22749763.0, "step": 1640 }, { "entropy": 0.8325416158884764, "epoch": 1.32, "grad_norm": 0.66015625, "learning_rate": 1.0007407407407407e-05, "loss": 0.7816, "mean_token_accuracy": 0.7904935259371996, "num_tokens": 22886557.0, "step": 1650 }, { "entropy": 0.8405437588691711, "epoch": 1.328, "grad_norm": 0.73828125, "learning_rate": 9.933333333333334e-06, "loss": 0.7953, "mean_token_accuracy": 0.7865632444620132, "num_tokens": 23020866.0, "step": 1660 }, { "entropy": 0.8233300801366568, "epoch": 1.336, "grad_norm": 0.75390625, "learning_rate": 9.85925925925926e-06, "loss": 0.7806, "mean_token_accuracy": 0.7902509056031704, "num_tokens": 23155782.0, "step": 1670 }, { "entropy": 0.7958553103730083, "epoch": 1.3439999999999999, "grad_norm": 0.60546875, "learning_rate": 9.785185185185187e-06, "loss": 0.7631, "mean_token_accuracy": 0.797454484179616, "num_tokens": 23291068.0, "step": 1680 }, { "entropy": 0.8144848184660077, "epoch": 1.3519999999999999, "grad_norm": 0.66015625, "learning_rate": 9.711111111111111e-06, "loss": 0.7765, "mean_token_accuracy": 0.7929423958063125, "num_tokens": 23430048.0, "step": 1690 }, { "entropy": 0.8195401934906841, "epoch": 1.3599999999999999, "grad_norm": 0.69921875, "learning_rate": 9.637037037037038e-06, "loss": 0.7735, "mean_token_accuracy": 0.7930282630026341, "num_tokens": 23571779.0, "step": 1700 }, { "epoch": 1.3599999999999999, "eval_biology_entropy": 2.8502994742393493, "eval_biology_loss": 3.0442328453063965, "eval_biology_mean_token_accuracy": 0.47077508652210237, "eval_biology_num_tokens": 23571779.0, "eval_biology_runtime": 27.0342, "eval_biology_samples_per_second": 18.495, "eval_biology_steps_per_second": 9.248, "step": 1700 }, { "epoch": 1.3599999999999999, "eval_chemistry_entropy": 1.604833221435547, "eval_chemistry_loss": 1.6297847032546997, "eval_chemistry_mean_token_accuracy": 0.65207393181324, "eval_chemistry_num_tokens": 23571779.0, "eval_chemistry_runtime": 34.6203, "eval_chemistry_samples_per_second": 14.442, "eval_chemistry_steps_per_second": 7.221, "step": 1700 }, { "epoch": 1.3599999999999999, "eval_math_entropy": 0.8477920205593109, "eval_math_loss": 0.8247544169425964, "eval_math_mean_token_accuracy": 0.7847949225902557, "eval_math_num_tokens": 23571779.0, "eval_math_runtime": 35.0722, "eval_math_samples_per_second": 14.256, "eval_math_steps_per_second": 7.128, "step": 1700 }, { "epoch": 1.3599999999999999, "eval_physics_entropy": 1.2535700430870056, "eval_physics_loss": 1.2455135583877563, "eval_physics_mean_token_accuracy": 0.7158266956806183, "eval_physics_num_tokens": 23571779.0, "eval_physics_runtime": 40.6362, "eval_physics_samples_per_second": 12.304, "eval_physics_steps_per_second": 6.152, "step": 1700 }, { "entropy": 0.8383084131404758, "epoch": 1.3679999999999999, "grad_norm": 0.74609375, "learning_rate": 9.562962962962965e-06, "loss": 0.7956, "mean_token_accuracy": 0.7880340579897165, "num_tokens": 23707772.0, "step": 1710 }, { "entropy": 0.8829018753021955, "epoch": 1.376, "grad_norm": 0.62890625, "learning_rate": 9.48888888888889e-06, "loss": 0.8363, "mean_token_accuracy": 0.7783376723527908, "num_tokens": 23844985.0, "step": 1720 }, { "entropy": 0.8262542556971312, "epoch": 1.384, "grad_norm": 0.66796875, "learning_rate": 9.414814814814816e-06, "loss": 0.7657, "mean_token_accuracy": 0.7913823142647743, "num_tokens": 23981628.0, "step": 1730 }, { "entropy": 0.8316655196249485, "epoch": 1.392, "grad_norm": 0.63671875, "learning_rate": 9.34074074074074e-06, "loss": 0.7787, "mean_token_accuracy": 0.7895013779401779, "num_tokens": 24119732.0, "step": 1740 }, { "entropy": 0.7973068812862039, "epoch": 1.4, "grad_norm": 0.71484375, "learning_rate": 9.266666666666667e-06, "loss": 0.7521, "mean_token_accuracy": 0.7974414359778166, "num_tokens": 24262401.0, "step": 1750 }, { "entropy": 0.8182015813887119, "epoch": 1.408, "grad_norm": 0.76171875, "learning_rate": 9.192592592592594e-06, "loss": 0.7642, "mean_token_accuracy": 0.7919386733323336, "num_tokens": 24401537.0, "step": 1760 }, { "entropy": 0.8352840909734368, "epoch": 1.416, "grad_norm": 0.71875, "learning_rate": 9.118518518518518e-06, "loss": 0.7858, "mean_token_accuracy": 0.7885801542550326, "num_tokens": 24540281.0, "step": 1770 }, { "entropy": 0.7927129395306111, "epoch": 1.424, "grad_norm": 0.60546875, "learning_rate": 9.044444444444445e-06, "loss": 0.742, "mean_token_accuracy": 0.7989379655569792, "num_tokens": 24682390.0, "step": 1780 }, { "entropy": 0.8145547799766064, "epoch": 1.432, "grad_norm": 0.640625, "learning_rate": 8.970370370370372e-06, "loss": 0.7624, "mean_token_accuracy": 0.7933211088180542, "num_tokens": 24820884.0, "step": 1790 }, { "entropy": 0.8423224657773971, "epoch": 1.44, "grad_norm": 0.671875, "learning_rate": 8.896296296296298e-06, "loss": 0.7958, "mean_token_accuracy": 0.7877123914659023, "num_tokens": 24957390.0, "step": 1800 }, { "epoch": 1.44, "eval_biology_entropy": 2.8344242753982543, "eval_biology_loss": 3.0405893325805664, "eval_biology_mean_token_accuracy": 0.47124050533771517, "eval_biology_num_tokens": 24957390.0, "eval_biology_runtime": 27.0536, "eval_biology_samples_per_second": 18.482, "eval_biology_steps_per_second": 9.241, "step": 1800 }, { "epoch": 1.44, "eval_chemistry_entropy": 1.5973036425113678, "eval_chemistry_loss": 1.6277400255203247, "eval_chemistry_mean_token_accuracy": 0.6527108114957809, "eval_chemistry_num_tokens": 24957390.0, "eval_chemistry_runtime": 34.5577, "eval_chemistry_samples_per_second": 14.469, "eval_chemistry_steps_per_second": 7.234, "step": 1800 }, { "epoch": 1.44, "eval_math_entropy": 0.8444431309700012, "eval_math_loss": 0.823908805847168, "eval_math_mean_token_accuracy": 0.7849345812797547, "eval_math_num_tokens": 24957390.0, "eval_math_runtime": 35.0263, "eval_math_samples_per_second": 14.275, "eval_math_steps_per_second": 7.137, "step": 1800 }, { "epoch": 1.44, "eval_physics_entropy": 1.2485797266960144, "eval_physics_loss": 1.2440050840377808, "eval_physics_mean_token_accuracy": 0.7161458423137664, "eval_physics_num_tokens": 24957390.0, "eval_physics_runtime": 40.7709, "eval_physics_samples_per_second": 12.264, "eval_physics_steps_per_second": 6.132, "step": 1800 }, { "entropy": 0.8316453367471695, "epoch": 1.448, "grad_norm": 0.703125, "learning_rate": 8.822222222222223e-06, "loss": 0.7816, "mean_token_accuracy": 0.7880163863301277, "num_tokens": 25091961.0, "step": 1810 }, { "entropy": 0.8137075098231434, "epoch": 1.456, "grad_norm": 0.67578125, "learning_rate": 8.74814814814815e-06, "loss": 0.7661, "mean_token_accuracy": 0.7935271102935075, "num_tokens": 25229390.0, "step": 1820 }, { "entropy": 0.8185559125617147, "epoch": 1.464, "grad_norm": 0.6015625, "learning_rate": 8.674074074074074e-06, "loss": 0.7898, "mean_token_accuracy": 0.790428926050663, "num_tokens": 25365072.0, "step": 1830 }, { "entropy": 0.8206291127949953, "epoch": 1.472, "grad_norm": 0.5703125, "learning_rate": 8.6e-06, "loss": 0.7714, "mean_token_accuracy": 0.7919356789439916, "num_tokens": 25499263.0, "step": 1840 }, { "entropy": 0.833819380402565, "epoch": 1.48, "grad_norm": 0.69921875, "learning_rate": 8.525925925925927e-06, "loss": 0.7875, "mean_token_accuracy": 0.7891976829618216, "num_tokens": 25630695.0, "step": 1850 }, { "entropy": 0.860048278607428, "epoch": 1.488, "grad_norm": 0.765625, "learning_rate": 8.451851851851852e-06, "loss": 0.8185, "mean_token_accuracy": 0.7816774118691683, "num_tokens": 25766470.0, "step": 1860 }, { "entropy": 0.8099161019548774, "epoch": 1.496, "grad_norm": 0.66015625, "learning_rate": 8.377777777777779e-06, "loss": 0.7587, "mean_token_accuracy": 0.7959996823221445, "num_tokens": 25909007.0, "step": 1870 }, { "entropy": 0.8094545688480139, "epoch": 1.504, "grad_norm": 0.734375, "learning_rate": 8.303703703703705e-06, "loss": 0.7604, "mean_token_accuracy": 0.7959994535893202, "num_tokens": 26051351.0, "step": 1880 }, { "entropy": 0.8493411887437106, "epoch": 1.512, "grad_norm": 0.68359375, "learning_rate": 8.229629629629632e-06, "loss": 0.8068, "mean_token_accuracy": 0.7882944524288178, "num_tokens": 26188237.0, "step": 1890 }, { "entropy": 0.8235599439591169, "epoch": 1.52, "grad_norm": 0.62890625, "learning_rate": 8.155555555555556e-06, "loss": 0.7783, "mean_token_accuracy": 0.79057194031775, "num_tokens": 26331956.0, "step": 1900 }, { "epoch": 1.52, "eval_biology_entropy": 2.8468204393386842, "eval_biology_loss": 3.039438009262085, "eval_biology_mean_token_accuracy": 0.4713715660572052, "eval_biology_num_tokens": 26331956.0, "eval_biology_runtime": 26.9649, "eval_biology_samples_per_second": 18.543, "eval_biology_steps_per_second": 9.271, "step": 1900 }, { "epoch": 1.52, "eval_chemistry_entropy": 1.6046333646774291, "eval_chemistry_loss": 1.6270935535430908, "eval_chemistry_mean_token_accuracy": 0.6528140946626663, "eval_chemistry_num_tokens": 26331956.0, "eval_chemistry_runtime": 34.4714, "eval_chemistry_samples_per_second": 14.505, "eval_chemistry_steps_per_second": 7.252, "step": 1900 }, { "epoch": 1.52, "eval_math_entropy": 0.8492136422395706, "eval_math_loss": 0.8234105110168457, "eval_math_mean_token_accuracy": 0.7850139417648315, "eval_math_num_tokens": 26331956.0, "eval_math_runtime": 34.9296, "eval_math_samples_per_second": 14.314, "eval_math_steps_per_second": 7.157, "step": 1900 }, { "epoch": 1.52, "eval_physics_entropy": 1.254153406381607, "eval_physics_loss": 1.2435050010681152, "eval_physics_mean_token_accuracy": 0.7162581722736359, "eval_physics_num_tokens": 26331956.0, "eval_physics_runtime": 40.5372, "eval_physics_samples_per_second": 12.334, "eval_physics_steps_per_second": 6.167, "step": 1900 }, { "entropy": 0.8410981273278594, "epoch": 1.528, "grad_norm": 0.70703125, "learning_rate": 8.081481481481483e-06, "loss": 0.7966, "mean_token_accuracy": 0.7865708086639642, "num_tokens": 26469780.0, "step": 1910 }, { "entropy": 0.8120917391031981, "epoch": 1.536, "grad_norm": 0.6796875, "learning_rate": 8.007407407407408e-06, "loss": 0.7679, "mean_token_accuracy": 0.7950867023319006, "num_tokens": 26613031.0, "step": 1920 }, { "entropy": 0.813549479842186, "epoch": 1.544, "grad_norm": 0.63671875, "learning_rate": 7.933333333333334e-06, "loss": 0.7604, "mean_token_accuracy": 0.7951221365481616, "num_tokens": 26748931.0, "step": 1930 }, { "entropy": 0.8368369644507766, "epoch": 1.552, "grad_norm": 0.65625, "learning_rate": 7.859259259259259e-06, "loss": 0.7908, "mean_token_accuracy": 0.7891507402062417, "num_tokens": 26892482.0, "step": 1940 }, { "entropy": 0.8099740859121084, "epoch": 1.56, "grad_norm": 0.7890625, "learning_rate": 7.785185185185185e-06, "loss": 0.7465, "mean_token_accuracy": 0.7952026624232531, "num_tokens": 27033385.0, "step": 1950 }, { "entropy": 0.8125249484553934, "epoch": 1.568, "grad_norm": 0.62109375, "learning_rate": 7.711111111111112e-06, "loss": 0.7716, "mean_token_accuracy": 0.795727027580142, "num_tokens": 27177253.0, "step": 1960 }, { "entropy": 0.8376018533483147, "epoch": 1.576, "grad_norm": 0.7109375, "learning_rate": 7.637037037037037e-06, "loss": 0.7807, "mean_token_accuracy": 0.7907413702458144, "num_tokens": 27314645.0, "step": 1970 }, { "entropy": 0.8189013006165624, "epoch": 1.584, "grad_norm": 0.87109375, "learning_rate": 7.562962962962963e-06, "loss": 0.771, "mean_token_accuracy": 0.7916807536035776, "num_tokens": 27456524.0, "step": 1980 }, { "entropy": 0.8082204090431333, "epoch": 1.592, "grad_norm": 0.6328125, "learning_rate": 7.48888888888889e-06, "loss": 0.7578, "mean_token_accuracy": 0.7939547438174486, "num_tokens": 27592021.0, "step": 1990 }, { "entropy": 0.8296483233571053, "epoch": 1.6, "grad_norm": 0.62109375, "learning_rate": 7.4148148148148155e-06, "loss": 0.7837, "mean_token_accuracy": 0.7896194510161877, "num_tokens": 27734080.0, "step": 2000 }, { "epoch": 1.6, "eval_biology_entropy": 2.848469418525696, "eval_biology_loss": 3.041050672531128, "eval_biology_mean_token_accuracy": 0.4710944994688034, "eval_biology_num_tokens": 27734080.0, "eval_biology_runtime": 26.9667, "eval_biology_samples_per_second": 18.541, "eval_biology_steps_per_second": 9.271, "step": 2000 }, { "epoch": 1.6, "eval_chemistry_entropy": 1.6052903940677643, "eval_chemistry_loss": 1.6265658140182495, "eval_chemistry_mean_token_accuracy": 0.6528225523233414, "eval_chemistry_num_tokens": 27734080.0, "eval_chemistry_runtime": 34.5552, "eval_chemistry_samples_per_second": 14.47, "eval_chemistry_steps_per_second": 7.235, "step": 2000 }, { "epoch": 1.6, "eval_math_entropy": 0.8491179572343827, "eval_math_loss": 0.8228975534439087, "eval_math_mean_token_accuracy": 0.7851616134643554, "eval_math_num_tokens": 27734080.0, "eval_math_runtime": 35.0391, "eval_math_samples_per_second": 14.27, "eval_math_steps_per_second": 7.135, "step": 2000 }, { "epoch": 1.6, "eval_physics_entropy": 1.2549104006290437, "eval_physics_loss": 1.2431191205978394, "eval_physics_mean_token_accuracy": 0.7163957259654998, "eval_physics_num_tokens": 27734080.0, "eval_physics_runtime": 40.6487, "eval_physics_samples_per_second": 12.301, "eval_physics_steps_per_second": 6.15, "step": 2000 }, { "entropy": 0.8265687106177211, "epoch": 1.608, "grad_norm": 0.71875, "learning_rate": 7.340740740740742e-06, "loss": 0.7784, "mean_token_accuracy": 0.7915367607027293, "num_tokens": 27869074.0, "step": 2010 }, { "entropy": 0.8278342794626952, "epoch": 1.616, "grad_norm": 0.640625, "learning_rate": 7.266666666666668e-06, "loss": 0.7955, "mean_token_accuracy": 0.7893077913671732, "num_tokens": 28011317.0, "step": 2020 }, { "entropy": 0.8308734528720378, "epoch": 1.624, "grad_norm": 0.65234375, "learning_rate": 7.192592592592593e-06, "loss": 0.7831, "mean_token_accuracy": 0.7892321836203336, "num_tokens": 28145049.0, "step": 2030 }, { "entropy": 0.8520573848858476, "epoch": 1.6320000000000001, "grad_norm": 0.80078125, "learning_rate": 7.118518518518519e-06, "loss": 0.7978, "mean_token_accuracy": 0.7839532144367695, "num_tokens": 28278374.0, "step": 2040 }, { "entropy": 0.7954119427129627, "epoch": 1.6400000000000001, "grad_norm": 0.640625, "learning_rate": 7.044444444444445e-06, "loss": 0.7415, "mean_token_accuracy": 0.8002398379147053, "num_tokens": 28415627.0, "step": 2050 }, { "entropy": 0.8363964034244418, "epoch": 1.6480000000000001, "grad_norm": 0.671875, "learning_rate": 6.97037037037037e-06, "loss": 0.7808, "mean_token_accuracy": 0.7943990144878625, "num_tokens": 28558140.0, "step": 2060 }, { "entropy": 0.8353608377277851, "epoch": 1.6560000000000001, "grad_norm": 0.72265625, "learning_rate": 6.896296296296297e-06, "loss": 0.7782, "mean_token_accuracy": 0.7878375049680472, "num_tokens": 28703165.0, "step": 2070 }, { "entropy": 0.8140342826023698, "epoch": 1.6640000000000001, "grad_norm": 0.72265625, "learning_rate": 6.8222222222222225e-06, "loss": 0.7604, "mean_token_accuracy": 0.7917621362954378, "num_tokens": 28844845.0, "step": 2080 }, { "entropy": 0.8718782737851143, "epoch": 1.6720000000000002, "grad_norm": 0.65234375, "learning_rate": 6.748148148148149e-06, "loss": 0.82, "mean_token_accuracy": 0.7796166706830263, "num_tokens": 28986433.0, "step": 2090 }, { "entropy": 0.7938652452081442, "epoch": 1.6800000000000002, "grad_norm": 0.609375, "learning_rate": 6.674074074074075e-06, "loss": 0.7443, "mean_token_accuracy": 0.800289686396718, "num_tokens": 29124095.0, "step": 2100 }, { "epoch": 1.6800000000000002, "eval_biology_entropy": 2.8482976722717286, "eval_biology_loss": 3.0416367053985596, "eval_biology_mean_token_accuracy": 0.47124637401103975, "eval_biology_num_tokens": 29124095.0, "eval_biology_runtime": 27.0617, "eval_biology_samples_per_second": 18.476, "eval_biology_steps_per_second": 9.238, "step": 2100 }, { "epoch": 1.6800000000000002, "eval_chemistry_entropy": 1.603459088563919, "eval_chemistry_loss": 1.6268434524536133, "eval_chemistry_mean_token_accuracy": 0.6529314165115356, "eval_chemistry_num_tokens": 29124095.0, "eval_chemistry_runtime": 34.4583, "eval_chemistry_samples_per_second": 14.51, "eval_chemistry_steps_per_second": 7.255, "step": 2100 }, { "epoch": 1.6800000000000002, "eval_math_entropy": 0.847537318944931, "eval_math_loss": 0.8226439356803894, "eval_math_mean_token_accuracy": 0.7852621068954467, "eval_math_num_tokens": 29124095.0, "eval_math_runtime": 35.0403, "eval_math_samples_per_second": 14.269, "eval_math_steps_per_second": 7.135, "step": 2100 }, { "epoch": 1.6800000000000002, "eval_physics_entropy": 1.2527501695156098, "eval_physics_loss": 1.2428388595581055, "eval_physics_mean_token_accuracy": 0.7164320776462555, "eval_physics_num_tokens": 29124095.0, "eval_physics_runtime": 40.5543, "eval_physics_samples_per_second": 12.329, "eval_physics_steps_per_second": 6.165, "step": 2100 }, { "entropy": 0.8439570276066661, "epoch": 1.688, "grad_norm": 0.71875, "learning_rate": 6.600000000000001e-06, "loss": 0.7825, "mean_token_accuracy": 0.7874237090349198, "num_tokens": 29258437.0, "step": 2110 }, { "entropy": 0.8313365705311299, "epoch": 1.696, "grad_norm": 0.68359375, "learning_rate": 6.525925925925927e-06, "loss": 0.7807, "mean_token_accuracy": 0.7899383477866649, "num_tokens": 29398520.0, "step": 2120 }, { "entropy": 0.8049716239795088, "epoch": 1.704, "grad_norm": 0.671875, "learning_rate": 6.4518518518518525e-06, "loss": 0.7606, "mean_token_accuracy": 0.7955230951309205, "num_tokens": 29539888.0, "step": 2130 }, { "entropy": 0.821560175716877, "epoch": 1.712, "grad_norm": 0.59375, "learning_rate": 6.377777777777778e-06, "loss": 0.7639, "mean_token_accuracy": 0.7909712709486485, "num_tokens": 29686105.0, "step": 2140 }, { "entropy": 0.8149364372715354, "epoch": 1.72, "grad_norm": 0.703125, "learning_rate": 6.303703703703704e-06, "loss": 0.7529, "mean_token_accuracy": 0.7945274922996759, "num_tokens": 29820793.0, "step": 2150 }, { "entropy": 0.8199317397549748, "epoch": 1.728, "grad_norm": 0.734375, "learning_rate": 6.2296296296296295e-06, "loss": 0.7723, "mean_token_accuracy": 0.7926640961319208, "num_tokens": 29957158.0, "step": 2160 }, { "entropy": 0.8122139278799295, "epoch": 1.736, "grad_norm": 0.6875, "learning_rate": 6.155555555555556e-06, "loss": 0.7631, "mean_token_accuracy": 0.7937244303524494, "num_tokens": 30094112.0, "step": 2170 }, { "entropy": 0.8205946860834956, "epoch": 1.744, "grad_norm": 0.66796875, "learning_rate": 6.081481481481482e-06, "loss": 0.7844, "mean_token_accuracy": 0.793923956528306, "num_tokens": 30233084.0, "step": 2180 }, { "entropy": 0.8003243621438741, "epoch": 1.752, "grad_norm": 0.65625, "learning_rate": 6.007407407407407e-06, "loss": 0.755, "mean_token_accuracy": 0.7946645710617304, "num_tokens": 30372728.0, "step": 2190 }, { "entropy": 0.8203964494168758, "epoch": 1.76, "grad_norm": 0.70703125, "learning_rate": 5.933333333333335e-06, "loss": 0.78, "mean_token_accuracy": 0.791372949257493, "num_tokens": 30509617.0, "step": 2200 }, { "epoch": 1.76, "eval_biology_entropy": 2.846324257850647, "eval_biology_loss": 3.040083885192871, "eval_biology_mean_token_accuracy": 0.47158474671840667, "eval_biology_num_tokens": 30509617.0, "eval_biology_runtime": 27.1152, "eval_biology_samples_per_second": 18.44, "eval_biology_steps_per_second": 9.22, "step": 2200 }, { "epoch": 1.76, "eval_chemistry_entropy": 1.6028525500297546, "eval_chemistry_loss": 1.626190423965454, "eval_chemistry_mean_token_accuracy": 0.6529276027679444, "eval_chemistry_num_tokens": 30509617.0, "eval_chemistry_runtime": 34.5753, "eval_chemistry_samples_per_second": 14.461, "eval_chemistry_steps_per_second": 7.231, "step": 2200 }, { "epoch": 1.76, "eval_math_entropy": 0.8474567606449127, "eval_math_loss": 0.8222874999046326, "eval_math_mean_token_accuracy": 0.7852948112487793, "eval_math_num_tokens": 30509617.0, "eval_math_runtime": 35.1106, "eval_math_samples_per_second": 14.241, "eval_math_steps_per_second": 7.12, "step": 2200 }, { "epoch": 1.76, "eval_physics_entropy": 1.2525973563194275, "eval_physics_loss": 1.2423382997512817, "eval_physics_mean_token_accuracy": 0.7164157366752625, "eval_physics_num_tokens": 30509617.0, "eval_physics_runtime": 40.572, "eval_physics_samples_per_second": 12.324, "eval_physics_steps_per_second": 6.162, "step": 2200 }, { "entropy": 0.8106696594506502, "epoch": 1.768, "grad_norm": 0.6484375, "learning_rate": 5.85925925925926e-06, "loss": 0.7626, "mean_token_accuracy": 0.7939427189528943, "num_tokens": 30652913.0, "step": 2210 }, { "entropy": 0.8465521112084389, "epoch": 1.776, "grad_norm": 0.7109375, "learning_rate": 5.785185185185186e-06, "loss": 0.7892, "mean_token_accuracy": 0.790094893053174, "num_tokens": 30792010.0, "step": 2220 }, { "entropy": 0.8444044131785631, "epoch": 1.784, "grad_norm": 0.67578125, "learning_rate": 5.711111111111112e-06, "loss": 0.7814, "mean_token_accuracy": 0.7884286772459745, "num_tokens": 30932387.0, "step": 2230 }, { "entropy": 0.8184869319200516, "epoch": 1.792, "grad_norm": 0.65234375, "learning_rate": 5.637037037037037e-06, "loss": 0.7685, "mean_token_accuracy": 0.7903404340147973, "num_tokens": 31070827.0, "step": 2240 }, { "entropy": 0.8448216756805778, "epoch": 1.8, "grad_norm": 0.703125, "learning_rate": 5.562962962962963e-06, "loss": 0.795, "mean_token_accuracy": 0.7889822244644165, "num_tokens": 31206114.0, "step": 2250 }, { "entropy": 0.8201512550935149, "epoch": 1.808, "grad_norm": 0.6171875, "learning_rate": 5.4888888888888895e-06, "loss": 0.7685, "mean_token_accuracy": 0.7918681055307388, "num_tokens": 31346941.0, "step": 2260 }, { "entropy": 0.8058544466271996, "epoch": 1.8159999999999998, "grad_norm": 0.7421875, "learning_rate": 5.414814814814815e-06, "loss": 0.7494, "mean_token_accuracy": 0.79769425727427, "num_tokens": 31481643.0, "step": 2270 }, { "entropy": 0.793770793452859, "epoch": 1.8239999999999998, "grad_norm": 0.59765625, "learning_rate": 5.340740740740741e-06, "loss": 0.7326, "mean_token_accuracy": 0.7983124267309905, "num_tokens": 31618322.0, "step": 2280 }, { "entropy": 0.8382254412397743, "epoch": 1.8319999999999999, "grad_norm": 0.69921875, "learning_rate": 5.2666666666666665e-06, "loss": 0.8064, "mean_token_accuracy": 0.7901876337826252, "num_tokens": 31758767.0, "step": 2290 }, { "entropy": 0.7901408698409795, "epoch": 1.8399999999999999, "grad_norm": 0.63671875, "learning_rate": 5.192592592592594e-06, "loss": 0.7398, "mean_token_accuracy": 0.7980923440307379, "num_tokens": 31892057.0, "step": 2300 }, { "epoch": 1.8399999999999999, "eval_biology_entropy": 2.848859769821167, "eval_biology_loss": 3.039982318878174, "eval_biology_mean_token_accuracy": 0.47137124490737914, "eval_biology_num_tokens": 31892057.0, "eval_biology_runtime": 27.0702, "eval_biology_samples_per_second": 18.47, "eval_biology_steps_per_second": 9.235, "step": 2300 }, { "epoch": 1.8399999999999999, "eval_chemistry_entropy": 1.603529981136322, "eval_chemistry_loss": 1.6258869171142578, "eval_chemistry_mean_token_accuracy": 0.6530722501277924, "eval_chemistry_num_tokens": 31892057.0, "eval_chemistry_runtime": 34.5798, "eval_chemistry_samples_per_second": 14.459, "eval_chemistry_steps_per_second": 7.23, "step": 2300 }, { "epoch": 1.8399999999999999, "eval_math_entropy": 0.8474580272436142, "eval_math_loss": 0.8221175670623779, "eval_math_mean_token_accuracy": 0.7854107286930084, "eval_math_num_tokens": 31892057.0, "eval_math_runtime": 35.1096, "eval_math_samples_per_second": 14.241, "eval_math_steps_per_second": 7.121, "step": 2300 }, { "epoch": 1.8399999999999999, "eval_physics_entropy": 1.252752271413803, "eval_physics_loss": 1.2421237230300903, "eval_physics_mean_token_accuracy": 0.7164910452365876, "eval_physics_num_tokens": 31892057.0, "eval_physics_runtime": 40.6776, "eval_physics_samples_per_second": 12.292, "eval_physics_steps_per_second": 6.146, "step": 2300 }, { "entropy": 0.8583742175251245, "epoch": 1.8479999999999999, "grad_norm": 0.76953125, "learning_rate": 5.1185185185185195e-06, "loss": 0.8006, "mean_token_accuracy": 0.7850713964551688, "num_tokens": 32028586.0, "step": 2310 }, { "entropy": 0.8119469789788127, "epoch": 1.8559999999999999, "grad_norm": 0.6875, "learning_rate": 5.044444444444445e-06, "loss": 0.7663, "mean_token_accuracy": 0.7947797764092683, "num_tokens": 32172918.0, "step": 2320 }, { "entropy": 0.817203932069242, "epoch": 1.8639999999999999, "grad_norm": 0.70703125, "learning_rate": 4.970370370370371e-06, "loss": 0.7572, "mean_token_accuracy": 0.7951970502734185, "num_tokens": 32312404.0, "step": 2330 }, { "entropy": 0.8136437116190791, "epoch": 1.8719999999999999, "grad_norm": 0.6796875, "learning_rate": 4.8962962962962965e-06, "loss": 0.7639, "mean_token_accuracy": 0.7948971126228571, "num_tokens": 32453087.0, "step": 2340 }, { "entropy": 0.8080563105642795, "epoch": 1.88, "grad_norm": 0.6640625, "learning_rate": 4.822222222222222e-06, "loss": 0.7639, "mean_token_accuracy": 0.7945172045379877, "num_tokens": 32590700.0, "step": 2350 }, { "entropy": 0.8381484400480985, "epoch": 1.888, "grad_norm": 0.70703125, "learning_rate": 4.748148148148149e-06, "loss": 0.7845, "mean_token_accuracy": 0.7886030592024327, "num_tokens": 32723969.0, "step": 2360 }, { "entropy": 0.8534527840092778, "epoch": 1.896, "grad_norm": 0.66015625, "learning_rate": 4.674074074074074e-06, "loss": 0.797, "mean_token_accuracy": 0.7858456280082464, "num_tokens": 32857979.0, "step": 2370 }, { "entropy": 0.8192885598167777, "epoch": 1.904, "grad_norm": 0.6953125, "learning_rate": 4.600000000000001e-06, "loss": 0.7665, "mean_token_accuracy": 0.7926325775682926, "num_tokens": 32991800.0, "step": 2380 }, { "entropy": 0.8101921185851098, "epoch": 1.912, "grad_norm": 0.68359375, "learning_rate": 4.5259259259259265e-06, "loss": 0.7506, "mean_token_accuracy": 0.7983851782977581, "num_tokens": 33127246.0, "step": 2390 }, { "entropy": 0.8159189112484455, "epoch": 1.92, "grad_norm": 0.62109375, "learning_rate": 4.451851851851852e-06, "loss": 0.7728, "mean_token_accuracy": 0.791253462433815, "num_tokens": 33260189.0, "step": 2400 }, { "epoch": 1.92, "eval_biology_entropy": 2.8435637803077696, "eval_biology_loss": 3.0387935638427734, "eval_biology_mean_token_accuracy": 0.47165171790122984, "eval_biology_num_tokens": 33260189.0, "eval_biology_runtime": 26.981, "eval_biology_samples_per_second": 18.532, "eval_biology_steps_per_second": 9.266, "step": 2400 }, { "epoch": 1.92, "eval_chemistry_entropy": 1.6004922416210174, "eval_chemistry_loss": 1.625298023223877, "eval_chemistry_mean_token_accuracy": 0.653136913061142, "eval_chemistry_num_tokens": 33260189.0, "eval_chemistry_runtime": 34.3626, "eval_chemistry_samples_per_second": 14.551, "eval_chemistry_steps_per_second": 7.275, "step": 2400 }, { "epoch": 1.92, "eval_math_entropy": 0.846027658700943, "eval_math_loss": 0.8219407200813293, "eval_math_mean_token_accuracy": 0.7854623227119446, "eval_math_num_tokens": 33260189.0, "eval_math_runtime": 34.9028, "eval_math_samples_per_second": 14.325, "eval_math_steps_per_second": 7.163, "step": 2400 }, { "epoch": 1.92, "eval_physics_entropy": 1.2505494358539582, "eval_physics_loss": 1.241852045059204, "eval_physics_mean_token_accuracy": 0.7165833852291107, "eval_physics_num_tokens": 33260189.0, "eval_physics_runtime": 40.4223, "eval_physics_samples_per_second": 12.369, "eval_physics_steps_per_second": 6.185, "step": 2400 }, { "entropy": 0.82371318154037, "epoch": 1.928, "grad_norm": 0.640625, "learning_rate": 4.377777777777778e-06, "loss": 0.7639, "mean_token_accuracy": 0.7929469864815474, "num_tokens": 33397805.0, "step": 2410 }, { "entropy": 0.8550658386200667, "epoch": 1.936, "grad_norm": 0.68359375, "learning_rate": 4.3037037037037035e-06, "loss": 0.8053, "mean_token_accuracy": 0.7845841065049172, "num_tokens": 33532547.0, "step": 2420 }, { "entropy": 0.8324382711201906, "epoch": 1.944, "grad_norm": 0.640625, "learning_rate": 4.22962962962963e-06, "loss": 0.7781, "mean_token_accuracy": 0.7913172505795956, "num_tokens": 33666839.0, "step": 2430 }, { "entropy": 0.8015188897028566, "epoch": 1.952, "grad_norm": 0.64453125, "learning_rate": 4.155555555555556e-06, "loss": 0.742, "mean_token_accuracy": 0.797937859967351, "num_tokens": 33801790.0, "step": 2440 }, { "entropy": 0.8526428388431668, "epoch": 1.96, "grad_norm": 0.7421875, "learning_rate": 4.081481481481482e-06, "loss": 0.804, "mean_token_accuracy": 0.7881749698892235, "num_tokens": 33941905.0, "step": 2450 }, { "entropy": 0.8375929415225982, "epoch": 1.968, "grad_norm": 0.98828125, "learning_rate": 4.007407407407408e-06, "loss": 0.8143, "mean_token_accuracy": 0.7871783085167408, "num_tokens": 34082674.0, "step": 2460 }, { "entropy": 0.8121453728526831, "epoch": 1.976, "grad_norm": 0.66015625, "learning_rate": 3.9333333333333335e-06, "loss": 0.7648, "mean_token_accuracy": 0.7940985098481178, "num_tokens": 34225307.0, "step": 2470 }, { "entropy": 0.8036439396440983, "epoch": 1.984, "grad_norm": 0.71484375, "learning_rate": 3.85925925925926e-06, "loss": 0.7563, "mean_token_accuracy": 0.7977584436535835, "num_tokens": 34367218.0, "step": 2480 }, { "entropy": 0.7928735548630357, "epoch": 1.992, "grad_norm": 0.640625, "learning_rate": 3.7851851851851857e-06, "loss": 0.7508, "mean_token_accuracy": 0.7971320513635873, "num_tokens": 34518929.0, "step": 2490 }, { "entropy": 0.8184740727767348, "epoch": 2.0, "grad_norm": 0.62109375, "learning_rate": 3.7111111111111113e-06, "loss": 0.7825, "mean_token_accuracy": 0.7948230810463428, "num_tokens": 34658640.0, "step": 2500 }, { "epoch": 2.0, "eval_biology_entropy": 2.8442644810676576, "eval_biology_loss": 3.0393142700195312, "eval_biology_mean_token_accuracy": 0.4714907593727112, "eval_biology_num_tokens": 34658640.0, "eval_biology_runtime": 1198.4619, "eval_biology_samples_per_second": 0.417, "eval_biology_steps_per_second": 0.209, "step": 2500 }, { "epoch": 2.0, "eval_chemistry_entropy": 1.6002152450084686, "eval_chemistry_loss": 1.625183343887329, "eval_chemistry_mean_token_accuracy": 0.6531649433374405, "eval_chemistry_num_tokens": 34658640.0, "eval_chemistry_runtime": 34.4638, "eval_chemistry_samples_per_second": 14.508, "eval_chemistry_steps_per_second": 7.254, "step": 2500 }, { "epoch": 2.0, "eval_math_entropy": 0.8454342411756516, "eval_math_loss": 0.8218699097633362, "eval_math_mean_token_accuracy": 0.7852884840965271, "eval_math_num_tokens": 34658640.0, "eval_math_runtime": 34.893, "eval_math_samples_per_second": 14.33, "eval_math_steps_per_second": 7.165, "step": 2500 }, { "epoch": 2.0, "eval_physics_entropy": 1.2499676442146301, "eval_physics_loss": 1.241803765296936, "eval_physics_mean_token_accuracy": 0.7166225876808167, "eval_physics_num_tokens": 34658640.0, "eval_physics_runtime": 40.4558, "eval_physics_samples_per_second": 12.359, "eval_physics_steps_per_second": 6.18, "step": 2500 } ], "logging_steps": 10, "max_steps": 3000, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.820575488969769e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }