{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.2, "eval_steps": 100, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.1780160423368216, "epoch": 0.016, "grad_norm": 13.0, "learning_rate": 6.000000000000001e-07, "loss": 1.8406, "mean_token_accuracy": 0.6489301804453135, "num_tokens": 340696.0, "step": 10 }, { "entropy": 1.1818634796887637, "epoch": 0.032, "grad_norm": 11.5625, "learning_rate": 1.2666666666666669e-06, "loss": 1.8191, "mean_token_accuracy": 0.6528046734631061, "num_tokens": 675930.0, "step": 20 }, { "entropy": 1.195955842360854, "epoch": 0.048, "grad_norm": 9.3125, "learning_rate": 1.9333333333333336e-06, "loss": 1.7817, "mean_token_accuracy": 0.6578715395182371, "num_tokens": 1007956.0, "step": 30 }, { "entropy": 1.2672166559845208, "epoch": 0.064, "grad_norm": 6.5625, "learning_rate": 2.6e-06, "loss": 1.7272, "mean_token_accuracy": 0.6625342659652234, "num_tokens": 1340560.0, "step": 40 }, { "entropy": 1.2964693307876587, "epoch": 0.08, "grad_norm": 5.0, "learning_rate": 3.266666666666667e-06, "loss": 1.5867, "mean_token_accuracy": 0.6811951555311679, "num_tokens": 1679432.0, "step": 50 }, { "entropy": 1.2539724007248878, "epoch": 0.096, "grad_norm": 3.375, "learning_rate": 3.9333333333333335e-06, "loss": 1.3962, "mean_token_accuracy": 0.7081016473472118, "num_tokens": 2021570.0, "step": 60 }, { "entropy": 1.2302619956433773, "epoch": 0.112, "grad_norm": 2.28125, "learning_rate": 4.600000000000001e-06, "loss": 1.2974, "mean_token_accuracy": 0.7239668637514114, "num_tokens": 2359297.0, "step": 70 }, { "entropy": 1.169797332212329, "epoch": 0.128, "grad_norm": 1.875, "learning_rate": 5.2666666666666665e-06, "loss": 1.2306, "mean_token_accuracy": 0.7328833002597094, "num_tokens": 2683168.0, "step": 80 }, { "entropy": 1.0165224198251963, "epoch": 0.144, "grad_norm": 1.59375, "learning_rate": 5.933333333333335e-06, "loss": 1.1129, "mean_token_accuracy": 0.7561763934791088, "num_tokens": 3019426.0, "step": 90 }, { "entropy": 0.9457759071141482, "epoch": 0.16, "grad_norm": 1.546875, "learning_rate": 6.600000000000001e-06, "loss": 1.0448, "mean_token_accuracy": 0.7662712432444095, "num_tokens": 3354390.0, "step": 100 }, { "epoch": 0.16, "eval_biology_entropy": 2.4009983863830566, "eval_biology_loss": 3.090766429901123, "eval_biology_mean_token_accuracy": 0.5075433547496796, "eval_biology_num_tokens": 3354390.0, "eval_biology_runtime": 38.801, "eval_biology_samples_per_second": 12.886, "eval_biology_steps_per_second": 3.222, "step": 100 }, { "epoch": 0.16, "eval_chemistry_entropy": 1.1818295245170594, "eval_chemistry_loss": 1.4003204107284546, "eval_chemistry_mean_token_accuracy": 0.7158680257797241, "eval_chemistry_num_tokens": 3354390.0, "eval_chemistry_runtime": 48.2819, "eval_chemistry_samples_per_second": 10.356, "eval_chemistry_steps_per_second": 2.589, "step": 100 }, { "epoch": 0.16, "eval_math_entropy": 0.8902422695159912, "eval_math_loss": 1.2323389053344727, "eval_math_mean_token_accuracy": 0.750218500137329, "eval_math_num_tokens": 3354390.0, "eval_math_runtime": 49.6484, "eval_math_samples_per_second": 10.071, "eval_math_steps_per_second": 2.518, "step": 100 }, { "epoch": 0.16, "eval_physics_entropy": 0.9318590335845948, "eval_physics_loss": 1.0569193363189697, "eval_physics_mean_token_accuracy": 0.7675474114418029, "eval_physics_num_tokens": 3354390.0, "eval_physics_runtime": 57.1057, "eval_physics_samples_per_second": 8.756, "eval_physics_steps_per_second": 2.189, "step": 100 }, { "entropy": 0.9224597703665495, "epoch": 0.176, "grad_norm": 1.4921875, "learning_rate": 7.266666666666668e-06, "loss": 1.0192, "mean_token_accuracy": 0.7696165222674608, "num_tokens": 3684374.0, "step": 110 }, { "entropy": 0.8823430232703686, "epoch": 0.192, "grad_norm": 1.125, "learning_rate": 7.933333333333334e-06, "loss": 0.9854, "mean_token_accuracy": 0.7739298477768898, "num_tokens": 4023915.0, "step": 120 }, { "entropy": 0.8393984897062182, "epoch": 0.208, "grad_norm": 1.4375, "learning_rate": 8.6e-06, "loss": 0.9379, "mean_token_accuracy": 0.7841779347509146, "num_tokens": 4352045.0, "step": 130 }, { "entropy": 0.8179828137159347, "epoch": 0.224, "grad_norm": 1.234375, "learning_rate": 9.266666666666667e-06, "loss": 0.9191, "mean_token_accuracy": 0.7869349300861359, "num_tokens": 4681371.0, "step": 140 }, { "entropy": 0.805096386373043, "epoch": 0.24, "grad_norm": 1.09375, "learning_rate": 9.933333333333334e-06, "loss": 0.9012, "mean_token_accuracy": 0.7888233289122581, "num_tokens": 5021784.0, "step": 150 }, { "entropy": 0.798224457167089, "epoch": 0.256, "grad_norm": 1.2109375, "learning_rate": 1.0600000000000002e-05, "loss": 0.8922, "mean_token_accuracy": 0.7903898701071739, "num_tokens": 5367308.0, "step": 160 }, { "entropy": 0.805234762467444, "epoch": 0.272, "grad_norm": 1.3203125, "learning_rate": 1.1266666666666668e-05, "loss": 0.8993, "mean_token_accuracy": 0.7877147275954485, "num_tokens": 5699101.0, "step": 170 }, { "entropy": 0.7946224914863705, "epoch": 0.288, "grad_norm": 1.234375, "learning_rate": 1.1933333333333335e-05, "loss": 0.8976, "mean_token_accuracy": 0.7911442808806897, "num_tokens": 6022837.0, "step": 180 }, { "entropy": 0.7837886592373252, "epoch": 0.304, "grad_norm": 1.5078125, "learning_rate": 1.2600000000000001e-05, "loss": 0.8805, "mean_token_accuracy": 0.7934082143008709, "num_tokens": 6343826.0, "step": 190 }, { "entropy": 0.778077344968915, "epoch": 0.32, "grad_norm": 1.109375, "learning_rate": 1.3266666666666668e-05, "loss": 0.8781, "mean_token_accuracy": 0.7940127164125442, "num_tokens": 6670164.0, "step": 200 }, { "epoch": 0.32, "eval_biology_entropy": 2.1669651889801025, "eval_biology_loss": 2.686201810836792, "eval_biology_mean_token_accuracy": 0.5460970797538758, "eval_biology_num_tokens": 6670164.0, "eval_biology_runtime": 38.7901, "eval_biology_samples_per_second": 12.89, "eval_biology_steps_per_second": 3.222, "step": 200 }, { "epoch": 0.32, "eval_chemistry_entropy": 0.9981456413269043, "eval_chemistry_loss": 1.1842883825302124, "eval_chemistry_mean_token_accuracy": 0.7467401041984558, "eval_chemistry_num_tokens": 6670164.0, "eval_chemistry_runtime": 48.3154, "eval_chemistry_samples_per_second": 10.349, "eval_chemistry_steps_per_second": 2.587, "step": 200 }, { "epoch": 0.32, "eval_math_entropy": 0.7862319254875183, "eval_math_loss": 1.0977362394332886, "eval_math_mean_token_accuracy": 0.7683374147415161, "eval_math_num_tokens": 6670164.0, "eval_math_runtime": 49.687, "eval_math_samples_per_second": 10.063, "eval_math_steps_per_second": 2.516, "step": 200 }, { "epoch": 0.32, "eval_physics_entropy": 0.7679825134277344, "eval_physics_loss": 0.886142373085022, "eval_physics_mean_token_accuracy": 0.7957743234634399, "eval_physics_num_tokens": 6670164.0, "eval_physics_runtime": 57.1202, "eval_physics_samples_per_second": 8.753, "eval_physics_steps_per_second": 2.188, "step": 200 }, { "entropy": 0.7617403082549572, "epoch": 0.336, "grad_norm": 1.03125, "learning_rate": 1.3933333333333334e-05, "loss": 0.8531, "mean_token_accuracy": 0.7977134332060813, "num_tokens": 7003404.0, "step": 210 }, { "entropy": 0.737130863033235, "epoch": 0.352, "grad_norm": 1.2265625, "learning_rate": 1.46e-05, "loss": 0.8339, "mean_token_accuracy": 0.8024804938584567, "num_tokens": 7342593.0, "step": 220 }, { "entropy": 0.7299130430445075, "epoch": 0.368, "grad_norm": 1.015625, "learning_rate": 1.5266666666666667e-05, "loss": 0.8252, "mean_token_accuracy": 0.805871631577611, "num_tokens": 7675569.0, "step": 230 }, { "entropy": 0.7597114410251379, "epoch": 0.384, "grad_norm": 1.171875, "learning_rate": 1.5933333333333336e-05, "loss": 0.8558, "mean_token_accuracy": 0.7943590730428696, "num_tokens": 8012218.0, "step": 240 }, { "entropy": 0.7247350050136447, "epoch": 0.4, "grad_norm": 1.125, "learning_rate": 1.66e-05, "loss": 0.8193, "mean_token_accuracy": 0.8040182612836361, "num_tokens": 8350120.0, "step": 250 }, { "entropy": 0.7144488081336021, "epoch": 0.416, "grad_norm": 1.0859375, "learning_rate": 1.726666666666667e-05, "loss": 0.7998, "mean_token_accuracy": 0.8067968346178531, "num_tokens": 8688773.0, "step": 260 }, { "entropy": 0.7226355630904436, "epoch": 0.432, "grad_norm": 1.1953125, "learning_rate": 1.7933333333333333e-05, "loss": 0.8125, "mean_token_accuracy": 0.8042349684983492, "num_tokens": 9022094.0, "step": 270 }, { "entropy": 0.6943683221936225, "epoch": 0.448, "grad_norm": 1.03125, "learning_rate": 1.86e-05, "loss": 0.7924, "mean_token_accuracy": 0.8109067149460316, "num_tokens": 9360608.0, "step": 280 }, { "entropy": 0.6898288525640964, "epoch": 0.464, "grad_norm": 1.3046875, "learning_rate": 1.926666666666667e-05, "loss": 0.7814, "mean_token_accuracy": 0.8110810052603483, "num_tokens": 9688896.0, "step": 290 }, { "entropy": 0.6964065950363875, "epoch": 0.48, "grad_norm": 1.125, "learning_rate": 1.9933333333333334e-05, "loss": 0.7896, "mean_token_accuracy": 0.8094950247555971, "num_tokens": 10025864.0, "step": 300 }, { "epoch": 0.48, "eval_biology_entropy": 2.1021561794281007, "eval_biology_loss": 2.546415090560913, "eval_biology_mean_token_accuracy": 0.5616441056728363, "eval_biology_num_tokens": 10025864.0, "eval_biology_runtime": 38.8255, "eval_biology_samples_per_second": 12.878, "eval_biology_steps_per_second": 3.22, "step": 300 }, { "epoch": 0.48, "eval_chemistry_entropy": 0.9578249802589417, "eval_chemistry_loss": 1.1104036569595337, "eval_chemistry_mean_token_accuracy": 0.7582236580848694, "eval_chemistry_num_tokens": 10025864.0, "eval_chemistry_runtime": 48.5149, "eval_chemistry_samples_per_second": 10.306, "eval_chemistry_steps_per_second": 2.577, "step": 300 }, { "epoch": 0.48, "eval_math_entropy": 0.7676350421905518, "eval_math_loss": 1.0549193620681763, "eval_math_mean_token_accuracy": 0.7745008988380432, "eval_math_num_tokens": 10025864.0, "eval_math_runtime": 49.8707, "eval_math_samples_per_second": 10.026, "eval_math_steps_per_second": 2.506, "step": 300 }, { "epoch": 0.48, "eval_physics_entropy": 0.7307665984630585, "eval_physics_loss": 0.8242064118385315, "eval_physics_mean_token_accuracy": 0.8064617681503295, "eval_physics_num_tokens": 10025864.0, "eval_physics_runtime": 57.1693, "eval_physics_samples_per_second": 8.746, "eval_physics_steps_per_second": 2.186, "step": 300 }, { "entropy": 0.7168749757111073, "epoch": 0.496, "grad_norm": 0.9921875, "learning_rate": 1.9933333333333334e-05, "loss": 0.8032, "mean_token_accuracy": 0.8051175128668546, "num_tokens": 10362573.0, "step": 310 }, { "entropy": 0.6993480321019888, "epoch": 0.512, "grad_norm": 1.15625, "learning_rate": 1.985925925925926e-05, "loss": 0.7903, "mean_token_accuracy": 0.8099954195320607, "num_tokens": 10694440.0, "step": 320 }, { "entropy": 0.7138712629675865, "epoch": 0.528, "grad_norm": 1.046875, "learning_rate": 1.9785185185185187e-05, "loss": 0.8101, "mean_token_accuracy": 0.805506169050932, "num_tokens": 11022357.0, "step": 330 }, { "entropy": 0.6928766580298543, "epoch": 0.544, "grad_norm": 1.0546875, "learning_rate": 1.971111111111111e-05, "loss": 0.7916, "mean_token_accuracy": 0.8087165944278241, "num_tokens": 11345911.0, "step": 340 }, { "entropy": 0.682861409150064, "epoch": 0.56, "grad_norm": 0.8984375, "learning_rate": 1.963703703703704e-05, "loss": 0.7735, "mean_token_accuracy": 0.8126827124506235, "num_tokens": 11687225.0, "step": 350 }, { "entropy": 0.6824749782681465, "epoch": 0.576, "grad_norm": 1.015625, "learning_rate": 1.9562962962962964e-05, "loss": 0.7723, "mean_token_accuracy": 0.8146394658833742, "num_tokens": 12015720.0, "step": 360 }, { "entropy": 0.6805058639496565, "epoch": 0.592, "grad_norm": 1.2109375, "learning_rate": 1.948888888888889e-05, "loss": 0.776, "mean_token_accuracy": 0.811661035567522, "num_tokens": 12343393.0, "step": 370 }, { "entropy": 0.6700849516317249, "epoch": 0.608, "grad_norm": 0.9921875, "learning_rate": 1.9414814814814817e-05, "loss": 0.7658, "mean_token_accuracy": 0.8141583666205406, "num_tokens": 12671474.0, "step": 380 }, { "entropy": 0.6893410481512546, "epoch": 0.624, "grad_norm": 0.98046875, "learning_rate": 1.9340740740740743e-05, "loss": 0.7782, "mean_token_accuracy": 0.8111887093633413, "num_tokens": 12996077.0, "step": 390 }, { "entropy": 0.6852772971615195, "epoch": 0.64, "grad_norm": 0.9609375, "learning_rate": 1.926666666666667e-05, "loss": 0.7799, "mean_token_accuracy": 0.8112590182572603, "num_tokens": 13325291.0, "step": 400 }, { "epoch": 0.64, "eval_biology_entropy": 2.0124985208511355, "eval_biology_loss": 2.4155702590942383, "eval_biology_mean_token_accuracy": 0.5735694291591644, "eval_biology_num_tokens": 13325291.0, "eval_biology_runtime": 38.6667, "eval_biology_samples_per_second": 12.931, "eval_biology_steps_per_second": 3.233, "step": 400 }, { "epoch": 0.64, "eval_chemistry_entropy": 0.8911373369693756, "eval_chemistry_loss": 1.0692609548568726, "eval_chemistry_mean_token_accuracy": 0.7647108516693115, "eval_chemistry_num_tokens": 13325291.0, "eval_chemistry_runtime": 48.2089, "eval_chemistry_samples_per_second": 10.372, "eval_chemistry_steps_per_second": 2.593, "step": 400 }, { "epoch": 0.64, "eval_math_entropy": 0.7330298454761505, "eval_math_loss": 1.0336663722991943, "eval_math_mean_token_accuracy": 0.7786638278961182, "eval_math_num_tokens": 13325291.0, "eval_math_runtime": 49.7905, "eval_math_samples_per_second": 10.042, "eval_math_steps_per_second": 2.511, "step": 400 }, { "epoch": 0.64, "eval_physics_entropy": 0.681670261144638, "eval_physics_loss": 0.7883204817771912, "eval_physics_mean_token_accuracy": 0.8129381031990052, "eval_physics_num_tokens": 13325291.0, "eval_physics_runtime": 57.2563, "eval_physics_samples_per_second": 8.733, "eval_physics_steps_per_second": 2.183, "step": 400 }, { "entropy": 0.6723099524155259, "epoch": 0.656, "grad_norm": 0.953125, "learning_rate": 1.9192592592592593e-05, "loss": 0.757, "mean_token_accuracy": 0.8151150114834309, "num_tokens": 13671434.0, "step": 410 }, { "entropy": 0.6621538577601314, "epoch": 0.672, "grad_norm": 1.0234375, "learning_rate": 1.911851851851852e-05, "loss": 0.7563, "mean_token_accuracy": 0.8172798678278923, "num_tokens": 13993668.0, "step": 420 }, { "entropy": 0.6673029117286206, "epoch": 0.688, "grad_norm": 0.921875, "learning_rate": 1.9044444444444446e-05, "loss": 0.7529, "mean_token_accuracy": 0.8164214458316564, "num_tokens": 14334907.0, "step": 430 }, { "entropy": 0.6810694945976138, "epoch": 0.704, "grad_norm": 1.046875, "learning_rate": 1.8970370370370372e-05, "loss": 0.7695, "mean_token_accuracy": 0.8127802673727273, "num_tokens": 14670639.0, "step": 440 }, { "entropy": 0.6582699194550514, "epoch": 0.72, "grad_norm": 1.0, "learning_rate": 1.8896296296296295e-05, "loss": 0.7544, "mean_token_accuracy": 0.8171826928853989, "num_tokens": 14987995.0, "step": 450 }, { "entropy": 0.6564427128061652, "epoch": 0.736, "grad_norm": 1.078125, "learning_rate": 1.8822222222222225e-05, "loss": 0.7446, "mean_token_accuracy": 0.8173684533685446, "num_tokens": 15324659.0, "step": 460 }, { "entropy": 0.6495672106742859, "epoch": 0.752, "grad_norm": 0.94921875, "learning_rate": 1.874814814814815e-05, "loss": 0.7416, "mean_token_accuracy": 0.8195757914334536, "num_tokens": 15657342.0, "step": 470 }, { "entropy": 0.6738376861438156, "epoch": 0.768, "grad_norm": 0.95703125, "learning_rate": 1.8674074074074075e-05, "loss": 0.7642, "mean_token_accuracy": 0.8124835971742869, "num_tokens": 15988993.0, "step": 480 }, { "entropy": 0.6495399951934815, "epoch": 0.784, "grad_norm": 0.9609375, "learning_rate": 1.86e-05, "loss": 0.7324, "mean_token_accuracy": 0.8208612345159054, "num_tokens": 16333857.0, "step": 490 }, { "entropy": 0.6567428983747959, "epoch": 0.8, "grad_norm": 0.99609375, "learning_rate": 1.8525925925925928e-05, "loss": 0.7488, "mean_token_accuracy": 0.817277068644762, "num_tokens": 16664690.0, "step": 500 }, { "epoch": 0.8, "eval_biology_entropy": 2.039887357711792, "eval_biology_loss": 2.4157471656799316, "eval_biology_mean_token_accuracy": 0.5770127189159393, "eval_biology_num_tokens": 16664690.0, "eval_biology_runtime": 38.6815, "eval_biology_samples_per_second": 12.926, "eval_biology_steps_per_second": 3.232, "step": 500 }, { "epoch": 0.8, "eval_chemistry_entropy": 0.8955893061161041, "eval_chemistry_loss": 1.048737645149231, "eval_chemistry_mean_token_accuracy": 0.768093888759613, "eval_chemistry_num_tokens": 16664690.0, "eval_chemistry_runtime": 48.42, "eval_chemistry_samples_per_second": 10.326, "eval_chemistry_steps_per_second": 2.582, "step": 500 }, { "epoch": 0.8, "eval_math_entropy": 0.7318526220321655, "eval_math_loss": 1.0199034214019775, "eval_math_mean_token_accuracy": 0.7799897599220276, "eval_math_num_tokens": 16664690.0, "eval_math_runtime": 49.8882, "eval_math_samples_per_second": 10.022, "eval_math_steps_per_second": 2.506, "step": 500 }, { "epoch": 0.8, "eval_physics_entropy": 0.6712099099159241, "eval_physics_loss": 0.7689476609230042, "eval_physics_mean_token_accuracy": 0.8163677668571472, "eval_physics_num_tokens": 16664690.0, "eval_physics_runtime": 57.4517, "eval_physics_samples_per_second": 8.703, "eval_physics_steps_per_second": 2.176, "step": 500 }, { "entropy": 0.6573688389733434, "epoch": 0.816, "grad_norm": 0.96875, "learning_rate": 1.8451851851851855e-05, "loss": 0.7477, "mean_token_accuracy": 0.8164514761418105, "num_tokens": 16989307.0, "step": 510 }, { "entropy": 0.6648645078763366, "epoch": 0.832, "grad_norm": 0.9609375, "learning_rate": 1.8377777777777778e-05, "loss": 0.749, "mean_token_accuracy": 0.8170726090669632, "num_tokens": 17317524.0, "step": 520 }, { "entropy": 0.6526606786996126, "epoch": 0.848, "grad_norm": 0.8515625, "learning_rate": 1.8303703703703704e-05, "loss": 0.7388, "mean_token_accuracy": 0.8195879191160202, "num_tokens": 17650424.0, "step": 530 }, { "entropy": 0.6565553491935134, "epoch": 0.864, "grad_norm": 0.8984375, "learning_rate": 1.822962962962963e-05, "loss": 0.7464, "mean_token_accuracy": 0.8177060768008232, "num_tokens": 17985547.0, "step": 540 }, { "entropy": 0.6623865978792309, "epoch": 0.88, "grad_norm": 0.9453125, "learning_rate": 1.8155555555555557e-05, "loss": 0.7495, "mean_token_accuracy": 0.8162953305989504, "num_tokens": 18315392.0, "step": 550 }, { "entropy": 0.6179373754188419, "epoch": 0.896, "grad_norm": 0.8359375, "learning_rate": 1.8081481481481484e-05, "loss": 0.7023, "mean_token_accuracy": 0.8264330130070447, "num_tokens": 18667124.0, "step": 560 }, { "entropy": 0.6555242039263248, "epoch": 0.912, "grad_norm": 0.8984375, "learning_rate": 1.800740740740741e-05, "loss": 0.7393, "mean_token_accuracy": 0.8174499638378621, "num_tokens": 19002186.0, "step": 570 }, { "entropy": 0.6485247412696481, "epoch": 0.928, "grad_norm": 0.98046875, "learning_rate": 1.7933333333333333e-05, "loss": 0.7363, "mean_token_accuracy": 0.8195535041391849, "num_tokens": 19335013.0, "step": 580 }, { "entropy": 0.63772834520787, "epoch": 0.944, "grad_norm": 0.984375, "learning_rate": 1.785925925925926e-05, "loss": 0.7269, "mean_token_accuracy": 0.821834321692586, "num_tokens": 19662733.0, "step": 590 }, { "entropy": 0.6372630735859275, "epoch": 0.96, "grad_norm": 0.99609375, "learning_rate": 1.7785185185185186e-05, "loss": 0.7318, "mean_token_accuracy": 0.8204116970300674, "num_tokens": 19996789.0, "step": 600 }, { "epoch": 0.96, "eval_biology_entropy": 2.0030080833435058, "eval_biology_loss": 2.361283302307129, "eval_biology_mean_token_accuracy": 0.5793048655986786, "eval_biology_num_tokens": 19996789.0, "eval_biology_runtime": 38.6981, "eval_biology_samples_per_second": 12.921, "eval_biology_steps_per_second": 3.23, "step": 600 }, { "epoch": 0.96, "eval_chemistry_entropy": 0.8840513801574708, "eval_chemistry_loss": 1.0323643684387207, "eval_chemistry_mean_token_accuracy": 0.7703397722244263, "eval_chemistry_num_tokens": 19996789.0, "eval_chemistry_runtime": 48.2039, "eval_chemistry_samples_per_second": 10.373, "eval_chemistry_steps_per_second": 2.593, "step": 600 }, { "epoch": 0.96, "eval_math_entropy": 0.7270034260749817, "eval_math_loss": 1.009481430053711, "eval_math_mean_token_accuracy": 0.7820751585960388, "eval_math_num_tokens": 19996789.0, "eval_math_runtime": 49.571, "eval_math_samples_per_second": 10.087, "eval_math_steps_per_second": 2.522, "step": 600 }, { "epoch": 0.96, "eval_physics_entropy": 0.6698693735599518, "eval_physics_loss": 0.7564050555229187, "eval_physics_mean_token_accuracy": 0.8185040464401245, "eval_physics_num_tokens": 19996789.0, "eval_physics_runtime": 56.9517, "eval_physics_samples_per_second": 8.779, "eval_physics_steps_per_second": 2.195, "step": 600 }, { "entropy": 0.6384279150515795, "epoch": 0.976, "grad_norm": 1.0234375, "learning_rate": 1.7711111111111113e-05, "loss": 0.7283, "mean_token_accuracy": 0.82187210470438, "num_tokens": 20332636.0, "step": 610 }, { "entropy": 0.6450063675642014, "epoch": 0.992, "grad_norm": 0.8515625, "learning_rate": 1.763703703703704e-05, "loss": 0.7272, "mean_token_accuracy": 0.8209770727902651, "num_tokens": 20662209.0, "step": 620 }, { "entropy": 0.6167608626186848, "epoch": 1.008, "grad_norm": 0.8984375, "learning_rate": 1.7562962962962962e-05, "loss": 0.7008, "mean_token_accuracy": 0.825862829759717, "num_tokens": 21005731.0, "step": 630 }, { "entropy": 0.6117491278797388, "epoch": 1.024, "grad_norm": 0.93359375, "learning_rate": 1.7488888888888892e-05, "loss": 0.7058, "mean_token_accuracy": 0.8266724064946175, "num_tokens": 21335407.0, "step": 640 }, { "entropy": 0.6068670526146889, "epoch": 1.04, "grad_norm": 0.97265625, "learning_rate": 1.7414814814814815e-05, "loss": 0.6989, "mean_token_accuracy": 0.8283324401825667, "num_tokens": 21668490.0, "step": 650 }, { "entropy": 0.601451874896884, "epoch": 1.056, "grad_norm": 0.95703125, "learning_rate": 1.7340740740740742e-05, "loss": 0.6864, "mean_token_accuracy": 0.829675118252635, "num_tokens": 22006039.0, "step": 660 }, { "entropy": 0.6137524953112006, "epoch": 1.072, "grad_norm": 0.94921875, "learning_rate": 1.726666666666667e-05, "loss": 0.7008, "mean_token_accuracy": 0.8262542523443699, "num_tokens": 22348206.0, "step": 670 }, { "entropy": 0.6093455260619521, "epoch": 1.088, "grad_norm": 0.96875, "learning_rate": 1.7192592592592595e-05, "loss": 0.6956, "mean_token_accuracy": 0.8260304640978575, "num_tokens": 22676163.0, "step": 680 }, { "entropy": 0.6026462253183127, "epoch": 1.104, "grad_norm": 0.87109375, "learning_rate": 1.711851851851852e-05, "loss": 0.6967, "mean_token_accuracy": 0.8273924000561237, "num_tokens": 23009279.0, "step": 690 }, { "entropy": 0.6122452523559332, "epoch": 1.12, "grad_norm": 0.9453125, "learning_rate": 1.7044444444444445e-05, "loss": 0.7, "mean_token_accuracy": 0.8257350366562605, "num_tokens": 23346828.0, "step": 700 }, { "epoch": 1.12, "eval_biology_entropy": 1.8616097135543823, "eval_biology_loss": 2.4014246463775635, "eval_biology_mean_token_accuracy": 0.578062358379364, "eval_biology_num_tokens": 23346828.0, "eval_biology_runtime": 38.9662, "eval_biology_samples_per_second": 12.832, "eval_biology_steps_per_second": 3.208, "step": 700 }, { "epoch": 1.12, "eval_chemistry_entropy": 0.8116880433559418, "eval_chemistry_loss": 1.0317949056625366, "eval_chemistry_mean_token_accuracy": 0.7715164208412171, "eval_chemistry_num_tokens": 23346828.0, "eval_chemistry_runtime": 48.496, "eval_chemistry_samples_per_second": 10.31, "eval_chemistry_steps_per_second": 2.578, "step": 700 }, { "epoch": 1.12, "eval_math_entropy": 0.6895896532535553, "eval_math_loss": 1.0153776407241821, "eval_math_mean_token_accuracy": 0.7820956745147705, "eval_math_num_tokens": 23346828.0, "eval_math_runtime": 49.7815, "eval_math_samples_per_second": 10.044, "eval_math_steps_per_second": 2.511, "step": 700 }, { "epoch": 1.12, "eval_physics_entropy": 0.6192426791191101, "eval_physics_loss": 0.7497905492782593, "eval_physics_mean_token_accuracy": 0.8200045080184937, "eval_physics_num_tokens": 23346828.0, "eval_physics_runtime": 57.1287, "eval_physics_samples_per_second": 8.752, "eval_physics_steps_per_second": 2.188, "step": 700 }, { "entropy": 0.588855667039752, "epoch": 1.1360000000000001, "grad_norm": 0.9375, "learning_rate": 1.697037037037037e-05, "loss": 0.6772, "mean_token_accuracy": 0.8326364874839782, "num_tokens": 23689161.0, "step": 710 }, { "entropy": 0.6099351227283478, "epoch": 1.152, "grad_norm": 1.0390625, "learning_rate": 1.6896296296296298e-05, "loss": 0.7039, "mean_token_accuracy": 0.8282962709665298, "num_tokens": 24016439.0, "step": 720 }, { "entropy": 0.6140455640852451, "epoch": 1.168, "grad_norm": 0.94921875, "learning_rate": 1.6822222222222224e-05, "loss": 0.7027, "mean_token_accuracy": 0.8253309100866317, "num_tokens": 24344431.0, "step": 730 }, { "entropy": 0.6041248327121138, "epoch": 1.184, "grad_norm": 1.09375, "learning_rate": 1.6748148148148147e-05, "loss": 0.6955, "mean_token_accuracy": 0.8286070462316275, "num_tokens": 24668087.0, "step": 740 }, { "entropy": 0.5895567566156388, "epoch": 1.2, "grad_norm": 0.9140625, "learning_rate": 1.6674074074074077e-05, "loss": 0.6822, "mean_token_accuracy": 0.8305550657212735, "num_tokens": 25000629.0, "step": 750 }, { "entropy": 0.6073450578376651, "epoch": 1.216, "grad_norm": 1.1875, "learning_rate": 1.66e-05, "loss": 0.7023, "mean_token_accuracy": 0.8261869914829731, "num_tokens": 25324324.0, "step": 760 }, { "entropy": 0.6059388216584921, "epoch": 1.232, "grad_norm": 0.9453125, "learning_rate": 1.6525925925925927e-05, "loss": 0.6863, "mean_token_accuracy": 0.8298216536641121, "num_tokens": 25658795.0, "step": 770 }, { "entropy": 0.5957553267478943, "epoch": 1.248, "grad_norm": 0.92578125, "learning_rate": 1.6451851851851853e-05, "loss": 0.6864, "mean_token_accuracy": 0.830686765909195, "num_tokens": 25993177.0, "step": 780 }, { "entropy": 0.5959657493978738, "epoch": 1.264, "grad_norm": 1.0234375, "learning_rate": 1.637777777777778e-05, "loss": 0.6826, "mean_token_accuracy": 0.8305899318307638, "num_tokens": 26329726.0, "step": 790 }, { "entropy": 0.6204709148034453, "epoch": 1.28, "grad_norm": 1.0078125, "learning_rate": 1.6303703703703706e-05, "loss": 0.7071, "mean_token_accuracy": 0.8238335218280554, "num_tokens": 26664093.0, "step": 800 }, { "epoch": 1.28, "eval_biology_entropy": 1.7922346639633178, "eval_biology_loss": 2.4233291149139404, "eval_biology_mean_token_accuracy": 0.5778127768039704, "eval_biology_num_tokens": 26664093.0, "eval_biology_runtime": 39.1057, "eval_biology_samples_per_second": 12.786, "eval_biology_steps_per_second": 3.196, "step": 800 }, { "epoch": 1.28, "eval_chemistry_entropy": 0.7962674243450165, "eval_chemistry_loss": 1.0286256074905396, "eval_chemistry_mean_token_accuracy": 0.7727626013755798, "eval_chemistry_num_tokens": 26664093.0, "eval_chemistry_runtime": 48.3131, "eval_chemistry_samples_per_second": 10.349, "eval_chemistry_steps_per_second": 2.587, "step": 800 }, { "epoch": 1.28, "eval_math_entropy": 0.6702238636016846, "eval_math_loss": 1.0157709121704102, "eval_math_mean_token_accuracy": 0.7829344019889831, "eval_math_num_tokens": 26664093.0, "eval_math_runtime": 49.7647, "eval_math_samples_per_second": 10.047, "eval_math_steps_per_second": 2.512, "step": 800 }, { "epoch": 1.28, "eval_physics_entropy": 0.6072953283786774, "eval_physics_loss": 0.7434503436088562, "eval_physics_mean_token_accuracy": 0.8212288799285888, "eval_physics_num_tokens": 26664093.0, "eval_physics_runtime": 57.1238, "eval_physics_samples_per_second": 8.753, "eval_physics_steps_per_second": 2.188, "step": 800 }, { "entropy": 0.593550406768918, "epoch": 1.296, "grad_norm": 0.984375, "learning_rate": 1.622962962962963e-05, "loss": 0.6825, "mean_token_accuracy": 0.8320997886359691, "num_tokens": 26994877.0, "step": 810 }, { "entropy": 0.6156477816402912, "epoch": 1.312, "grad_norm": 1.046875, "learning_rate": 1.6155555555555556e-05, "loss": 0.703, "mean_token_accuracy": 0.8259445391595364, "num_tokens": 27328198.0, "step": 820 }, { "entropy": 0.5969830378890038, "epoch": 1.328, "grad_norm": 1.0078125, "learning_rate": 1.6081481481481482e-05, "loss": 0.6901, "mean_token_accuracy": 0.8290002550929785, "num_tokens": 27658818.0, "step": 830 }, { "entropy": 0.5997996777296066, "epoch": 1.3439999999999999, "grad_norm": 1.1015625, "learning_rate": 1.600740740740741e-05, "loss": 0.6904, "mean_token_accuracy": 0.8284455709159374, "num_tokens": 27991108.0, "step": 840 }, { "entropy": 0.6052390130236744, "epoch": 1.3599999999999999, "grad_norm": 0.94921875, "learning_rate": 1.5933333333333336e-05, "loss": 0.6852, "mean_token_accuracy": 0.8274706263095141, "num_tokens": 28327775.0, "step": 850 }, { "entropy": 0.6018602728843689, "epoch": 1.376, "grad_norm": 1.109375, "learning_rate": 1.5859259259259262e-05, "loss": 0.6827, "mean_token_accuracy": 0.8304568257182836, "num_tokens": 28666116.0, "step": 860 }, { "entropy": 0.5851238770410419, "epoch": 1.392, "grad_norm": 0.98828125, "learning_rate": 1.5785185185185185e-05, "loss": 0.6746, "mean_token_accuracy": 0.8333170894533396, "num_tokens": 29004293.0, "step": 870 }, { "entropy": 0.58795285820961, "epoch": 1.408, "grad_norm": 0.96875, "learning_rate": 1.571111111111111e-05, "loss": 0.6857, "mean_token_accuracy": 0.8305332105606794, "num_tokens": 29330131.0, "step": 880 }, { "entropy": 0.6189510561525822, "epoch": 1.424, "grad_norm": 0.90234375, "learning_rate": 1.5637037037037038e-05, "loss": 0.7072, "mean_token_accuracy": 0.8251793116331101, "num_tokens": 29661850.0, "step": 890 }, { "entropy": 0.5798567572608591, "epoch": 1.44, "grad_norm": 0.90625, "learning_rate": 1.5562962962962965e-05, "loss": 0.6695, "mean_token_accuracy": 0.8342564977705479, "num_tokens": 29999392.0, "step": 900 }, { "epoch": 1.44, "eval_biology_entropy": 1.8147089042663573, "eval_biology_loss": 2.417410135269165, "eval_biology_mean_token_accuracy": 0.5787473826408386, "eval_biology_num_tokens": 29999392.0, "eval_biology_runtime": 38.9072, "eval_biology_samples_per_second": 12.851, "eval_biology_steps_per_second": 3.213, "step": 900 }, { "epoch": 1.44, "eval_chemistry_entropy": 0.7944381227493286, "eval_chemistry_loss": 1.0228049755096436, "eval_chemistry_mean_token_accuracy": 0.7732948322296143, "eval_chemistry_num_tokens": 29999392.0, "eval_chemistry_runtime": 48.4429, "eval_chemistry_samples_per_second": 10.321, "eval_chemistry_steps_per_second": 2.58, "step": 900 }, { "epoch": 1.44, "eval_math_entropy": 0.6757729785442352, "eval_math_loss": 1.0138108730316162, "eval_math_mean_token_accuracy": 0.7829539861679077, "eval_math_num_tokens": 29999392.0, "eval_math_runtime": 49.813, "eval_math_samples_per_second": 10.038, "eval_math_steps_per_second": 2.509, "step": 900 }, { "epoch": 1.44, "eval_physics_entropy": 0.607404308795929, "eval_physics_loss": 0.737089991569519, "eval_physics_mean_token_accuracy": 0.8226114134788514, "eval_physics_num_tokens": 29999392.0, "eval_physics_runtime": 57.3115, "eval_physics_samples_per_second": 8.724, "eval_physics_steps_per_second": 2.181, "step": 900 }, { "entropy": 0.5827234297990799, "epoch": 1.456, "grad_norm": 0.8828125, "learning_rate": 1.548888888888889e-05, "loss": 0.6728, "mean_token_accuracy": 0.8342432040721178, "num_tokens": 30341004.0, "step": 910 }, { "entropy": 0.613773494027555, "epoch": 1.472, "grad_norm": 1.015625, "learning_rate": 1.5414814814814814e-05, "loss": 0.7035, "mean_token_accuracy": 0.8268327709287405, "num_tokens": 30667689.0, "step": 920 }, { "entropy": 0.5988023646175862, "epoch": 1.488, "grad_norm": 0.93359375, "learning_rate": 1.5340740740740744e-05, "loss": 0.6897, "mean_token_accuracy": 0.8293631616979837, "num_tokens": 30999697.0, "step": 930 }, { "entropy": 0.5896453000605106, "epoch": 1.504, "grad_norm": 0.8828125, "learning_rate": 1.5266666666666667e-05, "loss": 0.6728, "mean_token_accuracy": 0.8324251122772693, "num_tokens": 31332775.0, "step": 940 }, { "entropy": 0.605005569756031, "epoch": 1.52, "grad_norm": 1.0390625, "learning_rate": 1.5192592592592594e-05, "loss": 0.6942, "mean_token_accuracy": 0.8275358382612467, "num_tokens": 31666521.0, "step": 950 }, { "entropy": 0.5999037871137262, "epoch": 1.536, "grad_norm": 1.0390625, "learning_rate": 1.5118518518518519e-05, "loss": 0.6904, "mean_token_accuracy": 0.8276842717081309, "num_tokens": 31998232.0, "step": 960 }, { "entropy": 0.5867987772449851, "epoch": 1.552, "grad_norm": 1.0703125, "learning_rate": 1.5044444444444445e-05, "loss": 0.6754, "mean_token_accuracy": 0.8337812848389149, "num_tokens": 32328085.0, "step": 970 }, { "entropy": 0.5942975046113134, "epoch": 1.568, "grad_norm": 1.0, "learning_rate": 1.497037037037037e-05, "loss": 0.6807, "mean_token_accuracy": 0.831771444156766, "num_tokens": 32656603.0, "step": 980 }, { "entropy": 0.6016290852800011, "epoch": 1.584, "grad_norm": 0.98828125, "learning_rate": 1.4896296296296298e-05, "loss": 0.6947, "mean_token_accuracy": 0.8264373868703843, "num_tokens": 32983769.0, "step": 990 }, { "entropy": 0.5999371835961937, "epoch": 1.6, "grad_norm": 0.92578125, "learning_rate": 1.4822222222222225e-05, "loss": 0.6882, "mean_token_accuracy": 0.8293469067662954, "num_tokens": 33309282.0, "step": 1000 }, { "epoch": 1.6, "eval_biology_entropy": 1.788435049057007, "eval_biology_loss": 2.369239330291748, "eval_biology_mean_token_accuracy": 0.582524644613266, "eval_biology_num_tokens": 33309282.0, "eval_biology_runtime": 38.8968, "eval_biology_samples_per_second": 12.855, "eval_biology_steps_per_second": 3.214, "step": 1000 }, { "epoch": 1.6, "eval_chemistry_entropy": 0.7873289968967437, "eval_chemistry_loss": 1.0177081823349, "eval_chemistry_mean_token_accuracy": 0.774118812084198, "eval_chemistry_num_tokens": 33309282.0, "eval_chemistry_runtime": 48.3649, "eval_chemistry_samples_per_second": 10.338, "eval_chemistry_steps_per_second": 2.585, "step": 1000 }, { "epoch": 1.6, "eval_math_entropy": 0.6668027784824372, "eval_math_loss": 1.0126802921295166, "eval_math_mean_token_accuracy": 0.7838437123298645, "eval_math_num_tokens": 33309282.0, "eval_math_runtime": 49.8108, "eval_math_samples_per_second": 10.038, "eval_math_steps_per_second": 2.509, "step": 1000 }, { "epoch": 1.6, "eval_physics_entropy": 0.5990195829868317, "eval_physics_loss": 0.7323749661445618, "eval_physics_mean_token_accuracy": 0.8238483490943909, "eval_physics_num_tokens": 33309282.0, "eval_physics_runtime": 57.3705, "eval_physics_samples_per_second": 8.715, "eval_physics_steps_per_second": 2.179, "step": 1000 }, { "entropy": 0.5766303434967994, "epoch": 1.616, "grad_norm": 1.0, "learning_rate": 1.474814814814815e-05, "loss": 0.6687, "mean_token_accuracy": 0.8350894570350647, "num_tokens": 33647616.0, "step": 1010 }, { "entropy": 0.5906545480713248, "epoch": 1.6320000000000001, "grad_norm": 1.0390625, "learning_rate": 1.4674074074074076e-05, "loss": 0.6798, "mean_token_accuracy": 0.8322980519384146, "num_tokens": 33977586.0, "step": 1020 }, { "entropy": 0.5677616313099861, "epoch": 1.6480000000000001, "grad_norm": 0.98828125, "learning_rate": 1.46e-05, "loss": 0.658, "mean_token_accuracy": 0.8373314294964075, "num_tokens": 34312436.0, "step": 1030 }, { "entropy": 0.5948906594887375, "epoch": 1.6640000000000001, "grad_norm": 1.109375, "learning_rate": 1.4525925925925927e-05, "loss": 0.682, "mean_token_accuracy": 0.8317620534449816, "num_tokens": 34641549.0, "step": 1040 }, { "entropy": 0.5764713797718286, "epoch": 1.6800000000000002, "grad_norm": 0.94921875, "learning_rate": 1.4451851851851852e-05, "loss": 0.6715, "mean_token_accuracy": 0.8332184217870235, "num_tokens": 34977031.0, "step": 1050 }, { "entropy": 0.5952808676287532, "epoch": 1.696, "grad_norm": 0.99609375, "learning_rate": 1.4377777777777779e-05, "loss": 0.6834, "mean_token_accuracy": 0.8309958126395941, "num_tokens": 35300962.0, "step": 1060 }, { "entropy": 0.5965396504849195, "epoch": 1.712, "grad_norm": 1.046875, "learning_rate": 1.4303703703703703e-05, "loss": 0.6786, "mean_token_accuracy": 0.829968997463584, "num_tokens": 35642662.0, "step": 1070 }, { "entropy": 0.5757137715816498, "epoch": 1.728, "grad_norm": 0.9765625, "learning_rate": 1.4229629629629632e-05, "loss": 0.6695, "mean_token_accuracy": 0.833732133358717, "num_tokens": 35980993.0, "step": 1080 }, { "entropy": 0.5878799825906753, "epoch": 1.744, "grad_norm": 1.03125, "learning_rate": 1.4155555555555556e-05, "loss": 0.6717, "mean_token_accuracy": 0.8321238547563553, "num_tokens": 36326797.0, "step": 1090 }, { "entropy": 0.577763288281858, "epoch": 1.76, "grad_norm": 1.015625, "learning_rate": 1.4081481481481483e-05, "loss": 0.6698, "mean_token_accuracy": 0.8338598430156707, "num_tokens": 36654797.0, "step": 1100 }, { "epoch": 1.76, "eval_biology_entropy": 1.76575110912323, "eval_biology_loss": 2.379521369934082, "eval_biology_mean_token_accuracy": 0.5822040309906006, "eval_biology_num_tokens": 36654797.0, "eval_biology_runtime": 38.7201, "eval_biology_samples_per_second": 12.913, "eval_biology_steps_per_second": 3.228, "step": 1100 }, { "epoch": 1.76, "eval_chemistry_entropy": 0.7854319989681244, "eval_chemistry_loss": 1.01557457447052, "eval_chemistry_mean_token_accuracy": 0.7744541010856628, "eval_chemistry_num_tokens": 36654797.0, "eval_chemistry_runtime": 48.2464, "eval_chemistry_samples_per_second": 10.363, "eval_chemistry_steps_per_second": 2.591, "step": 1100 }, { "epoch": 1.76, "eval_math_entropy": 0.6641156001091003, "eval_math_loss": 1.0110862255096436, "eval_math_mean_token_accuracy": 0.783713164806366, "eval_math_num_tokens": 36654797.0, "eval_math_runtime": 49.5907, "eval_math_samples_per_second": 10.083, "eval_math_steps_per_second": 2.521, "step": 1100 }, { "epoch": 1.76, "eval_physics_entropy": 0.5953411047458649, "eval_physics_loss": 0.7290456295013428, "eval_physics_mean_token_accuracy": 0.8248090887069702, "eval_physics_num_tokens": 36654797.0, "eval_physics_runtime": 57.3959, "eval_physics_samples_per_second": 8.711, "eval_physics_steps_per_second": 2.178, "step": 1100 }, { "entropy": 0.5980557221919298, "epoch": 1.776, "grad_norm": 0.95703125, "learning_rate": 1.400740740740741e-05, "loss": 0.6911, "mean_token_accuracy": 0.8282759781926871, "num_tokens": 36989310.0, "step": 1110 }, { "entropy": 0.5839062621816993, "epoch": 1.792, "grad_norm": 1.0859375, "learning_rate": 1.3933333333333334e-05, "loss": 0.6708, "mean_token_accuracy": 0.8331681247800589, "num_tokens": 37318535.0, "step": 1120 }, { "entropy": 0.5731059337034822, "epoch": 1.808, "grad_norm": 1.015625, "learning_rate": 1.385925925925926e-05, "loss": 0.6585, "mean_token_accuracy": 0.8363862674683332, "num_tokens": 37658889.0, "step": 1130 }, { "entropy": 0.5865196855738759, "epoch": 1.8239999999999998, "grad_norm": 0.984375, "learning_rate": 1.3785185185185186e-05, "loss": 0.6698, "mean_token_accuracy": 0.8333104524761439, "num_tokens": 37995198.0, "step": 1140 }, { "entropy": 0.5773211907595396, "epoch": 1.8399999999999999, "grad_norm": 1.0, "learning_rate": 1.3711111111111112e-05, "loss": 0.6726, "mean_token_accuracy": 0.8339518435299397, "num_tokens": 38325542.0, "step": 1150 }, { "entropy": 0.5834932073950767, "epoch": 1.8559999999999999, "grad_norm": 0.97265625, "learning_rate": 1.3637037037037037e-05, "loss": 0.6632, "mean_token_accuracy": 0.8345976937562227, "num_tokens": 38668148.0, "step": 1160 }, { "entropy": 0.5739704865962267, "epoch": 1.8719999999999999, "grad_norm": 0.95703125, "learning_rate": 1.3562962962962965e-05, "loss": 0.6687, "mean_token_accuracy": 0.8333972290158271, "num_tokens": 39001652.0, "step": 1170 }, { "entropy": 0.5951925914734602, "epoch": 1.888, "grad_norm": 1.09375, "learning_rate": 1.3488888888888888e-05, "loss": 0.682, "mean_token_accuracy": 0.8303560864180326, "num_tokens": 39333474.0, "step": 1180 }, { "entropy": 0.6042962603271007, "epoch": 1.904, "grad_norm": 1.0234375, "learning_rate": 1.3414814814814817e-05, "loss": 0.6915, "mean_token_accuracy": 0.8288168527185917, "num_tokens": 39663055.0, "step": 1190 }, { "entropy": 0.601089458540082, "epoch": 1.92, "grad_norm": 1.0234375, "learning_rate": 1.3340740740740741e-05, "loss": 0.6888, "mean_token_accuracy": 0.8287999380379916, "num_tokens": 39989824.0, "step": 1200 }, { "epoch": 1.92, "eval_biology_entropy": 1.759890214920044, "eval_biology_loss": 2.368649959564209, "eval_biology_mean_token_accuracy": 0.5835101284980774, "eval_biology_num_tokens": 39989824.0, "eval_biology_runtime": 38.6774, "eval_biology_samples_per_second": 12.927, "eval_biology_steps_per_second": 3.232, "step": 1200 }, { "epoch": 1.92, "eval_chemistry_entropy": 0.776658641576767, "eval_chemistry_loss": 1.0122178792953491, "eval_chemistry_mean_token_accuracy": 0.7752061448097229, "eval_chemistry_num_tokens": 39989824.0, "eval_chemistry_runtime": 48.1689, "eval_chemistry_samples_per_second": 10.38, "eval_chemistry_steps_per_second": 2.595, "step": 1200 }, { "epoch": 1.92, "eval_math_entropy": 0.6634243364334107, "eval_math_loss": 1.0101300477981567, "eval_math_mean_token_accuracy": 0.7840519022941589, "eval_math_num_tokens": 39989824.0, "eval_math_runtime": 49.5286, "eval_math_samples_per_second": 10.095, "eval_math_steps_per_second": 2.524, "step": 1200 }, { "epoch": 1.92, "eval_physics_entropy": 0.5912484722137451, "eval_physics_loss": 0.7268282175064087, "eval_physics_mean_token_accuracy": 0.825360511302948, "eval_physics_num_tokens": 39989824.0, "eval_physics_runtime": 56.9412, "eval_physics_samples_per_second": 8.781, "eval_physics_steps_per_second": 2.195, "step": 1200 }, { "entropy": 0.5859691947698593, "epoch": 1.936, "grad_norm": 0.96875, "learning_rate": 1.3266666666666668e-05, "loss": 0.676, "mean_token_accuracy": 0.8319578696042299, "num_tokens": 40325070.0, "step": 1210 }, { "entropy": 0.5632861316204071, "epoch": 1.952, "grad_norm": 1.0390625, "learning_rate": 1.3192592592592594e-05, "loss": 0.6508, "mean_token_accuracy": 0.83807716332376, "num_tokens": 40659614.0, "step": 1220 }, { "entropy": 0.5868391951546073, "epoch": 1.968, "grad_norm": 0.91796875, "learning_rate": 1.311851851851852e-05, "loss": 0.6756, "mean_token_accuracy": 0.8323294088244438, "num_tokens": 40991508.0, "step": 1230 }, { "entropy": 0.5867437845095992, "epoch": 1.984, "grad_norm": 1.078125, "learning_rate": 1.3044444444444446e-05, "loss": 0.676, "mean_token_accuracy": 0.8312037277966737, "num_tokens": 41328779.0, "step": 1240 }, { "entropy": 0.593030778504908, "epoch": 2.0, "grad_norm": 0.9765625, "learning_rate": 1.297037037037037e-05, "loss": 0.6752, "mean_token_accuracy": 0.8317596733570098, "num_tokens": 41664296.0, "step": 1250 }, { "entropy": 0.5428176861256361, "epoch": 2.016, "grad_norm": 0.91796875, "learning_rate": 1.2896296296296299e-05, "loss": 0.6304, "mean_token_accuracy": 0.8421510916203261, "num_tokens": 42001193.0, "step": 1260 }, { "entropy": 0.5524626910686493, "epoch": 2.032, "grad_norm": 1.046875, "learning_rate": 1.2822222222222222e-05, "loss": 0.633, "mean_token_accuracy": 0.8409431543201208, "num_tokens": 42340757.0, "step": 1270 }, { "entropy": 0.5630420710891485, "epoch": 2.048, "grad_norm": 1.0625, "learning_rate": 1.274814814814815e-05, "loss": 0.6561, "mean_token_accuracy": 0.8348792966455221, "num_tokens": 42670324.0, "step": 1280 }, { "entropy": 0.5547426689416171, "epoch": 2.064, "grad_norm": 1.015625, "learning_rate": 1.2674074074074075e-05, "loss": 0.6368, "mean_token_accuracy": 0.8413027279078961, "num_tokens": 43010231.0, "step": 1290 }, { "entropy": 0.5610322959721088, "epoch": 2.08, "grad_norm": 1.0, "learning_rate": 1.2600000000000001e-05, "loss": 0.6516, "mean_token_accuracy": 0.8378075629472732, "num_tokens": 43340227.0, "step": 1300 }, { "epoch": 2.08, "eval_biology_entropy": 1.6757129163742066, "eval_biology_loss": 2.434731960296631, "eval_biology_mean_token_accuracy": 0.5802675273418426, "eval_biology_num_tokens": 43340227.0, "eval_biology_runtime": 38.7759, "eval_biology_samples_per_second": 12.895, "eval_biology_steps_per_second": 3.224, "step": 1300 }, { "epoch": 2.08, "eval_chemistry_entropy": 0.7416200067996979, "eval_chemistry_loss": 1.027290940284729, "eval_chemistry_mean_token_accuracy": 0.774878448009491, "eval_chemistry_num_tokens": 43340227.0, "eval_chemistry_runtime": 48.262, "eval_chemistry_samples_per_second": 10.36, "eval_chemistry_steps_per_second": 2.59, "step": 1300 }, { "epoch": 2.08, "eval_math_entropy": 0.6465173182487488, "eval_math_loss": 1.023693561553955, "eval_math_mean_token_accuracy": 0.7827271037101745, "eval_math_num_tokens": 43340227.0, "eval_math_runtime": 49.631, "eval_math_samples_per_second": 10.074, "eval_math_steps_per_second": 2.519, "step": 1300 }, { "epoch": 2.08, "eval_physics_entropy": 0.5699445073604583, "eval_physics_loss": 0.7285439372062683, "eval_physics_mean_token_accuracy": 0.8254043416976928, "eval_physics_num_tokens": 43340227.0, "eval_physics_runtime": 57.0739, "eval_physics_samples_per_second": 8.761, "eval_physics_steps_per_second": 2.19, "step": 1300 }, { "entropy": 0.5561112010851502, "epoch": 2.096, "grad_norm": 1.03125, "learning_rate": 1.2525925925925928e-05, "loss": 0.6449, "mean_token_accuracy": 0.838719493150711, "num_tokens": 43673099.0, "step": 1310 }, { "entropy": 0.5452175224199891, "epoch": 2.112, "grad_norm": 1.0234375, "learning_rate": 1.2451851851851853e-05, "loss": 0.6321, "mean_token_accuracy": 0.8417959384620189, "num_tokens": 44005986.0, "step": 1320 }, { "entropy": 0.5562603289261461, "epoch": 2.128, "grad_norm": 1.0390625, "learning_rate": 1.237777777777778e-05, "loss": 0.653, "mean_token_accuracy": 0.8379485800862312, "num_tokens": 44321465.0, "step": 1330 }, { "entropy": 0.5499181509017944, "epoch": 2.144, "grad_norm": 0.9921875, "learning_rate": 1.2303703703703704e-05, "loss": 0.638, "mean_token_accuracy": 0.8397632710635662, "num_tokens": 44661027.0, "step": 1340 }, { "entropy": 0.5740855507552624, "epoch": 2.16, "grad_norm": 1.0703125, "learning_rate": 1.222962962962963e-05, "loss": 0.6641, "mean_token_accuracy": 0.8336062435060739, "num_tokens": 44994236.0, "step": 1350 }, { "entropy": 0.5468110611662269, "epoch": 2.176, "grad_norm": 0.99609375, "learning_rate": 1.2155555555555555e-05, "loss": 0.6341, "mean_token_accuracy": 0.8411031287163496, "num_tokens": 45334280.0, "step": 1360 }, { "entropy": 0.5445626365020871, "epoch": 2.192, "grad_norm": 1.0234375, "learning_rate": 1.2081481481481484e-05, "loss": 0.6379, "mean_token_accuracy": 0.8422058593481779, "num_tokens": 45666954.0, "step": 1370 }, { "entropy": 0.5418444711714983, "epoch": 2.208, "grad_norm": 1.0859375, "learning_rate": 1.2007407407407408e-05, "loss": 0.6314, "mean_token_accuracy": 0.8422281835228205, "num_tokens": 45999546.0, "step": 1380 }, { "entropy": 0.54025251083076, "epoch": 2.224, "grad_norm": 1.0, "learning_rate": 1.1933333333333335e-05, "loss": 0.6343, "mean_token_accuracy": 0.8418932400643826, "num_tokens": 46333809.0, "step": 1390 }, { "entropy": 0.5499276254326105, "epoch": 2.24, "grad_norm": 1.015625, "learning_rate": 1.185925925925926e-05, "loss": 0.6369, "mean_token_accuracy": 0.8407698534429073, "num_tokens": 46673328.0, "step": 1400 }, { "epoch": 2.24, "eval_biology_entropy": 1.6533626160621644, "eval_biology_loss": 2.441138744354248, "eval_biology_mean_token_accuracy": 0.5818718819618225, "eval_biology_num_tokens": 46673328.0, "eval_biology_runtime": 38.7889, "eval_biology_samples_per_second": 12.89, "eval_biology_steps_per_second": 3.223, "step": 1400 }, { "epoch": 2.24, "eval_chemistry_entropy": 0.7358436925411225, "eval_chemistry_loss": 1.0279189348220825, "eval_chemistry_mean_token_accuracy": 0.7754086136817933, "eval_chemistry_num_tokens": 46673328.0, "eval_chemistry_runtime": 48.2677, "eval_chemistry_samples_per_second": 10.359, "eval_chemistry_steps_per_second": 2.59, "step": 1400 }, { "epoch": 2.24, "eval_math_entropy": 0.6436889851093293, "eval_math_loss": 1.0210638046264648, "eval_math_mean_token_accuracy": 0.7834169192314148, "eval_math_num_tokens": 46673328.0, "eval_math_runtime": 49.6493, "eval_math_samples_per_second": 10.071, "eval_math_steps_per_second": 2.518, "step": 1400 }, { "epoch": 2.24, "eval_physics_entropy": 0.5687800683975219, "eval_physics_loss": 0.7271425127983093, "eval_physics_mean_token_accuracy": 0.8259177951812744, "eval_physics_num_tokens": 46673328.0, "eval_physics_runtime": 57.064, "eval_physics_samples_per_second": 8.762, "eval_physics_steps_per_second": 2.191, "step": 1400 }, { "entropy": 0.5672985427081585, "epoch": 2.2560000000000002, "grad_norm": 1.109375, "learning_rate": 1.1785185185185186e-05, "loss": 0.6585, "mean_token_accuracy": 0.8361219819635153, "num_tokens": 47001509.0, "step": 1410 }, { "entropy": 0.5440221125259995, "epoch": 2.2720000000000002, "grad_norm": 1.0859375, "learning_rate": 1.1711111111111113e-05, "loss": 0.6308, "mean_token_accuracy": 0.842396317794919, "num_tokens": 47340142.0, "step": 1420 }, { "entropy": 0.5601490139961243, "epoch": 2.288, "grad_norm": 1.09375, "learning_rate": 1.1637037037037037e-05, "loss": 0.6515, "mean_token_accuracy": 0.8371975239366293, "num_tokens": 47677550.0, "step": 1430 }, { "entropy": 0.5608395885676145, "epoch": 2.304, "grad_norm": 1.15625, "learning_rate": 1.1562962962962964e-05, "loss": 0.65, "mean_token_accuracy": 0.8390717066824436, "num_tokens": 47993851.0, "step": 1440 }, { "entropy": 0.5507894741371274, "epoch": 2.32, "grad_norm": 0.96484375, "learning_rate": 1.1488888888888889e-05, "loss": 0.6392, "mean_token_accuracy": 0.8389977443963289, "num_tokens": 48332577.0, "step": 1450 }, { "entropy": 0.5349721314385534, "epoch": 2.336, "grad_norm": 1.0078125, "learning_rate": 1.1414814814814817e-05, "loss": 0.6212, "mean_token_accuracy": 0.8431226223707199, "num_tokens": 48676623.0, "step": 1460 }, { "entropy": 0.5349597102031112, "epoch": 2.352, "grad_norm": 1.015625, "learning_rate": 1.1340740740740742e-05, "loss": 0.6199, "mean_token_accuracy": 0.8435559894889593, "num_tokens": 49007770.0, "step": 1470 }, { "entropy": 0.5474015891551971, "epoch": 2.368, "grad_norm": 1.046875, "learning_rate": 1.1266666666666668e-05, "loss": 0.6384, "mean_token_accuracy": 0.8399505577981472, "num_tokens": 49350016.0, "step": 1480 }, { "entropy": 0.5547215724363923, "epoch": 2.384, "grad_norm": 1.171875, "learning_rate": 1.1192592592592593e-05, "loss": 0.6461, "mean_token_accuracy": 0.8389234948903322, "num_tokens": 49679074.0, "step": 1490 }, { "entropy": 0.5506776092574001, "epoch": 2.4, "grad_norm": 1.09375, "learning_rate": 1.111851851851852e-05, "loss": 0.6381, "mean_token_accuracy": 0.8396286979317665, "num_tokens": 50020648.0, "step": 1500 }, { "epoch": 2.4, "eval_biology_entropy": 1.6101444239616394, "eval_biology_loss": 2.4393153190612793, "eval_biology_mean_token_accuracy": 0.5827285711765289, "eval_biology_num_tokens": 50020648.0, "eval_biology_runtime": 38.6357, "eval_biology_samples_per_second": 12.941, "eval_biology_steps_per_second": 3.235, "step": 1500 }, { "epoch": 2.4, "eval_chemistry_entropy": 0.7312388877868652, "eval_chemistry_loss": 1.029062271118164, "eval_chemistry_mean_token_accuracy": 0.7757972526550293, "eval_chemistry_num_tokens": 50020648.0, "eval_chemistry_runtime": 48.0523, "eval_chemistry_samples_per_second": 10.405, "eval_chemistry_steps_per_second": 2.601, "step": 1500 }, { "epoch": 2.4, "eval_math_entropy": 0.6399386277198792, "eval_math_loss": 1.0237832069396973, "eval_math_mean_token_accuracy": 0.7829466652870178, "eval_math_num_tokens": 50020648.0, "eval_math_runtime": 49.5118, "eval_math_samples_per_second": 10.099, "eval_math_steps_per_second": 2.525, "step": 1500 }, { "epoch": 2.4, "eval_physics_entropy": 0.5676034677028656, "eval_physics_loss": 0.7261826395988464, "eval_physics_mean_token_accuracy": 0.825977876663208, "eval_physics_num_tokens": 50020648.0, "eval_physics_runtime": 56.9466, "eval_physics_samples_per_second": 8.78, "eval_physics_steps_per_second": 2.195, "step": 1500 }, { "entropy": 0.546076669357717, "epoch": 2.416, "grad_norm": 1.0234375, "learning_rate": 1.1044444444444444e-05, "loss": 0.6307, "mean_token_accuracy": 0.8414915602654218, "num_tokens": 50370135.0, "step": 1510 }, { "entropy": 0.5511986341327428, "epoch": 2.432, "grad_norm": 1.0859375, "learning_rate": 1.0970370370370371e-05, "loss": 0.6434, "mean_token_accuracy": 0.8396999359130859, "num_tokens": 50695363.0, "step": 1520 }, { "entropy": 0.568221763893962, "epoch": 2.448, "grad_norm": 1.0234375, "learning_rate": 1.0896296296296298e-05, "loss": 0.6577, "mean_token_accuracy": 0.8357432372868061, "num_tokens": 51023498.0, "step": 1530 }, { "entropy": 0.5589647406712175, "epoch": 2.464, "grad_norm": 1.1328125, "learning_rate": 1.0822222222222222e-05, "loss": 0.6484, "mean_token_accuracy": 0.8391309097409249, "num_tokens": 51349816.0, "step": 1540 }, { "entropy": 0.5509988136589528, "epoch": 2.48, "grad_norm": 1.109375, "learning_rate": 1.074814814814815e-05, "loss": 0.6417, "mean_token_accuracy": 0.8385909989476203, "num_tokens": 51680507.0, "step": 1550 }, { "entropy": 0.5563702458515764, "epoch": 2.496, "grad_norm": 1.046875, "learning_rate": 1.0674074074074074e-05, "loss": 0.6462, "mean_token_accuracy": 0.8388838239014149, "num_tokens": 52017752.0, "step": 1560 }, { "entropy": 0.5431887688115239, "epoch": 2.512, "grad_norm": 1.140625, "learning_rate": 1.0600000000000002e-05, "loss": 0.6342, "mean_token_accuracy": 0.8411614701151848, "num_tokens": 52345439.0, "step": 1570 }, { "entropy": 0.543870740942657, "epoch": 2.528, "grad_norm": 0.99609375, "learning_rate": 1.0525925925925927e-05, "loss": 0.6315, "mean_token_accuracy": 0.8416834581643343, "num_tokens": 52683886.0, "step": 1580 }, { "entropy": 0.5500559687614441, "epoch": 2.544, "grad_norm": 1.125, "learning_rate": 1.0451851851851853e-05, "loss": 0.6438, "mean_token_accuracy": 0.8382218111306429, "num_tokens": 53004529.0, "step": 1590 }, { "entropy": 0.5530376594513655, "epoch": 2.56, "grad_norm": 1.1171875, "learning_rate": 1.0377777777777778e-05, "loss": 0.6366, "mean_token_accuracy": 0.8400206513702869, "num_tokens": 53336280.0, "step": 1600 }, { "epoch": 2.56, "eval_biology_entropy": 1.6161543126106261, "eval_biology_loss": 2.4465079307556152, "eval_biology_mean_token_accuracy": 0.5826826608180999, "eval_biology_num_tokens": 53336280.0, "eval_biology_runtime": 38.7006, "eval_biology_samples_per_second": 12.92, "eval_biology_steps_per_second": 3.23, "step": 1600 }, { "epoch": 2.56, "eval_chemistry_entropy": 0.7235212452411651, "eval_chemistry_loss": 1.0302128791809082, "eval_chemistry_mean_token_accuracy": 0.776062783241272, "eval_chemistry_num_tokens": 53336280.0, "eval_chemistry_runtime": 48.1934, "eval_chemistry_samples_per_second": 10.375, "eval_chemistry_steps_per_second": 2.594, "step": 1600 }, { "epoch": 2.56, "eval_math_entropy": 0.6339258260726929, "eval_math_loss": 1.025081753730774, "eval_math_mean_token_accuracy": 0.7834318013191223, "eval_math_num_tokens": 53336280.0, "eval_math_runtime": 49.912, "eval_math_samples_per_second": 10.018, "eval_math_steps_per_second": 2.504, "step": 1600 }, { "epoch": 2.56, "eval_physics_entropy": 0.5578962779045105, "eval_physics_loss": 0.7250717878341675, "eval_physics_mean_token_accuracy": 0.8264809098243714, "eval_physics_num_tokens": 53336280.0, "eval_physics_runtime": 56.9593, "eval_physics_samples_per_second": 8.778, "eval_physics_steps_per_second": 2.195, "step": 1600 }, { "entropy": 0.537515789270401, "epoch": 2.576, "grad_norm": 1.0625, "learning_rate": 1.0303703703703705e-05, "loss": 0.6322, "mean_token_accuracy": 0.8432505313307047, "num_tokens": 53667647.0, "step": 1610 }, { "entropy": 0.5506160443648696, "epoch": 2.592, "grad_norm": 0.97265625, "learning_rate": 1.0229629629629631e-05, "loss": 0.6402, "mean_token_accuracy": 0.8399668127298355, "num_tokens": 53997614.0, "step": 1620 }, { "entropy": 0.5386643601581454, "epoch": 2.608, "grad_norm": 1.0078125, "learning_rate": 1.0155555555555556e-05, "loss": 0.6243, "mean_token_accuracy": 0.8430452451109887, "num_tokens": 54330195.0, "step": 1630 }, { "entropy": 0.5495816670358181, "epoch": 2.624, "grad_norm": 1.03125, "learning_rate": 1.0081481481481484e-05, "loss": 0.6364, "mean_token_accuracy": 0.8401540901511908, "num_tokens": 54664342.0, "step": 1640 }, { "entropy": 0.5469144558534026, "epoch": 2.64, "grad_norm": 1.09375, "learning_rate": 1.0007407407407407e-05, "loss": 0.636, "mean_token_accuracy": 0.8406256098300219, "num_tokens": 54993626.0, "step": 1650 }, { "entropy": 0.5694095639511942, "epoch": 2.656, "grad_norm": 1.1953125, "learning_rate": 9.933333333333334e-06, "loss": 0.656, "mean_token_accuracy": 0.8346509717404842, "num_tokens": 55339962.0, "step": 1660 }, { "entropy": 0.5597302883863449, "epoch": 2.672, "grad_norm": 1.140625, "learning_rate": 9.85925925925926e-06, "loss": 0.6462, "mean_token_accuracy": 0.838615670055151, "num_tokens": 55670567.0, "step": 1670 }, { "entropy": 0.5369519403204321, "epoch": 2.6879999999999997, "grad_norm": 1.1015625, "learning_rate": 9.785185185185187e-06, "loss": 0.6323, "mean_token_accuracy": 0.843082357198, "num_tokens": 56003156.0, "step": 1680 }, { "entropy": 0.5516389394178987, "epoch": 2.7039999999999997, "grad_norm": 1.046875, "learning_rate": 9.711111111111111e-06, "loss": 0.6369, "mean_token_accuracy": 0.8410698171705008, "num_tokens": 56342926.0, "step": 1690 }, { "entropy": 0.5484106032177806, "epoch": 2.7199999999999998, "grad_norm": 1.203125, "learning_rate": 9.637037037037038e-06, "loss": 0.6328, "mean_token_accuracy": 0.8402947820723057, "num_tokens": 56677521.0, "step": 1700 }, { "epoch": 2.7199999999999998, "eval_biology_entropy": 1.5972739911079408, "eval_biology_loss": 2.458798408508301, "eval_biology_mean_token_accuracy": 0.5806125638484955, "eval_biology_num_tokens": 56677521.0, "eval_biology_runtime": 38.6662, "eval_biology_samples_per_second": 12.931, "eval_biology_steps_per_second": 3.233, "step": 1700 }, { "epoch": 2.7199999999999998, "eval_chemistry_entropy": 0.7181811017990112, "eval_chemistry_loss": 1.0299792289733887, "eval_chemistry_mean_token_accuracy": 0.7751972675323486, "eval_chemistry_num_tokens": 56677521.0, "eval_chemistry_runtime": 48.1483, "eval_chemistry_samples_per_second": 10.385, "eval_chemistry_steps_per_second": 2.596, "step": 1700 }, { "epoch": 2.7199999999999998, "eval_math_entropy": 0.6283527569770813, "eval_math_loss": 1.0243655443191528, "eval_math_mean_token_accuracy": 0.7836487565040589, "eval_math_num_tokens": 56677521.0, "eval_math_runtime": 49.508, "eval_math_samples_per_second": 10.099, "eval_math_steps_per_second": 2.525, "step": 1700 }, { "epoch": 2.7199999999999998, "eval_physics_entropy": 0.5540005767345428, "eval_physics_loss": 0.7235716581344604, "eval_physics_mean_token_accuracy": 0.8267813692092896, "eval_physics_num_tokens": 56677521.0, "eval_physics_runtime": 56.9575, "eval_physics_samples_per_second": 8.778, "eval_physics_steps_per_second": 2.195, "step": 1700 }, { "entropy": 0.5626651704311371, "epoch": 2.7359999999999998, "grad_norm": 1.0859375, "learning_rate": 9.562962962962965e-06, "loss": 0.6551, "mean_token_accuracy": 0.8361094355583191, "num_tokens": 57003161.0, "step": 1710 }, { "entropy": 0.5559584245085716, "epoch": 2.752, "grad_norm": 1.0859375, "learning_rate": 9.48888888888889e-06, "loss": 0.6447, "mean_token_accuracy": 0.8389039475470781, "num_tokens": 57335850.0, "step": 1720 }, { "entropy": 0.5382809387519956, "epoch": 2.768, "grad_norm": 1.1171875, "learning_rate": 9.414814814814816e-06, "loss": 0.6267, "mean_token_accuracy": 0.8432002298533916, "num_tokens": 57672649.0, "step": 1730 }, { "entropy": 0.5509585844352841, "epoch": 2.784, "grad_norm": 1.09375, "learning_rate": 9.34074074074074e-06, "loss": 0.6395, "mean_token_accuracy": 0.8397889394313097, "num_tokens": 58007431.0, "step": 1740 }, { "entropy": 0.5838387541472911, "epoch": 2.8, "grad_norm": 1.0859375, "learning_rate": 9.266666666666667e-06, "loss": 0.6711, "mean_token_accuracy": 0.8319451794028282, "num_tokens": 58332730.0, "step": 1750 }, { "entropy": 0.5306204471737146, "epoch": 2.816, "grad_norm": 1.078125, "learning_rate": 9.192592592592594e-06, "loss": 0.6171, "mean_token_accuracy": 0.8446537777781487, "num_tokens": 58672106.0, "step": 1760 }, { "entropy": 0.553738858550787, "epoch": 2.832, "grad_norm": 1.078125, "learning_rate": 9.118518518518518e-06, "loss": 0.6486, "mean_token_accuracy": 0.8376123756170273, "num_tokens": 58997592.0, "step": 1770 }, { "entropy": 0.5568923223763704, "epoch": 2.848, "grad_norm": 1.0625, "learning_rate": 9.044444444444445e-06, "loss": 0.6446, "mean_token_accuracy": 0.8393427152186632, "num_tokens": 59326336.0, "step": 1780 }, { "entropy": 0.5415080957114696, "epoch": 2.864, "grad_norm": 0.94921875, "learning_rate": 8.970370370370372e-06, "loss": 0.6308, "mean_token_accuracy": 0.8419267870485783, "num_tokens": 59668586.0, "step": 1790 }, { "entropy": 0.5516748385503888, "epoch": 2.88, "grad_norm": 1.2578125, "learning_rate": 8.896296296296298e-06, "loss": 0.6441, "mean_token_accuracy": 0.840485867485404, "num_tokens": 59993572.0, "step": 1800 }, { "epoch": 2.88, "eval_biology_entropy": 1.6067570729255676, "eval_biology_loss": 2.442582130432129, "eval_biology_mean_token_accuracy": 0.5815401375293732, "eval_biology_num_tokens": 59993572.0, "eval_biology_runtime": 38.7204, "eval_biology_samples_per_second": 12.913, "eval_biology_steps_per_second": 3.228, "step": 1800 }, { "epoch": 2.88, "eval_chemistry_entropy": 0.7207075932025909, "eval_chemistry_loss": 1.0278831720352173, "eval_chemistry_mean_token_accuracy": 0.7756774797439575, "eval_chemistry_num_tokens": 59993572.0, "eval_chemistry_runtime": 48.2116, "eval_chemistry_samples_per_second": 10.371, "eval_chemistry_steps_per_second": 2.593, "step": 1800 }, { "epoch": 2.88, "eval_math_entropy": 0.629777349948883, "eval_math_loss": 1.0234665870666504, "eval_math_mean_token_accuracy": 0.7837573509216309, "eval_math_num_tokens": 59993572.0, "eval_math_runtime": 49.5638, "eval_math_samples_per_second": 10.088, "eval_math_steps_per_second": 2.522, "step": 1800 }, { "epoch": 2.88, "eval_physics_entropy": 0.5562911832332611, "eval_physics_loss": 0.7223864793777466, "eval_physics_mean_token_accuracy": 0.8269883937835694, "eval_physics_num_tokens": 59993572.0, "eval_physics_runtime": 56.9994, "eval_physics_samples_per_second": 8.772, "eval_physics_steps_per_second": 2.193, "step": 1800 }, { "entropy": 0.5429861357435584, "epoch": 2.896, "grad_norm": 1.1171875, "learning_rate": 8.822222222222223e-06, "loss": 0.6347, "mean_token_accuracy": 0.8413365628570318, "num_tokens": 60324128.0, "step": 1810 }, { "entropy": 0.5474321844056249, "epoch": 2.912, "grad_norm": 1.1015625, "learning_rate": 8.74814814814815e-06, "loss": 0.635, "mean_token_accuracy": 0.8426228888332844, "num_tokens": 60657399.0, "step": 1820 }, { "entropy": 0.5416806817054749, "epoch": 2.928, "grad_norm": 1.0703125, "learning_rate": 8.674074074074074e-06, "loss": 0.6306, "mean_token_accuracy": 0.8423144549131394, "num_tokens": 60984711.0, "step": 1830 }, { "entropy": 0.5390040006488561, "epoch": 2.944, "grad_norm": 1.1875, "learning_rate": 8.6e-06, "loss": 0.6304, "mean_token_accuracy": 0.8424391083419323, "num_tokens": 61321359.0, "step": 1840 }, { "entropy": 0.5532678855583072, "epoch": 2.96, "grad_norm": 0.984375, "learning_rate": 8.525925925925927e-06, "loss": 0.6378, "mean_token_accuracy": 0.8402687277644872, "num_tokens": 61659042.0, "step": 1850 }, { "entropy": 0.5464650699868798, "epoch": 2.976, "grad_norm": 1.0546875, "learning_rate": 8.451851851851852e-06, "loss": 0.6345, "mean_token_accuracy": 0.8401576526463032, "num_tokens": 61993595.0, "step": 1860 }, { "entropy": 0.5306900983676315, "epoch": 2.992, "grad_norm": 1.0390625, "learning_rate": 8.377777777777779e-06, "loss": 0.6196, "mean_token_accuracy": 0.8441225662827492, "num_tokens": 62334012.0, "step": 1870 }, { "entropy": 0.5364003209397197, "epoch": 3.008, "grad_norm": 1.0703125, "learning_rate": 8.303703703703705e-06, "loss": 0.6242, "mean_token_accuracy": 0.8451750382781029, "num_tokens": 62660928.0, "step": 1880 }, { "entropy": 0.5261571481823921, "epoch": 3.024, "grad_norm": 1.078125, "learning_rate": 8.229629629629632e-06, "loss": 0.614, "mean_token_accuracy": 0.8461304292082786, "num_tokens": 62992670.0, "step": 1890 }, { "entropy": 0.5176527475938201, "epoch": 3.04, "grad_norm": 1.1328125, "learning_rate": 8.155555555555556e-06, "loss": 0.6074, "mean_token_accuracy": 0.8469121795147657, "num_tokens": 63335157.0, "step": 1900 }, { "epoch": 3.04, "eval_biology_entropy": 1.5792802815437317, "eval_biology_loss": 2.4808876514434814, "eval_biology_mean_token_accuracy": 0.5793148455619812, "eval_biology_num_tokens": 63335157.0, "eval_biology_runtime": 38.7055, "eval_biology_samples_per_second": 12.918, "eval_biology_steps_per_second": 3.23, "step": 1900 }, { "epoch": 3.04, "eval_chemistry_entropy": 0.7061369748115539, "eval_chemistry_loss": 1.0390231609344482, "eval_chemistry_mean_token_accuracy": 0.7745346717834473, "eval_chemistry_num_tokens": 63335157.0, "eval_chemistry_runtime": 48.1872, "eval_chemistry_samples_per_second": 10.376, "eval_chemistry_steps_per_second": 2.594, "step": 1900 }, { "epoch": 3.04, "eval_math_entropy": 0.6224793126583099, "eval_math_loss": 1.034122347831726, "eval_math_mean_token_accuracy": 0.7828424015045166, "eval_math_num_tokens": 63335157.0, "eval_math_runtime": 49.5497, "eval_math_samples_per_second": 10.091, "eval_math_steps_per_second": 2.523, "step": 1900 }, { "epoch": 3.04, "eval_physics_entropy": 0.5448903846740722, "eval_physics_loss": 0.7253366708755493, "eval_physics_mean_token_accuracy": 0.8264335384368896, "eval_physics_num_tokens": 63335157.0, "eval_physics_runtime": 56.9581, "eval_physics_samples_per_second": 8.778, "eval_physics_steps_per_second": 2.195, "step": 1900 }, { "entropy": 0.5329983660951256, "epoch": 3.056, "grad_norm": 1.140625, "learning_rate": 8.081481481481483e-06, "loss": 0.6183, "mean_token_accuracy": 0.8438076838850975, "num_tokens": 63662314.0, "step": 1910 }, { "entropy": 0.5316095747053623, "epoch": 3.072, "grad_norm": 1.171875, "learning_rate": 8.007407407407408e-06, "loss": 0.6223, "mean_token_accuracy": 0.8441695164889097, "num_tokens": 63997780.0, "step": 1920 }, { "entropy": 0.541679815761745, "epoch": 3.088, "grad_norm": 1.15625, "learning_rate": 7.933333333333334e-06, "loss": 0.6328, "mean_token_accuracy": 0.8417537044733763, "num_tokens": 64325274.0, "step": 1930 }, { "entropy": 0.5170316396281123, "epoch": 3.104, "grad_norm": 1.140625, "learning_rate": 7.859259259259259e-06, "loss": 0.6062, "mean_token_accuracy": 0.8478735946118832, "num_tokens": 64659683.0, "step": 1940 }, { "entropy": 0.5163595724850893, "epoch": 3.12, "grad_norm": 1.1328125, "learning_rate": 7.785185185185185e-06, "loss": 0.603, "mean_token_accuracy": 0.8484522052109241, "num_tokens": 64998212.0, "step": 1950 }, { "entropy": 0.5422403154894709, "epoch": 3.136, "grad_norm": 1.1015625, "learning_rate": 7.711111111111112e-06, "loss": 0.6357, "mean_token_accuracy": 0.8405985131859779, "num_tokens": 65328436.0, "step": 1960 }, { "entropy": 0.5195852382108569, "epoch": 3.152, "grad_norm": 1.125, "learning_rate": 7.637037037037037e-06, "loss": 0.602, "mean_token_accuracy": 0.8487723391503096, "num_tokens": 65659346.0, "step": 1970 }, { "entropy": 0.5302856534719467, "epoch": 3.168, "grad_norm": 1.03125, "learning_rate": 7.562962962962963e-06, "loss": 0.6216, "mean_token_accuracy": 0.8445643980056048, "num_tokens": 65986382.0, "step": 1980 }, { "entropy": 0.5176609115675092, "epoch": 3.184, "grad_norm": 1.1953125, "learning_rate": 7.48888888888889e-06, "loss": 0.6056, "mean_token_accuracy": 0.8478365700691939, "num_tokens": 66324308.0, "step": 1990 }, { "entropy": 0.5267078908160329, "epoch": 3.2, "grad_norm": 1.1015625, "learning_rate": 7.4148148148148155e-06, "loss": 0.6149, "mean_token_accuracy": 0.8465773615986109, "num_tokens": 66658712.0, "step": 2000 }, { "epoch": 3.2, "eval_biology_entropy": 1.5625373516082763, "eval_biology_loss": 2.4981629848480225, "eval_biology_mean_token_accuracy": 0.57876149559021, "eval_biology_num_tokens": 66658712.0, "eval_biology_runtime": 38.9285, "eval_biology_samples_per_second": 12.844, "eval_biology_steps_per_second": 3.211, "step": 2000 }, { "epoch": 3.2, "eval_chemistry_entropy": 0.70146466588974, "eval_chemistry_loss": 1.0409005880355835, "eval_chemistry_mean_token_accuracy": 0.7748316297531128, "eval_chemistry_num_tokens": 66658712.0, "eval_chemistry_runtime": 48.3893, "eval_chemistry_samples_per_second": 10.333, "eval_chemistry_steps_per_second": 2.583, "step": 2000 }, { "epoch": 3.2, "eval_math_entropy": 0.6218927059173583, "eval_math_loss": 1.0327448844909668, "eval_math_mean_token_accuracy": 0.7830894327163697, "eval_math_num_tokens": 66658712.0, "eval_math_runtime": 49.899, "eval_math_samples_per_second": 10.02, "eval_math_steps_per_second": 2.505, "step": 2000 }, { "epoch": 3.2, "eval_physics_entropy": 0.5445813267230988, "eval_physics_loss": 0.725453794002533, "eval_physics_mean_token_accuracy": 0.8266869735717773, "eval_physics_num_tokens": 66658712.0, "eval_physics_runtime": 57.3527, "eval_physics_samples_per_second": 8.718, "eval_physics_steps_per_second": 2.179, "step": 2000 } ], "logging_steps": 10, "max_steps": 3000, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.956042200464073e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }