vpuqmqt0 / checkpoint-2000 /trainer_state.json
roonbug's picture
Upload folder using huggingface_hub
20f2690 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.2,
"eval_steps": 100,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.1780160423368216,
"epoch": 0.016,
"grad_norm": 13.0,
"learning_rate": 6.000000000000001e-07,
"loss": 1.8406,
"mean_token_accuracy": 0.6489301804453135,
"num_tokens": 340696.0,
"step": 10
},
{
"entropy": 1.1818634796887637,
"epoch": 0.032,
"grad_norm": 11.5625,
"learning_rate": 1.2666666666666669e-06,
"loss": 1.8191,
"mean_token_accuracy": 0.6528046734631061,
"num_tokens": 675930.0,
"step": 20
},
{
"entropy": 1.195955842360854,
"epoch": 0.048,
"grad_norm": 9.3125,
"learning_rate": 1.9333333333333336e-06,
"loss": 1.7817,
"mean_token_accuracy": 0.6578715395182371,
"num_tokens": 1007956.0,
"step": 30
},
{
"entropy": 1.2672166559845208,
"epoch": 0.064,
"grad_norm": 6.5625,
"learning_rate": 2.6e-06,
"loss": 1.7272,
"mean_token_accuracy": 0.6625342659652234,
"num_tokens": 1340560.0,
"step": 40
},
{
"entropy": 1.2964693307876587,
"epoch": 0.08,
"grad_norm": 5.0,
"learning_rate": 3.266666666666667e-06,
"loss": 1.5867,
"mean_token_accuracy": 0.6811951555311679,
"num_tokens": 1679432.0,
"step": 50
},
{
"entropy": 1.2539724007248878,
"epoch": 0.096,
"grad_norm": 3.375,
"learning_rate": 3.9333333333333335e-06,
"loss": 1.3962,
"mean_token_accuracy": 0.7081016473472118,
"num_tokens": 2021570.0,
"step": 60
},
{
"entropy": 1.2302619956433773,
"epoch": 0.112,
"grad_norm": 2.28125,
"learning_rate": 4.600000000000001e-06,
"loss": 1.2974,
"mean_token_accuracy": 0.7239668637514114,
"num_tokens": 2359297.0,
"step": 70
},
{
"entropy": 1.169797332212329,
"epoch": 0.128,
"grad_norm": 1.875,
"learning_rate": 5.2666666666666665e-06,
"loss": 1.2306,
"mean_token_accuracy": 0.7328833002597094,
"num_tokens": 2683168.0,
"step": 80
},
{
"entropy": 1.0165224198251963,
"epoch": 0.144,
"grad_norm": 1.59375,
"learning_rate": 5.933333333333335e-06,
"loss": 1.1129,
"mean_token_accuracy": 0.7561763934791088,
"num_tokens": 3019426.0,
"step": 90
},
{
"entropy": 0.9457759071141482,
"epoch": 0.16,
"grad_norm": 1.546875,
"learning_rate": 6.600000000000001e-06,
"loss": 1.0448,
"mean_token_accuracy": 0.7662712432444095,
"num_tokens": 3354390.0,
"step": 100
},
{
"epoch": 0.16,
"eval_biology_entropy": 2.4009983863830566,
"eval_biology_loss": 3.090766429901123,
"eval_biology_mean_token_accuracy": 0.5075433547496796,
"eval_biology_num_tokens": 3354390.0,
"eval_biology_runtime": 38.801,
"eval_biology_samples_per_second": 12.886,
"eval_biology_steps_per_second": 3.222,
"step": 100
},
{
"epoch": 0.16,
"eval_chemistry_entropy": 1.1818295245170594,
"eval_chemistry_loss": 1.4003204107284546,
"eval_chemistry_mean_token_accuracy": 0.7158680257797241,
"eval_chemistry_num_tokens": 3354390.0,
"eval_chemistry_runtime": 48.2819,
"eval_chemistry_samples_per_second": 10.356,
"eval_chemistry_steps_per_second": 2.589,
"step": 100
},
{
"epoch": 0.16,
"eval_math_entropy": 0.8902422695159912,
"eval_math_loss": 1.2323389053344727,
"eval_math_mean_token_accuracy": 0.750218500137329,
"eval_math_num_tokens": 3354390.0,
"eval_math_runtime": 49.6484,
"eval_math_samples_per_second": 10.071,
"eval_math_steps_per_second": 2.518,
"step": 100
},
{
"epoch": 0.16,
"eval_physics_entropy": 0.9318590335845948,
"eval_physics_loss": 1.0569193363189697,
"eval_physics_mean_token_accuracy": 0.7675474114418029,
"eval_physics_num_tokens": 3354390.0,
"eval_physics_runtime": 57.1057,
"eval_physics_samples_per_second": 8.756,
"eval_physics_steps_per_second": 2.189,
"step": 100
},
{
"entropy": 0.9224597703665495,
"epoch": 0.176,
"grad_norm": 1.4921875,
"learning_rate": 7.266666666666668e-06,
"loss": 1.0192,
"mean_token_accuracy": 0.7696165222674608,
"num_tokens": 3684374.0,
"step": 110
},
{
"entropy": 0.8823430232703686,
"epoch": 0.192,
"grad_norm": 1.125,
"learning_rate": 7.933333333333334e-06,
"loss": 0.9854,
"mean_token_accuracy": 0.7739298477768898,
"num_tokens": 4023915.0,
"step": 120
},
{
"entropy": 0.8393984897062182,
"epoch": 0.208,
"grad_norm": 1.4375,
"learning_rate": 8.6e-06,
"loss": 0.9379,
"mean_token_accuracy": 0.7841779347509146,
"num_tokens": 4352045.0,
"step": 130
},
{
"entropy": 0.8179828137159347,
"epoch": 0.224,
"grad_norm": 1.234375,
"learning_rate": 9.266666666666667e-06,
"loss": 0.9191,
"mean_token_accuracy": 0.7869349300861359,
"num_tokens": 4681371.0,
"step": 140
},
{
"entropy": 0.805096386373043,
"epoch": 0.24,
"grad_norm": 1.09375,
"learning_rate": 9.933333333333334e-06,
"loss": 0.9012,
"mean_token_accuracy": 0.7888233289122581,
"num_tokens": 5021784.0,
"step": 150
},
{
"entropy": 0.798224457167089,
"epoch": 0.256,
"grad_norm": 1.2109375,
"learning_rate": 1.0600000000000002e-05,
"loss": 0.8922,
"mean_token_accuracy": 0.7903898701071739,
"num_tokens": 5367308.0,
"step": 160
},
{
"entropy": 0.805234762467444,
"epoch": 0.272,
"grad_norm": 1.3203125,
"learning_rate": 1.1266666666666668e-05,
"loss": 0.8993,
"mean_token_accuracy": 0.7877147275954485,
"num_tokens": 5699101.0,
"step": 170
},
{
"entropy": 0.7946224914863705,
"epoch": 0.288,
"grad_norm": 1.234375,
"learning_rate": 1.1933333333333335e-05,
"loss": 0.8976,
"mean_token_accuracy": 0.7911442808806897,
"num_tokens": 6022837.0,
"step": 180
},
{
"entropy": 0.7837886592373252,
"epoch": 0.304,
"grad_norm": 1.5078125,
"learning_rate": 1.2600000000000001e-05,
"loss": 0.8805,
"mean_token_accuracy": 0.7934082143008709,
"num_tokens": 6343826.0,
"step": 190
},
{
"entropy": 0.778077344968915,
"epoch": 0.32,
"grad_norm": 1.109375,
"learning_rate": 1.3266666666666668e-05,
"loss": 0.8781,
"mean_token_accuracy": 0.7940127164125442,
"num_tokens": 6670164.0,
"step": 200
},
{
"epoch": 0.32,
"eval_biology_entropy": 2.1669651889801025,
"eval_biology_loss": 2.686201810836792,
"eval_biology_mean_token_accuracy": 0.5460970797538758,
"eval_biology_num_tokens": 6670164.0,
"eval_biology_runtime": 38.7901,
"eval_biology_samples_per_second": 12.89,
"eval_biology_steps_per_second": 3.222,
"step": 200
},
{
"epoch": 0.32,
"eval_chemistry_entropy": 0.9981456413269043,
"eval_chemistry_loss": 1.1842883825302124,
"eval_chemistry_mean_token_accuracy": 0.7467401041984558,
"eval_chemistry_num_tokens": 6670164.0,
"eval_chemistry_runtime": 48.3154,
"eval_chemistry_samples_per_second": 10.349,
"eval_chemistry_steps_per_second": 2.587,
"step": 200
},
{
"epoch": 0.32,
"eval_math_entropy": 0.7862319254875183,
"eval_math_loss": 1.0977362394332886,
"eval_math_mean_token_accuracy": 0.7683374147415161,
"eval_math_num_tokens": 6670164.0,
"eval_math_runtime": 49.687,
"eval_math_samples_per_second": 10.063,
"eval_math_steps_per_second": 2.516,
"step": 200
},
{
"epoch": 0.32,
"eval_physics_entropy": 0.7679825134277344,
"eval_physics_loss": 0.886142373085022,
"eval_physics_mean_token_accuracy": 0.7957743234634399,
"eval_physics_num_tokens": 6670164.0,
"eval_physics_runtime": 57.1202,
"eval_physics_samples_per_second": 8.753,
"eval_physics_steps_per_second": 2.188,
"step": 200
},
{
"entropy": 0.7617403082549572,
"epoch": 0.336,
"grad_norm": 1.03125,
"learning_rate": 1.3933333333333334e-05,
"loss": 0.8531,
"mean_token_accuracy": 0.7977134332060813,
"num_tokens": 7003404.0,
"step": 210
},
{
"entropy": 0.737130863033235,
"epoch": 0.352,
"grad_norm": 1.2265625,
"learning_rate": 1.46e-05,
"loss": 0.8339,
"mean_token_accuracy": 0.8024804938584567,
"num_tokens": 7342593.0,
"step": 220
},
{
"entropy": 0.7299130430445075,
"epoch": 0.368,
"grad_norm": 1.015625,
"learning_rate": 1.5266666666666667e-05,
"loss": 0.8252,
"mean_token_accuracy": 0.805871631577611,
"num_tokens": 7675569.0,
"step": 230
},
{
"entropy": 0.7597114410251379,
"epoch": 0.384,
"grad_norm": 1.171875,
"learning_rate": 1.5933333333333336e-05,
"loss": 0.8558,
"mean_token_accuracy": 0.7943590730428696,
"num_tokens": 8012218.0,
"step": 240
},
{
"entropy": 0.7247350050136447,
"epoch": 0.4,
"grad_norm": 1.125,
"learning_rate": 1.66e-05,
"loss": 0.8193,
"mean_token_accuracy": 0.8040182612836361,
"num_tokens": 8350120.0,
"step": 250
},
{
"entropy": 0.7144488081336021,
"epoch": 0.416,
"grad_norm": 1.0859375,
"learning_rate": 1.726666666666667e-05,
"loss": 0.7998,
"mean_token_accuracy": 0.8067968346178531,
"num_tokens": 8688773.0,
"step": 260
},
{
"entropy": 0.7226355630904436,
"epoch": 0.432,
"grad_norm": 1.1953125,
"learning_rate": 1.7933333333333333e-05,
"loss": 0.8125,
"mean_token_accuracy": 0.8042349684983492,
"num_tokens": 9022094.0,
"step": 270
},
{
"entropy": 0.6943683221936225,
"epoch": 0.448,
"grad_norm": 1.03125,
"learning_rate": 1.86e-05,
"loss": 0.7924,
"mean_token_accuracy": 0.8109067149460316,
"num_tokens": 9360608.0,
"step": 280
},
{
"entropy": 0.6898288525640964,
"epoch": 0.464,
"grad_norm": 1.3046875,
"learning_rate": 1.926666666666667e-05,
"loss": 0.7814,
"mean_token_accuracy": 0.8110810052603483,
"num_tokens": 9688896.0,
"step": 290
},
{
"entropy": 0.6964065950363875,
"epoch": 0.48,
"grad_norm": 1.125,
"learning_rate": 1.9933333333333334e-05,
"loss": 0.7896,
"mean_token_accuracy": 0.8094950247555971,
"num_tokens": 10025864.0,
"step": 300
},
{
"epoch": 0.48,
"eval_biology_entropy": 2.1021561794281007,
"eval_biology_loss": 2.546415090560913,
"eval_biology_mean_token_accuracy": 0.5616441056728363,
"eval_biology_num_tokens": 10025864.0,
"eval_biology_runtime": 38.8255,
"eval_biology_samples_per_second": 12.878,
"eval_biology_steps_per_second": 3.22,
"step": 300
},
{
"epoch": 0.48,
"eval_chemistry_entropy": 0.9578249802589417,
"eval_chemistry_loss": 1.1104036569595337,
"eval_chemistry_mean_token_accuracy": 0.7582236580848694,
"eval_chemistry_num_tokens": 10025864.0,
"eval_chemistry_runtime": 48.5149,
"eval_chemistry_samples_per_second": 10.306,
"eval_chemistry_steps_per_second": 2.577,
"step": 300
},
{
"epoch": 0.48,
"eval_math_entropy": 0.7676350421905518,
"eval_math_loss": 1.0549193620681763,
"eval_math_mean_token_accuracy": 0.7745008988380432,
"eval_math_num_tokens": 10025864.0,
"eval_math_runtime": 49.8707,
"eval_math_samples_per_second": 10.026,
"eval_math_steps_per_second": 2.506,
"step": 300
},
{
"epoch": 0.48,
"eval_physics_entropy": 0.7307665984630585,
"eval_physics_loss": 0.8242064118385315,
"eval_physics_mean_token_accuracy": 0.8064617681503295,
"eval_physics_num_tokens": 10025864.0,
"eval_physics_runtime": 57.1693,
"eval_physics_samples_per_second": 8.746,
"eval_physics_steps_per_second": 2.186,
"step": 300
},
{
"entropy": 0.7168749757111073,
"epoch": 0.496,
"grad_norm": 0.9921875,
"learning_rate": 1.9933333333333334e-05,
"loss": 0.8032,
"mean_token_accuracy": 0.8051175128668546,
"num_tokens": 10362573.0,
"step": 310
},
{
"entropy": 0.6993480321019888,
"epoch": 0.512,
"grad_norm": 1.15625,
"learning_rate": 1.985925925925926e-05,
"loss": 0.7903,
"mean_token_accuracy": 0.8099954195320607,
"num_tokens": 10694440.0,
"step": 320
},
{
"entropy": 0.7138712629675865,
"epoch": 0.528,
"grad_norm": 1.046875,
"learning_rate": 1.9785185185185187e-05,
"loss": 0.8101,
"mean_token_accuracy": 0.805506169050932,
"num_tokens": 11022357.0,
"step": 330
},
{
"entropy": 0.6928766580298543,
"epoch": 0.544,
"grad_norm": 1.0546875,
"learning_rate": 1.971111111111111e-05,
"loss": 0.7916,
"mean_token_accuracy": 0.8087165944278241,
"num_tokens": 11345911.0,
"step": 340
},
{
"entropy": 0.682861409150064,
"epoch": 0.56,
"grad_norm": 0.8984375,
"learning_rate": 1.963703703703704e-05,
"loss": 0.7735,
"mean_token_accuracy": 0.8126827124506235,
"num_tokens": 11687225.0,
"step": 350
},
{
"entropy": 0.6824749782681465,
"epoch": 0.576,
"grad_norm": 1.015625,
"learning_rate": 1.9562962962962964e-05,
"loss": 0.7723,
"mean_token_accuracy": 0.8146394658833742,
"num_tokens": 12015720.0,
"step": 360
},
{
"entropy": 0.6805058639496565,
"epoch": 0.592,
"grad_norm": 1.2109375,
"learning_rate": 1.948888888888889e-05,
"loss": 0.776,
"mean_token_accuracy": 0.811661035567522,
"num_tokens": 12343393.0,
"step": 370
},
{
"entropy": 0.6700849516317249,
"epoch": 0.608,
"grad_norm": 0.9921875,
"learning_rate": 1.9414814814814817e-05,
"loss": 0.7658,
"mean_token_accuracy": 0.8141583666205406,
"num_tokens": 12671474.0,
"step": 380
},
{
"entropy": 0.6893410481512546,
"epoch": 0.624,
"grad_norm": 0.98046875,
"learning_rate": 1.9340740740740743e-05,
"loss": 0.7782,
"mean_token_accuracy": 0.8111887093633413,
"num_tokens": 12996077.0,
"step": 390
},
{
"entropy": 0.6852772971615195,
"epoch": 0.64,
"grad_norm": 0.9609375,
"learning_rate": 1.926666666666667e-05,
"loss": 0.7799,
"mean_token_accuracy": 0.8112590182572603,
"num_tokens": 13325291.0,
"step": 400
},
{
"epoch": 0.64,
"eval_biology_entropy": 2.0124985208511355,
"eval_biology_loss": 2.4155702590942383,
"eval_biology_mean_token_accuracy": 0.5735694291591644,
"eval_biology_num_tokens": 13325291.0,
"eval_biology_runtime": 38.6667,
"eval_biology_samples_per_second": 12.931,
"eval_biology_steps_per_second": 3.233,
"step": 400
},
{
"epoch": 0.64,
"eval_chemistry_entropy": 0.8911373369693756,
"eval_chemistry_loss": 1.0692609548568726,
"eval_chemistry_mean_token_accuracy": 0.7647108516693115,
"eval_chemistry_num_tokens": 13325291.0,
"eval_chemistry_runtime": 48.2089,
"eval_chemistry_samples_per_second": 10.372,
"eval_chemistry_steps_per_second": 2.593,
"step": 400
},
{
"epoch": 0.64,
"eval_math_entropy": 0.7330298454761505,
"eval_math_loss": 1.0336663722991943,
"eval_math_mean_token_accuracy": 0.7786638278961182,
"eval_math_num_tokens": 13325291.0,
"eval_math_runtime": 49.7905,
"eval_math_samples_per_second": 10.042,
"eval_math_steps_per_second": 2.511,
"step": 400
},
{
"epoch": 0.64,
"eval_physics_entropy": 0.681670261144638,
"eval_physics_loss": 0.7883204817771912,
"eval_physics_mean_token_accuracy": 0.8129381031990052,
"eval_physics_num_tokens": 13325291.0,
"eval_physics_runtime": 57.2563,
"eval_physics_samples_per_second": 8.733,
"eval_physics_steps_per_second": 2.183,
"step": 400
},
{
"entropy": 0.6723099524155259,
"epoch": 0.656,
"grad_norm": 0.953125,
"learning_rate": 1.9192592592592593e-05,
"loss": 0.757,
"mean_token_accuracy": 0.8151150114834309,
"num_tokens": 13671434.0,
"step": 410
},
{
"entropy": 0.6621538577601314,
"epoch": 0.672,
"grad_norm": 1.0234375,
"learning_rate": 1.911851851851852e-05,
"loss": 0.7563,
"mean_token_accuracy": 0.8172798678278923,
"num_tokens": 13993668.0,
"step": 420
},
{
"entropy": 0.6673029117286206,
"epoch": 0.688,
"grad_norm": 0.921875,
"learning_rate": 1.9044444444444446e-05,
"loss": 0.7529,
"mean_token_accuracy": 0.8164214458316564,
"num_tokens": 14334907.0,
"step": 430
},
{
"entropy": 0.6810694945976138,
"epoch": 0.704,
"grad_norm": 1.046875,
"learning_rate": 1.8970370370370372e-05,
"loss": 0.7695,
"mean_token_accuracy": 0.8127802673727273,
"num_tokens": 14670639.0,
"step": 440
},
{
"entropy": 0.6582699194550514,
"epoch": 0.72,
"grad_norm": 1.0,
"learning_rate": 1.8896296296296295e-05,
"loss": 0.7544,
"mean_token_accuracy": 0.8171826928853989,
"num_tokens": 14987995.0,
"step": 450
},
{
"entropy": 0.6564427128061652,
"epoch": 0.736,
"grad_norm": 1.078125,
"learning_rate": 1.8822222222222225e-05,
"loss": 0.7446,
"mean_token_accuracy": 0.8173684533685446,
"num_tokens": 15324659.0,
"step": 460
},
{
"entropy": 0.6495672106742859,
"epoch": 0.752,
"grad_norm": 0.94921875,
"learning_rate": 1.874814814814815e-05,
"loss": 0.7416,
"mean_token_accuracy": 0.8195757914334536,
"num_tokens": 15657342.0,
"step": 470
},
{
"entropy": 0.6738376861438156,
"epoch": 0.768,
"grad_norm": 0.95703125,
"learning_rate": 1.8674074074074075e-05,
"loss": 0.7642,
"mean_token_accuracy": 0.8124835971742869,
"num_tokens": 15988993.0,
"step": 480
},
{
"entropy": 0.6495399951934815,
"epoch": 0.784,
"grad_norm": 0.9609375,
"learning_rate": 1.86e-05,
"loss": 0.7324,
"mean_token_accuracy": 0.8208612345159054,
"num_tokens": 16333857.0,
"step": 490
},
{
"entropy": 0.6567428983747959,
"epoch": 0.8,
"grad_norm": 0.99609375,
"learning_rate": 1.8525925925925928e-05,
"loss": 0.7488,
"mean_token_accuracy": 0.817277068644762,
"num_tokens": 16664690.0,
"step": 500
},
{
"epoch": 0.8,
"eval_biology_entropy": 2.039887357711792,
"eval_biology_loss": 2.4157471656799316,
"eval_biology_mean_token_accuracy": 0.5770127189159393,
"eval_biology_num_tokens": 16664690.0,
"eval_biology_runtime": 38.6815,
"eval_biology_samples_per_second": 12.926,
"eval_biology_steps_per_second": 3.232,
"step": 500
},
{
"epoch": 0.8,
"eval_chemistry_entropy": 0.8955893061161041,
"eval_chemistry_loss": 1.048737645149231,
"eval_chemistry_mean_token_accuracy": 0.768093888759613,
"eval_chemistry_num_tokens": 16664690.0,
"eval_chemistry_runtime": 48.42,
"eval_chemistry_samples_per_second": 10.326,
"eval_chemistry_steps_per_second": 2.582,
"step": 500
},
{
"epoch": 0.8,
"eval_math_entropy": 0.7318526220321655,
"eval_math_loss": 1.0199034214019775,
"eval_math_mean_token_accuracy": 0.7799897599220276,
"eval_math_num_tokens": 16664690.0,
"eval_math_runtime": 49.8882,
"eval_math_samples_per_second": 10.022,
"eval_math_steps_per_second": 2.506,
"step": 500
},
{
"epoch": 0.8,
"eval_physics_entropy": 0.6712099099159241,
"eval_physics_loss": 0.7689476609230042,
"eval_physics_mean_token_accuracy": 0.8163677668571472,
"eval_physics_num_tokens": 16664690.0,
"eval_physics_runtime": 57.4517,
"eval_physics_samples_per_second": 8.703,
"eval_physics_steps_per_second": 2.176,
"step": 500
},
{
"entropy": 0.6573688389733434,
"epoch": 0.816,
"grad_norm": 0.96875,
"learning_rate": 1.8451851851851855e-05,
"loss": 0.7477,
"mean_token_accuracy": 0.8164514761418105,
"num_tokens": 16989307.0,
"step": 510
},
{
"entropy": 0.6648645078763366,
"epoch": 0.832,
"grad_norm": 0.9609375,
"learning_rate": 1.8377777777777778e-05,
"loss": 0.749,
"mean_token_accuracy": 0.8170726090669632,
"num_tokens": 17317524.0,
"step": 520
},
{
"entropy": 0.6526606786996126,
"epoch": 0.848,
"grad_norm": 0.8515625,
"learning_rate": 1.8303703703703704e-05,
"loss": 0.7388,
"mean_token_accuracy": 0.8195879191160202,
"num_tokens": 17650424.0,
"step": 530
},
{
"entropy": 0.6565553491935134,
"epoch": 0.864,
"grad_norm": 0.8984375,
"learning_rate": 1.822962962962963e-05,
"loss": 0.7464,
"mean_token_accuracy": 0.8177060768008232,
"num_tokens": 17985547.0,
"step": 540
},
{
"entropy": 0.6623865978792309,
"epoch": 0.88,
"grad_norm": 0.9453125,
"learning_rate": 1.8155555555555557e-05,
"loss": 0.7495,
"mean_token_accuracy": 0.8162953305989504,
"num_tokens": 18315392.0,
"step": 550
},
{
"entropy": 0.6179373754188419,
"epoch": 0.896,
"grad_norm": 0.8359375,
"learning_rate": 1.8081481481481484e-05,
"loss": 0.7023,
"mean_token_accuracy": 0.8264330130070447,
"num_tokens": 18667124.0,
"step": 560
},
{
"entropy": 0.6555242039263248,
"epoch": 0.912,
"grad_norm": 0.8984375,
"learning_rate": 1.800740740740741e-05,
"loss": 0.7393,
"mean_token_accuracy": 0.8174499638378621,
"num_tokens": 19002186.0,
"step": 570
},
{
"entropy": 0.6485247412696481,
"epoch": 0.928,
"grad_norm": 0.98046875,
"learning_rate": 1.7933333333333333e-05,
"loss": 0.7363,
"mean_token_accuracy": 0.8195535041391849,
"num_tokens": 19335013.0,
"step": 580
},
{
"entropy": 0.63772834520787,
"epoch": 0.944,
"grad_norm": 0.984375,
"learning_rate": 1.785925925925926e-05,
"loss": 0.7269,
"mean_token_accuracy": 0.821834321692586,
"num_tokens": 19662733.0,
"step": 590
},
{
"entropy": 0.6372630735859275,
"epoch": 0.96,
"grad_norm": 0.99609375,
"learning_rate": 1.7785185185185186e-05,
"loss": 0.7318,
"mean_token_accuracy": 0.8204116970300674,
"num_tokens": 19996789.0,
"step": 600
},
{
"epoch": 0.96,
"eval_biology_entropy": 2.0030080833435058,
"eval_biology_loss": 2.361283302307129,
"eval_biology_mean_token_accuracy": 0.5793048655986786,
"eval_biology_num_tokens": 19996789.0,
"eval_biology_runtime": 38.6981,
"eval_biology_samples_per_second": 12.921,
"eval_biology_steps_per_second": 3.23,
"step": 600
},
{
"epoch": 0.96,
"eval_chemistry_entropy": 0.8840513801574708,
"eval_chemistry_loss": 1.0323643684387207,
"eval_chemistry_mean_token_accuracy": 0.7703397722244263,
"eval_chemistry_num_tokens": 19996789.0,
"eval_chemistry_runtime": 48.2039,
"eval_chemistry_samples_per_second": 10.373,
"eval_chemistry_steps_per_second": 2.593,
"step": 600
},
{
"epoch": 0.96,
"eval_math_entropy": 0.7270034260749817,
"eval_math_loss": 1.009481430053711,
"eval_math_mean_token_accuracy": 0.7820751585960388,
"eval_math_num_tokens": 19996789.0,
"eval_math_runtime": 49.571,
"eval_math_samples_per_second": 10.087,
"eval_math_steps_per_second": 2.522,
"step": 600
},
{
"epoch": 0.96,
"eval_physics_entropy": 0.6698693735599518,
"eval_physics_loss": 0.7564050555229187,
"eval_physics_mean_token_accuracy": 0.8185040464401245,
"eval_physics_num_tokens": 19996789.0,
"eval_physics_runtime": 56.9517,
"eval_physics_samples_per_second": 8.779,
"eval_physics_steps_per_second": 2.195,
"step": 600
},
{
"entropy": 0.6384279150515795,
"epoch": 0.976,
"grad_norm": 1.0234375,
"learning_rate": 1.7711111111111113e-05,
"loss": 0.7283,
"mean_token_accuracy": 0.82187210470438,
"num_tokens": 20332636.0,
"step": 610
},
{
"entropy": 0.6450063675642014,
"epoch": 0.992,
"grad_norm": 0.8515625,
"learning_rate": 1.763703703703704e-05,
"loss": 0.7272,
"mean_token_accuracy": 0.8209770727902651,
"num_tokens": 20662209.0,
"step": 620
},
{
"entropy": 0.6167608626186848,
"epoch": 1.008,
"grad_norm": 0.8984375,
"learning_rate": 1.7562962962962962e-05,
"loss": 0.7008,
"mean_token_accuracy": 0.825862829759717,
"num_tokens": 21005731.0,
"step": 630
},
{
"entropy": 0.6117491278797388,
"epoch": 1.024,
"grad_norm": 0.93359375,
"learning_rate": 1.7488888888888892e-05,
"loss": 0.7058,
"mean_token_accuracy": 0.8266724064946175,
"num_tokens": 21335407.0,
"step": 640
},
{
"entropy": 0.6068670526146889,
"epoch": 1.04,
"grad_norm": 0.97265625,
"learning_rate": 1.7414814814814815e-05,
"loss": 0.6989,
"mean_token_accuracy": 0.8283324401825667,
"num_tokens": 21668490.0,
"step": 650
},
{
"entropy": 0.601451874896884,
"epoch": 1.056,
"grad_norm": 0.95703125,
"learning_rate": 1.7340740740740742e-05,
"loss": 0.6864,
"mean_token_accuracy": 0.829675118252635,
"num_tokens": 22006039.0,
"step": 660
},
{
"entropy": 0.6137524953112006,
"epoch": 1.072,
"grad_norm": 0.94921875,
"learning_rate": 1.726666666666667e-05,
"loss": 0.7008,
"mean_token_accuracy": 0.8262542523443699,
"num_tokens": 22348206.0,
"step": 670
},
{
"entropy": 0.6093455260619521,
"epoch": 1.088,
"grad_norm": 0.96875,
"learning_rate": 1.7192592592592595e-05,
"loss": 0.6956,
"mean_token_accuracy": 0.8260304640978575,
"num_tokens": 22676163.0,
"step": 680
},
{
"entropy": 0.6026462253183127,
"epoch": 1.104,
"grad_norm": 0.87109375,
"learning_rate": 1.711851851851852e-05,
"loss": 0.6967,
"mean_token_accuracy": 0.8273924000561237,
"num_tokens": 23009279.0,
"step": 690
},
{
"entropy": 0.6122452523559332,
"epoch": 1.12,
"grad_norm": 0.9453125,
"learning_rate": 1.7044444444444445e-05,
"loss": 0.7,
"mean_token_accuracy": 0.8257350366562605,
"num_tokens": 23346828.0,
"step": 700
},
{
"epoch": 1.12,
"eval_biology_entropy": 1.8616097135543823,
"eval_biology_loss": 2.4014246463775635,
"eval_biology_mean_token_accuracy": 0.578062358379364,
"eval_biology_num_tokens": 23346828.0,
"eval_biology_runtime": 38.9662,
"eval_biology_samples_per_second": 12.832,
"eval_biology_steps_per_second": 3.208,
"step": 700
},
{
"epoch": 1.12,
"eval_chemistry_entropy": 0.8116880433559418,
"eval_chemistry_loss": 1.0317949056625366,
"eval_chemistry_mean_token_accuracy": 0.7715164208412171,
"eval_chemistry_num_tokens": 23346828.0,
"eval_chemistry_runtime": 48.496,
"eval_chemistry_samples_per_second": 10.31,
"eval_chemistry_steps_per_second": 2.578,
"step": 700
},
{
"epoch": 1.12,
"eval_math_entropy": 0.6895896532535553,
"eval_math_loss": 1.0153776407241821,
"eval_math_mean_token_accuracy": 0.7820956745147705,
"eval_math_num_tokens": 23346828.0,
"eval_math_runtime": 49.7815,
"eval_math_samples_per_second": 10.044,
"eval_math_steps_per_second": 2.511,
"step": 700
},
{
"epoch": 1.12,
"eval_physics_entropy": 0.6192426791191101,
"eval_physics_loss": 0.7497905492782593,
"eval_physics_mean_token_accuracy": 0.8200045080184937,
"eval_physics_num_tokens": 23346828.0,
"eval_physics_runtime": 57.1287,
"eval_physics_samples_per_second": 8.752,
"eval_physics_steps_per_second": 2.188,
"step": 700
},
{
"entropy": 0.588855667039752,
"epoch": 1.1360000000000001,
"grad_norm": 0.9375,
"learning_rate": 1.697037037037037e-05,
"loss": 0.6772,
"mean_token_accuracy": 0.8326364874839782,
"num_tokens": 23689161.0,
"step": 710
},
{
"entropy": 0.6099351227283478,
"epoch": 1.152,
"grad_norm": 1.0390625,
"learning_rate": 1.6896296296296298e-05,
"loss": 0.7039,
"mean_token_accuracy": 0.8282962709665298,
"num_tokens": 24016439.0,
"step": 720
},
{
"entropy": 0.6140455640852451,
"epoch": 1.168,
"grad_norm": 0.94921875,
"learning_rate": 1.6822222222222224e-05,
"loss": 0.7027,
"mean_token_accuracy": 0.8253309100866317,
"num_tokens": 24344431.0,
"step": 730
},
{
"entropy": 0.6041248327121138,
"epoch": 1.184,
"grad_norm": 1.09375,
"learning_rate": 1.6748148148148147e-05,
"loss": 0.6955,
"mean_token_accuracy": 0.8286070462316275,
"num_tokens": 24668087.0,
"step": 740
},
{
"entropy": 0.5895567566156388,
"epoch": 1.2,
"grad_norm": 0.9140625,
"learning_rate": 1.6674074074074077e-05,
"loss": 0.6822,
"mean_token_accuracy": 0.8305550657212735,
"num_tokens": 25000629.0,
"step": 750
},
{
"entropy": 0.6073450578376651,
"epoch": 1.216,
"grad_norm": 1.1875,
"learning_rate": 1.66e-05,
"loss": 0.7023,
"mean_token_accuracy": 0.8261869914829731,
"num_tokens": 25324324.0,
"step": 760
},
{
"entropy": 0.6059388216584921,
"epoch": 1.232,
"grad_norm": 0.9453125,
"learning_rate": 1.6525925925925927e-05,
"loss": 0.6863,
"mean_token_accuracy": 0.8298216536641121,
"num_tokens": 25658795.0,
"step": 770
},
{
"entropy": 0.5957553267478943,
"epoch": 1.248,
"grad_norm": 0.92578125,
"learning_rate": 1.6451851851851853e-05,
"loss": 0.6864,
"mean_token_accuracy": 0.830686765909195,
"num_tokens": 25993177.0,
"step": 780
},
{
"entropy": 0.5959657493978738,
"epoch": 1.264,
"grad_norm": 1.0234375,
"learning_rate": 1.637777777777778e-05,
"loss": 0.6826,
"mean_token_accuracy": 0.8305899318307638,
"num_tokens": 26329726.0,
"step": 790
},
{
"entropy": 0.6204709148034453,
"epoch": 1.28,
"grad_norm": 1.0078125,
"learning_rate": 1.6303703703703706e-05,
"loss": 0.7071,
"mean_token_accuracy": 0.8238335218280554,
"num_tokens": 26664093.0,
"step": 800
},
{
"epoch": 1.28,
"eval_biology_entropy": 1.7922346639633178,
"eval_biology_loss": 2.4233291149139404,
"eval_biology_mean_token_accuracy": 0.5778127768039704,
"eval_biology_num_tokens": 26664093.0,
"eval_biology_runtime": 39.1057,
"eval_biology_samples_per_second": 12.786,
"eval_biology_steps_per_second": 3.196,
"step": 800
},
{
"epoch": 1.28,
"eval_chemistry_entropy": 0.7962674243450165,
"eval_chemistry_loss": 1.0286256074905396,
"eval_chemistry_mean_token_accuracy": 0.7727626013755798,
"eval_chemistry_num_tokens": 26664093.0,
"eval_chemistry_runtime": 48.3131,
"eval_chemistry_samples_per_second": 10.349,
"eval_chemistry_steps_per_second": 2.587,
"step": 800
},
{
"epoch": 1.28,
"eval_math_entropy": 0.6702238636016846,
"eval_math_loss": 1.0157709121704102,
"eval_math_mean_token_accuracy": 0.7829344019889831,
"eval_math_num_tokens": 26664093.0,
"eval_math_runtime": 49.7647,
"eval_math_samples_per_second": 10.047,
"eval_math_steps_per_second": 2.512,
"step": 800
},
{
"epoch": 1.28,
"eval_physics_entropy": 0.6072953283786774,
"eval_physics_loss": 0.7434503436088562,
"eval_physics_mean_token_accuracy": 0.8212288799285888,
"eval_physics_num_tokens": 26664093.0,
"eval_physics_runtime": 57.1238,
"eval_physics_samples_per_second": 8.753,
"eval_physics_steps_per_second": 2.188,
"step": 800
},
{
"entropy": 0.593550406768918,
"epoch": 1.296,
"grad_norm": 0.984375,
"learning_rate": 1.622962962962963e-05,
"loss": 0.6825,
"mean_token_accuracy": 0.8320997886359691,
"num_tokens": 26994877.0,
"step": 810
},
{
"entropy": 0.6156477816402912,
"epoch": 1.312,
"grad_norm": 1.046875,
"learning_rate": 1.6155555555555556e-05,
"loss": 0.703,
"mean_token_accuracy": 0.8259445391595364,
"num_tokens": 27328198.0,
"step": 820
},
{
"entropy": 0.5969830378890038,
"epoch": 1.328,
"grad_norm": 1.0078125,
"learning_rate": 1.6081481481481482e-05,
"loss": 0.6901,
"mean_token_accuracy": 0.8290002550929785,
"num_tokens": 27658818.0,
"step": 830
},
{
"entropy": 0.5997996777296066,
"epoch": 1.3439999999999999,
"grad_norm": 1.1015625,
"learning_rate": 1.600740740740741e-05,
"loss": 0.6904,
"mean_token_accuracy": 0.8284455709159374,
"num_tokens": 27991108.0,
"step": 840
},
{
"entropy": 0.6052390130236744,
"epoch": 1.3599999999999999,
"grad_norm": 0.94921875,
"learning_rate": 1.5933333333333336e-05,
"loss": 0.6852,
"mean_token_accuracy": 0.8274706263095141,
"num_tokens": 28327775.0,
"step": 850
},
{
"entropy": 0.6018602728843689,
"epoch": 1.376,
"grad_norm": 1.109375,
"learning_rate": 1.5859259259259262e-05,
"loss": 0.6827,
"mean_token_accuracy": 0.8304568257182836,
"num_tokens": 28666116.0,
"step": 860
},
{
"entropy": 0.5851238770410419,
"epoch": 1.392,
"grad_norm": 0.98828125,
"learning_rate": 1.5785185185185185e-05,
"loss": 0.6746,
"mean_token_accuracy": 0.8333170894533396,
"num_tokens": 29004293.0,
"step": 870
},
{
"entropy": 0.58795285820961,
"epoch": 1.408,
"grad_norm": 0.96875,
"learning_rate": 1.571111111111111e-05,
"loss": 0.6857,
"mean_token_accuracy": 0.8305332105606794,
"num_tokens": 29330131.0,
"step": 880
},
{
"entropy": 0.6189510561525822,
"epoch": 1.424,
"grad_norm": 0.90234375,
"learning_rate": 1.5637037037037038e-05,
"loss": 0.7072,
"mean_token_accuracy": 0.8251793116331101,
"num_tokens": 29661850.0,
"step": 890
},
{
"entropy": 0.5798567572608591,
"epoch": 1.44,
"grad_norm": 0.90625,
"learning_rate": 1.5562962962962965e-05,
"loss": 0.6695,
"mean_token_accuracy": 0.8342564977705479,
"num_tokens": 29999392.0,
"step": 900
},
{
"epoch": 1.44,
"eval_biology_entropy": 1.8147089042663573,
"eval_biology_loss": 2.417410135269165,
"eval_biology_mean_token_accuracy": 0.5787473826408386,
"eval_biology_num_tokens": 29999392.0,
"eval_biology_runtime": 38.9072,
"eval_biology_samples_per_second": 12.851,
"eval_biology_steps_per_second": 3.213,
"step": 900
},
{
"epoch": 1.44,
"eval_chemistry_entropy": 0.7944381227493286,
"eval_chemistry_loss": 1.0228049755096436,
"eval_chemistry_mean_token_accuracy": 0.7732948322296143,
"eval_chemistry_num_tokens": 29999392.0,
"eval_chemistry_runtime": 48.4429,
"eval_chemistry_samples_per_second": 10.321,
"eval_chemistry_steps_per_second": 2.58,
"step": 900
},
{
"epoch": 1.44,
"eval_math_entropy": 0.6757729785442352,
"eval_math_loss": 1.0138108730316162,
"eval_math_mean_token_accuracy": 0.7829539861679077,
"eval_math_num_tokens": 29999392.0,
"eval_math_runtime": 49.813,
"eval_math_samples_per_second": 10.038,
"eval_math_steps_per_second": 2.509,
"step": 900
},
{
"epoch": 1.44,
"eval_physics_entropy": 0.607404308795929,
"eval_physics_loss": 0.737089991569519,
"eval_physics_mean_token_accuracy": 0.8226114134788514,
"eval_physics_num_tokens": 29999392.0,
"eval_physics_runtime": 57.3115,
"eval_physics_samples_per_second": 8.724,
"eval_physics_steps_per_second": 2.181,
"step": 900
},
{
"entropy": 0.5827234297990799,
"epoch": 1.456,
"grad_norm": 0.8828125,
"learning_rate": 1.548888888888889e-05,
"loss": 0.6728,
"mean_token_accuracy": 0.8342432040721178,
"num_tokens": 30341004.0,
"step": 910
},
{
"entropy": 0.613773494027555,
"epoch": 1.472,
"grad_norm": 1.015625,
"learning_rate": 1.5414814814814814e-05,
"loss": 0.7035,
"mean_token_accuracy": 0.8268327709287405,
"num_tokens": 30667689.0,
"step": 920
},
{
"entropy": 0.5988023646175862,
"epoch": 1.488,
"grad_norm": 0.93359375,
"learning_rate": 1.5340740740740744e-05,
"loss": 0.6897,
"mean_token_accuracy": 0.8293631616979837,
"num_tokens": 30999697.0,
"step": 930
},
{
"entropy": 0.5896453000605106,
"epoch": 1.504,
"grad_norm": 0.8828125,
"learning_rate": 1.5266666666666667e-05,
"loss": 0.6728,
"mean_token_accuracy": 0.8324251122772693,
"num_tokens": 31332775.0,
"step": 940
},
{
"entropy": 0.605005569756031,
"epoch": 1.52,
"grad_norm": 1.0390625,
"learning_rate": 1.5192592592592594e-05,
"loss": 0.6942,
"mean_token_accuracy": 0.8275358382612467,
"num_tokens": 31666521.0,
"step": 950
},
{
"entropy": 0.5999037871137262,
"epoch": 1.536,
"grad_norm": 1.0390625,
"learning_rate": 1.5118518518518519e-05,
"loss": 0.6904,
"mean_token_accuracy": 0.8276842717081309,
"num_tokens": 31998232.0,
"step": 960
},
{
"entropy": 0.5867987772449851,
"epoch": 1.552,
"grad_norm": 1.0703125,
"learning_rate": 1.5044444444444445e-05,
"loss": 0.6754,
"mean_token_accuracy": 0.8337812848389149,
"num_tokens": 32328085.0,
"step": 970
},
{
"entropy": 0.5942975046113134,
"epoch": 1.568,
"grad_norm": 1.0,
"learning_rate": 1.497037037037037e-05,
"loss": 0.6807,
"mean_token_accuracy": 0.831771444156766,
"num_tokens": 32656603.0,
"step": 980
},
{
"entropy": 0.6016290852800011,
"epoch": 1.584,
"grad_norm": 0.98828125,
"learning_rate": 1.4896296296296298e-05,
"loss": 0.6947,
"mean_token_accuracy": 0.8264373868703843,
"num_tokens": 32983769.0,
"step": 990
},
{
"entropy": 0.5999371835961937,
"epoch": 1.6,
"grad_norm": 0.92578125,
"learning_rate": 1.4822222222222225e-05,
"loss": 0.6882,
"mean_token_accuracy": 0.8293469067662954,
"num_tokens": 33309282.0,
"step": 1000
},
{
"epoch": 1.6,
"eval_biology_entropy": 1.788435049057007,
"eval_biology_loss": 2.369239330291748,
"eval_biology_mean_token_accuracy": 0.582524644613266,
"eval_biology_num_tokens": 33309282.0,
"eval_biology_runtime": 38.8968,
"eval_biology_samples_per_second": 12.855,
"eval_biology_steps_per_second": 3.214,
"step": 1000
},
{
"epoch": 1.6,
"eval_chemistry_entropy": 0.7873289968967437,
"eval_chemistry_loss": 1.0177081823349,
"eval_chemistry_mean_token_accuracy": 0.774118812084198,
"eval_chemistry_num_tokens": 33309282.0,
"eval_chemistry_runtime": 48.3649,
"eval_chemistry_samples_per_second": 10.338,
"eval_chemistry_steps_per_second": 2.585,
"step": 1000
},
{
"epoch": 1.6,
"eval_math_entropy": 0.6668027784824372,
"eval_math_loss": 1.0126802921295166,
"eval_math_mean_token_accuracy": 0.7838437123298645,
"eval_math_num_tokens": 33309282.0,
"eval_math_runtime": 49.8108,
"eval_math_samples_per_second": 10.038,
"eval_math_steps_per_second": 2.509,
"step": 1000
},
{
"epoch": 1.6,
"eval_physics_entropy": 0.5990195829868317,
"eval_physics_loss": 0.7323749661445618,
"eval_physics_mean_token_accuracy": 0.8238483490943909,
"eval_physics_num_tokens": 33309282.0,
"eval_physics_runtime": 57.3705,
"eval_physics_samples_per_second": 8.715,
"eval_physics_steps_per_second": 2.179,
"step": 1000
},
{
"entropy": 0.5766303434967994,
"epoch": 1.616,
"grad_norm": 1.0,
"learning_rate": 1.474814814814815e-05,
"loss": 0.6687,
"mean_token_accuracy": 0.8350894570350647,
"num_tokens": 33647616.0,
"step": 1010
},
{
"entropy": 0.5906545480713248,
"epoch": 1.6320000000000001,
"grad_norm": 1.0390625,
"learning_rate": 1.4674074074074076e-05,
"loss": 0.6798,
"mean_token_accuracy": 0.8322980519384146,
"num_tokens": 33977586.0,
"step": 1020
},
{
"entropy": 0.5677616313099861,
"epoch": 1.6480000000000001,
"grad_norm": 0.98828125,
"learning_rate": 1.46e-05,
"loss": 0.658,
"mean_token_accuracy": 0.8373314294964075,
"num_tokens": 34312436.0,
"step": 1030
},
{
"entropy": 0.5948906594887375,
"epoch": 1.6640000000000001,
"grad_norm": 1.109375,
"learning_rate": 1.4525925925925927e-05,
"loss": 0.682,
"mean_token_accuracy": 0.8317620534449816,
"num_tokens": 34641549.0,
"step": 1040
},
{
"entropy": 0.5764713797718286,
"epoch": 1.6800000000000002,
"grad_norm": 0.94921875,
"learning_rate": 1.4451851851851852e-05,
"loss": 0.6715,
"mean_token_accuracy": 0.8332184217870235,
"num_tokens": 34977031.0,
"step": 1050
},
{
"entropy": 0.5952808676287532,
"epoch": 1.696,
"grad_norm": 0.99609375,
"learning_rate": 1.4377777777777779e-05,
"loss": 0.6834,
"mean_token_accuracy": 0.8309958126395941,
"num_tokens": 35300962.0,
"step": 1060
},
{
"entropy": 0.5965396504849195,
"epoch": 1.712,
"grad_norm": 1.046875,
"learning_rate": 1.4303703703703703e-05,
"loss": 0.6786,
"mean_token_accuracy": 0.829968997463584,
"num_tokens": 35642662.0,
"step": 1070
},
{
"entropy": 0.5757137715816498,
"epoch": 1.728,
"grad_norm": 0.9765625,
"learning_rate": 1.4229629629629632e-05,
"loss": 0.6695,
"mean_token_accuracy": 0.833732133358717,
"num_tokens": 35980993.0,
"step": 1080
},
{
"entropy": 0.5878799825906753,
"epoch": 1.744,
"grad_norm": 1.03125,
"learning_rate": 1.4155555555555556e-05,
"loss": 0.6717,
"mean_token_accuracy": 0.8321238547563553,
"num_tokens": 36326797.0,
"step": 1090
},
{
"entropy": 0.577763288281858,
"epoch": 1.76,
"grad_norm": 1.015625,
"learning_rate": 1.4081481481481483e-05,
"loss": 0.6698,
"mean_token_accuracy": 0.8338598430156707,
"num_tokens": 36654797.0,
"step": 1100
},
{
"epoch": 1.76,
"eval_biology_entropy": 1.76575110912323,
"eval_biology_loss": 2.379521369934082,
"eval_biology_mean_token_accuracy": 0.5822040309906006,
"eval_biology_num_tokens": 36654797.0,
"eval_biology_runtime": 38.7201,
"eval_biology_samples_per_second": 12.913,
"eval_biology_steps_per_second": 3.228,
"step": 1100
},
{
"epoch": 1.76,
"eval_chemistry_entropy": 0.7854319989681244,
"eval_chemistry_loss": 1.01557457447052,
"eval_chemistry_mean_token_accuracy": 0.7744541010856628,
"eval_chemistry_num_tokens": 36654797.0,
"eval_chemistry_runtime": 48.2464,
"eval_chemistry_samples_per_second": 10.363,
"eval_chemistry_steps_per_second": 2.591,
"step": 1100
},
{
"epoch": 1.76,
"eval_math_entropy": 0.6641156001091003,
"eval_math_loss": 1.0110862255096436,
"eval_math_mean_token_accuracy": 0.783713164806366,
"eval_math_num_tokens": 36654797.0,
"eval_math_runtime": 49.5907,
"eval_math_samples_per_second": 10.083,
"eval_math_steps_per_second": 2.521,
"step": 1100
},
{
"epoch": 1.76,
"eval_physics_entropy": 0.5953411047458649,
"eval_physics_loss": 0.7290456295013428,
"eval_physics_mean_token_accuracy": 0.8248090887069702,
"eval_physics_num_tokens": 36654797.0,
"eval_physics_runtime": 57.3959,
"eval_physics_samples_per_second": 8.711,
"eval_physics_steps_per_second": 2.178,
"step": 1100
},
{
"entropy": 0.5980557221919298,
"epoch": 1.776,
"grad_norm": 0.95703125,
"learning_rate": 1.400740740740741e-05,
"loss": 0.6911,
"mean_token_accuracy": 0.8282759781926871,
"num_tokens": 36989310.0,
"step": 1110
},
{
"entropy": 0.5839062621816993,
"epoch": 1.792,
"grad_norm": 1.0859375,
"learning_rate": 1.3933333333333334e-05,
"loss": 0.6708,
"mean_token_accuracy": 0.8331681247800589,
"num_tokens": 37318535.0,
"step": 1120
},
{
"entropy": 0.5731059337034822,
"epoch": 1.808,
"grad_norm": 1.015625,
"learning_rate": 1.385925925925926e-05,
"loss": 0.6585,
"mean_token_accuracy": 0.8363862674683332,
"num_tokens": 37658889.0,
"step": 1130
},
{
"entropy": 0.5865196855738759,
"epoch": 1.8239999999999998,
"grad_norm": 0.984375,
"learning_rate": 1.3785185185185186e-05,
"loss": 0.6698,
"mean_token_accuracy": 0.8333104524761439,
"num_tokens": 37995198.0,
"step": 1140
},
{
"entropy": 0.5773211907595396,
"epoch": 1.8399999999999999,
"grad_norm": 1.0,
"learning_rate": 1.3711111111111112e-05,
"loss": 0.6726,
"mean_token_accuracy": 0.8339518435299397,
"num_tokens": 38325542.0,
"step": 1150
},
{
"entropy": 0.5834932073950767,
"epoch": 1.8559999999999999,
"grad_norm": 0.97265625,
"learning_rate": 1.3637037037037037e-05,
"loss": 0.6632,
"mean_token_accuracy": 0.8345976937562227,
"num_tokens": 38668148.0,
"step": 1160
},
{
"entropy": 0.5739704865962267,
"epoch": 1.8719999999999999,
"grad_norm": 0.95703125,
"learning_rate": 1.3562962962962965e-05,
"loss": 0.6687,
"mean_token_accuracy": 0.8333972290158271,
"num_tokens": 39001652.0,
"step": 1170
},
{
"entropy": 0.5951925914734602,
"epoch": 1.888,
"grad_norm": 1.09375,
"learning_rate": 1.3488888888888888e-05,
"loss": 0.682,
"mean_token_accuracy": 0.8303560864180326,
"num_tokens": 39333474.0,
"step": 1180
},
{
"entropy": 0.6042962603271007,
"epoch": 1.904,
"grad_norm": 1.0234375,
"learning_rate": 1.3414814814814817e-05,
"loss": 0.6915,
"mean_token_accuracy": 0.8288168527185917,
"num_tokens": 39663055.0,
"step": 1190
},
{
"entropy": 0.601089458540082,
"epoch": 1.92,
"grad_norm": 1.0234375,
"learning_rate": 1.3340740740740741e-05,
"loss": 0.6888,
"mean_token_accuracy": 0.8287999380379916,
"num_tokens": 39989824.0,
"step": 1200
},
{
"epoch": 1.92,
"eval_biology_entropy": 1.759890214920044,
"eval_biology_loss": 2.368649959564209,
"eval_biology_mean_token_accuracy": 0.5835101284980774,
"eval_biology_num_tokens": 39989824.0,
"eval_biology_runtime": 38.6774,
"eval_biology_samples_per_second": 12.927,
"eval_biology_steps_per_second": 3.232,
"step": 1200
},
{
"epoch": 1.92,
"eval_chemistry_entropy": 0.776658641576767,
"eval_chemistry_loss": 1.0122178792953491,
"eval_chemistry_mean_token_accuracy": 0.7752061448097229,
"eval_chemistry_num_tokens": 39989824.0,
"eval_chemistry_runtime": 48.1689,
"eval_chemistry_samples_per_second": 10.38,
"eval_chemistry_steps_per_second": 2.595,
"step": 1200
},
{
"epoch": 1.92,
"eval_math_entropy": 0.6634243364334107,
"eval_math_loss": 1.0101300477981567,
"eval_math_mean_token_accuracy": 0.7840519022941589,
"eval_math_num_tokens": 39989824.0,
"eval_math_runtime": 49.5286,
"eval_math_samples_per_second": 10.095,
"eval_math_steps_per_second": 2.524,
"step": 1200
},
{
"epoch": 1.92,
"eval_physics_entropy": 0.5912484722137451,
"eval_physics_loss": 0.7268282175064087,
"eval_physics_mean_token_accuracy": 0.825360511302948,
"eval_physics_num_tokens": 39989824.0,
"eval_physics_runtime": 56.9412,
"eval_physics_samples_per_second": 8.781,
"eval_physics_steps_per_second": 2.195,
"step": 1200
},
{
"entropy": 0.5859691947698593,
"epoch": 1.936,
"grad_norm": 0.96875,
"learning_rate": 1.3266666666666668e-05,
"loss": 0.676,
"mean_token_accuracy": 0.8319578696042299,
"num_tokens": 40325070.0,
"step": 1210
},
{
"entropy": 0.5632861316204071,
"epoch": 1.952,
"grad_norm": 1.0390625,
"learning_rate": 1.3192592592592594e-05,
"loss": 0.6508,
"mean_token_accuracy": 0.83807716332376,
"num_tokens": 40659614.0,
"step": 1220
},
{
"entropy": 0.5868391951546073,
"epoch": 1.968,
"grad_norm": 0.91796875,
"learning_rate": 1.311851851851852e-05,
"loss": 0.6756,
"mean_token_accuracy": 0.8323294088244438,
"num_tokens": 40991508.0,
"step": 1230
},
{
"entropy": 0.5867437845095992,
"epoch": 1.984,
"grad_norm": 1.078125,
"learning_rate": 1.3044444444444446e-05,
"loss": 0.676,
"mean_token_accuracy": 0.8312037277966737,
"num_tokens": 41328779.0,
"step": 1240
},
{
"entropy": 0.593030778504908,
"epoch": 2.0,
"grad_norm": 0.9765625,
"learning_rate": 1.297037037037037e-05,
"loss": 0.6752,
"mean_token_accuracy": 0.8317596733570098,
"num_tokens": 41664296.0,
"step": 1250
},
{
"entropy": 0.5428176861256361,
"epoch": 2.016,
"grad_norm": 0.91796875,
"learning_rate": 1.2896296296296299e-05,
"loss": 0.6304,
"mean_token_accuracy": 0.8421510916203261,
"num_tokens": 42001193.0,
"step": 1260
},
{
"entropy": 0.5524626910686493,
"epoch": 2.032,
"grad_norm": 1.046875,
"learning_rate": 1.2822222222222222e-05,
"loss": 0.633,
"mean_token_accuracy": 0.8409431543201208,
"num_tokens": 42340757.0,
"step": 1270
},
{
"entropy": 0.5630420710891485,
"epoch": 2.048,
"grad_norm": 1.0625,
"learning_rate": 1.274814814814815e-05,
"loss": 0.6561,
"mean_token_accuracy": 0.8348792966455221,
"num_tokens": 42670324.0,
"step": 1280
},
{
"entropy": 0.5547426689416171,
"epoch": 2.064,
"grad_norm": 1.015625,
"learning_rate": 1.2674074074074075e-05,
"loss": 0.6368,
"mean_token_accuracy": 0.8413027279078961,
"num_tokens": 43010231.0,
"step": 1290
},
{
"entropy": 0.5610322959721088,
"epoch": 2.08,
"grad_norm": 1.0,
"learning_rate": 1.2600000000000001e-05,
"loss": 0.6516,
"mean_token_accuracy": 0.8378075629472732,
"num_tokens": 43340227.0,
"step": 1300
},
{
"epoch": 2.08,
"eval_biology_entropy": 1.6757129163742066,
"eval_biology_loss": 2.434731960296631,
"eval_biology_mean_token_accuracy": 0.5802675273418426,
"eval_biology_num_tokens": 43340227.0,
"eval_biology_runtime": 38.7759,
"eval_biology_samples_per_second": 12.895,
"eval_biology_steps_per_second": 3.224,
"step": 1300
},
{
"epoch": 2.08,
"eval_chemistry_entropy": 0.7416200067996979,
"eval_chemistry_loss": 1.027290940284729,
"eval_chemistry_mean_token_accuracy": 0.774878448009491,
"eval_chemistry_num_tokens": 43340227.0,
"eval_chemistry_runtime": 48.262,
"eval_chemistry_samples_per_second": 10.36,
"eval_chemistry_steps_per_second": 2.59,
"step": 1300
},
{
"epoch": 2.08,
"eval_math_entropy": 0.6465173182487488,
"eval_math_loss": 1.023693561553955,
"eval_math_mean_token_accuracy": 0.7827271037101745,
"eval_math_num_tokens": 43340227.0,
"eval_math_runtime": 49.631,
"eval_math_samples_per_second": 10.074,
"eval_math_steps_per_second": 2.519,
"step": 1300
},
{
"epoch": 2.08,
"eval_physics_entropy": 0.5699445073604583,
"eval_physics_loss": 0.7285439372062683,
"eval_physics_mean_token_accuracy": 0.8254043416976928,
"eval_physics_num_tokens": 43340227.0,
"eval_physics_runtime": 57.0739,
"eval_physics_samples_per_second": 8.761,
"eval_physics_steps_per_second": 2.19,
"step": 1300
},
{
"entropy": 0.5561112010851502,
"epoch": 2.096,
"grad_norm": 1.03125,
"learning_rate": 1.2525925925925928e-05,
"loss": 0.6449,
"mean_token_accuracy": 0.838719493150711,
"num_tokens": 43673099.0,
"step": 1310
},
{
"entropy": 0.5452175224199891,
"epoch": 2.112,
"grad_norm": 1.0234375,
"learning_rate": 1.2451851851851853e-05,
"loss": 0.6321,
"mean_token_accuracy": 0.8417959384620189,
"num_tokens": 44005986.0,
"step": 1320
},
{
"entropy": 0.5562603289261461,
"epoch": 2.128,
"grad_norm": 1.0390625,
"learning_rate": 1.237777777777778e-05,
"loss": 0.653,
"mean_token_accuracy": 0.8379485800862312,
"num_tokens": 44321465.0,
"step": 1330
},
{
"entropy": 0.5499181509017944,
"epoch": 2.144,
"grad_norm": 0.9921875,
"learning_rate": 1.2303703703703704e-05,
"loss": 0.638,
"mean_token_accuracy": 0.8397632710635662,
"num_tokens": 44661027.0,
"step": 1340
},
{
"entropy": 0.5740855507552624,
"epoch": 2.16,
"grad_norm": 1.0703125,
"learning_rate": 1.222962962962963e-05,
"loss": 0.6641,
"mean_token_accuracy": 0.8336062435060739,
"num_tokens": 44994236.0,
"step": 1350
},
{
"entropy": 0.5468110611662269,
"epoch": 2.176,
"grad_norm": 0.99609375,
"learning_rate": 1.2155555555555555e-05,
"loss": 0.6341,
"mean_token_accuracy": 0.8411031287163496,
"num_tokens": 45334280.0,
"step": 1360
},
{
"entropy": 0.5445626365020871,
"epoch": 2.192,
"grad_norm": 1.0234375,
"learning_rate": 1.2081481481481484e-05,
"loss": 0.6379,
"mean_token_accuracy": 0.8422058593481779,
"num_tokens": 45666954.0,
"step": 1370
},
{
"entropy": 0.5418444711714983,
"epoch": 2.208,
"grad_norm": 1.0859375,
"learning_rate": 1.2007407407407408e-05,
"loss": 0.6314,
"mean_token_accuracy": 0.8422281835228205,
"num_tokens": 45999546.0,
"step": 1380
},
{
"entropy": 0.54025251083076,
"epoch": 2.224,
"grad_norm": 1.0,
"learning_rate": 1.1933333333333335e-05,
"loss": 0.6343,
"mean_token_accuracy": 0.8418932400643826,
"num_tokens": 46333809.0,
"step": 1390
},
{
"entropy": 0.5499276254326105,
"epoch": 2.24,
"grad_norm": 1.015625,
"learning_rate": 1.185925925925926e-05,
"loss": 0.6369,
"mean_token_accuracy": 0.8407698534429073,
"num_tokens": 46673328.0,
"step": 1400
},
{
"epoch": 2.24,
"eval_biology_entropy": 1.6533626160621644,
"eval_biology_loss": 2.441138744354248,
"eval_biology_mean_token_accuracy": 0.5818718819618225,
"eval_biology_num_tokens": 46673328.0,
"eval_biology_runtime": 38.7889,
"eval_biology_samples_per_second": 12.89,
"eval_biology_steps_per_second": 3.223,
"step": 1400
},
{
"epoch": 2.24,
"eval_chemistry_entropy": 0.7358436925411225,
"eval_chemistry_loss": 1.0279189348220825,
"eval_chemistry_mean_token_accuracy": 0.7754086136817933,
"eval_chemistry_num_tokens": 46673328.0,
"eval_chemistry_runtime": 48.2677,
"eval_chemistry_samples_per_second": 10.359,
"eval_chemistry_steps_per_second": 2.59,
"step": 1400
},
{
"epoch": 2.24,
"eval_math_entropy": 0.6436889851093293,
"eval_math_loss": 1.0210638046264648,
"eval_math_mean_token_accuracy": 0.7834169192314148,
"eval_math_num_tokens": 46673328.0,
"eval_math_runtime": 49.6493,
"eval_math_samples_per_second": 10.071,
"eval_math_steps_per_second": 2.518,
"step": 1400
},
{
"epoch": 2.24,
"eval_physics_entropy": 0.5687800683975219,
"eval_physics_loss": 0.7271425127983093,
"eval_physics_mean_token_accuracy": 0.8259177951812744,
"eval_physics_num_tokens": 46673328.0,
"eval_physics_runtime": 57.064,
"eval_physics_samples_per_second": 8.762,
"eval_physics_steps_per_second": 2.191,
"step": 1400
},
{
"entropy": 0.5672985427081585,
"epoch": 2.2560000000000002,
"grad_norm": 1.109375,
"learning_rate": 1.1785185185185186e-05,
"loss": 0.6585,
"mean_token_accuracy": 0.8361219819635153,
"num_tokens": 47001509.0,
"step": 1410
},
{
"entropy": 0.5440221125259995,
"epoch": 2.2720000000000002,
"grad_norm": 1.0859375,
"learning_rate": 1.1711111111111113e-05,
"loss": 0.6308,
"mean_token_accuracy": 0.842396317794919,
"num_tokens": 47340142.0,
"step": 1420
},
{
"entropy": 0.5601490139961243,
"epoch": 2.288,
"grad_norm": 1.09375,
"learning_rate": 1.1637037037037037e-05,
"loss": 0.6515,
"mean_token_accuracy": 0.8371975239366293,
"num_tokens": 47677550.0,
"step": 1430
},
{
"entropy": 0.5608395885676145,
"epoch": 2.304,
"grad_norm": 1.15625,
"learning_rate": 1.1562962962962964e-05,
"loss": 0.65,
"mean_token_accuracy": 0.8390717066824436,
"num_tokens": 47993851.0,
"step": 1440
},
{
"entropy": 0.5507894741371274,
"epoch": 2.32,
"grad_norm": 0.96484375,
"learning_rate": 1.1488888888888889e-05,
"loss": 0.6392,
"mean_token_accuracy": 0.8389977443963289,
"num_tokens": 48332577.0,
"step": 1450
},
{
"entropy": 0.5349721314385534,
"epoch": 2.336,
"grad_norm": 1.0078125,
"learning_rate": 1.1414814814814817e-05,
"loss": 0.6212,
"mean_token_accuracy": 0.8431226223707199,
"num_tokens": 48676623.0,
"step": 1460
},
{
"entropy": 0.5349597102031112,
"epoch": 2.352,
"grad_norm": 1.015625,
"learning_rate": 1.1340740740740742e-05,
"loss": 0.6199,
"mean_token_accuracy": 0.8435559894889593,
"num_tokens": 49007770.0,
"step": 1470
},
{
"entropy": 0.5474015891551971,
"epoch": 2.368,
"grad_norm": 1.046875,
"learning_rate": 1.1266666666666668e-05,
"loss": 0.6384,
"mean_token_accuracy": 0.8399505577981472,
"num_tokens": 49350016.0,
"step": 1480
},
{
"entropy": 0.5547215724363923,
"epoch": 2.384,
"grad_norm": 1.171875,
"learning_rate": 1.1192592592592593e-05,
"loss": 0.6461,
"mean_token_accuracy": 0.8389234948903322,
"num_tokens": 49679074.0,
"step": 1490
},
{
"entropy": 0.5506776092574001,
"epoch": 2.4,
"grad_norm": 1.09375,
"learning_rate": 1.111851851851852e-05,
"loss": 0.6381,
"mean_token_accuracy": 0.8396286979317665,
"num_tokens": 50020648.0,
"step": 1500
},
{
"epoch": 2.4,
"eval_biology_entropy": 1.6101444239616394,
"eval_biology_loss": 2.4393153190612793,
"eval_biology_mean_token_accuracy": 0.5827285711765289,
"eval_biology_num_tokens": 50020648.0,
"eval_biology_runtime": 38.6357,
"eval_biology_samples_per_second": 12.941,
"eval_biology_steps_per_second": 3.235,
"step": 1500
},
{
"epoch": 2.4,
"eval_chemistry_entropy": 0.7312388877868652,
"eval_chemistry_loss": 1.029062271118164,
"eval_chemistry_mean_token_accuracy": 0.7757972526550293,
"eval_chemistry_num_tokens": 50020648.0,
"eval_chemistry_runtime": 48.0523,
"eval_chemistry_samples_per_second": 10.405,
"eval_chemistry_steps_per_second": 2.601,
"step": 1500
},
{
"epoch": 2.4,
"eval_math_entropy": 0.6399386277198792,
"eval_math_loss": 1.0237832069396973,
"eval_math_mean_token_accuracy": 0.7829466652870178,
"eval_math_num_tokens": 50020648.0,
"eval_math_runtime": 49.5118,
"eval_math_samples_per_second": 10.099,
"eval_math_steps_per_second": 2.525,
"step": 1500
},
{
"epoch": 2.4,
"eval_physics_entropy": 0.5676034677028656,
"eval_physics_loss": 0.7261826395988464,
"eval_physics_mean_token_accuracy": 0.825977876663208,
"eval_physics_num_tokens": 50020648.0,
"eval_physics_runtime": 56.9466,
"eval_physics_samples_per_second": 8.78,
"eval_physics_steps_per_second": 2.195,
"step": 1500
},
{
"entropy": 0.546076669357717,
"epoch": 2.416,
"grad_norm": 1.0234375,
"learning_rate": 1.1044444444444444e-05,
"loss": 0.6307,
"mean_token_accuracy": 0.8414915602654218,
"num_tokens": 50370135.0,
"step": 1510
},
{
"entropy": 0.5511986341327428,
"epoch": 2.432,
"grad_norm": 1.0859375,
"learning_rate": 1.0970370370370371e-05,
"loss": 0.6434,
"mean_token_accuracy": 0.8396999359130859,
"num_tokens": 50695363.0,
"step": 1520
},
{
"entropy": 0.568221763893962,
"epoch": 2.448,
"grad_norm": 1.0234375,
"learning_rate": 1.0896296296296298e-05,
"loss": 0.6577,
"mean_token_accuracy": 0.8357432372868061,
"num_tokens": 51023498.0,
"step": 1530
},
{
"entropy": 0.5589647406712175,
"epoch": 2.464,
"grad_norm": 1.1328125,
"learning_rate": 1.0822222222222222e-05,
"loss": 0.6484,
"mean_token_accuracy": 0.8391309097409249,
"num_tokens": 51349816.0,
"step": 1540
},
{
"entropy": 0.5509988136589528,
"epoch": 2.48,
"grad_norm": 1.109375,
"learning_rate": 1.074814814814815e-05,
"loss": 0.6417,
"mean_token_accuracy": 0.8385909989476203,
"num_tokens": 51680507.0,
"step": 1550
},
{
"entropy": 0.5563702458515764,
"epoch": 2.496,
"grad_norm": 1.046875,
"learning_rate": 1.0674074074074074e-05,
"loss": 0.6462,
"mean_token_accuracy": 0.8388838239014149,
"num_tokens": 52017752.0,
"step": 1560
},
{
"entropy": 0.5431887688115239,
"epoch": 2.512,
"grad_norm": 1.140625,
"learning_rate": 1.0600000000000002e-05,
"loss": 0.6342,
"mean_token_accuracy": 0.8411614701151848,
"num_tokens": 52345439.0,
"step": 1570
},
{
"entropy": 0.543870740942657,
"epoch": 2.528,
"grad_norm": 0.99609375,
"learning_rate": 1.0525925925925927e-05,
"loss": 0.6315,
"mean_token_accuracy": 0.8416834581643343,
"num_tokens": 52683886.0,
"step": 1580
},
{
"entropy": 0.5500559687614441,
"epoch": 2.544,
"grad_norm": 1.125,
"learning_rate": 1.0451851851851853e-05,
"loss": 0.6438,
"mean_token_accuracy": 0.8382218111306429,
"num_tokens": 53004529.0,
"step": 1590
},
{
"entropy": 0.5530376594513655,
"epoch": 2.56,
"grad_norm": 1.1171875,
"learning_rate": 1.0377777777777778e-05,
"loss": 0.6366,
"mean_token_accuracy": 0.8400206513702869,
"num_tokens": 53336280.0,
"step": 1600
},
{
"epoch": 2.56,
"eval_biology_entropy": 1.6161543126106261,
"eval_biology_loss": 2.4465079307556152,
"eval_biology_mean_token_accuracy": 0.5826826608180999,
"eval_biology_num_tokens": 53336280.0,
"eval_biology_runtime": 38.7006,
"eval_biology_samples_per_second": 12.92,
"eval_biology_steps_per_second": 3.23,
"step": 1600
},
{
"epoch": 2.56,
"eval_chemistry_entropy": 0.7235212452411651,
"eval_chemistry_loss": 1.0302128791809082,
"eval_chemistry_mean_token_accuracy": 0.776062783241272,
"eval_chemistry_num_tokens": 53336280.0,
"eval_chemistry_runtime": 48.1934,
"eval_chemistry_samples_per_second": 10.375,
"eval_chemistry_steps_per_second": 2.594,
"step": 1600
},
{
"epoch": 2.56,
"eval_math_entropy": 0.6339258260726929,
"eval_math_loss": 1.025081753730774,
"eval_math_mean_token_accuracy": 0.7834318013191223,
"eval_math_num_tokens": 53336280.0,
"eval_math_runtime": 49.912,
"eval_math_samples_per_second": 10.018,
"eval_math_steps_per_second": 2.504,
"step": 1600
},
{
"epoch": 2.56,
"eval_physics_entropy": 0.5578962779045105,
"eval_physics_loss": 0.7250717878341675,
"eval_physics_mean_token_accuracy": 0.8264809098243714,
"eval_physics_num_tokens": 53336280.0,
"eval_physics_runtime": 56.9593,
"eval_physics_samples_per_second": 8.778,
"eval_physics_steps_per_second": 2.195,
"step": 1600
},
{
"entropy": 0.537515789270401,
"epoch": 2.576,
"grad_norm": 1.0625,
"learning_rate": 1.0303703703703705e-05,
"loss": 0.6322,
"mean_token_accuracy": 0.8432505313307047,
"num_tokens": 53667647.0,
"step": 1610
},
{
"entropy": 0.5506160443648696,
"epoch": 2.592,
"grad_norm": 0.97265625,
"learning_rate": 1.0229629629629631e-05,
"loss": 0.6402,
"mean_token_accuracy": 0.8399668127298355,
"num_tokens": 53997614.0,
"step": 1620
},
{
"entropy": 0.5386643601581454,
"epoch": 2.608,
"grad_norm": 1.0078125,
"learning_rate": 1.0155555555555556e-05,
"loss": 0.6243,
"mean_token_accuracy": 0.8430452451109887,
"num_tokens": 54330195.0,
"step": 1630
},
{
"entropy": 0.5495816670358181,
"epoch": 2.624,
"grad_norm": 1.03125,
"learning_rate": 1.0081481481481484e-05,
"loss": 0.6364,
"mean_token_accuracy": 0.8401540901511908,
"num_tokens": 54664342.0,
"step": 1640
},
{
"entropy": 0.5469144558534026,
"epoch": 2.64,
"grad_norm": 1.09375,
"learning_rate": 1.0007407407407407e-05,
"loss": 0.636,
"mean_token_accuracy": 0.8406256098300219,
"num_tokens": 54993626.0,
"step": 1650
},
{
"entropy": 0.5694095639511942,
"epoch": 2.656,
"grad_norm": 1.1953125,
"learning_rate": 9.933333333333334e-06,
"loss": 0.656,
"mean_token_accuracy": 0.8346509717404842,
"num_tokens": 55339962.0,
"step": 1660
},
{
"entropy": 0.5597302883863449,
"epoch": 2.672,
"grad_norm": 1.140625,
"learning_rate": 9.85925925925926e-06,
"loss": 0.6462,
"mean_token_accuracy": 0.838615670055151,
"num_tokens": 55670567.0,
"step": 1670
},
{
"entropy": 0.5369519403204321,
"epoch": 2.6879999999999997,
"grad_norm": 1.1015625,
"learning_rate": 9.785185185185187e-06,
"loss": 0.6323,
"mean_token_accuracy": 0.843082357198,
"num_tokens": 56003156.0,
"step": 1680
},
{
"entropy": 0.5516389394178987,
"epoch": 2.7039999999999997,
"grad_norm": 1.046875,
"learning_rate": 9.711111111111111e-06,
"loss": 0.6369,
"mean_token_accuracy": 0.8410698171705008,
"num_tokens": 56342926.0,
"step": 1690
},
{
"entropy": 0.5484106032177806,
"epoch": 2.7199999999999998,
"grad_norm": 1.203125,
"learning_rate": 9.637037037037038e-06,
"loss": 0.6328,
"mean_token_accuracy": 0.8402947820723057,
"num_tokens": 56677521.0,
"step": 1700
},
{
"epoch": 2.7199999999999998,
"eval_biology_entropy": 1.5972739911079408,
"eval_biology_loss": 2.458798408508301,
"eval_biology_mean_token_accuracy": 0.5806125638484955,
"eval_biology_num_tokens": 56677521.0,
"eval_biology_runtime": 38.6662,
"eval_biology_samples_per_second": 12.931,
"eval_biology_steps_per_second": 3.233,
"step": 1700
},
{
"epoch": 2.7199999999999998,
"eval_chemistry_entropy": 0.7181811017990112,
"eval_chemistry_loss": 1.0299792289733887,
"eval_chemistry_mean_token_accuracy": 0.7751972675323486,
"eval_chemistry_num_tokens": 56677521.0,
"eval_chemistry_runtime": 48.1483,
"eval_chemistry_samples_per_second": 10.385,
"eval_chemistry_steps_per_second": 2.596,
"step": 1700
},
{
"epoch": 2.7199999999999998,
"eval_math_entropy": 0.6283527569770813,
"eval_math_loss": 1.0243655443191528,
"eval_math_mean_token_accuracy": 0.7836487565040589,
"eval_math_num_tokens": 56677521.0,
"eval_math_runtime": 49.508,
"eval_math_samples_per_second": 10.099,
"eval_math_steps_per_second": 2.525,
"step": 1700
},
{
"epoch": 2.7199999999999998,
"eval_physics_entropy": 0.5540005767345428,
"eval_physics_loss": 0.7235716581344604,
"eval_physics_mean_token_accuracy": 0.8267813692092896,
"eval_physics_num_tokens": 56677521.0,
"eval_physics_runtime": 56.9575,
"eval_physics_samples_per_second": 8.778,
"eval_physics_steps_per_second": 2.195,
"step": 1700
},
{
"entropy": 0.5626651704311371,
"epoch": 2.7359999999999998,
"grad_norm": 1.0859375,
"learning_rate": 9.562962962962965e-06,
"loss": 0.6551,
"mean_token_accuracy": 0.8361094355583191,
"num_tokens": 57003161.0,
"step": 1710
},
{
"entropy": 0.5559584245085716,
"epoch": 2.752,
"grad_norm": 1.0859375,
"learning_rate": 9.48888888888889e-06,
"loss": 0.6447,
"mean_token_accuracy": 0.8389039475470781,
"num_tokens": 57335850.0,
"step": 1720
},
{
"entropy": 0.5382809387519956,
"epoch": 2.768,
"grad_norm": 1.1171875,
"learning_rate": 9.414814814814816e-06,
"loss": 0.6267,
"mean_token_accuracy": 0.8432002298533916,
"num_tokens": 57672649.0,
"step": 1730
},
{
"entropy": 0.5509585844352841,
"epoch": 2.784,
"grad_norm": 1.09375,
"learning_rate": 9.34074074074074e-06,
"loss": 0.6395,
"mean_token_accuracy": 0.8397889394313097,
"num_tokens": 58007431.0,
"step": 1740
},
{
"entropy": 0.5838387541472911,
"epoch": 2.8,
"grad_norm": 1.0859375,
"learning_rate": 9.266666666666667e-06,
"loss": 0.6711,
"mean_token_accuracy": 0.8319451794028282,
"num_tokens": 58332730.0,
"step": 1750
},
{
"entropy": 0.5306204471737146,
"epoch": 2.816,
"grad_norm": 1.078125,
"learning_rate": 9.192592592592594e-06,
"loss": 0.6171,
"mean_token_accuracy": 0.8446537777781487,
"num_tokens": 58672106.0,
"step": 1760
},
{
"entropy": 0.553738858550787,
"epoch": 2.832,
"grad_norm": 1.078125,
"learning_rate": 9.118518518518518e-06,
"loss": 0.6486,
"mean_token_accuracy": 0.8376123756170273,
"num_tokens": 58997592.0,
"step": 1770
},
{
"entropy": 0.5568923223763704,
"epoch": 2.848,
"grad_norm": 1.0625,
"learning_rate": 9.044444444444445e-06,
"loss": 0.6446,
"mean_token_accuracy": 0.8393427152186632,
"num_tokens": 59326336.0,
"step": 1780
},
{
"entropy": 0.5415080957114696,
"epoch": 2.864,
"grad_norm": 0.94921875,
"learning_rate": 8.970370370370372e-06,
"loss": 0.6308,
"mean_token_accuracy": 0.8419267870485783,
"num_tokens": 59668586.0,
"step": 1790
},
{
"entropy": 0.5516748385503888,
"epoch": 2.88,
"grad_norm": 1.2578125,
"learning_rate": 8.896296296296298e-06,
"loss": 0.6441,
"mean_token_accuracy": 0.840485867485404,
"num_tokens": 59993572.0,
"step": 1800
},
{
"epoch": 2.88,
"eval_biology_entropy": 1.6067570729255676,
"eval_biology_loss": 2.442582130432129,
"eval_biology_mean_token_accuracy": 0.5815401375293732,
"eval_biology_num_tokens": 59993572.0,
"eval_biology_runtime": 38.7204,
"eval_biology_samples_per_second": 12.913,
"eval_biology_steps_per_second": 3.228,
"step": 1800
},
{
"epoch": 2.88,
"eval_chemistry_entropy": 0.7207075932025909,
"eval_chemistry_loss": 1.0278831720352173,
"eval_chemistry_mean_token_accuracy": 0.7756774797439575,
"eval_chemistry_num_tokens": 59993572.0,
"eval_chemistry_runtime": 48.2116,
"eval_chemistry_samples_per_second": 10.371,
"eval_chemistry_steps_per_second": 2.593,
"step": 1800
},
{
"epoch": 2.88,
"eval_math_entropy": 0.629777349948883,
"eval_math_loss": 1.0234665870666504,
"eval_math_mean_token_accuracy": 0.7837573509216309,
"eval_math_num_tokens": 59993572.0,
"eval_math_runtime": 49.5638,
"eval_math_samples_per_second": 10.088,
"eval_math_steps_per_second": 2.522,
"step": 1800
},
{
"epoch": 2.88,
"eval_physics_entropy": 0.5562911832332611,
"eval_physics_loss": 0.7223864793777466,
"eval_physics_mean_token_accuracy": 0.8269883937835694,
"eval_physics_num_tokens": 59993572.0,
"eval_physics_runtime": 56.9994,
"eval_physics_samples_per_second": 8.772,
"eval_physics_steps_per_second": 2.193,
"step": 1800
},
{
"entropy": 0.5429861357435584,
"epoch": 2.896,
"grad_norm": 1.1171875,
"learning_rate": 8.822222222222223e-06,
"loss": 0.6347,
"mean_token_accuracy": 0.8413365628570318,
"num_tokens": 60324128.0,
"step": 1810
},
{
"entropy": 0.5474321844056249,
"epoch": 2.912,
"grad_norm": 1.1015625,
"learning_rate": 8.74814814814815e-06,
"loss": 0.635,
"mean_token_accuracy": 0.8426228888332844,
"num_tokens": 60657399.0,
"step": 1820
},
{
"entropy": 0.5416806817054749,
"epoch": 2.928,
"grad_norm": 1.0703125,
"learning_rate": 8.674074074074074e-06,
"loss": 0.6306,
"mean_token_accuracy": 0.8423144549131394,
"num_tokens": 60984711.0,
"step": 1830
},
{
"entropy": 0.5390040006488561,
"epoch": 2.944,
"grad_norm": 1.1875,
"learning_rate": 8.6e-06,
"loss": 0.6304,
"mean_token_accuracy": 0.8424391083419323,
"num_tokens": 61321359.0,
"step": 1840
},
{
"entropy": 0.5532678855583072,
"epoch": 2.96,
"grad_norm": 0.984375,
"learning_rate": 8.525925925925927e-06,
"loss": 0.6378,
"mean_token_accuracy": 0.8402687277644872,
"num_tokens": 61659042.0,
"step": 1850
},
{
"entropy": 0.5464650699868798,
"epoch": 2.976,
"grad_norm": 1.0546875,
"learning_rate": 8.451851851851852e-06,
"loss": 0.6345,
"mean_token_accuracy": 0.8401576526463032,
"num_tokens": 61993595.0,
"step": 1860
},
{
"entropy": 0.5306900983676315,
"epoch": 2.992,
"grad_norm": 1.0390625,
"learning_rate": 8.377777777777779e-06,
"loss": 0.6196,
"mean_token_accuracy": 0.8441225662827492,
"num_tokens": 62334012.0,
"step": 1870
},
{
"entropy": 0.5364003209397197,
"epoch": 3.008,
"grad_norm": 1.0703125,
"learning_rate": 8.303703703703705e-06,
"loss": 0.6242,
"mean_token_accuracy": 0.8451750382781029,
"num_tokens": 62660928.0,
"step": 1880
},
{
"entropy": 0.5261571481823921,
"epoch": 3.024,
"grad_norm": 1.078125,
"learning_rate": 8.229629629629632e-06,
"loss": 0.614,
"mean_token_accuracy": 0.8461304292082786,
"num_tokens": 62992670.0,
"step": 1890
},
{
"entropy": 0.5176527475938201,
"epoch": 3.04,
"grad_norm": 1.1328125,
"learning_rate": 8.155555555555556e-06,
"loss": 0.6074,
"mean_token_accuracy": 0.8469121795147657,
"num_tokens": 63335157.0,
"step": 1900
},
{
"epoch": 3.04,
"eval_biology_entropy": 1.5792802815437317,
"eval_biology_loss": 2.4808876514434814,
"eval_biology_mean_token_accuracy": 0.5793148455619812,
"eval_biology_num_tokens": 63335157.0,
"eval_biology_runtime": 38.7055,
"eval_biology_samples_per_second": 12.918,
"eval_biology_steps_per_second": 3.23,
"step": 1900
},
{
"epoch": 3.04,
"eval_chemistry_entropy": 0.7061369748115539,
"eval_chemistry_loss": 1.0390231609344482,
"eval_chemistry_mean_token_accuracy": 0.7745346717834473,
"eval_chemistry_num_tokens": 63335157.0,
"eval_chemistry_runtime": 48.1872,
"eval_chemistry_samples_per_second": 10.376,
"eval_chemistry_steps_per_second": 2.594,
"step": 1900
},
{
"epoch": 3.04,
"eval_math_entropy": 0.6224793126583099,
"eval_math_loss": 1.034122347831726,
"eval_math_mean_token_accuracy": 0.7828424015045166,
"eval_math_num_tokens": 63335157.0,
"eval_math_runtime": 49.5497,
"eval_math_samples_per_second": 10.091,
"eval_math_steps_per_second": 2.523,
"step": 1900
},
{
"epoch": 3.04,
"eval_physics_entropy": 0.5448903846740722,
"eval_physics_loss": 0.7253366708755493,
"eval_physics_mean_token_accuracy": 0.8264335384368896,
"eval_physics_num_tokens": 63335157.0,
"eval_physics_runtime": 56.9581,
"eval_physics_samples_per_second": 8.778,
"eval_physics_steps_per_second": 2.195,
"step": 1900
},
{
"entropy": 0.5329983660951256,
"epoch": 3.056,
"grad_norm": 1.140625,
"learning_rate": 8.081481481481483e-06,
"loss": 0.6183,
"mean_token_accuracy": 0.8438076838850975,
"num_tokens": 63662314.0,
"step": 1910
},
{
"entropy": 0.5316095747053623,
"epoch": 3.072,
"grad_norm": 1.171875,
"learning_rate": 8.007407407407408e-06,
"loss": 0.6223,
"mean_token_accuracy": 0.8441695164889097,
"num_tokens": 63997780.0,
"step": 1920
},
{
"entropy": 0.541679815761745,
"epoch": 3.088,
"grad_norm": 1.15625,
"learning_rate": 7.933333333333334e-06,
"loss": 0.6328,
"mean_token_accuracy": 0.8417537044733763,
"num_tokens": 64325274.0,
"step": 1930
},
{
"entropy": 0.5170316396281123,
"epoch": 3.104,
"grad_norm": 1.140625,
"learning_rate": 7.859259259259259e-06,
"loss": 0.6062,
"mean_token_accuracy": 0.8478735946118832,
"num_tokens": 64659683.0,
"step": 1940
},
{
"entropy": 0.5163595724850893,
"epoch": 3.12,
"grad_norm": 1.1328125,
"learning_rate": 7.785185185185185e-06,
"loss": 0.603,
"mean_token_accuracy": 0.8484522052109241,
"num_tokens": 64998212.0,
"step": 1950
},
{
"entropy": 0.5422403154894709,
"epoch": 3.136,
"grad_norm": 1.1015625,
"learning_rate": 7.711111111111112e-06,
"loss": 0.6357,
"mean_token_accuracy": 0.8405985131859779,
"num_tokens": 65328436.0,
"step": 1960
},
{
"entropy": 0.5195852382108569,
"epoch": 3.152,
"grad_norm": 1.125,
"learning_rate": 7.637037037037037e-06,
"loss": 0.602,
"mean_token_accuracy": 0.8487723391503096,
"num_tokens": 65659346.0,
"step": 1970
},
{
"entropy": 0.5302856534719467,
"epoch": 3.168,
"grad_norm": 1.03125,
"learning_rate": 7.562962962962963e-06,
"loss": 0.6216,
"mean_token_accuracy": 0.8445643980056048,
"num_tokens": 65986382.0,
"step": 1980
},
{
"entropy": 0.5176609115675092,
"epoch": 3.184,
"grad_norm": 1.1953125,
"learning_rate": 7.48888888888889e-06,
"loss": 0.6056,
"mean_token_accuracy": 0.8478365700691939,
"num_tokens": 66324308.0,
"step": 1990
},
{
"entropy": 0.5267078908160329,
"epoch": 3.2,
"grad_norm": 1.1015625,
"learning_rate": 7.4148148148148155e-06,
"loss": 0.6149,
"mean_token_accuracy": 0.8465773615986109,
"num_tokens": 66658712.0,
"step": 2000
},
{
"epoch": 3.2,
"eval_biology_entropy": 1.5625373516082763,
"eval_biology_loss": 2.4981629848480225,
"eval_biology_mean_token_accuracy": 0.57876149559021,
"eval_biology_num_tokens": 66658712.0,
"eval_biology_runtime": 38.9285,
"eval_biology_samples_per_second": 12.844,
"eval_biology_steps_per_second": 3.211,
"step": 2000
},
{
"epoch": 3.2,
"eval_chemistry_entropy": 0.70146466588974,
"eval_chemistry_loss": 1.0409005880355835,
"eval_chemistry_mean_token_accuracy": 0.7748316297531128,
"eval_chemistry_num_tokens": 66658712.0,
"eval_chemistry_runtime": 48.3893,
"eval_chemistry_samples_per_second": 10.333,
"eval_chemistry_steps_per_second": 2.583,
"step": 2000
},
{
"epoch": 3.2,
"eval_math_entropy": 0.6218927059173583,
"eval_math_loss": 1.0327448844909668,
"eval_math_mean_token_accuracy": 0.7830894327163697,
"eval_math_num_tokens": 66658712.0,
"eval_math_runtime": 49.899,
"eval_math_samples_per_second": 10.02,
"eval_math_steps_per_second": 2.505,
"step": 2000
},
{
"epoch": 3.2,
"eval_physics_entropy": 0.5445813267230988,
"eval_physics_loss": 0.725453794002533,
"eval_physics_mean_token_accuracy": 0.8266869735717773,
"eval_physics_num_tokens": 66658712.0,
"eval_physics_runtime": 57.3527,
"eval_physics_samples_per_second": 8.718,
"eval_physics_steps_per_second": 2.179,
"step": 2000
}
],
"logging_steps": 10,
"max_steps": 3000,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.956042200464073e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}