ti9xjma3 / checkpoint-2000 /trainer_state.json
roonbug's picture
Upload folder using huggingface_hub
157ec73 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.2,
"eval_steps": 100,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.5841219527646899,
"epoch": 0.016,
"grad_norm": 10.875,
"learning_rate": 1.8e-07,
"loss": 1.0536,
"mean_token_accuracy": 0.7792524203658104,
"num_tokens": 282849.0,
"step": 10
},
{
"entropy": 0.6104077549651266,
"epoch": 0.032,
"grad_norm": 9.625,
"learning_rate": 3.8e-07,
"loss": 1.0813,
"mean_token_accuracy": 0.7717054646462203,
"num_tokens": 561726.0,
"step": 20
},
{
"entropy": 0.604045126773417,
"epoch": 0.048,
"grad_norm": 8.9375,
"learning_rate": 5.800000000000001e-07,
"loss": 1.0425,
"mean_token_accuracy": 0.7788930047303438,
"num_tokens": 842092.0,
"step": 30
},
{
"entropy": 0.6275176709517837,
"epoch": 0.064,
"grad_norm": 8.5625,
"learning_rate": 7.8e-07,
"loss": 1.0681,
"mean_token_accuracy": 0.7725805886089802,
"num_tokens": 1133349.0,
"step": 40
},
{
"entropy": 0.6530287317931652,
"epoch": 0.08,
"grad_norm": 7.59375,
"learning_rate": 9.800000000000001e-07,
"loss": 1.0273,
"mean_token_accuracy": 0.7785692941397429,
"num_tokens": 1422340.0,
"step": 50
},
{
"entropy": 0.6810424191877246,
"epoch": 0.096,
"grad_norm": 5.40625,
"learning_rate": 1.1800000000000001e-06,
"loss": 0.9779,
"mean_token_accuracy": 0.7848060742020607,
"num_tokens": 1711821.0,
"step": 60
},
{
"entropy": 0.7722993083298206,
"epoch": 0.112,
"grad_norm": 4.03125,
"learning_rate": 1.3800000000000001e-06,
"loss": 0.9772,
"mean_token_accuracy": 0.782012278959155,
"num_tokens": 1991855.0,
"step": 70
},
{
"entropy": 0.8406166139990091,
"epoch": 0.128,
"grad_norm": 3.171875,
"learning_rate": 1.5800000000000001e-06,
"loss": 0.959,
"mean_token_accuracy": 0.784759023040533,
"num_tokens": 2288720.0,
"step": 80
},
{
"entropy": 0.8299949843436479,
"epoch": 0.144,
"grad_norm": 2.875,
"learning_rate": 1.7800000000000001e-06,
"loss": 0.9481,
"mean_token_accuracy": 0.7854762740433217,
"num_tokens": 2564352.0,
"step": 90
},
{
"entropy": 0.7978475514799357,
"epoch": 0.16,
"grad_norm": 2.78125,
"learning_rate": 1.98e-06,
"loss": 0.9304,
"mean_token_accuracy": 0.7901426807045937,
"num_tokens": 2841876.0,
"step": 100
},
{
"epoch": 0.16,
"eval_chemistry_entropy": 0.817941864490509,
"eval_chemistry_loss": 0.8207718729972839,
"eval_chemistry_mean_token_accuracy": 0.8094593362808228,
"eval_chemistry_num_tokens": 2841876.0,
"eval_chemistry_runtime": 34.8392,
"eval_chemistry_samples_per_second": 14.352,
"eval_chemistry_steps_per_second": 3.588,
"step": 100
},
{
"epoch": 0.16,
"eval_math_entropy": 0.8300764834880829,
"eval_math_loss": 0.9698297381401062,
"eval_math_mean_token_accuracy": 0.7842242226600648,
"eval_math_num_tokens": 2841876.0,
"eval_math_runtime": 35.8893,
"eval_math_samples_per_second": 13.932,
"eval_math_steps_per_second": 3.483,
"step": 100
},
{
"entropy": 0.8128786478191614,
"epoch": 0.176,
"grad_norm": 2.234375,
"learning_rate": 2.1800000000000003e-06,
"loss": 0.9222,
"mean_token_accuracy": 0.7908758621662855,
"num_tokens": 3129237.0,
"step": 110
},
{
"entropy": 0.830844565667212,
"epoch": 0.192,
"grad_norm": 2.671875,
"learning_rate": 2.38e-06,
"loss": 0.9398,
"mean_token_accuracy": 0.7865626137703657,
"num_tokens": 3414875.0,
"step": 120
},
{
"entropy": 0.8196758177131415,
"epoch": 0.208,
"grad_norm": 2.953125,
"learning_rate": 2.5800000000000003e-06,
"loss": 0.92,
"mean_token_accuracy": 0.7892169930040837,
"num_tokens": 3695623.0,
"step": 130
},
{
"entropy": 0.8214951554313302,
"epoch": 0.224,
"grad_norm": 2.703125,
"learning_rate": 2.7800000000000005e-06,
"loss": 0.9102,
"mean_token_accuracy": 0.7903474017977714,
"num_tokens": 3977666.0,
"step": 140
},
{
"entropy": 0.7900505751371384,
"epoch": 0.24,
"grad_norm": 2.609375,
"learning_rate": 2.9800000000000003e-06,
"loss": 0.8856,
"mean_token_accuracy": 0.7941167835146189,
"num_tokens": 4269287.0,
"step": 150
},
{
"entropy": 0.783742449991405,
"epoch": 0.256,
"grad_norm": 2.328125,
"learning_rate": 3.1800000000000005e-06,
"loss": 0.857,
"mean_token_accuracy": 0.7969042614102364,
"num_tokens": 4558632.0,
"step": 160
},
{
"entropy": 0.7985043607652187,
"epoch": 0.272,
"grad_norm": 2.28125,
"learning_rate": 3.3800000000000007e-06,
"loss": 0.8738,
"mean_token_accuracy": 0.7948056727647781,
"num_tokens": 4839270.0,
"step": 170
},
{
"entropy": 0.7821594245731831,
"epoch": 0.288,
"grad_norm": 2.046875,
"learning_rate": 3.58e-06,
"loss": 0.8578,
"mean_token_accuracy": 0.7972537841647863,
"num_tokens": 5125447.0,
"step": 180
},
{
"entropy": 0.8105486003682018,
"epoch": 0.304,
"grad_norm": 2.28125,
"learning_rate": 3.7800000000000002e-06,
"loss": 0.8516,
"mean_token_accuracy": 0.7949083779007197,
"num_tokens": 5415925.0,
"step": 190
},
{
"entropy": 0.7668969418853522,
"epoch": 0.32,
"grad_norm": 2.125,
"learning_rate": 3.980000000000001e-06,
"loss": 0.8079,
"mean_token_accuracy": 0.80227146781981,
"num_tokens": 5710178.0,
"step": 200
},
{
"epoch": 0.32,
"eval_chemistry_entropy": 0.8007053558826447,
"eval_chemistry_loss": 0.7652505040168762,
"eval_chemistry_mean_token_accuracy": 0.8065420956611633,
"eval_chemistry_num_tokens": 5710178.0,
"eval_chemistry_runtime": 34.8946,
"eval_chemistry_samples_per_second": 14.329,
"eval_chemistry_steps_per_second": 3.582,
"step": 200
},
{
"epoch": 0.32,
"eval_math_entropy": 0.8049644639492035,
"eval_math_loss": 0.8578509092330933,
"eval_math_mean_token_accuracy": 0.7935372619628906,
"eval_math_num_tokens": 5710178.0,
"eval_math_runtime": 35.9751,
"eval_math_samples_per_second": 13.899,
"eval_math_steps_per_second": 3.475,
"step": 200
},
{
"entropy": 0.8033319305628538,
"epoch": 0.336,
"grad_norm": 2.0625,
"learning_rate": 4.18e-06,
"loss": 0.8255,
"mean_token_accuracy": 0.7956937287002802,
"num_tokens": 5993843.0,
"step": 210
},
{
"entropy": 0.776984517276287,
"epoch": 0.352,
"grad_norm": 2.3125,
"learning_rate": 4.38e-06,
"loss": 0.8027,
"mean_token_accuracy": 0.8002396024763584,
"num_tokens": 6279137.0,
"step": 220
},
{
"entropy": 0.7891015050932765,
"epoch": 0.368,
"grad_norm": 1.921875,
"learning_rate": 4.58e-06,
"loss": 0.7877,
"mean_token_accuracy": 0.7985758237540722,
"num_tokens": 6562387.0,
"step": 230
},
{
"entropy": 0.7725065175443888,
"epoch": 0.384,
"grad_norm": 2.09375,
"learning_rate": 4.78e-06,
"loss": 0.7733,
"mean_token_accuracy": 0.8037754170596599,
"num_tokens": 6851520.0,
"step": 240
},
{
"entropy": 0.8092642717063427,
"epoch": 0.4,
"grad_norm": 2.09375,
"learning_rate": 4.980000000000001e-06,
"loss": 0.8171,
"mean_token_accuracy": 0.7956945843994617,
"num_tokens": 7138932.0,
"step": 250
},
{
"entropy": 0.7882049404084682,
"epoch": 0.416,
"grad_norm": 1.8359375,
"learning_rate": 5.18e-06,
"loss": 0.7764,
"mean_token_accuracy": 0.8010819923132658,
"num_tokens": 7420720.0,
"step": 260
},
{
"entropy": 0.7719787143170833,
"epoch": 0.432,
"grad_norm": 1.875,
"learning_rate": 5.380000000000001e-06,
"loss": 0.7655,
"mean_token_accuracy": 0.8039833098649979,
"num_tokens": 7695518.0,
"step": 270
},
{
"entropy": 0.7650891080498695,
"epoch": 0.448,
"grad_norm": 2.484375,
"learning_rate": 5.580000000000001e-06,
"loss": 0.7529,
"mean_token_accuracy": 0.8057680625468493,
"num_tokens": 7983543.0,
"step": 280
},
{
"entropy": 0.7654503894969821,
"epoch": 0.464,
"grad_norm": 2.96875,
"learning_rate": 5.78e-06,
"loss": 0.7582,
"mean_token_accuracy": 0.8070501256734133,
"num_tokens": 8271787.0,
"step": 290
},
{
"entropy": 0.7974133348092437,
"epoch": 0.48,
"grad_norm": 2.046875,
"learning_rate": 5.98e-06,
"loss": 0.7749,
"mean_token_accuracy": 0.8022410120815039,
"num_tokens": 8556170.0,
"step": 300
},
{
"epoch": 0.48,
"eval_chemistry_entropy": 0.7947752397060395,
"eval_chemistry_loss": 0.7221509218215942,
"eval_chemistry_mean_token_accuracy": 0.8083039855957032,
"eval_chemistry_num_tokens": 8556170.0,
"eval_chemistry_runtime": 34.8605,
"eval_chemistry_samples_per_second": 14.343,
"eval_chemistry_steps_per_second": 3.586,
"step": 300
},
{
"epoch": 0.48,
"eval_math_entropy": 0.7911813812255859,
"eval_math_loss": 0.7818350195884705,
"eval_math_mean_token_accuracy": 0.8022260117530823,
"eval_math_num_tokens": 8556170.0,
"eval_math_runtime": 35.8842,
"eval_math_samples_per_second": 13.934,
"eval_math_steps_per_second": 3.483,
"step": 300
},
{
"entropy": 0.7553723743185401,
"epoch": 0.496,
"grad_norm": 2.09375,
"learning_rate": 6.18e-06,
"loss": 0.7364,
"mean_token_accuracy": 0.809390364587307,
"num_tokens": 8840590.0,
"step": 310
},
{
"entropy": 0.763883944042027,
"epoch": 0.512,
"grad_norm": 1.984375,
"learning_rate": 6.380000000000001e-06,
"loss": 0.737,
"mean_token_accuracy": 0.8089169431477785,
"num_tokens": 9131903.0,
"step": 320
},
{
"entropy": 0.7371911529451609,
"epoch": 0.528,
"grad_norm": 1.9765625,
"learning_rate": 6.5800000000000005e-06,
"loss": 0.7238,
"mean_token_accuracy": 0.8125522993505001,
"num_tokens": 9429519.0,
"step": 330
},
{
"entropy": 0.753829506970942,
"epoch": 0.544,
"grad_norm": 2.671875,
"learning_rate": 6.780000000000001e-06,
"loss": 0.7411,
"mean_token_accuracy": 0.8106916502118111,
"num_tokens": 9718247.0,
"step": 340
},
{
"entropy": 0.7809748956933618,
"epoch": 0.56,
"grad_norm": 2.90625,
"learning_rate": 6.98e-06,
"loss": 0.756,
"mean_token_accuracy": 0.803440049290657,
"num_tokens": 9991100.0,
"step": 350
},
{
"entropy": 0.7534385286271572,
"epoch": 0.576,
"grad_norm": 2.125,
"learning_rate": 7.180000000000001e-06,
"loss": 0.7358,
"mean_token_accuracy": 0.807822210714221,
"num_tokens": 10276090.0,
"step": 360
},
{
"entropy": 0.7527043262496591,
"epoch": 0.592,
"grad_norm": 1.75,
"learning_rate": 7.3800000000000005e-06,
"loss": 0.7343,
"mean_token_accuracy": 0.8090104408562183,
"num_tokens": 10566762.0,
"step": 370
},
{
"entropy": 0.7644007310271264,
"epoch": 0.608,
"grad_norm": 1.875,
"learning_rate": 7.58e-06,
"loss": 0.7436,
"mean_token_accuracy": 0.8086390372365713,
"num_tokens": 10857002.0,
"step": 380
},
{
"entropy": 0.7420919336378574,
"epoch": 0.624,
"grad_norm": 2.515625,
"learning_rate": 7.78e-06,
"loss": 0.7239,
"mean_token_accuracy": 0.8104523837566375,
"num_tokens": 11147326.0,
"step": 390
},
{
"entropy": 0.7273864936083555,
"epoch": 0.64,
"grad_norm": 1.7109375,
"learning_rate": 7.980000000000002e-06,
"loss": 0.7111,
"mean_token_accuracy": 0.8139414019882679,
"num_tokens": 11437710.0,
"step": 400
},
{
"epoch": 0.64,
"eval_chemistry_entropy": 0.7778651757240296,
"eval_chemistry_loss": 0.7157668471336365,
"eval_chemistry_mean_token_accuracy": 0.808542202949524,
"eval_chemistry_num_tokens": 11437710.0,
"eval_chemistry_runtime": 34.8261,
"eval_chemistry_samples_per_second": 14.357,
"eval_chemistry_steps_per_second": 3.589,
"step": 400
},
{
"epoch": 0.64,
"eval_math_entropy": 0.763886357307434,
"eval_math_loss": 0.7481760382652283,
"eval_math_mean_token_accuracy": 0.8066890263557434,
"eval_math_num_tokens": 11437710.0,
"eval_math_runtime": 36.0154,
"eval_math_samples_per_second": 13.883,
"eval_math_steps_per_second": 3.471,
"step": 400
},
{
"entropy": 0.729702671058476,
"epoch": 0.656,
"grad_norm": 1.578125,
"learning_rate": 8.18e-06,
"loss": 0.7083,
"mean_token_accuracy": 0.8148457534611225,
"num_tokens": 11727268.0,
"step": 410
},
{
"entropy": 0.7371693229302764,
"epoch": 0.672,
"grad_norm": 1.8515625,
"learning_rate": 8.380000000000001e-06,
"loss": 0.7199,
"mean_token_accuracy": 0.8099267814308405,
"num_tokens": 12020478.0,
"step": 420
},
{
"entropy": 0.7293412426486612,
"epoch": 0.688,
"grad_norm": 1.6875,
"learning_rate": 8.580000000000001e-06,
"loss": 0.7068,
"mean_token_accuracy": 0.8130085315555334,
"num_tokens": 12300380.0,
"step": 430
},
{
"entropy": 0.718653773330152,
"epoch": 0.704,
"grad_norm": 2.046875,
"learning_rate": 8.78e-06,
"loss": 0.6982,
"mean_token_accuracy": 0.8138067662715912,
"num_tokens": 12591366.0,
"step": 440
},
{
"entropy": 0.7371329203248024,
"epoch": 0.72,
"grad_norm": 1.9609375,
"learning_rate": 8.98e-06,
"loss": 0.7104,
"mean_token_accuracy": 0.8120142966508865,
"num_tokens": 12869618.0,
"step": 450
},
{
"entropy": 0.7198993725702166,
"epoch": 0.736,
"grad_norm": 1.921875,
"learning_rate": 9.180000000000002e-06,
"loss": 0.7009,
"mean_token_accuracy": 0.815392379835248,
"num_tokens": 13156311.0,
"step": 460
},
{
"entropy": 0.7063086304813624,
"epoch": 0.752,
"grad_norm": 1.8046875,
"learning_rate": 9.38e-06,
"loss": 0.6793,
"mean_token_accuracy": 0.8184278599917889,
"num_tokens": 13438911.0,
"step": 470
},
{
"entropy": 0.6851496128365397,
"epoch": 0.768,
"grad_norm": 1.5078125,
"learning_rate": 9.58e-06,
"loss": 0.6731,
"mean_token_accuracy": 0.8231410879641772,
"num_tokens": 13735397.0,
"step": 480
},
{
"entropy": 0.7282077683135867,
"epoch": 0.784,
"grad_norm": 1.9609375,
"learning_rate": 9.780000000000001e-06,
"loss": 0.7094,
"mean_token_accuracy": 0.8130000628530979,
"num_tokens": 14024497.0,
"step": 490
},
{
"entropy": 0.7247441383078694,
"epoch": 0.8,
"grad_norm": 2.0,
"learning_rate": 9.980000000000001e-06,
"loss": 0.7048,
"mean_token_accuracy": 0.812597556039691,
"num_tokens": 14316427.0,
"step": 500
},
{
"epoch": 0.8,
"eval_chemistry_entropy": 0.7653646836280823,
"eval_chemistry_loss": 0.7123190760612488,
"eval_chemistry_mean_token_accuracy": 0.8073507056236268,
"eval_chemistry_num_tokens": 14316427.0,
"eval_chemistry_runtime": 34.8547,
"eval_chemistry_samples_per_second": 14.345,
"eval_chemistry_steps_per_second": 3.586,
"step": 500
},
{
"epoch": 0.8,
"eval_math_entropy": 0.7450865380764008,
"eval_math_loss": 0.7230644822120667,
"eval_math_mean_token_accuracy": 0.8105058965682983,
"eval_math_num_tokens": 14316427.0,
"eval_math_runtime": 35.5994,
"eval_math_samples_per_second": 14.045,
"eval_math_steps_per_second": 3.511,
"step": 500
},
{
"entropy": 0.714242628775537,
"epoch": 0.816,
"grad_norm": 2.046875,
"learning_rate": 1.018e-05,
"loss": 0.6884,
"mean_token_accuracy": 0.8151328191161156,
"num_tokens": 14607136.0,
"step": 510
},
{
"entropy": 0.7071703864261508,
"epoch": 0.832,
"grad_norm": 1.84375,
"learning_rate": 1.038e-05,
"loss": 0.6839,
"mean_token_accuracy": 0.8182855024933815,
"num_tokens": 14894084.0,
"step": 520
},
{
"entropy": 0.7319832380861044,
"epoch": 0.848,
"grad_norm": 1.8359375,
"learning_rate": 1.0580000000000002e-05,
"loss": 0.709,
"mean_token_accuracy": 0.8122099358588457,
"num_tokens": 15177039.0,
"step": 530
},
{
"entropy": 0.7191415606066585,
"epoch": 0.864,
"grad_norm": 1.546875,
"learning_rate": 1.0780000000000002e-05,
"loss": 0.6966,
"mean_token_accuracy": 0.8139543637633324,
"num_tokens": 15463393.0,
"step": 540
},
{
"entropy": 0.7217472817748785,
"epoch": 0.88,
"grad_norm": 1.8359375,
"learning_rate": 1.0980000000000002e-05,
"loss": 0.699,
"mean_token_accuracy": 0.8122927758842706,
"num_tokens": 15747943.0,
"step": 550
},
{
"entropy": 0.7142911188304424,
"epoch": 0.896,
"grad_norm": 1.6484375,
"learning_rate": 1.1180000000000001e-05,
"loss": 0.6964,
"mean_token_accuracy": 0.8145412191748619,
"num_tokens": 16027931.0,
"step": 560
},
{
"entropy": 0.7013007398694754,
"epoch": 0.912,
"grad_norm": 1.7421875,
"learning_rate": 1.138e-05,
"loss": 0.6761,
"mean_token_accuracy": 0.8172509890049696,
"num_tokens": 16317127.0,
"step": 570
},
{
"entropy": 0.6971628932282329,
"epoch": 0.928,
"grad_norm": 1.578125,
"learning_rate": 1.1580000000000001e-05,
"loss": 0.6725,
"mean_token_accuracy": 0.8180789042264223,
"num_tokens": 16598757.0,
"step": 580
},
{
"entropy": 0.7133341139182449,
"epoch": 0.944,
"grad_norm": 5.8125,
"learning_rate": 1.178e-05,
"loss": 0.6958,
"mean_token_accuracy": 0.8153590984642506,
"num_tokens": 16882236.0,
"step": 590
},
{
"entropy": 0.7189922722056508,
"epoch": 0.96,
"grad_norm": 1.4453125,
"learning_rate": 1.198e-05,
"loss": 0.6956,
"mean_token_accuracy": 0.8149452641606331,
"num_tokens": 17169495.0,
"step": 600
},
{
"epoch": 0.96,
"eval_chemistry_entropy": 0.7501673064231873,
"eval_chemistry_loss": 0.7121618390083313,
"eval_chemistry_mean_token_accuracy": 0.8068772978782653,
"eval_chemistry_num_tokens": 17169495.0,
"eval_chemistry_runtime": 34.8316,
"eval_chemistry_samples_per_second": 14.355,
"eval_chemistry_steps_per_second": 3.589,
"step": 600
},
{
"epoch": 0.96,
"eval_math_entropy": 0.7047771346569062,
"eval_math_loss": 0.7034372687339783,
"eval_math_mean_token_accuracy": 0.8131817808151245,
"eval_math_num_tokens": 17169495.0,
"eval_math_runtime": 35.8696,
"eval_math_samples_per_second": 13.939,
"eval_math_steps_per_second": 3.485,
"step": 600
},
{
"entropy": 0.6952510023489594,
"epoch": 0.976,
"grad_norm": 1.875,
"learning_rate": 1.218e-05,
"loss": 0.6788,
"mean_token_accuracy": 0.817372427508235,
"num_tokens": 17454147.0,
"step": 610
},
{
"entropy": 0.7092014687135816,
"epoch": 0.992,
"grad_norm": 1.6640625,
"learning_rate": 1.2380000000000002e-05,
"loss": 0.6838,
"mean_token_accuracy": 0.8138624154031276,
"num_tokens": 17742596.0,
"step": 620
},
{
"entropy": 0.694889472052455,
"epoch": 1.008,
"grad_norm": 1.5546875,
"learning_rate": 1.2580000000000002e-05,
"loss": 0.6742,
"mean_token_accuracy": 0.8174668036401271,
"num_tokens": 18033540.0,
"step": 630
},
{
"entropy": 0.6714398205280304,
"epoch": 1.024,
"grad_norm": 1.7578125,
"learning_rate": 1.2780000000000001e-05,
"loss": 0.6507,
"mean_token_accuracy": 0.8213358622044324,
"num_tokens": 18316346.0,
"step": 640
},
{
"entropy": 0.6660828510299325,
"epoch": 1.04,
"grad_norm": 1.796875,
"learning_rate": 1.2980000000000001e-05,
"loss": 0.6496,
"mean_token_accuracy": 0.8221284162253142,
"num_tokens": 18595230.0,
"step": 650
},
{
"entropy": 0.6795160492882133,
"epoch": 1.056,
"grad_norm": 1.765625,
"learning_rate": 1.3180000000000001e-05,
"loss": 0.6581,
"mean_token_accuracy": 0.8199356343597174,
"num_tokens": 18885547.0,
"step": 660
},
{
"entropy": 0.6750219637528062,
"epoch": 1.072,
"grad_norm": 1.6484375,
"learning_rate": 1.3380000000000002e-05,
"loss": 0.6555,
"mean_token_accuracy": 0.8197014667093754,
"num_tokens": 19169083.0,
"step": 670
},
{
"entropy": 0.6712652388960123,
"epoch": 1.088,
"grad_norm": 1.7109375,
"learning_rate": 1.3580000000000002e-05,
"loss": 0.6434,
"mean_token_accuracy": 0.8221997711807489,
"num_tokens": 19461465.0,
"step": 680
},
{
"entropy": 0.6703712901100516,
"epoch": 1.104,
"grad_norm": 1.46875,
"learning_rate": 1.378e-05,
"loss": 0.6514,
"mean_token_accuracy": 0.8217100899666547,
"num_tokens": 19755027.0,
"step": 690
},
{
"entropy": 0.6529567580670118,
"epoch": 1.12,
"grad_norm": 1.765625,
"learning_rate": 1.398e-05,
"loss": 0.6325,
"mean_token_accuracy": 0.8260030064731836,
"num_tokens": 20043293.0,
"step": 700
},
{
"epoch": 1.12,
"eval_chemistry_entropy": 0.7117109818458557,
"eval_chemistry_loss": 0.715001106262207,
"eval_chemistry_mean_token_accuracy": 0.805173789024353,
"eval_chemistry_num_tokens": 20043293.0,
"eval_chemistry_runtime": 34.8451,
"eval_chemistry_samples_per_second": 14.349,
"eval_chemistry_steps_per_second": 3.587,
"step": 700
},
{
"epoch": 1.12,
"eval_math_entropy": 0.6742688639163971,
"eval_math_loss": 0.6903207302093506,
"eval_math_mean_token_accuracy": 0.8157781276702881,
"eval_math_num_tokens": 20043293.0,
"eval_math_runtime": 35.8894,
"eval_math_samples_per_second": 13.932,
"eval_math_steps_per_second": 3.483,
"step": 700
},
{
"entropy": 0.6546579284593463,
"epoch": 1.1360000000000001,
"grad_norm": 1.671875,
"learning_rate": 1.418e-05,
"loss": 0.6415,
"mean_token_accuracy": 0.8239764388650656,
"num_tokens": 20323010.0,
"step": 710
},
{
"entropy": 0.6666190484538674,
"epoch": 1.152,
"grad_norm": 1.6953125,
"learning_rate": 1.4380000000000001e-05,
"loss": 0.6493,
"mean_token_accuracy": 0.8230571333318949,
"num_tokens": 20613266.0,
"step": 720
},
{
"entropy": 0.6584050474688411,
"epoch": 1.168,
"grad_norm": 1.5,
"learning_rate": 1.4580000000000001e-05,
"loss": 0.6408,
"mean_token_accuracy": 0.8246052328497171,
"num_tokens": 20910404.0,
"step": 730
},
{
"entropy": 0.664110666513443,
"epoch": 1.184,
"grad_norm": 1.734375,
"learning_rate": 1.478e-05,
"loss": 0.6435,
"mean_token_accuracy": 0.8229058619588614,
"num_tokens": 21200197.0,
"step": 740
},
{
"entropy": 0.6511959439143539,
"epoch": 1.2,
"grad_norm": 1.46875,
"learning_rate": 1.498e-05,
"loss": 0.633,
"mean_token_accuracy": 0.8250994741916656,
"num_tokens": 21496441.0,
"step": 750
},
{
"entropy": 0.668997959420085,
"epoch": 1.216,
"grad_norm": 1.7421875,
"learning_rate": 1.5180000000000002e-05,
"loss": 0.6456,
"mean_token_accuracy": 0.8220734592527151,
"num_tokens": 21778030.0,
"step": 760
},
{
"entropy": 0.6459713563323021,
"epoch": 1.232,
"grad_norm": 1.75,
"learning_rate": 1.5380000000000002e-05,
"loss": 0.6253,
"mean_token_accuracy": 0.8270299468189478,
"num_tokens": 22060851.0,
"step": 770
},
{
"entropy": 0.6455961847677827,
"epoch": 1.248,
"grad_norm": 1.5859375,
"learning_rate": 1.5580000000000003e-05,
"loss": 0.6255,
"mean_token_accuracy": 0.8260451622307301,
"num_tokens": 22348738.0,
"step": 780
},
{
"entropy": 0.6676814066246152,
"epoch": 1.264,
"grad_norm": 1.484375,
"learning_rate": 1.578e-05,
"loss": 0.6457,
"mean_token_accuracy": 0.8219054654240608,
"num_tokens": 22635039.0,
"step": 790
},
{
"entropy": 0.6654805542901159,
"epoch": 1.28,
"grad_norm": 1.53125,
"learning_rate": 1.5980000000000003e-05,
"loss": 0.6444,
"mean_token_accuracy": 0.8224660288542509,
"num_tokens": 22915190.0,
"step": 800
},
{
"epoch": 1.28,
"eval_chemistry_entropy": 0.7472894523143768,
"eval_chemistry_loss": 0.7169390916824341,
"eval_chemistry_mean_token_accuracy": 0.805417845249176,
"eval_chemistry_num_tokens": 22915190.0,
"eval_chemistry_runtime": 34.8749,
"eval_chemistry_samples_per_second": 14.337,
"eval_chemistry_steps_per_second": 3.584,
"step": 800
},
{
"epoch": 1.28,
"eval_math_entropy": 0.6949311044216157,
"eval_math_loss": 0.678849458694458,
"eval_math_mean_token_accuracy": 0.8176955370903015,
"eval_math_num_tokens": 22915190.0,
"eval_math_runtime": 35.8885,
"eval_math_samples_per_second": 13.932,
"eval_math_steps_per_second": 3.483,
"step": 800
},
{
"entropy": 0.6526781121268869,
"epoch": 1.296,
"grad_norm": 1.46875,
"learning_rate": 1.618e-05,
"loss": 0.627,
"mean_token_accuracy": 0.8251245643943548,
"num_tokens": 23194947.0,
"step": 810
},
{
"entropy": 0.6623267890885473,
"epoch": 1.312,
"grad_norm": 1.65625,
"learning_rate": 1.638e-05,
"loss": 0.6522,
"mean_token_accuracy": 0.8221593346446753,
"num_tokens": 23483496.0,
"step": 820
},
{
"entropy": 0.675568882189691,
"epoch": 1.328,
"grad_norm": 1.6640625,
"learning_rate": 1.658e-05,
"loss": 0.6523,
"mean_token_accuracy": 0.8208769094198942,
"num_tokens": 23762589.0,
"step": 830
},
{
"entropy": 0.6467019423842431,
"epoch": 1.3439999999999999,
"grad_norm": 1.6953125,
"learning_rate": 1.6780000000000002e-05,
"loss": 0.6297,
"mean_token_accuracy": 0.825079932808876,
"num_tokens": 24041497.0,
"step": 840
},
{
"entropy": 0.656816397048533,
"epoch": 1.3599999999999999,
"grad_norm": 1.296875,
"learning_rate": 1.698e-05,
"loss": 0.6365,
"mean_token_accuracy": 0.8254420697689057,
"num_tokens": 24330591.0,
"step": 850
},
{
"entropy": 0.6973325841128826,
"epoch": 1.376,
"grad_norm": 1.484375,
"learning_rate": 1.718e-05,
"loss": 0.6754,
"mean_token_accuracy": 0.8146579563617706,
"num_tokens": 24611420.0,
"step": 860
},
{
"entropy": 0.6515109525993467,
"epoch": 1.392,
"grad_norm": 1.5234375,
"learning_rate": 1.7380000000000003e-05,
"loss": 0.6297,
"mean_token_accuracy": 0.8239485524594784,
"num_tokens": 24894694.0,
"step": 870
},
{
"entropy": 0.6447439486160874,
"epoch": 1.408,
"grad_norm": 1.546875,
"learning_rate": 1.758e-05,
"loss": 0.6231,
"mean_token_accuracy": 0.8267700038850307,
"num_tokens": 25184946.0,
"step": 880
},
{
"entropy": 0.6419832136482,
"epoch": 1.424,
"grad_norm": 1.3515625,
"learning_rate": 1.7780000000000003e-05,
"loss": 0.624,
"mean_token_accuracy": 0.8271039195358754,
"num_tokens": 25474746.0,
"step": 890
},
{
"entropy": 0.6597341772168874,
"epoch": 1.44,
"grad_norm": 1.4296875,
"learning_rate": 1.798e-05,
"loss": 0.6378,
"mean_token_accuracy": 0.8231316354125738,
"num_tokens": 25758550.0,
"step": 900
},
{
"epoch": 1.44,
"eval_chemistry_entropy": 0.7189412865638732,
"eval_chemistry_loss": 0.721181333065033,
"eval_chemistry_mean_token_accuracy": 0.8038096494674682,
"eval_chemistry_num_tokens": 25758550.0,
"eval_chemistry_runtime": 34.8369,
"eval_chemistry_samples_per_second": 14.353,
"eval_chemistry_steps_per_second": 3.588,
"step": 900
},
{
"epoch": 1.44,
"eval_math_entropy": 0.6560292990207672,
"eval_math_loss": 0.6677282452583313,
"eval_math_mean_token_accuracy": 0.8188103575706482,
"eval_math_num_tokens": 25758550.0,
"eval_math_runtime": 35.8656,
"eval_math_samples_per_second": 13.941,
"eval_math_steps_per_second": 3.485,
"step": 900
},
{
"entropy": 0.6441044477745891,
"epoch": 1.456,
"grad_norm": 1.5078125,
"learning_rate": 1.8180000000000002e-05,
"loss": 0.6259,
"mean_token_accuracy": 0.8252961106598378,
"num_tokens": 26039862.0,
"step": 910
},
{
"entropy": 0.6502820059657097,
"epoch": 1.472,
"grad_norm": 1.3125,
"learning_rate": 1.8380000000000004e-05,
"loss": 0.6342,
"mean_token_accuracy": 0.8237598706036806,
"num_tokens": 26318666.0,
"step": 920
},
{
"entropy": 0.6764587434008718,
"epoch": 1.488,
"grad_norm": 1.40625,
"learning_rate": 1.858e-05,
"loss": 0.6547,
"mean_token_accuracy": 0.8194571785628796,
"num_tokens": 26594867.0,
"step": 930
},
{
"entropy": 0.6333928175270558,
"epoch": 1.504,
"grad_norm": 1.6875,
"learning_rate": 1.878e-05,
"loss": 0.6176,
"mean_token_accuracy": 0.8289908330887557,
"num_tokens": 26887233.0,
"step": 940
},
{
"entropy": 0.6682084022089839,
"epoch": 1.52,
"grad_norm": 1.2109375,
"learning_rate": 1.898e-05,
"loss": 0.6442,
"mean_token_accuracy": 0.8210660863667727,
"num_tokens": 27177448.0,
"step": 950
},
{
"entropy": 0.6442653369158506,
"epoch": 1.536,
"grad_norm": 1.2109375,
"learning_rate": 1.918e-05,
"loss": 0.6319,
"mean_token_accuracy": 0.8249651778489351,
"num_tokens": 27467857.0,
"step": 960
},
{
"entropy": 0.6457945328205824,
"epoch": 1.552,
"grad_norm": 1.3828125,
"learning_rate": 1.938e-05,
"loss": 0.6262,
"mean_token_accuracy": 0.8254117891192436,
"num_tokens": 27755138.0,
"step": 970
},
{
"entropy": 0.6328146204352378,
"epoch": 1.568,
"grad_norm": 1.234375,
"learning_rate": 1.9580000000000002e-05,
"loss": 0.6132,
"mean_token_accuracy": 0.8306384857743978,
"num_tokens": 28049363.0,
"step": 980
},
{
"entropy": 0.6414034033194185,
"epoch": 1.584,
"grad_norm": 1.4375,
"learning_rate": 1.978e-05,
"loss": 0.6233,
"mean_token_accuracy": 0.8266693830490113,
"num_tokens": 28337890.0,
"step": 990
},
{
"entropy": 0.6323226554319262,
"epoch": 1.6,
"grad_norm": 1.4296875,
"learning_rate": 1.9980000000000002e-05,
"loss": 0.6189,
"mean_token_accuracy": 0.8256060272455216,
"num_tokens": 28626068.0,
"step": 1000
},
{
"epoch": 1.6,
"eval_chemistry_entropy": 0.7118635489940643,
"eval_chemistry_loss": 0.7256439924240112,
"eval_chemistry_mean_token_accuracy": 0.8026832752227783,
"eval_chemistry_num_tokens": 28626068.0,
"eval_chemistry_runtime": 34.8345,
"eval_chemistry_samples_per_second": 14.354,
"eval_chemistry_steps_per_second": 3.588,
"step": 1000
},
{
"epoch": 1.6,
"eval_math_entropy": 0.6450024034976959,
"eval_math_loss": 0.6582702994346619,
"eval_math_mean_token_accuracy": 0.820967010974884,
"eval_math_num_tokens": 28626068.0,
"eval_math_runtime": 35.6245,
"eval_math_samples_per_second": 14.035,
"eval_math_steps_per_second": 3.509,
"step": 1000
},
{
"entropy": 0.6489538656547665,
"epoch": 1.616,
"grad_norm": 1.3125,
"learning_rate": 1.9980000000000002e-05,
"loss": 0.636,
"mean_token_accuracy": 0.8241962313652038,
"num_tokens": 28913208.0,
"step": 1010
},
{
"entropy": 0.656857686303556,
"epoch": 1.6320000000000001,
"grad_norm": 1.3359375,
"learning_rate": 1.995777777777778e-05,
"loss": 0.6357,
"mean_token_accuracy": 0.8214856889098883,
"num_tokens": 29188917.0,
"step": 1020
},
{
"entropy": 0.6321157278493047,
"epoch": 1.6480000000000001,
"grad_norm": 1.390625,
"learning_rate": 1.9935555555555557e-05,
"loss": 0.6152,
"mean_token_accuracy": 0.8298161163926124,
"num_tokens": 29477292.0,
"step": 1030
},
{
"entropy": 0.6432073757052421,
"epoch": 1.6640000000000001,
"grad_norm": 1.3359375,
"learning_rate": 1.9913333333333335e-05,
"loss": 0.62,
"mean_token_accuracy": 0.8257944118231535,
"num_tokens": 29772480.0,
"step": 1040
},
{
"entropy": 0.647973028384149,
"epoch": 1.6800000000000002,
"grad_norm": 1.2109375,
"learning_rate": 1.9891111111111112e-05,
"loss": 0.6323,
"mean_token_accuracy": 0.8256966724991799,
"num_tokens": 30061007.0,
"step": 1050
},
{
"entropy": 0.6499059528112412,
"epoch": 1.696,
"grad_norm": 1.296875,
"learning_rate": 1.986888888888889e-05,
"loss": 0.6316,
"mean_token_accuracy": 0.8242154024541378,
"num_tokens": 30344415.0,
"step": 1060
},
{
"entropy": 0.6270535726100206,
"epoch": 1.712,
"grad_norm": 1.2109375,
"learning_rate": 1.9846666666666668e-05,
"loss": 0.6152,
"mean_token_accuracy": 0.8287085957825184,
"num_tokens": 30640891.0,
"step": 1070
},
{
"entropy": 0.6247420905157923,
"epoch": 1.728,
"grad_norm": 1.3515625,
"learning_rate": 1.9824444444444445e-05,
"loss": 0.6059,
"mean_token_accuracy": 0.8300342559814453,
"num_tokens": 30921295.0,
"step": 1080
},
{
"entropy": 0.6317020528018474,
"epoch": 1.744,
"grad_norm": 1.3984375,
"learning_rate": 1.9802222222222226e-05,
"loss": 0.6158,
"mean_token_accuracy": 0.828870889171958,
"num_tokens": 31206375.0,
"step": 1090
},
{
"entropy": 0.6282909054309129,
"epoch": 1.76,
"grad_norm": 1.1953125,
"learning_rate": 1.978e-05,
"loss": 0.6101,
"mean_token_accuracy": 0.8280309360474348,
"num_tokens": 31491856.0,
"step": 1100
},
{
"epoch": 1.76,
"eval_chemistry_entropy": 0.7381789875030518,
"eval_chemistry_loss": 0.7256398797035217,
"eval_chemistry_mean_token_accuracy": 0.8018755903244018,
"eval_chemistry_num_tokens": 31491856.0,
"eval_chemistry_runtime": 34.8532,
"eval_chemistry_samples_per_second": 14.346,
"eval_chemistry_steps_per_second": 3.586,
"step": 1100
},
{
"epoch": 1.76,
"eval_math_entropy": 0.6412687346935272,
"eval_math_loss": 0.6473493576049805,
"eval_math_mean_token_accuracy": 0.8233258814811707,
"eval_math_num_tokens": 31491856.0,
"eval_math_runtime": 35.8787,
"eval_math_samples_per_second": 13.936,
"eval_math_steps_per_second": 3.484,
"step": 1100
},
{
"entropy": 0.6336805198341608,
"epoch": 1.776,
"grad_norm": 1.3203125,
"learning_rate": 1.975777777777778e-05,
"loss": 0.6135,
"mean_token_accuracy": 0.8275633446872235,
"num_tokens": 31782829.0,
"step": 1110
},
{
"entropy": 0.6414820792153477,
"epoch": 1.792,
"grad_norm": 1.2734375,
"learning_rate": 1.9735555555555556e-05,
"loss": 0.6246,
"mean_token_accuracy": 0.8254438240081072,
"num_tokens": 32070187.0,
"step": 1120
},
{
"entropy": 0.6382445661351085,
"epoch": 1.808,
"grad_norm": 1.2890625,
"learning_rate": 1.9713333333333337e-05,
"loss": 0.6217,
"mean_token_accuracy": 0.8260911278426647,
"num_tokens": 32355072.0,
"step": 1130
},
{
"entropy": 0.6067643767222762,
"epoch": 1.8239999999999998,
"grad_norm": 1.1484375,
"learning_rate": 1.969111111111111e-05,
"loss": 0.5867,
"mean_token_accuracy": 0.8337960425764323,
"num_tokens": 32634013.0,
"step": 1140
},
{
"entropy": 0.6227916920557618,
"epoch": 1.8399999999999999,
"grad_norm": 1.34375,
"learning_rate": 1.9668888888888892e-05,
"loss": 0.6108,
"mean_token_accuracy": 0.8300515715032816,
"num_tokens": 32917222.0,
"step": 1150
},
{
"entropy": 0.6352459752932191,
"epoch": 1.8559999999999999,
"grad_norm": 1.265625,
"learning_rate": 1.9646666666666666e-05,
"loss": 0.613,
"mean_token_accuracy": 0.8282338980585336,
"num_tokens": 33207147.0,
"step": 1160
},
{
"entropy": 0.6206054732203483,
"epoch": 1.8719999999999999,
"grad_norm": 1.28125,
"learning_rate": 1.9624444444444447e-05,
"loss": 0.6023,
"mean_token_accuracy": 0.8303315650671721,
"num_tokens": 33496572.0,
"step": 1170
},
{
"entropy": 0.6307253973558545,
"epoch": 1.888,
"grad_norm": 1.3046875,
"learning_rate": 1.9602222222222225e-05,
"loss": 0.6134,
"mean_token_accuracy": 0.8282632239162921,
"num_tokens": 33775788.0,
"step": 1180
},
{
"entropy": 0.6315167531371116,
"epoch": 1.904,
"grad_norm": 1.2421875,
"learning_rate": 1.9580000000000002e-05,
"loss": 0.615,
"mean_token_accuracy": 0.8275392096489668,
"num_tokens": 34052322.0,
"step": 1190
},
{
"entropy": 0.6110374081879855,
"epoch": 1.92,
"grad_norm": 1.1796875,
"learning_rate": 1.955777777777778e-05,
"loss": 0.5959,
"mean_token_accuracy": 0.8313204348087311,
"num_tokens": 34331468.0,
"step": 1200
},
{
"epoch": 1.92,
"eval_chemistry_entropy": 0.7433430399894715,
"eval_chemistry_loss": 0.728480339050293,
"eval_chemistry_mean_token_accuracy": 0.8014561586380005,
"eval_chemistry_num_tokens": 34331468.0,
"eval_chemistry_runtime": 34.8484,
"eval_chemistry_samples_per_second": 14.348,
"eval_chemistry_steps_per_second": 3.587,
"step": 1200
},
{
"epoch": 1.92,
"eval_math_entropy": 0.6532090711593628,
"eval_math_loss": 0.6392548680305481,
"eval_math_mean_token_accuracy": 0.8242080550193787,
"eval_math_num_tokens": 34331468.0,
"eval_math_runtime": 35.8593,
"eval_math_samples_per_second": 13.943,
"eval_math_steps_per_second": 3.486,
"step": 1200
},
{
"entropy": 0.6277910789474845,
"epoch": 1.936,
"grad_norm": 1.2890625,
"learning_rate": 1.9535555555555557e-05,
"loss": 0.6122,
"mean_token_accuracy": 0.8281190965324641,
"num_tokens": 34612743.0,
"step": 1210
},
{
"entropy": 0.6204241087660194,
"epoch": 1.952,
"grad_norm": 1.1796875,
"learning_rate": 1.9513333333333335e-05,
"loss": 0.5964,
"mean_token_accuracy": 0.8305780492722988,
"num_tokens": 34891115.0,
"step": 1220
},
{
"entropy": 0.6416850406676531,
"epoch": 1.968,
"grad_norm": 2.8125,
"learning_rate": 1.9491111111111113e-05,
"loss": 0.6335,
"mean_token_accuracy": 0.8254266548901796,
"num_tokens": 35179505.0,
"step": 1230
},
{
"entropy": 0.6098293786868453,
"epoch": 1.984,
"grad_norm": 1.140625,
"learning_rate": 1.946888888888889e-05,
"loss": 0.5945,
"mean_token_accuracy": 0.8320291046053171,
"num_tokens": 35472596.0,
"step": 1240
},
{
"entropy": 0.6196965377777814,
"epoch": 2.0,
"grad_norm": 1.171875,
"learning_rate": 1.9446666666666668e-05,
"loss": 0.604,
"mean_token_accuracy": 0.8302332308143378,
"num_tokens": 35772848.0,
"step": 1250
},
{
"entropy": 0.5497036971151829,
"epoch": 2.016,
"grad_norm": 1.46875,
"learning_rate": 1.9424444444444446e-05,
"loss": 0.5255,
"mean_token_accuracy": 0.8438023224472999,
"num_tokens": 36060330.0,
"step": 1260
},
{
"entropy": 0.5297272937372327,
"epoch": 2.032,
"grad_norm": 1.46875,
"learning_rate": 1.9402222222222223e-05,
"loss": 0.5147,
"mean_token_accuracy": 0.8473685499280691,
"num_tokens": 36343894.0,
"step": 1270
},
{
"entropy": 0.5197363485582173,
"epoch": 2.048,
"grad_norm": 1.4296875,
"learning_rate": 1.938e-05,
"loss": 0.5054,
"mean_token_accuracy": 0.8493004187941551,
"num_tokens": 36636410.0,
"step": 1280
},
{
"entropy": 0.5172868834808468,
"epoch": 2.064,
"grad_norm": 1.328125,
"learning_rate": 1.935777777777778e-05,
"loss": 0.5014,
"mean_token_accuracy": 0.849420978501439,
"num_tokens": 36925382.0,
"step": 1290
},
{
"entropy": 0.5238144496455789,
"epoch": 2.08,
"grad_norm": 1.5,
"learning_rate": 1.9335555555555556e-05,
"loss": 0.5006,
"mean_token_accuracy": 0.8484534539282322,
"num_tokens": 37208672.0,
"step": 1300
},
{
"epoch": 2.08,
"eval_chemistry_entropy": 0.5902576115131378,
"eval_chemistry_loss": 0.7615314722061157,
"eval_chemistry_mean_token_accuracy": 0.7987616105079651,
"eval_chemistry_num_tokens": 37208672.0,
"eval_chemistry_runtime": 34.8214,
"eval_chemistry_samples_per_second": 14.359,
"eval_chemistry_steps_per_second": 3.59,
"step": 1300
},
{
"epoch": 2.08,
"eval_math_entropy": 0.5490774600505829,
"eval_math_loss": 0.6556233167648315,
"eval_math_mean_token_accuracy": 0.8236036248207093,
"eval_math_num_tokens": 37208672.0,
"eval_math_runtime": 35.898,
"eval_math_samples_per_second": 13.928,
"eval_math_steps_per_second": 3.482,
"step": 1300
},
{
"entropy": 0.5296811152249574,
"epoch": 2.096,
"grad_norm": 1.7265625,
"learning_rate": 1.9313333333333334e-05,
"loss": 0.5143,
"mean_token_accuracy": 0.8471574913710356,
"num_tokens": 37491954.0,
"step": 1310
},
{
"entropy": 0.5180984031409025,
"epoch": 2.112,
"grad_norm": 1.4375,
"learning_rate": 1.9291111111111115e-05,
"loss": 0.4988,
"mean_token_accuracy": 0.8482684683054685,
"num_tokens": 37787033.0,
"step": 1320
},
{
"entropy": 0.5158460404723882,
"epoch": 2.128,
"grad_norm": 1.4453125,
"learning_rate": 1.926888888888889e-05,
"loss": 0.4989,
"mean_token_accuracy": 0.8504650525748729,
"num_tokens": 38068955.0,
"step": 1330
},
{
"entropy": 0.5193909807130694,
"epoch": 2.144,
"grad_norm": 1.4140625,
"learning_rate": 1.924666666666667e-05,
"loss": 0.5058,
"mean_token_accuracy": 0.8479062043130398,
"num_tokens": 38359914.0,
"step": 1340
},
{
"entropy": 0.518505304865539,
"epoch": 2.16,
"grad_norm": 1.3203125,
"learning_rate": 1.9224444444444444e-05,
"loss": 0.5031,
"mean_token_accuracy": 0.8498795099556447,
"num_tokens": 38649684.0,
"step": 1350
},
{
"entropy": 0.49965119622647763,
"epoch": 2.176,
"grad_norm": 1.359375,
"learning_rate": 1.9202222222222225e-05,
"loss": 0.4821,
"mean_token_accuracy": 0.8528774298727513,
"num_tokens": 38938155.0,
"step": 1360
},
{
"entropy": 0.5324017994105816,
"epoch": 2.192,
"grad_norm": 1.3359375,
"learning_rate": 1.918e-05,
"loss": 0.5166,
"mean_token_accuracy": 0.8449586551636458,
"num_tokens": 39228800.0,
"step": 1370
},
{
"entropy": 0.523920483700931,
"epoch": 2.208,
"grad_norm": 1.5390625,
"learning_rate": 1.915777777777778e-05,
"loss": 0.5153,
"mean_token_accuracy": 0.8468315444886685,
"num_tokens": 39513118.0,
"step": 1380
},
{
"entropy": 0.5206425994634628,
"epoch": 2.224,
"grad_norm": 1.5546875,
"learning_rate": 1.9135555555555555e-05,
"loss": 0.5006,
"mean_token_accuracy": 0.8491694446653127,
"num_tokens": 39804212.0,
"step": 1390
},
{
"entropy": 0.5240093268454075,
"epoch": 2.24,
"grad_norm": 1.40625,
"learning_rate": 1.9113333333333336e-05,
"loss": 0.5127,
"mean_token_accuracy": 0.8465723715722561,
"num_tokens": 40092012.0,
"step": 1400
},
{
"epoch": 2.24,
"eval_chemistry_entropy": 0.5741701843738556,
"eval_chemistry_loss": 0.7734760642051697,
"eval_chemistry_mean_token_accuracy": 0.7974499335289001,
"eval_chemistry_num_tokens": 40092012.0,
"eval_chemistry_runtime": 34.8009,
"eval_chemistry_samples_per_second": 14.367,
"eval_chemistry_steps_per_second": 3.592,
"step": 1400
},
{
"epoch": 2.24,
"eval_math_entropy": 0.5538151891231536,
"eval_math_loss": 0.6544409394264221,
"eval_math_mean_token_accuracy": 0.8238605060577393,
"eval_math_num_tokens": 40092012.0,
"eval_math_runtime": 35.8968,
"eval_math_samples_per_second": 13.929,
"eval_math_steps_per_second": 3.482,
"step": 1400
},
{
"entropy": 0.5231095163151622,
"epoch": 2.2560000000000002,
"grad_norm": 1.46875,
"learning_rate": 1.9091111111111113e-05,
"loss": 0.5086,
"mean_token_accuracy": 0.8475595053285361,
"num_tokens": 40380069.0,
"step": 1410
},
{
"entropy": 0.5324242118746042,
"epoch": 2.2720000000000002,
"grad_norm": 1.6171875,
"learning_rate": 1.906888888888889e-05,
"loss": 0.5147,
"mean_token_accuracy": 0.8457300752401352,
"num_tokens": 40655903.0,
"step": 1420
},
{
"entropy": 0.5270635643973947,
"epoch": 2.288,
"grad_norm": 1.484375,
"learning_rate": 1.904666666666667e-05,
"loss": 0.51,
"mean_token_accuracy": 0.8453500598669053,
"num_tokens": 40944791.0,
"step": 1430
},
{
"entropy": 0.5129735874012112,
"epoch": 2.304,
"grad_norm": 1.5390625,
"learning_rate": 1.9024444444444446e-05,
"loss": 0.4984,
"mean_token_accuracy": 0.8505097340792418,
"num_tokens": 41227146.0,
"step": 1440
},
{
"entropy": 0.5308720570988953,
"epoch": 2.32,
"grad_norm": 1.5,
"learning_rate": 1.9002222222222224e-05,
"loss": 0.5197,
"mean_token_accuracy": 0.8463008664548397,
"num_tokens": 41509322.0,
"step": 1450
},
{
"entropy": 0.5264440540224313,
"epoch": 2.336,
"grad_norm": 1.4609375,
"learning_rate": 1.898e-05,
"loss": 0.5068,
"mean_token_accuracy": 0.8473060473799705,
"num_tokens": 41792799.0,
"step": 1460
},
{
"entropy": 0.5393752640113234,
"epoch": 2.352,
"grad_norm": 1.4609375,
"learning_rate": 1.895777777777778e-05,
"loss": 0.5226,
"mean_token_accuracy": 0.8433732774108649,
"num_tokens": 42083080.0,
"step": 1470
},
{
"entropy": 0.5265612868592143,
"epoch": 2.368,
"grad_norm": 1.4921875,
"learning_rate": 1.8935555555555556e-05,
"loss": 0.5118,
"mean_token_accuracy": 0.8467371355742216,
"num_tokens": 42368442.0,
"step": 1480
},
{
"entropy": 0.5453734394162894,
"epoch": 2.384,
"grad_norm": 1.7578125,
"learning_rate": 1.8913333333333334e-05,
"loss": 0.5308,
"mean_token_accuracy": 0.8432964202016592,
"num_tokens": 42663926.0,
"step": 1490
},
{
"entropy": 0.5337216904386878,
"epoch": 2.4,
"grad_norm": 1.8203125,
"learning_rate": 1.8891111111111115e-05,
"loss": 0.5215,
"mean_token_accuracy": 0.8452994517982007,
"num_tokens": 42946351.0,
"step": 1500
},
{
"epoch": 2.4,
"eval_chemistry_entropy": 0.5733621008396149,
"eval_chemistry_loss": 0.7763350009918213,
"eval_chemistry_mean_token_accuracy": 0.7971291627883911,
"eval_chemistry_num_tokens": 42946351.0,
"eval_chemistry_runtime": 34.7936,
"eval_chemistry_samples_per_second": 14.37,
"eval_chemistry_steps_per_second": 3.593,
"step": 1500
},
{
"epoch": 2.4,
"eval_math_entropy": 0.5494145798683167,
"eval_math_loss": 0.6489285826683044,
"eval_math_mean_token_accuracy": 0.8246520318984986,
"eval_math_num_tokens": 42946351.0,
"eval_math_runtime": 35.4869,
"eval_math_samples_per_second": 14.09,
"eval_math_steps_per_second": 3.522,
"step": 1500
},
{
"entropy": 0.5415687510743737,
"epoch": 2.416,
"grad_norm": 1.484375,
"learning_rate": 1.886888888888889e-05,
"loss": 0.5263,
"mean_token_accuracy": 0.8432823572307825,
"num_tokens": 43227782.0,
"step": 1510
},
{
"entropy": 0.5234041666612029,
"epoch": 2.432,
"grad_norm": 1.421875,
"learning_rate": 1.884666666666667e-05,
"loss": 0.5124,
"mean_token_accuracy": 0.8473271746188402,
"num_tokens": 43511939.0,
"step": 1520
},
{
"entropy": 0.5191137973219156,
"epoch": 2.448,
"grad_norm": 1.5703125,
"learning_rate": 1.8824444444444445e-05,
"loss": 0.5039,
"mean_token_accuracy": 0.8494564235210419,
"num_tokens": 43811005.0,
"step": 1530
},
{
"entropy": 0.5308497181162238,
"epoch": 2.464,
"grad_norm": 1.59375,
"learning_rate": 1.8802222222222226e-05,
"loss": 0.5174,
"mean_token_accuracy": 0.8456704583019018,
"num_tokens": 44106223.0,
"step": 1540
},
{
"entropy": 0.5506666025146842,
"epoch": 2.48,
"grad_norm": 1.53125,
"learning_rate": 1.878e-05,
"loss": 0.5316,
"mean_token_accuracy": 0.8412250328809023,
"num_tokens": 44379095.0,
"step": 1550
},
{
"entropy": 0.5348324475809931,
"epoch": 2.496,
"grad_norm": 1.4296875,
"learning_rate": 1.875777777777778e-05,
"loss": 0.52,
"mean_token_accuracy": 0.8440562028437852,
"num_tokens": 44670316.0,
"step": 1560
},
{
"entropy": 0.5286164516583085,
"epoch": 2.512,
"grad_norm": 1.546875,
"learning_rate": 1.873555555555556e-05,
"loss": 0.516,
"mean_token_accuracy": 0.8460781283676624,
"num_tokens": 44952877.0,
"step": 1570
},
{
"entropy": 0.5242383845150471,
"epoch": 2.528,
"grad_norm": 1.4453125,
"learning_rate": 1.8713333333333336e-05,
"loss": 0.5084,
"mean_token_accuracy": 0.8474905800074339,
"num_tokens": 45237653.0,
"step": 1580
},
{
"entropy": 0.5217398202046752,
"epoch": 2.544,
"grad_norm": 1.625,
"learning_rate": 1.8691111111111114e-05,
"loss": 0.51,
"mean_token_accuracy": 0.8485121335834265,
"num_tokens": 45524866.0,
"step": 1590
},
{
"entropy": 0.5217585685662925,
"epoch": 2.56,
"grad_norm": 1.40625,
"learning_rate": 1.866888888888889e-05,
"loss": 0.504,
"mean_token_accuracy": 0.8478538550436496,
"num_tokens": 45803783.0,
"step": 1600
},
{
"epoch": 2.56,
"eval_chemistry_entropy": 0.5687358362674713,
"eval_chemistry_loss": 0.7774361371994019,
"eval_chemistry_mean_token_accuracy": 0.7964602584838867,
"eval_chemistry_num_tokens": 45803783.0,
"eval_chemistry_runtime": 34.792,
"eval_chemistry_samples_per_second": 14.371,
"eval_chemistry_steps_per_second": 3.593,
"step": 1600
},
{
"epoch": 2.56,
"eval_math_entropy": 0.5417093946933746,
"eval_math_loss": 0.6447312831878662,
"eval_math_mean_token_accuracy": 0.8262028379440307,
"eval_math_num_tokens": 45803783.0,
"eval_math_runtime": 35.8206,
"eval_math_samples_per_second": 13.958,
"eval_math_steps_per_second": 3.49,
"step": 1600
},
{
"entropy": 0.5245849631726742,
"epoch": 2.576,
"grad_norm": 1.4375,
"learning_rate": 1.864666666666667e-05,
"loss": 0.5095,
"mean_token_accuracy": 0.8479252554476261,
"num_tokens": 46090713.0,
"step": 1610
},
{
"entropy": 0.5172529483214021,
"epoch": 2.592,
"grad_norm": 1.3359375,
"learning_rate": 1.8624444444444446e-05,
"loss": 0.5036,
"mean_token_accuracy": 0.8490103390067816,
"num_tokens": 46384084.0,
"step": 1620
},
{
"entropy": 0.5333136133849621,
"epoch": 2.608,
"grad_norm": 1.625,
"learning_rate": 1.8602222222222224e-05,
"loss": 0.5216,
"mean_token_accuracy": 0.8443999428302049,
"num_tokens": 46667552.0,
"step": 1630
},
{
"entropy": 0.49988405164331196,
"epoch": 2.624,
"grad_norm": 1.453125,
"learning_rate": 1.858e-05,
"loss": 0.4842,
"mean_token_accuracy": 0.8543243549764157,
"num_tokens": 46961124.0,
"step": 1640
},
{
"entropy": 0.5301500145345927,
"epoch": 2.64,
"grad_norm": 1.5546875,
"learning_rate": 1.855777777777778e-05,
"loss": 0.518,
"mean_token_accuracy": 0.8460444647818803,
"num_tokens": 47245326.0,
"step": 1650
},
{
"entropy": 0.5089625578373671,
"epoch": 2.656,
"grad_norm": 1.578125,
"learning_rate": 1.8535555555555557e-05,
"loss": 0.4942,
"mean_token_accuracy": 0.8499364998191595,
"num_tokens": 47527347.0,
"step": 1660
},
{
"entropy": 0.5320184031501413,
"epoch": 2.672,
"grad_norm": 1.6875,
"learning_rate": 1.8513333333333335e-05,
"loss": 0.5159,
"mean_token_accuracy": 0.8455484293401241,
"num_tokens": 47808167.0,
"step": 1670
},
{
"entropy": 0.5261984881013632,
"epoch": 2.6879999999999997,
"grad_norm": 1.4140625,
"learning_rate": 1.8491111111111112e-05,
"loss": 0.5096,
"mean_token_accuracy": 0.8478115413337945,
"num_tokens": 48092035.0,
"step": 1680
},
{
"entropy": 0.5308143127709627,
"epoch": 2.7039999999999997,
"grad_norm": 1.609375,
"learning_rate": 1.846888888888889e-05,
"loss": 0.5208,
"mean_token_accuracy": 0.8449086494743824,
"num_tokens": 48386231.0,
"step": 1690
},
{
"entropy": 0.522861585021019,
"epoch": 2.7199999999999998,
"grad_norm": 1.5078125,
"learning_rate": 1.8446666666666667e-05,
"loss": 0.5101,
"mean_token_accuracy": 0.8472402919083833,
"num_tokens": 48663262.0,
"step": 1700
},
{
"epoch": 2.7199999999999998,
"eval_chemistry_entropy": 0.5746280739307403,
"eval_chemistry_loss": 0.7798963785171509,
"eval_chemistry_mean_token_accuracy": 0.7963899078369141,
"eval_chemistry_num_tokens": 48663262.0,
"eval_chemistry_runtime": 34.7443,
"eval_chemistry_samples_per_second": 14.391,
"eval_chemistry_steps_per_second": 3.598,
"step": 1700
},
{
"epoch": 2.7199999999999998,
"eval_math_entropy": 0.5586551280021668,
"eval_math_loss": 0.6379777789115906,
"eval_math_mean_token_accuracy": 0.8271588444709778,
"eval_math_num_tokens": 48663262.0,
"eval_math_runtime": 35.8034,
"eval_math_samples_per_second": 13.965,
"eval_math_steps_per_second": 3.491,
"step": 1700
},
{
"entropy": 0.5201129943132401,
"epoch": 2.7359999999999998,
"grad_norm": 1.4921875,
"learning_rate": 1.842444444444445e-05,
"loss": 0.5075,
"mean_token_accuracy": 0.8486686907708645,
"num_tokens": 48949456.0,
"step": 1710
},
{
"entropy": 0.5246327675879001,
"epoch": 2.752,
"grad_norm": 1.5078125,
"learning_rate": 1.8402222222222223e-05,
"loss": 0.5068,
"mean_token_accuracy": 0.8480775609612465,
"num_tokens": 49235751.0,
"step": 1720
},
{
"entropy": 0.5258618659339845,
"epoch": 2.768,
"grad_norm": 1.4453125,
"learning_rate": 1.8380000000000004e-05,
"loss": 0.5131,
"mean_token_accuracy": 0.8470302954316139,
"num_tokens": 49524628.0,
"step": 1730
},
{
"entropy": 0.5411237273365259,
"epoch": 2.784,
"grad_norm": 1.546875,
"learning_rate": 1.8357777777777778e-05,
"loss": 0.5228,
"mean_token_accuracy": 0.8433229614049196,
"num_tokens": 49799604.0,
"step": 1740
},
{
"entropy": 0.522053999081254,
"epoch": 2.8,
"grad_norm": 1.28125,
"learning_rate": 1.833555555555556e-05,
"loss": 0.509,
"mean_token_accuracy": 0.8484720811247826,
"num_tokens": 50079307.0,
"step": 1750
},
{
"entropy": 0.5252829389646649,
"epoch": 2.816,
"grad_norm": 1.5234375,
"learning_rate": 1.8313333333333333e-05,
"loss": 0.5056,
"mean_token_accuracy": 0.8476968578994274,
"num_tokens": 50372216.0,
"step": 1760
},
{
"entropy": 0.5136345084756613,
"epoch": 2.832,
"grad_norm": 1.59375,
"learning_rate": 1.8291111111111114e-05,
"loss": 0.5037,
"mean_token_accuracy": 0.8490433126688004,
"num_tokens": 50657211.0,
"step": 1770
},
{
"entropy": 0.5356445843353868,
"epoch": 2.848,
"grad_norm": 1.46875,
"learning_rate": 1.8268888888888888e-05,
"loss": 0.5228,
"mean_token_accuracy": 0.8450450662523508,
"num_tokens": 50946503.0,
"step": 1780
},
{
"entropy": 0.5204181535169482,
"epoch": 2.864,
"grad_norm": 1.609375,
"learning_rate": 1.824666666666667e-05,
"loss": 0.5087,
"mean_token_accuracy": 0.8479438953101635,
"num_tokens": 51229298.0,
"step": 1790
},
{
"entropy": 0.5238792803138494,
"epoch": 2.88,
"grad_norm": 1.5234375,
"learning_rate": 1.8224444444444447e-05,
"loss": 0.5047,
"mean_token_accuracy": 0.8475392743945122,
"num_tokens": 51509622.0,
"step": 1800
},
{
"epoch": 2.88,
"eval_chemistry_entropy": 0.5654533207416534,
"eval_chemistry_loss": 0.7800628542900085,
"eval_chemistry_mean_token_accuracy": 0.7969496622085571,
"eval_chemistry_num_tokens": 51509622.0,
"eval_chemistry_runtime": 34.7126,
"eval_chemistry_samples_per_second": 14.404,
"eval_chemistry_steps_per_second": 3.601,
"step": 1800
},
{
"epoch": 2.88,
"eval_math_entropy": 0.538058566570282,
"eval_math_loss": 0.636343240737915,
"eval_math_mean_token_accuracy": 0.8282605667114258,
"eval_math_num_tokens": 51509622.0,
"eval_math_runtime": 35.7752,
"eval_math_samples_per_second": 13.976,
"eval_math_steps_per_second": 3.494,
"step": 1800
},
{
"entropy": 0.5157067686319351,
"epoch": 2.896,
"grad_norm": 1.578125,
"learning_rate": 1.8202222222222225e-05,
"loss": 0.5005,
"mean_token_accuracy": 0.8493956170976162,
"num_tokens": 51790856.0,
"step": 1810
},
{
"entropy": 0.5220442852005363,
"epoch": 2.912,
"grad_norm": 1.5390625,
"learning_rate": 1.8180000000000002e-05,
"loss": 0.5082,
"mean_token_accuracy": 0.8484512511640787,
"num_tokens": 52074393.0,
"step": 1820
},
{
"entropy": 0.5298560874536633,
"epoch": 2.928,
"grad_norm": 1.5,
"learning_rate": 1.815777777777778e-05,
"loss": 0.5177,
"mean_token_accuracy": 0.8467922162264585,
"num_tokens": 52366467.0,
"step": 1830
},
{
"entropy": 0.5144043141975999,
"epoch": 2.944,
"grad_norm": 1.4765625,
"learning_rate": 1.8135555555555557e-05,
"loss": 0.5041,
"mean_token_accuracy": 0.848617946729064,
"num_tokens": 52657863.0,
"step": 1840
},
{
"entropy": 0.5239851342514157,
"epoch": 2.96,
"grad_norm": 1.5078125,
"learning_rate": 1.8113333333333335e-05,
"loss": 0.5086,
"mean_token_accuracy": 0.8469469167292119,
"num_tokens": 52947183.0,
"step": 1850
},
{
"entropy": 0.5214887933805585,
"epoch": 2.976,
"grad_norm": 1.390625,
"learning_rate": 1.8091111111111113e-05,
"loss": 0.5049,
"mean_token_accuracy": 0.8478761337697506,
"num_tokens": 53226984.0,
"step": 1860
},
{
"entropy": 0.5146605940535665,
"epoch": 2.992,
"grad_norm": 1.46875,
"learning_rate": 1.806888888888889e-05,
"loss": 0.4994,
"mean_token_accuracy": 0.8494556181132793,
"num_tokens": 53515204.0,
"step": 1870
},
{
"entropy": 0.47540333569049836,
"epoch": 3.008,
"grad_norm": 2.078125,
"learning_rate": 1.8046666666666668e-05,
"loss": 0.436,
"mean_token_accuracy": 0.8662762742489576,
"num_tokens": 53800592.0,
"step": 1880
},
{
"entropy": 0.35706824259832504,
"epoch": 3.024,
"grad_norm": 1.921875,
"learning_rate": 1.8024444444444445e-05,
"loss": 0.3438,
"mean_token_accuracy": 0.8883892893791199,
"num_tokens": 54092017.0,
"step": 1890
},
{
"entropy": 0.3444691884331405,
"epoch": 3.04,
"grad_norm": 1.90625,
"learning_rate": 1.8002222222222223e-05,
"loss": 0.3242,
"mean_token_accuracy": 0.8934439823031426,
"num_tokens": 54380158.0,
"step": 1900
},
{
"epoch": 3.04,
"eval_chemistry_entropy": 0.4136744683980942,
"eval_chemistry_loss": 0.9674221873283386,
"eval_chemistry_mean_token_accuracy": 0.785107521533966,
"eval_chemistry_num_tokens": 54380158.0,
"eval_chemistry_runtime": 34.7442,
"eval_chemistry_samples_per_second": 14.391,
"eval_chemistry_steps_per_second": 3.598,
"step": 1900
},
{
"epoch": 3.04,
"eval_math_entropy": 0.40369147157669066,
"eval_math_loss": 0.7466345429420471,
"eval_math_mean_token_accuracy": 0.8216371216773987,
"eval_math_num_tokens": 54380158.0,
"eval_math_runtime": 36.3078,
"eval_math_samples_per_second": 13.771,
"eval_math_steps_per_second": 3.443,
"step": 1900
},
{
"entropy": 0.3425462535582483,
"epoch": 3.056,
"grad_norm": 2.015625,
"learning_rate": 1.798e-05,
"loss": 0.3194,
"mean_token_accuracy": 0.894545292109251,
"num_tokens": 54658093.0,
"step": 1910
},
{
"entropy": 0.3412907443009317,
"epoch": 3.072,
"grad_norm": 2.015625,
"learning_rate": 1.7957777777777778e-05,
"loss": 0.3209,
"mean_token_accuracy": 0.8948256101459264,
"num_tokens": 54949938.0,
"step": 1920
},
{
"entropy": 0.32589660566300155,
"epoch": 3.088,
"grad_norm": 2.1875,
"learning_rate": 1.7935555555555556e-05,
"loss": 0.3083,
"mean_token_accuracy": 0.8980491489171982,
"num_tokens": 55234643.0,
"step": 1930
},
{
"entropy": 0.34667133893817664,
"epoch": 3.104,
"grad_norm": 2.015625,
"learning_rate": 1.7913333333333337e-05,
"loss": 0.3253,
"mean_token_accuracy": 0.8923853240907192,
"num_tokens": 55519679.0,
"step": 1940
},
{
"entropy": 0.33345927773043516,
"epoch": 3.12,
"grad_norm": 1.984375,
"learning_rate": 1.789111111111111e-05,
"loss": 0.3172,
"mean_token_accuracy": 0.8965467110276222,
"num_tokens": 55812397.0,
"step": 1950
},
{
"entropy": 0.33698156690225006,
"epoch": 3.136,
"grad_norm": 1.8984375,
"learning_rate": 1.7868888888888892e-05,
"loss": 0.319,
"mean_token_accuracy": 0.8949916884303093,
"num_tokens": 56101550.0,
"step": 1960
},
{
"entropy": 0.3372134535573423,
"epoch": 3.152,
"grad_norm": 2.140625,
"learning_rate": 1.7846666666666666e-05,
"loss": 0.3227,
"mean_token_accuracy": 0.8940825492143631,
"num_tokens": 56387662.0,
"step": 1970
},
{
"entropy": 0.3512561682611704,
"epoch": 3.168,
"grad_norm": 2.109375,
"learning_rate": 1.7824444444444447e-05,
"loss": 0.3342,
"mean_token_accuracy": 0.8908473681658506,
"num_tokens": 56673426.0,
"step": 1980
},
{
"entropy": 0.3502405107952654,
"epoch": 3.184,
"grad_norm": 2.296875,
"learning_rate": 1.780222222222222e-05,
"loss": 0.3273,
"mean_token_accuracy": 0.8911666806787253,
"num_tokens": 56957044.0,
"step": 1990
},
{
"entropy": 0.33547057397663593,
"epoch": 3.2,
"grad_norm": 1.96875,
"learning_rate": 1.7780000000000003e-05,
"loss": 0.3184,
"mean_token_accuracy": 0.8955803520977497,
"num_tokens": 57250097.0,
"step": 2000
},
{
"epoch": 3.2,
"eval_chemistry_entropy": 0.3912388117313385,
"eval_chemistry_loss": 0.9860958456993103,
"eval_chemistry_mean_token_accuracy": 0.7855022087097168,
"eval_chemistry_num_tokens": 57250097.0,
"eval_chemistry_runtime": 34.752,
"eval_chemistry_samples_per_second": 14.388,
"eval_chemistry_steps_per_second": 3.597,
"step": 2000
},
{
"epoch": 3.2,
"eval_math_entropy": 0.3877128950357437,
"eval_math_loss": 0.7659533023834229,
"eval_math_mean_token_accuracy": 0.8217745156288147,
"eval_math_num_tokens": 57250097.0,
"eval_math_runtime": 35.4631,
"eval_math_samples_per_second": 14.099,
"eval_math_steps_per_second": 3.525,
"step": 2000
}
],
"logging_steps": 10,
"max_steps": 10000,
"num_input_tokens_seen": 0,
"num_train_epochs": 16,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.343629994565034e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}