{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.2, "eval_steps": 100, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.5841219527646899, "epoch": 0.016, "grad_norm": 10.875, "learning_rate": 1.8e-07, "loss": 1.0536, "mean_token_accuracy": 0.7792524203658104, "num_tokens": 282849.0, "step": 10 }, { "entropy": 0.6104077549651266, "epoch": 0.032, "grad_norm": 9.625, "learning_rate": 3.8e-07, "loss": 1.0813, "mean_token_accuracy": 0.7717054646462203, "num_tokens": 561726.0, "step": 20 }, { "entropy": 0.604045126773417, "epoch": 0.048, "grad_norm": 8.9375, "learning_rate": 5.800000000000001e-07, "loss": 1.0425, "mean_token_accuracy": 0.7788930047303438, "num_tokens": 842092.0, "step": 30 }, { "entropy": 0.6275176709517837, "epoch": 0.064, "grad_norm": 8.5625, "learning_rate": 7.8e-07, "loss": 1.0681, "mean_token_accuracy": 0.7725805886089802, "num_tokens": 1133349.0, "step": 40 }, { "entropy": 0.6530287317931652, "epoch": 0.08, "grad_norm": 7.59375, "learning_rate": 9.800000000000001e-07, "loss": 1.0273, "mean_token_accuracy": 0.7785692941397429, "num_tokens": 1422340.0, "step": 50 }, { "entropy": 0.6810424191877246, "epoch": 0.096, "grad_norm": 5.40625, "learning_rate": 1.1800000000000001e-06, "loss": 0.9779, "mean_token_accuracy": 0.7848060742020607, "num_tokens": 1711821.0, "step": 60 }, { "entropy": 0.7722993083298206, "epoch": 0.112, "grad_norm": 4.03125, "learning_rate": 1.3800000000000001e-06, "loss": 0.9772, "mean_token_accuracy": 0.782012278959155, "num_tokens": 1991855.0, "step": 70 }, { "entropy": 0.8406166139990091, "epoch": 0.128, "grad_norm": 3.171875, "learning_rate": 1.5800000000000001e-06, "loss": 0.959, "mean_token_accuracy": 0.784759023040533, "num_tokens": 2288720.0, "step": 80 }, { "entropy": 0.8299949843436479, "epoch": 0.144, "grad_norm": 2.875, "learning_rate": 1.7800000000000001e-06, "loss": 0.9481, "mean_token_accuracy": 0.7854762740433217, "num_tokens": 2564352.0, "step": 90 }, { "entropy": 0.7978475514799357, "epoch": 0.16, "grad_norm": 2.78125, "learning_rate": 1.98e-06, "loss": 0.9304, "mean_token_accuracy": 0.7901426807045937, "num_tokens": 2841876.0, "step": 100 }, { "epoch": 0.16, "eval_chemistry_entropy": 0.817941864490509, "eval_chemistry_loss": 0.8207718729972839, "eval_chemistry_mean_token_accuracy": 0.8094593362808228, "eval_chemistry_num_tokens": 2841876.0, "eval_chemistry_runtime": 34.8392, "eval_chemistry_samples_per_second": 14.352, "eval_chemistry_steps_per_second": 3.588, "step": 100 }, { "epoch": 0.16, "eval_math_entropy": 0.8300764834880829, "eval_math_loss": 0.9698297381401062, "eval_math_mean_token_accuracy": 0.7842242226600648, "eval_math_num_tokens": 2841876.0, "eval_math_runtime": 35.8893, "eval_math_samples_per_second": 13.932, "eval_math_steps_per_second": 3.483, "step": 100 }, { "entropy": 0.8128786478191614, "epoch": 0.176, "grad_norm": 2.234375, "learning_rate": 2.1800000000000003e-06, "loss": 0.9222, "mean_token_accuracy": 0.7908758621662855, "num_tokens": 3129237.0, "step": 110 }, { "entropy": 0.830844565667212, "epoch": 0.192, "grad_norm": 2.671875, "learning_rate": 2.38e-06, "loss": 0.9398, "mean_token_accuracy": 0.7865626137703657, "num_tokens": 3414875.0, "step": 120 }, { "entropy": 0.8196758177131415, "epoch": 0.208, "grad_norm": 2.953125, "learning_rate": 2.5800000000000003e-06, "loss": 0.92, "mean_token_accuracy": 0.7892169930040837, "num_tokens": 3695623.0, "step": 130 }, { "entropy": 0.8214951554313302, "epoch": 0.224, "grad_norm": 2.703125, "learning_rate": 2.7800000000000005e-06, "loss": 0.9102, "mean_token_accuracy": 0.7903474017977714, "num_tokens": 3977666.0, "step": 140 }, { "entropy": 0.7900505751371384, "epoch": 0.24, "grad_norm": 2.609375, "learning_rate": 2.9800000000000003e-06, "loss": 0.8856, "mean_token_accuracy": 0.7941167835146189, "num_tokens": 4269287.0, "step": 150 }, { "entropy": 0.783742449991405, "epoch": 0.256, "grad_norm": 2.328125, "learning_rate": 3.1800000000000005e-06, "loss": 0.857, "mean_token_accuracy": 0.7969042614102364, "num_tokens": 4558632.0, "step": 160 }, { "entropy": 0.7985043607652187, "epoch": 0.272, "grad_norm": 2.28125, "learning_rate": 3.3800000000000007e-06, "loss": 0.8738, "mean_token_accuracy": 0.7948056727647781, "num_tokens": 4839270.0, "step": 170 }, { "entropy": 0.7821594245731831, "epoch": 0.288, "grad_norm": 2.046875, "learning_rate": 3.58e-06, "loss": 0.8578, "mean_token_accuracy": 0.7972537841647863, "num_tokens": 5125447.0, "step": 180 }, { "entropy": 0.8105486003682018, "epoch": 0.304, "grad_norm": 2.28125, "learning_rate": 3.7800000000000002e-06, "loss": 0.8516, "mean_token_accuracy": 0.7949083779007197, "num_tokens": 5415925.0, "step": 190 }, { "entropy": 0.7668969418853522, "epoch": 0.32, "grad_norm": 2.125, "learning_rate": 3.980000000000001e-06, "loss": 0.8079, "mean_token_accuracy": 0.80227146781981, "num_tokens": 5710178.0, "step": 200 }, { "epoch": 0.32, "eval_chemistry_entropy": 0.8007053558826447, "eval_chemistry_loss": 0.7652505040168762, "eval_chemistry_mean_token_accuracy": 0.8065420956611633, "eval_chemistry_num_tokens": 5710178.0, "eval_chemistry_runtime": 34.8946, "eval_chemistry_samples_per_second": 14.329, "eval_chemistry_steps_per_second": 3.582, "step": 200 }, { "epoch": 0.32, "eval_math_entropy": 0.8049644639492035, "eval_math_loss": 0.8578509092330933, "eval_math_mean_token_accuracy": 0.7935372619628906, "eval_math_num_tokens": 5710178.0, "eval_math_runtime": 35.9751, "eval_math_samples_per_second": 13.899, "eval_math_steps_per_second": 3.475, "step": 200 }, { "entropy": 0.8033319305628538, "epoch": 0.336, "grad_norm": 2.0625, "learning_rate": 4.18e-06, "loss": 0.8255, "mean_token_accuracy": 0.7956937287002802, "num_tokens": 5993843.0, "step": 210 }, { "entropy": 0.776984517276287, "epoch": 0.352, "grad_norm": 2.3125, "learning_rate": 4.38e-06, "loss": 0.8027, "mean_token_accuracy": 0.8002396024763584, "num_tokens": 6279137.0, "step": 220 }, { "entropy": 0.7891015050932765, "epoch": 0.368, "grad_norm": 1.921875, "learning_rate": 4.58e-06, "loss": 0.7877, "mean_token_accuracy": 0.7985758237540722, "num_tokens": 6562387.0, "step": 230 }, { "entropy": 0.7725065175443888, "epoch": 0.384, "grad_norm": 2.09375, "learning_rate": 4.78e-06, "loss": 0.7733, "mean_token_accuracy": 0.8037754170596599, "num_tokens": 6851520.0, "step": 240 }, { "entropy": 0.8092642717063427, "epoch": 0.4, "grad_norm": 2.09375, "learning_rate": 4.980000000000001e-06, "loss": 0.8171, "mean_token_accuracy": 0.7956945843994617, "num_tokens": 7138932.0, "step": 250 }, { "entropy": 0.7882049404084682, "epoch": 0.416, "grad_norm": 1.8359375, "learning_rate": 5.18e-06, "loss": 0.7764, "mean_token_accuracy": 0.8010819923132658, "num_tokens": 7420720.0, "step": 260 }, { "entropy": 0.7719787143170833, "epoch": 0.432, "grad_norm": 1.875, "learning_rate": 5.380000000000001e-06, "loss": 0.7655, "mean_token_accuracy": 0.8039833098649979, "num_tokens": 7695518.0, "step": 270 }, { "entropy": 0.7650891080498695, "epoch": 0.448, "grad_norm": 2.484375, "learning_rate": 5.580000000000001e-06, "loss": 0.7529, "mean_token_accuracy": 0.8057680625468493, "num_tokens": 7983543.0, "step": 280 }, { "entropy": 0.7654503894969821, "epoch": 0.464, "grad_norm": 2.96875, "learning_rate": 5.78e-06, "loss": 0.7582, "mean_token_accuracy": 0.8070501256734133, "num_tokens": 8271787.0, "step": 290 }, { "entropy": 0.7974133348092437, "epoch": 0.48, "grad_norm": 2.046875, "learning_rate": 5.98e-06, "loss": 0.7749, "mean_token_accuracy": 0.8022410120815039, "num_tokens": 8556170.0, "step": 300 }, { "epoch": 0.48, "eval_chemistry_entropy": 0.7947752397060395, "eval_chemistry_loss": 0.7221509218215942, "eval_chemistry_mean_token_accuracy": 0.8083039855957032, "eval_chemistry_num_tokens": 8556170.0, "eval_chemistry_runtime": 34.8605, "eval_chemistry_samples_per_second": 14.343, "eval_chemistry_steps_per_second": 3.586, "step": 300 }, { "epoch": 0.48, "eval_math_entropy": 0.7911813812255859, "eval_math_loss": 0.7818350195884705, "eval_math_mean_token_accuracy": 0.8022260117530823, "eval_math_num_tokens": 8556170.0, "eval_math_runtime": 35.8842, "eval_math_samples_per_second": 13.934, "eval_math_steps_per_second": 3.483, "step": 300 }, { "entropy": 0.7553723743185401, "epoch": 0.496, "grad_norm": 2.09375, "learning_rate": 6.18e-06, "loss": 0.7364, "mean_token_accuracy": 0.809390364587307, "num_tokens": 8840590.0, "step": 310 }, { "entropy": 0.763883944042027, "epoch": 0.512, "grad_norm": 1.984375, "learning_rate": 6.380000000000001e-06, "loss": 0.737, "mean_token_accuracy": 0.8089169431477785, "num_tokens": 9131903.0, "step": 320 }, { "entropy": 0.7371911529451609, "epoch": 0.528, "grad_norm": 1.9765625, "learning_rate": 6.5800000000000005e-06, "loss": 0.7238, "mean_token_accuracy": 0.8125522993505001, "num_tokens": 9429519.0, "step": 330 }, { "entropy": 0.753829506970942, "epoch": 0.544, "grad_norm": 2.671875, "learning_rate": 6.780000000000001e-06, "loss": 0.7411, "mean_token_accuracy": 0.8106916502118111, "num_tokens": 9718247.0, "step": 340 }, { "entropy": 0.7809748956933618, "epoch": 0.56, "grad_norm": 2.90625, "learning_rate": 6.98e-06, "loss": 0.756, "mean_token_accuracy": 0.803440049290657, "num_tokens": 9991100.0, "step": 350 }, { "entropy": 0.7534385286271572, "epoch": 0.576, "grad_norm": 2.125, "learning_rate": 7.180000000000001e-06, "loss": 0.7358, "mean_token_accuracy": 0.807822210714221, "num_tokens": 10276090.0, "step": 360 }, { "entropy": 0.7527043262496591, "epoch": 0.592, "grad_norm": 1.75, "learning_rate": 7.3800000000000005e-06, "loss": 0.7343, "mean_token_accuracy": 0.8090104408562183, "num_tokens": 10566762.0, "step": 370 }, { "entropy": 0.7644007310271264, "epoch": 0.608, "grad_norm": 1.875, "learning_rate": 7.58e-06, "loss": 0.7436, "mean_token_accuracy": 0.8086390372365713, "num_tokens": 10857002.0, "step": 380 }, { "entropy": 0.7420919336378574, "epoch": 0.624, "grad_norm": 2.515625, "learning_rate": 7.78e-06, "loss": 0.7239, "mean_token_accuracy": 0.8104523837566375, "num_tokens": 11147326.0, "step": 390 }, { "entropy": 0.7273864936083555, "epoch": 0.64, "grad_norm": 1.7109375, "learning_rate": 7.980000000000002e-06, "loss": 0.7111, "mean_token_accuracy": 0.8139414019882679, "num_tokens": 11437710.0, "step": 400 }, { "epoch": 0.64, "eval_chemistry_entropy": 0.7778651757240296, "eval_chemistry_loss": 0.7157668471336365, "eval_chemistry_mean_token_accuracy": 0.808542202949524, "eval_chemistry_num_tokens": 11437710.0, "eval_chemistry_runtime": 34.8261, "eval_chemistry_samples_per_second": 14.357, "eval_chemistry_steps_per_second": 3.589, "step": 400 }, { "epoch": 0.64, "eval_math_entropy": 0.763886357307434, "eval_math_loss": 0.7481760382652283, "eval_math_mean_token_accuracy": 0.8066890263557434, "eval_math_num_tokens": 11437710.0, "eval_math_runtime": 36.0154, "eval_math_samples_per_second": 13.883, "eval_math_steps_per_second": 3.471, "step": 400 }, { "entropy": 0.729702671058476, "epoch": 0.656, "grad_norm": 1.578125, "learning_rate": 8.18e-06, "loss": 0.7083, "mean_token_accuracy": 0.8148457534611225, "num_tokens": 11727268.0, "step": 410 }, { "entropy": 0.7371693229302764, "epoch": 0.672, "grad_norm": 1.8515625, "learning_rate": 8.380000000000001e-06, "loss": 0.7199, "mean_token_accuracy": 0.8099267814308405, "num_tokens": 12020478.0, "step": 420 }, { "entropy": 0.7293412426486612, "epoch": 0.688, "grad_norm": 1.6875, "learning_rate": 8.580000000000001e-06, "loss": 0.7068, "mean_token_accuracy": 0.8130085315555334, "num_tokens": 12300380.0, "step": 430 }, { "entropy": 0.718653773330152, "epoch": 0.704, "grad_norm": 2.046875, "learning_rate": 8.78e-06, "loss": 0.6982, "mean_token_accuracy": 0.8138067662715912, "num_tokens": 12591366.0, "step": 440 }, { "entropy": 0.7371329203248024, "epoch": 0.72, "grad_norm": 1.9609375, "learning_rate": 8.98e-06, "loss": 0.7104, "mean_token_accuracy": 0.8120142966508865, "num_tokens": 12869618.0, "step": 450 }, { "entropy": 0.7198993725702166, "epoch": 0.736, "grad_norm": 1.921875, "learning_rate": 9.180000000000002e-06, "loss": 0.7009, "mean_token_accuracy": 0.815392379835248, "num_tokens": 13156311.0, "step": 460 }, { "entropy": 0.7063086304813624, "epoch": 0.752, "grad_norm": 1.8046875, "learning_rate": 9.38e-06, "loss": 0.6793, "mean_token_accuracy": 0.8184278599917889, "num_tokens": 13438911.0, "step": 470 }, { "entropy": 0.6851496128365397, "epoch": 0.768, "grad_norm": 1.5078125, "learning_rate": 9.58e-06, "loss": 0.6731, "mean_token_accuracy": 0.8231410879641772, "num_tokens": 13735397.0, "step": 480 }, { "entropy": 0.7282077683135867, "epoch": 0.784, "grad_norm": 1.9609375, "learning_rate": 9.780000000000001e-06, "loss": 0.7094, "mean_token_accuracy": 0.8130000628530979, "num_tokens": 14024497.0, "step": 490 }, { "entropy": 0.7247441383078694, "epoch": 0.8, "grad_norm": 2.0, "learning_rate": 9.980000000000001e-06, "loss": 0.7048, "mean_token_accuracy": 0.812597556039691, "num_tokens": 14316427.0, "step": 500 }, { "epoch": 0.8, "eval_chemistry_entropy": 0.7653646836280823, "eval_chemistry_loss": 0.7123190760612488, "eval_chemistry_mean_token_accuracy": 0.8073507056236268, "eval_chemistry_num_tokens": 14316427.0, "eval_chemistry_runtime": 34.8547, "eval_chemistry_samples_per_second": 14.345, "eval_chemistry_steps_per_second": 3.586, "step": 500 }, { "epoch": 0.8, "eval_math_entropy": 0.7450865380764008, "eval_math_loss": 0.7230644822120667, "eval_math_mean_token_accuracy": 0.8105058965682983, "eval_math_num_tokens": 14316427.0, "eval_math_runtime": 35.5994, "eval_math_samples_per_second": 14.045, "eval_math_steps_per_second": 3.511, "step": 500 }, { "entropy": 0.714242628775537, "epoch": 0.816, "grad_norm": 2.046875, "learning_rate": 1.018e-05, "loss": 0.6884, "mean_token_accuracy": 0.8151328191161156, "num_tokens": 14607136.0, "step": 510 }, { "entropy": 0.7071703864261508, "epoch": 0.832, "grad_norm": 1.84375, "learning_rate": 1.038e-05, "loss": 0.6839, "mean_token_accuracy": 0.8182855024933815, "num_tokens": 14894084.0, "step": 520 }, { "entropy": 0.7319832380861044, "epoch": 0.848, "grad_norm": 1.8359375, "learning_rate": 1.0580000000000002e-05, "loss": 0.709, "mean_token_accuracy": 0.8122099358588457, "num_tokens": 15177039.0, "step": 530 }, { "entropy": 0.7191415606066585, "epoch": 0.864, "grad_norm": 1.546875, "learning_rate": 1.0780000000000002e-05, "loss": 0.6966, "mean_token_accuracy": 0.8139543637633324, "num_tokens": 15463393.0, "step": 540 }, { "entropy": 0.7217472817748785, "epoch": 0.88, "grad_norm": 1.8359375, "learning_rate": 1.0980000000000002e-05, "loss": 0.699, "mean_token_accuracy": 0.8122927758842706, "num_tokens": 15747943.0, "step": 550 }, { "entropy": 0.7142911188304424, "epoch": 0.896, "grad_norm": 1.6484375, "learning_rate": 1.1180000000000001e-05, "loss": 0.6964, "mean_token_accuracy": 0.8145412191748619, "num_tokens": 16027931.0, "step": 560 }, { "entropy": 0.7013007398694754, "epoch": 0.912, "grad_norm": 1.7421875, "learning_rate": 1.138e-05, "loss": 0.6761, "mean_token_accuracy": 0.8172509890049696, "num_tokens": 16317127.0, "step": 570 }, { "entropy": 0.6971628932282329, "epoch": 0.928, "grad_norm": 1.578125, "learning_rate": 1.1580000000000001e-05, "loss": 0.6725, "mean_token_accuracy": 0.8180789042264223, "num_tokens": 16598757.0, "step": 580 }, { "entropy": 0.7133341139182449, "epoch": 0.944, "grad_norm": 5.8125, "learning_rate": 1.178e-05, "loss": 0.6958, "mean_token_accuracy": 0.8153590984642506, "num_tokens": 16882236.0, "step": 590 }, { "entropy": 0.7189922722056508, "epoch": 0.96, "grad_norm": 1.4453125, "learning_rate": 1.198e-05, "loss": 0.6956, "mean_token_accuracy": 0.8149452641606331, "num_tokens": 17169495.0, "step": 600 }, { "epoch": 0.96, "eval_chemistry_entropy": 0.7501673064231873, "eval_chemistry_loss": 0.7121618390083313, "eval_chemistry_mean_token_accuracy": 0.8068772978782653, "eval_chemistry_num_tokens": 17169495.0, "eval_chemistry_runtime": 34.8316, "eval_chemistry_samples_per_second": 14.355, "eval_chemistry_steps_per_second": 3.589, "step": 600 }, { "epoch": 0.96, "eval_math_entropy": 0.7047771346569062, "eval_math_loss": 0.7034372687339783, "eval_math_mean_token_accuracy": 0.8131817808151245, "eval_math_num_tokens": 17169495.0, "eval_math_runtime": 35.8696, "eval_math_samples_per_second": 13.939, "eval_math_steps_per_second": 3.485, "step": 600 }, { "entropy": 0.6952510023489594, "epoch": 0.976, "grad_norm": 1.875, "learning_rate": 1.218e-05, "loss": 0.6788, "mean_token_accuracy": 0.817372427508235, "num_tokens": 17454147.0, "step": 610 }, { "entropy": 0.7092014687135816, "epoch": 0.992, "grad_norm": 1.6640625, "learning_rate": 1.2380000000000002e-05, "loss": 0.6838, "mean_token_accuracy": 0.8138624154031276, "num_tokens": 17742596.0, "step": 620 }, { "entropy": 0.694889472052455, "epoch": 1.008, "grad_norm": 1.5546875, "learning_rate": 1.2580000000000002e-05, "loss": 0.6742, "mean_token_accuracy": 0.8174668036401271, "num_tokens": 18033540.0, "step": 630 }, { "entropy": 0.6714398205280304, "epoch": 1.024, "grad_norm": 1.7578125, "learning_rate": 1.2780000000000001e-05, "loss": 0.6507, "mean_token_accuracy": 0.8213358622044324, "num_tokens": 18316346.0, "step": 640 }, { "entropy": 0.6660828510299325, "epoch": 1.04, "grad_norm": 1.796875, "learning_rate": 1.2980000000000001e-05, "loss": 0.6496, "mean_token_accuracy": 0.8221284162253142, "num_tokens": 18595230.0, "step": 650 }, { "entropy": 0.6795160492882133, "epoch": 1.056, "grad_norm": 1.765625, "learning_rate": 1.3180000000000001e-05, "loss": 0.6581, "mean_token_accuracy": 0.8199356343597174, "num_tokens": 18885547.0, "step": 660 }, { "entropy": 0.6750219637528062, "epoch": 1.072, "grad_norm": 1.6484375, "learning_rate": 1.3380000000000002e-05, "loss": 0.6555, "mean_token_accuracy": 0.8197014667093754, "num_tokens": 19169083.0, "step": 670 }, { "entropy": 0.6712652388960123, "epoch": 1.088, "grad_norm": 1.7109375, "learning_rate": 1.3580000000000002e-05, "loss": 0.6434, "mean_token_accuracy": 0.8221997711807489, "num_tokens": 19461465.0, "step": 680 }, { "entropy": 0.6703712901100516, "epoch": 1.104, "grad_norm": 1.46875, "learning_rate": 1.378e-05, "loss": 0.6514, "mean_token_accuracy": 0.8217100899666547, "num_tokens": 19755027.0, "step": 690 }, { "entropy": 0.6529567580670118, "epoch": 1.12, "grad_norm": 1.765625, "learning_rate": 1.398e-05, "loss": 0.6325, "mean_token_accuracy": 0.8260030064731836, "num_tokens": 20043293.0, "step": 700 }, { "epoch": 1.12, "eval_chemistry_entropy": 0.7117109818458557, "eval_chemistry_loss": 0.715001106262207, "eval_chemistry_mean_token_accuracy": 0.805173789024353, "eval_chemistry_num_tokens": 20043293.0, "eval_chemistry_runtime": 34.8451, "eval_chemistry_samples_per_second": 14.349, "eval_chemistry_steps_per_second": 3.587, "step": 700 }, { "epoch": 1.12, "eval_math_entropy": 0.6742688639163971, "eval_math_loss": 0.6903207302093506, "eval_math_mean_token_accuracy": 0.8157781276702881, "eval_math_num_tokens": 20043293.0, "eval_math_runtime": 35.8894, "eval_math_samples_per_second": 13.932, "eval_math_steps_per_second": 3.483, "step": 700 }, { "entropy": 0.6546579284593463, "epoch": 1.1360000000000001, "grad_norm": 1.671875, "learning_rate": 1.418e-05, "loss": 0.6415, "mean_token_accuracy": 0.8239764388650656, "num_tokens": 20323010.0, "step": 710 }, { "entropy": 0.6666190484538674, "epoch": 1.152, "grad_norm": 1.6953125, "learning_rate": 1.4380000000000001e-05, "loss": 0.6493, "mean_token_accuracy": 0.8230571333318949, "num_tokens": 20613266.0, "step": 720 }, { "entropy": 0.6584050474688411, "epoch": 1.168, "grad_norm": 1.5, "learning_rate": 1.4580000000000001e-05, "loss": 0.6408, "mean_token_accuracy": 0.8246052328497171, "num_tokens": 20910404.0, "step": 730 }, { "entropy": 0.664110666513443, "epoch": 1.184, "grad_norm": 1.734375, "learning_rate": 1.478e-05, "loss": 0.6435, "mean_token_accuracy": 0.8229058619588614, "num_tokens": 21200197.0, "step": 740 }, { "entropy": 0.6511959439143539, "epoch": 1.2, "grad_norm": 1.46875, "learning_rate": 1.498e-05, "loss": 0.633, "mean_token_accuracy": 0.8250994741916656, "num_tokens": 21496441.0, "step": 750 }, { "entropy": 0.668997959420085, "epoch": 1.216, "grad_norm": 1.7421875, "learning_rate": 1.5180000000000002e-05, "loss": 0.6456, "mean_token_accuracy": 0.8220734592527151, "num_tokens": 21778030.0, "step": 760 }, { "entropy": 0.6459713563323021, "epoch": 1.232, "grad_norm": 1.75, "learning_rate": 1.5380000000000002e-05, "loss": 0.6253, "mean_token_accuracy": 0.8270299468189478, "num_tokens": 22060851.0, "step": 770 }, { "entropy": 0.6455961847677827, "epoch": 1.248, "grad_norm": 1.5859375, "learning_rate": 1.5580000000000003e-05, "loss": 0.6255, "mean_token_accuracy": 0.8260451622307301, "num_tokens": 22348738.0, "step": 780 }, { "entropy": 0.6676814066246152, "epoch": 1.264, "grad_norm": 1.484375, "learning_rate": 1.578e-05, "loss": 0.6457, "mean_token_accuracy": 0.8219054654240608, "num_tokens": 22635039.0, "step": 790 }, { "entropy": 0.6654805542901159, "epoch": 1.28, "grad_norm": 1.53125, "learning_rate": 1.5980000000000003e-05, "loss": 0.6444, "mean_token_accuracy": 0.8224660288542509, "num_tokens": 22915190.0, "step": 800 }, { "epoch": 1.28, "eval_chemistry_entropy": 0.7472894523143768, "eval_chemistry_loss": 0.7169390916824341, "eval_chemistry_mean_token_accuracy": 0.805417845249176, "eval_chemistry_num_tokens": 22915190.0, "eval_chemistry_runtime": 34.8749, "eval_chemistry_samples_per_second": 14.337, "eval_chemistry_steps_per_second": 3.584, "step": 800 }, { "epoch": 1.28, "eval_math_entropy": 0.6949311044216157, "eval_math_loss": 0.678849458694458, "eval_math_mean_token_accuracy": 0.8176955370903015, "eval_math_num_tokens": 22915190.0, "eval_math_runtime": 35.8885, "eval_math_samples_per_second": 13.932, "eval_math_steps_per_second": 3.483, "step": 800 }, { "entropy": 0.6526781121268869, "epoch": 1.296, "grad_norm": 1.46875, "learning_rate": 1.618e-05, "loss": 0.627, "mean_token_accuracy": 0.8251245643943548, "num_tokens": 23194947.0, "step": 810 }, { "entropy": 0.6623267890885473, "epoch": 1.312, "grad_norm": 1.65625, "learning_rate": 1.638e-05, "loss": 0.6522, "mean_token_accuracy": 0.8221593346446753, "num_tokens": 23483496.0, "step": 820 }, { "entropy": 0.675568882189691, "epoch": 1.328, "grad_norm": 1.6640625, "learning_rate": 1.658e-05, "loss": 0.6523, "mean_token_accuracy": 0.8208769094198942, "num_tokens": 23762589.0, "step": 830 }, { "entropy": 0.6467019423842431, "epoch": 1.3439999999999999, "grad_norm": 1.6953125, "learning_rate": 1.6780000000000002e-05, "loss": 0.6297, "mean_token_accuracy": 0.825079932808876, "num_tokens": 24041497.0, "step": 840 }, { "entropy": 0.656816397048533, "epoch": 1.3599999999999999, "grad_norm": 1.296875, "learning_rate": 1.698e-05, "loss": 0.6365, "mean_token_accuracy": 0.8254420697689057, "num_tokens": 24330591.0, "step": 850 }, { "entropy": 0.6973325841128826, "epoch": 1.376, "grad_norm": 1.484375, "learning_rate": 1.718e-05, "loss": 0.6754, "mean_token_accuracy": 0.8146579563617706, "num_tokens": 24611420.0, "step": 860 }, { "entropy": 0.6515109525993467, "epoch": 1.392, "grad_norm": 1.5234375, "learning_rate": 1.7380000000000003e-05, "loss": 0.6297, "mean_token_accuracy": 0.8239485524594784, "num_tokens": 24894694.0, "step": 870 }, { "entropy": 0.6447439486160874, "epoch": 1.408, "grad_norm": 1.546875, "learning_rate": 1.758e-05, "loss": 0.6231, "mean_token_accuracy": 0.8267700038850307, "num_tokens": 25184946.0, "step": 880 }, { "entropy": 0.6419832136482, "epoch": 1.424, "grad_norm": 1.3515625, "learning_rate": 1.7780000000000003e-05, "loss": 0.624, "mean_token_accuracy": 0.8271039195358754, "num_tokens": 25474746.0, "step": 890 }, { "entropy": 0.6597341772168874, "epoch": 1.44, "grad_norm": 1.4296875, "learning_rate": 1.798e-05, "loss": 0.6378, "mean_token_accuracy": 0.8231316354125738, "num_tokens": 25758550.0, "step": 900 }, { "epoch": 1.44, "eval_chemistry_entropy": 0.7189412865638732, "eval_chemistry_loss": 0.721181333065033, "eval_chemistry_mean_token_accuracy": 0.8038096494674682, "eval_chemistry_num_tokens": 25758550.0, "eval_chemistry_runtime": 34.8369, "eval_chemistry_samples_per_second": 14.353, "eval_chemistry_steps_per_second": 3.588, "step": 900 }, { "epoch": 1.44, "eval_math_entropy": 0.6560292990207672, "eval_math_loss": 0.6677282452583313, "eval_math_mean_token_accuracy": 0.8188103575706482, "eval_math_num_tokens": 25758550.0, "eval_math_runtime": 35.8656, "eval_math_samples_per_second": 13.941, "eval_math_steps_per_second": 3.485, "step": 900 }, { "entropy": 0.6441044477745891, "epoch": 1.456, "grad_norm": 1.5078125, "learning_rate": 1.8180000000000002e-05, "loss": 0.6259, "mean_token_accuracy": 0.8252961106598378, "num_tokens": 26039862.0, "step": 910 }, { "entropy": 0.6502820059657097, "epoch": 1.472, "grad_norm": 1.3125, "learning_rate": 1.8380000000000004e-05, "loss": 0.6342, "mean_token_accuracy": 0.8237598706036806, "num_tokens": 26318666.0, "step": 920 }, { "entropy": 0.6764587434008718, "epoch": 1.488, "grad_norm": 1.40625, "learning_rate": 1.858e-05, "loss": 0.6547, "mean_token_accuracy": 0.8194571785628796, "num_tokens": 26594867.0, "step": 930 }, { "entropy": 0.6333928175270558, "epoch": 1.504, "grad_norm": 1.6875, "learning_rate": 1.878e-05, "loss": 0.6176, "mean_token_accuracy": 0.8289908330887557, "num_tokens": 26887233.0, "step": 940 }, { "entropy": 0.6682084022089839, "epoch": 1.52, "grad_norm": 1.2109375, "learning_rate": 1.898e-05, "loss": 0.6442, "mean_token_accuracy": 0.8210660863667727, "num_tokens": 27177448.0, "step": 950 }, { "entropy": 0.6442653369158506, "epoch": 1.536, "grad_norm": 1.2109375, "learning_rate": 1.918e-05, "loss": 0.6319, "mean_token_accuracy": 0.8249651778489351, "num_tokens": 27467857.0, "step": 960 }, { "entropy": 0.6457945328205824, "epoch": 1.552, "grad_norm": 1.3828125, "learning_rate": 1.938e-05, "loss": 0.6262, "mean_token_accuracy": 0.8254117891192436, "num_tokens": 27755138.0, "step": 970 }, { "entropy": 0.6328146204352378, "epoch": 1.568, "grad_norm": 1.234375, "learning_rate": 1.9580000000000002e-05, "loss": 0.6132, "mean_token_accuracy": 0.8306384857743978, "num_tokens": 28049363.0, "step": 980 }, { "entropy": 0.6414034033194185, "epoch": 1.584, "grad_norm": 1.4375, "learning_rate": 1.978e-05, "loss": 0.6233, "mean_token_accuracy": 0.8266693830490113, "num_tokens": 28337890.0, "step": 990 }, { "entropy": 0.6323226554319262, "epoch": 1.6, "grad_norm": 1.4296875, "learning_rate": 1.9980000000000002e-05, "loss": 0.6189, "mean_token_accuracy": 0.8256060272455216, "num_tokens": 28626068.0, "step": 1000 }, { "epoch": 1.6, "eval_chemistry_entropy": 0.7118635489940643, "eval_chemistry_loss": 0.7256439924240112, "eval_chemistry_mean_token_accuracy": 0.8026832752227783, "eval_chemistry_num_tokens": 28626068.0, "eval_chemistry_runtime": 34.8345, "eval_chemistry_samples_per_second": 14.354, "eval_chemistry_steps_per_second": 3.588, "step": 1000 }, { "epoch": 1.6, "eval_math_entropy": 0.6450024034976959, "eval_math_loss": 0.6582702994346619, "eval_math_mean_token_accuracy": 0.820967010974884, "eval_math_num_tokens": 28626068.0, "eval_math_runtime": 35.6245, "eval_math_samples_per_second": 14.035, "eval_math_steps_per_second": 3.509, "step": 1000 }, { "entropy": 0.6489538656547665, "epoch": 1.616, "grad_norm": 1.3125, "learning_rate": 1.9980000000000002e-05, "loss": 0.636, "mean_token_accuracy": 0.8241962313652038, "num_tokens": 28913208.0, "step": 1010 }, { "entropy": 0.656857686303556, "epoch": 1.6320000000000001, "grad_norm": 1.3359375, "learning_rate": 1.995777777777778e-05, "loss": 0.6357, "mean_token_accuracy": 0.8214856889098883, "num_tokens": 29188917.0, "step": 1020 }, { "entropy": 0.6321157278493047, "epoch": 1.6480000000000001, "grad_norm": 1.390625, "learning_rate": 1.9935555555555557e-05, "loss": 0.6152, "mean_token_accuracy": 0.8298161163926124, "num_tokens": 29477292.0, "step": 1030 }, { "entropy": 0.6432073757052421, "epoch": 1.6640000000000001, "grad_norm": 1.3359375, "learning_rate": 1.9913333333333335e-05, "loss": 0.62, "mean_token_accuracy": 0.8257944118231535, "num_tokens": 29772480.0, "step": 1040 }, { "entropy": 0.647973028384149, "epoch": 1.6800000000000002, "grad_norm": 1.2109375, "learning_rate": 1.9891111111111112e-05, "loss": 0.6323, "mean_token_accuracy": 0.8256966724991799, "num_tokens": 30061007.0, "step": 1050 }, { "entropy": 0.6499059528112412, "epoch": 1.696, "grad_norm": 1.296875, "learning_rate": 1.986888888888889e-05, "loss": 0.6316, "mean_token_accuracy": 0.8242154024541378, "num_tokens": 30344415.0, "step": 1060 }, { "entropy": 0.6270535726100206, "epoch": 1.712, "grad_norm": 1.2109375, "learning_rate": 1.9846666666666668e-05, "loss": 0.6152, "mean_token_accuracy": 0.8287085957825184, "num_tokens": 30640891.0, "step": 1070 }, { "entropy": 0.6247420905157923, "epoch": 1.728, "grad_norm": 1.3515625, "learning_rate": 1.9824444444444445e-05, "loss": 0.6059, "mean_token_accuracy": 0.8300342559814453, "num_tokens": 30921295.0, "step": 1080 }, { "entropy": 0.6317020528018474, "epoch": 1.744, "grad_norm": 1.3984375, "learning_rate": 1.9802222222222226e-05, "loss": 0.6158, "mean_token_accuracy": 0.828870889171958, "num_tokens": 31206375.0, "step": 1090 }, { "entropy": 0.6282909054309129, "epoch": 1.76, "grad_norm": 1.1953125, "learning_rate": 1.978e-05, "loss": 0.6101, "mean_token_accuracy": 0.8280309360474348, "num_tokens": 31491856.0, "step": 1100 }, { "epoch": 1.76, "eval_chemistry_entropy": 0.7381789875030518, "eval_chemistry_loss": 0.7256398797035217, "eval_chemistry_mean_token_accuracy": 0.8018755903244018, "eval_chemistry_num_tokens": 31491856.0, "eval_chemistry_runtime": 34.8532, "eval_chemistry_samples_per_second": 14.346, "eval_chemistry_steps_per_second": 3.586, "step": 1100 }, { "epoch": 1.76, "eval_math_entropy": 0.6412687346935272, "eval_math_loss": 0.6473493576049805, "eval_math_mean_token_accuracy": 0.8233258814811707, "eval_math_num_tokens": 31491856.0, "eval_math_runtime": 35.8787, "eval_math_samples_per_second": 13.936, "eval_math_steps_per_second": 3.484, "step": 1100 }, { "entropy": 0.6336805198341608, "epoch": 1.776, "grad_norm": 1.3203125, "learning_rate": 1.975777777777778e-05, "loss": 0.6135, "mean_token_accuracy": 0.8275633446872235, "num_tokens": 31782829.0, "step": 1110 }, { "entropy": 0.6414820792153477, "epoch": 1.792, "grad_norm": 1.2734375, "learning_rate": 1.9735555555555556e-05, "loss": 0.6246, "mean_token_accuracy": 0.8254438240081072, "num_tokens": 32070187.0, "step": 1120 }, { "entropy": 0.6382445661351085, "epoch": 1.808, "grad_norm": 1.2890625, "learning_rate": 1.9713333333333337e-05, "loss": 0.6217, "mean_token_accuracy": 0.8260911278426647, "num_tokens": 32355072.0, "step": 1130 }, { "entropy": 0.6067643767222762, "epoch": 1.8239999999999998, "grad_norm": 1.1484375, "learning_rate": 1.969111111111111e-05, "loss": 0.5867, "mean_token_accuracy": 0.8337960425764323, "num_tokens": 32634013.0, "step": 1140 }, { "entropy": 0.6227916920557618, "epoch": 1.8399999999999999, "grad_norm": 1.34375, "learning_rate": 1.9668888888888892e-05, "loss": 0.6108, "mean_token_accuracy": 0.8300515715032816, "num_tokens": 32917222.0, "step": 1150 }, { "entropy": 0.6352459752932191, "epoch": 1.8559999999999999, "grad_norm": 1.265625, "learning_rate": 1.9646666666666666e-05, "loss": 0.613, "mean_token_accuracy": 0.8282338980585336, "num_tokens": 33207147.0, "step": 1160 }, { "entropy": 0.6206054732203483, "epoch": 1.8719999999999999, "grad_norm": 1.28125, "learning_rate": 1.9624444444444447e-05, "loss": 0.6023, "mean_token_accuracy": 0.8303315650671721, "num_tokens": 33496572.0, "step": 1170 }, { "entropy": 0.6307253973558545, "epoch": 1.888, "grad_norm": 1.3046875, "learning_rate": 1.9602222222222225e-05, "loss": 0.6134, "mean_token_accuracy": 0.8282632239162921, "num_tokens": 33775788.0, "step": 1180 }, { "entropy": 0.6315167531371116, "epoch": 1.904, "grad_norm": 1.2421875, "learning_rate": 1.9580000000000002e-05, "loss": 0.615, "mean_token_accuracy": 0.8275392096489668, "num_tokens": 34052322.0, "step": 1190 }, { "entropy": 0.6110374081879855, "epoch": 1.92, "grad_norm": 1.1796875, "learning_rate": 1.955777777777778e-05, "loss": 0.5959, "mean_token_accuracy": 0.8313204348087311, "num_tokens": 34331468.0, "step": 1200 }, { "epoch": 1.92, "eval_chemistry_entropy": 0.7433430399894715, "eval_chemistry_loss": 0.728480339050293, "eval_chemistry_mean_token_accuracy": 0.8014561586380005, "eval_chemistry_num_tokens": 34331468.0, "eval_chemistry_runtime": 34.8484, "eval_chemistry_samples_per_second": 14.348, "eval_chemistry_steps_per_second": 3.587, "step": 1200 }, { "epoch": 1.92, "eval_math_entropy": 0.6532090711593628, "eval_math_loss": 0.6392548680305481, "eval_math_mean_token_accuracy": 0.8242080550193787, "eval_math_num_tokens": 34331468.0, "eval_math_runtime": 35.8593, "eval_math_samples_per_second": 13.943, "eval_math_steps_per_second": 3.486, "step": 1200 }, { "entropy": 0.6277910789474845, "epoch": 1.936, "grad_norm": 1.2890625, "learning_rate": 1.9535555555555557e-05, "loss": 0.6122, "mean_token_accuracy": 0.8281190965324641, "num_tokens": 34612743.0, "step": 1210 }, { "entropy": 0.6204241087660194, "epoch": 1.952, "grad_norm": 1.1796875, "learning_rate": 1.9513333333333335e-05, "loss": 0.5964, "mean_token_accuracy": 0.8305780492722988, "num_tokens": 34891115.0, "step": 1220 }, { "entropy": 0.6416850406676531, "epoch": 1.968, "grad_norm": 2.8125, "learning_rate": 1.9491111111111113e-05, "loss": 0.6335, "mean_token_accuracy": 0.8254266548901796, "num_tokens": 35179505.0, "step": 1230 }, { "entropy": 0.6098293786868453, "epoch": 1.984, "grad_norm": 1.140625, "learning_rate": 1.946888888888889e-05, "loss": 0.5945, "mean_token_accuracy": 0.8320291046053171, "num_tokens": 35472596.0, "step": 1240 }, { "entropy": 0.6196965377777814, "epoch": 2.0, "grad_norm": 1.171875, "learning_rate": 1.9446666666666668e-05, "loss": 0.604, "mean_token_accuracy": 0.8302332308143378, "num_tokens": 35772848.0, "step": 1250 }, { "entropy": 0.5497036971151829, "epoch": 2.016, "grad_norm": 1.46875, "learning_rate": 1.9424444444444446e-05, "loss": 0.5255, "mean_token_accuracy": 0.8438023224472999, "num_tokens": 36060330.0, "step": 1260 }, { "entropy": 0.5297272937372327, "epoch": 2.032, "grad_norm": 1.46875, "learning_rate": 1.9402222222222223e-05, "loss": 0.5147, "mean_token_accuracy": 0.8473685499280691, "num_tokens": 36343894.0, "step": 1270 }, { "entropy": 0.5197363485582173, "epoch": 2.048, "grad_norm": 1.4296875, "learning_rate": 1.938e-05, "loss": 0.5054, "mean_token_accuracy": 0.8493004187941551, "num_tokens": 36636410.0, "step": 1280 }, { "entropy": 0.5172868834808468, "epoch": 2.064, "grad_norm": 1.328125, "learning_rate": 1.935777777777778e-05, "loss": 0.5014, "mean_token_accuracy": 0.849420978501439, "num_tokens": 36925382.0, "step": 1290 }, { "entropy": 0.5238144496455789, "epoch": 2.08, "grad_norm": 1.5, "learning_rate": 1.9335555555555556e-05, "loss": 0.5006, "mean_token_accuracy": 0.8484534539282322, "num_tokens": 37208672.0, "step": 1300 }, { "epoch": 2.08, "eval_chemistry_entropy": 0.5902576115131378, "eval_chemistry_loss": 0.7615314722061157, "eval_chemistry_mean_token_accuracy": 0.7987616105079651, "eval_chemistry_num_tokens": 37208672.0, "eval_chemistry_runtime": 34.8214, "eval_chemistry_samples_per_second": 14.359, "eval_chemistry_steps_per_second": 3.59, "step": 1300 }, { "epoch": 2.08, "eval_math_entropy": 0.5490774600505829, "eval_math_loss": 0.6556233167648315, "eval_math_mean_token_accuracy": 0.8236036248207093, "eval_math_num_tokens": 37208672.0, "eval_math_runtime": 35.898, "eval_math_samples_per_second": 13.928, "eval_math_steps_per_second": 3.482, "step": 1300 }, { "entropy": 0.5296811152249574, "epoch": 2.096, "grad_norm": 1.7265625, "learning_rate": 1.9313333333333334e-05, "loss": 0.5143, "mean_token_accuracy": 0.8471574913710356, "num_tokens": 37491954.0, "step": 1310 }, { "entropy": 0.5180984031409025, "epoch": 2.112, "grad_norm": 1.4375, "learning_rate": 1.9291111111111115e-05, "loss": 0.4988, "mean_token_accuracy": 0.8482684683054685, "num_tokens": 37787033.0, "step": 1320 }, { "entropy": 0.5158460404723882, "epoch": 2.128, "grad_norm": 1.4453125, "learning_rate": 1.926888888888889e-05, "loss": 0.4989, "mean_token_accuracy": 0.8504650525748729, "num_tokens": 38068955.0, "step": 1330 }, { "entropy": 0.5193909807130694, "epoch": 2.144, "grad_norm": 1.4140625, "learning_rate": 1.924666666666667e-05, "loss": 0.5058, "mean_token_accuracy": 0.8479062043130398, "num_tokens": 38359914.0, "step": 1340 }, { "entropy": 0.518505304865539, "epoch": 2.16, "grad_norm": 1.3203125, "learning_rate": 1.9224444444444444e-05, "loss": 0.5031, "mean_token_accuracy": 0.8498795099556447, "num_tokens": 38649684.0, "step": 1350 }, { "entropy": 0.49965119622647763, "epoch": 2.176, "grad_norm": 1.359375, "learning_rate": 1.9202222222222225e-05, "loss": 0.4821, "mean_token_accuracy": 0.8528774298727513, "num_tokens": 38938155.0, "step": 1360 }, { "entropy": 0.5324017994105816, "epoch": 2.192, "grad_norm": 1.3359375, "learning_rate": 1.918e-05, "loss": 0.5166, "mean_token_accuracy": 0.8449586551636458, "num_tokens": 39228800.0, "step": 1370 }, { "entropy": 0.523920483700931, "epoch": 2.208, "grad_norm": 1.5390625, "learning_rate": 1.915777777777778e-05, "loss": 0.5153, "mean_token_accuracy": 0.8468315444886685, "num_tokens": 39513118.0, "step": 1380 }, { "entropy": 0.5206425994634628, "epoch": 2.224, "grad_norm": 1.5546875, "learning_rate": 1.9135555555555555e-05, "loss": 0.5006, "mean_token_accuracy": 0.8491694446653127, "num_tokens": 39804212.0, "step": 1390 }, { "entropy": 0.5240093268454075, "epoch": 2.24, "grad_norm": 1.40625, "learning_rate": 1.9113333333333336e-05, "loss": 0.5127, "mean_token_accuracy": 0.8465723715722561, "num_tokens": 40092012.0, "step": 1400 }, { "epoch": 2.24, "eval_chemistry_entropy": 0.5741701843738556, "eval_chemistry_loss": 0.7734760642051697, "eval_chemistry_mean_token_accuracy": 0.7974499335289001, "eval_chemistry_num_tokens": 40092012.0, "eval_chemistry_runtime": 34.8009, "eval_chemistry_samples_per_second": 14.367, "eval_chemistry_steps_per_second": 3.592, "step": 1400 }, { "epoch": 2.24, "eval_math_entropy": 0.5538151891231536, "eval_math_loss": 0.6544409394264221, "eval_math_mean_token_accuracy": 0.8238605060577393, "eval_math_num_tokens": 40092012.0, "eval_math_runtime": 35.8968, "eval_math_samples_per_second": 13.929, "eval_math_steps_per_second": 3.482, "step": 1400 }, { "entropy": 0.5231095163151622, "epoch": 2.2560000000000002, "grad_norm": 1.46875, "learning_rate": 1.9091111111111113e-05, "loss": 0.5086, "mean_token_accuracy": 0.8475595053285361, "num_tokens": 40380069.0, "step": 1410 }, { "entropy": 0.5324242118746042, "epoch": 2.2720000000000002, "grad_norm": 1.6171875, "learning_rate": 1.906888888888889e-05, "loss": 0.5147, "mean_token_accuracy": 0.8457300752401352, "num_tokens": 40655903.0, "step": 1420 }, { "entropy": 0.5270635643973947, "epoch": 2.288, "grad_norm": 1.484375, "learning_rate": 1.904666666666667e-05, "loss": 0.51, "mean_token_accuracy": 0.8453500598669053, "num_tokens": 40944791.0, "step": 1430 }, { "entropy": 0.5129735874012112, "epoch": 2.304, "grad_norm": 1.5390625, "learning_rate": 1.9024444444444446e-05, "loss": 0.4984, "mean_token_accuracy": 0.8505097340792418, "num_tokens": 41227146.0, "step": 1440 }, { "entropy": 0.5308720570988953, "epoch": 2.32, "grad_norm": 1.5, "learning_rate": 1.9002222222222224e-05, "loss": 0.5197, "mean_token_accuracy": 0.8463008664548397, "num_tokens": 41509322.0, "step": 1450 }, { "entropy": 0.5264440540224313, "epoch": 2.336, "grad_norm": 1.4609375, "learning_rate": 1.898e-05, "loss": 0.5068, "mean_token_accuracy": 0.8473060473799705, "num_tokens": 41792799.0, "step": 1460 }, { "entropy": 0.5393752640113234, "epoch": 2.352, "grad_norm": 1.4609375, "learning_rate": 1.895777777777778e-05, "loss": 0.5226, "mean_token_accuracy": 0.8433732774108649, "num_tokens": 42083080.0, "step": 1470 }, { "entropy": 0.5265612868592143, "epoch": 2.368, "grad_norm": 1.4921875, "learning_rate": 1.8935555555555556e-05, "loss": 0.5118, "mean_token_accuracy": 0.8467371355742216, "num_tokens": 42368442.0, "step": 1480 }, { "entropy": 0.5453734394162894, "epoch": 2.384, "grad_norm": 1.7578125, "learning_rate": 1.8913333333333334e-05, "loss": 0.5308, "mean_token_accuracy": 0.8432964202016592, "num_tokens": 42663926.0, "step": 1490 }, { "entropy": 0.5337216904386878, "epoch": 2.4, "grad_norm": 1.8203125, "learning_rate": 1.8891111111111115e-05, "loss": 0.5215, "mean_token_accuracy": 0.8452994517982007, "num_tokens": 42946351.0, "step": 1500 }, { "epoch": 2.4, "eval_chemistry_entropy": 0.5733621008396149, "eval_chemistry_loss": 0.7763350009918213, "eval_chemistry_mean_token_accuracy": 0.7971291627883911, "eval_chemistry_num_tokens": 42946351.0, "eval_chemistry_runtime": 34.7936, "eval_chemistry_samples_per_second": 14.37, "eval_chemistry_steps_per_second": 3.593, "step": 1500 }, { "epoch": 2.4, "eval_math_entropy": 0.5494145798683167, "eval_math_loss": 0.6489285826683044, "eval_math_mean_token_accuracy": 0.8246520318984986, "eval_math_num_tokens": 42946351.0, "eval_math_runtime": 35.4869, "eval_math_samples_per_second": 14.09, "eval_math_steps_per_second": 3.522, "step": 1500 }, { "entropy": 0.5415687510743737, "epoch": 2.416, "grad_norm": 1.484375, "learning_rate": 1.886888888888889e-05, "loss": 0.5263, "mean_token_accuracy": 0.8432823572307825, "num_tokens": 43227782.0, "step": 1510 }, { "entropy": 0.5234041666612029, "epoch": 2.432, "grad_norm": 1.421875, "learning_rate": 1.884666666666667e-05, "loss": 0.5124, "mean_token_accuracy": 0.8473271746188402, "num_tokens": 43511939.0, "step": 1520 }, { "entropy": 0.5191137973219156, "epoch": 2.448, "grad_norm": 1.5703125, "learning_rate": 1.8824444444444445e-05, "loss": 0.5039, "mean_token_accuracy": 0.8494564235210419, "num_tokens": 43811005.0, "step": 1530 }, { "entropy": 0.5308497181162238, "epoch": 2.464, "grad_norm": 1.59375, "learning_rate": 1.8802222222222226e-05, "loss": 0.5174, "mean_token_accuracy": 0.8456704583019018, "num_tokens": 44106223.0, "step": 1540 }, { "entropy": 0.5506666025146842, "epoch": 2.48, "grad_norm": 1.53125, "learning_rate": 1.878e-05, "loss": 0.5316, "mean_token_accuracy": 0.8412250328809023, "num_tokens": 44379095.0, "step": 1550 }, { "entropy": 0.5348324475809931, "epoch": 2.496, "grad_norm": 1.4296875, "learning_rate": 1.875777777777778e-05, "loss": 0.52, "mean_token_accuracy": 0.8440562028437852, "num_tokens": 44670316.0, "step": 1560 }, { "entropy": 0.5286164516583085, "epoch": 2.512, "grad_norm": 1.546875, "learning_rate": 1.873555555555556e-05, "loss": 0.516, "mean_token_accuracy": 0.8460781283676624, "num_tokens": 44952877.0, "step": 1570 }, { "entropy": 0.5242383845150471, "epoch": 2.528, "grad_norm": 1.4453125, "learning_rate": 1.8713333333333336e-05, "loss": 0.5084, "mean_token_accuracy": 0.8474905800074339, "num_tokens": 45237653.0, "step": 1580 }, { "entropy": 0.5217398202046752, "epoch": 2.544, "grad_norm": 1.625, "learning_rate": 1.8691111111111114e-05, "loss": 0.51, "mean_token_accuracy": 0.8485121335834265, "num_tokens": 45524866.0, "step": 1590 }, { "entropy": 0.5217585685662925, "epoch": 2.56, "grad_norm": 1.40625, "learning_rate": 1.866888888888889e-05, "loss": 0.504, "mean_token_accuracy": 0.8478538550436496, "num_tokens": 45803783.0, "step": 1600 }, { "epoch": 2.56, "eval_chemistry_entropy": 0.5687358362674713, "eval_chemistry_loss": 0.7774361371994019, "eval_chemistry_mean_token_accuracy": 0.7964602584838867, "eval_chemistry_num_tokens": 45803783.0, "eval_chemistry_runtime": 34.792, "eval_chemistry_samples_per_second": 14.371, "eval_chemistry_steps_per_second": 3.593, "step": 1600 }, { "epoch": 2.56, "eval_math_entropy": 0.5417093946933746, "eval_math_loss": 0.6447312831878662, "eval_math_mean_token_accuracy": 0.8262028379440307, "eval_math_num_tokens": 45803783.0, "eval_math_runtime": 35.8206, "eval_math_samples_per_second": 13.958, "eval_math_steps_per_second": 3.49, "step": 1600 }, { "entropy": 0.5245849631726742, "epoch": 2.576, "grad_norm": 1.4375, "learning_rate": 1.864666666666667e-05, "loss": 0.5095, "mean_token_accuracy": 0.8479252554476261, "num_tokens": 46090713.0, "step": 1610 }, { "entropy": 0.5172529483214021, "epoch": 2.592, "grad_norm": 1.3359375, "learning_rate": 1.8624444444444446e-05, "loss": 0.5036, "mean_token_accuracy": 0.8490103390067816, "num_tokens": 46384084.0, "step": 1620 }, { "entropy": 0.5333136133849621, "epoch": 2.608, "grad_norm": 1.625, "learning_rate": 1.8602222222222224e-05, "loss": 0.5216, "mean_token_accuracy": 0.8443999428302049, "num_tokens": 46667552.0, "step": 1630 }, { "entropy": 0.49988405164331196, "epoch": 2.624, "grad_norm": 1.453125, "learning_rate": 1.858e-05, "loss": 0.4842, "mean_token_accuracy": 0.8543243549764157, "num_tokens": 46961124.0, "step": 1640 }, { "entropy": 0.5301500145345927, "epoch": 2.64, "grad_norm": 1.5546875, "learning_rate": 1.855777777777778e-05, "loss": 0.518, "mean_token_accuracy": 0.8460444647818803, "num_tokens": 47245326.0, "step": 1650 }, { "entropy": 0.5089625578373671, "epoch": 2.656, "grad_norm": 1.578125, "learning_rate": 1.8535555555555557e-05, "loss": 0.4942, "mean_token_accuracy": 0.8499364998191595, "num_tokens": 47527347.0, "step": 1660 }, { "entropy": 0.5320184031501413, "epoch": 2.672, "grad_norm": 1.6875, "learning_rate": 1.8513333333333335e-05, "loss": 0.5159, "mean_token_accuracy": 0.8455484293401241, "num_tokens": 47808167.0, "step": 1670 }, { "entropy": 0.5261984881013632, "epoch": 2.6879999999999997, "grad_norm": 1.4140625, "learning_rate": 1.8491111111111112e-05, "loss": 0.5096, "mean_token_accuracy": 0.8478115413337945, "num_tokens": 48092035.0, "step": 1680 }, { "entropy": 0.5308143127709627, "epoch": 2.7039999999999997, "grad_norm": 1.609375, "learning_rate": 1.846888888888889e-05, "loss": 0.5208, "mean_token_accuracy": 0.8449086494743824, "num_tokens": 48386231.0, "step": 1690 }, { "entropy": 0.522861585021019, "epoch": 2.7199999999999998, "grad_norm": 1.5078125, "learning_rate": 1.8446666666666667e-05, "loss": 0.5101, "mean_token_accuracy": 0.8472402919083833, "num_tokens": 48663262.0, "step": 1700 }, { "epoch": 2.7199999999999998, "eval_chemistry_entropy": 0.5746280739307403, "eval_chemistry_loss": 0.7798963785171509, "eval_chemistry_mean_token_accuracy": 0.7963899078369141, "eval_chemistry_num_tokens": 48663262.0, "eval_chemistry_runtime": 34.7443, "eval_chemistry_samples_per_second": 14.391, "eval_chemistry_steps_per_second": 3.598, "step": 1700 }, { "epoch": 2.7199999999999998, "eval_math_entropy": 0.5586551280021668, "eval_math_loss": 0.6379777789115906, "eval_math_mean_token_accuracy": 0.8271588444709778, "eval_math_num_tokens": 48663262.0, "eval_math_runtime": 35.8034, "eval_math_samples_per_second": 13.965, "eval_math_steps_per_second": 3.491, "step": 1700 }, { "entropy": 0.5201129943132401, "epoch": 2.7359999999999998, "grad_norm": 1.4921875, "learning_rate": 1.842444444444445e-05, "loss": 0.5075, "mean_token_accuracy": 0.8486686907708645, "num_tokens": 48949456.0, "step": 1710 }, { "entropy": 0.5246327675879001, "epoch": 2.752, "grad_norm": 1.5078125, "learning_rate": 1.8402222222222223e-05, "loss": 0.5068, "mean_token_accuracy": 0.8480775609612465, "num_tokens": 49235751.0, "step": 1720 }, { "entropy": 0.5258618659339845, "epoch": 2.768, "grad_norm": 1.4453125, "learning_rate": 1.8380000000000004e-05, "loss": 0.5131, "mean_token_accuracy": 0.8470302954316139, "num_tokens": 49524628.0, "step": 1730 }, { "entropy": 0.5411237273365259, "epoch": 2.784, "grad_norm": 1.546875, "learning_rate": 1.8357777777777778e-05, "loss": 0.5228, "mean_token_accuracy": 0.8433229614049196, "num_tokens": 49799604.0, "step": 1740 }, { "entropy": 0.522053999081254, "epoch": 2.8, "grad_norm": 1.28125, "learning_rate": 1.833555555555556e-05, "loss": 0.509, "mean_token_accuracy": 0.8484720811247826, "num_tokens": 50079307.0, "step": 1750 }, { "entropy": 0.5252829389646649, "epoch": 2.816, "grad_norm": 1.5234375, "learning_rate": 1.8313333333333333e-05, "loss": 0.5056, "mean_token_accuracy": 0.8476968578994274, "num_tokens": 50372216.0, "step": 1760 }, { "entropy": 0.5136345084756613, "epoch": 2.832, "grad_norm": 1.59375, "learning_rate": 1.8291111111111114e-05, "loss": 0.5037, "mean_token_accuracy": 0.8490433126688004, "num_tokens": 50657211.0, "step": 1770 }, { "entropy": 0.5356445843353868, "epoch": 2.848, "grad_norm": 1.46875, "learning_rate": 1.8268888888888888e-05, "loss": 0.5228, "mean_token_accuracy": 0.8450450662523508, "num_tokens": 50946503.0, "step": 1780 }, { "entropy": 0.5204181535169482, "epoch": 2.864, "grad_norm": 1.609375, "learning_rate": 1.824666666666667e-05, "loss": 0.5087, "mean_token_accuracy": 0.8479438953101635, "num_tokens": 51229298.0, "step": 1790 }, { "entropy": 0.5238792803138494, "epoch": 2.88, "grad_norm": 1.5234375, "learning_rate": 1.8224444444444447e-05, "loss": 0.5047, "mean_token_accuracy": 0.8475392743945122, "num_tokens": 51509622.0, "step": 1800 }, { "epoch": 2.88, "eval_chemistry_entropy": 0.5654533207416534, "eval_chemistry_loss": 0.7800628542900085, "eval_chemistry_mean_token_accuracy": 0.7969496622085571, "eval_chemistry_num_tokens": 51509622.0, "eval_chemistry_runtime": 34.7126, "eval_chemistry_samples_per_second": 14.404, "eval_chemistry_steps_per_second": 3.601, "step": 1800 }, { "epoch": 2.88, "eval_math_entropy": 0.538058566570282, "eval_math_loss": 0.636343240737915, "eval_math_mean_token_accuracy": 0.8282605667114258, "eval_math_num_tokens": 51509622.0, "eval_math_runtime": 35.7752, "eval_math_samples_per_second": 13.976, "eval_math_steps_per_second": 3.494, "step": 1800 }, { "entropy": 0.5157067686319351, "epoch": 2.896, "grad_norm": 1.578125, "learning_rate": 1.8202222222222225e-05, "loss": 0.5005, "mean_token_accuracy": 0.8493956170976162, "num_tokens": 51790856.0, "step": 1810 }, { "entropy": 0.5220442852005363, "epoch": 2.912, "grad_norm": 1.5390625, "learning_rate": 1.8180000000000002e-05, "loss": 0.5082, "mean_token_accuracy": 0.8484512511640787, "num_tokens": 52074393.0, "step": 1820 }, { "entropy": 0.5298560874536633, "epoch": 2.928, "grad_norm": 1.5, "learning_rate": 1.815777777777778e-05, "loss": 0.5177, "mean_token_accuracy": 0.8467922162264585, "num_tokens": 52366467.0, "step": 1830 }, { "entropy": 0.5144043141975999, "epoch": 2.944, "grad_norm": 1.4765625, "learning_rate": 1.8135555555555557e-05, "loss": 0.5041, "mean_token_accuracy": 0.848617946729064, "num_tokens": 52657863.0, "step": 1840 }, { "entropy": 0.5239851342514157, "epoch": 2.96, "grad_norm": 1.5078125, "learning_rate": 1.8113333333333335e-05, "loss": 0.5086, "mean_token_accuracy": 0.8469469167292119, "num_tokens": 52947183.0, "step": 1850 }, { "entropy": 0.5214887933805585, "epoch": 2.976, "grad_norm": 1.390625, "learning_rate": 1.8091111111111113e-05, "loss": 0.5049, "mean_token_accuracy": 0.8478761337697506, "num_tokens": 53226984.0, "step": 1860 }, { "entropy": 0.5146605940535665, "epoch": 2.992, "grad_norm": 1.46875, "learning_rate": 1.806888888888889e-05, "loss": 0.4994, "mean_token_accuracy": 0.8494556181132793, "num_tokens": 53515204.0, "step": 1870 }, { "entropy": 0.47540333569049836, "epoch": 3.008, "grad_norm": 2.078125, "learning_rate": 1.8046666666666668e-05, "loss": 0.436, "mean_token_accuracy": 0.8662762742489576, "num_tokens": 53800592.0, "step": 1880 }, { "entropy": 0.35706824259832504, "epoch": 3.024, "grad_norm": 1.921875, "learning_rate": 1.8024444444444445e-05, "loss": 0.3438, "mean_token_accuracy": 0.8883892893791199, "num_tokens": 54092017.0, "step": 1890 }, { "entropy": 0.3444691884331405, "epoch": 3.04, "grad_norm": 1.90625, "learning_rate": 1.8002222222222223e-05, "loss": 0.3242, "mean_token_accuracy": 0.8934439823031426, "num_tokens": 54380158.0, "step": 1900 }, { "epoch": 3.04, "eval_chemistry_entropy": 0.4136744683980942, "eval_chemistry_loss": 0.9674221873283386, "eval_chemistry_mean_token_accuracy": 0.785107521533966, "eval_chemistry_num_tokens": 54380158.0, "eval_chemistry_runtime": 34.7442, "eval_chemistry_samples_per_second": 14.391, "eval_chemistry_steps_per_second": 3.598, "step": 1900 }, { "epoch": 3.04, "eval_math_entropy": 0.40369147157669066, "eval_math_loss": 0.7466345429420471, "eval_math_mean_token_accuracy": 0.8216371216773987, "eval_math_num_tokens": 54380158.0, "eval_math_runtime": 36.3078, "eval_math_samples_per_second": 13.771, "eval_math_steps_per_second": 3.443, "step": 1900 }, { "entropy": 0.3425462535582483, "epoch": 3.056, "grad_norm": 2.015625, "learning_rate": 1.798e-05, "loss": 0.3194, "mean_token_accuracy": 0.894545292109251, "num_tokens": 54658093.0, "step": 1910 }, { "entropy": 0.3412907443009317, "epoch": 3.072, "grad_norm": 2.015625, "learning_rate": 1.7957777777777778e-05, "loss": 0.3209, "mean_token_accuracy": 0.8948256101459264, "num_tokens": 54949938.0, "step": 1920 }, { "entropy": 0.32589660566300155, "epoch": 3.088, "grad_norm": 2.1875, "learning_rate": 1.7935555555555556e-05, "loss": 0.3083, "mean_token_accuracy": 0.8980491489171982, "num_tokens": 55234643.0, "step": 1930 }, { "entropy": 0.34667133893817664, "epoch": 3.104, "grad_norm": 2.015625, "learning_rate": 1.7913333333333337e-05, "loss": 0.3253, "mean_token_accuracy": 0.8923853240907192, "num_tokens": 55519679.0, "step": 1940 }, { "entropy": 0.33345927773043516, "epoch": 3.12, "grad_norm": 1.984375, "learning_rate": 1.789111111111111e-05, "loss": 0.3172, "mean_token_accuracy": 0.8965467110276222, "num_tokens": 55812397.0, "step": 1950 }, { "entropy": 0.33698156690225006, "epoch": 3.136, "grad_norm": 1.8984375, "learning_rate": 1.7868888888888892e-05, "loss": 0.319, "mean_token_accuracy": 0.8949916884303093, "num_tokens": 56101550.0, "step": 1960 }, { "entropy": 0.3372134535573423, "epoch": 3.152, "grad_norm": 2.140625, "learning_rate": 1.7846666666666666e-05, "loss": 0.3227, "mean_token_accuracy": 0.8940825492143631, "num_tokens": 56387662.0, "step": 1970 }, { "entropy": 0.3512561682611704, "epoch": 3.168, "grad_norm": 2.109375, "learning_rate": 1.7824444444444447e-05, "loss": 0.3342, "mean_token_accuracy": 0.8908473681658506, "num_tokens": 56673426.0, "step": 1980 }, { "entropy": 0.3502405107952654, "epoch": 3.184, "grad_norm": 2.296875, "learning_rate": 1.780222222222222e-05, "loss": 0.3273, "mean_token_accuracy": 0.8911666806787253, "num_tokens": 56957044.0, "step": 1990 }, { "entropy": 0.33547057397663593, "epoch": 3.2, "grad_norm": 1.96875, "learning_rate": 1.7780000000000003e-05, "loss": 0.3184, "mean_token_accuracy": 0.8955803520977497, "num_tokens": 57250097.0, "step": 2000 }, { "epoch": 3.2, "eval_chemistry_entropy": 0.3912388117313385, "eval_chemistry_loss": 0.9860958456993103, "eval_chemistry_mean_token_accuracy": 0.7855022087097168, "eval_chemistry_num_tokens": 57250097.0, "eval_chemistry_runtime": 34.752, "eval_chemistry_samples_per_second": 14.388, "eval_chemistry_steps_per_second": 3.597, "step": 2000 }, { "epoch": 3.2, "eval_math_entropy": 0.3877128950357437, "eval_math_loss": 0.7659533023834229, "eval_math_mean_token_accuracy": 0.8217745156288147, "eval_math_num_tokens": 57250097.0, "eval_math_runtime": 35.4631, "eval_math_samples_per_second": 14.099, "eval_math_steps_per_second": 3.525, "step": 2000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 16, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.343629994565034e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }