| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.2, |
| "eval_steps": 100, |
| "global_step": 2000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 0.5841219527646899, |
| "epoch": 0.016, |
| "grad_norm": 10.875, |
| "learning_rate": 1.8e-07, |
| "loss": 1.0536, |
| "mean_token_accuracy": 0.7792524203658104, |
| "num_tokens": 282849.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 0.6104077549651266, |
| "epoch": 0.032, |
| "grad_norm": 9.625, |
| "learning_rate": 3.8e-07, |
| "loss": 1.0813, |
| "mean_token_accuracy": 0.7717054646462203, |
| "num_tokens": 561726.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 0.604045126773417, |
| "epoch": 0.048, |
| "grad_norm": 8.9375, |
| "learning_rate": 5.800000000000001e-07, |
| "loss": 1.0425, |
| "mean_token_accuracy": 0.7788930047303438, |
| "num_tokens": 842092.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 0.6275176709517837, |
| "epoch": 0.064, |
| "grad_norm": 8.5625, |
| "learning_rate": 7.8e-07, |
| "loss": 1.0681, |
| "mean_token_accuracy": 0.7725805886089802, |
| "num_tokens": 1133349.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.6530287317931652, |
| "epoch": 0.08, |
| "grad_norm": 7.59375, |
| "learning_rate": 9.800000000000001e-07, |
| "loss": 1.0273, |
| "mean_token_accuracy": 0.7785692941397429, |
| "num_tokens": 1422340.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.6810424191877246, |
| "epoch": 0.096, |
| "grad_norm": 5.40625, |
| "learning_rate": 1.1800000000000001e-06, |
| "loss": 0.9779, |
| "mean_token_accuracy": 0.7848060742020607, |
| "num_tokens": 1711821.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.7722993083298206, |
| "epoch": 0.112, |
| "grad_norm": 4.03125, |
| "learning_rate": 1.3800000000000001e-06, |
| "loss": 0.9772, |
| "mean_token_accuracy": 0.782012278959155, |
| "num_tokens": 1991855.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.8406166139990091, |
| "epoch": 0.128, |
| "grad_norm": 3.171875, |
| "learning_rate": 1.5800000000000001e-06, |
| "loss": 0.959, |
| "mean_token_accuracy": 0.784759023040533, |
| "num_tokens": 2288720.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.8299949843436479, |
| "epoch": 0.144, |
| "grad_norm": 2.875, |
| "learning_rate": 1.7800000000000001e-06, |
| "loss": 0.9481, |
| "mean_token_accuracy": 0.7854762740433217, |
| "num_tokens": 2564352.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 0.7978475514799357, |
| "epoch": 0.16, |
| "grad_norm": 2.78125, |
| "learning_rate": 1.98e-06, |
| "loss": 0.9304, |
| "mean_token_accuracy": 0.7901426807045937, |
| "num_tokens": 2841876.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.16, |
| "eval_chemistry_entropy": 0.817941864490509, |
| "eval_chemistry_loss": 0.8207718729972839, |
| "eval_chemistry_mean_token_accuracy": 0.8094593362808228, |
| "eval_chemistry_num_tokens": 2841876.0, |
| "eval_chemistry_runtime": 34.8392, |
| "eval_chemistry_samples_per_second": 14.352, |
| "eval_chemistry_steps_per_second": 3.588, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.16, |
| "eval_math_entropy": 0.8300764834880829, |
| "eval_math_loss": 0.9698297381401062, |
| "eval_math_mean_token_accuracy": 0.7842242226600648, |
| "eval_math_num_tokens": 2841876.0, |
| "eval_math_runtime": 35.8893, |
| "eval_math_samples_per_second": 13.932, |
| "eval_math_steps_per_second": 3.483, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.8128786478191614, |
| "epoch": 0.176, |
| "grad_norm": 2.234375, |
| "learning_rate": 2.1800000000000003e-06, |
| "loss": 0.9222, |
| "mean_token_accuracy": 0.7908758621662855, |
| "num_tokens": 3129237.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 0.830844565667212, |
| "epoch": 0.192, |
| "grad_norm": 2.671875, |
| "learning_rate": 2.38e-06, |
| "loss": 0.9398, |
| "mean_token_accuracy": 0.7865626137703657, |
| "num_tokens": 3414875.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.8196758177131415, |
| "epoch": 0.208, |
| "grad_norm": 2.953125, |
| "learning_rate": 2.5800000000000003e-06, |
| "loss": 0.92, |
| "mean_token_accuracy": 0.7892169930040837, |
| "num_tokens": 3695623.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 0.8214951554313302, |
| "epoch": 0.224, |
| "grad_norm": 2.703125, |
| "learning_rate": 2.7800000000000005e-06, |
| "loss": 0.9102, |
| "mean_token_accuracy": 0.7903474017977714, |
| "num_tokens": 3977666.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.7900505751371384, |
| "epoch": 0.24, |
| "grad_norm": 2.609375, |
| "learning_rate": 2.9800000000000003e-06, |
| "loss": 0.8856, |
| "mean_token_accuracy": 0.7941167835146189, |
| "num_tokens": 4269287.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.783742449991405, |
| "epoch": 0.256, |
| "grad_norm": 2.328125, |
| "learning_rate": 3.1800000000000005e-06, |
| "loss": 0.857, |
| "mean_token_accuracy": 0.7969042614102364, |
| "num_tokens": 4558632.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 0.7985043607652187, |
| "epoch": 0.272, |
| "grad_norm": 2.28125, |
| "learning_rate": 3.3800000000000007e-06, |
| "loss": 0.8738, |
| "mean_token_accuracy": 0.7948056727647781, |
| "num_tokens": 4839270.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 0.7821594245731831, |
| "epoch": 0.288, |
| "grad_norm": 2.046875, |
| "learning_rate": 3.58e-06, |
| "loss": 0.8578, |
| "mean_token_accuracy": 0.7972537841647863, |
| "num_tokens": 5125447.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 0.8105486003682018, |
| "epoch": 0.304, |
| "grad_norm": 2.28125, |
| "learning_rate": 3.7800000000000002e-06, |
| "loss": 0.8516, |
| "mean_token_accuracy": 0.7949083779007197, |
| "num_tokens": 5415925.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 0.7668969418853522, |
| "epoch": 0.32, |
| "grad_norm": 2.125, |
| "learning_rate": 3.980000000000001e-06, |
| "loss": 0.8079, |
| "mean_token_accuracy": 0.80227146781981, |
| "num_tokens": 5710178.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.32, |
| "eval_chemistry_entropy": 0.8007053558826447, |
| "eval_chemistry_loss": 0.7652505040168762, |
| "eval_chemistry_mean_token_accuracy": 0.8065420956611633, |
| "eval_chemistry_num_tokens": 5710178.0, |
| "eval_chemistry_runtime": 34.8946, |
| "eval_chemistry_samples_per_second": 14.329, |
| "eval_chemistry_steps_per_second": 3.582, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.32, |
| "eval_math_entropy": 0.8049644639492035, |
| "eval_math_loss": 0.8578509092330933, |
| "eval_math_mean_token_accuracy": 0.7935372619628906, |
| "eval_math_num_tokens": 5710178.0, |
| "eval_math_runtime": 35.9751, |
| "eval_math_samples_per_second": 13.899, |
| "eval_math_steps_per_second": 3.475, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.8033319305628538, |
| "epoch": 0.336, |
| "grad_norm": 2.0625, |
| "learning_rate": 4.18e-06, |
| "loss": 0.8255, |
| "mean_token_accuracy": 0.7956937287002802, |
| "num_tokens": 5993843.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 0.776984517276287, |
| "epoch": 0.352, |
| "grad_norm": 2.3125, |
| "learning_rate": 4.38e-06, |
| "loss": 0.8027, |
| "mean_token_accuracy": 0.8002396024763584, |
| "num_tokens": 6279137.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 0.7891015050932765, |
| "epoch": 0.368, |
| "grad_norm": 1.921875, |
| "learning_rate": 4.58e-06, |
| "loss": 0.7877, |
| "mean_token_accuracy": 0.7985758237540722, |
| "num_tokens": 6562387.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 0.7725065175443888, |
| "epoch": 0.384, |
| "grad_norm": 2.09375, |
| "learning_rate": 4.78e-06, |
| "loss": 0.7733, |
| "mean_token_accuracy": 0.8037754170596599, |
| "num_tokens": 6851520.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 0.8092642717063427, |
| "epoch": 0.4, |
| "grad_norm": 2.09375, |
| "learning_rate": 4.980000000000001e-06, |
| "loss": 0.8171, |
| "mean_token_accuracy": 0.7956945843994617, |
| "num_tokens": 7138932.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.7882049404084682, |
| "epoch": 0.416, |
| "grad_norm": 1.8359375, |
| "learning_rate": 5.18e-06, |
| "loss": 0.7764, |
| "mean_token_accuracy": 0.8010819923132658, |
| "num_tokens": 7420720.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 0.7719787143170833, |
| "epoch": 0.432, |
| "grad_norm": 1.875, |
| "learning_rate": 5.380000000000001e-06, |
| "loss": 0.7655, |
| "mean_token_accuracy": 0.8039833098649979, |
| "num_tokens": 7695518.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 0.7650891080498695, |
| "epoch": 0.448, |
| "grad_norm": 2.484375, |
| "learning_rate": 5.580000000000001e-06, |
| "loss": 0.7529, |
| "mean_token_accuracy": 0.8057680625468493, |
| "num_tokens": 7983543.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 0.7654503894969821, |
| "epoch": 0.464, |
| "grad_norm": 2.96875, |
| "learning_rate": 5.78e-06, |
| "loss": 0.7582, |
| "mean_token_accuracy": 0.8070501256734133, |
| "num_tokens": 8271787.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 0.7974133348092437, |
| "epoch": 0.48, |
| "grad_norm": 2.046875, |
| "learning_rate": 5.98e-06, |
| "loss": 0.7749, |
| "mean_token_accuracy": 0.8022410120815039, |
| "num_tokens": 8556170.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.48, |
| "eval_chemistry_entropy": 0.7947752397060395, |
| "eval_chemistry_loss": 0.7221509218215942, |
| "eval_chemistry_mean_token_accuracy": 0.8083039855957032, |
| "eval_chemistry_num_tokens": 8556170.0, |
| "eval_chemistry_runtime": 34.8605, |
| "eval_chemistry_samples_per_second": 14.343, |
| "eval_chemistry_steps_per_second": 3.586, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.48, |
| "eval_math_entropy": 0.7911813812255859, |
| "eval_math_loss": 0.7818350195884705, |
| "eval_math_mean_token_accuracy": 0.8022260117530823, |
| "eval_math_num_tokens": 8556170.0, |
| "eval_math_runtime": 35.8842, |
| "eval_math_samples_per_second": 13.934, |
| "eval_math_steps_per_second": 3.483, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.7553723743185401, |
| "epoch": 0.496, |
| "grad_norm": 2.09375, |
| "learning_rate": 6.18e-06, |
| "loss": 0.7364, |
| "mean_token_accuracy": 0.809390364587307, |
| "num_tokens": 8840590.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 0.763883944042027, |
| "epoch": 0.512, |
| "grad_norm": 1.984375, |
| "learning_rate": 6.380000000000001e-06, |
| "loss": 0.737, |
| "mean_token_accuracy": 0.8089169431477785, |
| "num_tokens": 9131903.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 0.7371911529451609, |
| "epoch": 0.528, |
| "grad_norm": 1.9765625, |
| "learning_rate": 6.5800000000000005e-06, |
| "loss": 0.7238, |
| "mean_token_accuracy": 0.8125522993505001, |
| "num_tokens": 9429519.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 0.753829506970942, |
| "epoch": 0.544, |
| "grad_norm": 2.671875, |
| "learning_rate": 6.780000000000001e-06, |
| "loss": 0.7411, |
| "mean_token_accuracy": 0.8106916502118111, |
| "num_tokens": 9718247.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 0.7809748956933618, |
| "epoch": 0.56, |
| "grad_norm": 2.90625, |
| "learning_rate": 6.98e-06, |
| "loss": 0.756, |
| "mean_token_accuracy": 0.803440049290657, |
| "num_tokens": 9991100.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.7534385286271572, |
| "epoch": 0.576, |
| "grad_norm": 2.125, |
| "learning_rate": 7.180000000000001e-06, |
| "loss": 0.7358, |
| "mean_token_accuracy": 0.807822210714221, |
| "num_tokens": 10276090.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 0.7527043262496591, |
| "epoch": 0.592, |
| "grad_norm": 1.75, |
| "learning_rate": 7.3800000000000005e-06, |
| "loss": 0.7343, |
| "mean_token_accuracy": 0.8090104408562183, |
| "num_tokens": 10566762.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 0.7644007310271264, |
| "epoch": 0.608, |
| "grad_norm": 1.875, |
| "learning_rate": 7.58e-06, |
| "loss": 0.7436, |
| "mean_token_accuracy": 0.8086390372365713, |
| "num_tokens": 10857002.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 0.7420919336378574, |
| "epoch": 0.624, |
| "grad_norm": 2.515625, |
| "learning_rate": 7.78e-06, |
| "loss": 0.7239, |
| "mean_token_accuracy": 0.8104523837566375, |
| "num_tokens": 11147326.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 0.7273864936083555, |
| "epoch": 0.64, |
| "grad_norm": 1.7109375, |
| "learning_rate": 7.980000000000002e-06, |
| "loss": 0.7111, |
| "mean_token_accuracy": 0.8139414019882679, |
| "num_tokens": 11437710.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.64, |
| "eval_chemistry_entropy": 0.7778651757240296, |
| "eval_chemistry_loss": 0.7157668471336365, |
| "eval_chemistry_mean_token_accuracy": 0.808542202949524, |
| "eval_chemistry_num_tokens": 11437710.0, |
| "eval_chemistry_runtime": 34.8261, |
| "eval_chemistry_samples_per_second": 14.357, |
| "eval_chemistry_steps_per_second": 3.589, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.64, |
| "eval_math_entropy": 0.763886357307434, |
| "eval_math_loss": 0.7481760382652283, |
| "eval_math_mean_token_accuracy": 0.8066890263557434, |
| "eval_math_num_tokens": 11437710.0, |
| "eval_math_runtime": 36.0154, |
| "eval_math_samples_per_second": 13.883, |
| "eval_math_steps_per_second": 3.471, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.729702671058476, |
| "epoch": 0.656, |
| "grad_norm": 1.578125, |
| "learning_rate": 8.18e-06, |
| "loss": 0.7083, |
| "mean_token_accuracy": 0.8148457534611225, |
| "num_tokens": 11727268.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 0.7371693229302764, |
| "epoch": 0.672, |
| "grad_norm": 1.8515625, |
| "learning_rate": 8.380000000000001e-06, |
| "loss": 0.7199, |
| "mean_token_accuracy": 0.8099267814308405, |
| "num_tokens": 12020478.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 0.7293412426486612, |
| "epoch": 0.688, |
| "grad_norm": 1.6875, |
| "learning_rate": 8.580000000000001e-06, |
| "loss": 0.7068, |
| "mean_token_accuracy": 0.8130085315555334, |
| "num_tokens": 12300380.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 0.718653773330152, |
| "epoch": 0.704, |
| "grad_norm": 2.046875, |
| "learning_rate": 8.78e-06, |
| "loss": 0.6982, |
| "mean_token_accuracy": 0.8138067662715912, |
| "num_tokens": 12591366.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 0.7371329203248024, |
| "epoch": 0.72, |
| "grad_norm": 1.9609375, |
| "learning_rate": 8.98e-06, |
| "loss": 0.7104, |
| "mean_token_accuracy": 0.8120142966508865, |
| "num_tokens": 12869618.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.7198993725702166, |
| "epoch": 0.736, |
| "grad_norm": 1.921875, |
| "learning_rate": 9.180000000000002e-06, |
| "loss": 0.7009, |
| "mean_token_accuracy": 0.815392379835248, |
| "num_tokens": 13156311.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 0.7063086304813624, |
| "epoch": 0.752, |
| "grad_norm": 1.8046875, |
| "learning_rate": 9.38e-06, |
| "loss": 0.6793, |
| "mean_token_accuracy": 0.8184278599917889, |
| "num_tokens": 13438911.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 0.6851496128365397, |
| "epoch": 0.768, |
| "grad_norm": 1.5078125, |
| "learning_rate": 9.58e-06, |
| "loss": 0.6731, |
| "mean_token_accuracy": 0.8231410879641772, |
| "num_tokens": 13735397.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 0.7282077683135867, |
| "epoch": 0.784, |
| "grad_norm": 1.9609375, |
| "learning_rate": 9.780000000000001e-06, |
| "loss": 0.7094, |
| "mean_token_accuracy": 0.8130000628530979, |
| "num_tokens": 14024497.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 0.7247441383078694, |
| "epoch": 0.8, |
| "grad_norm": 2.0, |
| "learning_rate": 9.980000000000001e-06, |
| "loss": 0.7048, |
| "mean_token_accuracy": 0.812597556039691, |
| "num_tokens": 14316427.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8, |
| "eval_chemistry_entropy": 0.7653646836280823, |
| "eval_chemistry_loss": 0.7123190760612488, |
| "eval_chemistry_mean_token_accuracy": 0.8073507056236268, |
| "eval_chemistry_num_tokens": 14316427.0, |
| "eval_chemistry_runtime": 34.8547, |
| "eval_chemistry_samples_per_second": 14.345, |
| "eval_chemistry_steps_per_second": 3.586, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8, |
| "eval_math_entropy": 0.7450865380764008, |
| "eval_math_loss": 0.7230644822120667, |
| "eval_math_mean_token_accuracy": 0.8105058965682983, |
| "eval_math_num_tokens": 14316427.0, |
| "eval_math_runtime": 35.5994, |
| "eval_math_samples_per_second": 14.045, |
| "eval_math_steps_per_second": 3.511, |
| "step": 500 |
| }, |
| { |
| "entropy": 0.714242628775537, |
| "epoch": 0.816, |
| "grad_norm": 2.046875, |
| "learning_rate": 1.018e-05, |
| "loss": 0.6884, |
| "mean_token_accuracy": 0.8151328191161156, |
| "num_tokens": 14607136.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 0.7071703864261508, |
| "epoch": 0.832, |
| "grad_norm": 1.84375, |
| "learning_rate": 1.038e-05, |
| "loss": 0.6839, |
| "mean_token_accuracy": 0.8182855024933815, |
| "num_tokens": 14894084.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 0.7319832380861044, |
| "epoch": 0.848, |
| "grad_norm": 1.8359375, |
| "learning_rate": 1.0580000000000002e-05, |
| "loss": 0.709, |
| "mean_token_accuracy": 0.8122099358588457, |
| "num_tokens": 15177039.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 0.7191415606066585, |
| "epoch": 0.864, |
| "grad_norm": 1.546875, |
| "learning_rate": 1.0780000000000002e-05, |
| "loss": 0.6966, |
| "mean_token_accuracy": 0.8139543637633324, |
| "num_tokens": 15463393.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 0.7217472817748785, |
| "epoch": 0.88, |
| "grad_norm": 1.8359375, |
| "learning_rate": 1.0980000000000002e-05, |
| "loss": 0.699, |
| "mean_token_accuracy": 0.8122927758842706, |
| "num_tokens": 15747943.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 0.7142911188304424, |
| "epoch": 0.896, |
| "grad_norm": 1.6484375, |
| "learning_rate": 1.1180000000000001e-05, |
| "loss": 0.6964, |
| "mean_token_accuracy": 0.8145412191748619, |
| "num_tokens": 16027931.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 0.7013007398694754, |
| "epoch": 0.912, |
| "grad_norm": 1.7421875, |
| "learning_rate": 1.138e-05, |
| "loss": 0.6761, |
| "mean_token_accuracy": 0.8172509890049696, |
| "num_tokens": 16317127.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 0.6971628932282329, |
| "epoch": 0.928, |
| "grad_norm": 1.578125, |
| "learning_rate": 1.1580000000000001e-05, |
| "loss": 0.6725, |
| "mean_token_accuracy": 0.8180789042264223, |
| "num_tokens": 16598757.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 0.7133341139182449, |
| "epoch": 0.944, |
| "grad_norm": 5.8125, |
| "learning_rate": 1.178e-05, |
| "loss": 0.6958, |
| "mean_token_accuracy": 0.8153590984642506, |
| "num_tokens": 16882236.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 0.7189922722056508, |
| "epoch": 0.96, |
| "grad_norm": 1.4453125, |
| "learning_rate": 1.198e-05, |
| "loss": 0.6956, |
| "mean_token_accuracy": 0.8149452641606331, |
| "num_tokens": 17169495.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.96, |
| "eval_chemistry_entropy": 0.7501673064231873, |
| "eval_chemistry_loss": 0.7121618390083313, |
| "eval_chemistry_mean_token_accuracy": 0.8068772978782653, |
| "eval_chemistry_num_tokens": 17169495.0, |
| "eval_chemistry_runtime": 34.8316, |
| "eval_chemistry_samples_per_second": 14.355, |
| "eval_chemistry_steps_per_second": 3.589, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.96, |
| "eval_math_entropy": 0.7047771346569062, |
| "eval_math_loss": 0.7034372687339783, |
| "eval_math_mean_token_accuracy": 0.8131817808151245, |
| "eval_math_num_tokens": 17169495.0, |
| "eval_math_runtime": 35.8696, |
| "eval_math_samples_per_second": 13.939, |
| "eval_math_steps_per_second": 3.485, |
| "step": 600 |
| }, |
| { |
| "entropy": 0.6952510023489594, |
| "epoch": 0.976, |
| "grad_norm": 1.875, |
| "learning_rate": 1.218e-05, |
| "loss": 0.6788, |
| "mean_token_accuracy": 0.817372427508235, |
| "num_tokens": 17454147.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 0.7092014687135816, |
| "epoch": 0.992, |
| "grad_norm": 1.6640625, |
| "learning_rate": 1.2380000000000002e-05, |
| "loss": 0.6838, |
| "mean_token_accuracy": 0.8138624154031276, |
| "num_tokens": 17742596.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 0.694889472052455, |
| "epoch": 1.008, |
| "grad_norm": 1.5546875, |
| "learning_rate": 1.2580000000000002e-05, |
| "loss": 0.6742, |
| "mean_token_accuracy": 0.8174668036401271, |
| "num_tokens": 18033540.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 0.6714398205280304, |
| "epoch": 1.024, |
| "grad_norm": 1.7578125, |
| "learning_rate": 1.2780000000000001e-05, |
| "loss": 0.6507, |
| "mean_token_accuracy": 0.8213358622044324, |
| "num_tokens": 18316346.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 0.6660828510299325, |
| "epoch": 1.04, |
| "grad_norm": 1.796875, |
| "learning_rate": 1.2980000000000001e-05, |
| "loss": 0.6496, |
| "mean_token_accuracy": 0.8221284162253142, |
| "num_tokens": 18595230.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 0.6795160492882133, |
| "epoch": 1.056, |
| "grad_norm": 1.765625, |
| "learning_rate": 1.3180000000000001e-05, |
| "loss": 0.6581, |
| "mean_token_accuracy": 0.8199356343597174, |
| "num_tokens": 18885547.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 0.6750219637528062, |
| "epoch": 1.072, |
| "grad_norm": 1.6484375, |
| "learning_rate": 1.3380000000000002e-05, |
| "loss": 0.6555, |
| "mean_token_accuracy": 0.8197014667093754, |
| "num_tokens": 19169083.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 0.6712652388960123, |
| "epoch": 1.088, |
| "grad_norm": 1.7109375, |
| "learning_rate": 1.3580000000000002e-05, |
| "loss": 0.6434, |
| "mean_token_accuracy": 0.8221997711807489, |
| "num_tokens": 19461465.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 0.6703712901100516, |
| "epoch": 1.104, |
| "grad_norm": 1.46875, |
| "learning_rate": 1.378e-05, |
| "loss": 0.6514, |
| "mean_token_accuracy": 0.8217100899666547, |
| "num_tokens": 19755027.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 0.6529567580670118, |
| "epoch": 1.12, |
| "grad_norm": 1.765625, |
| "learning_rate": 1.398e-05, |
| "loss": 0.6325, |
| "mean_token_accuracy": 0.8260030064731836, |
| "num_tokens": 20043293.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.12, |
| "eval_chemistry_entropy": 0.7117109818458557, |
| "eval_chemistry_loss": 0.715001106262207, |
| "eval_chemistry_mean_token_accuracy": 0.805173789024353, |
| "eval_chemistry_num_tokens": 20043293.0, |
| "eval_chemistry_runtime": 34.8451, |
| "eval_chemistry_samples_per_second": 14.349, |
| "eval_chemistry_steps_per_second": 3.587, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.12, |
| "eval_math_entropy": 0.6742688639163971, |
| "eval_math_loss": 0.6903207302093506, |
| "eval_math_mean_token_accuracy": 0.8157781276702881, |
| "eval_math_num_tokens": 20043293.0, |
| "eval_math_runtime": 35.8894, |
| "eval_math_samples_per_second": 13.932, |
| "eval_math_steps_per_second": 3.483, |
| "step": 700 |
| }, |
| { |
| "entropy": 0.6546579284593463, |
| "epoch": 1.1360000000000001, |
| "grad_norm": 1.671875, |
| "learning_rate": 1.418e-05, |
| "loss": 0.6415, |
| "mean_token_accuracy": 0.8239764388650656, |
| "num_tokens": 20323010.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 0.6666190484538674, |
| "epoch": 1.152, |
| "grad_norm": 1.6953125, |
| "learning_rate": 1.4380000000000001e-05, |
| "loss": 0.6493, |
| "mean_token_accuracy": 0.8230571333318949, |
| "num_tokens": 20613266.0, |
| "step": 720 |
| }, |
| { |
| "entropy": 0.6584050474688411, |
| "epoch": 1.168, |
| "grad_norm": 1.5, |
| "learning_rate": 1.4580000000000001e-05, |
| "loss": 0.6408, |
| "mean_token_accuracy": 0.8246052328497171, |
| "num_tokens": 20910404.0, |
| "step": 730 |
| }, |
| { |
| "entropy": 0.664110666513443, |
| "epoch": 1.184, |
| "grad_norm": 1.734375, |
| "learning_rate": 1.478e-05, |
| "loss": 0.6435, |
| "mean_token_accuracy": 0.8229058619588614, |
| "num_tokens": 21200197.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 0.6511959439143539, |
| "epoch": 1.2, |
| "grad_norm": 1.46875, |
| "learning_rate": 1.498e-05, |
| "loss": 0.633, |
| "mean_token_accuracy": 0.8250994741916656, |
| "num_tokens": 21496441.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 0.668997959420085, |
| "epoch": 1.216, |
| "grad_norm": 1.7421875, |
| "learning_rate": 1.5180000000000002e-05, |
| "loss": 0.6456, |
| "mean_token_accuracy": 0.8220734592527151, |
| "num_tokens": 21778030.0, |
| "step": 760 |
| }, |
| { |
| "entropy": 0.6459713563323021, |
| "epoch": 1.232, |
| "grad_norm": 1.75, |
| "learning_rate": 1.5380000000000002e-05, |
| "loss": 0.6253, |
| "mean_token_accuracy": 0.8270299468189478, |
| "num_tokens": 22060851.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 0.6455961847677827, |
| "epoch": 1.248, |
| "grad_norm": 1.5859375, |
| "learning_rate": 1.5580000000000003e-05, |
| "loss": 0.6255, |
| "mean_token_accuracy": 0.8260451622307301, |
| "num_tokens": 22348738.0, |
| "step": 780 |
| }, |
| { |
| "entropy": 0.6676814066246152, |
| "epoch": 1.264, |
| "grad_norm": 1.484375, |
| "learning_rate": 1.578e-05, |
| "loss": 0.6457, |
| "mean_token_accuracy": 0.8219054654240608, |
| "num_tokens": 22635039.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 0.6654805542901159, |
| "epoch": 1.28, |
| "grad_norm": 1.53125, |
| "learning_rate": 1.5980000000000003e-05, |
| "loss": 0.6444, |
| "mean_token_accuracy": 0.8224660288542509, |
| "num_tokens": 22915190.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.28, |
| "eval_chemistry_entropy": 0.7472894523143768, |
| "eval_chemistry_loss": 0.7169390916824341, |
| "eval_chemistry_mean_token_accuracy": 0.805417845249176, |
| "eval_chemistry_num_tokens": 22915190.0, |
| "eval_chemistry_runtime": 34.8749, |
| "eval_chemistry_samples_per_second": 14.337, |
| "eval_chemistry_steps_per_second": 3.584, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.28, |
| "eval_math_entropy": 0.6949311044216157, |
| "eval_math_loss": 0.678849458694458, |
| "eval_math_mean_token_accuracy": 0.8176955370903015, |
| "eval_math_num_tokens": 22915190.0, |
| "eval_math_runtime": 35.8885, |
| "eval_math_samples_per_second": 13.932, |
| "eval_math_steps_per_second": 3.483, |
| "step": 800 |
| }, |
| { |
| "entropy": 0.6526781121268869, |
| "epoch": 1.296, |
| "grad_norm": 1.46875, |
| "learning_rate": 1.618e-05, |
| "loss": 0.627, |
| "mean_token_accuracy": 0.8251245643943548, |
| "num_tokens": 23194947.0, |
| "step": 810 |
| }, |
| { |
| "entropy": 0.6623267890885473, |
| "epoch": 1.312, |
| "grad_norm": 1.65625, |
| "learning_rate": 1.638e-05, |
| "loss": 0.6522, |
| "mean_token_accuracy": 0.8221593346446753, |
| "num_tokens": 23483496.0, |
| "step": 820 |
| }, |
| { |
| "entropy": 0.675568882189691, |
| "epoch": 1.328, |
| "grad_norm": 1.6640625, |
| "learning_rate": 1.658e-05, |
| "loss": 0.6523, |
| "mean_token_accuracy": 0.8208769094198942, |
| "num_tokens": 23762589.0, |
| "step": 830 |
| }, |
| { |
| "entropy": 0.6467019423842431, |
| "epoch": 1.3439999999999999, |
| "grad_norm": 1.6953125, |
| "learning_rate": 1.6780000000000002e-05, |
| "loss": 0.6297, |
| "mean_token_accuracy": 0.825079932808876, |
| "num_tokens": 24041497.0, |
| "step": 840 |
| }, |
| { |
| "entropy": 0.656816397048533, |
| "epoch": 1.3599999999999999, |
| "grad_norm": 1.296875, |
| "learning_rate": 1.698e-05, |
| "loss": 0.6365, |
| "mean_token_accuracy": 0.8254420697689057, |
| "num_tokens": 24330591.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 0.6973325841128826, |
| "epoch": 1.376, |
| "grad_norm": 1.484375, |
| "learning_rate": 1.718e-05, |
| "loss": 0.6754, |
| "mean_token_accuracy": 0.8146579563617706, |
| "num_tokens": 24611420.0, |
| "step": 860 |
| }, |
| { |
| "entropy": 0.6515109525993467, |
| "epoch": 1.392, |
| "grad_norm": 1.5234375, |
| "learning_rate": 1.7380000000000003e-05, |
| "loss": 0.6297, |
| "mean_token_accuracy": 0.8239485524594784, |
| "num_tokens": 24894694.0, |
| "step": 870 |
| }, |
| { |
| "entropy": 0.6447439486160874, |
| "epoch": 1.408, |
| "grad_norm": 1.546875, |
| "learning_rate": 1.758e-05, |
| "loss": 0.6231, |
| "mean_token_accuracy": 0.8267700038850307, |
| "num_tokens": 25184946.0, |
| "step": 880 |
| }, |
| { |
| "entropy": 0.6419832136482, |
| "epoch": 1.424, |
| "grad_norm": 1.3515625, |
| "learning_rate": 1.7780000000000003e-05, |
| "loss": 0.624, |
| "mean_token_accuracy": 0.8271039195358754, |
| "num_tokens": 25474746.0, |
| "step": 890 |
| }, |
| { |
| "entropy": 0.6597341772168874, |
| "epoch": 1.44, |
| "grad_norm": 1.4296875, |
| "learning_rate": 1.798e-05, |
| "loss": 0.6378, |
| "mean_token_accuracy": 0.8231316354125738, |
| "num_tokens": 25758550.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.44, |
| "eval_chemistry_entropy": 0.7189412865638732, |
| "eval_chemistry_loss": 0.721181333065033, |
| "eval_chemistry_mean_token_accuracy": 0.8038096494674682, |
| "eval_chemistry_num_tokens": 25758550.0, |
| "eval_chemistry_runtime": 34.8369, |
| "eval_chemistry_samples_per_second": 14.353, |
| "eval_chemistry_steps_per_second": 3.588, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.44, |
| "eval_math_entropy": 0.6560292990207672, |
| "eval_math_loss": 0.6677282452583313, |
| "eval_math_mean_token_accuracy": 0.8188103575706482, |
| "eval_math_num_tokens": 25758550.0, |
| "eval_math_runtime": 35.8656, |
| "eval_math_samples_per_second": 13.941, |
| "eval_math_steps_per_second": 3.485, |
| "step": 900 |
| }, |
| { |
| "entropy": 0.6441044477745891, |
| "epoch": 1.456, |
| "grad_norm": 1.5078125, |
| "learning_rate": 1.8180000000000002e-05, |
| "loss": 0.6259, |
| "mean_token_accuracy": 0.8252961106598378, |
| "num_tokens": 26039862.0, |
| "step": 910 |
| }, |
| { |
| "entropy": 0.6502820059657097, |
| "epoch": 1.472, |
| "grad_norm": 1.3125, |
| "learning_rate": 1.8380000000000004e-05, |
| "loss": 0.6342, |
| "mean_token_accuracy": 0.8237598706036806, |
| "num_tokens": 26318666.0, |
| "step": 920 |
| }, |
| { |
| "entropy": 0.6764587434008718, |
| "epoch": 1.488, |
| "grad_norm": 1.40625, |
| "learning_rate": 1.858e-05, |
| "loss": 0.6547, |
| "mean_token_accuracy": 0.8194571785628796, |
| "num_tokens": 26594867.0, |
| "step": 930 |
| }, |
| { |
| "entropy": 0.6333928175270558, |
| "epoch": 1.504, |
| "grad_norm": 1.6875, |
| "learning_rate": 1.878e-05, |
| "loss": 0.6176, |
| "mean_token_accuracy": 0.8289908330887557, |
| "num_tokens": 26887233.0, |
| "step": 940 |
| }, |
| { |
| "entropy": 0.6682084022089839, |
| "epoch": 1.52, |
| "grad_norm": 1.2109375, |
| "learning_rate": 1.898e-05, |
| "loss": 0.6442, |
| "mean_token_accuracy": 0.8210660863667727, |
| "num_tokens": 27177448.0, |
| "step": 950 |
| }, |
| { |
| "entropy": 0.6442653369158506, |
| "epoch": 1.536, |
| "grad_norm": 1.2109375, |
| "learning_rate": 1.918e-05, |
| "loss": 0.6319, |
| "mean_token_accuracy": 0.8249651778489351, |
| "num_tokens": 27467857.0, |
| "step": 960 |
| }, |
| { |
| "entropy": 0.6457945328205824, |
| "epoch": 1.552, |
| "grad_norm": 1.3828125, |
| "learning_rate": 1.938e-05, |
| "loss": 0.6262, |
| "mean_token_accuracy": 0.8254117891192436, |
| "num_tokens": 27755138.0, |
| "step": 970 |
| }, |
| { |
| "entropy": 0.6328146204352378, |
| "epoch": 1.568, |
| "grad_norm": 1.234375, |
| "learning_rate": 1.9580000000000002e-05, |
| "loss": 0.6132, |
| "mean_token_accuracy": 0.8306384857743978, |
| "num_tokens": 28049363.0, |
| "step": 980 |
| }, |
| { |
| "entropy": 0.6414034033194185, |
| "epoch": 1.584, |
| "grad_norm": 1.4375, |
| "learning_rate": 1.978e-05, |
| "loss": 0.6233, |
| "mean_token_accuracy": 0.8266693830490113, |
| "num_tokens": 28337890.0, |
| "step": 990 |
| }, |
| { |
| "entropy": 0.6323226554319262, |
| "epoch": 1.6, |
| "grad_norm": 1.4296875, |
| "learning_rate": 1.9980000000000002e-05, |
| "loss": 0.6189, |
| "mean_token_accuracy": 0.8256060272455216, |
| "num_tokens": 28626068.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.6, |
| "eval_chemistry_entropy": 0.7118635489940643, |
| "eval_chemistry_loss": 0.7256439924240112, |
| "eval_chemistry_mean_token_accuracy": 0.8026832752227783, |
| "eval_chemistry_num_tokens": 28626068.0, |
| "eval_chemistry_runtime": 34.8345, |
| "eval_chemistry_samples_per_second": 14.354, |
| "eval_chemistry_steps_per_second": 3.588, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.6, |
| "eval_math_entropy": 0.6450024034976959, |
| "eval_math_loss": 0.6582702994346619, |
| "eval_math_mean_token_accuracy": 0.820967010974884, |
| "eval_math_num_tokens": 28626068.0, |
| "eval_math_runtime": 35.6245, |
| "eval_math_samples_per_second": 14.035, |
| "eval_math_steps_per_second": 3.509, |
| "step": 1000 |
| }, |
| { |
| "entropy": 0.6489538656547665, |
| "epoch": 1.616, |
| "grad_norm": 1.3125, |
| "learning_rate": 1.9980000000000002e-05, |
| "loss": 0.636, |
| "mean_token_accuracy": 0.8241962313652038, |
| "num_tokens": 28913208.0, |
| "step": 1010 |
| }, |
| { |
| "entropy": 0.656857686303556, |
| "epoch": 1.6320000000000001, |
| "grad_norm": 1.3359375, |
| "learning_rate": 1.995777777777778e-05, |
| "loss": 0.6357, |
| "mean_token_accuracy": 0.8214856889098883, |
| "num_tokens": 29188917.0, |
| "step": 1020 |
| }, |
| { |
| "entropy": 0.6321157278493047, |
| "epoch": 1.6480000000000001, |
| "grad_norm": 1.390625, |
| "learning_rate": 1.9935555555555557e-05, |
| "loss": 0.6152, |
| "mean_token_accuracy": 0.8298161163926124, |
| "num_tokens": 29477292.0, |
| "step": 1030 |
| }, |
| { |
| "entropy": 0.6432073757052421, |
| "epoch": 1.6640000000000001, |
| "grad_norm": 1.3359375, |
| "learning_rate": 1.9913333333333335e-05, |
| "loss": 0.62, |
| "mean_token_accuracy": 0.8257944118231535, |
| "num_tokens": 29772480.0, |
| "step": 1040 |
| }, |
| { |
| "entropy": 0.647973028384149, |
| "epoch": 1.6800000000000002, |
| "grad_norm": 1.2109375, |
| "learning_rate": 1.9891111111111112e-05, |
| "loss": 0.6323, |
| "mean_token_accuracy": 0.8256966724991799, |
| "num_tokens": 30061007.0, |
| "step": 1050 |
| }, |
| { |
| "entropy": 0.6499059528112412, |
| "epoch": 1.696, |
| "grad_norm": 1.296875, |
| "learning_rate": 1.986888888888889e-05, |
| "loss": 0.6316, |
| "mean_token_accuracy": 0.8242154024541378, |
| "num_tokens": 30344415.0, |
| "step": 1060 |
| }, |
| { |
| "entropy": 0.6270535726100206, |
| "epoch": 1.712, |
| "grad_norm": 1.2109375, |
| "learning_rate": 1.9846666666666668e-05, |
| "loss": 0.6152, |
| "mean_token_accuracy": 0.8287085957825184, |
| "num_tokens": 30640891.0, |
| "step": 1070 |
| }, |
| { |
| "entropy": 0.6247420905157923, |
| "epoch": 1.728, |
| "grad_norm": 1.3515625, |
| "learning_rate": 1.9824444444444445e-05, |
| "loss": 0.6059, |
| "mean_token_accuracy": 0.8300342559814453, |
| "num_tokens": 30921295.0, |
| "step": 1080 |
| }, |
| { |
| "entropy": 0.6317020528018474, |
| "epoch": 1.744, |
| "grad_norm": 1.3984375, |
| "learning_rate": 1.9802222222222226e-05, |
| "loss": 0.6158, |
| "mean_token_accuracy": 0.828870889171958, |
| "num_tokens": 31206375.0, |
| "step": 1090 |
| }, |
| { |
| "entropy": 0.6282909054309129, |
| "epoch": 1.76, |
| "grad_norm": 1.1953125, |
| "learning_rate": 1.978e-05, |
| "loss": 0.6101, |
| "mean_token_accuracy": 0.8280309360474348, |
| "num_tokens": 31491856.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.76, |
| "eval_chemistry_entropy": 0.7381789875030518, |
| "eval_chemistry_loss": 0.7256398797035217, |
| "eval_chemistry_mean_token_accuracy": 0.8018755903244018, |
| "eval_chemistry_num_tokens": 31491856.0, |
| "eval_chemistry_runtime": 34.8532, |
| "eval_chemistry_samples_per_second": 14.346, |
| "eval_chemistry_steps_per_second": 3.586, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.76, |
| "eval_math_entropy": 0.6412687346935272, |
| "eval_math_loss": 0.6473493576049805, |
| "eval_math_mean_token_accuracy": 0.8233258814811707, |
| "eval_math_num_tokens": 31491856.0, |
| "eval_math_runtime": 35.8787, |
| "eval_math_samples_per_second": 13.936, |
| "eval_math_steps_per_second": 3.484, |
| "step": 1100 |
| }, |
| { |
| "entropy": 0.6336805198341608, |
| "epoch": 1.776, |
| "grad_norm": 1.3203125, |
| "learning_rate": 1.975777777777778e-05, |
| "loss": 0.6135, |
| "mean_token_accuracy": 0.8275633446872235, |
| "num_tokens": 31782829.0, |
| "step": 1110 |
| }, |
| { |
| "entropy": 0.6414820792153477, |
| "epoch": 1.792, |
| "grad_norm": 1.2734375, |
| "learning_rate": 1.9735555555555556e-05, |
| "loss": 0.6246, |
| "mean_token_accuracy": 0.8254438240081072, |
| "num_tokens": 32070187.0, |
| "step": 1120 |
| }, |
| { |
| "entropy": 0.6382445661351085, |
| "epoch": 1.808, |
| "grad_norm": 1.2890625, |
| "learning_rate": 1.9713333333333337e-05, |
| "loss": 0.6217, |
| "mean_token_accuracy": 0.8260911278426647, |
| "num_tokens": 32355072.0, |
| "step": 1130 |
| }, |
| { |
| "entropy": 0.6067643767222762, |
| "epoch": 1.8239999999999998, |
| "grad_norm": 1.1484375, |
| "learning_rate": 1.969111111111111e-05, |
| "loss": 0.5867, |
| "mean_token_accuracy": 0.8337960425764323, |
| "num_tokens": 32634013.0, |
| "step": 1140 |
| }, |
| { |
| "entropy": 0.6227916920557618, |
| "epoch": 1.8399999999999999, |
| "grad_norm": 1.34375, |
| "learning_rate": 1.9668888888888892e-05, |
| "loss": 0.6108, |
| "mean_token_accuracy": 0.8300515715032816, |
| "num_tokens": 32917222.0, |
| "step": 1150 |
| }, |
| { |
| "entropy": 0.6352459752932191, |
| "epoch": 1.8559999999999999, |
| "grad_norm": 1.265625, |
| "learning_rate": 1.9646666666666666e-05, |
| "loss": 0.613, |
| "mean_token_accuracy": 0.8282338980585336, |
| "num_tokens": 33207147.0, |
| "step": 1160 |
| }, |
| { |
| "entropy": 0.6206054732203483, |
| "epoch": 1.8719999999999999, |
| "grad_norm": 1.28125, |
| "learning_rate": 1.9624444444444447e-05, |
| "loss": 0.6023, |
| "mean_token_accuracy": 0.8303315650671721, |
| "num_tokens": 33496572.0, |
| "step": 1170 |
| }, |
| { |
| "entropy": 0.6307253973558545, |
| "epoch": 1.888, |
| "grad_norm": 1.3046875, |
| "learning_rate": 1.9602222222222225e-05, |
| "loss": 0.6134, |
| "mean_token_accuracy": 0.8282632239162921, |
| "num_tokens": 33775788.0, |
| "step": 1180 |
| }, |
| { |
| "entropy": 0.6315167531371116, |
| "epoch": 1.904, |
| "grad_norm": 1.2421875, |
| "learning_rate": 1.9580000000000002e-05, |
| "loss": 0.615, |
| "mean_token_accuracy": 0.8275392096489668, |
| "num_tokens": 34052322.0, |
| "step": 1190 |
| }, |
| { |
| "entropy": 0.6110374081879855, |
| "epoch": 1.92, |
| "grad_norm": 1.1796875, |
| "learning_rate": 1.955777777777778e-05, |
| "loss": 0.5959, |
| "mean_token_accuracy": 0.8313204348087311, |
| "num_tokens": 34331468.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.92, |
| "eval_chemistry_entropy": 0.7433430399894715, |
| "eval_chemistry_loss": 0.728480339050293, |
| "eval_chemistry_mean_token_accuracy": 0.8014561586380005, |
| "eval_chemistry_num_tokens": 34331468.0, |
| "eval_chemistry_runtime": 34.8484, |
| "eval_chemistry_samples_per_second": 14.348, |
| "eval_chemistry_steps_per_second": 3.587, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.92, |
| "eval_math_entropy": 0.6532090711593628, |
| "eval_math_loss": 0.6392548680305481, |
| "eval_math_mean_token_accuracy": 0.8242080550193787, |
| "eval_math_num_tokens": 34331468.0, |
| "eval_math_runtime": 35.8593, |
| "eval_math_samples_per_second": 13.943, |
| "eval_math_steps_per_second": 3.486, |
| "step": 1200 |
| }, |
| { |
| "entropy": 0.6277910789474845, |
| "epoch": 1.936, |
| "grad_norm": 1.2890625, |
| "learning_rate": 1.9535555555555557e-05, |
| "loss": 0.6122, |
| "mean_token_accuracy": 0.8281190965324641, |
| "num_tokens": 34612743.0, |
| "step": 1210 |
| }, |
| { |
| "entropy": 0.6204241087660194, |
| "epoch": 1.952, |
| "grad_norm": 1.1796875, |
| "learning_rate": 1.9513333333333335e-05, |
| "loss": 0.5964, |
| "mean_token_accuracy": 0.8305780492722988, |
| "num_tokens": 34891115.0, |
| "step": 1220 |
| }, |
| { |
| "entropy": 0.6416850406676531, |
| "epoch": 1.968, |
| "grad_norm": 2.8125, |
| "learning_rate": 1.9491111111111113e-05, |
| "loss": 0.6335, |
| "mean_token_accuracy": 0.8254266548901796, |
| "num_tokens": 35179505.0, |
| "step": 1230 |
| }, |
| { |
| "entropy": 0.6098293786868453, |
| "epoch": 1.984, |
| "grad_norm": 1.140625, |
| "learning_rate": 1.946888888888889e-05, |
| "loss": 0.5945, |
| "mean_token_accuracy": 0.8320291046053171, |
| "num_tokens": 35472596.0, |
| "step": 1240 |
| }, |
| { |
| "entropy": 0.6196965377777814, |
| "epoch": 2.0, |
| "grad_norm": 1.171875, |
| "learning_rate": 1.9446666666666668e-05, |
| "loss": 0.604, |
| "mean_token_accuracy": 0.8302332308143378, |
| "num_tokens": 35772848.0, |
| "step": 1250 |
| }, |
| { |
| "entropy": 0.5497036971151829, |
| "epoch": 2.016, |
| "grad_norm": 1.46875, |
| "learning_rate": 1.9424444444444446e-05, |
| "loss": 0.5255, |
| "mean_token_accuracy": 0.8438023224472999, |
| "num_tokens": 36060330.0, |
| "step": 1260 |
| }, |
| { |
| "entropy": 0.5297272937372327, |
| "epoch": 2.032, |
| "grad_norm": 1.46875, |
| "learning_rate": 1.9402222222222223e-05, |
| "loss": 0.5147, |
| "mean_token_accuracy": 0.8473685499280691, |
| "num_tokens": 36343894.0, |
| "step": 1270 |
| }, |
| { |
| "entropy": 0.5197363485582173, |
| "epoch": 2.048, |
| "grad_norm": 1.4296875, |
| "learning_rate": 1.938e-05, |
| "loss": 0.5054, |
| "mean_token_accuracy": 0.8493004187941551, |
| "num_tokens": 36636410.0, |
| "step": 1280 |
| }, |
| { |
| "entropy": 0.5172868834808468, |
| "epoch": 2.064, |
| "grad_norm": 1.328125, |
| "learning_rate": 1.935777777777778e-05, |
| "loss": 0.5014, |
| "mean_token_accuracy": 0.849420978501439, |
| "num_tokens": 36925382.0, |
| "step": 1290 |
| }, |
| { |
| "entropy": 0.5238144496455789, |
| "epoch": 2.08, |
| "grad_norm": 1.5, |
| "learning_rate": 1.9335555555555556e-05, |
| "loss": 0.5006, |
| "mean_token_accuracy": 0.8484534539282322, |
| "num_tokens": 37208672.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.08, |
| "eval_chemistry_entropy": 0.5902576115131378, |
| "eval_chemistry_loss": 0.7615314722061157, |
| "eval_chemistry_mean_token_accuracy": 0.7987616105079651, |
| "eval_chemistry_num_tokens": 37208672.0, |
| "eval_chemistry_runtime": 34.8214, |
| "eval_chemistry_samples_per_second": 14.359, |
| "eval_chemistry_steps_per_second": 3.59, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.08, |
| "eval_math_entropy": 0.5490774600505829, |
| "eval_math_loss": 0.6556233167648315, |
| "eval_math_mean_token_accuracy": 0.8236036248207093, |
| "eval_math_num_tokens": 37208672.0, |
| "eval_math_runtime": 35.898, |
| "eval_math_samples_per_second": 13.928, |
| "eval_math_steps_per_second": 3.482, |
| "step": 1300 |
| }, |
| { |
| "entropy": 0.5296811152249574, |
| "epoch": 2.096, |
| "grad_norm": 1.7265625, |
| "learning_rate": 1.9313333333333334e-05, |
| "loss": 0.5143, |
| "mean_token_accuracy": 0.8471574913710356, |
| "num_tokens": 37491954.0, |
| "step": 1310 |
| }, |
| { |
| "entropy": 0.5180984031409025, |
| "epoch": 2.112, |
| "grad_norm": 1.4375, |
| "learning_rate": 1.9291111111111115e-05, |
| "loss": 0.4988, |
| "mean_token_accuracy": 0.8482684683054685, |
| "num_tokens": 37787033.0, |
| "step": 1320 |
| }, |
| { |
| "entropy": 0.5158460404723882, |
| "epoch": 2.128, |
| "grad_norm": 1.4453125, |
| "learning_rate": 1.926888888888889e-05, |
| "loss": 0.4989, |
| "mean_token_accuracy": 0.8504650525748729, |
| "num_tokens": 38068955.0, |
| "step": 1330 |
| }, |
| { |
| "entropy": 0.5193909807130694, |
| "epoch": 2.144, |
| "grad_norm": 1.4140625, |
| "learning_rate": 1.924666666666667e-05, |
| "loss": 0.5058, |
| "mean_token_accuracy": 0.8479062043130398, |
| "num_tokens": 38359914.0, |
| "step": 1340 |
| }, |
| { |
| "entropy": 0.518505304865539, |
| "epoch": 2.16, |
| "grad_norm": 1.3203125, |
| "learning_rate": 1.9224444444444444e-05, |
| "loss": 0.5031, |
| "mean_token_accuracy": 0.8498795099556447, |
| "num_tokens": 38649684.0, |
| "step": 1350 |
| }, |
| { |
| "entropy": 0.49965119622647763, |
| "epoch": 2.176, |
| "grad_norm": 1.359375, |
| "learning_rate": 1.9202222222222225e-05, |
| "loss": 0.4821, |
| "mean_token_accuracy": 0.8528774298727513, |
| "num_tokens": 38938155.0, |
| "step": 1360 |
| }, |
| { |
| "entropy": 0.5324017994105816, |
| "epoch": 2.192, |
| "grad_norm": 1.3359375, |
| "learning_rate": 1.918e-05, |
| "loss": 0.5166, |
| "mean_token_accuracy": 0.8449586551636458, |
| "num_tokens": 39228800.0, |
| "step": 1370 |
| }, |
| { |
| "entropy": 0.523920483700931, |
| "epoch": 2.208, |
| "grad_norm": 1.5390625, |
| "learning_rate": 1.915777777777778e-05, |
| "loss": 0.5153, |
| "mean_token_accuracy": 0.8468315444886685, |
| "num_tokens": 39513118.0, |
| "step": 1380 |
| }, |
| { |
| "entropy": 0.5206425994634628, |
| "epoch": 2.224, |
| "grad_norm": 1.5546875, |
| "learning_rate": 1.9135555555555555e-05, |
| "loss": 0.5006, |
| "mean_token_accuracy": 0.8491694446653127, |
| "num_tokens": 39804212.0, |
| "step": 1390 |
| }, |
| { |
| "entropy": 0.5240093268454075, |
| "epoch": 2.24, |
| "grad_norm": 1.40625, |
| "learning_rate": 1.9113333333333336e-05, |
| "loss": 0.5127, |
| "mean_token_accuracy": 0.8465723715722561, |
| "num_tokens": 40092012.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.24, |
| "eval_chemistry_entropy": 0.5741701843738556, |
| "eval_chemistry_loss": 0.7734760642051697, |
| "eval_chemistry_mean_token_accuracy": 0.7974499335289001, |
| "eval_chemistry_num_tokens": 40092012.0, |
| "eval_chemistry_runtime": 34.8009, |
| "eval_chemistry_samples_per_second": 14.367, |
| "eval_chemistry_steps_per_second": 3.592, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.24, |
| "eval_math_entropy": 0.5538151891231536, |
| "eval_math_loss": 0.6544409394264221, |
| "eval_math_mean_token_accuracy": 0.8238605060577393, |
| "eval_math_num_tokens": 40092012.0, |
| "eval_math_runtime": 35.8968, |
| "eval_math_samples_per_second": 13.929, |
| "eval_math_steps_per_second": 3.482, |
| "step": 1400 |
| }, |
| { |
| "entropy": 0.5231095163151622, |
| "epoch": 2.2560000000000002, |
| "grad_norm": 1.46875, |
| "learning_rate": 1.9091111111111113e-05, |
| "loss": 0.5086, |
| "mean_token_accuracy": 0.8475595053285361, |
| "num_tokens": 40380069.0, |
| "step": 1410 |
| }, |
| { |
| "entropy": 0.5324242118746042, |
| "epoch": 2.2720000000000002, |
| "grad_norm": 1.6171875, |
| "learning_rate": 1.906888888888889e-05, |
| "loss": 0.5147, |
| "mean_token_accuracy": 0.8457300752401352, |
| "num_tokens": 40655903.0, |
| "step": 1420 |
| }, |
| { |
| "entropy": 0.5270635643973947, |
| "epoch": 2.288, |
| "grad_norm": 1.484375, |
| "learning_rate": 1.904666666666667e-05, |
| "loss": 0.51, |
| "mean_token_accuracy": 0.8453500598669053, |
| "num_tokens": 40944791.0, |
| "step": 1430 |
| }, |
| { |
| "entropy": 0.5129735874012112, |
| "epoch": 2.304, |
| "grad_norm": 1.5390625, |
| "learning_rate": 1.9024444444444446e-05, |
| "loss": 0.4984, |
| "mean_token_accuracy": 0.8505097340792418, |
| "num_tokens": 41227146.0, |
| "step": 1440 |
| }, |
| { |
| "entropy": 0.5308720570988953, |
| "epoch": 2.32, |
| "grad_norm": 1.5, |
| "learning_rate": 1.9002222222222224e-05, |
| "loss": 0.5197, |
| "mean_token_accuracy": 0.8463008664548397, |
| "num_tokens": 41509322.0, |
| "step": 1450 |
| }, |
| { |
| "entropy": 0.5264440540224313, |
| "epoch": 2.336, |
| "grad_norm": 1.4609375, |
| "learning_rate": 1.898e-05, |
| "loss": 0.5068, |
| "mean_token_accuracy": 0.8473060473799705, |
| "num_tokens": 41792799.0, |
| "step": 1460 |
| }, |
| { |
| "entropy": 0.5393752640113234, |
| "epoch": 2.352, |
| "grad_norm": 1.4609375, |
| "learning_rate": 1.895777777777778e-05, |
| "loss": 0.5226, |
| "mean_token_accuracy": 0.8433732774108649, |
| "num_tokens": 42083080.0, |
| "step": 1470 |
| }, |
| { |
| "entropy": 0.5265612868592143, |
| "epoch": 2.368, |
| "grad_norm": 1.4921875, |
| "learning_rate": 1.8935555555555556e-05, |
| "loss": 0.5118, |
| "mean_token_accuracy": 0.8467371355742216, |
| "num_tokens": 42368442.0, |
| "step": 1480 |
| }, |
| { |
| "entropy": 0.5453734394162894, |
| "epoch": 2.384, |
| "grad_norm": 1.7578125, |
| "learning_rate": 1.8913333333333334e-05, |
| "loss": 0.5308, |
| "mean_token_accuracy": 0.8432964202016592, |
| "num_tokens": 42663926.0, |
| "step": 1490 |
| }, |
| { |
| "entropy": 0.5337216904386878, |
| "epoch": 2.4, |
| "grad_norm": 1.8203125, |
| "learning_rate": 1.8891111111111115e-05, |
| "loss": 0.5215, |
| "mean_token_accuracy": 0.8452994517982007, |
| "num_tokens": 42946351.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.4, |
| "eval_chemistry_entropy": 0.5733621008396149, |
| "eval_chemistry_loss": 0.7763350009918213, |
| "eval_chemistry_mean_token_accuracy": 0.7971291627883911, |
| "eval_chemistry_num_tokens": 42946351.0, |
| "eval_chemistry_runtime": 34.7936, |
| "eval_chemistry_samples_per_second": 14.37, |
| "eval_chemistry_steps_per_second": 3.593, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.4, |
| "eval_math_entropy": 0.5494145798683167, |
| "eval_math_loss": 0.6489285826683044, |
| "eval_math_mean_token_accuracy": 0.8246520318984986, |
| "eval_math_num_tokens": 42946351.0, |
| "eval_math_runtime": 35.4869, |
| "eval_math_samples_per_second": 14.09, |
| "eval_math_steps_per_second": 3.522, |
| "step": 1500 |
| }, |
| { |
| "entropy": 0.5415687510743737, |
| "epoch": 2.416, |
| "grad_norm": 1.484375, |
| "learning_rate": 1.886888888888889e-05, |
| "loss": 0.5263, |
| "mean_token_accuracy": 0.8432823572307825, |
| "num_tokens": 43227782.0, |
| "step": 1510 |
| }, |
| { |
| "entropy": 0.5234041666612029, |
| "epoch": 2.432, |
| "grad_norm": 1.421875, |
| "learning_rate": 1.884666666666667e-05, |
| "loss": 0.5124, |
| "mean_token_accuracy": 0.8473271746188402, |
| "num_tokens": 43511939.0, |
| "step": 1520 |
| }, |
| { |
| "entropy": 0.5191137973219156, |
| "epoch": 2.448, |
| "grad_norm": 1.5703125, |
| "learning_rate": 1.8824444444444445e-05, |
| "loss": 0.5039, |
| "mean_token_accuracy": 0.8494564235210419, |
| "num_tokens": 43811005.0, |
| "step": 1530 |
| }, |
| { |
| "entropy": 0.5308497181162238, |
| "epoch": 2.464, |
| "grad_norm": 1.59375, |
| "learning_rate": 1.8802222222222226e-05, |
| "loss": 0.5174, |
| "mean_token_accuracy": 0.8456704583019018, |
| "num_tokens": 44106223.0, |
| "step": 1540 |
| }, |
| { |
| "entropy": 0.5506666025146842, |
| "epoch": 2.48, |
| "grad_norm": 1.53125, |
| "learning_rate": 1.878e-05, |
| "loss": 0.5316, |
| "mean_token_accuracy": 0.8412250328809023, |
| "num_tokens": 44379095.0, |
| "step": 1550 |
| }, |
| { |
| "entropy": 0.5348324475809931, |
| "epoch": 2.496, |
| "grad_norm": 1.4296875, |
| "learning_rate": 1.875777777777778e-05, |
| "loss": 0.52, |
| "mean_token_accuracy": 0.8440562028437852, |
| "num_tokens": 44670316.0, |
| "step": 1560 |
| }, |
| { |
| "entropy": 0.5286164516583085, |
| "epoch": 2.512, |
| "grad_norm": 1.546875, |
| "learning_rate": 1.873555555555556e-05, |
| "loss": 0.516, |
| "mean_token_accuracy": 0.8460781283676624, |
| "num_tokens": 44952877.0, |
| "step": 1570 |
| }, |
| { |
| "entropy": 0.5242383845150471, |
| "epoch": 2.528, |
| "grad_norm": 1.4453125, |
| "learning_rate": 1.8713333333333336e-05, |
| "loss": 0.5084, |
| "mean_token_accuracy": 0.8474905800074339, |
| "num_tokens": 45237653.0, |
| "step": 1580 |
| }, |
| { |
| "entropy": 0.5217398202046752, |
| "epoch": 2.544, |
| "grad_norm": 1.625, |
| "learning_rate": 1.8691111111111114e-05, |
| "loss": 0.51, |
| "mean_token_accuracy": 0.8485121335834265, |
| "num_tokens": 45524866.0, |
| "step": 1590 |
| }, |
| { |
| "entropy": 0.5217585685662925, |
| "epoch": 2.56, |
| "grad_norm": 1.40625, |
| "learning_rate": 1.866888888888889e-05, |
| "loss": 0.504, |
| "mean_token_accuracy": 0.8478538550436496, |
| "num_tokens": 45803783.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.56, |
| "eval_chemistry_entropy": 0.5687358362674713, |
| "eval_chemistry_loss": 0.7774361371994019, |
| "eval_chemistry_mean_token_accuracy": 0.7964602584838867, |
| "eval_chemistry_num_tokens": 45803783.0, |
| "eval_chemistry_runtime": 34.792, |
| "eval_chemistry_samples_per_second": 14.371, |
| "eval_chemistry_steps_per_second": 3.593, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.56, |
| "eval_math_entropy": 0.5417093946933746, |
| "eval_math_loss": 0.6447312831878662, |
| "eval_math_mean_token_accuracy": 0.8262028379440307, |
| "eval_math_num_tokens": 45803783.0, |
| "eval_math_runtime": 35.8206, |
| "eval_math_samples_per_second": 13.958, |
| "eval_math_steps_per_second": 3.49, |
| "step": 1600 |
| }, |
| { |
| "entropy": 0.5245849631726742, |
| "epoch": 2.576, |
| "grad_norm": 1.4375, |
| "learning_rate": 1.864666666666667e-05, |
| "loss": 0.5095, |
| "mean_token_accuracy": 0.8479252554476261, |
| "num_tokens": 46090713.0, |
| "step": 1610 |
| }, |
| { |
| "entropy": 0.5172529483214021, |
| "epoch": 2.592, |
| "grad_norm": 1.3359375, |
| "learning_rate": 1.8624444444444446e-05, |
| "loss": 0.5036, |
| "mean_token_accuracy": 0.8490103390067816, |
| "num_tokens": 46384084.0, |
| "step": 1620 |
| }, |
| { |
| "entropy": 0.5333136133849621, |
| "epoch": 2.608, |
| "grad_norm": 1.625, |
| "learning_rate": 1.8602222222222224e-05, |
| "loss": 0.5216, |
| "mean_token_accuracy": 0.8443999428302049, |
| "num_tokens": 46667552.0, |
| "step": 1630 |
| }, |
| { |
| "entropy": 0.49988405164331196, |
| "epoch": 2.624, |
| "grad_norm": 1.453125, |
| "learning_rate": 1.858e-05, |
| "loss": 0.4842, |
| "mean_token_accuracy": 0.8543243549764157, |
| "num_tokens": 46961124.0, |
| "step": 1640 |
| }, |
| { |
| "entropy": 0.5301500145345927, |
| "epoch": 2.64, |
| "grad_norm": 1.5546875, |
| "learning_rate": 1.855777777777778e-05, |
| "loss": 0.518, |
| "mean_token_accuracy": 0.8460444647818803, |
| "num_tokens": 47245326.0, |
| "step": 1650 |
| }, |
| { |
| "entropy": 0.5089625578373671, |
| "epoch": 2.656, |
| "grad_norm": 1.578125, |
| "learning_rate": 1.8535555555555557e-05, |
| "loss": 0.4942, |
| "mean_token_accuracy": 0.8499364998191595, |
| "num_tokens": 47527347.0, |
| "step": 1660 |
| }, |
| { |
| "entropy": 0.5320184031501413, |
| "epoch": 2.672, |
| "grad_norm": 1.6875, |
| "learning_rate": 1.8513333333333335e-05, |
| "loss": 0.5159, |
| "mean_token_accuracy": 0.8455484293401241, |
| "num_tokens": 47808167.0, |
| "step": 1670 |
| }, |
| { |
| "entropy": 0.5261984881013632, |
| "epoch": 2.6879999999999997, |
| "grad_norm": 1.4140625, |
| "learning_rate": 1.8491111111111112e-05, |
| "loss": 0.5096, |
| "mean_token_accuracy": 0.8478115413337945, |
| "num_tokens": 48092035.0, |
| "step": 1680 |
| }, |
| { |
| "entropy": 0.5308143127709627, |
| "epoch": 2.7039999999999997, |
| "grad_norm": 1.609375, |
| "learning_rate": 1.846888888888889e-05, |
| "loss": 0.5208, |
| "mean_token_accuracy": 0.8449086494743824, |
| "num_tokens": 48386231.0, |
| "step": 1690 |
| }, |
| { |
| "entropy": 0.522861585021019, |
| "epoch": 2.7199999999999998, |
| "grad_norm": 1.5078125, |
| "learning_rate": 1.8446666666666667e-05, |
| "loss": 0.5101, |
| "mean_token_accuracy": 0.8472402919083833, |
| "num_tokens": 48663262.0, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.7199999999999998, |
| "eval_chemistry_entropy": 0.5746280739307403, |
| "eval_chemistry_loss": 0.7798963785171509, |
| "eval_chemistry_mean_token_accuracy": 0.7963899078369141, |
| "eval_chemistry_num_tokens": 48663262.0, |
| "eval_chemistry_runtime": 34.7443, |
| "eval_chemistry_samples_per_second": 14.391, |
| "eval_chemistry_steps_per_second": 3.598, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.7199999999999998, |
| "eval_math_entropy": 0.5586551280021668, |
| "eval_math_loss": 0.6379777789115906, |
| "eval_math_mean_token_accuracy": 0.8271588444709778, |
| "eval_math_num_tokens": 48663262.0, |
| "eval_math_runtime": 35.8034, |
| "eval_math_samples_per_second": 13.965, |
| "eval_math_steps_per_second": 3.491, |
| "step": 1700 |
| }, |
| { |
| "entropy": 0.5201129943132401, |
| "epoch": 2.7359999999999998, |
| "grad_norm": 1.4921875, |
| "learning_rate": 1.842444444444445e-05, |
| "loss": 0.5075, |
| "mean_token_accuracy": 0.8486686907708645, |
| "num_tokens": 48949456.0, |
| "step": 1710 |
| }, |
| { |
| "entropy": 0.5246327675879001, |
| "epoch": 2.752, |
| "grad_norm": 1.5078125, |
| "learning_rate": 1.8402222222222223e-05, |
| "loss": 0.5068, |
| "mean_token_accuracy": 0.8480775609612465, |
| "num_tokens": 49235751.0, |
| "step": 1720 |
| }, |
| { |
| "entropy": 0.5258618659339845, |
| "epoch": 2.768, |
| "grad_norm": 1.4453125, |
| "learning_rate": 1.8380000000000004e-05, |
| "loss": 0.5131, |
| "mean_token_accuracy": 0.8470302954316139, |
| "num_tokens": 49524628.0, |
| "step": 1730 |
| }, |
| { |
| "entropy": 0.5411237273365259, |
| "epoch": 2.784, |
| "grad_norm": 1.546875, |
| "learning_rate": 1.8357777777777778e-05, |
| "loss": 0.5228, |
| "mean_token_accuracy": 0.8433229614049196, |
| "num_tokens": 49799604.0, |
| "step": 1740 |
| }, |
| { |
| "entropy": 0.522053999081254, |
| "epoch": 2.8, |
| "grad_norm": 1.28125, |
| "learning_rate": 1.833555555555556e-05, |
| "loss": 0.509, |
| "mean_token_accuracy": 0.8484720811247826, |
| "num_tokens": 50079307.0, |
| "step": 1750 |
| }, |
| { |
| "entropy": 0.5252829389646649, |
| "epoch": 2.816, |
| "grad_norm": 1.5234375, |
| "learning_rate": 1.8313333333333333e-05, |
| "loss": 0.5056, |
| "mean_token_accuracy": 0.8476968578994274, |
| "num_tokens": 50372216.0, |
| "step": 1760 |
| }, |
| { |
| "entropy": 0.5136345084756613, |
| "epoch": 2.832, |
| "grad_norm": 1.59375, |
| "learning_rate": 1.8291111111111114e-05, |
| "loss": 0.5037, |
| "mean_token_accuracy": 0.8490433126688004, |
| "num_tokens": 50657211.0, |
| "step": 1770 |
| }, |
| { |
| "entropy": 0.5356445843353868, |
| "epoch": 2.848, |
| "grad_norm": 1.46875, |
| "learning_rate": 1.8268888888888888e-05, |
| "loss": 0.5228, |
| "mean_token_accuracy": 0.8450450662523508, |
| "num_tokens": 50946503.0, |
| "step": 1780 |
| }, |
| { |
| "entropy": 0.5204181535169482, |
| "epoch": 2.864, |
| "grad_norm": 1.609375, |
| "learning_rate": 1.824666666666667e-05, |
| "loss": 0.5087, |
| "mean_token_accuracy": 0.8479438953101635, |
| "num_tokens": 51229298.0, |
| "step": 1790 |
| }, |
| { |
| "entropy": 0.5238792803138494, |
| "epoch": 2.88, |
| "grad_norm": 1.5234375, |
| "learning_rate": 1.8224444444444447e-05, |
| "loss": 0.5047, |
| "mean_token_accuracy": 0.8475392743945122, |
| "num_tokens": 51509622.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 2.88, |
| "eval_chemistry_entropy": 0.5654533207416534, |
| "eval_chemistry_loss": 0.7800628542900085, |
| "eval_chemistry_mean_token_accuracy": 0.7969496622085571, |
| "eval_chemistry_num_tokens": 51509622.0, |
| "eval_chemistry_runtime": 34.7126, |
| "eval_chemistry_samples_per_second": 14.404, |
| "eval_chemistry_steps_per_second": 3.601, |
| "step": 1800 |
| }, |
| { |
| "epoch": 2.88, |
| "eval_math_entropy": 0.538058566570282, |
| "eval_math_loss": 0.636343240737915, |
| "eval_math_mean_token_accuracy": 0.8282605667114258, |
| "eval_math_num_tokens": 51509622.0, |
| "eval_math_runtime": 35.7752, |
| "eval_math_samples_per_second": 13.976, |
| "eval_math_steps_per_second": 3.494, |
| "step": 1800 |
| }, |
| { |
| "entropy": 0.5157067686319351, |
| "epoch": 2.896, |
| "grad_norm": 1.578125, |
| "learning_rate": 1.8202222222222225e-05, |
| "loss": 0.5005, |
| "mean_token_accuracy": 0.8493956170976162, |
| "num_tokens": 51790856.0, |
| "step": 1810 |
| }, |
| { |
| "entropy": 0.5220442852005363, |
| "epoch": 2.912, |
| "grad_norm": 1.5390625, |
| "learning_rate": 1.8180000000000002e-05, |
| "loss": 0.5082, |
| "mean_token_accuracy": 0.8484512511640787, |
| "num_tokens": 52074393.0, |
| "step": 1820 |
| }, |
| { |
| "entropy": 0.5298560874536633, |
| "epoch": 2.928, |
| "grad_norm": 1.5, |
| "learning_rate": 1.815777777777778e-05, |
| "loss": 0.5177, |
| "mean_token_accuracy": 0.8467922162264585, |
| "num_tokens": 52366467.0, |
| "step": 1830 |
| }, |
| { |
| "entropy": 0.5144043141975999, |
| "epoch": 2.944, |
| "grad_norm": 1.4765625, |
| "learning_rate": 1.8135555555555557e-05, |
| "loss": 0.5041, |
| "mean_token_accuracy": 0.848617946729064, |
| "num_tokens": 52657863.0, |
| "step": 1840 |
| }, |
| { |
| "entropy": 0.5239851342514157, |
| "epoch": 2.96, |
| "grad_norm": 1.5078125, |
| "learning_rate": 1.8113333333333335e-05, |
| "loss": 0.5086, |
| "mean_token_accuracy": 0.8469469167292119, |
| "num_tokens": 52947183.0, |
| "step": 1850 |
| }, |
| { |
| "entropy": 0.5214887933805585, |
| "epoch": 2.976, |
| "grad_norm": 1.390625, |
| "learning_rate": 1.8091111111111113e-05, |
| "loss": 0.5049, |
| "mean_token_accuracy": 0.8478761337697506, |
| "num_tokens": 53226984.0, |
| "step": 1860 |
| }, |
| { |
| "entropy": 0.5146605940535665, |
| "epoch": 2.992, |
| "grad_norm": 1.46875, |
| "learning_rate": 1.806888888888889e-05, |
| "loss": 0.4994, |
| "mean_token_accuracy": 0.8494556181132793, |
| "num_tokens": 53515204.0, |
| "step": 1870 |
| }, |
| { |
| "entropy": 0.47540333569049836, |
| "epoch": 3.008, |
| "grad_norm": 2.078125, |
| "learning_rate": 1.8046666666666668e-05, |
| "loss": 0.436, |
| "mean_token_accuracy": 0.8662762742489576, |
| "num_tokens": 53800592.0, |
| "step": 1880 |
| }, |
| { |
| "entropy": 0.35706824259832504, |
| "epoch": 3.024, |
| "grad_norm": 1.921875, |
| "learning_rate": 1.8024444444444445e-05, |
| "loss": 0.3438, |
| "mean_token_accuracy": 0.8883892893791199, |
| "num_tokens": 54092017.0, |
| "step": 1890 |
| }, |
| { |
| "entropy": 0.3444691884331405, |
| "epoch": 3.04, |
| "grad_norm": 1.90625, |
| "learning_rate": 1.8002222222222223e-05, |
| "loss": 0.3242, |
| "mean_token_accuracy": 0.8934439823031426, |
| "num_tokens": 54380158.0, |
| "step": 1900 |
| }, |
| { |
| "epoch": 3.04, |
| "eval_chemistry_entropy": 0.4136744683980942, |
| "eval_chemistry_loss": 0.9674221873283386, |
| "eval_chemistry_mean_token_accuracy": 0.785107521533966, |
| "eval_chemistry_num_tokens": 54380158.0, |
| "eval_chemistry_runtime": 34.7442, |
| "eval_chemistry_samples_per_second": 14.391, |
| "eval_chemistry_steps_per_second": 3.598, |
| "step": 1900 |
| }, |
| { |
| "epoch": 3.04, |
| "eval_math_entropy": 0.40369147157669066, |
| "eval_math_loss": 0.7466345429420471, |
| "eval_math_mean_token_accuracy": 0.8216371216773987, |
| "eval_math_num_tokens": 54380158.0, |
| "eval_math_runtime": 36.3078, |
| "eval_math_samples_per_second": 13.771, |
| "eval_math_steps_per_second": 3.443, |
| "step": 1900 |
| }, |
| { |
| "entropy": 0.3425462535582483, |
| "epoch": 3.056, |
| "grad_norm": 2.015625, |
| "learning_rate": 1.798e-05, |
| "loss": 0.3194, |
| "mean_token_accuracy": 0.894545292109251, |
| "num_tokens": 54658093.0, |
| "step": 1910 |
| }, |
| { |
| "entropy": 0.3412907443009317, |
| "epoch": 3.072, |
| "grad_norm": 2.015625, |
| "learning_rate": 1.7957777777777778e-05, |
| "loss": 0.3209, |
| "mean_token_accuracy": 0.8948256101459264, |
| "num_tokens": 54949938.0, |
| "step": 1920 |
| }, |
| { |
| "entropy": 0.32589660566300155, |
| "epoch": 3.088, |
| "grad_norm": 2.1875, |
| "learning_rate": 1.7935555555555556e-05, |
| "loss": 0.3083, |
| "mean_token_accuracy": 0.8980491489171982, |
| "num_tokens": 55234643.0, |
| "step": 1930 |
| }, |
| { |
| "entropy": 0.34667133893817664, |
| "epoch": 3.104, |
| "grad_norm": 2.015625, |
| "learning_rate": 1.7913333333333337e-05, |
| "loss": 0.3253, |
| "mean_token_accuracy": 0.8923853240907192, |
| "num_tokens": 55519679.0, |
| "step": 1940 |
| }, |
| { |
| "entropy": 0.33345927773043516, |
| "epoch": 3.12, |
| "grad_norm": 1.984375, |
| "learning_rate": 1.789111111111111e-05, |
| "loss": 0.3172, |
| "mean_token_accuracy": 0.8965467110276222, |
| "num_tokens": 55812397.0, |
| "step": 1950 |
| }, |
| { |
| "entropy": 0.33698156690225006, |
| "epoch": 3.136, |
| "grad_norm": 1.8984375, |
| "learning_rate": 1.7868888888888892e-05, |
| "loss": 0.319, |
| "mean_token_accuracy": 0.8949916884303093, |
| "num_tokens": 56101550.0, |
| "step": 1960 |
| }, |
| { |
| "entropy": 0.3372134535573423, |
| "epoch": 3.152, |
| "grad_norm": 2.140625, |
| "learning_rate": 1.7846666666666666e-05, |
| "loss": 0.3227, |
| "mean_token_accuracy": 0.8940825492143631, |
| "num_tokens": 56387662.0, |
| "step": 1970 |
| }, |
| { |
| "entropy": 0.3512561682611704, |
| "epoch": 3.168, |
| "grad_norm": 2.109375, |
| "learning_rate": 1.7824444444444447e-05, |
| "loss": 0.3342, |
| "mean_token_accuracy": 0.8908473681658506, |
| "num_tokens": 56673426.0, |
| "step": 1980 |
| }, |
| { |
| "entropy": 0.3502405107952654, |
| "epoch": 3.184, |
| "grad_norm": 2.296875, |
| "learning_rate": 1.780222222222222e-05, |
| "loss": 0.3273, |
| "mean_token_accuracy": 0.8911666806787253, |
| "num_tokens": 56957044.0, |
| "step": 1990 |
| }, |
| { |
| "entropy": 0.33547057397663593, |
| "epoch": 3.2, |
| "grad_norm": 1.96875, |
| "learning_rate": 1.7780000000000003e-05, |
| "loss": 0.3184, |
| "mean_token_accuracy": 0.8955803520977497, |
| "num_tokens": 57250097.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 3.2, |
| "eval_chemistry_entropy": 0.3912388117313385, |
| "eval_chemistry_loss": 0.9860958456993103, |
| "eval_chemistry_mean_token_accuracy": 0.7855022087097168, |
| "eval_chemistry_num_tokens": 57250097.0, |
| "eval_chemistry_runtime": 34.752, |
| "eval_chemistry_samples_per_second": 14.388, |
| "eval_chemistry_steps_per_second": 3.597, |
| "step": 2000 |
| }, |
| { |
| "epoch": 3.2, |
| "eval_math_entropy": 0.3877128950357437, |
| "eval_math_loss": 0.7659533023834229, |
| "eval_math_mean_token_accuracy": 0.8217745156288147, |
| "eval_math_num_tokens": 57250097.0, |
| "eval_math_runtime": 35.4631, |
| "eval_math_samples_per_second": 14.099, |
| "eval_math_steps_per_second": 3.525, |
| "step": 2000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 10000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 16, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.343629994565034e+18, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|