| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.8901746967842439, |
| "eval_steps": 100, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 0.8410565290600062, |
| "epoch": 0.017803493935684877, |
| "grad_norm": 41.25, |
| "learning_rate": 1.8e-07, |
| "loss": 14.231, |
| "mean_token_accuracy": 0.766436281055212, |
| "num_tokens": 257189.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 0.8121558612212538, |
| "epoch": 0.035606987871369754, |
| "grad_norm": 43.25, |
| "learning_rate": 3.8e-07, |
| "loss": 13.7028, |
| "mean_token_accuracy": 0.7704860582947731, |
| "num_tokens": 528285.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 0.8321617901325226, |
| "epoch": 0.053410481807054634, |
| "grad_norm": 38.75, |
| "learning_rate": 5.800000000000001e-07, |
| "loss": 14.0203, |
| "mean_token_accuracy": 0.7678851690143347, |
| "num_tokens": 801635.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 0.8524315822869539, |
| "epoch": 0.07121397574273951, |
| "grad_norm": 37.75, |
| "learning_rate": 7.8e-07, |
| "loss": 14.3071, |
| "mean_token_accuracy": 0.7648834150284529, |
| "num_tokens": 1060812.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.8403167355805635, |
| "epoch": 0.08901746967842439, |
| "grad_norm": 34.5, |
| "learning_rate": 9.800000000000001e-07, |
| "loss": 14.0049, |
| "mean_token_accuracy": 0.7667405389249324, |
| "num_tokens": 1327380.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.8425804980099201, |
| "epoch": 0.10682096361410927, |
| "grad_norm": 34.25, |
| "learning_rate": 1.1800000000000001e-06, |
| "loss": 13.9802, |
| "mean_token_accuracy": 0.7680465068668128, |
| "num_tokens": 1597405.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.8755916632711888, |
| "epoch": 0.12462445754979415, |
| "grad_norm": 32.25, |
| "learning_rate": 1.3800000000000001e-06, |
| "loss": 14.4557, |
| "mean_token_accuracy": 0.7614609662443399, |
| "num_tokens": 1859684.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.8682003870606423, |
| "epoch": 0.14242795148547902, |
| "grad_norm": 30.375, |
| "learning_rate": 1.5800000000000001e-06, |
| "loss": 14.1174, |
| "mean_token_accuracy": 0.7653359699994325, |
| "num_tokens": 2118068.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.8360268581658602, |
| "epoch": 0.1602314454211639, |
| "grad_norm": 28.375, |
| "learning_rate": 1.7800000000000001e-06, |
| "loss": 13.6773, |
| "mean_token_accuracy": 0.7724899325519801, |
| "num_tokens": 2388824.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 0.8634540975093842, |
| "epoch": 0.17803493935684878, |
| "grad_norm": 34.0, |
| "learning_rate": 1.98e-06, |
| "loss": 14.0689, |
| "mean_token_accuracy": 0.7672080259770155, |
| "num_tokens": 2644330.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.17803493935684878, |
| "eval_biology_entropy": 1.128741333961487, |
| "eval_biology_loss": 1.1601769924163818, |
| "eval_biology_mean_token_accuracy": 0.7042496418952942, |
| "eval_biology_num_tokens": 2644330.0, |
| "eval_biology_runtime": 54.4222, |
| "eval_biology_samples_per_second": 9.187, |
| "eval_biology_steps_per_second": 2.297, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.17803493935684878, |
| "eval_chemistry_entropy": 0.8544474849700928, |
| "eval_chemistry_loss": 0.8545747995376587, |
| "eval_chemistry_mean_token_accuracy": 0.7713311352729797, |
| "eval_chemistry_num_tokens": 2644330.0, |
| "eval_chemistry_runtime": 65.3258, |
| "eval_chemistry_samples_per_second": 7.654, |
| "eval_chemistry_steps_per_second": 1.913, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.8453936260193586, |
| "epoch": 0.19583843329253367, |
| "grad_norm": 29.5, |
| "learning_rate": 2.1800000000000003e-06, |
| "loss": 13.6681, |
| "mean_token_accuracy": 0.7730784911662341, |
| "num_tokens": 2913700.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 0.8460511896759272, |
| "epoch": 0.21364192722821854, |
| "grad_norm": 27.625, |
| "learning_rate": 2.38e-06, |
| "loss": 13.728, |
| "mean_token_accuracy": 0.7713178683072328, |
| "num_tokens": 3185255.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.8403830077499151, |
| "epoch": 0.2314454211639034, |
| "grad_norm": 28.25, |
| "learning_rate": 2.5800000000000003e-06, |
| "loss": 13.5642, |
| "mean_token_accuracy": 0.7730850588530302, |
| "num_tokens": 3454750.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 0.8515023712068797, |
| "epoch": 0.2492489150995883, |
| "grad_norm": 30.0, |
| "learning_rate": 2.7800000000000005e-06, |
| "loss": 13.8032, |
| "mean_token_accuracy": 0.7721988521516323, |
| "num_tokens": 3719113.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.8474330805242062, |
| "epoch": 0.26705240903527316, |
| "grad_norm": 27.375, |
| "learning_rate": 2.9800000000000003e-06, |
| "loss": 13.728, |
| "mean_token_accuracy": 0.7698215767741203, |
| "num_tokens": 3990505.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.8292176539078355, |
| "epoch": 0.28485590297095803, |
| "grad_norm": 24.625, |
| "learning_rate": 3.1800000000000005e-06, |
| "loss": 13.3391, |
| "mean_token_accuracy": 0.776393149420619, |
| "num_tokens": 4267403.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 0.8280835278332234, |
| "epoch": 0.30265939690664295, |
| "grad_norm": 29.875, |
| "learning_rate": 3.3800000000000007e-06, |
| "loss": 13.3175, |
| "mean_token_accuracy": 0.7770637154579163, |
| "num_tokens": 4535458.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 0.842758315615356, |
| "epoch": 0.3204628908423278, |
| "grad_norm": 27.5, |
| "learning_rate": 3.58e-06, |
| "loss": 13.6062, |
| "mean_token_accuracy": 0.7731371156871318, |
| "num_tokens": 4796815.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 0.8247860476374627, |
| "epoch": 0.3382663847780127, |
| "grad_norm": 22.75, |
| "learning_rate": 3.7800000000000002e-06, |
| "loss": 13.2445, |
| "mean_token_accuracy": 0.7772165313363075, |
| "num_tokens": 5066948.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 0.8495354067534209, |
| "epoch": 0.35606987871369755, |
| "grad_norm": 26.125, |
| "learning_rate": 3.980000000000001e-06, |
| "loss": 13.7137, |
| "mean_token_accuracy": 0.7696468211710453, |
| "num_tokens": 5324751.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.35606987871369755, |
| "eval_biology_entropy": 1.1391102423667907, |
| "eval_biology_loss": 1.1645851135253906, |
| "eval_biology_mean_token_accuracy": 0.7036963219642639, |
| "eval_biology_num_tokens": 5324751.0, |
| "eval_biology_runtime": 47.5522, |
| "eval_biology_samples_per_second": 10.515, |
| "eval_biology_steps_per_second": 2.629, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.35606987871369755, |
| "eval_chemistry_entropy": 0.8357059621810913, |
| "eval_chemistry_loss": 0.8271888494491577, |
| "eval_chemistry_mean_token_accuracy": 0.7765080814361572, |
| "eval_chemistry_num_tokens": 5324751.0, |
| "eval_chemistry_runtime": 58.2133, |
| "eval_chemistry_samples_per_second": 8.589, |
| "eval_chemistry_steps_per_second": 2.147, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.8138596788048744, |
| "epoch": 0.3738733726493824, |
| "grad_norm": 27.875, |
| "learning_rate": 4.18e-06, |
| "loss": 13.0337, |
| "mean_token_accuracy": 0.7797718059271574, |
| "num_tokens": 5585508.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 0.8215312957763672, |
| "epoch": 0.39167686658506734, |
| "grad_norm": 25.75, |
| "learning_rate": 4.38e-06, |
| "loss": 13.2224, |
| "mean_token_accuracy": 0.7784637857228518, |
| "num_tokens": 5848889.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 0.8100055737420917, |
| "epoch": 0.4094803605207522, |
| "grad_norm": 27.625, |
| "learning_rate": 4.58e-06, |
| "loss": 13.0714, |
| "mean_token_accuracy": 0.7809950839728117, |
| "num_tokens": 6114855.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 0.8117449183017016, |
| "epoch": 0.4272838544564371, |
| "grad_norm": 25.875, |
| "learning_rate": 4.78e-06, |
| "loss": 13.0914, |
| "mean_token_accuracy": 0.7797230206429958, |
| "num_tokens": 6378152.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 0.8300687098875642, |
| "epoch": 0.44508734839212194, |
| "grad_norm": 24.5, |
| "learning_rate": 4.980000000000001e-06, |
| "loss": 13.3199, |
| "mean_token_accuracy": 0.7753712415695191, |
| "num_tokens": 6637273.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.8000802919268608, |
| "epoch": 0.4628908423278068, |
| "grad_norm": 23.75, |
| "learning_rate": 5.18e-06, |
| "loss": 12.8911, |
| "mean_token_accuracy": 0.7822641927748919, |
| "num_tokens": 6896684.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 0.8173866732046008, |
| "epoch": 0.48069433626349173, |
| "grad_norm": 25.75, |
| "learning_rate": 5.380000000000001e-06, |
| "loss": 13.1839, |
| "mean_token_accuracy": 0.7791078709065914, |
| "num_tokens": 7166608.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 0.7851757485419512, |
| "epoch": 0.4984978301991766, |
| "grad_norm": 23.125, |
| "learning_rate": 5.580000000000001e-06, |
| "loss": 12.5647, |
| "mean_token_accuracy": 0.7872234936803579, |
| "num_tokens": 7444923.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 0.8196133345365524, |
| "epoch": 0.5163013241348615, |
| "grad_norm": 23.75, |
| "learning_rate": 5.78e-06, |
| "loss": 13.195, |
| "mean_token_accuracy": 0.7780409008264542, |
| "num_tokens": 7706502.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 0.8108824253082275, |
| "epoch": 0.5341048180705463, |
| "grad_norm": 25.75, |
| "learning_rate": 5.98e-06, |
| "loss": 13.1243, |
| "mean_token_accuracy": 0.7791608296334743, |
| "num_tokens": 7969704.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.5341048180705463, |
| "eval_biology_entropy": 1.1140506463050843, |
| "eval_biology_loss": 1.1709669828414917, |
| "eval_biology_mean_token_accuracy": 0.7033902740478516, |
| "eval_biology_num_tokens": 7969704.0, |
| "eval_biology_runtime": 47.6418, |
| "eval_biology_samples_per_second": 10.495, |
| "eval_biology_steps_per_second": 2.624, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.5341048180705463, |
| "eval_chemistry_entropy": 0.7940924577713012, |
| "eval_chemistry_loss": 0.8012509942054749, |
| "eval_chemistry_mean_token_accuracy": 0.7820386853218079, |
| "eval_chemistry_num_tokens": 7969704.0, |
| "eval_chemistry_runtime": 59.4475, |
| "eval_chemistry_samples_per_second": 8.411, |
| "eval_chemistry_steps_per_second": 2.103, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.7919084688648581, |
| "epoch": 0.5519083120062312, |
| "grad_norm": 24.375, |
| "learning_rate": 6.18e-06, |
| "loss": 12.7303, |
| "mean_token_accuracy": 0.7834540419280529, |
| "num_tokens": 8242162.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 0.8003328915685415, |
| "epoch": 0.5697118059419161, |
| "grad_norm": 22.875, |
| "learning_rate": 6.380000000000001e-06, |
| "loss": 12.8333, |
| "mean_token_accuracy": 0.7824427511543035, |
| "num_tokens": 8497852.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 0.7982260027900339, |
| "epoch": 0.587515299877601, |
| "grad_norm": 28.0, |
| "learning_rate": 6.5800000000000005e-06, |
| "loss": 12.8827, |
| "mean_token_accuracy": 0.7826927099376917, |
| "num_tokens": 8757753.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 0.7888958260416985, |
| "epoch": 0.6053187938132859, |
| "grad_norm": 25.125, |
| "learning_rate": 6.780000000000001e-06, |
| "loss": 12.7464, |
| "mean_token_accuracy": 0.7863177515566349, |
| "num_tokens": 9024677.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 0.7913781819865108, |
| "epoch": 0.6231222877489707, |
| "grad_norm": 26.0, |
| "learning_rate": 6.98e-06, |
| "loss": 12.7471, |
| "mean_token_accuracy": 0.7842035111039877, |
| "num_tokens": 9291760.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.7765169985592365, |
| "epoch": 0.6409257816846556, |
| "grad_norm": 22.0, |
| "learning_rate": 7.180000000000001e-06, |
| "loss": 12.5378, |
| "mean_token_accuracy": 0.7859202962368727, |
| "num_tokens": 9561091.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 0.7792716162279248, |
| "epoch": 0.6587292756203404, |
| "grad_norm": 25.0, |
| "learning_rate": 7.3800000000000005e-06, |
| "loss": 12.4956, |
| "mean_token_accuracy": 0.7880564954131841, |
| "num_tokens": 9827272.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 0.758336128294468, |
| "epoch": 0.6765327695560254, |
| "grad_norm": 25.0, |
| "learning_rate": 7.58e-06, |
| "loss": 12.1732, |
| "mean_token_accuracy": 0.7918921418488025, |
| "num_tokens": 10096065.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 0.7533971995115281, |
| "epoch": 0.6943362634917103, |
| "grad_norm": 24.375, |
| "learning_rate": 7.78e-06, |
| "loss": 12.1132, |
| "mean_token_accuracy": 0.7922064792364836, |
| "num_tokens": 10364601.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 0.7620638139545918, |
| "epoch": 0.7121397574273951, |
| "grad_norm": 23.625, |
| "learning_rate": 7.980000000000002e-06, |
| "loss": 12.3379, |
| "mean_token_accuracy": 0.7897147350013256, |
| "num_tokens": 10633325.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.7121397574273951, |
| "eval_biology_entropy": 1.1209784712791442, |
| "eval_biology_loss": 1.175487995147705, |
| "eval_biology_mean_token_accuracy": 0.7023502192497253, |
| "eval_biology_num_tokens": 10633325.0, |
| "eval_biology_runtime": 45.7349, |
| "eval_biology_samples_per_second": 10.933, |
| "eval_biology_steps_per_second": 2.733, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.7121397574273951, |
| "eval_chemistry_entropy": 0.7846209690570831, |
| "eval_chemistry_loss": 0.7767007946968079, |
| "eval_chemistry_mean_token_accuracy": 0.7876848134994506, |
| "eval_chemistry_num_tokens": 10633325.0, |
| "eval_chemistry_runtime": 56.279, |
| "eval_chemistry_samples_per_second": 8.884, |
| "eval_chemistry_steps_per_second": 2.221, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.7557655736804009, |
| "epoch": 0.72994325136308, |
| "grad_norm": 25.0, |
| "learning_rate": 8.18e-06, |
| "loss": 12.1754, |
| "mean_token_accuracy": 0.7914514016360045, |
| "num_tokens": 10897916.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 0.7626162808388471, |
| "epoch": 0.7477467452987648, |
| "grad_norm": 22.375, |
| "learning_rate": 8.380000000000001e-06, |
| "loss": 12.1991, |
| "mean_token_accuracy": 0.7916438620537519, |
| "num_tokens": 11165356.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 0.7673018729314208, |
| "epoch": 0.7655502392344498, |
| "grad_norm": 24.375, |
| "learning_rate": 8.580000000000001e-06, |
| "loss": 12.3987, |
| "mean_token_accuracy": 0.7888187035918236, |
| "num_tokens": 11436799.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 0.7744929634034634, |
| "epoch": 0.7833537331701347, |
| "grad_norm": 24.125, |
| "learning_rate": 8.78e-06, |
| "loss": 12.4708, |
| "mean_token_accuracy": 0.787474300712347, |
| "num_tokens": 11703496.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 0.7655632747337222, |
| "epoch": 0.8011572271058195, |
| "grad_norm": 24.875, |
| "learning_rate": 8.98e-06, |
| "loss": 12.299, |
| "mean_token_accuracy": 0.7900604665279388, |
| "num_tokens": 11965530.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.7661685338243842, |
| "epoch": 0.8189607210415044, |
| "grad_norm": 23.0, |
| "learning_rate": 9.180000000000002e-06, |
| "loss": 12.3101, |
| "mean_token_accuracy": 0.7891276117414237, |
| "num_tokens": 12224427.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 0.7463042287155985, |
| "epoch": 0.8367642149771892, |
| "grad_norm": 24.125, |
| "learning_rate": 9.38e-06, |
| "loss": 12.0346, |
| "mean_token_accuracy": 0.7945169288665056, |
| "num_tokens": 12509124.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 0.753023486584425, |
| "epoch": 0.8545677089128741, |
| "grad_norm": 21.25, |
| "learning_rate": 9.58e-06, |
| "loss": 12.0236, |
| "mean_token_accuracy": 0.7931863989681005, |
| "num_tokens": 12778408.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 0.7310503415763379, |
| "epoch": 0.8723712028485591, |
| "grad_norm": 22.25, |
| "learning_rate": 9.780000000000001e-06, |
| "loss": 11.8094, |
| "mean_token_accuracy": 0.7964685469865799, |
| "num_tokens": 13046473.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 0.7645759535953403, |
| "epoch": 0.8901746967842439, |
| "grad_norm": 22.375, |
| "learning_rate": 9.980000000000001e-06, |
| "loss": 12.2645, |
| "mean_token_accuracy": 0.7906601417809724, |
| "num_tokens": 13301659.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8901746967842439, |
| "eval_biology_entropy": 1.092752426624298, |
| "eval_biology_loss": 1.1775906085968018, |
| "eval_biology_mean_token_accuracy": 0.7025688862800599, |
| "eval_biology_num_tokens": 13301659.0, |
| "eval_biology_runtime": 268.8833, |
| "eval_biology_samples_per_second": 1.86, |
| "eval_biology_steps_per_second": 0.465, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8901746967842439, |
| "eval_chemistry_entropy": 0.7384080934524536, |
| "eval_chemistry_loss": 0.7555699944496155, |
| "eval_chemistry_mean_token_accuracy": 0.7920897974967956, |
| "eval_chemistry_num_tokens": 13301659.0, |
| "eval_chemistry_runtime": 450.4029, |
| "eval_chemistry_samples_per_second": 1.11, |
| "eval_chemistry_steps_per_second": 0.278, |
| "step": 500 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 10000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 18, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.3874307456966482e+18, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|