| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.5590297095805052, |
| "eval_steps": 100, |
| "global_step": 2000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 1.4558615636080503, |
| "epoch": 0.017803493935684877, |
| "grad_norm": 150.0, |
| "learning_rate": 6.000000000000001e-07, |
| "loss": 38.2052, |
| "mean_token_accuracy": 0.5860222218558192, |
| "num_tokens": 257189.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 1.4555816859006883, |
| "epoch": 0.035606987871369754, |
| "grad_norm": 145.0, |
| "learning_rate": 1.2666666666666669e-06, |
| "loss": 37.0513, |
| "mean_token_accuracy": 0.5905844675377011, |
| "num_tokens": 528285.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 1.494897262006998, |
| "epoch": 0.053410481807054634, |
| "grad_norm": 143.0, |
| "learning_rate": 1.9333333333333336e-06, |
| "loss": 36.9306, |
| "mean_token_accuracy": 0.5914231909438967, |
| "num_tokens": 801635.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 1.616441186517477, |
| "epoch": 0.07121397574273951, |
| "grad_norm": 99.0, |
| "learning_rate": 2.6e-06, |
| "loss": 36.4312, |
| "mean_token_accuracy": 0.5914697827771306, |
| "num_tokens": 1060812.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 1.7240715526044368, |
| "epoch": 0.08901746967842439, |
| "grad_norm": 77.0, |
| "learning_rate": 3.266666666666667e-06, |
| "loss": 33.559, |
| "mean_token_accuracy": 0.60635172650218, |
| "num_tokens": 1327380.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 1.758330113068223, |
| "epoch": 0.10682096361410927, |
| "grad_norm": 50.0, |
| "learning_rate": 3.9333333333333335e-06, |
| "loss": 31.6612, |
| "mean_token_accuracy": 0.6195196183398366, |
| "num_tokens": 1597405.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 1.7726996555924415, |
| "epoch": 0.12462445754979415, |
| "grad_norm": 40.5, |
| "learning_rate": 4.600000000000001e-06, |
| "loss": 29.7933, |
| "mean_token_accuracy": 0.6268794447183609, |
| "num_tokens": 1859684.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 1.6855496373027563, |
| "epoch": 0.14242795148547902, |
| "grad_norm": 36.25, |
| "learning_rate": 5.2666666666666665e-06, |
| "loss": 27.4103, |
| "mean_token_accuracy": 0.6419319735839963, |
| "num_tokens": 2118068.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 1.5652924977242946, |
| "epoch": 0.1602314454211639, |
| "grad_norm": 25.75, |
| "learning_rate": 5.933333333333335e-06, |
| "loss": 24.9134, |
| "mean_token_accuracy": 0.6668034821748734, |
| "num_tokens": 2388824.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 1.5398647494614124, |
| "epoch": 0.17803493935684878, |
| "grad_norm": 26.125, |
| "learning_rate": 6.600000000000001e-06, |
| "loss": 24.164, |
| "mean_token_accuracy": 0.6711678432300687, |
| "num_tokens": 2644330.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.17803493935684878, |
| "eval_biology_entropy": 2.6375960063934327, |
| "eval_biology_loss": 3.2082109451293945, |
| "eval_biology_mean_token_accuracy": 0.46086212635040286, |
| "eval_biology_num_tokens": 2644330.0, |
| "eval_biology_runtime": 50.0833, |
| "eval_biology_samples_per_second": 9.983, |
| "eval_biology_steps_per_second": 2.496, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.17803493935684878, |
| "eval_chemistry_entropy": 1.5032154874801635, |
| "eval_chemistry_loss": 1.455440878868103, |
| "eval_chemistry_mean_token_accuracy": 0.6789760603904724, |
| "eval_chemistry_num_tokens": 2644330.0, |
| "eval_chemistry_runtime": 62.0343, |
| "eval_chemistry_samples_per_second": 8.06, |
| "eval_chemistry_steps_per_second": 2.015, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.17803493935684878, |
| "eval_math_entropy": 1.193375108242035, |
| "eval_math_loss": 1.458133578300476, |
| "eval_math_mean_token_accuracy": 0.6929113068580628, |
| "eval_math_num_tokens": 2644330.0, |
| "eval_math_runtime": 63.5964, |
| "eval_math_samples_per_second": 7.862, |
| "eval_math_steps_per_second": 1.966, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.17803493935684878, |
| "eval_physics_entropy": 1.4176845016479491, |
| "eval_physics_loss": 1.4238468408584595, |
| "eval_physics_mean_token_accuracy": 0.6904244375228882, |
| "eval_physics_num_tokens": 2644330.0, |
| "eval_physics_runtime": 72.4595, |
| "eval_physics_samples_per_second": 6.9, |
| "eval_physics_steps_per_second": 1.725, |
| "step": 100 |
| }, |
| { |
| "entropy": 1.4581830620765686, |
| "epoch": 0.19583843329253367, |
| "grad_norm": 24.125, |
| "learning_rate": 7.266666666666668e-06, |
| "loss": 22.507, |
| "mean_token_accuracy": 0.6856932375580073, |
| "num_tokens": 2913700.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 1.402471886202693, |
| "epoch": 0.21364192722821854, |
| "grad_norm": 19.5, |
| "learning_rate": 7.933333333333334e-06, |
| "loss": 21.9311, |
| "mean_token_accuracy": 0.6896604306995868, |
| "num_tokens": 3185255.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 1.335694681107998, |
| "epoch": 0.2314454211639034, |
| "grad_norm": 20.5, |
| "learning_rate": 8.6e-06, |
| "loss": 21.1586, |
| "mean_token_accuracy": 0.6965065613389015, |
| "num_tokens": 3454750.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 1.3426212836056948, |
| "epoch": 0.2492489150995883, |
| "grad_norm": 20.0, |
| "learning_rate": 9.266666666666667e-06, |
| "loss": 21.1798, |
| "mean_token_accuracy": 0.697324163839221, |
| "num_tokens": 3719113.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 1.3519339598715305, |
| "epoch": 0.26705240903527316, |
| "grad_norm": 19.625, |
| "learning_rate": 9.933333333333334e-06, |
| "loss": 21.3078, |
| "mean_token_accuracy": 0.6932455036789179, |
| "num_tokens": 3990505.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 1.2789028503000737, |
| "epoch": 0.28485590297095803, |
| "grad_norm": 20.75, |
| "learning_rate": 1.0600000000000002e-05, |
| "loss": 20.2105, |
| "mean_token_accuracy": 0.7049414362758398, |
| "num_tokens": 4267403.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 1.2489309102296828, |
| "epoch": 0.30265939690664295, |
| "grad_norm": 19.75, |
| "learning_rate": 1.1266666666666668e-05, |
| "loss": 19.7199, |
| "mean_token_accuracy": 0.7114499010145664, |
| "num_tokens": 4535458.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 1.2442967489361763, |
| "epoch": 0.3204628908423278, |
| "grad_norm": 22.0, |
| "learning_rate": 1.1933333333333335e-05, |
| "loss": 19.7762, |
| "mean_token_accuracy": 0.7092621237039566, |
| "num_tokens": 4796815.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 1.203914923593402, |
| "epoch": 0.3382663847780127, |
| "grad_norm": 17.5, |
| "learning_rate": 1.2600000000000001e-05, |
| "loss": 19.2074, |
| "mean_token_accuracy": 0.7151789110153913, |
| "num_tokens": 5066948.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 1.2423121832311153, |
| "epoch": 0.35606987871369755, |
| "grad_norm": 19.125, |
| "learning_rate": 1.3266666666666668e-05, |
| "loss": 19.7315, |
| "mean_token_accuracy": 0.7082974564284086, |
| "num_tokens": 5324751.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.35606987871369755, |
| "eval_biology_entropy": 2.4902925472259523, |
| "eval_biology_loss": 2.76477313041687, |
| "eval_biology_mean_token_accuracy": 0.49958884620666505, |
| "eval_biology_num_tokens": 5324751.0, |
| "eval_biology_runtime": 50.1978, |
| "eval_biology_samples_per_second": 9.961, |
| "eval_biology_steps_per_second": 2.49, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.35606987871369755, |
| "eval_chemistry_entropy": 1.2307413153648377, |
| "eval_chemistry_loss": 1.196126937866211, |
| "eval_chemistry_mean_token_accuracy": 0.7152912592887879, |
| "eval_chemistry_num_tokens": 5324751.0, |
| "eval_chemistry_runtime": 62.1826, |
| "eval_chemistry_samples_per_second": 8.041, |
| "eval_chemistry_steps_per_second": 2.01, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.35606987871369755, |
| "eval_math_entropy": 1.0849005093574524, |
| "eval_math_loss": 1.2871687412261963, |
| "eval_math_mean_token_accuracy": 0.7124236588478088, |
| "eval_math_num_tokens": 5324751.0, |
| "eval_math_runtime": 63.7854, |
| "eval_math_samples_per_second": 7.839, |
| "eval_math_steps_per_second": 1.96, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.35606987871369755, |
| "eval_physics_entropy": 1.1968733882904052, |
| "eval_physics_loss": 1.2079969644546509, |
| "eval_physics_mean_token_accuracy": 0.7211485290527344, |
| "eval_physics_num_tokens": 5324751.0, |
| "eval_physics_runtime": 72.5978, |
| "eval_physics_samples_per_second": 6.887, |
| "eval_physics_steps_per_second": 1.722, |
| "step": 200 |
| }, |
| { |
| "entropy": 1.1588745508342981, |
| "epoch": 0.3738733726493824, |
| "grad_norm": 18.75, |
| "learning_rate": 1.3933333333333334e-05, |
| "loss": 18.3959, |
| "mean_token_accuracy": 0.7249933958053589, |
| "num_tokens": 5585508.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 1.176125205308199, |
| "epoch": 0.39167686658506734, |
| "grad_norm": 18.875, |
| "learning_rate": 1.46e-05, |
| "loss": 18.7519, |
| "mean_token_accuracy": 0.7198500070720911, |
| "num_tokens": 5848889.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 1.1387052025645972, |
| "epoch": 0.4094803605207522, |
| "grad_norm": 19.375, |
| "learning_rate": 1.5266666666666667e-05, |
| "loss": 18.0866, |
| "mean_token_accuracy": 0.7284072007983923, |
| "num_tokens": 6114855.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 1.1471338219940663, |
| "epoch": 0.4272838544564371, |
| "grad_norm": 19.875, |
| "learning_rate": 1.5933333333333336e-05, |
| "loss": 18.299, |
| "mean_token_accuracy": 0.7252648994326591, |
| "num_tokens": 6378152.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 1.146025552228093, |
| "epoch": 0.44508734839212194, |
| "grad_norm": 19.0, |
| "learning_rate": 1.66e-05, |
| "loss": 18.2416, |
| "mean_token_accuracy": 0.723513275757432, |
| "num_tokens": 6637273.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 1.1037513725459576, |
| "epoch": 0.4628908423278068, |
| "grad_norm": 21.875, |
| "learning_rate": 1.726666666666667e-05, |
| "loss": 17.6157, |
| "mean_token_accuracy": 0.732468755915761, |
| "num_tokens": 6896684.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 1.1126793511211872, |
| "epoch": 0.48069433626349173, |
| "grad_norm": 20.5, |
| "learning_rate": 1.7933333333333333e-05, |
| "loss": 17.8207, |
| "mean_token_accuracy": 0.7313903696835041, |
| "num_tokens": 7166608.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 1.0699752174317836, |
| "epoch": 0.4984978301991766, |
| "grad_norm": 18.125, |
| "learning_rate": 1.86e-05, |
| "loss": 17.087, |
| "mean_token_accuracy": 0.7397925728932023, |
| "num_tokens": 7444923.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 1.107931500300765, |
| "epoch": 0.5163013241348615, |
| "grad_norm": 18.25, |
| "learning_rate": 1.926666666666667e-05, |
| "loss": 17.6806, |
| "mean_token_accuracy": 0.7309919781982899, |
| "num_tokens": 7706502.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 1.1062700897455215, |
| "epoch": 0.5341048180705463, |
| "grad_norm": 21.0, |
| "learning_rate": 1.9933333333333334e-05, |
| "loss": 17.8362, |
| "mean_token_accuracy": 0.7301551215350628, |
| "num_tokens": 7969704.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.5341048180705463, |
| "eval_biology_entropy": 2.340687463760376, |
| "eval_biology_loss": 2.611450672149658, |
| "eval_biology_mean_token_accuracy": 0.5168745517730713, |
| "eval_biology_num_tokens": 7969704.0, |
| "eval_biology_runtime": 50.2023, |
| "eval_biology_samples_per_second": 9.96, |
| "eval_biology_steps_per_second": 2.49, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.5341048180705463, |
| "eval_chemistry_entropy": 1.0767778873443603, |
| "eval_chemistry_loss": 1.0889482498168945, |
| "eval_chemistry_mean_token_accuracy": 0.7341342482566834, |
| "eval_chemistry_num_tokens": 7969704.0, |
| "eval_chemistry_runtime": 62.1899, |
| "eval_chemistry_samples_per_second": 8.04, |
| "eval_chemistry_steps_per_second": 2.01, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.5341048180705463, |
| "eval_math_entropy": 1.015703486919403, |
| "eval_math_loss": 1.2289094924926758, |
| "eval_math_mean_token_accuracy": 0.7229186329841614, |
| "eval_math_num_tokens": 7969704.0, |
| "eval_math_runtime": 63.7316, |
| "eval_math_samples_per_second": 7.845, |
| "eval_math_steps_per_second": 1.961, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.5341048180705463, |
| "eval_physics_entropy": 1.0841173582077026, |
| "eval_physics_loss": 1.126592993736267, |
| "eval_physics_mean_token_accuracy": 0.7341932263374329, |
| "eval_physics_num_tokens": 7969704.0, |
| "eval_physics_runtime": 72.5904, |
| "eval_physics_samples_per_second": 6.888, |
| "eval_physics_steps_per_second": 1.722, |
| "step": 300 |
| }, |
| { |
| "entropy": 1.0747302711009978, |
| "epoch": 0.5519083120062312, |
| "grad_norm": 20.875, |
| "learning_rate": 1.9933333333333334e-05, |
| "loss": 17.1791, |
| "mean_token_accuracy": 0.7361905895173549, |
| "num_tokens": 8242162.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 1.0719103053212167, |
| "epoch": 0.5697118059419161, |
| "grad_norm": 17.75, |
| "learning_rate": 1.985925925925926e-05, |
| "loss": 17.1736, |
| "mean_token_accuracy": 0.7374342199414968, |
| "num_tokens": 8497852.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 1.0716772571206092, |
| "epoch": 0.587515299877601, |
| "grad_norm": 19.75, |
| "learning_rate": 1.9785185185185187e-05, |
| "loss": 17.1699, |
| "mean_token_accuracy": 0.7366803426295518, |
| "num_tokens": 8757753.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 1.0332202341407537, |
| "epoch": 0.6053187938132859, |
| "grad_norm": 17.625, |
| "learning_rate": 1.971111111111111e-05, |
| "loss": 16.5924, |
| "mean_token_accuracy": 0.745457150787115, |
| "num_tokens": 9024677.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 1.039108694717288, |
| "epoch": 0.6231222877489707, |
| "grad_norm": 19.625, |
| "learning_rate": 1.963703703703704e-05, |
| "loss": 16.7555, |
| "mean_token_accuracy": 0.7415129184722901, |
| "num_tokens": 9291760.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 1.0237553935497998, |
| "epoch": 0.6409257816846556, |
| "grad_norm": 16.5, |
| "learning_rate": 1.9562962962962964e-05, |
| "loss": 16.3169, |
| "mean_token_accuracy": 0.7460552308708429, |
| "num_tokens": 9561091.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 1.0181537088006736, |
| "epoch": 0.6587292756203404, |
| "grad_norm": 24.875, |
| "learning_rate": 1.948888888888889e-05, |
| "loss": 16.4161, |
| "mean_token_accuracy": 0.7461798526346684, |
| "num_tokens": 9827272.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 1.0005255609750747, |
| "epoch": 0.6765327695560254, |
| "grad_norm": 18.0, |
| "learning_rate": 1.9414814814814817e-05, |
| "loss": 16.0281, |
| "mean_token_accuracy": 0.7507184192538261, |
| "num_tokens": 10096065.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 0.9823020908981561, |
| "epoch": 0.6943362634917103, |
| "grad_norm": 17.25, |
| "learning_rate": 1.9340740740740743e-05, |
| "loss": 15.7951, |
| "mean_token_accuracy": 0.7518063317984343, |
| "num_tokens": 10364601.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 0.9987297028303146, |
| "epoch": 0.7121397574273951, |
| "grad_norm": 18.0, |
| "learning_rate": 1.926666666666667e-05, |
| "loss": 16.1426, |
| "mean_token_accuracy": 0.750870854780078, |
| "num_tokens": 10633325.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.7121397574273951, |
| "eval_biology_entropy": 2.3316021585464477, |
| "eval_biology_loss": 2.5441582202911377, |
| "eval_biology_mean_token_accuracy": 0.525359542131424, |
| "eval_biology_num_tokens": 10633325.0, |
| "eval_biology_runtime": 50.0463, |
| "eval_biology_samples_per_second": 9.991, |
| "eval_biology_steps_per_second": 2.498, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.7121397574273951, |
| "eval_chemistry_entropy": 1.042556393146515, |
| "eval_chemistry_loss": 1.0275757312774658, |
| "eval_chemistry_mean_token_accuracy": 0.7446413273811341, |
| "eval_chemistry_num_tokens": 10633325.0, |
| "eval_chemistry_runtime": 62.0477, |
| "eval_chemistry_samples_per_second": 8.058, |
| "eval_chemistry_steps_per_second": 2.015, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.7121397574273951, |
| "eval_math_entropy": 0.9996259999275208, |
| "eval_math_loss": 1.197845697402954, |
| "eval_math_mean_token_accuracy": 0.728334641456604, |
| "eval_math_num_tokens": 10633325.0, |
| "eval_math_runtime": 63.6823, |
| "eval_math_samples_per_second": 7.851, |
| "eval_math_steps_per_second": 1.963, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.7121397574273951, |
| "eval_physics_entropy": 1.0703304433822631, |
| "eval_physics_loss": 1.0842761993408203, |
| "eval_physics_mean_token_accuracy": 0.7408034291267395, |
| "eval_physics_num_tokens": 10633325.0, |
| "eval_physics_runtime": 72.5592, |
| "eval_physics_samples_per_second": 6.891, |
| "eval_physics_steps_per_second": 1.723, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.987117737904191, |
| "epoch": 0.72994325136308, |
| "grad_norm": 18.625, |
| "learning_rate": 1.9192592592592593e-05, |
| "loss": 15.8075, |
| "mean_token_accuracy": 0.7534378308802843, |
| "num_tokens": 10897916.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 1.0024050276726484, |
| "epoch": 0.7477467452987648, |
| "grad_norm": 17.375, |
| "learning_rate": 1.911851851851852e-05, |
| "loss": 16.0465, |
| "mean_token_accuracy": 0.7507158998399973, |
| "num_tokens": 11165356.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 1.0176004242151975, |
| "epoch": 0.7655502392344498, |
| "grad_norm": 17.625, |
| "learning_rate": 1.9044444444444446e-05, |
| "loss": 16.3104, |
| "mean_token_accuracy": 0.7471235640347004, |
| "num_tokens": 11436799.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 1.0073765676468611, |
| "epoch": 0.7833537331701347, |
| "grad_norm": 19.75, |
| "learning_rate": 1.8970370370370372e-05, |
| "loss": 16.2267, |
| "mean_token_accuracy": 0.7476929984986782, |
| "num_tokens": 11703496.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 1.0097984321415425, |
| "epoch": 0.8011572271058195, |
| "grad_norm": 20.125, |
| "learning_rate": 1.8896296296296295e-05, |
| "loss": 16.22, |
| "mean_token_accuracy": 0.7488253738731145, |
| "num_tokens": 11965530.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.9962503872811794, |
| "epoch": 0.8189607210415044, |
| "grad_norm": 18.875, |
| "learning_rate": 1.8822222222222225e-05, |
| "loss": 16.0471, |
| "mean_token_accuracy": 0.749473949894309, |
| "num_tokens": 12224427.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 0.9670230474323034, |
| "epoch": 0.8367642149771892, |
| "grad_norm": 17.625, |
| "learning_rate": 1.874814814814815e-05, |
| "loss": 15.4773, |
| "mean_token_accuracy": 0.7574279136955738, |
| "num_tokens": 12509124.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 0.9755645230412483, |
| "epoch": 0.8545677089128741, |
| "grad_norm": 16.375, |
| "learning_rate": 1.8674074074074075e-05, |
| "loss": 15.6296, |
| "mean_token_accuracy": 0.7540624614804983, |
| "num_tokens": 12778408.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 0.958490178361535, |
| "epoch": 0.8723712028485591, |
| "grad_norm": 18.25, |
| "learning_rate": 1.86e-05, |
| "loss": 15.4005, |
| "mean_token_accuracy": 0.7574606340378522, |
| "num_tokens": 13046473.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 0.9886346895247697, |
| "epoch": 0.8901746967842439, |
| "grad_norm": 17.375, |
| "learning_rate": 1.8525925925925928e-05, |
| "loss": 15.8912, |
| "mean_token_accuracy": 0.7506300307810306, |
| "num_tokens": 13301659.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8901746967842439, |
| "eval_biology_entropy": 2.3308397455215455, |
| "eval_biology_loss": 2.4938247203826904, |
| "eval_biology_mean_token_accuracy": 0.5307838084697724, |
| "eval_biology_num_tokens": 13301659.0, |
| "eval_biology_runtime": 50.1683, |
| "eval_biology_samples_per_second": 9.966, |
| "eval_biology_steps_per_second": 2.492, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8901746967842439, |
| "eval_chemistry_entropy": 1.0188498930931091, |
| "eval_chemistry_loss": 0.993285596370697, |
| "eval_chemistry_mean_token_accuracy": 0.7511732378005982, |
| "eval_chemistry_num_tokens": 13301659.0, |
| "eval_chemistry_runtime": 61.7475, |
| "eval_chemistry_samples_per_second": 8.097, |
| "eval_chemistry_steps_per_second": 2.024, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8901746967842439, |
| "eval_math_entropy": 0.9689855208396911, |
| "eval_math_loss": 1.1819260120391846, |
| "eval_math_mean_token_accuracy": 0.7314836554527283, |
| "eval_math_num_tokens": 13301659.0, |
| "eval_math_runtime": 63.7311, |
| "eval_math_samples_per_second": 7.845, |
| "eval_math_steps_per_second": 1.961, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8901746967842439, |
| "eval_physics_entropy": 1.045595247745514, |
| "eval_physics_loss": 1.060469150543213, |
| "eval_physics_mean_token_accuracy": 0.745576328754425, |
| "eval_physics_num_tokens": 13301659.0, |
| "eval_physics_runtime": 72.5821, |
| "eval_physics_samples_per_second": 6.889, |
| "eval_physics_steps_per_second": 1.722, |
| "step": 500 |
| }, |
| { |
| "entropy": 0.9531665652990341, |
| "epoch": 0.9079781907199288, |
| "grad_norm": 16.875, |
| "learning_rate": 1.8451851851851855e-05, |
| "loss": 15.3376, |
| "mean_token_accuracy": 0.7606961231678724, |
| "num_tokens": 13570986.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 0.9829177517443896, |
| "epoch": 0.9257816846556136, |
| "grad_norm": 18.75, |
| "learning_rate": 1.8377777777777778e-05, |
| "loss": 15.8539, |
| "mean_token_accuracy": 0.7539051301777363, |
| "num_tokens": 13833901.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 0.9593800643458963, |
| "epoch": 0.9435851785912985, |
| "grad_norm": 16.5, |
| "learning_rate": 1.8303703703703704e-05, |
| "loss": 15.3067, |
| "mean_token_accuracy": 0.7598523862659932, |
| "num_tokens": 14093888.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 0.9910766646265984, |
| "epoch": 0.9613886725269835, |
| "grad_norm": 17.875, |
| "learning_rate": 1.822962962962963e-05, |
| "loss": 15.8945, |
| "mean_token_accuracy": 0.7506263140588999, |
| "num_tokens": 14354918.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 0.960052739828825, |
| "epoch": 0.9791921664626683, |
| "grad_norm": 18.0, |
| "learning_rate": 1.8155555555555557e-05, |
| "loss": 15.3286, |
| "mean_token_accuracy": 0.7570735458284616, |
| "num_tokens": 14620244.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 0.9758090447634459, |
| "epoch": 0.9969956603983532, |
| "grad_norm": 19.125, |
| "learning_rate": 1.8081481481481484e-05, |
| "loss": 15.8003, |
| "mean_token_accuracy": 0.7534121379256249, |
| "num_tokens": 14886593.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 0.9164214801403784, |
| "epoch": 1.014242795148548, |
| "grad_norm": 17.125, |
| "learning_rate": 1.800740740740741e-05, |
| "loss": 14.1756, |
| "mean_token_accuracy": 0.7682134124540514, |
| "num_tokens": 15146469.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 0.9603695683181286, |
| "epoch": 1.0320462890842328, |
| "grad_norm": 17.125, |
| "learning_rate": 1.7933333333333333e-05, |
| "loss": 15.3128, |
| "mean_token_accuracy": 0.7566317860037088, |
| "num_tokens": 15404569.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 0.9028990363702178, |
| "epoch": 1.0498497830199176, |
| "grad_norm": 19.25, |
| "learning_rate": 1.785925925925926e-05, |
| "loss": 14.589, |
| "mean_token_accuracy": 0.7690575599670411, |
| "num_tokens": 15681709.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 0.9441474135965109, |
| "epoch": 1.0676532769556026, |
| "grad_norm": 18.125, |
| "learning_rate": 1.7785185185185186e-05, |
| "loss": 15.0989, |
| "mean_token_accuracy": 0.7607255820184946, |
| "num_tokens": 15949421.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.0676532769556026, |
| "eval_biology_entropy": 2.2264863386154174, |
| "eval_biology_loss": 2.469179391860962, |
| "eval_biology_mean_token_accuracy": 0.5342538194656372, |
| "eval_biology_num_tokens": 15949421.0, |
| "eval_biology_runtime": 50.1502, |
| "eval_biology_samples_per_second": 9.97, |
| "eval_biology_steps_per_second": 2.493, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.0676532769556026, |
| "eval_chemistry_entropy": 0.9480550351142883, |
| "eval_chemistry_loss": 0.9721810817718506, |
| "eval_chemistry_mean_token_accuracy": 0.7548542003631592, |
| "eval_chemistry_num_tokens": 15949421.0, |
| "eval_chemistry_runtime": 62.2488, |
| "eval_chemistry_samples_per_second": 8.032, |
| "eval_chemistry_steps_per_second": 2.008, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.0676532769556026, |
| "eval_math_entropy": 0.9378721189498901, |
| "eval_math_loss": 1.1737841367721558, |
| "eval_math_mean_token_accuracy": 0.7330943899154663, |
| "eval_math_num_tokens": 15949421.0, |
| "eval_math_runtime": 63.7255, |
| "eval_math_samples_per_second": 7.846, |
| "eval_math_steps_per_second": 1.962, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.0676532769556026, |
| "eval_physics_entropy": 0.9874802966117859, |
| "eval_physics_loss": 1.0468889474868774, |
| "eval_physics_mean_token_accuracy": 0.7478159666061401, |
| "eval_physics_num_tokens": 15949421.0, |
| "eval_physics_runtime": 72.6712, |
| "eval_physics_samples_per_second": 6.88, |
| "eval_physics_steps_per_second": 1.72, |
| "step": 600 |
| }, |
| { |
| "entropy": 0.9052479151636362, |
| "epoch": 1.0854567708912874, |
| "grad_norm": 18.0, |
| "learning_rate": 1.7711111111111113e-05, |
| "loss": 14.6237, |
| "mean_token_accuracy": 0.7670071702450514, |
| "num_tokens": 16221916.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 0.9550071226432919, |
| "epoch": 1.1032602648269723, |
| "grad_norm": 20.0, |
| "learning_rate": 1.763703703703704e-05, |
| "loss": 15.3094, |
| "mean_token_accuracy": 0.7585636477917432, |
| "num_tokens": 16488154.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 0.9349688177928328, |
| "epoch": 1.121063758762657, |
| "grad_norm": 17.75, |
| "learning_rate": 1.7562962962962962e-05, |
| "loss": 14.9904, |
| "mean_token_accuracy": 0.7620084758847951, |
| "num_tokens": 16756828.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 0.9166740071028471, |
| "epoch": 1.138867252698342, |
| "grad_norm": 18.125, |
| "learning_rate": 1.7488888888888892e-05, |
| "loss": 14.8096, |
| "mean_token_accuracy": 0.7640962358564138, |
| "num_tokens": 17023681.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 0.9381619554013014, |
| "epoch": 1.156670746634027, |
| "grad_norm": 19.375, |
| "learning_rate": 1.7414814814814815e-05, |
| "loss": 15.0268, |
| "mean_token_accuracy": 0.7623019654303789, |
| "num_tokens": 17286672.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 0.9595299258828163, |
| "epoch": 1.1744742405697117, |
| "grad_norm": 20.125, |
| "learning_rate": 1.7340740740740742e-05, |
| "loss": 15.4724, |
| "mean_token_accuracy": 0.7542405981570482, |
| "num_tokens": 17551581.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 0.931562440469861, |
| "epoch": 1.1922777345053968, |
| "grad_norm": 18.0, |
| "learning_rate": 1.726666666666667e-05, |
| "loss": 14.9543, |
| "mean_token_accuracy": 0.7623463280498981, |
| "num_tokens": 17821205.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 0.9202350415289402, |
| "epoch": 1.2100812284410816, |
| "grad_norm": 18.375, |
| "learning_rate": 1.7192592592592595e-05, |
| "loss": 14.7341, |
| "mean_token_accuracy": 0.7650586750358344, |
| "num_tokens": 18087759.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 0.9340890938416123, |
| "epoch": 1.2278847223767664, |
| "grad_norm": 20.875, |
| "learning_rate": 1.711851851851852e-05, |
| "loss": 15.0011, |
| "mean_token_accuracy": 0.7621566403657198, |
| "num_tokens": 18355732.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 0.91981928832829, |
| "epoch": 1.2456882163124514, |
| "grad_norm": 17.75, |
| "learning_rate": 1.7044444444444445e-05, |
| "loss": 14.8254, |
| "mean_token_accuracy": 0.7640648409724236, |
| "num_tokens": 18626245.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.2456882163124514, |
| "eval_biology_entropy": 2.2343764390945435, |
| "eval_biology_loss": 2.4601831436157227, |
| "eval_biology_mean_token_accuracy": 0.5344808685779572, |
| "eval_biology_num_tokens": 18626245.0, |
| "eval_biology_runtime": 50.1462, |
| "eval_biology_samples_per_second": 9.971, |
| "eval_biology_steps_per_second": 2.493, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.2456882163124514, |
| "eval_chemistry_entropy": 0.9566183142662048, |
| "eval_chemistry_loss": 0.9568698406219482, |
| "eval_chemistry_mean_token_accuracy": 0.7578468971252441, |
| "eval_chemistry_num_tokens": 18626245.0, |
| "eval_chemistry_runtime": 62.2346, |
| "eval_chemistry_samples_per_second": 8.034, |
| "eval_chemistry_steps_per_second": 2.009, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.2456882163124514, |
| "eval_math_entropy": 0.959031442642212, |
| "eval_math_loss": 1.1647942066192627, |
| "eval_math_mean_token_accuracy": 0.7345555291175843, |
| "eval_math_num_tokens": 18626245.0, |
| "eval_math_runtime": 63.7136, |
| "eval_math_samples_per_second": 7.848, |
| "eval_math_steps_per_second": 1.962, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.2456882163124514, |
| "eval_physics_entropy": 1.0028329586982727, |
| "eval_physics_loss": 1.0349607467651367, |
| "eval_physics_mean_token_accuracy": 0.7502033457756042, |
| "eval_physics_num_tokens": 18626245.0, |
| "eval_physics_runtime": 72.6619, |
| "eval_physics_samples_per_second": 6.881, |
| "eval_physics_steps_per_second": 1.72, |
| "step": 700 |
| }, |
| { |
| "entropy": 0.9163766440004111, |
| "epoch": 1.2634917102481362, |
| "grad_norm": 16.875, |
| "learning_rate": 1.697037037037037e-05, |
| "loss": 14.6189, |
| "mean_token_accuracy": 0.7678167190402746, |
| "num_tokens": 18899632.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 0.939722764864564, |
| "epoch": 1.281295204183821, |
| "grad_norm": 18.625, |
| "learning_rate": 1.6896296296296298e-05, |
| "loss": 15.0927, |
| "mean_token_accuracy": 0.7612305961549282, |
| "num_tokens": 19173635.0, |
| "step": 720 |
| }, |
| { |
| "entropy": 0.8912930946797133, |
| "epoch": 1.299098698119506, |
| "grad_norm": 16.5, |
| "learning_rate": 1.6822222222222224e-05, |
| "loss": 14.3538, |
| "mean_token_accuracy": 0.7711974292993545, |
| "num_tokens": 19453495.0, |
| "step": 730 |
| }, |
| { |
| "entropy": 0.9094827668741345, |
| "epoch": 1.3169021920551909, |
| "grad_norm": 18.375, |
| "learning_rate": 1.6748148148148147e-05, |
| "loss": 14.6091, |
| "mean_token_accuracy": 0.7676353622227907, |
| "num_tokens": 19719086.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 0.9042323708534241, |
| "epoch": 1.3347056859908757, |
| "grad_norm": 20.0, |
| "learning_rate": 1.6674074074074077e-05, |
| "loss": 14.5751, |
| "mean_token_accuracy": 0.766610498726368, |
| "num_tokens": 19981272.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 0.917678284458816, |
| "epoch": 1.3525091799265607, |
| "grad_norm": 18.0, |
| "learning_rate": 1.66e-05, |
| "loss": 14.7974, |
| "mean_token_accuracy": 0.765035580471158, |
| "num_tokens": 20243034.0, |
| "step": 760 |
| }, |
| { |
| "entropy": 0.9306829981505871, |
| "epoch": 1.3703126738622455, |
| "grad_norm": 21.25, |
| "learning_rate": 1.6525925925925927e-05, |
| "loss": 14.808, |
| "mean_token_accuracy": 0.7635540094226598, |
| "num_tokens": 20509028.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 0.9415363982319832, |
| "epoch": 1.3881161677979303, |
| "grad_norm": 19.0, |
| "learning_rate": 1.6451851851851853e-05, |
| "loss": 15.2598, |
| "mean_token_accuracy": 0.7586205244064331, |
| "num_tokens": 20762668.0, |
| "step": 780 |
| }, |
| { |
| "entropy": 0.9531059201806784, |
| "epoch": 1.4059196617336152, |
| "grad_norm": 18.25, |
| "learning_rate": 1.637777777777778e-05, |
| "loss": 15.2412, |
| "mean_token_accuracy": 0.7583367008715868, |
| "num_tokens": 21025277.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 0.9117215130478143, |
| "epoch": 1.4237231556693, |
| "grad_norm": 18.375, |
| "learning_rate": 1.6303703703703706e-05, |
| "loss": 14.7413, |
| "mean_token_accuracy": 0.764646789804101, |
| "num_tokens": 21292320.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.4237231556693, |
| "eval_biology_entropy": 2.181595217704773, |
| "eval_biology_loss": 2.4369781017303467, |
| "eval_biology_mean_token_accuracy": 0.5385415596961975, |
| "eval_biology_num_tokens": 21292320.0, |
| "eval_biology_runtime": 50.2095, |
| "eval_biology_samples_per_second": 9.958, |
| "eval_biology_steps_per_second": 2.49, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.4237231556693, |
| "eval_chemistry_entropy": 0.9255911149978637, |
| "eval_chemistry_loss": 0.9440104961395264, |
| "eval_chemistry_mean_token_accuracy": 0.7607986888885498, |
| "eval_chemistry_num_tokens": 21292320.0, |
| "eval_chemistry_runtime": 62.2146, |
| "eval_chemistry_samples_per_second": 8.037, |
| "eval_chemistry_steps_per_second": 2.009, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.4237231556693, |
| "eval_math_entropy": 0.9267802796363831, |
| "eval_math_loss": 1.1597362756729126, |
| "eval_math_mean_token_accuracy": 0.7358538980484008, |
| "eval_math_num_tokens": 21292320.0, |
| "eval_math_runtime": 63.7696, |
| "eval_math_samples_per_second": 7.841, |
| "eval_math_steps_per_second": 1.96, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.4237231556693, |
| "eval_physics_entropy": 0.9747054700851441, |
| "eval_physics_loss": 1.0307121276855469, |
| "eval_physics_mean_token_accuracy": 0.7512628951072693, |
| "eval_physics_num_tokens": 21292320.0, |
| "eval_physics_runtime": 72.6228, |
| "eval_physics_samples_per_second": 6.885, |
| "eval_physics_steps_per_second": 1.721, |
| "step": 800 |
| }, |
| { |
| "entropy": 0.9173937859013677, |
| "epoch": 1.441526649604985, |
| "grad_norm": 19.25, |
| "learning_rate": 1.622962962962963e-05, |
| "loss": 14.7171, |
| "mean_token_accuracy": 0.7644471134990454, |
| "num_tokens": 21563668.0, |
| "step": 810 |
| }, |
| { |
| "entropy": 0.9219266520813107, |
| "epoch": 1.4593301435406698, |
| "grad_norm": 19.375, |
| "learning_rate": 1.6155555555555556e-05, |
| "loss": 14.6838, |
| "mean_token_accuracy": 0.7670932557433844, |
| "num_tokens": 21817298.0, |
| "step": 820 |
| }, |
| { |
| "entropy": 0.9219702418893576, |
| "epoch": 1.4771336374763546, |
| "grad_norm": 18.875, |
| "learning_rate": 1.6081481481481482e-05, |
| "loss": 14.9205, |
| "mean_token_accuracy": 0.762965539470315, |
| "num_tokens": 22083010.0, |
| "step": 830 |
| }, |
| { |
| "entropy": 0.9341276545077563, |
| "epoch": 1.4949371314120397, |
| "grad_norm": 20.5, |
| "learning_rate": 1.600740740740741e-05, |
| "loss": 14.99, |
| "mean_token_accuracy": 0.7623231802135706, |
| "num_tokens": 22350046.0, |
| "step": 840 |
| }, |
| { |
| "entropy": 0.9002646986395121, |
| "epoch": 1.5127406253477245, |
| "grad_norm": 18.25, |
| "learning_rate": 1.5933333333333336e-05, |
| "loss": 14.4663, |
| "mean_token_accuracy": 0.7691004950553179, |
| "num_tokens": 22615933.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 0.892382406257093, |
| "epoch": 1.5305441192834093, |
| "grad_norm": 20.125, |
| "learning_rate": 1.5859259259259262e-05, |
| "loss": 14.4268, |
| "mean_token_accuracy": 0.7692761600017548, |
| "num_tokens": 22886843.0, |
| "step": 860 |
| }, |
| { |
| "entropy": 0.9231874955818057, |
| "epoch": 1.5483476132190943, |
| "grad_norm": 20.375, |
| "learning_rate": 1.5785185185185185e-05, |
| "loss": 14.7653, |
| "mean_token_accuracy": 0.7639353916049003, |
| "num_tokens": 23140244.0, |
| "step": 870 |
| }, |
| { |
| "entropy": 0.9141563843935728, |
| "epoch": 1.5661511071547791, |
| "grad_norm": 18.75, |
| "learning_rate": 1.571111111111111e-05, |
| "loss": 14.738, |
| "mean_token_accuracy": 0.7652020592242479, |
| "num_tokens": 23405121.0, |
| "step": 880 |
| }, |
| { |
| "entropy": 0.9176435630768538, |
| "epoch": 1.583954601090464, |
| "grad_norm": 19.25, |
| "learning_rate": 1.5637037037037038e-05, |
| "loss": 14.6798, |
| "mean_token_accuracy": 0.766349321976304, |
| "num_tokens": 23671826.0, |
| "step": 890 |
| }, |
| { |
| "entropy": 0.9114284439012408, |
| "epoch": 1.601758095026149, |
| "grad_norm": 21.5, |
| "learning_rate": 1.5562962962962965e-05, |
| "loss": 14.7888, |
| "mean_token_accuracy": 0.7634804543107748, |
| "num_tokens": 23933374.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.601758095026149, |
| "eval_biology_entropy": 2.2156058864593504, |
| "eval_biology_loss": 2.4252357482910156, |
| "eval_biology_mean_token_accuracy": 0.5407873167991638, |
| "eval_biology_num_tokens": 23933374.0, |
| "eval_biology_runtime": 50.1734, |
| "eval_biology_samples_per_second": 9.965, |
| "eval_biology_steps_per_second": 2.491, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.601758095026149, |
| "eval_chemistry_entropy": 0.9252297656536103, |
| "eval_chemistry_loss": 0.9346436858177185, |
| "eval_chemistry_mean_token_accuracy": 0.7620769119262696, |
| "eval_chemistry_num_tokens": 23933374.0, |
| "eval_chemistry_runtime": 62.2728, |
| "eval_chemistry_samples_per_second": 8.029, |
| "eval_chemistry_steps_per_second": 2.007, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.601758095026149, |
| "eval_math_entropy": 0.9211918017864227, |
| "eval_math_loss": 1.1551804542541504, |
| "eval_math_mean_token_accuracy": 0.7371255087852479, |
| "eval_math_num_tokens": 23933374.0, |
| "eval_math_runtime": 63.8521, |
| "eval_math_samples_per_second": 7.831, |
| "eval_math_steps_per_second": 1.958, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.601758095026149, |
| "eval_physics_entropy": 0.9732348275184631, |
| "eval_physics_loss": 1.02281653881073, |
| "eval_physics_mean_token_accuracy": 0.7527627892494202, |
| "eval_physics_num_tokens": 23933374.0, |
| "eval_physics_runtime": 72.7506, |
| "eval_physics_samples_per_second": 6.873, |
| "eval_physics_steps_per_second": 1.718, |
| "step": 900 |
| }, |
| { |
| "entropy": 0.8828066166490316, |
| "epoch": 1.6195615889618338, |
| "grad_norm": 18.375, |
| "learning_rate": 1.548888888888889e-05, |
| "loss": 14.165, |
| "mean_token_accuracy": 0.7731477297842503, |
| "num_tokens": 24210156.0, |
| "step": 910 |
| }, |
| { |
| "entropy": 0.9209568034857512, |
| "epoch": 1.6373650828975186, |
| "grad_norm": 20.5, |
| "learning_rate": 1.5414814814814814e-05, |
| "loss": 14.7391, |
| "mean_token_accuracy": 0.7655284915119409, |
| "num_tokens": 24462726.0, |
| "step": 920 |
| }, |
| { |
| "entropy": 0.8495767239481211, |
| "epoch": 1.6551685768332036, |
| "grad_norm": 17.875, |
| "learning_rate": 1.5340740740740744e-05, |
| "loss": 13.6479, |
| "mean_token_accuracy": 0.7792568020522594, |
| "num_tokens": 24732815.0, |
| "step": 930 |
| }, |
| { |
| "entropy": 0.8918781792744994, |
| "epoch": 1.6729720707688884, |
| "grad_norm": 17.875, |
| "learning_rate": 1.5266666666666667e-05, |
| "loss": 14.3161, |
| "mean_token_accuracy": 0.769364770874381, |
| "num_tokens": 25002091.0, |
| "step": 940 |
| }, |
| { |
| "entropy": 0.8921130081638694, |
| "epoch": 1.6907755647045732, |
| "grad_norm": 18.125, |
| "learning_rate": 1.5192592592592594e-05, |
| "loss": 14.3408, |
| "mean_token_accuracy": 0.7707536950707435, |
| "num_tokens": 25267949.0, |
| "step": 950 |
| }, |
| { |
| "entropy": 0.9075609926134348, |
| "epoch": 1.7085790586402583, |
| "grad_norm": 18.875, |
| "learning_rate": 1.5118518518518519e-05, |
| "loss": 14.5839, |
| "mean_token_accuracy": 0.7668720114976167, |
| "num_tokens": 25525772.0, |
| "step": 960 |
| }, |
| { |
| "entropy": 0.9098908707499505, |
| "epoch": 1.7263825525759429, |
| "grad_norm": 18.25, |
| "learning_rate": 1.5044444444444445e-05, |
| "loss": 14.6968, |
| "mean_token_accuracy": 0.7657288115471601, |
| "num_tokens": 25787727.0, |
| "step": 970 |
| }, |
| { |
| "entropy": 0.8908034894615412, |
| "epoch": 1.744186046511628, |
| "grad_norm": 17.375, |
| "learning_rate": 1.497037037037037e-05, |
| "loss": 14.2755, |
| "mean_token_accuracy": 0.7718735598027706, |
| "num_tokens": 26058026.0, |
| "step": 980 |
| }, |
| { |
| "entropy": 0.8551814038306474, |
| "epoch": 1.761989540447313, |
| "grad_norm": 18.625, |
| "learning_rate": 1.4896296296296298e-05, |
| "loss": 13.7243, |
| "mean_token_accuracy": 0.778313934803009, |
| "num_tokens": 26326631.0, |
| "step": 990 |
| }, |
| { |
| "entropy": 0.8867750719189644, |
| "epoch": 1.7797930343829975, |
| "grad_norm": 17.75, |
| "learning_rate": 1.4822222222222225e-05, |
| "loss": 14.2245, |
| "mean_token_accuracy": 0.7715762402862311, |
| "num_tokens": 26591735.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.7797930343829975, |
| "eval_biology_entropy": 2.2226771860122683, |
| "eval_biology_loss": 2.421318292617798, |
| "eval_biology_mean_token_accuracy": 0.5410654952526093, |
| "eval_biology_num_tokens": 26591735.0, |
| "eval_biology_runtime": 50.1559, |
| "eval_biology_samples_per_second": 9.969, |
| "eval_biology_steps_per_second": 2.492, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.7797930343829975, |
| "eval_chemistry_entropy": 0.9211353542804718, |
| "eval_chemistry_loss": 0.9268925786018372, |
| "eval_chemistry_mean_token_accuracy": 0.7640163869857788, |
| "eval_chemistry_num_tokens": 26591735.0, |
| "eval_chemistry_runtime": 61.7927, |
| "eval_chemistry_samples_per_second": 8.092, |
| "eval_chemistry_steps_per_second": 2.023, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.7797930343829975, |
| "eval_math_entropy": 0.9199098908901214, |
| "eval_math_loss": 1.1518878936767578, |
| "eval_math_mean_token_accuracy": 0.7376803703308106, |
| "eval_math_num_tokens": 26591735.0, |
| "eval_math_runtime": 63.6337, |
| "eval_math_samples_per_second": 7.857, |
| "eval_math_steps_per_second": 1.964, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.7797930343829975, |
| "eval_physics_entropy": 0.9745259823799133, |
| "eval_physics_loss": 1.0187253952026367, |
| "eval_physics_mean_token_accuracy": 0.7530020484924317, |
| "eval_physics_num_tokens": 26591735.0, |
| "eval_physics_runtime": 72.5159, |
| "eval_physics_samples_per_second": 6.895, |
| "eval_physics_steps_per_second": 1.724, |
| "step": 1000 |
| }, |
| { |
| "entropy": 0.8935877718031406, |
| "epoch": 1.7975965283186826, |
| "grad_norm": 18.875, |
| "learning_rate": 1.474814814814815e-05, |
| "loss": 14.407, |
| "mean_token_accuracy": 0.7685046702623367, |
| "num_tokens": 26857071.0, |
| "step": 1010 |
| }, |
| { |
| "entropy": 0.9290571358054877, |
| "epoch": 1.8154000222543676, |
| "grad_norm": 20.125, |
| "learning_rate": 1.4674074074074076e-05, |
| "loss": 14.9587, |
| "mean_token_accuracy": 0.7615065831691027, |
| "num_tokens": 27120617.0, |
| "step": 1020 |
| }, |
| { |
| "entropy": 0.8820719934999943, |
| "epoch": 1.8332035161900522, |
| "grad_norm": 19.25, |
| "learning_rate": 1.46e-05, |
| "loss": 14.1691, |
| "mean_token_accuracy": 0.7735363423824311, |
| "num_tokens": 27391174.0, |
| "step": 1030 |
| }, |
| { |
| "entropy": 0.8907686296850443, |
| "epoch": 1.8510070101257372, |
| "grad_norm": 16.875, |
| "learning_rate": 1.4525925925925927e-05, |
| "loss": 14.4161, |
| "mean_token_accuracy": 0.7694438762962819, |
| "num_tokens": 27656941.0, |
| "step": 1040 |
| }, |
| { |
| "entropy": 0.8787342315539718, |
| "epoch": 1.868810504061422, |
| "grad_norm": 20.25, |
| "learning_rate": 1.4451851851851852e-05, |
| "loss": 14.0692, |
| "mean_token_accuracy": 0.7733918130397797, |
| "num_tokens": 27927118.0, |
| "step": 1050 |
| }, |
| { |
| "entropy": 0.858662880398333, |
| "epoch": 1.8866139979971068, |
| "grad_norm": 17.75, |
| "learning_rate": 1.4377777777777779e-05, |
| "loss": 13.7913, |
| "mean_token_accuracy": 0.777552730217576, |
| "num_tokens": 28192620.0, |
| "step": 1060 |
| }, |
| { |
| "entropy": 0.8899114480242133, |
| "epoch": 1.9044174919327919, |
| "grad_norm": 18.125, |
| "learning_rate": 1.4303703703703703e-05, |
| "loss": 14.3082, |
| "mean_token_accuracy": 0.7708229344338179, |
| "num_tokens": 28458168.0, |
| "step": 1070 |
| }, |
| { |
| "entropy": 0.8925451099872589, |
| "epoch": 1.9222209858684767, |
| "grad_norm": 19.375, |
| "learning_rate": 1.4229629629629632e-05, |
| "loss": 14.395, |
| "mean_token_accuracy": 0.7694888945668936, |
| "num_tokens": 28722756.0, |
| "step": 1080 |
| }, |
| { |
| "entropy": 0.8925545170903206, |
| "epoch": 1.9400244798041615, |
| "grad_norm": 21.375, |
| "learning_rate": 1.4155555555555556e-05, |
| "loss": 14.3352, |
| "mean_token_accuracy": 0.7692101448774338, |
| "num_tokens": 28978375.0, |
| "step": 1090 |
| }, |
| { |
| "entropy": 0.9070079475641251, |
| "epoch": 1.9578279737398465, |
| "grad_norm": 19.0, |
| "learning_rate": 1.4081481481481483e-05, |
| "loss": 14.5717, |
| "mean_token_accuracy": 0.7671146191656589, |
| "num_tokens": 29236865.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.9578279737398465, |
| "eval_biology_entropy": 2.179805955886841, |
| "eval_biology_loss": 2.415015697479248, |
| "eval_biology_mean_token_accuracy": 0.541545505285263, |
| "eval_biology_num_tokens": 29236865.0, |
| "eval_biology_runtime": 50.0507, |
| "eval_biology_samples_per_second": 9.99, |
| "eval_biology_steps_per_second": 2.497, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.9578279737398465, |
| "eval_chemistry_entropy": 0.9021619951725006, |
| "eval_chemistry_loss": 0.9210672378540039, |
| "eval_chemistry_mean_token_accuracy": 0.7648204522132873, |
| "eval_chemistry_num_tokens": 29236865.0, |
| "eval_chemistry_runtime": 62.055, |
| "eval_chemistry_samples_per_second": 8.057, |
| "eval_chemistry_steps_per_second": 2.014, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.9578279737398465, |
| "eval_math_entropy": 0.9049963767528534, |
| "eval_math_loss": 1.1506003141403198, |
| "eval_math_mean_token_accuracy": 0.7379972767829895, |
| "eval_math_num_tokens": 29236865.0, |
| "eval_math_runtime": 63.8921, |
| "eval_math_samples_per_second": 7.826, |
| "eval_math_steps_per_second": 1.956, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.9578279737398465, |
| "eval_physics_entropy": 0.9570987558364868, |
| "eval_physics_loss": 1.0145933628082275, |
| "eval_physics_mean_token_accuracy": 0.7539256653785705, |
| "eval_physics_num_tokens": 29236865.0, |
| "eval_physics_runtime": 72.5008, |
| "eval_physics_samples_per_second": 6.896, |
| "eval_physics_steps_per_second": 1.724, |
| "step": 1100 |
| }, |
| { |
| "entropy": 0.8772769860923291, |
| "epoch": 1.9756314676755313, |
| "grad_norm": 18.75, |
| "learning_rate": 1.400740740740741e-05, |
| "loss": 14.14, |
| "mean_token_accuracy": 0.7737226475030183, |
| "num_tokens": 29500428.0, |
| "step": 1110 |
| }, |
| { |
| "entropy": 0.8800176231190562, |
| "epoch": 1.9934349616112161, |
| "grad_norm": 19.5, |
| "learning_rate": 1.3933333333333334e-05, |
| "loss": 14.1546, |
| "mean_token_accuracy": 0.7724078316241503, |
| "num_tokens": 29766739.0, |
| "step": 1120 |
| }, |
| { |
| "entropy": 0.8824025586728127, |
| "epoch": 2.010682096361411, |
| "grad_norm": 17.5, |
| "learning_rate": 1.385925925925926e-05, |
| "loss": 13.5102, |
| "mean_token_accuracy": 0.7743466096539651, |
| "num_tokens": 30025427.0, |
| "step": 1130 |
| }, |
| { |
| "entropy": 0.8490588096901774, |
| "epoch": 2.028485590297096, |
| "grad_norm": 18.25, |
| "learning_rate": 1.3785185185185186e-05, |
| "loss": 13.6634, |
| "mean_token_accuracy": 0.7773969031870365, |
| "num_tokens": 30286651.0, |
| "step": 1140 |
| }, |
| { |
| "entropy": 0.8616349579766392, |
| "epoch": 2.0462890842327806, |
| "grad_norm": 18.0, |
| "learning_rate": 1.3711111111111112e-05, |
| "loss": 13.9357, |
| "mean_token_accuracy": 0.7747346997261048, |
| "num_tokens": 30552624.0, |
| "step": 1150 |
| }, |
| { |
| "entropy": 0.8497168300673366, |
| "epoch": 2.0640925781684656, |
| "grad_norm": 23.625, |
| "learning_rate": 1.3637037037037037e-05, |
| "loss": 13.6363, |
| "mean_token_accuracy": 0.778491435945034, |
| "num_tokens": 30823746.0, |
| "step": 1160 |
| }, |
| { |
| "entropy": 0.9048545140773058, |
| "epoch": 2.0818960721041506, |
| "grad_norm": 20.0, |
| "learning_rate": 1.3562962962962965e-05, |
| "loss": 14.5493, |
| "mean_token_accuracy": 0.7658125407993793, |
| "num_tokens": 31076307.0, |
| "step": 1170 |
| }, |
| { |
| "entropy": 0.8968250282108784, |
| "epoch": 2.099699566039835, |
| "grad_norm": 20.25, |
| "learning_rate": 1.3488888888888888e-05, |
| "loss": 14.3439, |
| "mean_token_accuracy": 0.76846056394279, |
| "num_tokens": 31340763.0, |
| "step": 1180 |
| }, |
| { |
| "entropy": 0.8452551759779453, |
| "epoch": 2.1175030599755202, |
| "grad_norm": 18.5, |
| "learning_rate": 1.3414814814814817e-05, |
| "loss": 13.709, |
| "mean_token_accuracy": 0.7770821250975132, |
| "num_tokens": 31607200.0, |
| "step": 1190 |
| }, |
| { |
| "entropy": 0.8926042189821601, |
| "epoch": 2.1353065539112053, |
| "grad_norm": 20.5, |
| "learning_rate": 1.3340740740740741e-05, |
| "loss": 14.1988, |
| "mean_token_accuracy": 0.770186011493206, |
| "num_tokens": 31874392.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.1353065539112053, |
| "eval_biology_entropy": 2.109283407211304, |
| "eval_biology_loss": 2.4239776134490967, |
| "eval_biology_mean_token_accuracy": 0.5418779618740082, |
| "eval_biology_num_tokens": 31874392.0, |
| "eval_biology_runtime": 50.1375, |
| "eval_biology_samples_per_second": 9.973, |
| "eval_biology_steps_per_second": 2.493, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.1353065539112053, |
| "eval_chemistry_entropy": 0.8663334174156189, |
| "eval_chemistry_loss": 0.9175812602043152, |
| "eval_chemistry_mean_token_accuracy": 0.7658901815414428, |
| "eval_chemistry_num_tokens": 31874392.0, |
| "eval_chemistry_runtime": 62.1713, |
| "eval_chemistry_samples_per_second": 8.042, |
| "eval_chemistry_steps_per_second": 2.011, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.1353065539112053, |
| "eval_math_entropy": 0.8795386080741883, |
| "eval_math_loss": 1.1564308404922485, |
| "eval_math_mean_token_accuracy": 0.7383615078926087, |
| "eval_math_num_tokens": 31874392.0, |
| "eval_math_runtime": 63.7584, |
| "eval_math_samples_per_second": 7.842, |
| "eval_math_steps_per_second": 1.961, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.1353065539112053, |
| "eval_physics_entropy": 0.9255342264175415, |
| "eval_physics_loss": 1.0153311491012573, |
| "eval_physics_mean_token_accuracy": 0.7541386022567749, |
| "eval_physics_num_tokens": 31874392.0, |
| "eval_physics_runtime": 72.5996, |
| "eval_physics_samples_per_second": 6.887, |
| "eval_physics_steps_per_second": 1.722, |
| "step": 1200 |
| }, |
| { |
| "entropy": 0.8700024420395494, |
| "epoch": 2.15311004784689, |
| "grad_norm": 18.25, |
| "learning_rate": 1.3266666666666668e-05, |
| "loss": 14.028, |
| "mean_token_accuracy": 0.773828399553895, |
| "num_tokens": 32144016.0, |
| "step": 1210 |
| }, |
| { |
| "entropy": 0.8418688148260116, |
| "epoch": 2.170913541782575, |
| "grad_norm": 18.0, |
| "learning_rate": 1.3192592592592594e-05, |
| "loss": 13.5646, |
| "mean_token_accuracy": 0.7806433077901602, |
| "num_tokens": 32410902.0, |
| "step": 1220 |
| }, |
| { |
| "entropy": 0.8487472753971815, |
| "epoch": 2.18871703571826, |
| "grad_norm": 20.0, |
| "learning_rate": 1.311851851851852e-05, |
| "loss": 13.5766, |
| "mean_token_accuracy": 0.7793460212647915, |
| "num_tokens": 32674338.0, |
| "step": 1230 |
| }, |
| { |
| "entropy": 0.8806360449641943, |
| "epoch": 2.2065205296539445, |
| "grad_norm": 17.375, |
| "learning_rate": 1.3044444444444446e-05, |
| "loss": 14.2088, |
| "mean_token_accuracy": 0.7727648038417101, |
| "num_tokens": 32930219.0, |
| "step": 1240 |
| }, |
| { |
| "entropy": 0.8748544387519359, |
| "epoch": 2.2243240235896296, |
| "grad_norm": 18.375, |
| "learning_rate": 1.297037037037037e-05, |
| "loss": 13.9727, |
| "mean_token_accuracy": 0.773737746104598, |
| "num_tokens": 33194318.0, |
| "step": 1250 |
| }, |
| { |
| "entropy": 0.863721789419651, |
| "epoch": 2.242127517525314, |
| "grad_norm": 22.0, |
| "learning_rate": 1.2896296296296299e-05, |
| "loss": 13.9042, |
| "mean_token_accuracy": 0.7760524280369282, |
| "num_tokens": 33453292.0, |
| "step": 1260 |
| }, |
| { |
| "entropy": 0.8142619809135795, |
| "epoch": 2.259931011460999, |
| "grad_norm": 19.875, |
| "learning_rate": 1.2822222222222222e-05, |
| "loss": 13.1217, |
| "mean_token_accuracy": 0.7859312605112792, |
| "num_tokens": 33728875.0, |
| "step": 1270 |
| }, |
| { |
| "entropy": 0.8621509104967118, |
| "epoch": 2.277734505396684, |
| "grad_norm": 19.375, |
| "learning_rate": 1.274814814814815e-05, |
| "loss": 13.8532, |
| "mean_token_accuracy": 0.7752248857170343, |
| "num_tokens": 33999305.0, |
| "step": 1280 |
| }, |
| { |
| "entropy": 0.8649333998560905, |
| "epoch": 2.295537999332369, |
| "grad_norm": 18.875, |
| "learning_rate": 1.2674074074074075e-05, |
| "loss": 13.8033, |
| "mean_token_accuracy": 0.7762492351233959, |
| "num_tokens": 34269727.0, |
| "step": 1290 |
| }, |
| { |
| "entropy": 0.8091897923499346, |
| "epoch": 2.313341493268054, |
| "grad_norm": 18.625, |
| "learning_rate": 1.2600000000000001e-05, |
| "loss": 13.1186, |
| "mean_token_accuracy": 0.78697599619627, |
| "num_tokens": 34537714.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.313341493268054, |
| "eval_biology_entropy": 2.1390477933883667, |
| "eval_biology_loss": 2.4192566871643066, |
| "eval_biology_mean_token_accuracy": 0.5423436806201934, |
| "eval_biology_num_tokens": 34537714.0, |
| "eval_biology_runtime": 50.1191, |
| "eval_biology_samples_per_second": 9.976, |
| "eval_biology_steps_per_second": 2.494, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.313341493268054, |
| "eval_chemistry_entropy": 0.8768635582923889, |
| "eval_chemistry_loss": 0.9130807518959045, |
| "eval_chemistry_mean_token_accuracy": 0.7667708139419556, |
| "eval_chemistry_num_tokens": 34537714.0, |
| "eval_chemistry_runtime": 62.0908, |
| "eval_chemistry_samples_per_second": 8.053, |
| "eval_chemistry_steps_per_second": 2.013, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.313341493268054, |
| "eval_math_entropy": 0.8873134336471558, |
| "eval_math_loss": 1.155697226524353, |
| "eval_math_mean_token_accuracy": 0.7382589092254639, |
| "eval_math_num_tokens": 34537714.0, |
| "eval_math_runtime": 63.7547, |
| "eval_math_samples_per_second": 7.843, |
| "eval_math_steps_per_second": 1.961, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.313341493268054, |
| "eval_physics_entropy": 0.9338676514625549, |
| "eval_physics_loss": 1.0136737823486328, |
| "eval_physics_mean_token_accuracy": 0.7543980851173401, |
| "eval_physics_num_tokens": 34537714.0, |
| "eval_physics_runtime": 72.6478, |
| "eval_physics_samples_per_second": 6.883, |
| "eval_physics_steps_per_second": 1.721, |
| "step": 1300 |
| }, |
| { |
| "entropy": 0.878282749839127, |
| "epoch": 2.331144987203739, |
| "grad_norm": 16.875, |
| "learning_rate": 1.2525925925925928e-05, |
| "loss": 14.1127, |
| "mean_token_accuracy": 0.7737123489379882, |
| "num_tokens": 34806647.0, |
| "step": 1310 |
| }, |
| { |
| "entropy": 0.8573022728785873, |
| "epoch": 2.3489484811394234, |
| "grad_norm": 17.625, |
| "learning_rate": 1.2451851851851853e-05, |
| "loss": 13.7362, |
| "mean_token_accuracy": 0.7762724231928587, |
| "num_tokens": 35076423.0, |
| "step": 1320 |
| }, |
| { |
| "entropy": 0.8828617952764034, |
| "epoch": 2.3667519750751085, |
| "grad_norm": 18.0, |
| "learning_rate": 1.237777777777778e-05, |
| "loss": 14.1807, |
| "mean_token_accuracy": 0.7704649318009615, |
| "num_tokens": 35343494.0, |
| "step": 1330 |
| }, |
| { |
| "entropy": 0.8566931948065758, |
| "epoch": 2.3845554690107935, |
| "grad_norm": 19.75, |
| "learning_rate": 1.2303703703703704e-05, |
| "loss": 13.8103, |
| "mean_token_accuracy": 0.7779654558748007, |
| "num_tokens": 35614731.0, |
| "step": 1340 |
| }, |
| { |
| "entropy": 0.873189901188016, |
| "epoch": 2.402358962946478, |
| "grad_norm": 20.125, |
| "learning_rate": 1.222962962962963e-05, |
| "loss": 13.9472, |
| "mean_token_accuracy": 0.7743974085897207, |
| "num_tokens": 35877264.0, |
| "step": 1350 |
| }, |
| { |
| "entropy": 0.8366567937657237, |
| "epoch": 2.420162456882163, |
| "grad_norm": 17.625, |
| "learning_rate": 1.2155555555555555e-05, |
| "loss": 13.4518, |
| "mean_token_accuracy": 0.7802697144448757, |
| "num_tokens": 36147587.0, |
| "step": 1360 |
| }, |
| { |
| "entropy": 0.8612970801070332, |
| "epoch": 2.437965950817848, |
| "grad_norm": 19.875, |
| "learning_rate": 1.2081481481481484e-05, |
| "loss": 13.855, |
| "mean_token_accuracy": 0.7769082084298133, |
| "num_tokens": 36416125.0, |
| "step": 1370 |
| }, |
| { |
| "entropy": 0.8623234683647751, |
| "epoch": 2.4557694447535328, |
| "grad_norm": 19.875, |
| "learning_rate": 1.2007407407407408e-05, |
| "loss": 13.7992, |
| "mean_token_accuracy": 0.7752144418656826, |
| "num_tokens": 36680383.0, |
| "step": 1380 |
| }, |
| { |
| "entropy": 0.8542416779324412, |
| "epoch": 2.473572938689218, |
| "grad_norm": 19.75, |
| "learning_rate": 1.1933333333333335e-05, |
| "loss": 13.816, |
| "mean_token_accuracy": 0.7753123745322228, |
| "num_tokens": 36946349.0, |
| "step": 1390 |
| }, |
| { |
| "entropy": 0.8324566734954715, |
| "epoch": 2.491376432624903, |
| "grad_norm": 19.0, |
| "learning_rate": 1.185925925925926e-05, |
| "loss": 13.3353, |
| "mean_token_accuracy": 0.7832330510020256, |
| "num_tokens": 37219532.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.491376432624903, |
| "eval_biology_entropy": 2.1239566135406496, |
| "eval_biology_loss": 2.4204819202423096, |
| "eval_biology_mean_token_accuracy": 0.5423426122665406, |
| "eval_biology_num_tokens": 37219532.0, |
| "eval_biology_runtime": 50.0793, |
| "eval_biology_samples_per_second": 9.984, |
| "eval_biology_steps_per_second": 2.496, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.491376432624903, |
| "eval_chemistry_entropy": 0.8717419400215148, |
| "eval_chemistry_loss": 0.9099505543708801, |
| "eval_chemistry_mean_token_accuracy": 0.767410210609436, |
| "eval_chemistry_num_tokens": 37219532.0, |
| "eval_chemistry_runtime": 62.0201, |
| "eval_chemistry_samples_per_second": 8.062, |
| "eval_chemistry_steps_per_second": 2.015, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.491376432624903, |
| "eval_math_entropy": 0.8895688972473145, |
| "eval_math_loss": 1.1530780792236328, |
| "eval_math_mean_token_accuracy": 0.7385376715660095, |
| "eval_math_num_tokens": 37219532.0, |
| "eval_math_runtime": 63.7093, |
| "eval_math_samples_per_second": 7.848, |
| "eval_math_steps_per_second": 1.962, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.491376432624903, |
| "eval_physics_entropy": 0.9295858397483826, |
| "eval_physics_loss": 1.011209487915039, |
| "eval_physics_mean_token_accuracy": 0.754787941455841, |
| "eval_physics_num_tokens": 37219532.0, |
| "eval_physics_runtime": 72.4466, |
| "eval_physics_samples_per_second": 6.902, |
| "eval_physics_steps_per_second": 1.725, |
| "step": 1400 |
| }, |
| { |
| "entropy": 0.8524840004742146, |
| "epoch": 2.5091799265605874, |
| "grad_norm": 17.0, |
| "learning_rate": 1.1785185185185186e-05, |
| "loss": 13.6708, |
| "mean_token_accuracy": 0.7797769341617823, |
| "num_tokens": 37477914.0, |
| "step": 1410 |
| }, |
| { |
| "entropy": 0.8495853399857879, |
| "epoch": 2.5269834204962724, |
| "grad_norm": 19.125, |
| "learning_rate": 1.1711111111111113e-05, |
| "loss": 13.6575, |
| "mean_token_accuracy": 0.7779612559825182, |
| "num_tokens": 37746690.0, |
| "step": 1420 |
| }, |
| { |
| "entropy": 0.8845419194549322, |
| "epoch": 2.544786914431957, |
| "grad_norm": 22.375, |
| "learning_rate": 1.1637037037037037e-05, |
| "loss": 14.2323, |
| "mean_token_accuracy": 0.7705770123749971, |
| "num_tokens": 38005792.0, |
| "step": 1430 |
| }, |
| { |
| "entropy": 0.8683915238827467, |
| "epoch": 2.562590408367642, |
| "grad_norm": 18.875, |
| "learning_rate": 1.1562962962962964e-05, |
| "loss": 14.0432, |
| "mean_token_accuracy": 0.7738983232527972, |
| "num_tokens": 38269021.0, |
| "step": 1440 |
| }, |
| { |
| "entropy": 0.8635239489376545, |
| "epoch": 2.580393902303327, |
| "grad_norm": 19.125, |
| "learning_rate": 1.1488888888888889e-05, |
| "loss": 13.7854, |
| "mean_token_accuracy": 0.7758109841495753, |
| "num_tokens": 38545224.0, |
| "step": 1450 |
| }, |
| { |
| "entropy": 0.8527183417230845, |
| "epoch": 2.598197396239012, |
| "grad_norm": 19.25, |
| "learning_rate": 1.1414814814814817e-05, |
| "loss": 13.7366, |
| "mean_token_accuracy": 0.7779222760349512, |
| "num_tokens": 38802290.0, |
| "step": 1460 |
| }, |
| { |
| "entropy": 0.8462369810789824, |
| "epoch": 2.6160008901746967, |
| "grad_norm": 18.75, |
| "learning_rate": 1.1340740740740742e-05, |
| "loss": 13.5523, |
| "mean_token_accuracy": 0.7808895532041789, |
| "num_tokens": 39065787.0, |
| "step": 1470 |
| }, |
| { |
| "entropy": 0.8484307693317532, |
| "epoch": 2.6338043841103818, |
| "grad_norm": 20.0, |
| "learning_rate": 1.1266666666666668e-05, |
| "loss": 13.6908, |
| "mean_token_accuracy": 0.7780116025358439, |
| "num_tokens": 39328517.0, |
| "step": 1480 |
| }, |
| { |
| "entropy": 0.8285580720752478, |
| "epoch": 2.6516078780460663, |
| "grad_norm": 18.625, |
| "learning_rate": 1.1192592592592593e-05, |
| "loss": 13.2537, |
| "mean_token_accuracy": 0.7836204443126917, |
| "num_tokens": 39600641.0, |
| "step": 1490 |
| }, |
| { |
| "entropy": 0.8712879396975041, |
| "epoch": 2.6694113719817514, |
| "grad_norm": 22.625, |
| "learning_rate": 1.111851851851852e-05, |
| "loss": 14.0783, |
| "mean_token_accuracy": 0.7724885962903499, |
| "num_tokens": 39857917.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.6694113719817514, |
| "eval_biology_entropy": 2.1526661901474, |
| "eval_biology_loss": 2.407249927520752, |
| "eval_biology_mean_token_accuracy": 0.5433344347476959, |
| "eval_biology_num_tokens": 39857917.0, |
| "eval_biology_runtime": 50.0583, |
| "eval_biology_samples_per_second": 9.988, |
| "eval_biology_steps_per_second": 2.497, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.6694113719817514, |
| "eval_chemistry_entropy": 0.8803858659267425, |
| "eval_chemistry_loss": 0.9068695306777954, |
| "eval_chemistry_mean_token_accuracy": 0.7678995361328125, |
| "eval_chemistry_num_tokens": 39857917.0, |
| "eval_chemistry_runtime": 61.7716, |
| "eval_chemistry_samples_per_second": 8.094, |
| "eval_chemistry_steps_per_second": 2.024, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.6694113719817514, |
| "eval_math_entropy": 0.8964290325641632, |
| "eval_math_loss": 1.149368405342102, |
| "eval_math_mean_token_accuracy": 0.7390133018493652, |
| "eval_math_num_tokens": 39857917.0, |
| "eval_math_runtime": 63.6923, |
| "eval_math_samples_per_second": 7.85, |
| "eval_math_steps_per_second": 1.963, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.6694113719817514, |
| "eval_physics_entropy": 0.9390261335372925, |
| "eval_physics_loss": 1.0092867612838745, |
| "eval_physics_mean_token_accuracy": 0.7551890163421631, |
| "eval_physics_num_tokens": 39857917.0, |
| "eval_physics_runtime": 72.5983, |
| "eval_physics_samples_per_second": 6.887, |
| "eval_physics_steps_per_second": 1.722, |
| "step": 1500 |
| }, |
| { |
| "entropy": 0.8572983698919415, |
| "epoch": 2.6872148659174364, |
| "grad_norm": 20.25, |
| "learning_rate": 1.1044444444444444e-05, |
| "loss": 13.7252, |
| "mean_token_accuracy": 0.7782326828688383, |
| "num_tokens": 40119174.0, |
| "step": 1510 |
| }, |
| { |
| "entropy": 0.8729016464203596, |
| "epoch": 2.7050183598531214, |
| "grad_norm": 20.125, |
| "learning_rate": 1.0970370370370371e-05, |
| "loss": 14.1488, |
| "mean_token_accuracy": 0.7719659119844436, |
| "num_tokens": 40379054.0, |
| "step": 1520 |
| }, |
| { |
| "entropy": 0.8609336314722895, |
| "epoch": 2.722821853788806, |
| "grad_norm": 19.0, |
| "learning_rate": 1.0896296296296298e-05, |
| "loss": 13.8104, |
| "mean_token_accuracy": 0.7757605772465468, |
| "num_tokens": 40650970.0, |
| "step": 1530 |
| }, |
| { |
| "entropy": 0.8696575410664081, |
| "epoch": 2.740625347724491, |
| "grad_norm": 18.625, |
| "learning_rate": 1.0822222222222222e-05, |
| "loss": 13.918, |
| "mean_token_accuracy": 0.7764236349612474, |
| "num_tokens": 40914339.0, |
| "step": 1540 |
| }, |
| { |
| "entropy": 0.8541526485234499, |
| "epoch": 2.7584288416601757, |
| "grad_norm": 18.375, |
| "learning_rate": 1.074814814814815e-05, |
| "loss": 13.7742, |
| "mean_token_accuracy": 0.7751550409942866, |
| "num_tokens": 41181827.0, |
| "step": 1550 |
| }, |
| { |
| "entropy": 0.8419796552509069, |
| "epoch": 2.7762323355958607, |
| "grad_norm": 21.0, |
| "learning_rate": 1.0674074074074074e-05, |
| "loss": 13.5073, |
| "mean_token_accuracy": 0.7807594135403633, |
| "num_tokens": 41448961.0, |
| "step": 1560 |
| }, |
| { |
| "entropy": 0.8490352286025882, |
| "epoch": 2.7940358295315457, |
| "grad_norm": 19.75, |
| "learning_rate": 1.0600000000000002e-05, |
| "loss": 13.5743, |
| "mean_token_accuracy": 0.7788564160466194, |
| "num_tokens": 41712243.0, |
| "step": 1570 |
| }, |
| { |
| "entropy": 0.8538101438432932, |
| "epoch": 2.8118393234672303, |
| "grad_norm": 17.125, |
| "learning_rate": 1.0525925925925927e-05, |
| "loss": 13.8208, |
| "mean_token_accuracy": 0.7763843528926373, |
| "num_tokens": 41981755.0, |
| "step": 1580 |
| }, |
| { |
| "entropy": 0.8148689167574048, |
| "epoch": 2.8296428174029153, |
| "grad_norm": 17.375, |
| "learning_rate": 1.0451851851851853e-05, |
| "loss": 13.1774, |
| "mean_token_accuracy": 0.7859942618757486, |
| "num_tokens": 42258270.0, |
| "step": 1590 |
| }, |
| { |
| "entropy": 0.8575807249173522, |
| "epoch": 2.8474463113386, |
| "grad_norm": 19.0, |
| "learning_rate": 1.0377777777777778e-05, |
| "loss": 13.7128, |
| "mean_token_accuracy": 0.7790150199085474, |
| "num_tokens": 42529321.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.8474463113386, |
| "eval_biology_entropy": 2.138750693321228, |
| "eval_biology_loss": 2.4031267166137695, |
| "eval_biology_mean_token_accuracy": 0.5439568190574646, |
| "eval_biology_num_tokens": 42529321.0, |
| "eval_biology_runtime": 50.409, |
| "eval_biology_samples_per_second": 9.919, |
| "eval_biology_steps_per_second": 2.48, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.8474463113386, |
| "eval_chemistry_entropy": 0.8717709865570068, |
| "eval_chemistry_loss": 0.9044163823127747, |
| "eval_chemistry_mean_token_accuracy": 0.7687469787597656, |
| "eval_chemistry_num_tokens": 42529321.0, |
| "eval_chemistry_runtime": 62.1654, |
| "eval_chemistry_samples_per_second": 8.043, |
| "eval_chemistry_steps_per_second": 2.011, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.8474463113386, |
| "eval_math_entropy": 0.8888742222785949, |
| "eval_math_loss": 1.1471683979034424, |
| "eval_math_mean_token_accuracy": 0.7393438844680786, |
| "eval_math_num_tokens": 42529321.0, |
| "eval_math_runtime": 63.7473, |
| "eval_math_samples_per_second": 7.843, |
| "eval_math_steps_per_second": 1.961, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.8474463113386, |
| "eval_physics_entropy": 0.9300240802764893, |
| "eval_physics_loss": 1.0057083368301392, |
| "eval_physics_mean_token_accuracy": 0.756029381275177, |
| "eval_physics_num_tokens": 42529321.0, |
| "eval_physics_runtime": 72.6285, |
| "eval_physics_samples_per_second": 6.884, |
| "eval_physics_steps_per_second": 1.721, |
| "step": 1600 |
| }, |
| { |
| "entropy": 0.8835793185979128, |
| "epoch": 2.865249805274285, |
| "grad_norm": 20.125, |
| "learning_rate": 1.0303703703703705e-05, |
| "loss": 14.2433, |
| "mean_token_accuracy": 0.7716228555887937, |
| "num_tokens": 42793729.0, |
| "step": 1610 |
| }, |
| { |
| "entropy": 0.8588731488212943, |
| "epoch": 2.88305329920997, |
| "grad_norm": 18.25, |
| "learning_rate": 1.0229629629629631e-05, |
| "loss": 13.69, |
| "mean_token_accuracy": 0.7786399811506272, |
| "num_tokens": 43067006.0, |
| "step": 1620 |
| }, |
| { |
| "entropy": 0.8394029337912798, |
| "epoch": 2.900856793145655, |
| "grad_norm": 22.0, |
| "learning_rate": 1.0155555555555556e-05, |
| "loss": 13.5472, |
| "mean_token_accuracy": 0.7803862724453211, |
| "num_tokens": 43333868.0, |
| "step": 1630 |
| }, |
| { |
| "entropy": 0.8286140831187367, |
| "epoch": 2.9186602870813396, |
| "grad_norm": 19.625, |
| "learning_rate": 1.0081481481481484e-05, |
| "loss": 13.3561, |
| "mean_token_accuracy": 0.7823506712913513, |
| "num_tokens": 43605151.0, |
| "step": 1640 |
| }, |
| { |
| "entropy": 0.8372505877166987, |
| "epoch": 2.9364637810170247, |
| "grad_norm": 19.5, |
| "learning_rate": 1.0007407407407407e-05, |
| "loss": 13.4152, |
| "mean_token_accuracy": 0.7829842563718558, |
| "num_tokens": 43863034.0, |
| "step": 1650 |
| }, |
| { |
| "entropy": 0.8628016563132406, |
| "epoch": 2.9542672749527092, |
| "grad_norm": 19.875, |
| "learning_rate": 9.933333333333334e-06, |
| "loss": 13.8578, |
| "mean_token_accuracy": 0.7742777109146118, |
| "num_tokens": 44126796.0, |
| "step": 1660 |
| }, |
| { |
| "entropy": 0.8661557337269187, |
| "epoch": 2.9720707688883943, |
| "grad_norm": 19.875, |
| "learning_rate": 9.85925925925926e-06, |
| "loss": 13.953, |
| "mean_token_accuracy": 0.7748874224722385, |
| "num_tokens": 44382724.0, |
| "step": 1670 |
| }, |
| { |
| "entropy": 0.8431165425106883, |
| "epoch": 2.9898742628240793, |
| "grad_norm": 18.375, |
| "learning_rate": 9.785185185185187e-06, |
| "loss": 13.4954, |
| "mean_token_accuracy": 0.7824627134948969, |
| "num_tokens": 44651471.0, |
| "step": 1680 |
| }, |
| { |
| "entropy": 0.830341524078, |
| "epoch": 3.007121397574274, |
| "grad_norm": 17.5, |
| "learning_rate": 9.711111111111111e-06, |
| "loss": 12.8615, |
| "mean_token_accuracy": 0.7825036860281421, |
| "num_tokens": 44909981.0, |
| "step": 1690 |
| }, |
| { |
| "entropy": 0.89626778755337, |
| "epoch": 3.0249248915099587, |
| "grad_norm": 20.75, |
| "learning_rate": 9.637037037037038e-06, |
| "loss": 14.4694, |
| "mean_token_accuracy": 0.7680662952363491, |
| "num_tokens": 45176547.0, |
| "step": 1700 |
| }, |
| { |
| "epoch": 3.0249248915099587, |
| "eval_biology_entropy": 2.1169527568817137, |
| "eval_biology_loss": 2.4101810455322266, |
| "eval_biology_mean_token_accuracy": 0.543948637008667, |
| "eval_biology_num_tokens": 45176547.0, |
| "eval_biology_runtime": 50.1675, |
| "eval_biology_samples_per_second": 9.967, |
| "eval_biology_steps_per_second": 2.492, |
| "step": 1700 |
| }, |
| { |
| "epoch": 3.0249248915099587, |
| "eval_chemistry_entropy": 0.8637330963611602, |
| "eval_chemistry_loss": 0.9029366374015808, |
| "eval_chemistry_mean_token_accuracy": 0.768845187664032, |
| "eval_chemistry_num_tokens": 45176547.0, |
| "eval_chemistry_runtime": 62.2931, |
| "eval_chemistry_samples_per_second": 8.027, |
| "eval_chemistry_steps_per_second": 2.007, |
| "step": 1700 |
| }, |
| { |
| "epoch": 3.0249248915099587, |
| "eval_math_entropy": 0.8841931850910186, |
| "eval_math_loss": 1.148743748664856, |
| "eval_math_mean_token_accuracy": 0.7394666404724121, |
| "eval_math_num_tokens": 45176547.0, |
| "eval_math_runtime": 63.8752, |
| "eval_math_samples_per_second": 7.828, |
| "eval_math_steps_per_second": 1.957, |
| "step": 1700 |
| }, |
| { |
| "epoch": 3.0249248915099587, |
| "eval_physics_entropy": 0.9223746247291565, |
| "eval_physics_loss": 1.006547212600708, |
| "eval_physics_mean_token_accuracy": 0.7558423929214477, |
| "eval_physics_num_tokens": 45176547.0, |
| "eval_physics_runtime": 72.649, |
| "eval_physics_samples_per_second": 6.882, |
| "eval_physics_steps_per_second": 1.721, |
| "step": 1700 |
| }, |
| { |
| "entropy": 0.8229222293943167, |
| "epoch": 3.0427283854456437, |
| "grad_norm": 19.75, |
| "learning_rate": 9.562962962962965e-06, |
| "loss": 13.1542, |
| "mean_token_accuracy": 0.7857496555894613, |
| "num_tokens": 45437181.0, |
| "step": 1710 |
| }, |
| { |
| "entropy": 0.8210136437788605, |
| "epoch": 3.0605318793813288, |
| "grad_norm": 21.5, |
| "learning_rate": 9.48888888888889e-06, |
| "loss": 13.1921, |
| "mean_token_accuracy": 0.7837804082781077, |
| "num_tokens": 45702431.0, |
| "step": 1720 |
| }, |
| { |
| "entropy": 0.8290200307965279, |
| "epoch": 3.0783353733170133, |
| "grad_norm": 18.125, |
| "learning_rate": 9.414814814814816e-06, |
| "loss": 13.3132, |
| "mean_token_accuracy": 0.781994580104947, |
| "num_tokens": 45975788.0, |
| "step": 1730 |
| }, |
| { |
| "entropy": 0.8178752085193992, |
| "epoch": 3.0961388672526984, |
| "grad_norm": 20.0, |
| "learning_rate": 9.34074074074074e-06, |
| "loss": 13.053, |
| "mean_token_accuracy": 0.7845240369439125, |
| "num_tokens": 46251349.0, |
| "step": 1740 |
| }, |
| { |
| "entropy": 0.8523287745192647, |
| "epoch": 3.1139423611883834, |
| "grad_norm": 22.875, |
| "learning_rate": 9.266666666666667e-06, |
| "loss": 13.7343, |
| "mean_token_accuracy": 0.7765240430831909, |
| "num_tokens": 46516802.0, |
| "step": 1750 |
| }, |
| { |
| "entropy": 0.8190072000026702, |
| "epoch": 3.131745855124068, |
| "grad_norm": 19.75, |
| "learning_rate": 9.192592592592594e-06, |
| "loss": 13.0846, |
| "mean_token_accuracy": 0.7857770949602128, |
| "num_tokens": 46790537.0, |
| "step": 1760 |
| }, |
| { |
| "entropy": 0.8315999880433083, |
| "epoch": 3.149549349059753, |
| "grad_norm": 20.125, |
| "learning_rate": 9.118518518518518e-06, |
| "loss": 13.5273, |
| "mean_token_accuracy": 0.7801283650100231, |
| "num_tokens": 47052877.0, |
| "step": 1770 |
| }, |
| { |
| "entropy": 0.8424408141523599, |
| "epoch": 3.167352842995438, |
| "grad_norm": 19.5, |
| "learning_rate": 9.044444444444445e-06, |
| "loss": 13.4137, |
| "mean_token_accuracy": 0.781138914451003, |
| "num_tokens": 47315587.0, |
| "step": 1780 |
| }, |
| { |
| "entropy": 0.8210773909464478, |
| "epoch": 3.1851563369311227, |
| "grad_norm": 20.875, |
| "learning_rate": 8.970370370370372e-06, |
| "loss": 13.2241, |
| "mean_token_accuracy": 0.784696988761425, |
| "num_tokens": 47578352.0, |
| "step": 1790 |
| }, |
| { |
| "entropy": 0.832705906778574, |
| "epoch": 3.2029598308668077, |
| "grad_norm": 20.25, |
| "learning_rate": 8.896296296296298e-06, |
| "loss": 13.468, |
| "mean_token_accuracy": 0.7818292014300823, |
| "num_tokens": 47844623.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 3.2029598308668077, |
| "eval_biology_entropy": 2.133446848869324, |
| "eval_biology_loss": 2.4053525924682617, |
| "eval_biology_mean_token_accuracy": 0.5437073798179627, |
| "eval_biology_num_tokens": 47844623.0, |
| "eval_biology_runtime": 50.0957, |
| "eval_biology_samples_per_second": 9.981, |
| "eval_biology_steps_per_second": 2.495, |
| "step": 1800 |
| }, |
| { |
| "epoch": 3.2029598308668077, |
| "eval_chemistry_entropy": 0.8717915427684784, |
| "eval_chemistry_loss": 0.9018566608428955, |
| "eval_chemistry_mean_token_accuracy": 0.7691599769592286, |
| "eval_chemistry_num_tokens": 47844623.0, |
| "eval_chemistry_runtime": 62.0513, |
| "eval_chemistry_samples_per_second": 8.058, |
| "eval_chemistry_steps_per_second": 2.014, |
| "step": 1800 |
| }, |
| { |
| "epoch": 3.2029598308668077, |
| "eval_math_entropy": 0.8891878733634949, |
| "eval_math_loss": 1.150004506111145, |
| "eval_math_mean_token_accuracy": 0.7392585673332215, |
| "eval_math_num_tokens": 47844623.0, |
| "eval_math_runtime": 63.6452, |
| "eval_math_samples_per_second": 7.856, |
| "eval_math_steps_per_second": 1.964, |
| "step": 1800 |
| }, |
| { |
| "epoch": 3.2029598308668077, |
| "eval_physics_entropy": 0.931395646572113, |
| "eval_physics_loss": 1.0083858966827393, |
| "eval_physics_mean_token_accuracy": 0.7555162100791931, |
| "eval_physics_num_tokens": 47844623.0, |
| "eval_physics_runtime": 72.4916, |
| "eval_physics_samples_per_second": 6.897, |
| "eval_physics_steps_per_second": 1.724, |
| "step": 1800 |
| }, |
| { |
| "entropy": 0.8242196686565876, |
| "epoch": 3.2207633248024923, |
| "grad_norm": 19.0, |
| "learning_rate": 8.822222222222223e-06, |
| "loss": 13.1731, |
| "mean_token_accuracy": 0.7852105066180229, |
| "num_tokens": 48112262.0, |
| "step": 1810 |
| }, |
| { |
| "entropy": 0.8831753600388765, |
| "epoch": 3.2385668187381773, |
| "grad_norm": 21.375, |
| "learning_rate": 8.74814814814815e-06, |
| "loss": 14.252, |
| "mean_token_accuracy": 0.7709351792931557, |
| "num_tokens": 48369097.0, |
| "step": 1820 |
| }, |
| { |
| "entropy": 0.8386433634907007, |
| "epoch": 3.2563703126738623, |
| "grad_norm": 20.0, |
| "learning_rate": 8.674074074074074e-06, |
| "loss": 13.4078, |
| "mean_token_accuracy": 0.7835471354424953, |
| "num_tokens": 48628165.0, |
| "step": 1830 |
| }, |
| { |
| "entropy": 0.8319166716188192, |
| "epoch": 3.274173806609547, |
| "grad_norm": 18.75, |
| "learning_rate": 8.6e-06, |
| "loss": 13.3578, |
| "mean_token_accuracy": 0.7818007972091436, |
| "num_tokens": 48898634.0, |
| "step": 1840 |
| }, |
| { |
| "entropy": 0.8234225831925869, |
| "epoch": 3.291977300545232, |
| "grad_norm": 18.125, |
| "learning_rate": 8.525925925925927e-06, |
| "loss": 13.2262, |
| "mean_token_accuracy": 0.782720947638154, |
| "num_tokens": 49167839.0, |
| "step": 1850 |
| }, |
| { |
| "entropy": 0.8423925707116723, |
| "epoch": 3.309780794480917, |
| "grad_norm": 19.625, |
| "learning_rate": 8.451851851851852e-06, |
| "loss": 13.5985, |
| "mean_token_accuracy": 0.7810371067374945, |
| "num_tokens": 49431378.0, |
| "step": 1860 |
| }, |
| { |
| "entropy": 0.8335220757871866, |
| "epoch": 3.3275842884166016, |
| "grad_norm": 18.25, |
| "learning_rate": 8.377777777777779e-06, |
| "loss": 13.3981, |
| "mean_token_accuracy": 0.7828397914767266, |
| "num_tokens": 49699309.0, |
| "step": 1870 |
| }, |
| { |
| "entropy": 0.8416358495131135, |
| "epoch": 3.3453877823522866, |
| "grad_norm": 18.75, |
| "learning_rate": 8.303703703703705e-06, |
| "loss": 13.5041, |
| "mean_token_accuracy": 0.7813411135226488, |
| "num_tokens": 49972032.0, |
| "step": 1880 |
| }, |
| { |
| "entropy": 0.8490649621933699, |
| "epoch": 3.3631912762879717, |
| "grad_norm": 21.75, |
| "learning_rate": 8.229629629629632e-06, |
| "loss": 13.623, |
| "mean_token_accuracy": 0.7790623392909766, |
| "num_tokens": 50241367.0, |
| "step": 1890 |
| }, |
| { |
| "entropy": 0.8466175535693765, |
| "epoch": 3.3809947702236562, |
| "grad_norm": 19.5, |
| "learning_rate": 8.155555555555556e-06, |
| "loss": 13.5936, |
| "mean_token_accuracy": 0.777934268862009, |
| "num_tokens": 50511167.0, |
| "step": 1900 |
| }, |
| { |
| "epoch": 3.3809947702236562, |
| "eval_biology_entropy": 2.0975170640945433, |
| "eval_biology_loss": 2.4081575870513916, |
| "eval_biology_mean_token_accuracy": 0.5437231578826904, |
| "eval_biology_num_tokens": 50511167.0, |
| "eval_biology_runtime": 50.1265, |
| "eval_biology_samples_per_second": 9.975, |
| "eval_biology_steps_per_second": 2.494, |
| "step": 1900 |
| }, |
| { |
| "epoch": 3.3809947702236562, |
| "eval_chemistry_entropy": 0.85458478140831, |
| "eval_chemistry_loss": 0.9009365439414978, |
| "eval_chemistry_mean_token_accuracy": 0.7693327531814576, |
| "eval_chemistry_num_tokens": 50511167.0, |
| "eval_chemistry_runtime": 62.1624, |
| "eval_chemistry_samples_per_second": 8.043, |
| "eval_chemistry_steps_per_second": 2.011, |
| "step": 1900 |
| }, |
| { |
| "epoch": 3.3809947702236562, |
| "eval_math_entropy": 0.8750781128406525, |
| "eval_math_loss": 1.1519843339920044, |
| "eval_math_mean_token_accuracy": 0.7391414003372192, |
| "eval_math_num_tokens": 50511167.0, |
| "eval_math_runtime": 63.7559, |
| "eval_math_samples_per_second": 7.842, |
| "eval_math_steps_per_second": 1.961, |
| "step": 1900 |
| }, |
| { |
| "epoch": 3.3809947702236562, |
| "eval_physics_entropy": 0.9140749173164368, |
| "eval_physics_loss": 1.0088294744491577, |
| "eval_physics_mean_token_accuracy": 0.7556774320602417, |
| "eval_physics_num_tokens": 50511167.0, |
| "eval_physics_runtime": 72.622, |
| "eval_physics_samples_per_second": 6.885, |
| "eval_physics_steps_per_second": 1.721, |
| "step": 1900 |
| }, |
| { |
| "entropy": 0.8566948508843779, |
| "epoch": 3.3987982641593413, |
| "grad_norm": 21.875, |
| "learning_rate": 8.081481481481483e-06, |
| "loss": 13.8316, |
| "mean_token_accuracy": 0.7761325091123581, |
| "num_tokens": 50776842.0, |
| "step": 1910 |
| }, |
| { |
| "entropy": 0.8349349481984973, |
| "epoch": 3.4166017580950263, |
| "grad_norm": 20.25, |
| "learning_rate": 8.007407407407408e-06, |
| "loss": 13.4411, |
| "mean_token_accuracy": 0.7803341884166002, |
| "num_tokens": 51040514.0, |
| "step": 1920 |
| }, |
| { |
| "entropy": 0.8467728653922677, |
| "epoch": 3.434405252030711, |
| "grad_norm": 20.75, |
| "learning_rate": 7.933333333333334e-06, |
| "loss": 13.5566, |
| "mean_token_accuracy": 0.779913404583931, |
| "num_tokens": 51300696.0, |
| "step": 1930 |
| }, |
| { |
| "entropy": 0.8475066049024462, |
| "epoch": 3.452208745966396, |
| "grad_norm": 20.875, |
| "learning_rate": 7.859259259259259e-06, |
| "loss": 13.6519, |
| "mean_token_accuracy": 0.7775303546339274, |
| "num_tokens": 51558606.0, |
| "step": 1940 |
| }, |
| { |
| "entropy": 0.8373401986435056, |
| "epoch": 3.470012239902081, |
| "grad_norm": 18.5, |
| "learning_rate": 7.785185185185185e-06, |
| "loss": 13.4239, |
| "mean_token_accuracy": 0.7792705055326223, |
| "num_tokens": 51827834.0, |
| "step": 1950 |
| }, |
| { |
| "entropy": 0.8257303407415748, |
| "epoch": 3.4878157338377656, |
| "grad_norm": 20.75, |
| "learning_rate": 7.711111111111112e-06, |
| "loss": 13.2561, |
| "mean_token_accuracy": 0.7843583811074495, |
| "num_tokens": 52100179.0, |
| "step": 1960 |
| }, |
| { |
| "entropy": 0.8506467677652836, |
| "epoch": 3.5056192277734506, |
| "grad_norm": 19.0, |
| "learning_rate": 7.637037037037037e-06, |
| "loss": 13.6641, |
| "mean_token_accuracy": 0.778495280444622, |
| "num_tokens": 52368055.0, |
| "step": 1970 |
| }, |
| { |
| "entropy": 0.8235666550695896, |
| "epoch": 3.523422721709135, |
| "grad_norm": 20.375, |
| "learning_rate": 7.562962962962963e-06, |
| "loss": 13.2452, |
| "mean_token_accuracy": 0.7847742971032858, |
| "num_tokens": 52637505.0, |
| "step": 1980 |
| }, |
| { |
| "entropy": 0.8493829498067498, |
| "epoch": 3.54122621564482, |
| "grad_norm": 21.625, |
| "learning_rate": 7.48888888888889e-06, |
| "loss": 13.6111, |
| "mean_token_accuracy": 0.7800389505922795, |
| "num_tokens": 52906194.0, |
| "step": 1990 |
| }, |
| { |
| "entropy": 0.8078789742663502, |
| "epoch": 3.5590297095805052, |
| "grad_norm": 18.875, |
| "learning_rate": 7.4148148148148155e-06, |
| "loss": 13.0234, |
| "mean_token_accuracy": 0.7877966694533824, |
| "num_tokens": 53172376.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 3.5590297095805052, |
| "eval_biology_entropy": 2.1067893276214598, |
| "eval_biology_loss": 2.4088294506073, |
| "eval_biology_mean_token_accuracy": 0.5440908517837525, |
| "eval_biology_num_tokens": 53172376.0, |
| "eval_biology_runtime": 50.0392, |
| "eval_biology_samples_per_second": 9.992, |
| "eval_biology_steps_per_second": 2.498, |
| "step": 2000 |
| }, |
| { |
| "epoch": 3.5590297095805052, |
| "eval_chemistry_entropy": 0.8589933874607086, |
| "eval_chemistry_loss": 0.8998663425445557, |
| "eval_chemistry_mean_token_accuracy": 0.7694662809371948, |
| "eval_chemistry_num_tokens": 53172376.0, |
| "eval_chemistry_runtime": 61.876, |
| "eval_chemistry_samples_per_second": 8.081, |
| "eval_chemistry_steps_per_second": 2.02, |
| "step": 2000 |
| }, |
| { |
| "epoch": 3.5590297095805052, |
| "eval_math_entropy": 0.8766498160362244, |
| "eval_math_loss": 1.1516472101211548, |
| "eval_math_mean_token_accuracy": 0.7391332941055297, |
| "eval_math_num_tokens": 53172376.0, |
| "eval_math_runtime": 63.5933, |
| "eval_math_samples_per_second": 7.862, |
| "eval_math_steps_per_second": 1.966, |
| "step": 2000 |
| }, |
| { |
| "epoch": 3.5590297095805052, |
| "eval_physics_entropy": 0.9181702899932861, |
| "eval_physics_loss": 1.0072380304336548, |
| "eval_physics_mean_token_accuracy": 0.7556789331436157, |
| "eval_physics_num_tokens": 53172376.0, |
| "eval_physics_runtime": 72.499, |
| "eval_physics_samples_per_second": 6.897, |
| "eval_physics_steps_per_second": 1.724, |
| "step": 2000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 3000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 6, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.556064716540213e+18, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|