{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.5590297095805052, "eval_steps": 100, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.0559891855344177, "epoch": 0.017803493935684877, "grad_norm": 176.0, "learning_rate": 6.000000000000001e-07, "loss": 42.5063, "mean_token_accuracy": 0.6153968568891287, "num_tokens": 257189.0, "step": 10 }, { "entropy": 1.0444028429687022, "epoch": 0.035606987871369754, "grad_norm": 160.0, "learning_rate": 1.2666666666666669e-06, "loss": 41.7045, "mean_token_accuracy": 0.619235553778708, "num_tokens": 528285.0, "step": 20 }, { "entropy": 1.0654841650277376, "epoch": 0.053410481807054634, "grad_norm": 175.0, "learning_rate": 1.9333333333333336e-06, "loss": 41.559, "mean_token_accuracy": 0.6179817667230963, "num_tokens": 801635.0, "step": 30 }, { "entropy": 1.0894893994554877, "epoch": 0.07121397574273951, "grad_norm": 154.0, "learning_rate": 2.6e-06, "loss": 41.6138, "mean_token_accuracy": 0.6147882426157594, "num_tokens": 1060812.0, "step": 40 }, { "entropy": 1.0985747126862406, "epoch": 0.08901746967842439, "grad_norm": 141.0, "learning_rate": 3.266666666666667e-06, "loss": 39.4404, "mean_token_accuracy": 0.6201398545876146, "num_tokens": 1327380.0, "step": 50 }, { "entropy": 1.162555592879653, "epoch": 0.10682096361410927, "grad_norm": 114.0, "learning_rate": 3.9333333333333335e-06, "loss": 38.4348, "mean_token_accuracy": 0.6222015436738729, "num_tokens": 1597405.0, "step": 60 }, { "entropy": 1.3072715956717729, "epoch": 0.12462445754979415, "grad_norm": 76.5, "learning_rate": 4.600000000000001e-06, "loss": 37.3098, "mean_token_accuracy": 0.6205647233873606, "num_tokens": 1859684.0, "step": 70 }, { "entropy": 1.47952934615314, "epoch": 0.14242795148547902, "grad_norm": 54.0, "learning_rate": 5.2666666666666665e-06, "loss": 35.3269, "mean_token_accuracy": 0.6284260725602507, "num_tokens": 2118068.0, "step": 80 }, { "entropy": 1.5287989236414432, "epoch": 0.1602314454211639, "grad_norm": 37.25, "learning_rate": 5.933333333333335e-06, "loss": 31.5676, "mean_token_accuracy": 0.65506336633116, "num_tokens": 2388824.0, "step": 90 }, { "entropy": 1.630833999812603, "epoch": 0.17803493935684878, "grad_norm": 28.75, "learning_rate": 6.600000000000001e-06, "loss": 30.3566, "mean_token_accuracy": 0.65591501891613, "num_tokens": 2644330.0, "step": 100 }, { "epoch": 0.17803493935684878, "eval_biology_entropy": 3.4087772026062013, "eval_biology_loss": 3.8683884143829346, "eval_biology_mean_token_accuracy": 0.4378368608951569, "eval_biology_num_tokens": 2644330.0, "eval_biology_runtime": 57.5948, "eval_biology_samples_per_second": 8.681, "eval_biology_steps_per_second": 2.17, "step": 100 }, { "epoch": 0.17803493935684878, "eval_chemistry_entropy": 1.644574547290802, "eval_chemistry_loss": 1.8279423713684082, "eval_chemistry_mean_token_accuracy": 0.663385133266449, "eval_chemistry_num_tokens": 2644330.0, "eval_chemistry_runtime": 72.0267, "eval_chemistry_samples_per_second": 6.942, "eval_chemistry_steps_per_second": 1.735, "step": 100 }, { "epoch": 0.17803493935684878, "eval_math_entropy": 1.2650005059242249, "eval_math_loss": 1.795242428779602, "eval_math_mean_token_accuracy": 0.6832415590286255, "eval_math_num_tokens": 2644330.0, "eval_math_runtime": 75.2166, "eval_math_samples_per_second": 6.647, "eval_math_steps_per_second": 1.662, "step": 100 }, { "epoch": 0.17803493935684878, "eval_physics_entropy": 1.6315128216743469, "eval_physics_loss": 1.8273048400878906, "eval_physics_mean_token_accuracy": 0.6732162787914276, "eval_physics_num_tokens": 2644330.0, "eval_physics_runtime": 85.3066, "eval_physics_samples_per_second": 5.861, "eval_physics_steps_per_second": 1.465, "step": 100 }, { "entropy": 1.558345663547516, "epoch": 0.19583843329253367, "grad_norm": 24.375, "learning_rate": 7.266666666666668e-06, "loss": 28.2409, "mean_token_accuracy": 0.6733386002480983, "num_tokens": 2913700.0, "step": 110 }, { "entropy": 1.5497455261647701, "epoch": 0.21364192722821854, "grad_norm": 20.375, "learning_rate": 7.933333333333334e-06, "loss": 27.4174, "mean_token_accuracy": 0.678030077368021, "num_tokens": 3185255.0, "step": 120 }, { "entropy": 1.5162339597940444, "epoch": 0.2314454211639034, "grad_norm": 20.5, "learning_rate": 8.6e-06, "loss": 26.3563, "mean_token_accuracy": 0.6866102360188961, "num_tokens": 3454750.0, "step": 130 }, { "entropy": 1.5057988293468951, "epoch": 0.2492489150995883, "grad_norm": 19.375, "learning_rate": 9.266666666666667e-06, "loss": 26.0435, "mean_token_accuracy": 0.689361485093832, "num_tokens": 3719113.0, "step": 140 }, { "entropy": 1.544687245413661, "epoch": 0.26705240903527316, "grad_norm": 18.375, "learning_rate": 9.933333333333334e-06, "loss": 26.1449, "mean_token_accuracy": 0.6857006324455142, "num_tokens": 3990505.0, "step": 150 }, { "entropy": 1.4617379672825337, "epoch": 0.28485590297095803, "grad_norm": 17.25, "learning_rate": 1.0600000000000002e-05, "loss": 24.9336, "mean_token_accuracy": 0.6990214973688126, "num_tokens": 4267403.0, "step": 160 }, { "entropy": 1.45880494043231, "epoch": 0.30265939690664295, "grad_norm": 17.375, "learning_rate": 1.1266666666666668e-05, "loss": 24.5033, "mean_token_accuracy": 0.7016687501221895, "num_tokens": 4535458.0, "step": 170 }, { "entropy": 1.435447308793664, "epoch": 0.3204628908423278, "grad_norm": 20.375, "learning_rate": 1.1933333333333335e-05, "loss": 24.3963, "mean_token_accuracy": 0.7021712277084589, "num_tokens": 4796815.0, "step": 180 }, { "entropy": 1.4029231216758489, "epoch": 0.3382663847780127, "grad_norm": 16.875, "learning_rate": 1.2600000000000001e-05, "loss": 23.6953, "mean_token_accuracy": 0.7080240704119205, "num_tokens": 5066948.0, "step": 190 }, { "entropy": 1.4468292627483605, "epoch": 0.35606987871369755, "grad_norm": 17.75, "learning_rate": 1.3266666666666668e-05, "loss": 24.4323, "mean_token_accuracy": 0.698475543037057, "num_tokens": 5324751.0, "step": 200 }, { "epoch": 0.35606987871369755, "eval_biology_entropy": 2.8836843852996825, "eval_biology_loss": 3.357414722442627, "eval_biology_mean_token_accuracy": 0.48108551049232484, "eval_biology_num_tokens": 5324751.0, "eval_biology_runtime": 57.5893, "eval_biology_samples_per_second": 8.682, "eval_biology_steps_per_second": 2.171, "step": 200 }, { "epoch": 0.35606987871369755, "eval_chemistry_entropy": 1.4061724228858947, "eval_chemistry_loss": 1.478424072265625, "eval_chemistry_mean_token_accuracy": 0.7063784518241882, "eval_chemistry_num_tokens": 5324751.0, "eval_chemistry_runtime": 72.1317, "eval_chemistry_samples_per_second": 6.932, "eval_chemistry_steps_per_second": 1.733, "step": 200 }, { "epoch": 0.35606987871369755, "eval_math_entropy": 1.2177042779922485, "eval_math_loss": 1.5454598665237427, "eval_math_mean_token_accuracy": 0.7047145981788635, "eval_math_num_tokens": 5324751.0, "eval_math_runtime": 75.1207, "eval_math_samples_per_second": 6.656, "eval_math_steps_per_second": 1.664, "step": 200 }, { "epoch": 0.35606987871369755, "eval_physics_entropy": 1.4176100668907166, "eval_physics_loss": 1.5222268104553223, "eval_physics_mean_token_accuracy": 0.707690957069397, "eval_physics_num_tokens": 5324751.0, "eval_physics_runtime": 85.2085, "eval_physics_samples_per_second": 5.868, "eval_physics_steps_per_second": 1.467, "step": 200 }, { "entropy": 1.3558454412966967, "epoch": 0.3738733726493824, "grad_norm": 19.0, "learning_rate": 1.3933333333333334e-05, "loss": 22.7949, "mean_token_accuracy": 0.7136942774057389, "num_tokens": 5585508.0, "step": 210 }, { "entropy": 1.383369480073452, "epoch": 0.39167686658506734, "grad_norm": 19.375, "learning_rate": 1.46e-05, "loss": 23.1375, "mean_token_accuracy": 0.7102958835661412, "num_tokens": 5848889.0, "step": 220 }, { "entropy": 1.338986362144351, "epoch": 0.4094803605207522, "grad_norm": 17.25, "learning_rate": 1.5266666666666667e-05, "loss": 22.3403, "mean_token_accuracy": 0.7182383798062801, "num_tokens": 6114855.0, "step": 230 }, { "entropy": 1.3609099809080363, "epoch": 0.4272838544564371, "grad_norm": 17.75, "learning_rate": 1.5933333333333336e-05, "loss": 22.8066, "mean_token_accuracy": 0.7140973150730133, "num_tokens": 6378152.0, "step": 240 }, { "entropy": 1.3484373219311236, "epoch": 0.44508734839212194, "grad_norm": 18.5, "learning_rate": 1.66e-05, "loss": 22.591, "mean_token_accuracy": 0.713333373889327, "num_tokens": 6637273.0, "step": 250 }, { "entropy": 1.315424071252346, "epoch": 0.4628908423278068, "grad_norm": 19.0, "learning_rate": 1.726666666666667e-05, "loss": 21.9951, "mean_token_accuracy": 0.7204139836132526, "num_tokens": 6896684.0, "step": 260 }, { "entropy": 1.32754250690341, "epoch": 0.48069433626349173, "grad_norm": 17.25, "learning_rate": 1.7933333333333333e-05, "loss": 21.9557, "mean_token_accuracy": 0.7203923668712378, "num_tokens": 7166608.0, "step": 270 }, { "entropy": 1.2635501787066459, "epoch": 0.4984978301991766, "grad_norm": 17.0, "learning_rate": 1.86e-05, "loss": 21.113, "mean_token_accuracy": 0.7295621318742633, "num_tokens": 7444923.0, "step": 280 }, { "entropy": 1.3078652255237102, "epoch": 0.5163013241348615, "grad_norm": 17.25, "learning_rate": 1.926666666666667e-05, "loss": 21.7315, "mean_token_accuracy": 0.7203882545232773, "num_tokens": 7706502.0, "step": 290 }, { "entropy": 1.3434829246252775, "epoch": 0.5341048180705463, "grad_norm": 20.25, "learning_rate": 1.9933333333333334e-05, "loss": 22.236, "mean_token_accuracy": 0.7177267197519541, "num_tokens": 7969704.0, "step": 300 }, { "epoch": 0.5341048180705463, "eval_biology_entropy": 2.7598990201950073, "eval_biology_loss": 3.1821975708007812, "eval_biology_mean_token_accuracy": 0.4959620425701141, "eval_biology_num_tokens": 7969704.0, "eval_biology_runtime": 57.9508, "eval_biology_samples_per_second": 8.628, "eval_biology_steps_per_second": 2.157, "step": 300 }, { "epoch": 0.5341048180705463, "eval_chemistry_entropy": 1.2813276634216308, "eval_chemistry_loss": 1.3571581840515137, "eval_chemistry_mean_token_accuracy": 0.7210862817764282, "eval_chemistry_num_tokens": 7969704.0, "eval_chemistry_runtime": 72.2035, "eval_chemistry_samples_per_second": 6.925, "eval_chemistry_steps_per_second": 1.731, "step": 300 }, { "epoch": 0.5341048180705463, "eval_math_entropy": 1.165593782901764, "eval_math_loss": 1.472881555557251, "eval_math_mean_token_accuracy": 0.7125356969833374, "eval_math_num_tokens": 7969704.0, "eval_math_runtime": 75.1477, "eval_math_samples_per_second": 6.654, "eval_math_steps_per_second": 1.663, "step": 300 }, { "epoch": 0.5341048180705463, "eval_physics_entropy": 1.3238337383270264, "eval_physics_loss": 1.4213438034057617, "eval_physics_mean_token_accuracy": 0.7196516485214234, "eval_physics_num_tokens": 7969704.0, "eval_physics_runtime": 85.3194, "eval_physics_samples_per_second": 5.86, "eval_physics_steps_per_second": 1.465, "step": 300 }, { "entropy": 1.284867750480771, "epoch": 0.5519083120062312, "grad_norm": 21.625, "learning_rate": 1.9933333333333334e-05, "loss": 21.4794, "mean_token_accuracy": 0.7222346868366003, "num_tokens": 8242162.0, "step": 310 }, { "entropy": 1.2864018987864256, "epoch": 0.5697118059419161, "grad_norm": 18.5, "learning_rate": 1.985925925925926e-05, "loss": 21.4262, "mean_token_accuracy": 0.724996630847454, "num_tokens": 8497852.0, "step": 320 }, { "entropy": 1.2846032541245223, "epoch": 0.587515299877601, "grad_norm": 21.0, "learning_rate": 1.9785185185185187e-05, "loss": 21.5347, "mean_token_accuracy": 0.7234938707202673, "num_tokens": 8757753.0, "step": 330 }, { "entropy": 1.2416468027979135, "epoch": 0.6053187938132859, "grad_norm": 15.75, "learning_rate": 1.971111111111111e-05, "loss": 20.5325, "mean_token_accuracy": 0.733549839258194, "num_tokens": 9024677.0, "step": 340 }, { "entropy": 1.2626060862094164, "epoch": 0.6231222877489707, "grad_norm": 17.75, "learning_rate": 1.963703703703704e-05, "loss": 21.0676, "mean_token_accuracy": 0.7265046216547489, "num_tokens": 9291760.0, "step": 350 }, { "entropy": 1.228025446087122, "epoch": 0.6409257816846556, "grad_norm": 19.875, "learning_rate": 1.9562962962962964e-05, "loss": 20.3954, "mean_token_accuracy": 0.7328283190727234, "num_tokens": 9561091.0, "step": 360 }, { "entropy": 1.2296066496521234, "epoch": 0.6587292756203404, "grad_norm": 18.875, "learning_rate": 1.948888888888889e-05, "loss": 20.5407, "mean_token_accuracy": 0.7325904417783022, "num_tokens": 9827272.0, "step": 370 }, { "entropy": 1.2079377524554729, "epoch": 0.6765327695560254, "grad_norm": 17.125, "learning_rate": 1.9414814814814817e-05, "loss": 20.0342, "mean_token_accuracy": 0.7388566959649324, "num_tokens": 10096065.0, "step": 380 }, { "entropy": 1.2025926019996405, "epoch": 0.6943362634917103, "grad_norm": 18.125, "learning_rate": 1.9340740740740743e-05, "loss": 19.8907, "mean_token_accuracy": 0.7389910601079464, "num_tokens": 10364601.0, "step": 390 }, { "entropy": 1.215231117978692, "epoch": 0.7121397574273951, "grad_norm": 18.5, "learning_rate": 1.926666666666667e-05, "loss": 20.2245, "mean_token_accuracy": 0.7353602629154921, "num_tokens": 10633325.0, "step": 400 }, { "epoch": 0.7121397574273951, "eval_biology_entropy": 2.8210434341430664, "eval_biology_loss": 3.116989850997925, "eval_biology_mean_token_accuracy": 0.5034433958530427, "eval_biology_num_tokens": 10633325.0, "eval_biology_runtime": 57.5361, "eval_biology_samples_per_second": 8.69, "eval_biology_steps_per_second": 2.173, "step": 400 }, { "epoch": 0.7121397574273951, "eval_chemistry_entropy": 1.261210060596466, "eval_chemistry_loss": 1.294067144393921, "eval_chemistry_mean_token_accuracy": 0.730656672000885, "eval_chemistry_num_tokens": 10633325.0, "eval_chemistry_runtime": 72.1604, "eval_chemistry_samples_per_second": 6.929, "eval_chemistry_steps_per_second": 1.732, "step": 400 }, { "epoch": 0.7121397574273951, "eval_math_entropy": 1.1643331799507142, "eval_math_loss": 1.4408313035964966, "eval_math_mean_token_accuracy": 0.7161806149482727, "eval_math_num_tokens": 10633325.0, "eval_math_runtime": 75.5006, "eval_math_samples_per_second": 6.622, "eval_math_steps_per_second": 1.656, "step": 400 }, { "epoch": 0.7121397574273951, "eval_physics_entropy": 1.3186150641441345, "eval_physics_loss": 1.373258113861084, "eval_physics_mean_token_accuracy": 0.7264462962150574, "eval_physics_num_tokens": 10633325.0, "eval_physics_runtime": 85.0734, "eval_physics_samples_per_second": 5.877, "eval_physics_steps_per_second": 1.469, "step": 400 }, { "entropy": 1.1908438373357058, "epoch": 0.72994325136308, "grad_norm": 22.875, "learning_rate": 1.9192592592592593e-05, "loss": 19.969, "mean_token_accuracy": 0.7395484477281571, "num_tokens": 10897916.0, "step": 410 }, { "entropy": 1.2167691864073276, "epoch": 0.7477467452987648, "grad_norm": 20.0, "learning_rate": 1.911851851851852e-05, "loss": 20.1452, "mean_token_accuracy": 0.7378258787095546, "num_tokens": 11165356.0, "step": 420 }, { "entropy": 1.2289611756801606, "epoch": 0.7655502392344498, "grad_norm": 18.75, "learning_rate": 1.9044444444444446e-05, "loss": 20.5277, "mean_token_accuracy": 0.7328679781407118, "num_tokens": 11436799.0, "step": 430 }, { "entropy": 1.244011626765132, "epoch": 0.7833537331701347, "grad_norm": 19.875, "learning_rate": 1.8970370370370372e-05, "loss": 20.5562, "mean_token_accuracy": 0.7318485330790281, "num_tokens": 11703496.0, "step": 440 }, { "entropy": 1.2247176449745893, "epoch": 0.8011572271058195, "grad_norm": 21.75, "learning_rate": 1.8896296296296295e-05, "loss": 20.421, "mean_token_accuracy": 0.7341447170823813, "num_tokens": 11965530.0, "step": 450 }, { "entropy": 1.2138673104345798, "epoch": 0.8189607210415044, "grad_norm": 17.375, "learning_rate": 1.8822222222222225e-05, "loss": 20.2052, "mean_token_accuracy": 0.7365878012031317, "num_tokens": 12224427.0, "step": 460 }, { "entropy": 1.1654350489377976, "epoch": 0.8367642149771892, "grad_norm": 17.875, "learning_rate": 1.874814814814815e-05, "loss": 19.4104, "mean_token_accuracy": 0.7445328518748283, "num_tokens": 12509124.0, "step": 470 }, { "entropy": 1.198500120639801, "epoch": 0.8545677089128741, "grad_norm": 16.75, "learning_rate": 1.8674074074074075e-05, "loss": 19.8297, "mean_token_accuracy": 0.7386186074465513, "num_tokens": 12778408.0, "step": 480 }, { "entropy": 1.170426043868065, "epoch": 0.8723712028485591, "grad_norm": 18.5, "learning_rate": 1.86e-05, "loss": 19.4215, "mean_token_accuracy": 0.743653716892004, "num_tokens": 13046473.0, "step": 490 }, { "entropy": 1.2106714222580195, "epoch": 0.8901746967842439, "grad_norm": 16.875, "learning_rate": 1.8525925925925928e-05, "loss": 20.1785, "mean_token_accuracy": 0.7360058560967445, "num_tokens": 13301659.0, "step": 500 }, { "epoch": 0.8901746967842439, "eval_biology_entropy": 2.77167928981781, "eval_biology_loss": 3.050346612930298, "eval_biology_mean_token_accuracy": 0.5097111368179321, "eval_biology_num_tokens": 13301659.0, "eval_biology_runtime": 58.0048, "eval_biology_samples_per_second": 8.62, "eval_biology_steps_per_second": 2.155, "step": 500 }, { "epoch": 0.8901746967842439, "eval_chemistry_entropy": 1.2252461080551147, "eval_chemistry_loss": 1.2589221000671387, "eval_chemistry_mean_token_accuracy": 0.7352680149078369, "eval_chemistry_num_tokens": 13301659.0, "eval_chemistry_runtime": 71.4907, "eval_chemistry_samples_per_second": 6.994, "eval_chemistry_steps_per_second": 1.748, "step": 500 }, { "epoch": 0.8901746967842439, "eval_math_entropy": 1.1505462565422058, "eval_math_loss": 1.420082688331604, "eval_math_mean_token_accuracy": 0.7192822680473328, "eval_math_num_tokens": 13301659.0, "eval_math_runtime": 74.8775, "eval_math_samples_per_second": 6.678, "eval_math_steps_per_second": 1.669, "step": 500 }, { "epoch": 0.8901746967842439, "eval_physics_entropy": 1.2864371166229247, "eval_physics_loss": 1.344913125038147, "eval_physics_mean_token_accuracy": 0.7303608517646789, "eval_physics_num_tokens": 13301659.0, "eval_physics_runtime": 84.8812, "eval_physics_samples_per_second": 5.891, "eval_physics_steps_per_second": 1.473, "step": 500 }, { "entropy": 1.1539306506514548, "epoch": 0.9079781907199288, "grad_norm": 15.4375, "learning_rate": 1.8451851851851855e-05, "loss": 19.2513, "mean_token_accuracy": 0.7464804232120514, "num_tokens": 13570986.0, "step": 510 }, { "entropy": 1.1872555747628213, "epoch": 0.9257816846556136, "grad_norm": 18.375, "learning_rate": 1.8377777777777778e-05, "loss": 19.8251, "mean_token_accuracy": 0.7413290243595838, "num_tokens": 13833901.0, "step": 520 }, { "entropy": 1.1907209917902946, "epoch": 0.9435851785912985, "grad_norm": 18.0, "learning_rate": 1.8303703703703704e-05, "loss": 19.7051, "mean_token_accuracy": 0.7420744668692351, "num_tokens": 14093888.0, "step": 530 }, { "entropy": 1.2106883700937032, "epoch": 0.9613886725269835, "grad_norm": 16.625, "learning_rate": 1.822962962962963e-05, "loss": 20.2094, "mean_token_accuracy": 0.7350596960633993, "num_tokens": 14354918.0, "step": 540 }, { "entropy": 1.1697432730346917, "epoch": 0.9791921664626683, "grad_norm": 18.25, "learning_rate": 1.8155555555555557e-05, "loss": 19.3531, "mean_token_accuracy": 0.7433340087532997, "num_tokens": 14620244.0, "step": 550 }, { "entropy": 1.1880640607327222, "epoch": 0.9969956603983532, "grad_norm": 17.0, "learning_rate": 1.8081481481481484e-05, "loss": 19.8443, "mean_token_accuracy": 0.740341067314148, "num_tokens": 14886593.0, "step": 560 }, { "entropy": 1.1296160619105062, "epoch": 1.014242795148548, "grad_norm": 18.0, "learning_rate": 1.800740740740741e-05, "loss": 18.1103, "mean_token_accuracy": 0.7519292243065373, "num_tokens": 15146469.0, "step": 570 }, { "entropy": 1.2114373303949832, "epoch": 1.0320462890842328, "grad_norm": 19.375, "learning_rate": 1.7933333333333333e-05, "loss": 19.9822, "mean_token_accuracy": 0.7364404492080212, "num_tokens": 15404569.0, "step": 580 }, { "entropy": 1.1140953950583934, "epoch": 1.0498497830199176, "grad_norm": 17.5, "learning_rate": 1.785925925925926e-05, "loss": 18.6602, "mean_token_accuracy": 0.7509294405579567, "num_tokens": 15681709.0, "step": 590 }, { "entropy": 1.178190778568387, "epoch": 1.0676532769556026, "grad_norm": 17.75, "learning_rate": 1.7785185185185186e-05, "loss": 19.4194, "mean_token_accuracy": 0.7424272943288088, "num_tokens": 15949421.0, "step": 600 }, { "epoch": 1.0676532769556026, "eval_biology_entropy": 2.754662238121033, "eval_biology_loss": 3.033093214035034, "eval_biology_mean_token_accuracy": 0.5113752641677857, "eval_biology_num_tokens": 15949421.0, "eval_biology_runtime": 57.7707, "eval_biology_samples_per_second": 8.655, "eval_biology_steps_per_second": 2.164, "step": 600 }, { "epoch": 1.0676532769556026, "eval_chemistry_entropy": 1.1933018751144409, "eval_chemistry_loss": 1.2360327243804932, "eval_chemistry_mean_token_accuracy": 0.7386878499984741, "eval_chemistry_num_tokens": 15949421.0, "eval_chemistry_runtime": 72.0715, "eval_chemistry_samples_per_second": 6.938, "eval_chemistry_steps_per_second": 1.734, "step": 600 }, { "epoch": 1.0676532769556026, "eval_math_entropy": 1.1338728556632995, "eval_math_loss": 1.4086648225784302, "eval_math_mean_token_accuracy": 0.721148521900177, "eval_math_num_tokens": 15949421.0, "eval_math_runtime": 75.0522, "eval_math_samples_per_second": 6.662, "eval_math_steps_per_second": 1.666, "step": 600 }, { "epoch": 1.0676532769556026, "eval_physics_entropy": 1.2630673241615296, "eval_physics_loss": 1.330203890800476, "eval_physics_mean_token_accuracy": 0.7322184553146363, "eval_physics_num_tokens": 15949421.0, "eval_physics_runtime": 85.1496, "eval_physics_samples_per_second": 5.872, "eval_physics_steps_per_second": 1.468, "step": 600 }, { "entropy": 1.1252168361097574, "epoch": 1.0854567708912874, "grad_norm": 18.0, "learning_rate": 1.7711111111111113e-05, "loss": 18.8804, "mean_token_accuracy": 0.7498680882155895, "num_tokens": 16221916.0, "step": 610 }, { "entropy": 1.1838977057486773, "epoch": 1.1032602648269723, "grad_norm": 20.375, "learning_rate": 1.763703703703704e-05, "loss": 19.6525, "mean_token_accuracy": 0.7415466036647558, "num_tokens": 16488154.0, "step": 620 }, { "entropy": 1.16671148724854, "epoch": 1.121063758762657, "grad_norm": 19.75, "learning_rate": 1.7562962962962962e-05, "loss": 19.3184, "mean_token_accuracy": 0.7431947711855174, "num_tokens": 16756828.0, "step": 630 }, { "entropy": 1.1459482550621032, "epoch": 1.138867252698342, "grad_norm": 18.125, "learning_rate": 1.7488888888888892e-05, "loss": 19.0361, "mean_token_accuracy": 0.7469833578914404, "num_tokens": 17023681.0, "step": 640 }, { "entropy": 1.160651782527566, "epoch": 1.156670746634027, "grad_norm": 19.5, "learning_rate": 1.7414814814814815e-05, "loss": 19.3198, "mean_token_accuracy": 0.7445345718413592, "num_tokens": 17286672.0, "step": 650 }, { "entropy": 1.2018978279083967, "epoch": 1.1744742405697117, "grad_norm": 19.875, "learning_rate": 1.7340740740740742e-05, "loss": 19.8959, "mean_token_accuracy": 0.7359397515654564, "num_tokens": 17551581.0, "step": 660 }, { "entropy": 1.1597996551543475, "epoch": 1.1922777345053968, "grad_norm": 18.0, "learning_rate": 1.726666666666667e-05, "loss": 19.2421, "mean_token_accuracy": 0.7448843888938427, "num_tokens": 17821205.0, "step": 670 }, { "entropy": 1.1536007285118104, "epoch": 1.2100812284410816, "grad_norm": 20.125, "learning_rate": 1.7192592592592595e-05, "loss": 19.2532, "mean_token_accuracy": 0.7444427307695151, "num_tokens": 18087759.0, "step": 680 }, { "entropy": 1.1601740807294845, "epoch": 1.2278847223767664, "grad_norm": 19.0, "learning_rate": 1.711851851851852e-05, "loss": 19.2395, "mean_token_accuracy": 0.7444933351129294, "num_tokens": 18355732.0, "step": 690 }, { "entropy": 1.1505332425236703, "epoch": 1.2456882163124514, "grad_norm": 17.75, "learning_rate": 1.7044444444444445e-05, "loss": 19.0722, "mean_token_accuracy": 0.7461056258529425, "num_tokens": 18626245.0, "step": 700 }, { "epoch": 1.2456882163124514, "eval_biology_entropy": 2.6926132106781004, "eval_biology_loss": 3.016256332397461, "eval_biology_mean_token_accuracy": 0.5140536696910858, "eval_biology_num_tokens": 18626245.0, "eval_biology_runtime": 57.5147, "eval_biology_samples_per_second": 8.693, "eval_biology_steps_per_second": 2.173, "step": 700 }, { "epoch": 1.2456882163124514, "eval_chemistry_entropy": 1.1651022205352783, "eval_chemistry_loss": 1.2187129259109497, "eval_chemistry_mean_token_accuracy": 0.7408477578163147, "eval_chemistry_num_tokens": 18626245.0, "eval_chemistry_runtime": 71.9835, "eval_chemistry_samples_per_second": 6.946, "eval_chemistry_steps_per_second": 1.737, "step": 700 }, { "epoch": 1.2456882163124514, "eval_math_entropy": 1.1168806705474854, "eval_math_loss": 1.4018580913543701, "eval_math_mean_token_accuracy": 0.7227320442199707, "eval_math_num_tokens": 18626245.0, "eval_math_runtime": 75.0558, "eval_math_samples_per_second": 6.662, "eval_math_steps_per_second": 1.665, "step": 700 }, { "epoch": 1.2456882163124514, "eval_physics_entropy": 1.2369258465766906, "eval_physics_loss": 1.316622257232666, "eval_physics_mean_token_accuracy": 0.7343540601730346, "eval_physics_num_tokens": 18626245.0, "eval_physics_runtime": 85.1088, "eval_physics_samples_per_second": 5.875, "eval_physics_steps_per_second": 1.469, "step": 700 }, { "entropy": 1.1377631668001413, "epoch": 1.2634917102481362, "grad_norm": 17.375, "learning_rate": 1.697037037037037e-05, "loss": 18.8327, "mean_token_accuracy": 0.7493732802569866, "num_tokens": 18899632.0, "step": 710 }, { "entropy": 1.167876774072647, "epoch": 1.281295204183821, "grad_norm": 19.375, "learning_rate": 1.6896296296296298e-05, "loss": 19.4079, "mean_token_accuracy": 0.7431724380701781, "num_tokens": 19173635.0, "step": 720 }, { "entropy": 1.0993454653769732, "epoch": 1.299098698119506, "grad_norm": 15.25, "learning_rate": 1.6822222222222224e-05, "loss": 18.3822, "mean_token_accuracy": 0.7549435570836067, "num_tokens": 19453495.0, "step": 730 }, { "entropy": 1.1375744685530662, "epoch": 1.3169021920551909, "grad_norm": 19.0, "learning_rate": 1.6748148148148147e-05, "loss": 18.8503, "mean_token_accuracy": 0.7493278611451387, "num_tokens": 19719086.0, "step": 740 }, { "entropy": 1.1301548499614, "epoch": 1.3347056859908757, "grad_norm": 21.5, "learning_rate": 1.6674074074074077e-05, "loss": 18.8165, "mean_token_accuracy": 0.7469407975673675, "num_tokens": 19981272.0, "step": 750 }, { "entropy": 1.1576253682374955, "epoch": 1.3525091799265607, "grad_norm": 17.75, "learning_rate": 1.66e-05, "loss": 19.2575, "mean_token_accuracy": 0.7463510025292635, "num_tokens": 20243034.0, "step": 760 }, { "entropy": 1.1525329016149044, "epoch": 1.3703126738622455, "grad_norm": 21.5, "learning_rate": 1.6525925925925927e-05, "loss": 19.0713, "mean_token_accuracy": 0.7454641673713922, "num_tokens": 20509028.0, "step": 770 }, { "entropy": 1.196613146364689, "epoch": 1.3881161677979303, "grad_norm": 21.75, "learning_rate": 1.6451851851851853e-05, "loss": 19.9552, "mean_token_accuracy": 0.7380195271223784, "num_tokens": 20762668.0, "step": 780 }, { "entropy": 1.1995733845978975, "epoch": 1.4059196617336152, "grad_norm": 17.375, "learning_rate": 1.637777777777778e-05, "loss": 19.7882, "mean_token_accuracy": 0.7368107169866562, "num_tokens": 21025277.0, "step": 790 }, { "entropy": 1.1568929351866246, "epoch": 1.4237231556693, "grad_norm": 20.0, "learning_rate": 1.6303703703703706e-05, "loss": 19.2121, "mean_token_accuracy": 0.7448243040591478, "num_tokens": 21292320.0, "step": 800 }, { "epoch": 1.4237231556693, "eval_biology_entropy": 2.6511214599609376, "eval_biology_loss": 2.986257791519165, "eval_biology_mean_token_accuracy": 0.5167679135799408, "eval_biology_num_tokens": 21292320.0, "eval_biology_runtime": 58.1602, "eval_biology_samples_per_second": 8.597, "eval_biology_steps_per_second": 2.149, "step": 800 }, { "epoch": 1.4237231556693, "eval_chemistry_entropy": 1.1516469955444335, "eval_chemistry_loss": 1.205365777015686, "eval_chemistry_mean_token_accuracy": 0.742991099357605, "eval_chemistry_num_tokens": 21292320.0, "eval_chemistry_runtime": 72.059, "eval_chemistry_samples_per_second": 6.939, "eval_chemistry_steps_per_second": 1.735, "step": 800 }, { "epoch": 1.4237231556693, "eval_math_entropy": 1.109243516921997, "eval_math_loss": 1.3933804035186768, "eval_math_mean_token_accuracy": 0.7233971772193909, "eval_math_num_tokens": 21292320.0, "eval_math_runtime": 75.0586, "eval_math_samples_per_second": 6.661, "eval_math_steps_per_second": 1.665, "step": 800 }, { "epoch": 1.4237231556693, "eval_physics_entropy": 1.2258712153434754, "eval_physics_loss": 1.307606816291809, "eval_physics_mean_token_accuracy": 0.735959903717041, "eval_physics_num_tokens": 21292320.0, "eval_physics_runtime": 85.1923, "eval_physics_samples_per_second": 5.869, "eval_physics_steps_per_second": 1.467, "step": 800 }, { "entropy": 1.1539845019578934, "epoch": 1.441526649604985, "grad_norm": 20.625, "learning_rate": 1.622962962962963e-05, "loss": 19.1587, "mean_token_accuracy": 0.7431901153177023, "num_tokens": 21563668.0, "step": 810 }, { "entropy": 1.1525183795019984, "epoch": 1.4593301435406698, "grad_norm": 21.125, "learning_rate": 1.6155555555555556e-05, "loss": 19.1174, "mean_token_accuracy": 0.7469210345298052, "num_tokens": 21817298.0, "step": 820 }, { "entropy": 1.1620672512799501, "epoch": 1.4771336374763546, "grad_norm": 20.625, "learning_rate": 1.6081481481481482e-05, "loss": 19.3092, "mean_token_accuracy": 0.7433301910758019, "num_tokens": 22083010.0, "step": 830 }, { "entropy": 1.1712669901549817, "epoch": 1.4949371314120397, "grad_norm": 21.875, "learning_rate": 1.600740740740741e-05, "loss": 19.4144, "mean_token_accuracy": 0.7423816129565239, "num_tokens": 22350046.0, "step": 840 }, { "entropy": 1.1148280471563339, "epoch": 1.5127406253477245, "grad_norm": 18.125, "learning_rate": 1.5933333333333336e-05, "loss": 18.5777, "mean_token_accuracy": 0.7518805626779794, "num_tokens": 22615933.0, "step": 850 }, { "entropy": 1.12065857835114, "epoch": 1.5305441192834093, "grad_norm": 20.75, "learning_rate": 1.5859259259259262e-05, "loss": 18.6024, "mean_token_accuracy": 0.7530040096491575, "num_tokens": 22886843.0, "step": 860 }, { "entropy": 1.1522621292620898, "epoch": 1.5483476132190943, "grad_norm": 20.5, "learning_rate": 1.5785185185185185e-05, "loss": 19.1737, "mean_token_accuracy": 0.7450519923120738, "num_tokens": 23140244.0, "step": 870 }, { "entropy": 1.1377869185060263, "epoch": 1.5661511071547791, "grad_norm": 19.5, "learning_rate": 1.571111111111111e-05, "loss": 18.9524, "mean_token_accuracy": 0.7473382774740458, "num_tokens": 23405121.0, "step": 880 }, { "entropy": 1.148686793074012, "epoch": 1.583954601090464, "grad_norm": 20.125, "learning_rate": 1.5637037037037038e-05, "loss": 19.0497, "mean_token_accuracy": 0.7466071300208569, "num_tokens": 23671826.0, "step": 890 }, { "entropy": 1.1471959844231605, "epoch": 1.601758095026149, "grad_norm": 20.5, "learning_rate": 1.5562962962962965e-05, "loss": 19.0957, "mean_token_accuracy": 0.7456789817661047, "num_tokens": 23933374.0, "step": 900 }, { "epoch": 1.601758095026149, "eval_biology_entropy": 2.7071199588775636, "eval_biology_loss": 2.972935199737549, "eval_biology_mean_token_accuracy": 0.5189641232490539, "eval_biology_num_tokens": 23933374.0, "eval_biology_runtime": 57.6355, "eval_biology_samples_per_second": 8.675, "eval_biology_steps_per_second": 2.169, "step": 900 }, { "epoch": 1.601758095026149, "eval_chemistry_entropy": 1.162458167076111, "eval_chemistry_loss": 1.195241093635559, "eval_chemistry_mean_token_accuracy": 0.7447079033851624, "eval_chemistry_num_tokens": 23933374.0, "eval_chemistry_runtime": 72.2714, "eval_chemistry_samples_per_second": 6.918, "eval_chemistry_steps_per_second": 1.73, "step": 900 }, { "epoch": 1.601758095026149, "eval_math_entropy": 1.1186215715408325, "eval_math_loss": 1.3856722116470337, "eval_math_mean_token_accuracy": 0.7245329103469849, "eval_math_num_tokens": 23933374.0, "eval_math_runtime": 75.4805, "eval_math_samples_per_second": 6.624, "eval_math_steps_per_second": 1.656, "step": 900 }, { "epoch": 1.601758095026149, "eval_physics_entropy": 1.2408670482635498, "eval_physics_loss": 1.2979875802993774, "eval_physics_mean_token_accuracy": 0.736989695072174, "eval_physics_num_tokens": 23933374.0, "eval_physics_runtime": 85.587, "eval_physics_samples_per_second": 5.842, "eval_physics_steps_per_second": 1.461, "step": 900 }, { "entropy": 1.0977809239178895, "epoch": 1.6195615889618338, "grad_norm": 19.875, "learning_rate": 1.548888888888889e-05, "loss": 18.2108, "mean_token_accuracy": 0.7550504490733146, "num_tokens": 24210156.0, "step": 910 }, { "entropy": 1.14602930508554, "epoch": 1.6373650828975186, "grad_norm": 22.375, "learning_rate": 1.5414814814814814e-05, "loss": 19.1509, "mean_token_accuracy": 0.7462662551552057, "num_tokens": 24462726.0, "step": 920 }, { "entropy": 1.0786818396300077, "epoch": 1.6551685768332036, "grad_norm": 19.5, "learning_rate": 1.5340740740740744e-05, "loss": 17.8442, "mean_token_accuracy": 0.758844393491745, "num_tokens": 24732815.0, "step": 930 }, { "entropy": 1.1209699012339116, "epoch": 1.6729720707688884, "grad_norm": 21.0, "learning_rate": 1.5266666666666667e-05, "loss": 18.5603, "mean_token_accuracy": 0.7515024449676275, "num_tokens": 25002091.0, "step": 940 }, { "entropy": 1.112015390396118, "epoch": 1.6907755647045732, "grad_norm": 19.5, "learning_rate": 1.5192592592592594e-05, "loss": 18.5596, "mean_token_accuracy": 0.7524737507104874, "num_tokens": 25267949.0, "step": 950 }, { "entropy": 1.1459401201456785, "epoch": 1.7085790586402583, "grad_norm": 18.5, "learning_rate": 1.5118518518518519e-05, "loss": 18.9955, "mean_token_accuracy": 0.7468857653439045, "num_tokens": 25525772.0, "step": 960 }, { "entropy": 1.1483839362859727, "epoch": 1.7263825525759429, "grad_norm": 19.25, "learning_rate": 1.5044444444444445e-05, "loss": 19.0262, "mean_token_accuracy": 0.7459982354193926, "num_tokens": 25787727.0, "step": 970 }, { "entropy": 1.1058822341263295, "epoch": 1.744186046511628, "grad_norm": 17.375, "learning_rate": 1.497037037037037e-05, "loss": 18.4319, "mean_token_accuracy": 0.7541479263454676, "num_tokens": 26058026.0, "step": 980 }, { "entropy": 1.0693828593939543, "epoch": 1.761989540447313, "grad_norm": 18.375, "learning_rate": 1.4896296296296298e-05, "loss": 17.7423, "mean_token_accuracy": 0.7604560740292072, "num_tokens": 26326631.0, "step": 990 }, { "entropy": 1.1148248281329871, "epoch": 1.7797930343829975, "grad_norm": 19.125, "learning_rate": 1.4822222222222225e-05, "loss": 18.5599, "mean_token_accuracy": 0.75152304507792, "num_tokens": 26591735.0, "step": 1000 }, { "epoch": 1.7797930343829975, "eval_biology_entropy": 2.7020350008010863, "eval_biology_loss": 2.959472179412842, "eval_biology_mean_token_accuracy": 0.5197799344062806, "eval_biology_num_tokens": 26591735.0, "eval_biology_runtime": 57.5048, "eval_biology_samples_per_second": 8.695, "eval_biology_steps_per_second": 2.174, "step": 1000 }, { "epoch": 1.7797930343829975, "eval_chemistry_entropy": 1.14980606174469, "eval_chemistry_loss": 1.186444878578186, "eval_chemistry_mean_token_accuracy": 0.7462989177703857, "eval_chemistry_num_tokens": 26591735.0, "eval_chemistry_runtime": 71.6887, "eval_chemistry_samples_per_second": 6.975, "eval_chemistry_steps_per_second": 1.744, "step": 1000 }, { "epoch": 1.7797930343829975, "eval_math_entropy": 1.105127462387085, "eval_math_loss": 1.3818320035934448, "eval_math_mean_token_accuracy": 0.7256039061546325, "eval_math_num_tokens": 26591735.0, "eval_math_runtime": 75.0009, "eval_math_samples_per_second": 6.667, "eval_math_steps_per_second": 1.667, "step": 1000 }, { "epoch": 1.7797930343829975, "eval_physics_entropy": 1.2302804527282716, "eval_physics_loss": 1.2946749925613403, "eval_physics_mean_token_accuracy": 0.7378688325881958, "eval_physics_num_tokens": 26591735.0, "eval_physics_runtime": 85.3338, "eval_physics_samples_per_second": 5.859, "eval_physics_steps_per_second": 1.465, "step": 1000 }, { "entropy": 1.1227924428880214, "epoch": 1.7975965283186826, "grad_norm": 20.0, "learning_rate": 1.474814814814815e-05, "loss": 18.6939, "mean_token_accuracy": 0.7498056028038264, "num_tokens": 26857071.0, "step": 1010 }, { "entropy": 1.1648254558444022, "epoch": 1.8154000222543676, "grad_norm": 19.25, "learning_rate": 1.4674074074074076e-05, "loss": 19.3387, "mean_token_accuracy": 0.7423548739403486, "num_tokens": 27120617.0, "step": 1020 }, { "entropy": 1.0981685355305673, "epoch": 1.8332035161900522, "grad_norm": 21.25, "learning_rate": 1.46e-05, "loss": 18.3023, "mean_token_accuracy": 0.7560988407582044, "num_tokens": 27391174.0, "step": 1030 }, { "entropy": 1.139582459628582, "epoch": 1.8510070101257372, "grad_norm": 17.0, "learning_rate": 1.4525925925925927e-05, "loss": 18.8114, "mean_token_accuracy": 0.7490788519382476, "num_tokens": 27656941.0, "step": 1040 }, { "entropy": 1.0934928126633168, "epoch": 1.868810504061422, "grad_norm": 19.125, "learning_rate": 1.4451851851851852e-05, "loss": 18.1743, "mean_token_accuracy": 0.7558255009353161, "num_tokens": 27927118.0, "step": 1050 }, { "entropy": 1.0726651160046459, "epoch": 1.8866139979971068, "grad_norm": 17.25, "learning_rate": 1.4377777777777779e-05, "loss": 17.8591, "mean_token_accuracy": 0.7595527049154043, "num_tokens": 28192620.0, "step": 1060 }, { "entropy": 1.1046127401292325, "epoch": 1.9044174919327919, "grad_norm": 19.875, "learning_rate": 1.4303703703703703e-05, "loss": 18.4902, "mean_token_accuracy": 0.753314646333456, "num_tokens": 28458168.0, "step": 1070 }, { "entropy": 1.1121449010446667, "epoch": 1.9222209858684767, "grad_norm": 20.0, "learning_rate": 1.4229629629629632e-05, "loss": 18.5865, "mean_token_accuracy": 0.7521534610539675, "num_tokens": 28722756.0, "step": 1080 }, { "entropy": 1.1442356582731008, "epoch": 1.9400244798041615, "grad_norm": 24.5, "learning_rate": 1.4155555555555556e-05, "loss": 18.8683, "mean_token_accuracy": 0.7469607297331095, "num_tokens": 28978375.0, "step": 1090 }, { "entropy": 1.1289800632745028, "epoch": 1.9578279737398465, "grad_norm": 18.875, "learning_rate": 1.4081481481481483e-05, "loss": 18.8067, "mean_token_accuracy": 0.7480562552809715, "num_tokens": 29236865.0, "step": 1100 }, { "epoch": 1.9578279737398465, "eval_biology_entropy": 2.670331030845642, "eval_biology_loss": 2.9537129402160645, "eval_biology_mean_token_accuracy": 0.5207649791240692, "eval_biology_num_tokens": 29236865.0, "eval_biology_runtime": 57.7384, "eval_biology_samples_per_second": 8.66, "eval_biology_steps_per_second": 2.165, "step": 1100 }, { "epoch": 1.9578279737398465, "eval_chemistry_entropy": 1.1356218423843383, "eval_chemistry_loss": 1.179543375968933, "eval_chemistry_mean_token_accuracy": 0.7473730354309082, "eval_chemistry_num_tokens": 29236865.0, "eval_chemistry_runtime": 72.0122, "eval_chemistry_samples_per_second": 6.943, "eval_chemistry_steps_per_second": 1.736, "step": 1100 }, { "epoch": 1.9578279737398465, "eval_math_entropy": 1.0909526333808899, "eval_math_loss": 1.3782899379730225, "eval_math_mean_token_accuracy": 0.726540036201477, "eval_math_num_tokens": 29236865.0, "eval_math_runtime": 74.8523, "eval_math_samples_per_second": 6.68, "eval_math_steps_per_second": 1.67, "step": 1100 }, { "epoch": 1.9578279737398465, "eval_physics_entropy": 1.2169800367355346, "eval_physics_loss": 1.2896714210510254, "eval_physics_mean_token_accuracy": 0.7383955039978027, "eval_physics_num_tokens": 29236865.0, "eval_physics_runtime": 85.1627, "eval_physics_samples_per_second": 5.871, "eval_physics_steps_per_second": 1.468, "step": 1100 }, { "entropy": 1.0923401776701211, "epoch": 1.9756314676755313, "grad_norm": 20.25, "learning_rate": 1.400740740740741e-05, "loss": 18.3288, "mean_token_accuracy": 0.7545268822461366, "num_tokens": 29500428.0, "step": 1110 }, { "entropy": 1.1163721553981305, "epoch": 1.9934349616112161, "grad_norm": 21.5, "learning_rate": 1.3933333333333334e-05, "loss": 18.3996, "mean_token_accuracy": 0.7531708285212517, "num_tokens": 29766739.0, "step": 1120 }, { "entropy": 1.1096870253163, "epoch": 2.010682096361411, "grad_norm": 18.75, "learning_rate": 1.385925925925926e-05, "loss": 17.8243, "mean_token_accuracy": 0.753396144605452, "num_tokens": 30025427.0, "step": 1130 }, { "entropy": 1.1090912133455277, "epoch": 2.028485590297096, "grad_norm": 22.25, "learning_rate": 1.3785185185185186e-05, "loss": 18.2931, "mean_token_accuracy": 0.753648667037487, "num_tokens": 30286651.0, "step": 1140 }, { "entropy": 1.1002933204174041, "epoch": 2.0462890842327806, "grad_norm": 19.875, "learning_rate": 1.3711111111111112e-05, "loss": 18.3961, "mean_token_accuracy": 0.7534263014793396, "num_tokens": 30552624.0, "step": 1150 }, { "entropy": 1.0910452891141176, "epoch": 2.0640925781684656, "grad_norm": 22.75, "learning_rate": 1.3637037037037037e-05, "loss": 17.9244, "mean_token_accuracy": 0.7581309732049704, "num_tokens": 30823746.0, "step": 1160 }, { "entropy": 1.1619377303868532, "epoch": 2.0818960721041506, "grad_norm": 22.25, "learning_rate": 1.3562962962962965e-05, "loss": 19.4184, "mean_token_accuracy": 0.7404937222599983, "num_tokens": 31076307.0, "step": 1170 }, { "entropy": 1.144884167984128, "epoch": 2.099699566039835, "grad_norm": 20.875, "learning_rate": 1.3488888888888888e-05, "loss": 18.8592, "mean_token_accuracy": 0.7461796149611473, "num_tokens": 31340763.0, "step": 1180 }, { "entropy": 1.084285356104374, "epoch": 2.1175030599755202, "grad_norm": 19.0, "learning_rate": 1.3414814814814817e-05, "loss": 18.0967, "mean_token_accuracy": 0.756497149169445, "num_tokens": 31607200.0, "step": 1190 }, { "entropy": 1.1388497594743967, "epoch": 2.1353065539112053, "grad_norm": 19.75, "learning_rate": 1.3340740740740741e-05, "loss": 18.82, "mean_token_accuracy": 0.7471547801047563, "num_tokens": 31874392.0, "step": 1200 }, { "epoch": 2.1353065539112053, "eval_biology_entropy": 2.6481752424240113, "eval_biology_loss": 2.955437183380127, "eval_biology_mean_token_accuracy": 0.5212604312896728, "eval_biology_num_tokens": 31874392.0, "eval_biology_runtime": 57.623, "eval_biology_samples_per_second": 8.677, "eval_biology_steps_per_second": 2.169, "step": 1200 }, { "epoch": 2.1353065539112053, "eval_chemistry_entropy": 1.1267094049453736, "eval_chemistry_loss": 1.174714207649231, "eval_chemistry_mean_token_accuracy": 0.7476038336753845, "eval_chemistry_num_tokens": 31874392.0, "eval_chemistry_runtime": 72.0197, "eval_chemistry_samples_per_second": 6.943, "eval_chemistry_steps_per_second": 1.736, "step": 1200 }, { "epoch": 2.1353065539112053, "eval_math_entropy": 1.091456967830658, "eval_math_loss": 1.376819133758545, "eval_math_mean_token_accuracy": 0.7268339867591858, "eval_math_num_tokens": 31874392.0, "eval_math_runtime": 75.1846, "eval_math_samples_per_second": 6.65, "eval_math_steps_per_second": 1.663, "step": 1200 }, { "epoch": 2.1353065539112053, "eval_physics_entropy": 1.2114794454574584, "eval_physics_loss": 1.285851240158081, "eval_physics_mean_token_accuracy": 0.7391742224693298, "eval_physics_num_tokens": 31874392.0, "eval_physics_runtime": 85.115, "eval_physics_samples_per_second": 5.874, "eval_physics_steps_per_second": 1.469, "step": 1200 }, { "entropy": 1.1082856599241495, "epoch": 2.15311004784689, "grad_norm": 18.625, "learning_rate": 1.3266666666666668e-05, "loss": 18.4609, "mean_token_accuracy": 0.7532401513308287, "num_tokens": 32144016.0, "step": 1210 }, { "entropy": 1.0737901078537107, "epoch": 2.170913541782575, "grad_norm": 18.375, "learning_rate": 1.3192592592592594e-05, "loss": 17.9022, "mean_token_accuracy": 0.7580810189247131, "num_tokens": 32410902.0, "step": 1220 }, { "entropy": 1.0895192243158818, "epoch": 2.18871703571826, "grad_norm": 21.375, "learning_rate": 1.311851851851852e-05, "loss": 18.0794, "mean_token_accuracy": 0.7569272886961699, "num_tokens": 32674338.0, "step": 1230 }, { "entropy": 1.1305242408066989, "epoch": 2.2065205296539445, "grad_norm": 16.0, "learning_rate": 1.3044444444444446e-05, "loss": 18.7777, "mean_token_accuracy": 0.7519055023789406, "num_tokens": 32930219.0, "step": 1240 }, { "entropy": 1.1194303661584855, "epoch": 2.2243240235896296, "grad_norm": 18.375, "learning_rate": 1.297037037037037e-05, "loss": 18.3991, "mean_token_accuracy": 0.7505009181797504, "num_tokens": 33194318.0, "step": 1250 }, { "entropy": 1.094122551381588, "epoch": 2.242127517525314, "grad_norm": 21.875, "learning_rate": 1.2896296296296299e-05, "loss": 18.4428, "mean_token_accuracy": 0.7547226294875145, "num_tokens": 33453292.0, "step": 1260 }, { "entropy": 1.0551138285547494, "epoch": 2.259931011460999, "grad_norm": 19.5, "learning_rate": 1.2822222222222222e-05, "loss": 17.3115, "mean_token_accuracy": 0.765355784446001, "num_tokens": 33728875.0, "step": 1270 }, { "entropy": 1.0934181027114391, "epoch": 2.277734505396684, "grad_norm": 21.25, "learning_rate": 1.274814814814815e-05, "loss": 18.262, "mean_token_accuracy": 0.7533521149307489, "num_tokens": 33999305.0, "step": 1280 }, { "entropy": 1.1094968844205142, "epoch": 2.295537999332369, "grad_norm": 22.0, "learning_rate": 1.2674074074074075e-05, "loss": 18.2857, "mean_token_accuracy": 0.7538360808044672, "num_tokens": 34269727.0, "step": 1290 }, { "entropy": 1.0546644374728202, "epoch": 2.313341493268054, "grad_norm": 18.125, "learning_rate": 1.2600000000000001e-05, "loss": 17.4746, "mean_token_accuracy": 0.7642518579959869, "num_tokens": 34537714.0, "step": 1300 }, { "epoch": 2.313341493268054, "eval_biology_entropy": 2.597072058677673, "eval_biology_loss": 2.9495482444763184, "eval_biology_mean_token_accuracy": 0.5221131265163421, "eval_biology_num_tokens": 34537714.0, "eval_biology_runtime": 57.4015, "eval_biology_samples_per_second": 8.711, "eval_biology_steps_per_second": 2.178, "step": 1300 }, { "epoch": 2.313341493268054, "eval_chemistry_entropy": 1.1102874970436096, "eval_chemistry_loss": 1.1702996492385864, "eval_chemistry_mean_token_accuracy": 0.7484057130813598, "eval_chemistry_num_tokens": 34537714.0, "eval_chemistry_runtime": 72.2853, "eval_chemistry_samples_per_second": 6.917, "eval_chemistry_steps_per_second": 1.729, "step": 1300 }, { "epoch": 2.313341493268054, "eval_math_entropy": 1.0795171055793762, "eval_math_loss": 1.3760371208190918, "eval_math_mean_token_accuracy": 0.7270366640090943, "eval_math_num_tokens": 34537714.0, "eval_math_runtime": 75.1171, "eval_math_samples_per_second": 6.656, "eval_math_steps_per_second": 1.664, "step": 1300 }, { "epoch": 2.313341493268054, "eval_physics_entropy": 1.195563290119171, "eval_physics_loss": 1.2820779085159302, "eval_physics_mean_token_accuracy": 0.7392076215744019, "eval_physics_num_tokens": 34537714.0, "eval_physics_runtime": 85.1797, "eval_physics_samples_per_second": 5.87, "eval_physics_steps_per_second": 1.467, "step": 1300 }, { "entropy": 1.1043903956189751, "epoch": 2.331144987203739, "grad_norm": 19.125, "learning_rate": 1.2525925925925928e-05, "loss": 18.5308, "mean_token_accuracy": 0.7503409255295992, "num_tokens": 34806647.0, "step": 1310 }, { "entropy": 1.0996684737503528, "epoch": 2.3489484811394234, "grad_norm": 18.25, "learning_rate": 1.2451851851851853e-05, "loss": 18.0647, "mean_token_accuracy": 0.7559586096554994, "num_tokens": 35076423.0, "step": 1320 }, { "entropy": 1.1198013797402382, "epoch": 2.3667519750751085, "grad_norm": 20.75, "learning_rate": 1.237777777777778e-05, "loss": 18.6202, "mean_token_accuracy": 0.7494822707027197, "num_tokens": 35343494.0, "step": 1330 }, { "entropy": 1.0966038379818202, "epoch": 2.3845554690107935, "grad_norm": 20.25, "learning_rate": 1.2303703703703704e-05, "loss": 18.1883, "mean_token_accuracy": 0.754872740060091, "num_tokens": 35614731.0, "step": 1340 }, { "entropy": 1.1123589921742678, "epoch": 2.402358962946478, "grad_norm": 20.875, "learning_rate": 1.222962962962963e-05, "loss": 18.456, "mean_token_accuracy": 0.7513397440314293, "num_tokens": 35877264.0, "step": 1350 }, { "entropy": 1.061964062973857, "epoch": 2.420162456882163, "grad_norm": 18.375, "learning_rate": 1.2155555555555555e-05, "loss": 17.6878, "mean_token_accuracy": 0.7598675034940243, "num_tokens": 36147587.0, "step": 1360 }, { "entropy": 1.1040230866521596, "epoch": 2.437965950817848, "grad_norm": 21.5, "learning_rate": 1.2081481481481484e-05, "loss": 18.3097, "mean_token_accuracy": 0.7549828208982945, "num_tokens": 36416125.0, "step": 1370 }, { "entropy": 1.1043301727622747, "epoch": 2.4557694447535328, "grad_norm": 21.5, "learning_rate": 1.2007407407407408e-05, "loss": 18.3398, "mean_token_accuracy": 0.7537022326141596, "num_tokens": 36680383.0, "step": 1380 }, { "entropy": 1.1061188193038105, "epoch": 2.473572938689218, "grad_norm": 19.875, "learning_rate": 1.1933333333333335e-05, "loss": 18.3255, "mean_token_accuracy": 0.7524043101817369, "num_tokens": 36946349.0, "step": 1390 }, { "entropy": 1.0568878058344127, "epoch": 2.491376432624903, "grad_norm": 16.875, "learning_rate": 1.185925925925926e-05, "loss": 17.5249, "mean_token_accuracy": 0.7631021294742822, "num_tokens": 37219532.0, "step": 1400 }, { "epoch": 2.491376432624903, "eval_biology_entropy": 2.652871153831482, "eval_biology_loss": 2.94612979888916, "eval_biology_mean_token_accuracy": 0.5220030138492584, "eval_biology_num_tokens": 37219532.0, "eval_biology_runtime": 57.6473, "eval_biology_samples_per_second": 8.673, "eval_biology_steps_per_second": 2.168, "step": 1400 }, { "epoch": 2.491376432624903, "eval_chemistry_entropy": 1.1232130107879639, "eval_chemistry_loss": 1.1665852069854736, "eval_chemistry_mean_token_accuracy": 0.7491925139427185, "eval_chemistry_num_tokens": 37219532.0, "eval_chemistry_runtime": 72.2294, "eval_chemistry_samples_per_second": 6.922, "eval_chemistry_steps_per_second": 1.731, "step": 1400 }, { "epoch": 2.491376432624903, "eval_math_entropy": 1.0859251236915588, "eval_math_loss": 1.372695803642273, "eval_math_mean_token_accuracy": 0.7273387761116028, "eval_math_num_tokens": 37219532.0, "eval_math_runtime": 75.0046, "eval_math_samples_per_second": 6.666, "eval_math_steps_per_second": 1.667, "step": 1400 }, { "epoch": 2.491376432624903, "eval_physics_entropy": 1.2078160109519958, "eval_physics_loss": 1.2796850204467773, "eval_physics_mean_token_accuracy": 0.7398202438354492, "eval_physics_num_tokens": 37219532.0, "eval_physics_runtime": 85.3077, "eval_physics_samples_per_second": 5.861, "eval_physics_steps_per_second": 1.465, "step": 1400 }, { "entropy": 1.091886579245329, "epoch": 2.5091799265605874, "grad_norm": 20.25, "learning_rate": 1.1785185185185186e-05, "loss": 18.0794, "mean_token_accuracy": 0.7578685782849789, "num_tokens": 37477914.0, "step": 1410 }, { "entropy": 1.0806248864158987, "epoch": 2.5269834204962724, "grad_norm": 18.875, "learning_rate": 1.1711111111111113e-05, "loss": 18.0524, "mean_token_accuracy": 0.7555962100625038, "num_tokens": 37746690.0, "step": 1420 }, { "entropy": 1.136592934280634, "epoch": 2.544786914431957, "grad_norm": 23.375, "learning_rate": 1.1637037037037037e-05, "loss": 18.8612, "mean_token_accuracy": 0.7464782755821944, "num_tokens": 38005792.0, "step": 1430 }, { "entropy": 1.117892136052251, "epoch": 2.562590408367642, "grad_norm": 19.875, "learning_rate": 1.1562962962962964e-05, "loss": 18.543, "mean_token_accuracy": 0.751380517706275, "num_tokens": 38269021.0, "step": 1440 }, { "entropy": 1.1022701445966958, "epoch": 2.580393902303327, "grad_norm": 18.625, "learning_rate": 1.1488888888888889e-05, "loss": 18.1605, "mean_token_accuracy": 0.7542933959513902, "num_tokens": 38545224.0, "step": 1450 }, { "entropy": 1.0846613895148038, "epoch": 2.598197396239012, "grad_norm": 18.375, "learning_rate": 1.1414814814814817e-05, "loss": 18.1621, "mean_token_accuracy": 0.7546363666653633, "num_tokens": 38802290.0, "step": 1460 }, { "entropy": 1.079243928194046, "epoch": 2.6160008901746967, "grad_norm": 20.125, "learning_rate": 1.1340740740740742e-05, "loss": 17.8188, "mean_token_accuracy": 0.7600852824747563, "num_tokens": 39065787.0, "step": 1470 }, { "entropy": 1.0927384681999683, "epoch": 2.6338043841103818, "grad_norm": 20.25, "learning_rate": 1.1266666666666668e-05, "loss": 18.1991, "mean_token_accuracy": 0.7551762603223324, "num_tokens": 39328517.0, "step": 1480 }, { "entropy": 1.0439645014703274, "epoch": 2.6516078780460663, "grad_norm": 19.25, "learning_rate": 1.1192592592592593e-05, "loss": 17.3405, "mean_token_accuracy": 0.763591681048274, "num_tokens": 39600641.0, "step": 1490 }, { "entropy": 1.1232297539710998, "epoch": 2.6694113719817514, "grad_norm": 23.5, "learning_rate": 1.111851851851852e-05, "loss": 18.7, "mean_token_accuracy": 0.7493483603000641, "num_tokens": 39857917.0, "step": 1500 }, { "epoch": 2.6694113719817514, "eval_biology_entropy": 2.6466422786712647, "eval_biology_loss": 2.9346089363098145, "eval_biology_mean_token_accuracy": 0.5228305985927582, "eval_biology_num_tokens": 39857917.0, "eval_biology_runtime": 58.0118, "eval_biology_samples_per_second": 8.619, "eval_biology_steps_per_second": 2.155, "step": 1500 }, { "epoch": 2.6694113719817514, "eval_chemistry_entropy": 1.118860065460205, "eval_chemistry_loss": 1.1640205383300781, "eval_chemistry_mean_token_accuracy": 0.7496954665184021, "eval_chemistry_num_tokens": 39857917.0, "eval_chemistry_runtime": 71.5035, "eval_chemistry_samples_per_second": 6.993, "eval_chemistry_steps_per_second": 1.748, "step": 1500 }, { "epoch": 2.6694113719817514, "eval_math_entropy": 1.0849408197402954, "eval_math_loss": 1.370849847793579, "eval_math_mean_token_accuracy": 0.7275271005630494, "eval_math_num_tokens": 39857917.0, "eval_math_runtime": 74.7261, "eval_math_samples_per_second": 6.691, "eval_math_steps_per_second": 1.673, "step": 1500 }, { "epoch": 2.6694113719817514, "eval_physics_entropy": 1.20153675699234, "eval_physics_loss": 1.277133822441101, "eval_physics_mean_token_accuracy": 0.7400126695632935, "eval_physics_num_tokens": 39857917.0, "eval_physics_runtime": 84.812, "eval_physics_samples_per_second": 5.895, "eval_physics_steps_per_second": 1.474, "step": 1500 }, { "entropy": 1.0860063921660186, "epoch": 2.6872148659174364, "grad_norm": 21.125, "learning_rate": 1.1044444444444444e-05, "loss": 18.0853, "mean_token_accuracy": 0.7563430912792682, "num_tokens": 40119174.0, "step": 1510 }, { "entropy": 1.129167440533638, "epoch": 2.7050183598531214, "grad_norm": 20.125, "learning_rate": 1.0970370370370371e-05, "loss": 18.6618, "mean_token_accuracy": 0.7484573539346456, "num_tokens": 40379054.0, "step": 1520 }, { "entropy": 1.07851094417274, "epoch": 2.722821853788806, "grad_norm": 19.0, "learning_rate": 1.0896296296296298e-05, "loss": 18.083, "mean_token_accuracy": 0.7540540069341659, "num_tokens": 40650970.0, "step": 1530 }, { "entropy": 1.1019277192652226, "epoch": 2.740625347724491, "grad_norm": 18.625, "learning_rate": 1.0822222222222222e-05, "loss": 18.2659, "mean_token_accuracy": 0.7548778887838126, "num_tokens": 40914339.0, "step": 1540 }, { "entropy": 1.1040710996836425, "epoch": 2.7584288416601757, "grad_norm": 21.0, "learning_rate": 1.074814814814815e-05, "loss": 18.273, "mean_token_accuracy": 0.7527279917150735, "num_tokens": 41181827.0, "step": 1550 }, { "entropy": 1.0668385986238718, "epoch": 2.7762323355958607, "grad_norm": 22.0, "learning_rate": 1.0674074074074074e-05, "loss": 17.8438, "mean_token_accuracy": 0.7587875317782163, "num_tokens": 41448961.0, "step": 1560 }, { "entropy": 1.0884086616337298, "epoch": 2.7940358295315457, "grad_norm": 21.75, "learning_rate": 1.0600000000000002e-05, "loss": 18.0262, "mean_token_accuracy": 0.7560815311968326, "num_tokens": 41712243.0, "step": 1570 }, { "entropy": 1.1063744578510524, "epoch": 2.8118393234672303, "grad_norm": 18.0, "learning_rate": 1.0525925925925927e-05, "loss": 18.1967, "mean_token_accuracy": 0.754052161052823, "num_tokens": 41981755.0, "step": 1580 }, { "entropy": 1.0421814311295747, "epoch": 2.8296428174029153, "grad_norm": 18.625, "learning_rate": 1.0451851851851853e-05, "loss": 17.409, "mean_token_accuracy": 0.7653664790093899, "num_tokens": 42258270.0, "step": 1590 }, { "entropy": 1.0939543709158897, "epoch": 2.8474463113386, "grad_norm": 18.375, "learning_rate": 1.0377777777777778e-05, "loss": 18.0482, "mean_token_accuracy": 0.7579692296683789, "num_tokens": 42529321.0, "step": 1600 }, { "epoch": 2.8474463113386, "eval_biology_entropy": 2.637769458770752, "eval_biology_loss": 2.940894603729248, "eval_biology_mean_token_accuracy": 0.5229103591442108, "eval_biology_num_tokens": 42529321.0, "eval_biology_runtime": 57.8557, "eval_biology_samples_per_second": 8.642, "eval_biology_steps_per_second": 2.161, "step": 1600 }, { "epoch": 2.8474463113386, "eval_chemistry_entropy": 1.1125650115013122, "eval_chemistry_loss": 1.160796046257019, "eval_chemistry_mean_token_accuracy": 0.7501013536453247, "eval_chemistry_num_tokens": 42529321.0, "eval_chemistry_runtime": 72.2026, "eval_chemistry_samples_per_second": 6.925, "eval_chemistry_steps_per_second": 1.731, "step": 1600 }, { "epoch": 2.8474463113386, "eval_math_entropy": 1.0801175408363342, "eval_math_loss": 1.3694645166397095, "eval_math_mean_token_accuracy": 0.7277571864128113, "eval_math_num_tokens": 42529321.0, "eval_math_runtime": 75.2884, "eval_math_samples_per_second": 6.641, "eval_math_steps_per_second": 1.66, "step": 1600 }, { "epoch": 2.8474463113386, "eval_physics_entropy": 1.198549277305603, "eval_physics_loss": 1.27517569065094, "eval_physics_mean_token_accuracy": 0.7403683791160584, "eval_physics_num_tokens": 42529321.0, "eval_physics_runtime": 85.395, "eval_physics_samples_per_second": 5.855, "eval_physics_steps_per_second": 1.464, "step": 1600 }, { "entropy": 1.1290572371333838, "epoch": 2.865249805274285, "grad_norm": 20.5, "learning_rate": 1.0303703703703705e-05, "loss": 18.8437, "mean_token_accuracy": 0.7473030380904675, "num_tokens": 42793729.0, "step": 1610 }, { "entropy": 1.0939196426421405, "epoch": 2.88305329920997, "grad_norm": 19.375, "learning_rate": 1.0229629629629631e-05, "loss": 18.0962, "mean_token_accuracy": 0.7554187960922718, "num_tokens": 43067006.0, "step": 1620 }, { "entropy": 1.0769088421016932, "epoch": 2.900856793145655, "grad_norm": 24.0, "learning_rate": 1.0155555555555556e-05, "loss": 17.8392, "mean_token_accuracy": 0.7592465952038765, "num_tokens": 43333868.0, "step": 1630 }, { "entropy": 1.0659821335226298, "epoch": 2.9186602870813396, "grad_norm": 20.75, "learning_rate": 1.0081481481481484e-05, "loss": 17.6974, "mean_token_accuracy": 0.7605714075267315, "num_tokens": 43605151.0, "step": 1640 }, { "entropy": 1.0646285666152835, "epoch": 2.9364637810170247, "grad_norm": 19.625, "learning_rate": 1.0007407407407407e-05, "loss": 17.7336, "mean_token_accuracy": 0.7622984137386084, "num_tokens": 43863034.0, "step": 1650 }, { "entropy": 1.1087765533477068, "epoch": 2.9542672749527092, "grad_norm": 20.25, "learning_rate": 9.933333333333334e-06, "loss": 18.3884, "mean_token_accuracy": 0.751496770605445, "num_tokens": 44126796.0, "step": 1660 }, { "entropy": 1.1063459984958173, "epoch": 2.9720707688883943, "grad_norm": 21.5, "learning_rate": 9.85925925925926e-06, "loss": 18.4278, "mean_token_accuracy": 0.7521345388144255, "num_tokens": 44382724.0, "step": 1670 }, { "entropy": 1.0647218599915504, "epoch": 2.9898742628240793, "grad_norm": 18.875, "learning_rate": 9.785185185185187e-06, "loss": 17.7257, "mean_token_accuracy": 0.7603258349001407, "num_tokens": 44651471.0, "step": 1680 }, { "entropy": 1.0653223249220078, "epoch": 3.007121397574274, "grad_norm": 18.5, "learning_rate": 9.711111111111111e-06, "loss": 16.9743, "mean_token_accuracy": 0.7601908556876644, "num_tokens": 44909981.0, "step": 1690 }, { "entropy": 1.1704007178544997, "epoch": 3.0249248915099587, "grad_norm": 22.75, "learning_rate": 9.637037037037038e-06, "loss": 19.3967, "mean_token_accuracy": 0.7418560739606619, "num_tokens": 45176547.0, "step": 1700 }, { "epoch": 3.0249248915099587, "eval_biology_entropy": 2.6161900300979615, "eval_biology_loss": 2.935819625854492, "eval_biology_mean_token_accuracy": 0.5230246422290802, "eval_biology_num_tokens": 45176547.0, "eval_biology_runtime": 57.8378, "eval_biology_samples_per_second": 8.645, "eval_biology_steps_per_second": 2.161, "step": 1700 }, { "epoch": 3.0249248915099587, "eval_chemistry_entropy": 1.1025579800605774, "eval_chemistry_loss": 1.1589548587799072, "eval_chemistry_mean_token_accuracy": 0.7501291260719299, "eval_chemistry_num_tokens": 45176547.0, "eval_chemistry_runtime": 71.8656, "eval_chemistry_samples_per_second": 6.957, "eval_chemistry_steps_per_second": 1.739, "step": 1700 }, { "epoch": 3.0249248915099587, "eval_math_entropy": 1.0748444156646728, "eval_math_loss": 1.3698667287826538, "eval_math_mean_token_accuracy": 0.7280403084754944, "eval_math_num_tokens": 45176547.0, "eval_math_runtime": 75.1807, "eval_math_samples_per_second": 6.651, "eval_math_steps_per_second": 1.663, "step": 1700 }, { "epoch": 3.0249248915099587, "eval_physics_entropy": 1.1882516493797302, "eval_physics_loss": 1.273923397064209, "eval_physics_mean_token_accuracy": 0.7405211963653564, "eval_physics_num_tokens": 45176547.0, "eval_physics_runtime": 85.18, "eval_physics_samples_per_second": 5.87, "eval_physics_steps_per_second": 1.467, "step": 1700 }, { "entropy": 1.045224749110639, "epoch": 3.0427283854456437, "grad_norm": 21.625, "learning_rate": 9.562962962962965e-06, "loss": 17.5191, "mean_token_accuracy": 0.7625730182975531, "num_tokens": 45437181.0, "step": 1710 }, { "entropy": 1.0588873416185378, "epoch": 3.0605318793813288, "grad_norm": 21.0, "learning_rate": 9.48888888888889e-06, "loss": 17.5556, "mean_token_accuracy": 0.7609021920710802, "num_tokens": 45702431.0, "step": 1720 }, { "entropy": 1.0747338887304068, "epoch": 3.0783353733170133, "grad_norm": 18.25, "learning_rate": 9.414814814814816e-06, "loss": 17.8698, "mean_token_accuracy": 0.757227075099945, "num_tokens": 45975788.0, "step": 1730 }, { "entropy": 1.0600129183381797, "epoch": 3.0961388672526984, "grad_norm": 21.0, "learning_rate": 9.34074074074074e-06, "loss": 17.366, "mean_token_accuracy": 0.7626322463154793, "num_tokens": 46251349.0, "step": 1740 }, { "entropy": 1.109250448271632, "epoch": 3.1139423611883834, "grad_norm": 24.75, "learning_rate": 9.266666666666667e-06, "loss": 18.4852, "mean_token_accuracy": 0.7508084613829851, "num_tokens": 46516802.0, "step": 1750 }, { "entropy": 1.056485254317522, "epoch": 3.131745855124068, "grad_norm": 20.125, "learning_rate": 9.192592592592594e-06, "loss": 17.5278, "mean_token_accuracy": 0.7627970885485411, "num_tokens": 46790537.0, "step": 1760 }, { "entropy": 1.0896330252289772, "epoch": 3.149549349059753, "grad_norm": 21.5, "learning_rate": 9.118518518518518e-06, "loss": 18.1303, "mean_token_accuracy": 0.7566281389445066, "num_tokens": 47052877.0, "step": 1770 }, { "entropy": 1.0868952518329025, "epoch": 3.167352842995438, "grad_norm": 20.25, "learning_rate": 9.044444444444445e-06, "loss": 17.9403, "mean_token_accuracy": 0.7568327851593495, "num_tokens": 47315587.0, "step": 1780 }, { "entropy": 1.0584388840943575, "epoch": 3.1851563369311227, "grad_norm": 21.875, "learning_rate": 8.970370370370372e-06, "loss": 17.6068, "mean_token_accuracy": 0.7634176205843687, "num_tokens": 47578352.0, "step": 1790 }, { "entropy": 1.0782266601920127, "epoch": 3.2029598308668077, "grad_norm": 21.125, "learning_rate": 8.896296296296298e-06, "loss": 17.9251, "mean_token_accuracy": 0.7589756917208433, "num_tokens": 47844623.0, "step": 1800 }, { "epoch": 3.2029598308668077, "eval_biology_entropy": 2.6365171356201174, "eval_biology_loss": 2.932905673980713, "eval_biology_mean_token_accuracy": 0.5235939712524414, "eval_biology_num_tokens": 47844623.0, "eval_biology_runtime": 57.6814, "eval_biology_samples_per_second": 8.668, "eval_biology_steps_per_second": 2.167, "step": 1800 }, { "epoch": 3.2029598308668077, "eval_chemistry_entropy": 1.1065808815956115, "eval_chemistry_loss": 1.157857060432434, "eval_chemistry_mean_token_accuracy": 0.7504735474586487, "eval_chemistry_num_tokens": 47844623.0, "eval_chemistry_runtime": 72.0749, "eval_chemistry_samples_per_second": 6.937, "eval_chemistry_steps_per_second": 1.734, "step": 1800 }, { "epoch": 3.2029598308668077, "eval_math_entropy": 1.078178656578064, "eval_math_loss": 1.3683264255523682, "eval_math_mean_token_accuracy": 0.7283754692077636, "eval_math_num_tokens": 47844623.0, "eval_math_runtime": 75.1466, "eval_math_samples_per_second": 6.654, "eval_math_steps_per_second": 1.663, "step": 1800 }, { "epoch": 3.2029598308668077, "eval_physics_entropy": 1.193580945968628, "eval_physics_loss": 1.2739607095718384, "eval_physics_mean_token_accuracy": 0.7406387696266175, "eval_physics_num_tokens": 47844623.0, "eval_physics_runtime": 85.1683, "eval_physics_samples_per_second": 5.871, "eval_physics_steps_per_second": 1.468, "step": 1800 }, { "entropy": 1.0502044271677733, "epoch": 3.2207633248024923, "grad_norm": 19.25, "learning_rate": 8.822222222222223e-06, "loss": 17.532, "mean_token_accuracy": 0.762039003893733, "num_tokens": 48112262.0, "step": 1810 }, { "entropy": 1.1423117272555827, "epoch": 3.2385668187381773, "grad_norm": 21.125, "learning_rate": 8.74814814814815e-06, "loss": 19.0592, "mean_token_accuracy": 0.7459153685718775, "num_tokens": 48369097.0, "step": 1820 }, { "entropy": 1.0870308240875601, "epoch": 3.2563703126738623, "grad_norm": 20.375, "learning_rate": 8.674074074074074e-06, "loss": 17.9442, "mean_token_accuracy": 0.7594770763069392, "num_tokens": 48628165.0, "step": 1830 }, { "entropy": 1.0671832324936985, "epoch": 3.274173806609547, "grad_norm": 19.5, "learning_rate": 8.6e-06, "loss": 17.6901, "mean_token_accuracy": 0.759853546321392, "num_tokens": 48898634.0, "step": 1840 }, { "entropy": 1.0650055054575205, "epoch": 3.291977300545232, "grad_norm": 18.5, "learning_rate": 8.525925925925927e-06, "loss": 17.6381, "mean_token_accuracy": 0.7601484149694443, "num_tokens": 49167839.0, "step": 1850 }, { "entropy": 1.083081658370793, "epoch": 3.309780794480917, "grad_norm": 20.5, "learning_rate": 8.451851851851852e-06, "loss": 17.9985, "mean_token_accuracy": 0.7583192996680737, "num_tokens": 49431378.0, "step": 1860 }, { "entropy": 1.0654306124895812, "epoch": 3.3275842884166016, "grad_norm": 17.375, "learning_rate": 8.377777777777779e-06, "loss": 17.7162, "mean_token_accuracy": 0.7611102845519782, "num_tokens": 49699309.0, "step": 1870 }, { "entropy": 1.086764731630683, "epoch": 3.3453877823522866, "grad_norm": 18.625, "learning_rate": 8.303703703703705e-06, "loss": 18.0623, "mean_token_accuracy": 0.7570337392389774, "num_tokens": 49972032.0, "step": 1880 }, { "entropy": 1.092942675575614, "epoch": 3.3631912762879717, "grad_norm": 22.75, "learning_rate": 8.229629629629632e-06, "loss": 18.1498, "mean_token_accuracy": 0.7554987825453281, "num_tokens": 50241367.0, "step": 1890 }, { "entropy": 1.1048477381467818, "epoch": 3.3809947702236562, "grad_norm": 19.75, "learning_rate": 8.155555555555556e-06, "loss": 18.1831, "mean_token_accuracy": 0.7542665727436543, "num_tokens": 50511167.0, "step": 1900 }, { "epoch": 3.3809947702236562, "eval_biology_entropy": 2.608997480392456, "eval_biology_loss": 2.929826259613037, "eval_biology_mean_token_accuracy": 0.5243550386428834, "eval_biology_num_tokens": 50511167.0, "eval_biology_runtime": 58.3475, "eval_biology_samples_per_second": 8.569, "eval_biology_steps_per_second": 2.142, "step": 1900 }, { "epoch": 3.3809947702236562, "eval_chemistry_entropy": 1.1078306741714476, "eval_chemistry_loss": 1.1564204692840576, "eval_chemistry_mean_token_accuracy": 0.7505966582298279, "eval_chemistry_num_tokens": 50511167.0, "eval_chemistry_runtime": 72.2379, "eval_chemistry_samples_per_second": 6.922, "eval_chemistry_steps_per_second": 1.73, "step": 1900 }, { "epoch": 3.3809947702236562, "eval_math_entropy": 1.0781858038902283, "eval_math_loss": 1.3685622215270996, "eval_math_mean_token_accuracy": 0.7281003012657166, "eval_math_num_tokens": 50511167.0, "eval_math_runtime": 75.3062, "eval_math_samples_per_second": 6.64, "eval_math_steps_per_second": 1.66, "step": 1900 }, { "epoch": 3.3809947702236562, "eval_physics_entropy": 1.193738681793213, "eval_physics_loss": 1.2733585834503174, "eval_physics_mean_token_accuracy": 0.7406534767150879, "eval_physics_num_tokens": 50511167.0, "eval_physics_runtime": 85.3447, "eval_physics_samples_per_second": 5.859, "eval_physics_steps_per_second": 1.465, "step": 1900 }, { "entropy": 1.1158889718353748, "epoch": 3.3987982641593413, "grad_norm": 22.75, "learning_rate": 8.081481481481483e-06, "loss": 18.4653, "mean_token_accuracy": 0.7502956230193376, "num_tokens": 50776842.0, "step": 1910 }, { "entropy": 1.076490705087781, "epoch": 3.4166017580950263, "grad_norm": 20.75, "learning_rate": 8.007407407407408e-06, "loss": 17.9335, "mean_token_accuracy": 0.7567251056432724, "num_tokens": 51040514.0, "step": 1920 }, { "entropy": 1.0964046400040388, "epoch": 3.434405252030711, "grad_norm": 21.625, "learning_rate": 7.933333333333334e-06, "loss": 18.146, "mean_token_accuracy": 0.7544326152652502, "num_tokens": 51300696.0, "step": 1930 }, { "entropy": 1.087927882000804, "epoch": 3.452208745966396, "grad_norm": 21.75, "learning_rate": 7.859259259259259e-06, "loss": 18.1376, "mean_token_accuracy": 0.754492249712348, "num_tokens": 51558606.0, "step": 1940 }, { "entropy": 1.0904299218207598, "epoch": 3.470012239902081, "grad_norm": 18.75, "learning_rate": 7.785185185185185e-06, "loss": 18.0444, "mean_token_accuracy": 0.7552218366414308, "num_tokens": 51827834.0, "step": 1950 }, { "entropy": 1.059173485264182, "epoch": 3.4878157338377656, "grad_norm": 20.0, "learning_rate": 7.711111111111112e-06, "loss": 17.5376, "mean_token_accuracy": 0.7617850303649902, "num_tokens": 52100179.0, "step": 1960 }, { "entropy": 1.1043213743716478, "epoch": 3.5056192277734506, "grad_norm": 20.75, "learning_rate": 7.637037037037037e-06, "loss": 18.2923, "mean_token_accuracy": 0.7549142900854349, "num_tokens": 52368055.0, "step": 1970 }, { "entropy": 1.0561675556004047, "epoch": 3.523422721709135, "grad_norm": 19.625, "learning_rate": 7.562962962962963e-06, "loss": 17.583, "mean_token_accuracy": 0.7620449144393205, "num_tokens": 52637505.0, "step": 1980 }, { "entropy": 1.0883250068873167, "epoch": 3.54122621564482, "grad_norm": 20.625, "learning_rate": 7.48888888888889e-06, "loss": 18.0451, "mean_token_accuracy": 0.7565963264554739, "num_tokens": 52906194.0, "step": 1990 }, { "entropy": 1.0408041454851626, "epoch": 3.5590297095805052, "grad_norm": 19.625, "learning_rate": 7.4148148148148155e-06, "loss": 17.3497, "mean_token_accuracy": 0.7656379960477352, "num_tokens": 53172376.0, "step": 2000 }, { "epoch": 3.5590297095805052, "eval_biology_entropy": 2.651417718887329, "eval_biology_loss": 2.9278793334960938, "eval_biology_mean_token_accuracy": 0.5243545579910278, "eval_biology_num_tokens": 53172376.0, "eval_biology_runtime": 57.5665, "eval_biology_samples_per_second": 8.686, "eval_biology_steps_per_second": 2.171, "step": 2000 }, { "epoch": 3.5590297095805052, "eval_chemistry_entropy": 1.111885326385498, "eval_chemistry_loss": 1.1551117897033691, "eval_chemistry_mean_token_accuracy": 0.7506753826141357, "eval_chemistry_num_tokens": 53172376.0, "eval_chemistry_runtime": 71.4351, "eval_chemistry_samples_per_second": 6.999, "eval_chemistry_steps_per_second": 1.75, "step": 2000 }, { "epoch": 3.5590297095805052, "eval_math_entropy": 1.081713596343994, "eval_math_loss": 1.3672666549682617, "eval_math_mean_token_accuracy": 0.7283283696174622, "eval_math_num_tokens": 53172376.0, "eval_math_runtime": 74.9256, "eval_math_samples_per_second": 6.673, "eval_math_steps_per_second": 1.668, "step": 2000 }, { "epoch": 3.5590297095805052, "eval_physics_entropy": 1.2001786155700684, "eval_physics_loss": 1.2717243432998657, "eval_physics_mean_token_accuracy": 0.7409325709342957, "eval_physics_num_tokens": 53172376.0, "eval_physics_runtime": 84.6622, "eval_physics_samples_per_second": 5.906, "eval_physics_steps_per_second": 1.476, "step": 2000 } ], "logging_steps": 10, "max_steps": 3000, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.556064716540213e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }