{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.2, "eval_steps": 100, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.335784598439932, "epoch": 0.016, "grad_norm": 157.0, "learning_rate": 6.000000000000001e-07, "loss": 34.5026, "mean_token_accuracy": 0.6119794614613057, "num_tokens": 338352.0, "step": 10 }, { "entropy": 1.3430390540510415, "epoch": 0.032, "grad_norm": 137.0, "learning_rate": 1.2666666666666669e-06, "loss": 34.2867, "mean_token_accuracy": 0.6166948016732932, "num_tokens": 671193.0, "step": 20 }, { "entropy": 1.3716558743268252, "epoch": 0.048, "grad_norm": 121.5, "learning_rate": 1.9333333333333336e-06, "loss": 33.3472, "mean_token_accuracy": 0.6195640843361616, "num_tokens": 1001008.0, "step": 30 }, { "entropy": 1.492756663635373, "epoch": 0.064, "grad_norm": 91.5, "learning_rate": 2.6e-06, "loss": 32.9244, "mean_token_accuracy": 0.6165778396651149, "num_tokens": 1331208.0, "step": 40 }, { "entropy": 1.5643325500190257, "epoch": 0.08, "grad_norm": 64.0, "learning_rate": 3.266666666666667e-06, "loss": 30.4382, "mean_token_accuracy": 0.6357192637398839, "num_tokens": 1667532.0, "step": 50 }, { "entropy": 1.5284064427018165, "epoch": 0.096, "grad_norm": 43.5, "learning_rate": 3.9333333333333335e-06, "loss": 27.3833, "mean_token_accuracy": 0.6618047762662173, "num_tokens": 2007176.0, "step": 60 }, { "entropy": 1.5410372719168663, "epoch": 0.112, "grad_norm": 36.25, "learning_rate": 4.600000000000001e-06, "loss": 26.009, "mean_token_accuracy": 0.6689645217731595, "num_tokens": 2342259.0, "step": 70 }, { "entropy": 1.5293128810822965, "epoch": 0.128, "grad_norm": 30.5, "learning_rate": 5.2666666666666665e-06, "loss": 24.7913, "mean_token_accuracy": 0.6720394570380449, "num_tokens": 2663987.0, "step": 80 }, { "entropy": 1.4024027574807405, "epoch": 0.144, "grad_norm": 22.0, "learning_rate": 5.933333333333335e-06, "loss": 22.4089, "mean_token_accuracy": 0.6952127493917942, "num_tokens": 2997824.0, "step": 90 }, { "entropy": 1.3540743235498667, "epoch": 0.16, "grad_norm": 19.875, "learning_rate": 6.600000000000001e-06, "loss": 21.2896, "mean_token_accuracy": 0.7063459653407336, "num_tokens": 3330597.0, "step": 100 }, { "epoch": 0.16, "eval_biology_entropy": 2.8890565853118897, "eval_biology_loss": 3.7317216396331787, "eval_biology_mean_token_accuracy": 0.4269867980480194, "eval_biology_num_tokens": 3330597.0, "eval_biology_runtime": 49.6093, "eval_biology_samples_per_second": 10.079, "eval_biology_steps_per_second": 2.52, "step": 100 }, { "epoch": 0.16, "eval_chemistry_entropy": 1.682255081653595, "eval_chemistry_loss": 1.6963560581207275, "eval_chemistry_mean_token_accuracy": 0.6472484331130981, "eval_chemistry_num_tokens": 3330597.0, "eval_chemistry_runtime": 61.4961, "eval_chemistry_samples_per_second": 8.131, "eval_chemistry_steps_per_second": 2.033, "step": 100 }, { "epoch": 0.16, "eval_math_entropy": 1.142211413383484, "eval_math_loss": 1.4016218185424805, "eval_math_mean_token_accuracy": 0.7029124236106873, "eval_math_num_tokens": 3330597.0, "eval_math_runtime": 63.0568, "eval_math_samples_per_second": 7.929, "eval_math_steps_per_second": 1.982, "step": 100 }, { "epoch": 0.16, "eval_physics_entropy": 1.3429004917144776, "eval_physics_loss": 1.3169747591018677, "eval_physics_mean_token_accuracy": 0.7064582467079162, "eval_physics_num_tokens": 3330597.0, "eval_physics_runtime": 71.749, "eval_physics_samples_per_second": 6.969, "eval_physics_steps_per_second": 1.742, "step": 100 }, { "entropy": 1.3304455887526274, "epoch": 0.176, "grad_norm": 22.375, "learning_rate": 7.266666666666668e-06, "loss": 20.7106, "mean_token_accuracy": 0.7095530580729246, "num_tokens": 3658264.0, "step": 110 }, { "entropy": 1.2620876263827085, "epoch": 0.192, "grad_norm": 16.375, "learning_rate": 7.933333333333334e-06, "loss": 19.8944, "mean_token_accuracy": 0.7141206510365009, "num_tokens": 3995568.0, "step": 120 }, { "entropy": 1.201955909654498, "epoch": 0.208, "grad_norm": 17.375, "learning_rate": 8.6e-06, "loss": 18.9243, "mean_token_accuracy": 0.7253232792019844, "num_tokens": 4321436.0, "step": 130 }, { "entropy": 1.178970755264163, "epoch": 0.224, "grad_norm": 15.6875, "learning_rate": 9.266666666666667e-06, "loss": 18.583, "mean_token_accuracy": 0.7276350263506174, "num_tokens": 4648491.0, "step": 140 }, { "entropy": 1.155185490846634, "epoch": 0.24, "grad_norm": 16.625, "learning_rate": 9.933333333333334e-06, "loss": 18.4686, "mean_token_accuracy": 0.7289414200931787, "num_tokens": 4986175.0, "step": 150 }, { "entropy": 1.1447543263435365, "epoch": 0.256, "grad_norm": 17.125, "learning_rate": 1.0600000000000002e-05, "loss": 18.0999, "mean_token_accuracy": 0.7330310437828302, "num_tokens": 5329320.0, "step": 160 }, { "entropy": 1.142161527276039, "epoch": 0.272, "grad_norm": 17.625, "learning_rate": 1.1266666666666668e-05, "loss": 18.0884, "mean_token_accuracy": 0.7317263871431351, "num_tokens": 5658796.0, "step": 170 }, { "entropy": 1.1180811140686273, "epoch": 0.288, "grad_norm": 15.75, "learning_rate": 1.1933333333333335e-05, "loss": 17.8484, "mean_token_accuracy": 0.7360638868063688, "num_tokens": 5980201.0, "step": 180 }, { "entropy": 1.0995653536170722, "epoch": 0.304, "grad_norm": 19.75, "learning_rate": 1.2600000000000001e-05, "loss": 17.5458, "mean_token_accuracy": 0.7386808756738901, "num_tokens": 6298858.0, "step": 190 }, { "entropy": 1.0919909674674273, "epoch": 0.32, "grad_norm": 17.75, "learning_rate": 1.3266666666666668e-05, "loss": 17.376, "mean_token_accuracy": 0.7401029217988253, "num_tokens": 6622798.0, "step": 200 }, { "epoch": 0.32, "eval_biology_entropy": 2.781595293998718, "eval_biology_loss": 3.2204973697662354, "eval_biology_mean_token_accuracy": 0.46290496277809146, "eval_biology_num_tokens": 6622798.0, "eval_biology_runtime": 49.6202, "eval_biology_samples_per_second": 10.077, "eval_biology_steps_per_second": 2.519, "step": 200 }, { "epoch": 0.32, "eval_chemistry_entropy": 1.4062482738494873, "eval_chemistry_loss": 1.4335906505584717, "eval_chemistry_mean_token_accuracy": 0.6814104890823365, "eval_chemistry_num_tokens": 6622798.0, "eval_chemistry_runtime": 61.4965, "eval_chemistry_samples_per_second": 8.131, "eval_chemistry_steps_per_second": 2.033, "step": 200 }, { "epoch": 0.32, "eval_math_entropy": 1.0162367067337037, "eval_math_loss": 1.2369706630706787, "eval_math_mean_token_accuracy": 0.721817741394043, "eval_math_num_tokens": 6622798.0, "eval_math_runtime": 63.0765, "eval_math_samples_per_second": 7.927, "eval_math_steps_per_second": 1.982, "step": 200 }, { "epoch": 0.32, "eval_physics_entropy": 1.067228192806244, "eval_physics_loss": 1.0861860513687134, "eval_physics_mean_token_accuracy": 0.7410436091423035, "eval_physics_num_tokens": 6622798.0, "eval_physics_runtime": 71.8547, "eval_physics_samples_per_second": 6.958, "eval_physics_steps_per_second": 1.74, "step": 200 }, { "entropy": 1.0740001201629639, "epoch": 0.336, "grad_norm": 13.8125, "learning_rate": 1.3933333333333334e-05, "loss": 17.1669, "mean_token_accuracy": 0.74343366548419, "num_tokens": 6953611.0, "step": 210 }, { "entropy": 1.042375946417451, "epoch": 0.352, "grad_norm": 16.5, "learning_rate": 1.46e-05, "loss": 16.6192, "mean_token_accuracy": 0.7505168370902539, "num_tokens": 7290378.0, "step": 220 }, { "entropy": 1.0228992454707622, "epoch": 0.368, "grad_norm": 16.375, "learning_rate": 1.5266666666666667e-05, "loss": 16.3576, "mean_token_accuracy": 0.7527059823274612, "num_tokens": 7621082.0, "step": 230 }, { "entropy": 1.0688911959528924, "epoch": 0.384, "grad_norm": 17.0, "learning_rate": 1.5933333333333336e-05, "loss": 17.0574, "mean_token_accuracy": 0.7418320391327142, "num_tokens": 7955570.0, "step": 240 }, { "entropy": 1.016674182936549, "epoch": 0.4, "grad_norm": 17.375, "learning_rate": 1.66e-05, "loss": 16.2747, "mean_token_accuracy": 0.7516404036432505, "num_tokens": 8291049.0, "step": 250 }, { "entropy": 0.9974442631006241, "epoch": 0.416, "grad_norm": 16.0, "learning_rate": 1.726666666666667e-05, "loss": 15.9357, "mean_token_accuracy": 0.7562779419124126, "num_tokens": 8627310.0, "step": 260 }, { "entropy": 1.0080594643950462, "epoch": 0.432, "grad_norm": 16.625, "learning_rate": 1.7933333333333333e-05, "loss": 16.1472, "mean_token_accuracy": 0.7539955742657185, "num_tokens": 8958371.0, "step": 270 }, { "entropy": 0.9669299107044935, "epoch": 0.448, "grad_norm": 15.625, "learning_rate": 1.86e-05, "loss": 15.5349, "mean_token_accuracy": 0.7622108798474073, "num_tokens": 9294388.0, "step": 280 }, { "entropy": 0.9649454571306706, "epoch": 0.464, "grad_norm": 16.625, "learning_rate": 1.926666666666667e-05, "loss": 15.3696, "mean_token_accuracy": 0.7622643817216158, "num_tokens": 9620366.0, "step": 290 }, { "entropy": 0.9703133786097169, "epoch": 0.48, "grad_norm": 17.125, "learning_rate": 1.9933333333333334e-05, "loss": 15.5958, "mean_token_accuracy": 0.7595039043575526, "num_tokens": 9955431.0, "step": 300 }, { "epoch": 0.48, "eval_biology_entropy": 2.6708189754486082, "eval_biology_loss": 3.0823237895965576, "eval_biology_mean_token_accuracy": 0.4763511681556702, "eval_biology_num_tokens": 9955431.0, "eval_biology_runtime": 49.4876, "eval_biology_samples_per_second": 10.104, "eval_biology_steps_per_second": 2.526, "step": 300 }, { "epoch": 0.48, "eval_chemistry_entropy": 1.320955008983612, "eval_chemistry_loss": 1.3409372568130493, "eval_chemistry_mean_token_accuracy": 0.6957354559898377, "eval_chemistry_num_tokens": 9955431.0, "eval_chemistry_runtime": 61.3511, "eval_chemistry_samples_per_second": 8.15, "eval_chemistry_steps_per_second": 2.037, "step": 300 }, { "epoch": 0.48, "eval_math_entropy": 0.9890776557922363, "eval_math_loss": 1.1766003370285034, "eval_math_mean_token_accuracy": 0.732050290107727, "eval_math_num_tokens": 9955431.0, "eval_math_runtime": 62.9199, "eval_math_samples_per_second": 7.947, "eval_math_steps_per_second": 1.987, "step": 300 }, { "epoch": 0.48, "eval_physics_entropy": 0.9889620594978332, "eval_physics_loss": 0.9930320978164673, "eval_physics_mean_token_accuracy": 0.7561589913368225, "eval_physics_num_tokens": 9955431.0, "eval_physics_runtime": 71.6691, "eval_physics_samples_per_second": 6.977, "eval_physics_steps_per_second": 1.744, "step": 300 }, { "entropy": 0.9819621413946151, "epoch": 0.496, "grad_norm": 14.5625, "learning_rate": 1.9933333333333334e-05, "loss": 15.7494, "mean_token_accuracy": 0.7583937518298626, "num_tokens": 10289883.0, "step": 310 }, { "entropy": 0.9672155775129795, "epoch": 0.512, "grad_norm": 18.625, "learning_rate": 1.985925925925926e-05, "loss": 15.5391, "mean_token_accuracy": 0.7612126469612122, "num_tokens": 10619468.0, "step": 320 }, { "entropy": 0.978820975869894, "epoch": 0.528, "grad_norm": 16.0, "learning_rate": 1.9785185185185187e-05, "loss": 15.6782, "mean_token_accuracy": 0.7582353662699461, "num_tokens": 10944904.0, "step": 330 }, { "entropy": 0.963908814266324, "epoch": 0.544, "grad_norm": 16.625, "learning_rate": 1.971111111111111e-05, "loss": 15.4658, "mean_token_accuracy": 0.7615648847073316, "num_tokens": 11266409.0, "step": 340 }, { "entropy": 0.9442941181361675, "epoch": 0.56, "grad_norm": 15.3125, "learning_rate": 1.963703703703704e-05, "loss": 15.0885, "mean_token_accuracy": 0.7654793575406075, "num_tokens": 11605544.0, "step": 350 }, { "entropy": 0.9328314460813999, "epoch": 0.576, "grad_norm": 16.25, "learning_rate": 1.9562962962962964e-05, "loss": 15.0173, "mean_token_accuracy": 0.768263740092516, "num_tokens": 11931715.0, "step": 360 }, { "entropy": 0.9495550885796546, "epoch": 0.592, "grad_norm": 16.5, "learning_rate": 1.948888888888889e-05, "loss": 15.1932, "mean_token_accuracy": 0.7637792613357306, "num_tokens": 12257243.0, "step": 370 }, { "entropy": 0.9208816282451153, "epoch": 0.608, "grad_norm": 14.75, "learning_rate": 1.9414814814814817e-05, "loss": 14.8406, "mean_token_accuracy": 0.7683777522295714, "num_tokens": 12583154.0, "step": 380 }, { "entropy": 0.9426417943090201, "epoch": 0.624, "grad_norm": 15.5, "learning_rate": 1.9340740740740743e-05, "loss": 15.1649, "mean_token_accuracy": 0.7648395039141178, "num_tokens": 12905392.0, "step": 390 }, { "entropy": 0.9430808376520872, "epoch": 0.64, "grad_norm": 14.6875, "learning_rate": 1.926666666666667e-05, "loss": 15.1432, "mean_token_accuracy": 0.7654214475303889, "num_tokens": 13232198.0, "step": 400 }, { "epoch": 0.64, "eval_biology_entropy": 2.6180445404052732, "eval_biology_loss": 2.958848237991333, "eval_biology_mean_token_accuracy": 0.48831122827529905, "eval_biology_num_tokens": 13232198.0, "eval_biology_runtime": 49.452, "eval_biology_samples_per_second": 10.111, "eval_biology_steps_per_second": 2.528, "step": 400 }, { "epoch": 0.64, "eval_chemistry_entropy": 1.2611312751770019, "eval_chemistry_loss": 1.2864106893539429, "eval_chemistry_mean_token_accuracy": 0.7043857793807984, "eval_chemistry_num_tokens": 13232198.0, "eval_chemistry_runtime": 61.4019, "eval_chemistry_samples_per_second": 8.143, "eval_chemistry_steps_per_second": 2.036, "step": 400 }, { "epoch": 0.64, "eval_math_entropy": 0.9471726126670837, "eval_math_loss": 1.148703694343567, "eval_math_mean_token_accuracy": 0.7371885905265808, "eval_math_num_tokens": 13232198.0, "eval_math_runtime": 62.9339, "eval_math_samples_per_second": 7.945, "eval_math_steps_per_second": 1.986, "step": 400 }, { "epoch": 0.64, "eval_physics_entropy": 0.9362427434921264, "eval_physics_loss": 0.9423627853393555, "eval_physics_mean_token_accuracy": 0.7653819880485535, "eval_physics_num_tokens": 13232198.0, "eval_physics_runtime": 71.631, "eval_physics_samples_per_second": 6.98, "eval_physics_steps_per_second": 1.745, "step": 400 }, { "entropy": 0.9263238042593003, "epoch": 0.656, "grad_norm": 16.125, "learning_rate": 1.9192592592592593e-05, "loss": 14.9, "mean_token_accuracy": 0.7683228138834238, "num_tokens": 13575902.0, "step": 410 }, { "entropy": 0.9128325123339891, "epoch": 0.672, "grad_norm": 17.5, "learning_rate": 1.911851851851852e-05, "loss": 14.587, "mean_token_accuracy": 0.771899900585413, "num_tokens": 13895997.0, "step": 420 }, { "entropy": 0.9165764002129435, "epoch": 0.688, "grad_norm": 15.1875, "learning_rate": 1.9044444444444446e-05, "loss": 14.7149, "mean_token_accuracy": 0.7700499434024095, "num_tokens": 14234888.0, "step": 430 }, { "entropy": 0.9249953411519527, "epoch": 0.704, "grad_norm": 15.9375, "learning_rate": 1.8970370370370372e-05, "loss": 14.9209, "mean_token_accuracy": 0.7661528721451759, "num_tokens": 14567908.0, "step": 440 }, { "entropy": 0.9112705899402499, "epoch": 0.72, "grad_norm": 17.125, "learning_rate": 1.8896296296296295e-05, "loss": 14.5659, "mean_token_accuracy": 0.7717954892665148, "num_tokens": 14882927.0, "step": 450 }, { "entropy": 0.90460856705904, "epoch": 0.736, "grad_norm": 17.125, "learning_rate": 1.8822222222222225e-05, "loss": 14.5339, "mean_token_accuracy": 0.7712412856519222, "num_tokens": 15217342.0, "step": 460 }, { "entropy": 0.8897293049842119, "epoch": 0.752, "grad_norm": 15.5, "learning_rate": 1.874814814814815e-05, "loss": 14.2105, "mean_token_accuracy": 0.7750069301575422, "num_tokens": 15547710.0, "step": 470 }, { "entropy": 0.9218722280114889, "epoch": 0.768, "grad_norm": 16.25, "learning_rate": 1.8674074074074075e-05, "loss": 14.8733, "mean_token_accuracy": 0.7659960601478815, "num_tokens": 15877177.0, "step": 480 }, { "entropy": 0.8870411489158869, "epoch": 0.784, "grad_norm": 13.9375, "learning_rate": 1.86e-05, "loss": 14.2657, "mean_token_accuracy": 0.7761844106018543, "num_tokens": 16219640.0, "step": 490 }, { "entropy": 0.9038290875032544, "epoch": 0.8, "grad_norm": 16.0, "learning_rate": 1.8525925925925928e-05, "loss": 14.5034, "mean_token_accuracy": 0.7724477723240852, "num_tokens": 16548261.0, "step": 500 }, { "epoch": 0.8, "eval_biology_entropy": 2.657470052719116, "eval_biology_loss": 2.942234516143799, "eval_biology_mean_token_accuracy": 0.49295870113372803, "eval_biology_num_tokens": 16548261.0, "eval_biology_runtime": 49.4094, "eval_biology_samples_per_second": 10.12, "eval_biology_steps_per_second": 2.53, "step": 500 }, { "epoch": 0.8, "eval_chemistry_entropy": 1.2639175114631653, "eval_chemistry_loss": 1.2617712020874023, "eval_chemistry_mean_token_accuracy": 0.7095342454910278, "eval_chemistry_num_tokens": 16548261.0, "eval_chemistry_runtime": 60.9985, "eval_chemistry_samples_per_second": 8.197, "eval_chemistry_steps_per_second": 2.049, "step": 500 }, { "epoch": 0.8, "eval_math_entropy": 0.9355086851119995, "eval_math_loss": 1.1333692073822021, "eval_math_mean_token_accuracy": 0.7395946278572082, "eval_math_num_tokens": 16548261.0, "eval_math_runtime": 62.8632, "eval_math_samples_per_second": 7.954, "eval_math_steps_per_second": 1.988, "step": 500 }, { "epoch": 0.8, "eval_physics_entropy": 0.9225423102378845, "eval_physics_loss": 0.9129735231399536, "eval_physics_mean_token_accuracy": 0.770587375164032, "eval_physics_num_tokens": 16548261.0, "eval_physics_runtime": 71.6493, "eval_physics_samples_per_second": 6.978, "eval_physics_steps_per_second": 1.745, "step": 500 }, { "entropy": 0.890050390549004, "epoch": 0.816, "grad_norm": 14.8125, "learning_rate": 1.8451851851851855e-05, "loss": 14.3791, "mean_token_accuracy": 0.7720852673053742, "num_tokens": 16870533.0, "step": 510 }, { "entropy": 0.9036840088665485, "epoch": 0.832, "grad_norm": 16.5, "learning_rate": 1.8377777777777778e-05, "loss": 14.5054, "mean_token_accuracy": 0.7727169577032328, "num_tokens": 17196371.0, "step": 520 }, { "entropy": 0.8895360480993986, "epoch": 0.848, "grad_norm": 13.9375, "learning_rate": 1.8303703703703704e-05, "loss": 14.2591, "mean_token_accuracy": 0.7748707763850688, "num_tokens": 17527066.0, "step": 530 }, { "entropy": 0.8957800149917603, "epoch": 0.864, "grad_norm": 14.5, "learning_rate": 1.822962962962963e-05, "loss": 14.3928, "mean_token_accuracy": 0.7738867543637753, "num_tokens": 17859605.0, "step": 540 }, { "entropy": 0.9031556732952595, "epoch": 0.88, "grad_norm": 16.875, "learning_rate": 1.8155555555555557e-05, "loss": 14.5649, "mean_token_accuracy": 0.7709400031715632, "num_tokens": 18187315.0, "step": 550 }, { "entropy": 0.8537686172872782, "epoch": 0.896, "grad_norm": 14.0, "learning_rate": 1.8081481481481484e-05, "loss": 13.6764, "mean_token_accuracy": 0.7827610898762941, "num_tokens": 18536790.0, "step": 560 }, { "entropy": 0.8935620501637459, "epoch": 0.912, "grad_norm": 15.4375, "learning_rate": 1.800740740740741e-05, "loss": 14.3654, "mean_token_accuracy": 0.7719177346676588, "num_tokens": 18869344.0, "step": 570 }, { "entropy": 0.8791772209107875, "epoch": 0.928, "grad_norm": 16.625, "learning_rate": 1.7933333333333333e-05, "loss": 14.1875, "mean_token_accuracy": 0.774543008953333, "num_tokens": 19199834.0, "step": 580 }, { "entropy": 0.8686152957379818, "epoch": 0.944, "grad_norm": 15.875, "learning_rate": 1.785925925925926e-05, "loss": 13.88, "mean_token_accuracy": 0.7793182540684939, "num_tokens": 19525385.0, "step": 590 }, { "entropy": 0.8704672519117593, "epoch": 0.96, "grad_norm": 15.9375, "learning_rate": 1.7785185185185186e-05, "loss": 14.0634, "mean_token_accuracy": 0.7773549720644951, "num_tokens": 19856973.0, "step": 600 }, { "epoch": 0.96, "eval_biology_entropy": 2.5744068441390993, "eval_biology_loss": 2.907360315322876, "eval_biology_mean_token_accuracy": 0.49524139547348023, "eval_biology_num_tokens": 19856973.0, "eval_biology_runtime": 49.0409, "eval_biology_samples_per_second": 10.196, "eval_biology_steps_per_second": 2.549, "step": 600 }, { "epoch": 0.96, "eval_chemistry_entropy": 1.2219904832839965, "eval_chemistry_loss": 1.2410826683044434, "eval_chemistry_mean_token_accuracy": 0.7126172127723693, "eval_chemistry_num_tokens": 19856973.0, "eval_chemistry_runtime": 60.7859, "eval_chemistry_samples_per_second": 8.226, "eval_chemistry_steps_per_second": 2.056, "step": 600 }, { "epoch": 0.96, "eval_math_entropy": 0.9290778393745422, "eval_math_loss": 1.1203620433807373, "eval_math_mean_token_accuracy": 0.7419518904685974, "eval_math_num_tokens": 19856973.0, "eval_math_runtime": 62.3226, "eval_math_samples_per_second": 8.023, "eval_math_steps_per_second": 2.006, "step": 600 }, { "epoch": 0.96, "eval_physics_entropy": 0.8969436359405517, "eval_physics_loss": 0.8939215540885925, "eval_physics_mean_token_accuracy": 0.7737585263252258, "eval_physics_num_tokens": 19856973.0, "eval_physics_runtime": 71.3008, "eval_physics_samples_per_second": 7.013, "eval_physics_steps_per_second": 1.753, "step": 600 }, { "entropy": 0.8691291939467192, "epoch": 0.976, "grad_norm": 16.0, "learning_rate": 1.7711111111111113e-05, "loss": 13.9642, "mean_token_accuracy": 0.7793323084712028, "num_tokens": 20190600.0, "step": 610 }, { "entropy": 0.8796132244169712, "epoch": 0.992, "grad_norm": 15.3125, "learning_rate": 1.763703703703704e-05, "loss": 14.1458, "mean_token_accuracy": 0.7762652188539505, "num_tokens": 20518014.0, "step": 620 }, { "entropy": 0.8544556263834238, "epoch": 1.008, "grad_norm": 14.6875, "learning_rate": 1.7562962962962962e-05, "loss": 13.629, "mean_token_accuracy": 0.7817003551870585, "num_tokens": 20859215.0, "step": 630 }, { "entropy": 0.8545751355588436, "epoch": 1.024, "grad_norm": 15.3125, "learning_rate": 1.7488888888888892e-05, "loss": 13.7525, "mean_token_accuracy": 0.7807697676122188, "num_tokens": 21186658.0, "step": 640 }, { "entropy": 0.8538653265684844, "epoch": 1.04, "grad_norm": 15.0625, "learning_rate": 1.7414814814814815e-05, "loss": 13.7341, "mean_token_accuracy": 0.7808921810239553, "num_tokens": 21517237.0, "step": 650 }, { "entropy": 0.8380249921232462, "epoch": 1.056, "grad_norm": 14.9375, "learning_rate": 1.7340740740740742e-05, "loss": 13.4639, "mean_token_accuracy": 0.7846374321728945, "num_tokens": 21852528.0, "step": 660 }, { "entropy": 0.855887845158577, "epoch": 1.072, "grad_norm": 14.125, "learning_rate": 1.726666666666667e-05, "loss": 13.7632, "mean_token_accuracy": 0.7813815232366323, "num_tokens": 22192380.0, "step": 670 }, { "entropy": 0.8563600070774555, "epoch": 1.088, "grad_norm": 17.125, "learning_rate": 1.7192592592592595e-05, "loss": 13.7002, "mean_token_accuracy": 0.7817271586507559, "num_tokens": 22517971.0, "step": 680 }, { "entropy": 0.8393923103809356, "epoch": 1.104, "grad_norm": 14.0625, "learning_rate": 1.711851851851852e-05, "loss": 13.5135, "mean_token_accuracy": 0.7835057020187378, "num_tokens": 22848694.0, "step": 690 }, { "entropy": 0.8503093957901001, "epoch": 1.12, "grad_norm": 15.1875, "learning_rate": 1.7044444444444445e-05, "loss": 13.7396, "mean_token_accuracy": 0.7802822068333626, "num_tokens": 23183889.0, "step": 700 }, { "epoch": 1.12, "eval_biology_entropy": 2.5127918043136597, "eval_biology_loss": 2.924085855484009, "eval_biology_mean_token_accuracy": 0.49510638427734377, "eval_biology_num_tokens": 23183889.0, "eval_biology_runtime": 49.4091, "eval_biology_samples_per_second": 10.12, "eval_biology_steps_per_second": 2.53, "step": 700 }, { "epoch": 1.12, "eval_chemistry_entropy": 1.1826457905769348, "eval_chemistry_loss": 1.23135244846344, "eval_chemistry_mean_token_accuracy": 0.7140716204643249, "eval_chemistry_num_tokens": 23183889.0, "eval_chemistry_runtime": 61.3044, "eval_chemistry_samples_per_second": 8.156, "eval_chemistry_steps_per_second": 2.039, "step": 700 }, { "epoch": 1.12, "eval_math_entropy": 0.9012531743049622, "eval_math_loss": 1.1168676614761353, "eval_math_mean_token_accuracy": 0.742647081375122, "eval_math_num_tokens": 23183889.0, "eval_math_runtime": 62.8933, "eval_math_samples_per_second": 7.95, "eval_math_steps_per_second": 1.987, "step": 700 }, { "epoch": 1.12, "eval_physics_entropy": 0.8671332702636719, "eval_physics_loss": 0.8811478614807129, "eval_physics_mean_token_accuracy": 0.7764565787315368, "eval_physics_num_tokens": 23183889.0, "eval_physics_runtime": 71.7862, "eval_physics_samples_per_second": 6.965, "eval_physics_steps_per_second": 1.741, "step": 700 }, { "entropy": 0.8146034182980657, "epoch": 1.1360000000000001, "grad_norm": 13.6875, "learning_rate": 1.697037037037037e-05, "loss": 13.0963, "mean_token_accuracy": 0.7886282972991466, "num_tokens": 23523903.0, "step": 710 }, { "entropy": 0.8425636734813452, "epoch": 1.152, "grad_norm": 16.875, "learning_rate": 1.6896296296296298e-05, "loss": 13.54, "mean_token_accuracy": 0.7844115447252988, "num_tokens": 23848848.0, "step": 720 }, { "entropy": 0.8615062309429049, "epoch": 1.168, "grad_norm": 16.25, "learning_rate": 1.6822222222222224e-05, "loss": 13.7096, "mean_token_accuracy": 0.7805542998015881, "num_tokens": 24174589.0, "step": 730 }, { "entropy": 0.8374113839119672, "epoch": 1.184, "grad_norm": 16.25, "learning_rate": 1.6748148148148147e-05, "loss": 13.5026, "mean_token_accuracy": 0.7833674903959036, "num_tokens": 24495981.0, "step": 740 }, { "entropy": 0.8216371892020107, "epoch": 1.2, "grad_norm": 15.8125, "learning_rate": 1.6674074074074077e-05, "loss": 13.302, "mean_token_accuracy": 0.786701825261116, "num_tokens": 24826245.0, "step": 750 }, { "entropy": 0.848210446164012, "epoch": 1.216, "grad_norm": 18.5, "learning_rate": 1.66e-05, "loss": 13.5667, "mean_token_accuracy": 0.7812601257115602, "num_tokens": 25147775.0, "step": 760 }, { "entropy": 0.8356876520439982, "epoch": 1.232, "grad_norm": 16.0, "learning_rate": 1.6525925925925927e-05, "loss": 13.4138, "mean_token_accuracy": 0.7863026071339846, "num_tokens": 25480033.0, "step": 770 }, { "entropy": 0.8342631004750729, "epoch": 1.248, "grad_norm": 16.5, "learning_rate": 1.6451851851851853e-05, "loss": 13.3815, "mean_token_accuracy": 0.7856004070490599, "num_tokens": 25811978.0, "step": 780 }, { "entropy": 0.8264019176363945, "epoch": 1.264, "grad_norm": 16.125, "learning_rate": 1.637777777777778e-05, "loss": 13.3075, "mean_token_accuracy": 0.7860219534486532, "num_tokens": 26146299.0, "step": 790 }, { "entropy": 0.8564631534740329, "epoch": 1.28, "grad_norm": 16.125, "learning_rate": 1.6303703703703706e-05, "loss": 13.8051, "mean_token_accuracy": 0.778905876353383, "num_tokens": 26478207.0, "step": 800 }, { "epoch": 1.28, "eval_biology_entropy": 2.501225646018982, "eval_biology_loss": 2.926992177963257, "eval_biology_mean_token_accuracy": 0.49775544834136964, "eval_biology_num_tokens": 26478207.0, "eval_biology_runtime": 49.5165, "eval_biology_samples_per_second": 10.098, "eval_biology_steps_per_second": 2.524, "step": 800 }, { "epoch": 1.28, "eval_chemistry_entropy": 1.1599062328338623, "eval_chemistry_loss": 1.2233794927597046, "eval_chemistry_mean_token_accuracy": 0.7164447598457336, "eval_chemistry_num_tokens": 26478207.0, "eval_chemistry_runtime": 61.2476, "eval_chemistry_samples_per_second": 8.164, "eval_chemistry_steps_per_second": 2.041, "step": 800 }, { "epoch": 1.28, "eval_math_entropy": 0.8935993061065673, "eval_math_loss": 1.113796591758728, "eval_math_mean_token_accuracy": 0.7438100085258484, "eval_math_num_tokens": 26478207.0, "eval_math_runtime": 63.571, "eval_math_samples_per_second": 7.865, "eval_math_steps_per_second": 1.966, "step": 800 }, { "epoch": 1.28, "eval_physics_entropy": 0.8521192736625671, "eval_physics_loss": 0.8711134791374207, "eval_physics_mean_token_accuracy": 0.7784227828979492, "eval_physics_num_tokens": 26478207.0, "eval_physics_runtime": 71.9421, "eval_physics_samples_per_second": 6.95, "eval_physics_steps_per_second": 1.738, "step": 800 }, { "entropy": 0.8208133645355702, "epoch": 1.296, "grad_norm": 16.375, "learning_rate": 1.622962962962963e-05, "loss": 13.1842, "mean_token_accuracy": 0.7876175127923488, "num_tokens": 26806477.0, "step": 810 }, { "entropy": 0.854189190082252, "epoch": 1.312, "grad_norm": 16.875, "learning_rate": 1.6155555555555556e-05, "loss": 13.6651, "mean_token_accuracy": 0.7810488305985928, "num_tokens": 27137528.0, "step": 820 }, { "entropy": 0.8269255790859461, "epoch": 1.328, "grad_norm": 17.5, "learning_rate": 1.6081481481481482e-05, "loss": 13.368, "mean_token_accuracy": 0.7851872753351927, "num_tokens": 27465859.0, "step": 830 }, { "entropy": 0.8443555533885956, "epoch": 1.3439999999999999, "grad_norm": 16.5, "learning_rate": 1.600740740740741e-05, "loss": 13.5911, "mean_token_accuracy": 0.7820561602711678, "num_tokens": 27795568.0, "step": 840 }, { "entropy": 0.8374204233288765, "epoch": 1.3599999999999999, "grad_norm": 16.0, "learning_rate": 1.5933333333333336e-05, "loss": 13.4833, "mean_token_accuracy": 0.783146658167243, "num_tokens": 28129988.0, "step": 850 }, { "entropy": 0.8366791510954499, "epoch": 1.376, "grad_norm": 17.75, "learning_rate": 1.5859259259259262e-05, "loss": 13.3637, "mean_token_accuracy": 0.7853023037314415, "num_tokens": 28465903.0, "step": 860 }, { "entropy": 0.8118171757087111, "epoch": 1.392, "grad_norm": 17.25, "learning_rate": 1.5785185185185185e-05, "loss": 13.0363, "mean_token_accuracy": 0.7896989852190017, "num_tokens": 28801554.0, "step": 870 }, { "entropy": 0.8223038006573915, "epoch": 1.408, "grad_norm": 15.875, "learning_rate": 1.571111111111111e-05, "loss": 13.2311, "mean_token_accuracy": 0.7867654282599688, "num_tokens": 29125114.0, "step": 880 }, { "entropy": 0.8510244162753224, "epoch": 1.424, "grad_norm": 15.3125, "learning_rate": 1.5637037037037038e-05, "loss": 13.7647, "mean_token_accuracy": 0.7792895000427962, "num_tokens": 29454334.0, "step": 890 }, { "entropy": 0.805036261677742, "epoch": 1.44, "grad_norm": 15.25, "learning_rate": 1.5562962962962965e-05, "loss": 12.9122, "mean_token_accuracy": 0.7907555736601353, "num_tokens": 29789253.0, "step": 900 }, { "epoch": 1.44, "eval_biology_entropy": 2.507858594894409, "eval_biology_loss": 2.8971524238586426, "eval_biology_mean_token_accuracy": 0.49738200902938845, "eval_biology_num_tokens": 29789253.0, "eval_biology_runtime": 49.4122, "eval_biology_samples_per_second": 10.119, "eval_biology_steps_per_second": 2.53, "step": 900 }, { "epoch": 1.44, "eval_chemistry_entropy": 1.1563323426246643, "eval_chemistry_loss": 1.2145193815231323, "eval_chemistry_mean_token_accuracy": 0.717510892868042, "eval_chemistry_num_tokens": 29789253.0, "eval_chemistry_runtime": 61.2472, "eval_chemistry_samples_per_second": 8.164, "eval_chemistry_steps_per_second": 2.041, "step": 900 }, { "epoch": 1.44, "eval_math_entropy": 0.8971080303192138, "eval_math_loss": 1.1086454391479492, "eval_math_mean_token_accuracy": 0.7439061784744263, "eval_math_num_tokens": 29789253.0, "eval_math_runtime": 62.8193, "eval_math_samples_per_second": 7.959, "eval_math_steps_per_second": 1.99, "step": 900 }, { "epoch": 1.44, "eval_physics_entropy": 0.8453807921409607, "eval_physics_loss": 0.8617435693740845, "eval_physics_mean_token_accuracy": 0.7796971354484558, "eval_physics_num_tokens": 29789253.0, "eval_physics_runtime": 71.6142, "eval_physics_samples_per_second": 6.982, "eval_physics_steps_per_second": 1.745, "step": 900 }, { "entropy": 0.8077683765441179, "epoch": 1.456, "grad_norm": 13.8125, "learning_rate": 1.548888888888889e-05, "loss": 13.0229, "mean_token_accuracy": 0.7907570086419582, "num_tokens": 30128576.0, "step": 910 }, { "entropy": 0.8584844920784235, "epoch": 1.472, "grad_norm": 17.0, "learning_rate": 1.5414814814814814e-05, "loss": 13.8325, "mean_token_accuracy": 0.7795857060700655, "num_tokens": 30452719.0, "step": 920 }, { "entropy": 0.8337999247014523, "epoch": 1.488, "grad_norm": 15.3125, "learning_rate": 1.5340740740740744e-05, "loss": 13.3892, "mean_token_accuracy": 0.7849970065057278, "num_tokens": 30782343.0, "step": 930 }, { "entropy": 0.821822557784617, "epoch": 1.504, "grad_norm": 14.1875, "learning_rate": 1.5266666666666667e-05, "loss": 13.1466, "mean_token_accuracy": 0.7879386112093926, "num_tokens": 31113238.0, "step": 940 }, { "entropy": 0.8403494212776422, "epoch": 1.52, "grad_norm": 16.375, "learning_rate": 1.5192592592592594e-05, "loss": 13.4805, "mean_token_accuracy": 0.7833794906735421, "num_tokens": 31444589.0, "step": 950 }, { "entropy": 0.8343268791213632, "epoch": 1.536, "grad_norm": 17.25, "learning_rate": 1.5118518518518519e-05, "loss": 13.539, "mean_token_accuracy": 0.7820737387984991, "num_tokens": 31774137.0, "step": 960 }, { "entropy": 0.8169895254075528, "epoch": 1.552, "grad_norm": 16.625, "learning_rate": 1.5044444444444445e-05, "loss": 12.9985, "mean_token_accuracy": 0.7904236756265164, "num_tokens": 32101776.0, "step": 970 }, { "entropy": 0.8104152591899038, "epoch": 1.568, "grad_norm": 15.625, "learning_rate": 1.497037037037037e-05, "loss": 13.079, "mean_token_accuracy": 0.7893446046859026, "num_tokens": 32428021.0, "step": 980 }, { "entropy": 0.8398273181170225, "epoch": 1.584, "grad_norm": 16.25, "learning_rate": 1.4896296296296298e-05, "loss": 13.559, "mean_token_accuracy": 0.7802358068525791, "num_tokens": 32752735.0, "step": 990 }, { "entropy": 0.8366536511108279, "epoch": 1.6, "grad_norm": 15.5, "learning_rate": 1.4822222222222225e-05, "loss": 13.3858, "mean_token_accuracy": 0.7848577659577132, "num_tokens": 33075822.0, "step": 1000 }, { "epoch": 1.6, "eval_biology_entropy": 2.4744019918441773, "eval_biology_loss": 2.8814501762390137, "eval_biology_mean_token_accuracy": 0.5001691384315491, "eval_biology_num_tokens": 33075822.0, "eval_biology_runtime": 49.6638, "eval_biology_samples_per_second": 10.068, "eval_biology_steps_per_second": 2.517, "step": 1000 }, { "epoch": 1.6, "eval_chemistry_entropy": 1.1388019576072692, "eval_chemistry_loss": 1.2091480493545532, "eval_chemistry_mean_token_accuracy": 0.7188141541481018, "eval_chemistry_num_tokens": 33075822.0, "eval_chemistry_runtime": 61.119, "eval_chemistry_samples_per_second": 8.181, "eval_chemistry_steps_per_second": 2.045, "step": 1000 }, { "epoch": 1.6, "eval_math_entropy": 0.8872386133670807, "eval_math_loss": 1.107426404953003, "eval_math_mean_token_accuracy": 0.7446602578163147, "eval_math_num_tokens": 33075822.0, "eval_math_runtime": 62.8503, "eval_math_samples_per_second": 7.955, "eval_math_steps_per_second": 1.989, "step": 1000 }, { "epoch": 1.6, "eval_physics_entropy": 0.832112268447876, "eval_physics_loss": 0.8559062480926514, "eval_physics_mean_token_accuracy": 0.7812981810569763, "eval_physics_num_tokens": 33075822.0, "eval_physics_runtime": 71.6577, "eval_physics_samples_per_second": 6.978, "eval_physics_steps_per_second": 1.744, "step": 1000 }, { "entropy": 0.7955857511609793, "epoch": 1.616, "grad_norm": 16.125, "learning_rate": 1.474814814814815e-05, "loss": 12.8723, "mean_token_accuracy": 0.7925955194979906, "num_tokens": 33411850.0, "step": 1010 }, { "entropy": 0.8143383387476206, "epoch": 1.6320000000000001, "grad_norm": 16.375, "learning_rate": 1.4674074074074076e-05, "loss": 13.07, "mean_token_accuracy": 0.7890652883797884, "num_tokens": 33739456.0, "step": 1020 }, { "entropy": 0.7856693360954523, "epoch": 1.6480000000000001, "grad_norm": 15.875, "learning_rate": 1.46e-05, "loss": 12.6467, "mean_token_accuracy": 0.7940549373626709, "num_tokens": 34072022.0, "step": 1030 }, { "entropy": 0.8198095491155982, "epoch": 1.6640000000000001, "grad_norm": 17.375, "learning_rate": 1.4525925925925927e-05, "loss": 13.1094, "mean_token_accuracy": 0.7891215395182372, "num_tokens": 34398959.0, "step": 1040 }, { "entropy": 0.7961690971627832, "epoch": 1.6800000000000002, "grad_norm": 15.1875, "learning_rate": 1.4451851851851852e-05, "loss": 12.9049, "mean_token_accuracy": 0.790962991118431, "num_tokens": 34732284.0, "step": 1050 }, { "entropy": 0.8210279244929553, "epoch": 1.696, "grad_norm": 16.375, "learning_rate": 1.4377777777777779e-05, "loss": 13.1187, "mean_token_accuracy": 0.788687152415514, "num_tokens": 35053712.0, "step": 1060 }, { "entropy": 0.8245653146877885, "epoch": 1.712, "grad_norm": 17.125, "learning_rate": 1.4303703703703703e-05, "loss": 13.3208, "mean_token_accuracy": 0.784929183498025, "num_tokens": 35393088.0, "step": 1070 }, { "entropy": 0.8105424832552671, "epoch": 1.728, "grad_norm": 16.0, "learning_rate": 1.4229629629629632e-05, "loss": 12.9858, "mean_token_accuracy": 0.7895865086466074, "num_tokens": 35729007.0, "step": 1080 }, { "entropy": 0.8141957949846983, "epoch": 1.744, "grad_norm": 16.75, "learning_rate": 1.4155555555555556e-05, "loss": 13.0812, "mean_token_accuracy": 0.7894516389816999, "num_tokens": 36072489.0, "step": 1090 }, { "entropy": 0.8048153560608625, "epoch": 1.76, "grad_norm": 15.9375, "learning_rate": 1.4081481481481483e-05, "loss": 12.9209, "mean_token_accuracy": 0.7897520393133164, "num_tokens": 36398285.0, "step": 1100 }, { "epoch": 1.76, "eval_biology_entropy": 2.445696955680847, "eval_biology_loss": 2.8779242038726807, "eval_biology_mean_token_accuracy": 0.5016539707183838, "eval_biology_num_tokens": 36398285.0, "eval_biology_runtime": 49.6599, "eval_biology_samples_per_second": 10.068, "eval_biology_steps_per_second": 2.517, "step": 1100 }, { "epoch": 1.76, "eval_chemistry_entropy": 1.1345578532218934, "eval_chemistry_loss": 1.204408049583435, "eval_chemistry_mean_token_accuracy": 0.71992467212677, "eval_chemistry_num_tokens": 36398285.0, "eval_chemistry_runtime": 61.246, "eval_chemistry_samples_per_second": 8.164, "eval_chemistry_steps_per_second": 2.041, "step": 1100 }, { "epoch": 1.76, "eval_math_entropy": 0.8842824428081513, "eval_math_loss": 1.1029928922653198, "eval_math_mean_token_accuracy": 0.7455306906700134, "eval_math_num_tokens": 36398285.0, "eval_math_runtime": 62.8108, "eval_math_samples_per_second": 7.96, "eval_math_steps_per_second": 1.99, "step": 1100 }, { "epoch": 1.76, "eval_physics_entropy": 0.8283794307708741, "eval_physics_loss": 0.8505285978317261, "eval_physics_mean_token_accuracy": 0.7818370275497436, "eval_physics_num_tokens": 36398285.0, "eval_physics_runtime": 71.5963, "eval_physics_samples_per_second": 6.984, "eval_physics_steps_per_second": 1.746, "step": 1100 }, { "entropy": 0.8389779690653085, "epoch": 1.776, "grad_norm": 15.6875, "learning_rate": 1.400740740740741e-05, "loss": 13.4944, "mean_token_accuracy": 0.7826780155301094, "num_tokens": 36730479.0, "step": 1110 }, { "entropy": 0.8008179372176528, "epoch": 1.792, "grad_norm": 19.125, "learning_rate": 1.3933333333333334e-05, "loss": 12.9483, "mean_token_accuracy": 0.7898946575820446, "num_tokens": 37057425.0, "step": 1120 }, { "entropy": 0.7982527418062091, "epoch": 1.808, "grad_norm": 15.625, "learning_rate": 1.385925925925926e-05, "loss": 12.7713, "mean_token_accuracy": 0.7935733944177628, "num_tokens": 37395449.0, "step": 1130 }, { "entropy": 0.8088066600263119, "epoch": 1.8239999999999998, "grad_norm": 15.9375, "learning_rate": 1.3785185185185186e-05, "loss": 12.9789, "mean_token_accuracy": 0.7902951821684837, "num_tokens": 37729535.0, "step": 1140 }, { "entropy": 0.8060135461390019, "epoch": 1.8399999999999999, "grad_norm": 16.375, "learning_rate": 1.3711111111111112e-05, "loss": 12.9445, "mean_token_accuracy": 0.7909105230122805, "num_tokens": 38057839.0, "step": 1150 }, { "entropy": 0.8027735522016883, "epoch": 1.8559999999999999, "grad_norm": 15.3125, "learning_rate": 1.3637037037037037e-05, "loss": 13.0038, "mean_token_accuracy": 0.7903423044830561, "num_tokens": 38398033.0, "step": 1160 }, { "entropy": 0.8051119217649102, "epoch": 1.8719999999999999, "grad_norm": 15.625, "learning_rate": 1.3562962962962965e-05, "loss": 12.8759, "mean_token_accuracy": 0.7893568992614746, "num_tokens": 38729203.0, "step": 1170 }, { "entropy": 0.8213391533121467, "epoch": 1.888, "grad_norm": 16.375, "learning_rate": 1.3488888888888888e-05, "loss": 13.2453, "mean_token_accuracy": 0.7860501617193222, "num_tokens": 39058791.0, "step": 1180 }, { "entropy": 0.826800760999322, "epoch": 1.904, "grad_norm": 16.625, "learning_rate": 1.3414814814814817e-05, "loss": 13.4803, "mean_token_accuracy": 0.7845181468874216, "num_tokens": 39386170.0, "step": 1190 }, { "entropy": 0.8467358741909266, "epoch": 1.92, "grad_norm": 16.375, "learning_rate": 1.3340740740740741e-05, "loss": 13.4331, "mean_token_accuracy": 0.784552700445056, "num_tokens": 39710381.0, "step": 1200 }, { "epoch": 1.92, "eval_biology_entropy": 2.472013756752014, "eval_biology_loss": 2.8643574714660645, "eval_biology_mean_token_accuracy": 0.5028325555324554, "eval_biology_num_tokens": 39710381.0, "eval_biology_runtime": 49.4032, "eval_biology_samples_per_second": 10.121, "eval_biology_steps_per_second": 2.53, "step": 1200 }, { "epoch": 1.92, "eval_chemistry_entropy": 1.1285487518310546, "eval_chemistry_loss": 1.199933648109436, "eval_chemistry_mean_token_accuracy": 0.7208248000144959, "eval_chemistry_num_tokens": 39710381.0, "eval_chemistry_runtime": 61.2533, "eval_chemistry_samples_per_second": 8.163, "eval_chemistry_steps_per_second": 2.041, "step": 1200 }, { "epoch": 1.92, "eval_math_entropy": 0.8806797494888305, "eval_math_loss": 1.1004599332809448, "eval_math_mean_token_accuracy": 0.7461761064529419, "eval_math_num_tokens": 39710381.0, "eval_math_runtime": 62.8098, "eval_math_samples_per_second": 7.961, "eval_math_steps_per_second": 1.99, "step": 1200 }, { "epoch": 1.92, "eval_physics_entropy": 0.8215924696922302, "eval_physics_loss": 0.8454442024230957, "eval_physics_mean_token_accuracy": 0.7830140724182129, "eval_physics_num_tokens": 39710381.0, "eval_physics_runtime": 71.6941, "eval_physics_samples_per_second": 6.974, "eval_physics_steps_per_second": 1.744, "step": 1200 }, { "entropy": 0.8093992147594691, "epoch": 1.936, "grad_norm": 15.9375, "learning_rate": 1.3266666666666668e-05, "loss": 13.0771, "mean_token_accuracy": 0.7870635632425547, "num_tokens": 40043354.0, "step": 1210 }, { "entropy": 0.7759674414992332, "epoch": 1.952, "grad_norm": 15.375, "learning_rate": 1.3192592592592594e-05, "loss": 12.4136, "mean_token_accuracy": 0.7972983971238137, "num_tokens": 40375790.0, "step": 1220 }, { "entropy": 0.8013758489862084, "epoch": 1.968, "grad_norm": 15.75, "learning_rate": 1.311851851851852e-05, "loss": 12.9987, "mean_token_accuracy": 0.790325503051281, "num_tokens": 40705451.0, "step": 1230 }, { "entropy": 0.8227595569565892, "epoch": 1.984, "grad_norm": 17.5, "learning_rate": 1.3044444444444446e-05, "loss": 13.1906, "mean_token_accuracy": 0.7863460905849934, "num_tokens": 41040313.0, "step": 1240 }, { "entropy": 0.8205555606633425, "epoch": 2.0, "grad_norm": 15.625, "learning_rate": 1.297037037037037e-05, "loss": 13.1737, "mean_token_accuracy": 0.7883093636482954, "num_tokens": 41373608.0, "step": 1250 }, { "entropy": 0.7831195399165154, "epoch": 2.016, "grad_norm": 14.0, "learning_rate": 1.2896296296296299e-05, "loss": 12.4894, "mean_token_accuracy": 0.7959605794399977, "num_tokens": 41708226.0, "step": 1260 }, { "entropy": 0.7830235229805111, "epoch": 2.032, "grad_norm": 15.8125, "learning_rate": 1.2822222222222222e-05, "loss": 12.6023, "mean_token_accuracy": 0.7948959324508905, "num_tokens": 42045433.0, "step": 1270 }, { "entropy": 0.807098483107984, "epoch": 2.048, "grad_norm": 17.0, "learning_rate": 1.274814814814815e-05, "loss": 13.0344, "mean_token_accuracy": 0.7881330821663142, "num_tokens": 42372694.0, "step": 1280 }, { "entropy": 0.7877823824062944, "epoch": 2.064, "grad_norm": 15.25, "learning_rate": 1.2674074074074075e-05, "loss": 12.5998, "mean_token_accuracy": 0.7949316211044788, "num_tokens": 42710240.0, "step": 1290 }, { "entropy": 0.8019804678857326, "epoch": 2.08, "grad_norm": 15.1875, "learning_rate": 1.2600000000000001e-05, "loss": 12.9342, "mean_token_accuracy": 0.7910612858831882, "num_tokens": 43037844.0, "step": 1300 }, { "epoch": 2.08, "eval_biology_entropy": 2.445282099723816, "eval_biology_loss": 2.8822696208953857, "eval_biology_mean_token_accuracy": 0.5015360698699951, "eval_biology_num_tokens": 43037844.0, "eval_biology_runtime": 49.6427, "eval_biology_samples_per_second": 10.072, "eval_biology_steps_per_second": 2.518, "step": 1300 }, { "epoch": 2.08, "eval_chemistry_entropy": 1.1140752034187318, "eval_chemistry_loss": 1.2005956172943115, "eval_chemistry_mean_token_accuracy": 0.7206200876235962, "eval_chemistry_num_tokens": 43037844.0, "eval_chemistry_runtime": 61.3631, "eval_chemistry_samples_per_second": 8.148, "eval_chemistry_steps_per_second": 2.037, "step": 1300 }, { "epoch": 2.08, "eval_math_entropy": 0.8749056966304779, "eval_math_loss": 1.103084683418274, "eval_math_mean_token_accuracy": 0.7457676644325256, "eval_math_num_tokens": 43037844.0, "eval_math_runtime": 62.8812, "eval_math_samples_per_second": 7.951, "eval_math_steps_per_second": 1.988, "step": 1300 }, { "epoch": 2.08, "eval_physics_entropy": 0.8129040551185608, "eval_physics_loss": 0.842690110206604, "eval_physics_mean_token_accuracy": 0.7835633087158204, "eval_physics_num_tokens": 43037844.0, "eval_physics_runtime": 72.1755, "eval_physics_samples_per_second": 6.928, "eval_physics_steps_per_second": 1.732, "step": 1300 }, { "entropy": 0.8027737921103835, "epoch": 2.096, "grad_norm": 16.625, "learning_rate": 1.2525925925925928e-05, "loss": 12.8345, "mean_token_accuracy": 0.7918097671121359, "num_tokens": 43368675.0, "step": 1310 }, { "entropy": 0.7821535093709826, "epoch": 2.112, "grad_norm": 15.4375, "learning_rate": 1.2451851851851853e-05, "loss": 12.5805, "mean_token_accuracy": 0.7947387598454952, "num_tokens": 43699544.0, "step": 1320 }, { "entropy": 0.7935595938935875, "epoch": 2.128, "grad_norm": 16.875, "learning_rate": 1.237777777777778e-05, "loss": 12.7363, "mean_token_accuracy": 0.7916791636496783, "num_tokens": 44012632.0, "step": 1330 }, { "entropy": 0.7835203887894749, "epoch": 2.144, "grad_norm": 15.1875, "learning_rate": 1.2303703703703704e-05, "loss": 12.6582, "mean_token_accuracy": 0.793102978542447, "num_tokens": 44349732.0, "step": 1340 }, { "entropy": 0.8203705210238695, "epoch": 2.16, "grad_norm": 16.0, "learning_rate": 1.222962962962963e-05, "loss": 13.2168, "mean_token_accuracy": 0.7861043442040682, "num_tokens": 44680410.0, "step": 1350 }, { "entropy": 0.7885768229141832, "epoch": 2.176, "grad_norm": 14.9375, "learning_rate": 1.2155555555555555e-05, "loss": 12.6388, "mean_token_accuracy": 0.7946638122200966, "num_tokens": 45017922.0, "step": 1360 }, { "entropy": 0.7834130380302667, "epoch": 2.192, "grad_norm": 15.75, "learning_rate": 1.2081481481481484e-05, "loss": 12.5136, "mean_token_accuracy": 0.7952086210250855, "num_tokens": 45348179.0, "step": 1370 }, { "entropy": 0.7792341850697995, "epoch": 2.208, "grad_norm": 17.625, "learning_rate": 1.2007407407407408e-05, "loss": 12.5501, "mean_token_accuracy": 0.795701441168785, "num_tokens": 45678397.0, "step": 1380 }, { "entropy": 0.773603293299675, "epoch": 2.224, "grad_norm": 14.5, "learning_rate": 1.1933333333333335e-05, "loss": 12.47, "mean_token_accuracy": 0.7955104906111956, "num_tokens": 46010052.0, "step": 1390 }, { "entropy": 0.7809545382857322, "epoch": 2.24, "grad_norm": 15.125, "learning_rate": 1.185925925925926e-05, "loss": 12.5627, "mean_token_accuracy": 0.794714093953371, "num_tokens": 46347116.0, "step": 1400 }, { "epoch": 2.24, "eval_biology_entropy": 2.4254958543777465, "eval_biology_loss": 2.8775830268859863, "eval_biology_mean_token_accuracy": 0.5026782186031341, "eval_biology_num_tokens": 46347116.0, "eval_biology_runtime": 49.9543, "eval_biology_samples_per_second": 10.009, "eval_biology_steps_per_second": 2.502, "step": 1400 }, { "epoch": 2.24, "eval_chemistry_entropy": 1.1134962687492371, "eval_chemistry_loss": 1.1991115808486938, "eval_chemistry_mean_token_accuracy": 0.7207795276641845, "eval_chemistry_num_tokens": 46347116.0, "eval_chemistry_runtime": 61.3888, "eval_chemistry_samples_per_second": 8.145, "eval_chemistry_steps_per_second": 2.036, "step": 1400 }, { "epoch": 2.24, "eval_math_entropy": 0.8740004925727844, "eval_math_loss": 1.1019760370254517, "eval_math_mean_token_accuracy": 0.7459057631492615, "eval_math_num_tokens": 46347116.0, "eval_math_runtime": 62.8571, "eval_math_samples_per_second": 7.955, "eval_math_steps_per_second": 1.989, "step": 1400 }, { "epoch": 2.24, "eval_physics_entropy": 0.8126016085147858, "eval_physics_loss": 0.8402607440948486, "eval_physics_mean_token_accuracy": 0.784248288154602, "eval_physics_num_tokens": 46347116.0, "eval_physics_runtime": 72.184, "eval_physics_samples_per_second": 6.927, "eval_physics_steps_per_second": 1.732, "step": 1400 }, { "entropy": 0.8127070046961308, "epoch": 2.2560000000000002, "grad_norm": 16.625, "learning_rate": 1.1785185185185186e-05, "loss": 13.0614, "mean_token_accuracy": 0.7891584753990173, "num_tokens": 46673074.0, "step": 1410 }, { "entropy": 0.7707240108400584, "epoch": 2.2720000000000002, "grad_norm": 16.125, "learning_rate": 1.1711111111111113e-05, "loss": 12.3774, "mean_token_accuracy": 0.797768659889698, "num_tokens": 47009386.0, "step": 1420 }, { "entropy": 0.8034002147614956, "epoch": 2.288, "grad_norm": 16.25, "learning_rate": 1.1637037037037037e-05, "loss": 12.9193, "mean_token_accuracy": 0.7906435623764991, "num_tokens": 47344231.0, "step": 1430 }, { "entropy": 0.7937306514009833, "epoch": 2.304, "grad_norm": 17.25, "learning_rate": 1.1562962962962964e-05, "loss": 12.7869, "mean_token_accuracy": 0.7924283880740404, "num_tokens": 47658392.0, "step": 1440 }, { "entropy": 0.7891798976808786, "epoch": 2.32, "grad_norm": 14.75, "learning_rate": 1.1488888888888889e-05, "loss": 12.6378, "mean_token_accuracy": 0.7924429185688495, "num_tokens": 47995015.0, "step": 1450 }, { "entropy": 0.7635893626138568, "epoch": 2.336, "grad_norm": 15.375, "learning_rate": 1.1414814814814817e-05, "loss": 12.3402, "mean_token_accuracy": 0.7973897516727447, "num_tokens": 48336708.0, "step": 1460 }, { "entropy": 0.7687528569251298, "epoch": 2.352, "grad_norm": 15.1875, "learning_rate": 1.1340740740740742e-05, "loss": 12.2507, "mean_token_accuracy": 0.7983826816082, "num_tokens": 48665311.0, "step": 1470 }, { "entropy": 0.7886384373530746, "epoch": 2.368, "grad_norm": 15.6875, "learning_rate": 1.1266666666666668e-05, "loss": 12.712, "mean_token_accuracy": 0.7930802937597037, "num_tokens": 49005334.0, "step": 1480 }, { "entropy": 0.8016523649916053, "epoch": 2.384, "grad_norm": 17.125, "learning_rate": 1.1192592592592593e-05, "loss": 12.8426, "mean_token_accuracy": 0.7910515271127224, "num_tokens": 49331936.0, "step": 1490 }, { "entropy": 0.7944724131375551, "epoch": 2.4, "grad_norm": 16.25, "learning_rate": 1.111851851851852e-05, "loss": 12.7553, "mean_token_accuracy": 0.7921236105263233, "num_tokens": 49671048.0, "step": 1500 }, { "epoch": 2.4, "eval_biology_entropy": 2.4047589435577392, "eval_biology_loss": 2.870595693588257, "eval_biology_mean_token_accuracy": 0.5024100644588471, "eval_biology_num_tokens": 49671048.0, "eval_biology_runtime": 49.425, "eval_biology_samples_per_second": 10.116, "eval_biology_steps_per_second": 2.529, "step": 1500 }, { "epoch": 2.4, "eval_chemistry_entropy": 1.1087028155326843, "eval_chemistry_loss": 1.1979060173034668, "eval_chemistry_mean_token_accuracy": 0.7210436034202575, "eval_chemistry_num_tokens": 49671048.0, "eval_chemistry_runtime": 61.0853, "eval_chemistry_samples_per_second": 8.185, "eval_chemistry_steps_per_second": 2.046, "step": 1500 }, { "epoch": 2.4, "eval_math_entropy": 0.8674321026802063, "eval_math_loss": 1.102066159248352, "eval_math_mean_token_accuracy": 0.7458969240188599, "eval_math_num_tokens": 49671048.0, "eval_math_runtime": 62.8397, "eval_math_samples_per_second": 7.957, "eval_math_steps_per_second": 1.989, "step": 1500 }, { "epoch": 2.4, "eval_physics_entropy": 0.8101383104324341, "eval_physics_loss": 0.8381265997886658, "eval_physics_mean_token_accuracy": 0.7847440228462219, "eval_physics_num_tokens": 49671048.0, "eval_physics_runtime": 72.2232, "eval_physics_samples_per_second": 6.923, "eval_physics_steps_per_second": 1.731, "step": 1500 }, { "entropy": 0.7810511577874422, "epoch": 2.416, "grad_norm": 15.6875, "learning_rate": 1.1044444444444444e-05, "loss": 12.5807, "mean_token_accuracy": 0.79508685618639, "num_tokens": 50018182.0, "step": 1510 }, { "entropy": 0.7916204823181033, "epoch": 2.432, "grad_norm": 16.75, "learning_rate": 1.0970370370370371e-05, "loss": 12.7028, "mean_token_accuracy": 0.7943674992769957, "num_tokens": 50341187.0, "step": 1520 }, { "entropy": 0.8073871158063411, "epoch": 2.448, "grad_norm": 15.5625, "learning_rate": 1.0896296296296298e-05, "loss": 13.0377, "mean_token_accuracy": 0.7892248127609491, "num_tokens": 50667019.0, "step": 1530 }, { "entropy": 0.8032200831919909, "epoch": 2.464, "grad_norm": 16.875, "learning_rate": 1.0822222222222222e-05, "loss": 12.8177, "mean_token_accuracy": 0.7923011161386967, "num_tokens": 50991113.0, "step": 1540 }, { "entropy": 0.7860733466222882, "epoch": 2.48, "grad_norm": 16.125, "learning_rate": 1.074814814814815e-05, "loss": 12.7361, "mean_token_accuracy": 0.7927486374974251, "num_tokens": 51319624.0, "step": 1550 }, { "entropy": 0.8012511027976871, "epoch": 2.496, "grad_norm": 15.4375, "learning_rate": 1.0674074074074074e-05, "loss": 12.8233, "mean_token_accuracy": 0.7916884988546371, "num_tokens": 51654633.0, "step": 1560 }, { "entropy": 0.7731666518375278, "epoch": 2.512, "grad_norm": 17.375, "learning_rate": 1.0600000000000002e-05, "loss": 12.4696, "mean_token_accuracy": 0.7960924245417118, "num_tokens": 51980051.0, "step": 1570 }, { "entropy": 0.7793908750638365, "epoch": 2.528, "grad_norm": 14.875, "learning_rate": 1.0525925925925927e-05, "loss": 12.5186, "mean_token_accuracy": 0.7953244607895613, "num_tokens": 52316109.0, "step": 1580 }, { "entropy": 0.7914063410833478, "epoch": 2.544, "grad_norm": 17.25, "learning_rate": 1.0451851851851853e-05, "loss": 12.8145, "mean_token_accuracy": 0.7913208685815334, "num_tokens": 52634528.0, "step": 1590 }, { "entropy": 0.7939762134104967, "epoch": 2.56, "grad_norm": 17.0, "learning_rate": 1.0377777777777778e-05, "loss": 12.6043, "mean_token_accuracy": 0.7941447257995605, "num_tokens": 52964035.0, "step": 1600 }, { "epoch": 2.56, "eval_biology_entropy": 2.3931925020217895, "eval_biology_loss": 2.875809669494629, "eval_biology_mean_token_accuracy": 0.5025460588932037, "eval_biology_num_tokens": 52964035.0, "eval_biology_runtime": 50.2947, "eval_biology_samples_per_second": 9.941, "eval_biology_steps_per_second": 2.485, "step": 1600 }, { "epoch": 2.56, "eval_chemistry_entropy": 1.0883408832550048, "eval_chemistry_loss": 1.1965774297714233, "eval_chemistry_mean_token_accuracy": 0.7214803085327148, "eval_chemistry_num_tokens": 52964035.0, "eval_chemistry_runtime": 61.7977, "eval_chemistry_samples_per_second": 8.091, "eval_chemistry_steps_per_second": 2.023, "step": 1600 }, { "epoch": 2.56, "eval_math_entropy": 0.8583383002281189, "eval_math_loss": 1.1013567447662354, "eval_math_mean_token_accuracy": 0.7465548224449158, "eval_math_num_tokens": 52964035.0, "eval_math_runtime": 63.3224, "eval_math_samples_per_second": 7.896, "eval_math_steps_per_second": 1.974, "step": 1600 }, { "epoch": 2.56, "eval_physics_entropy": 0.7951563930511475, "eval_physics_loss": 0.8360637426376343, "eval_physics_mean_token_accuracy": 0.7851015777587891, "eval_physics_num_tokens": 52964035.0, "eval_physics_runtime": 71.5227, "eval_physics_samples_per_second": 6.991, "eval_physics_steps_per_second": 1.748, "step": 1600 }, { "entropy": 0.772414730861783, "epoch": 2.576, "grad_norm": 17.25, "learning_rate": 1.0303703703703705e-05, "loss": 12.4145, "mean_token_accuracy": 0.7962443709373475, "num_tokens": 53293345.0, "step": 1610 }, { "entropy": 0.7868875458836555, "epoch": 2.592, "grad_norm": 14.0625, "learning_rate": 1.0229629629629631e-05, "loss": 12.6335, "mean_token_accuracy": 0.7949258577078581, "num_tokens": 53620896.0, "step": 1620 }, { "entropy": 0.7584911126643419, "epoch": 2.608, "grad_norm": 15.4375, "learning_rate": 1.0155555555555556e-05, "loss": 12.2908, "mean_token_accuracy": 0.798742862045765, "num_tokens": 53950989.0, "step": 1630 }, { "entropy": 0.782053854689002, "epoch": 2.624, "grad_norm": 15.875, "learning_rate": 1.0081481481481484e-05, "loss": 12.5859, "mean_token_accuracy": 0.7945446480065584, "num_tokens": 54282732.0, "step": 1640 }, { "entropy": 0.781876846589148, "epoch": 2.64, "grad_norm": 16.75, "learning_rate": 1.0007407407407407e-05, "loss": 12.5738, "mean_token_accuracy": 0.7951088670641184, "num_tokens": 54609748.0, "step": 1650 }, { "entropy": 0.816822605766356, "epoch": 2.656, "grad_norm": 16.875, "learning_rate": 9.933333333333334e-06, "loss": 13.1814, "mean_token_accuracy": 0.787245037406683, "num_tokens": 54953583.0, "step": 1660 }, { "entropy": 0.7931226765736937, "epoch": 2.672, "grad_norm": 16.5, "learning_rate": 9.85925925925926e-06, "loss": 12.7541, "mean_token_accuracy": 0.7924020521342754, "num_tokens": 55281945.0, "step": 1670 }, { "entropy": 0.7660689847543836, "epoch": 2.6879999999999997, "grad_norm": 16.125, "learning_rate": 9.785185185185187e-06, "loss": 12.247, "mean_token_accuracy": 0.7995080798864365, "num_tokens": 55612391.0, "step": 1680 }, { "entropy": 0.7789125647395849, "epoch": 2.7039999999999997, "grad_norm": 15.1875, "learning_rate": 9.711111111111111e-06, "loss": 12.6456, "mean_token_accuracy": 0.7945462424308062, "num_tokens": 55949791.0, "step": 1690 }, { "entropy": 0.78773049749434, "epoch": 2.7199999999999998, "grad_norm": 18.0, "learning_rate": 9.637037037037038e-06, "loss": 12.5144, "mean_token_accuracy": 0.7949357461184263, "num_tokens": 56282204.0, "step": 1700 }, { "epoch": 2.7199999999999998, "eval_biology_entropy": 2.3543988103866575, "eval_biology_loss": 2.8883936405181885, "eval_biology_mean_token_accuracy": 0.5017733733654022, "eval_biology_num_tokens": 56282204.0, "eval_biology_runtime": 49.363, "eval_biology_samples_per_second": 10.129, "eval_biology_steps_per_second": 2.532, "step": 1700 }, { "epoch": 2.7199999999999998, "eval_chemistry_entropy": 1.0755338530540466, "eval_chemistry_loss": 1.1958928108215332, "eval_chemistry_mean_token_accuracy": 0.7213905358314514, "eval_chemistry_num_tokens": 56282204.0, "eval_chemistry_runtime": 61.2095, "eval_chemistry_samples_per_second": 8.169, "eval_chemistry_steps_per_second": 2.042, "step": 1700 }, { "epoch": 2.7199999999999998, "eval_math_entropy": 0.8536946845054626, "eval_math_loss": 1.1009058952331543, "eval_math_mean_token_accuracy": 0.7464403495788574, "eval_math_num_tokens": 56282204.0, "eval_math_runtime": 62.7708, "eval_math_samples_per_second": 7.965, "eval_math_steps_per_second": 1.991, "step": 1700 }, { "epoch": 2.7199999999999998, "eval_physics_entropy": 0.7884848446846008, "eval_physics_loss": 0.8343402147293091, "eval_physics_mean_token_accuracy": 0.7853550724983215, "eval_physics_num_tokens": 56282204.0, "eval_physics_runtime": 71.5298, "eval_physics_samples_per_second": 6.99, "eval_physics_steps_per_second": 1.748, "step": 1700 }, { "entropy": 0.8021048182621598, "epoch": 2.7359999999999998, "grad_norm": 16.625, "learning_rate": 9.562962962962965e-06, "loss": 13.0577, "mean_token_accuracy": 0.788362230360508, "num_tokens": 56605478.0, "step": 1710 }, { "entropy": 0.8005984915420413, "epoch": 2.752, "grad_norm": 15.8125, "learning_rate": 9.48888888888889e-06, "loss": 12.8355, "mean_token_accuracy": 0.7925169125199318, "num_tokens": 56935874.0, "step": 1720 }, { "entropy": 0.7718434376642108, "epoch": 2.768, "grad_norm": 16.75, "learning_rate": 9.414814814814816e-06, "loss": 12.3835, "mean_token_accuracy": 0.7968426302075386, "num_tokens": 57270552.0, "step": 1730 }, { "entropy": 0.7870646607130766, "epoch": 2.784, "grad_norm": 16.875, "learning_rate": 9.34074074074074e-06, "loss": 12.6738, "mean_token_accuracy": 0.7931245800107718, "num_tokens": 57602968.0, "step": 1740 }, { "entropy": 0.8263678561896086, "epoch": 2.8, "grad_norm": 17.375, "learning_rate": 9.266666666666667e-06, "loss": 13.3692, "mean_token_accuracy": 0.7837320301681757, "num_tokens": 57925886.0, "step": 1750 }, { "entropy": 0.7726477351039648, "epoch": 2.816, "grad_norm": 15.8125, "learning_rate": 9.192592592592594e-06, "loss": 12.2578, "mean_token_accuracy": 0.7985608454793691, "num_tokens": 58262960.0, "step": 1760 }, { "entropy": 0.7880838630720973, "epoch": 2.832, "grad_norm": 16.125, "learning_rate": 9.118518518518518e-06, "loss": 12.8104, "mean_token_accuracy": 0.7908314753323793, "num_tokens": 58586324.0, "step": 1770 }, { "entropy": 0.7903873866423965, "epoch": 2.848, "grad_norm": 15.125, "learning_rate": 9.044444444444445e-06, "loss": 12.6606, "mean_token_accuracy": 0.7933553613722324, "num_tokens": 58912949.0, "step": 1780 }, { "entropy": 0.7762313047423959, "epoch": 2.864, "grad_norm": 14.875, "learning_rate": 8.970370370370372e-06, "loss": 12.5161, "mean_token_accuracy": 0.7964224554598331, "num_tokens": 59252494.0, "step": 1790 }, { "entropy": 0.7852193580940365, "epoch": 2.88, "grad_norm": 18.375, "learning_rate": 8.896296296296298e-06, "loss": 12.6227, "mean_token_accuracy": 0.7941090241074562, "num_tokens": 59574951.0, "step": 1800 }, { "epoch": 2.88, "eval_biology_entropy": 2.404080862045288, "eval_biology_loss": 2.863299608230591, "eval_biology_mean_token_accuracy": 0.5036493399143219, "eval_biology_num_tokens": 59574951.0, "eval_biology_runtime": 49.4474, "eval_biology_samples_per_second": 10.112, "eval_biology_steps_per_second": 2.528, "step": 1800 }, { "epoch": 2.88, "eval_chemistry_entropy": 1.092514799118042, "eval_chemistry_loss": 1.1931474208831787, "eval_chemistry_mean_token_accuracy": 0.7216541547775268, "eval_chemistry_num_tokens": 59574951.0, "eval_chemistry_runtime": 61.319, "eval_chemistry_samples_per_second": 8.154, "eval_chemistry_steps_per_second": 2.039, "step": 1800 }, { "epoch": 2.88, "eval_math_entropy": 0.8589861969947815, "eval_math_loss": 1.09934401512146, "eval_math_mean_token_accuracy": 0.7468600268363953, "eval_math_num_tokens": 59574951.0, "eval_math_runtime": 62.8842, "eval_math_samples_per_second": 7.951, "eval_math_steps_per_second": 1.988, "step": 1800 }, { "epoch": 2.88, "eval_physics_entropy": 0.7957947330474854, "eval_physics_loss": 0.832846462726593, "eval_physics_mean_token_accuracy": 0.7856108026504517, "eval_physics_num_tokens": 59574951.0, "eval_physics_runtime": 71.6687, "eval_physics_samples_per_second": 6.977, "eval_physics_steps_per_second": 1.744, "step": 1800 }, { "entropy": 0.7800151167437435, "epoch": 2.896, "grad_norm": 16.375, "learning_rate": 8.822222222222223e-06, "loss": 12.5659, "mean_token_accuracy": 0.7957312397658824, "num_tokens": 59903274.0, "step": 1810 }, { "entropy": 0.7794828902930021, "epoch": 2.912, "grad_norm": 16.75, "learning_rate": 8.74814814814815e-06, "loss": 12.4792, "mean_token_accuracy": 0.7961793307214975, "num_tokens": 60234203.0, "step": 1820 }, { "entropy": 0.7716775756329298, "epoch": 2.928, "grad_norm": 16.5, "learning_rate": 8.674074074074074e-06, "loss": 12.4507, "mean_token_accuracy": 0.7971672754734754, "num_tokens": 60559123.0, "step": 1830 }, { "entropy": 0.7751465419307351, "epoch": 2.944, "grad_norm": 17.875, "learning_rate": 8.6e-06, "loss": 12.4246, "mean_token_accuracy": 0.7971710093319416, "num_tokens": 60893423.0, "step": 1840 }, { "entropy": 0.7817971961572766, "epoch": 2.96, "grad_norm": 15.4375, "learning_rate": 8.525925925925927e-06, "loss": 12.6213, "mean_token_accuracy": 0.7947664484381676, "num_tokens": 61228692.0, "step": 1850 }, { "entropy": 0.7864448856562376, "epoch": 2.976, "grad_norm": 15.8125, "learning_rate": 8.451851851851852e-06, "loss": 12.6128, "mean_token_accuracy": 0.7931821491569281, "num_tokens": 61560852.0, "step": 1860 }, { "entropy": 0.7615644473582506, "epoch": 2.992, "grad_norm": 16.125, "learning_rate": 8.377777777777779e-06, "loss": 12.2528, "mean_token_accuracy": 0.7986856568604708, "num_tokens": 61898951.0, "step": 1870 }, { "entropy": 0.7641174843534827, "epoch": 3.008, "grad_norm": 15.0625, "learning_rate": 8.303703703703705e-06, "loss": 12.2561, "mean_token_accuracy": 0.7995043728500605, "num_tokens": 62223875.0, "step": 1880 }, { "entropy": 0.7743637939915061, "epoch": 3.024, "grad_norm": 15.875, "learning_rate": 8.229629629629632e-06, "loss": 12.3679, "mean_token_accuracy": 0.7970022208988666, "num_tokens": 62553166.0, "step": 1890 }, { "entropy": 0.7582010868936777, "epoch": 3.04, "grad_norm": 16.125, "learning_rate": 8.155555555555556e-06, "loss": 12.2155, "mean_token_accuracy": 0.7990330256521702, "num_tokens": 62893113.0, "step": 1900 }, { "epoch": 3.04, "eval_biology_entropy": 2.399233222961426, "eval_biology_loss": 2.867366313934326, "eval_biology_mean_token_accuracy": 0.5034214525222779, "eval_biology_num_tokens": 62893113.0, "eval_biology_runtime": 49.4064, "eval_biology_samples_per_second": 10.12, "eval_biology_steps_per_second": 2.53, "step": 1900 }, { "epoch": 3.04, "eval_chemistry_entropy": 1.089496678829193, "eval_chemistry_loss": 1.1933289766311646, "eval_chemistry_mean_token_accuracy": 0.7224055013656616, "eval_chemistry_num_tokens": 62893113.0, "eval_chemistry_runtime": 61.2552, "eval_chemistry_samples_per_second": 8.163, "eval_chemistry_steps_per_second": 2.041, "step": 1900 }, { "epoch": 3.04, "eval_math_entropy": 0.8592811126708985, "eval_math_loss": 1.1005762815475464, "eval_math_mean_token_accuracy": 0.7469019837379456, "eval_math_num_tokens": 62893113.0, "eval_math_runtime": 62.8086, "eval_math_samples_per_second": 7.961, "eval_math_steps_per_second": 1.99, "step": 1900 }, { "epoch": 3.04, "eval_physics_entropy": 0.7926631271839142, "eval_physics_loss": 0.8320626020431519, "eval_physics_mean_token_accuracy": 0.7857998023033143, "eval_physics_num_tokens": 62893113.0, "eval_physics_runtime": 71.555, "eval_physics_samples_per_second": 6.988, "eval_physics_steps_per_second": 1.747, "step": 1900 }, { "entropy": 0.7836578948423266, "epoch": 3.056, "grad_norm": 16.75, "learning_rate": 8.081481481481483e-06, "loss": 12.5565, "mean_token_accuracy": 0.7944340173155069, "num_tokens": 63218088.0, "step": 1910 }, { "entropy": 0.7783920112997293, "epoch": 3.072, "grad_norm": 17.5, "learning_rate": 8.007407407407408e-06, "loss": 12.4945, "mean_token_accuracy": 0.7946112662553787, "num_tokens": 63551201.0, "step": 1920 }, { "entropy": 0.7899106752127409, "epoch": 3.088, "grad_norm": 16.625, "learning_rate": 7.933333333333334e-06, "loss": 12.7648, "mean_token_accuracy": 0.7915248584002257, "num_tokens": 63876362.0, "step": 1930 }, { "entropy": 0.7593935616314411, "epoch": 3.104, "grad_norm": 16.375, "learning_rate": 7.859259259259259e-06, "loss": 12.1082, "mean_token_accuracy": 0.8004607565701007, "num_tokens": 64208171.0, "step": 1940 }, { "entropy": 0.7588531570509076, "epoch": 3.12, "grad_norm": 16.375, "learning_rate": 7.785185185185185e-06, "loss": 12.1557, "mean_token_accuracy": 0.8000247534364462, "num_tokens": 64544493.0, "step": 1950 }, { "entropy": 0.7889568522572518, "epoch": 3.136, "grad_norm": 16.375, "learning_rate": 7.711111111111112e-06, "loss": 12.774, "mean_token_accuracy": 0.7912482611835003, "num_tokens": 64872488.0, "step": 1960 }, { "entropy": 0.7564845994114876, "epoch": 3.152, "grad_norm": 15.8125, "learning_rate": 7.637037037037037e-06, "loss": 12.089, "mean_token_accuracy": 0.8013048619031906, "num_tokens": 65201124.0, "step": 1970 }, { "entropy": 0.7751831419765949, "epoch": 3.168, "grad_norm": 15.125, "learning_rate": 7.562962962962963e-06, "loss": 12.5378, "mean_token_accuracy": 0.7958132576197385, "num_tokens": 65525868.0, "step": 1980 }, { "entropy": 0.7631789870560169, "epoch": 3.184, "grad_norm": 16.75, "learning_rate": 7.48888888888889e-06, "loss": 12.2134, "mean_token_accuracy": 0.8015536881983281, "num_tokens": 65861603.0, "step": 1990 }, { "entropy": 0.7747107265517116, "epoch": 3.2, "grad_norm": 15.75, "learning_rate": 7.4148148148148155e-06, "loss": 12.494, "mean_token_accuracy": 0.7962346155196428, "num_tokens": 66193675.0, "step": 2000 }, { "epoch": 3.2, "eval_biology_entropy": 2.365761664390564, "eval_biology_loss": 2.876070022583008, "eval_biology_mean_token_accuracy": 0.5039477293491363, "eval_biology_num_tokens": 66193675.0, "eval_biology_runtime": 49.4253, "eval_biology_samples_per_second": 10.116, "eval_biology_steps_per_second": 2.529, "step": 2000 }, { "epoch": 3.2, "eval_chemistry_entropy": 1.0774572858810425, "eval_chemistry_loss": 1.1938129663467407, "eval_chemistry_mean_token_accuracy": 0.722421464920044, "eval_chemistry_num_tokens": 66193675.0, "eval_chemistry_runtime": 61.2795, "eval_chemistry_samples_per_second": 8.159, "eval_chemistry_steps_per_second": 2.04, "step": 2000 }, { "epoch": 3.2, "eval_math_entropy": 0.8556913251876831, "eval_math_loss": 1.1010321378707886, "eval_math_mean_token_accuracy": 0.746727294921875, "eval_math_num_tokens": 66193675.0, "eval_math_runtime": 63.1, "eval_math_samples_per_second": 7.924, "eval_math_steps_per_second": 1.981, "step": 2000 }, { "epoch": 3.2, "eval_physics_entropy": 0.7872861375808716, "eval_physics_loss": 0.831552267074585, "eval_physics_mean_token_accuracy": 0.7860952916145325, "eval_physics_num_tokens": 66193675.0, "eval_physics_runtime": 71.959, "eval_physics_samples_per_second": 6.948, "eval_physics_steps_per_second": 1.737, "step": 2000 } ], "logging_steps": 10, "max_steps": 3000, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.614072648666411e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }