| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.5590297095805052, |
| "eval_steps": 100, |
| "global_step": 2000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 1.0559891855344177, |
| "epoch": 0.017803493935684877, |
| "grad_norm": 176.0, |
| "learning_rate": 6.000000000000001e-07, |
| "loss": 42.5063, |
| "mean_token_accuracy": 0.6153968568891287, |
| "num_tokens": 257189.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 1.0444028429687022, |
| "epoch": 0.035606987871369754, |
| "grad_norm": 160.0, |
| "learning_rate": 1.2666666666666669e-06, |
| "loss": 41.7045, |
| "mean_token_accuracy": 0.619235553778708, |
| "num_tokens": 528285.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 1.0654841650277376, |
| "epoch": 0.053410481807054634, |
| "grad_norm": 175.0, |
| "learning_rate": 1.9333333333333336e-06, |
| "loss": 41.559, |
| "mean_token_accuracy": 0.6179817667230963, |
| "num_tokens": 801635.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 1.0894893994554877, |
| "epoch": 0.07121397574273951, |
| "grad_norm": 154.0, |
| "learning_rate": 2.6e-06, |
| "loss": 41.6138, |
| "mean_token_accuracy": 0.6147882426157594, |
| "num_tokens": 1060812.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 1.0985747126862406, |
| "epoch": 0.08901746967842439, |
| "grad_norm": 141.0, |
| "learning_rate": 3.266666666666667e-06, |
| "loss": 39.4404, |
| "mean_token_accuracy": 0.6201398545876146, |
| "num_tokens": 1327380.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 1.162555592879653, |
| "epoch": 0.10682096361410927, |
| "grad_norm": 114.0, |
| "learning_rate": 3.9333333333333335e-06, |
| "loss": 38.4348, |
| "mean_token_accuracy": 0.6222015436738729, |
| "num_tokens": 1597405.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 1.3072715956717729, |
| "epoch": 0.12462445754979415, |
| "grad_norm": 76.5, |
| "learning_rate": 4.600000000000001e-06, |
| "loss": 37.3098, |
| "mean_token_accuracy": 0.6205647233873606, |
| "num_tokens": 1859684.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 1.47952934615314, |
| "epoch": 0.14242795148547902, |
| "grad_norm": 54.0, |
| "learning_rate": 5.2666666666666665e-06, |
| "loss": 35.3269, |
| "mean_token_accuracy": 0.6284260725602507, |
| "num_tokens": 2118068.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 1.5287989236414432, |
| "epoch": 0.1602314454211639, |
| "grad_norm": 37.25, |
| "learning_rate": 5.933333333333335e-06, |
| "loss": 31.5676, |
| "mean_token_accuracy": 0.65506336633116, |
| "num_tokens": 2388824.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 1.630833999812603, |
| "epoch": 0.17803493935684878, |
| "grad_norm": 28.75, |
| "learning_rate": 6.600000000000001e-06, |
| "loss": 30.3566, |
| "mean_token_accuracy": 0.65591501891613, |
| "num_tokens": 2644330.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.17803493935684878, |
| "eval_biology_entropy": 3.4087772026062013, |
| "eval_biology_loss": 3.8683884143829346, |
| "eval_biology_mean_token_accuracy": 0.4378368608951569, |
| "eval_biology_num_tokens": 2644330.0, |
| "eval_biology_runtime": 57.5948, |
| "eval_biology_samples_per_second": 8.681, |
| "eval_biology_steps_per_second": 2.17, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.17803493935684878, |
| "eval_chemistry_entropy": 1.644574547290802, |
| "eval_chemistry_loss": 1.8279423713684082, |
| "eval_chemistry_mean_token_accuracy": 0.663385133266449, |
| "eval_chemistry_num_tokens": 2644330.0, |
| "eval_chemistry_runtime": 72.0267, |
| "eval_chemistry_samples_per_second": 6.942, |
| "eval_chemistry_steps_per_second": 1.735, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.17803493935684878, |
| "eval_math_entropy": 1.2650005059242249, |
| "eval_math_loss": 1.795242428779602, |
| "eval_math_mean_token_accuracy": 0.6832415590286255, |
| "eval_math_num_tokens": 2644330.0, |
| "eval_math_runtime": 75.2166, |
| "eval_math_samples_per_second": 6.647, |
| "eval_math_steps_per_second": 1.662, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.17803493935684878, |
| "eval_physics_entropy": 1.6315128216743469, |
| "eval_physics_loss": 1.8273048400878906, |
| "eval_physics_mean_token_accuracy": 0.6732162787914276, |
| "eval_physics_num_tokens": 2644330.0, |
| "eval_physics_runtime": 85.3066, |
| "eval_physics_samples_per_second": 5.861, |
| "eval_physics_steps_per_second": 1.465, |
| "step": 100 |
| }, |
| { |
| "entropy": 1.558345663547516, |
| "epoch": 0.19583843329253367, |
| "grad_norm": 24.375, |
| "learning_rate": 7.266666666666668e-06, |
| "loss": 28.2409, |
| "mean_token_accuracy": 0.6733386002480983, |
| "num_tokens": 2913700.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 1.5497455261647701, |
| "epoch": 0.21364192722821854, |
| "grad_norm": 20.375, |
| "learning_rate": 7.933333333333334e-06, |
| "loss": 27.4174, |
| "mean_token_accuracy": 0.678030077368021, |
| "num_tokens": 3185255.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 1.5162339597940444, |
| "epoch": 0.2314454211639034, |
| "grad_norm": 20.5, |
| "learning_rate": 8.6e-06, |
| "loss": 26.3563, |
| "mean_token_accuracy": 0.6866102360188961, |
| "num_tokens": 3454750.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 1.5057988293468951, |
| "epoch": 0.2492489150995883, |
| "grad_norm": 19.375, |
| "learning_rate": 9.266666666666667e-06, |
| "loss": 26.0435, |
| "mean_token_accuracy": 0.689361485093832, |
| "num_tokens": 3719113.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 1.544687245413661, |
| "epoch": 0.26705240903527316, |
| "grad_norm": 18.375, |
| "learning_rate": 9.933333333333334e-06, |
| "loss": 26.1449, |
| "mean_token_accuracy": 0.6857006324455142, |
| "num_tokens": 3990505.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 1.4617379672825337, |
| "epoch": 0.28485590297095803, |
| "grad_norm": 17.25, |
| "learning_rate": 1.0600000000000002e-05, |
| "loss": 24.9336, |
| "mean_token_accuracy": 0.6990214973688126, |
| "num_tokens": 4267403.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 1.45880494043231, |
| "epoch": 0.30265939690664295, |
| "grad_norm": 17.375, |
| "learning_rate": 1.1266666666666668e-05, |
| "loss": 24.5033, |
| "mean_token_accuracy": 0.7016687501221895, |
| "num_tokens": 4535458.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 1.435447308793664, |
| "epoch": 0.3204628908423278, |
| "grad_norm": 20.375, |
| "learning_rate": 1.1933333333333335e-05, |
| "loss": 24.3963, |
| "mean_token_accuracy": 0.7021712277084589, |
| "num_tokens": 4796815.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 1.4029231216758489, |
| "epoch": 0.3382663847780127, |
| "grad_norm": 16.875, |
| "learning_rate": 1.2600000000000001e-05, |
| "loss": 23.6953, |
| "mean_token_accuracy": 0.7080240704119205, |
| "num_tokens": 5066948.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 1.4468292627483605, |
| "epoch": 0.35606987871369755, |
| "grad_norm": 17.75, |
| "learning_rate": 1.3266666666666668e-05, |
| "loss": 24.4323, |
| "mean_token_accuracy": 0.698475543037057, |
| "num_tokens": 5324751.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.35606987871369755, |
| "eval_biology_entropy": 2.8836843852996825, |
| "eval_biology_loss": 3.357414722442627, |
| "eval_biology_mean_token_accuracy": 0.48108551049232484, |
| "eval_biology_num_tokens": 5324751.0, |
| "eval_biology_runtime": 57.5893, |
| "eval_biology_samples_per_second": 8.682, |
| "eval_biology_steps_per_second": 2.171, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.35606987871369755, |
| "eval_chemistry_entropy": 1.4061724228858947, |
| "eval_chemistry_loss": 1.478424072265625, |
| "eval_chemistry_mean_token_accuracy": 0.7063784518241882, |
| "eval_chemistry_num_tokens": 5324751.0, |
| "eval_chemistry_runtime": 72.1317, |
| "eval_chemistry_samples_per_second": 6.932, |
| "eval_chemistry_steps_per_second": 1.733, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.35606987871369755, |
| "eval_math_entropy": 1.2177042779922485, |
| "eval_math_loss": 1.5454598665237427, |
| "eval_math_mean_token_accuracy": 0.7047145981788635, |
| "eval_math_num_tokens": 5324751.0, |
| "eval_math_runtime": 75.1207, |
| "eval_math_samples_per_second": 6.656, |
| "eval_math_steps_per_second": 1.664, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.35606987871369755, |
| "eval_physics_entropy": 1.4176100668907166, |
| "eval_physics_loss": 1.5222268104553223, |
| "eval_physics_mean_token_accuracy": 0.707690957069397, |
| "eval_physics_num_tokens": 5324751.0, |
| "eval_physics_runtime": 85.2085, |
| "eval_physics_samples_per_second": 5.868, |
| "eval_physics_steps_per_second": 1.467, |
| "step": 200 |
| }, |
| { |
| "entropy": 1.3558454412966967, |
| "epoch": 0.3738733726493824, |
| "grad_norm": 19.0, |
| "learning_rate": 1.3933333333333334e-05, |
| "loss": 22.7949, |
| "mean_token_accuracy": 0.7136942774057389, |
| "num_tokens": 5585508.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 1.383369480073452, |
| "epoch": 0.39167686658506734, |
| "grad_norm": 19.375, |
| "learning_rate": 1.46e-05, |
| "loss": 23.1375, |
| "mean_token_accuracy": 0.7102958835661412, |
| "num_tokens": 5848889.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 1.338986362144351, |
| "epoch": 0.4094803605207522, |
| "grad_norm": 17.25, |
| "learning_rate": 1.5266666666666667e-05, |
| "loss": 22.3403, |
| "mean_token_accuracy": 0.7182383798062801, |
| "num_tokens": 6114855.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 1.3609099809080363, |
| "epoch": 0.4272838544564371, |
| "grad_norm": 17.75, |
| "learning_rate": 1.5933333333333336e-05, |
| "loss": 22.8066, |
| "mean_token_accuracy": 0.7140973150730133, |
| "num_tokens": 6378152.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 1.3484373219311236, |
| "epoch": 0.44508734839212194, |
| "grad_norm": 18.5, |
| "learning_rate": 1.66e-05, |
| "loss": 22.591, |
| "mean_token_accuracy": 0.713333373889327, |
| "num_tokens": 6637273.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 1.315424071252346, |
| "epoch": 0.4628908423278068, |
| "grad_norm": 19.0, |
| "learning_rate": 1.726666666666667e-05, |
| "loss": 21.9951, |
| "mean_token_accuracy": 0.7204139836132526, |
| "num_tokens": 6896684.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 1.32754250690341, |
| "epoch": 0.48069433626349173, |
| "grad_norm": 17.25, |
| "learning_rate": 1.7933333333333333e-05, |
| "loss": 21.9557, |
| "mean_token_accuracy": 0.7203923668712378, |
| "num_tokens": 7166608.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 1.2635501787066459, |
| "epoch": 0.4984978301991766, |
| "grad_norm": 17.0, |
| "learning_rate": 1.86e-05, |
| "loss": 21.113, |
| "mean_token_accuracy": 0.7295621318742633, |
| "num_tokens": 7444923.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 1.3078652255237102, |
| "epoch": 0.5163013241348615, |
| "grad_norm": 17.25, |
| "learning_rate": 1.926666666666667e-05, |
| "loss": 21.7315, |
| "mean_token_accuracy": 0.7203882545232773, |
| "num_tokens": 7706502.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 1.3434829246252775, |
| "epoch": 0.5341048180705463, |
| "grad_norm": 20.25, |
| "learning_rate": 1.9933333333333334e-05, |
| "loss": 22.236, |
| "mean_token_accuracy": 0.7177267197519541, |
| "num_tokens": 7969704.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.5341048180705463, |
| "eval_biology_entropy": 2.7598990201950073, |
| "eval_biology_loss": 3.1821975708007812, |
| "eval_biology_mean_token_accuracy": 0.4959620425701141, |
| "eval_biology_num_tokens": 7969704.0, |
| "eval_biology_runtime": 57.9508, |
| "eval_biology_samples_per_second": 8.628, |
| "eval_biology_steps_per_second": 2.157, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.5341048180705463, |
| "eval_chemistry_entropy": 1.2813276634216308, |
| "eval_chemistry_loss": 1.3571581840515137, |
| "eval_chemistry_mean_token_accuracy": 0.7210862817764282, |
| "eval_chemistry_num_tokens": 7969704.0, |
| "eval_chemistry_runtime": 72.2035, |
| "eval_chemistry_samples_per_second": 6.925, |
| "eval_chemistry_steps_per_second": 1.731, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.5341048180705463, |
| "eval_math_entropy": 1.165593782901764, |
| "eval_math_loss": 1.472881555557251, |
| "eval_math_mean_token_accuracy": 0.7125356969833374, |
| "eval_math_num_tokens": 7969704.0, |
| "eval_math_runtime": 75.1477, |
| "eval_math_samples_per_second": 6.654, |
| "eval_math_steps_per_second": 1.663, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.5341048180705463, |
| "eval_physics_entropy": 1.3238337383270264, |
| "eval_physics_loss": 1.4213438034057617, |
| "eval_physics_mean_token_accuracy": 0.7196516485214234, |
| "eval_physics_num_tokens": 7969704.0, |
| "eval_physics_runtime": 85.3194, |
| "eval_physics_samples_per_second": 5.86, |
| "eval_physics_steps_per_second": 1.465, |
| "step": 300 |
| }, |
| { |
| "entropy": 1.284867750480771, |
| "epoch": 0.5519083120062312, |
| "grad_norm": 21.625, |
| "learning_rate": 1.9933333333333334e-05, |
| "loss": 21.4794, |
| "mean_token_accuracy": 0.7222346868366003, |
| "num_tokens": 8242162.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 1.2864018987864256, |
| "epoch": 0.5697118059419161, |
| "grad_norm": 18.5, |
| "learning_rate": 1.985925925925926e-05, |
| "loss": 21.4262, |
| "mean_token_accuracy": 0.724996630847454, |
| "num_tokens": 8497852.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 1.2846032541245223, |
| "epoch": 0.587515299877601, |
| "grad_norm": 21.0, |
| "learning_rate": 1.9785185185185187e-05, |
| "loss": 21.5347, |
| "mean_token_accuracy": 0.7234938707202673, |
| "num_tokens": 8757753.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 1.2416468027979135, |
| "epoch": 0.6053187938132859, |
| "grad_norm": 15.75, |
| "learning_rate": 1.971111111111111e-05, |
| "loss": 20.5325, |
| "mean_token_accuracy": 0.733549839258194, |
| "num_tokens": 9024677.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 1.2626060862094164, |
| "epoch": 0.6231222877489707, |
| "grad_norm": 17.75, |
| "learning_rate": 1.963703703703704e-05, |
| "loss": 21.0676, |
| "mean_token_accuracy": 0.7265046216547489, |
| "num_tokens": 9291760.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 1.228025446087122, |
| "epoch": 0.6409257816846556, |
| "grad_norm": 19.875, |
| "learning_rate": 1.9562962962962964e-05, |
| "loss": 20.3954, |
| "mean_token_accuracy": 0.7328283190727234, |
| "num_tokens": 9561091.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 1.2296066496521234, |
| "epoch": 0.6587292756203404, |
| "grad_norm": 18.875, |
| "learning_rate": 1.948888888888889e-05, |
| "loss": 20.5407, |
| "mean_token_accuracy": 0.7325904417783022, |
| "num_tokens": 9827272.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 1.2079377524554729, |
| "epoch": 0.6765327695560254, |
| "grad_norm": 17.125, |
| "learning_rate": 1.9414814814814817e-05, |
| "loss": 20.0342, |
| "mean_token_accuracy": 0.7388566959649324, |
| "num_tokens": 10096065.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 1.2025926019996405, |
| "epoch": 0.6943362634917103, |
| "grad_norm": 18.125, |
| "learning_rate": 1.9340740740740743e-05, |
| "loss": 19.8907, |
| "mean_token_accuracy": 0.7389910601079464, |
| "num_tokens": 10364601.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 1.215231117978692, |
| "epoch": 0.7121397574273951, |
| "grad_norm": 18.5, |
| "learning_rate": 1.926666666666667e-05, |
| "loss": 20.2245, |
| "mean_token_accuracy": 0.7353602629154921, |
| "num_tokens": 10633325.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.7121397574273951, |
| "eval_biology_entropy": 2.8210434341430664, |
| "eval_biology_loss": 3.116989850997925, |
| "eval_biology_mean_token_accuracy": 0.5034433958530427, |
| "eval_biology_num_tokens": 10633325.0, |
| "eval_biology_runtime": 57.5361, |
| "eval_biology_samples_per_second": 8.69, |
| "eval_biology_steps_per_second": 2.173, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.7121397574273951, |
| "eval_chemistry_entropy": 1.261210060596466, |
| "eval_chemistry_loss": 1.294067144393921, |
| "eval_chemistry_mean_token_accuracy": 0.730656672000885, |
| "eval_chemistry_num_tokens": 10633325.0, |
| "eval_chemistry_runtime": 72.1604, |
| "eval_chemistry_samples_per_second": 6.929, |
| "eval_chemistry_steps_per_second": 1.732, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.7121397574273951, |
| "eval_math_entropy": 1.1643331799507142, |
| "eval_math_loss": 1.4408313035964966, |
| "eval_math_mean_token_accuracy": 0.7161806149482727, |
| "eval_math_num_tokens": 10633325.0, |
| "eval_math_runtime": 75.5006, |
| "eval_math_samples_per_second": 6.622, |
| "eval_math_steps_per_second": 1.656, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.7121397574273951, |
| "eval_physics_entropy": 1.3186150641441345, |
| "eval_physics_loss": 1.373258113861084, |
| "eval_physics_mean_token_accuracy": 0.7264462962150574, |
| "eval_physics_num_tokens": 10633325.0, |
| "eval_physics_runtime": 85.0734, |
| "eval_physics_samples_per_second": 5.877, |
| "eval_physics_steps_per_second": 1.469, |
| "step": 400 |
| }, |
| { |
| "entropy": 1.1908438373357058, |
| "epoch": 0.72994325136308, |
| "grad_norm": 22.875, |
| "learning_rate": 1.9192592592592593e-05, |
| "loss": 19.969, |
| "mean_token_accuracy": 0.7395484477281571, |
| "num_tokens": 10897916.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 1.2167691864073276, |
| "epoch": 0.7477467452987648, |
| "grad_norm": 20.0, |
| "learning_rate": 1.911851851851852e-05, |
| "loss": 20.1452, |
| "mean_token_accuracy": 0.7378258787095546, |
| "num_tokens": 11165356.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 1.2289611756801606, |
| "epoch": 0.7655502392344498, |
| "grad_norm": 18.75, |
| "learning_rate": 1.9044444444444446e-05, |
| "loss": 20.5277, |
| "mean_token_accuracy": 0.7328679781407118, |
| "num_tokens": 11436799.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 1.244011626765132, |
| "epoch": 0.7833537331701347, |
| "grad_norm": 19.875, |
| "learning_rate": 1.8970370370370372e-05, |
| "loss": 20.5562, |
| "mean_token_accuracy": 0.7318485330790281, |
| "num_tokens": 11703496.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 1.2247176449745893, |
| "epoch": 0.8011572271058195, |
| "grad_norm": 21.75, |
| "learning_rate": 1.8896296296296295e-05, |
| "loss": 20.421, |
| "mean_token_accuracy": 0.7341447170823813, |
| "num_tokens": 11965530.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 1.2138673104345798, |
| "epoch": 0.8189607210415044, |
| "grad_norm": 17.375, |
| "learning_rate": 1.8822222222222225e-05, |
| "loss": 20.2052, |
| "mean_token_accuracy": 0.7365878012031317, |
| "num_tokens": 12224427.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 1.1654350489377976, |
| "epoch": 0.8367642149771892, |
| "grad_norm": 17.875, |
| "learning_rate": 1.874814814814815e-05, |
| "loss": 19.4104, |
| "mean_token_accuracy": 0.7445328518748283, |
| "num_tokens": 12509124.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 1.198500120639801, |
| "epoch": 0.8545677089128741, |
| "grad_norm": 16.75, |
| "learning_rate": 1.8674074074074075e-05, |
| "loss": 19.8297, |
| "mean_token_accuracy": 0.7386186074465513, |
| "num_tokens": 12778408.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 1.170426043868065, |
| "epoch": 0.8723712028485591, |
| "grad_norm": 18.5, |
| "learning_rate": 1.86e-05, |
| "loss": 19.4215, |
| "mean_token_accuracy": 0.743653716892004, |
| "num_tokens": 13046473.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 1.2106714222580195, |
| "epoch": 0.8901746967842439, |
| "grad_norm": 16.875, |
| "learning_rate": 1.8525925925925928e-05, |
| "loss": 20.1785, |
| "mean_token_accuracy": 0.7360058560967445, |
| "num_tokens": 13301659.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8901746967842439, |
| "eval_biology_entropy": 2.77167928981781, |
| "eval_biology_loss": 3.050346612930298, |
| "eval_biology_mean_token_accuracy": 0.5097111368179321, |
| "eval_biology_num_tokens": 13301659.0, |
| "eval_biology_runtime": 58.0048, |
| "eval_biology_samples_per_second": 8.62, |
| "eval_biology_steps_per_second": 2.155, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8901746967842439, |
| "eval_chemistry_entropy": 1.2252461080551147, |
| "eval_chemistry_loss": 1.2589221000671387, |
| "eval_chemistry_mean_token_accuracy": 0.7352680149078369, |
| "eval_chemistry_num_tokens": 13301659.0, |
| "eval_chemistry_runtime": 71.4907, |
| "eval_chemistry_samples_per_second": 6.994, |
| "eval_chemistry_steps_per_second": 1.748, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8901746967842439, |
| "eval_math_entropy": 1.1505462565422058, |
| "eval_math_loss": 1.420082688331604, |
| "eval_math_mean_token_accuracy": 0.7192822680473328, |
| "eval_math_num_tokens": 13301659.0, |
| "eval_math_runtime": 74.8775, |
| "eval_math_samples_per_second": 6.678, |
| "eval_math_steps_per_second": 1.669, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8901746967842439, |
| "eval_physics_entropy": 1.2864371166229247, |
| "eval_physics_loss": 1.344913125038147, |
| "eval_physics_mean_token_accuracy": 0.7303608517646789, |
| "eval_physics_num_tokens": 13301659.0, |
| "eval_physics_runtime": 84.8812, |
| "eval_physics_samples_per_second": 5.891, |
| "eval_physics_steps_per_second": 1.473, |
| "step": 500 |
| }, |
| { |
| "entropy": 1.1539306506514548, |
| "epoch": 0.9079781907199288, |
| "grad_norm": 15.4375, |
| "learning_rate": 1.8451851851851855e-05, |
| "loss": 19.2513, |
| "mean_token_accuracy": 0.7464804232120514, |
| "num_tokens": 13570986.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 1.1872555747628213, |
| "epoch": 0.9257816846556136, |
| "grad_norm": 18.375, |
| "learning_rate": 1.8377777777777778e-05, |
| "loss": 19.8251, |
| "mean_token_accuracy": 0.7413290243595838, |
| "num_tokens": 13833901.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 1.1907209917902946, |
| "epoch": 0.9435851785912985, |
| "grad_norm": 18.0, |
| "learning_rate": 1.8303703703703704e-05, |
| "loss": 19.7051, |
| "mean_token_accuracy": 0.7420744668692351, |
| "num_tokens": 14093888.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 1.2106883700937032, |
| "epoch": 0.9613886725269835, |
| "grad_norm": 16.625, |
| "learning_rate": 1.822962962962963e-05, |
| "loss": 20.2094, |
| "mean_token_accuracy": 0.7350596960633993, |
| "num_tokens": 14354918.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 1.1697432730346917, |
| "epoch": 0.9791921664626683, |
| "grad_norm": 18.25, |
| "learning_rate": 1.8155555555555557e-05, |
| "loss": 19.3531, |
| "mean_token_accuracy": 0.7433340087532997, |
| "num_tokens": 14620244.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 1.1880640607327222, |
| "epoch": 0.9969956603983532, |
| "grad_norm": 17.0, |
| "learning_rate": 1.8081481481481484e-05, |
| "loss": 19.8443, |
| "mean_token_accuracy": 0.740341067314148, |
| "num_tokens": 14886593.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 1.1296160619105062, |
| "epoch": 1.014242795148548, |
| "grad_norm": 18.0, |
| "learning_rate": 1.800740740740741e-05, |
| "loss": 18.1103, |
| "mean_token_accuracy": 0.7519292243065373, |
| "num_tokens": 15146469.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 1.2114373303949832, |
| "epoch": 1.0320462890842328, |
| "grad_norm": 19.375, |
| "learning_rate": 1.7933333333333333e-05, |
| "loss": 19.9822, |
| "mean_token_accuracy": 0.7364404492080212, |
| "num_tokens": 15404569.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 1.1140953950583934, |
| "epoch": 1.0498497830199176, |
| "grad_norm": 17.5, |
| "learning_rate": 1.785925925925926e-05, |
| "loss": 18.6602, |
| "mean_token_accuracy": 0.7509294405579567, |
| "num_tokens": 15681709.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 1.178190778568387, |
| "epoch": 1.0676532769556026, |
| "grad_norm": 17.75, |
| "learning_rate": 1.7785185185185186e-05, |
| "loss": 19.4194, |
| "mean_token_accuracy": 0.7424272943288088, |
| "num_tokens": 15949421.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.0676532769556026, |
| "eval_biology_entropy": 2.754662238121033, |
| "eval_biology_loss": 3.033093214035034, |
| "eval_biology_mean_token_accuracy": 0.5113752641677857, |
| "eval_biology_num_tokens": 15949421.0, |
| "eval_biology_runtime": 57.7707, |
| "eval_biology_samples_per_second": 8.655, |
| "eval_biology_steps_per_second": 2.164, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.0676532769556026, |
| "eval_chemistry_entropy": 1.1933018751144409, |
| "eval_chemistry_loss": 1.2360327243804932, |
| "eval_chemistry_mean_token_accuracy": 0.7386878499984741, |
| "eval_chemistry_num_tokens": 15949421.0, |
| "eval_chemistry_runtime": 72.0715, |
| "eval_chemistry_samples_per_second": 6.938, |
| "eval_chemistry_steps_per_second": 1.734, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.0676532769556026, |
| "eval_math_entropy": 1.1338728556632995, |
| "eval_math_loss": 1.4086648225784302, |
| "eval_math_mean_token_accuracy": 0.721148521900177, |
| "eval_math_num_tokens": 15949421.0, |
| "eval_math_runtime": 75.0522, |
| "eval_math_samples_per_second": 6.662, |
| "eval_math_steps_per_second": 1.666, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.0676532769556026, |
| "eval_physics_entropy": 1.2630673241615296, |
| "eval_physics_loss": 1.330203890800476, |
| "eval_physics_mean_token_accuracy": 0.7322184553146363, |
| "eval_physics_num_tokens": 15949421.0, |
| "eval_physics_runtime": 85.1496, |
| "eval_physics_samples_per_second": 5.872, |
| "eval_physics_steps_per_second": 1.468, |
| "step": 600 |
| }, |
| { |
| "entropy": 1.1252168361097574, |
| "epoch": 1.0854567708912874, |
| "grad_norm": 18.0, |
| "learning_rate": 1.7711111111111113e-05, |
| "loss": 18.8804, |
| "mean_token_accuracy": 0.7498680882155895, |
| "num_tokens": 16221916.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 1.1838977057486773, |
| "epoch": 1.1032602648269723, |
| "grad_norm": 20.375, |
| "learning_rate": 1.763703703703704e-05, |
| "loss": 19.6525, |
| "mean_token_accuracy": 0.7415466036647558, |
| "num_tokens": 16488154.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 1.16671148724854, |
| "epoch": 1.121063758762657, |
| "grad_norm": 19.75, |
| "learning_rate": 1.7562962962962962e-05, |
| "loss": 19.3184, |
| "mean_token_accuracy": 0.7431947711855174, |
| "num_tokens": 16756828.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 1.1459482550621032, |
| "epoch": 1.138867252698342, |
| "grad_norm": 18.125, |
| "learning_rate": 1.7488888888888892e-05, |
| "loss": 19.0361, |
| "mean_token_accuracy": 0.7469833578914404, |
| "num_tokens": 17023681.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 1.160651782527566, |
| "epoch": 1.156670746634027, |
| "grad_norm": 19.5, |
| "learning_rate": 1.7414814814814815e-05, |
| "loss": 19.3198, |
| "mean_token_accuracy": 0.7445345718413592, |
| "num_tokens": 17286672.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 1.2018978279083967, |
| "epoch": 1.1744742405697117, |
| "grad_norm": 19.875, |
| "learning_rate": 1.7340740740740742e-05, |
| "loss": 19.8959, |
| "mean_token_accuracy": 0.7359397515654564, |
| "num_tokens": 17551581.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 1.1597996551543475, |
| "epoch": 1.1922777345053968, |
| "grad_norm": 18.0, |
| "learning_rate": 1.726666666666667e-05, |
| "loss": 19.2421, |
| "mean_token_accuracy": 0.7448843888938427, |
| "num_tokens": 17821205.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 1.1536007285118104, |
| "epoch": 1.2100812284410816, |
| "grad_norm": 20.125, |
| "learning_rate": 1.7192592592592595e-05, |
| "loss": 19.2532, |
| "mean_token_accuracy": 0.7444427307695151, |
| "num_tokens": 18087759.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 1.1601740807294845, |
| "epoch": 1.2278847223767664, |
| "grad_norm": 19.0, |
| "learning_rate": 1.711851851851852e-05, |
| "loss": 19.2395, |
| "mean_token_accuracy": 0.7444933351129294, |
| "num_tokens": 18355732.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 1.1505332425236703, |
| "epoch": 1.2456882163124514, |
| "grad_norm": 17.75, |
| "learning_rate": 1.7044444444444445e-05, |
| "loss": 19.0722, |
| "mean_token_accuracy": 0.7461056258529425, |
| "num_tokens": 18626245.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.2456882163124514, |
| "eval_biology_entropy": 2.6926132106781004, |
| "eval_biology_loss": 3.016256332397461, |
| "eval_biology_mean_token_accuracy": 0.5140536696910858, |
| "eval_biology_num_tokens": 18626245.0, |
| "eval_biology_runtime": 57.5147, |
| "eval_biology_samples_per_second": 8.693, |
| "eval_biology_steps_per_second": 2.173, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.2456882163124514, |
| "eval_chemistry_entropy": 1.1651022205352783, |
| "eval_chemistry_loss": 1.2187129259109497, |
| "eval_chemistry_mean_token_accuracy": 0.7408477578163147, |
| "eval_chemistry_num_tokens": 18626245.0, |
| "eval_chemistry_runtime": 71.9835, |
| "eval_chemistry_samples_per_second": 6.946, |
| "eval_chemistry_steps_per_second": 1.737, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.2456882163124514, |
| "eval_math_entropy": 1.1168806705474854, |
| "eval_math_loss": 1.4018580913543701, |
| "eval_math_mean_token_accuracy": 0.7227320442199707, |
| "eval_math_num_tokens": 18626245.0, |
| "eval_math_runtime": 75.0558, |
| "eval_math_samples_per_second": 6.662, |
| "eval_math_steps_per_second": 1.665, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.2456882163124514, |
| "eval_physics_entropy": 1.2369258465766906, |
| "eval_physics_loss": 1.316622257232666, |
| "eval_physics_mean_token_accuracy": 0.7343540601730346, |
| "eval_physics_num_tokens": 18626245.0, |
| "eval_physics_runtime": 85.1088, |
| "eval_physics_samples_per_second": 5.875, |
| "eval_physics_steps_per_second": 1.469, |
| "step": 700 |
| }, |
| { |
| "entropy": 1.1377631668001413, |
| "epoch": 1.2634917102481362, |
| "grad_norm": 17.375, |
| "learning_rate": 1.697037037037037e-05, |
| "loss": 18.8327, |
| "mean_token_accuracy": 0.7493732802569866, |
| "num_tokens": 18899632.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 1.167876774072647, |
| "epoch": 1.281295204183821, |
| "grad_norm": 19.375, |
| "learning_rate": 1.6896296296296298e-05, |
| "loss": 19.4079, |
| "mean_token_accuracy": 0.7431724380701781, |
| "num_tokens": 19173635.0, |
| "step": 720 |
| }, |
| { |
| "entropy": 1.0993454653769732, |
| "epoch": 1.299098698119506, |
| "grad_norm": 15.25, |
| "learning_rate": 1.6822222222222224e-05, |
| "loss": 18.3822, |
| "mean_token_accuracy": 0.7549435570836067, |
| "num_tokens": 19453495.0, |
| "step": 730 |
| }, |
| { |
| "entropy": 1.1375744685530662, |
| "epoch": 1.3169021920551909, |
| "grad_norm": 19.0, |
| "learning_rate": 1.6748148148148147e-05, |
| "loss": 18.8503, |
| "mean_token_accuracy": 0.7493278611451387, |
| "num_tokens": 19719086.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 1.1301548499614, |
| "epoch": 1.3347056859908757, |
| "grad_norm": 21.5, |
| "learning_rate": 1.6674074074074077e-05, |
| "loss": 18.8165, |
| "mean_token_accuracy": 0.7469407975673675, |
| "num_tokens": 19981272.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 1.1576253682374955, |
| "epoch": 1.3525091799265607, |
| "grad_norm": 17.75, |
| "learning_rate": 1.66e-05, |
| "loss": 19.2575, |
| "mean_token_accuracy": 0.7463510025292635, |
| "num_tokens": 20243034.0, |
| "step": 760 |
| }, |
| { |
| "entropy": 1.1525329016149044, |
| "epoch": 1.3703126738622455, |
| "grad_norm": 21.5, |
| "learning_rate": 1.6525925925925927e-05, |
| "loss": 19.0713, |
| "mean_token_accuracy": 0.7454641673713922, |
| "num_tokens": 20509028.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 1.196613146364689, |
| "epoch": 1.3881161677979303, |
| "grad_norm": 21.75, |
| "learning_rate": 1.6451851851851853e-05, |
| "loss": 19.9552, |
| "mean_token_accuracy": 0.7380195271223784, |
| "num_tokens": 20762668.0, |
| "step": 780 |
| }, |
| { |
| "entropy": 1.1995733845978975, |
| "epoch": 1.4059196617336152, |
| "grad_norm": 17.375, |
| "learning_rate": 1.637777777777778e-05, |
| "loss": 19.7882, |
| "mean_token_accuracy": 0.7368107169866562, |
| "num_tokens": 21025277.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 1.1568929351866246, |
| "epoch": 1.4237231556693, |
| "grad_norm": 20.0, |
| "learning_rate": 1.6303703703703706e-05, |
| "loss": 19.2121, |
| "mean_token_accuracy": 0.7448243040591478, |
| "num_tokens": 21292320.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.4237231556693, |
| "eval_biology_entropy": 2.6511214599609376, |
| "eval_biology_loss": 2.986257791519165, |
| "eval_biology_mean_token_accuracy": 0.5167679135799408, |
| "eval_biology_num_tokens": 21292320.0, |
| "eval_biology_runtime": 58.1602, |
| "eval_biology_samples_per_second": 8.597, |
| "eval_biology_steps_per_second": 2.149, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.4237231556693, |
| "eval_chemistry_entropy": 1.1516469955444335, |
| "eval_chemistry_loss": 1.205365777015686, |
| "eval_chemistry_mean_token_accuracy": 0.742991099357605, |
| "eval_chemistry_num_tokens": 21292320.0, |
| "eval_chemistry_runtime": 72.059, |
| "eval_chemistry_samples_per_second": 6.939, |
| "eval_chemistry_steps_per_second": 1.735, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.4237231556693, |
| "eval_math_entropy": 1.109243516921997, |
| "eval_math_loss": 1.3933804035186768, |
| "eval_math_mean_token_accuracy": 0.7233971772193909, |
| "eval_math_num_tokens": 21292320.0, |
| "eval_math_runtime": 75.0586, |
| "eval_math_samples_per_second": 6.661, |
| "eval_math_steps_per_second": 1.665, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.4237231556693, |
| "eval_physics_entropy": 1.2258712153434754, |
| "eval_physics_loss": 1.307606816291809, |
| "eval_physics_mean_token_accuracy": 0.735959903717041, |
| "eval_physics_num_tokens": 21292320.0, |
| "eval_physics_runtime": 85.1923, |
| "eval_physics_samples_per_second": 5.869, |
| "eval_physics_steps_per_second": 1.467, |
| "step": 800 |
| }, |
| { |
| "entropy": 1.1539845019578934, |
| "epoch": 1.441526649604985, |
| "grad_norm": 20.625, |
| "learning_rate": 1.622962962962963e-05, |
| "loss": 19.1587, |
| "mean_token_accuracy": 0.7431901153177023, |
| "num_tokens": 21563668.0, |
| "step": 810 |
| }, |
| { |
| "entropy": 1.1525183795019984, |
| "epoch": 1.4593301435406698, |
| "grad_norm": 21.125, |
| "learning_rate": 1.6155555555555556e-05, |
| "loss": 19.1174, |
| "mean_token_accuracy": 0.7469210345298052, |
| "num_tokens": 21817298.0, |
| "step": 820 |
| }, |
| { |
| "entropy": 1.1620672512799501, |
| "epoch": 1.4771336374763546, |
| "grad_norm": 20.625, |
| "learning_rate": 1.6081481481481482e-05, |
| "loss": 19.3092, |
| "mean_token_accuracy": 0.7433301910758019, |
| "num_tokens": 22083010.0, |
| "step": 830 |
| }, |
| { |
| "entropy": 1.1712669901549817, |
| "epoch": 1.4949371314120397, |
| "grad_norm": 21.875, |
| "learning_rate": 1.600740740740741e-05, |
| "loss": 19.4144, |
| "mean_token_accuracy": 0.7423816129565239, |
| "num_tokens": 22350046.0, |
| "step": 840 |
| }, |
| { |
| "entropy": 1.1148280471563339, |
| "epoch": 1.5127406253477245, |
| "grad_norm": 18.125, |
| "learning_rate": 1.5933333333333336e-05, |
| "loss": 18.5777, |
| "mean_token_accuracy": 0.7518805626779794, |
| "num_tokens": 22615933.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 1.12065857835114, |
| "epoch": 1.5305441192834093, |
| "grad_norm": 20.75, |
| "learning_rate": 1.5859259259259262e-05, |
| "loss": 18.6024, |
| "mean_token_accuracy": 0.7530040096491575, |
| "num_tokens": 22886843.0, |
| "step": 860 |
| }, |
| { |
| "entropy": 1.1522621292620898, |
| "epoch": 1.5483476132190943, |
| "grad_norm": 20.5, |
| "learning_rate": 1.5785185185185185e-05, |
| "loss": 19.1737, |
| "mean_token_accuracy": 0.7450519923120738, |
| "num_tokens": 23140244.0, |
| "step": 870 |
| }, |
| { |
| "entropy": 1.1377869185060263, |
| "epoch": 1.5661511071547791, |
| "grad_norm": 19.5, |
| "learning_rate": 1.571111111111111e-05, |
| "loss": 18.9524, |
| "mean_token_accuracy": 0.7473382774740458, |
| "num_tokens": 23405121.0, |
| "step": 880 |
| }, |
| { |
| "entropy": 1.148686793074012, |
| "epoch": 1.583954601090464, |
| "grad_norm": 20.125, |
| "learning_rate": 1.5637037037037038e-05, |
| "loss": 19.0497, |
| "mean_token_accuracy": 0.7466071300208569, |
| "num_tokens": 23671826.0, |
| "step": 890 |
| }, |
| { |
| "entropy": 1.1471959844231605, |
| "epoch": 1.601758095026149, |
| "grad_norm": 20.5, |
| "learning_rate": 1.5562962962962965e-05, |
| "loss": 19.0957, |
| "mean_token_accuracy": 0.7456789817661047, |
| "num_tokens": 23933374.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.601758095026149, |
| "eval_biology_entropy": 2.7071199588775636, |
| "eval_biology_loss": 2.972935199737549, |
| "eval_biology_mean_token_accuracy": 0.5189641232490539, |
| "eval_biology_num_tokens": 23933374.0, |
| "eval_biology_runtime": 57.6355, |
| "eval_biology_samples_per_second": 8.675, |
| "eval_biology_steps_per_second": 2.169, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.601758095026149, |
| "eval_chemistry_entropy": 1.162458167076111, |
| "eval_chemistry_loss": 1.195241093635559, |
| "eval_chemistry_mean_token_accuracy": 0.7447079033851624, |
| "eval_chemistry_num_tokens": 23933374.0, |
| "eval_chemistry_runtime": 72.2714, |
| "eval_chemistry_samples_per_second": 6.918, |
| "eval_chemistry_steps_per_second": 1.73, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.601758095026149, |
| "eval_math_entropy": 1.1186215715408325, |
| "eval_math_loss": 1.3856722116470337, |
| "eval_math_mean_token_accuracy": 0.7245329103469849, |
| "eval_math_num_tokens": 23933374.0, |
| "eval_math_runtime": 75.4805, |
| "eval_math_samples_per_second": 6.624, |
| "eval_math_steps_per_second": 1.656, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.601758095026149, |
| "eval_physics_entropy": 1.2408670482635498, |
| "eval_physics_loss": 1.2979875802993774, |
| "eval_physics_mean_token_accuracy": 0.736989695072174, |
| "eval_physics_num_tokens": 23933374.0, |
| "eval_physics_runtime": 85.587, |
| "eval_physics_samples_per_second": 5.842, |
| "eval_physics_steps_per_second": 1.461, |
| "step": 900 |
| }, |
| { |
| "entropy": 1.0977809239178895, |
| "epoch": 1.6195615889618338, |
| "grad_norm": 19.875, |
| "learning_rate": 1.548888888888889e-05, |
| "loss": 18.2108, |
| "mean_token_accuracy": 0.7550504490733146, |
| "num_tokens": 24210156.0, |
| "step": 910 |
| }, |
| { |
| "entropy": 1.14602930508554, |
| "epoch": 1.6373650828975186, |
| "grad_norm": 22.375, |
| "learning_rate": 1.5414814814814814e-05, |
| "loss": 19.1509, |
| "mean_token_accuracy": 0.7462662551552057, |
| "num_tokens": 24462726.0, |
| "step": 920 |
| }, |
| { |
| "entropy": 1.0786818396300077, |
| "epoch": 1.6551685768332036, |
| "grad_norm": 19.5, |
| "learning_rate": 1.5340740740740744e-05, |
| "loss": 17.8442, |
| "mean_token_accuracy": 0.758844393491745, |
| "num_tokens": 24732815.0, |
| "step": 930 |
| }, |
| { |
| "entropy": 1.1209699012339116, |
| "epoch": 1.6729720707688884, |
| "grad_norm": 21.0, |
| "learning_rate": 1.5266666666666667e-05, |
| "loss": 18.5603, |
| "mean_token_accuracy": 0.7515024449676275, |
| "num_tokens": 25002091.0, |
| "step": 940 |
| }, |
| { |
| "entropy": 1.112015390396118, |
| "epoch": 1.6907755647045732, |
| "grad_norm": 19.5, |
| "learning_rate": 1.5192592592592594e-05, |
| "loss": 18.5596, |
| "mean_token_accuracy": 0.7524737507104874, |
| "num_tokens": 25267949.0, |
| "step": 950 |
| }, |
| { |
| "entropy": 1.1459401201456785, |
| "epoch": 1.7085790586402583, |
| "grad_norm": 18.5, |
| "learning_rate": 1.5118518518518519e-05, |
| "loss": 18.9955, |
| "mean_token_accuracy": 0.7468857653439045, |
| "num_tokens": 25525772.0, |
| "step": 960 |
| }, |
| { |
| "entropy": 1.1483839362859727, |
| "epoch": 1.7263825525759429, |
| "grad_norm": 19.25, |
| "learning_rate": 1.5044444444444445e-05, |
| "loss": 19.0262, |
| "mean_token_accuracy": 0.7459982354193926, |
| "num_tokens": 25787727.0, |
| "step": 970 |
| }, |
| { |
| "entropy": 1.1058822341263295, |
| "epoch": 1.744186046511628, |
| "grad_norm": 17.375, |
| "learning_rate": 1.497037037037037e-05, |
| "loss": 18.4319, |
| "mean_token_accuracy": 0.7541479263454676, |
| "num_tokens": 26058026.0, |
| "step": 980 |
| }, |
| { |
| "entropy": 1.0693828593939543, |
| "epoch": 1.761989540447313, |
| "grad_norm": 18.375, |
| "learning_rate": 1.4896296296296298e-05, |
| "loss": 17.7423, |
| "mean_token_accuracy": 0.7604560740292072, |
| "num_tokens": 26326631.0, |
| "step": 990 |
| }, |
| { |
| "entropy": 1.1148248281329871, |
| "epoch": 1.7797930343829975, |
| "grad_norm": 19.125, |
| "learning_rate": 1.4822222222222225e-05, |
| "loss": 18.5599, |
| "mean_token_accuracy": 0.75152304507792, |
| "num_tokens": 26591735.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.7797930343829975, |
| "eval_biology_entropy": 2.7020350008010863, |
| "eval_biology_loss": 2.959472179412842, |
| "eval_biology_mean_token_accuracy": 0.5197799344062806, |
| "eval_biology_num_tokens": 26591735.0, |
| "eval_biology_runtime": 57.5048, |
| "eval_biology_samples_per_second": 8.695, |
| "eval_biology_steps_per_second": 2.174, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.7797930343829975, |
| "eval_chemistry_entropy": 1.14980606174469, |
| "eval_chemistry_loss": 1.186444878578186, |
| "eval_chemistry_mean_token_accuracy": 0.7462989177703857, |
| "eval_chemistry_num_tokens": 26591735.0, |
| "eval_chemistry_runtime": 71.6887, |
| "eval_chemistry_samples_per_second": 6.975, |
| "eval_chemistry_steps_per_second": 1.744, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.7797930343829975, |
| "eval_math_entropy": 1.105127462387085, |
| "eval_math_loss": 1.3818320035934448, |
| "eval_math_mean_token_accuracy": 0.7256039061546325, |
| "eval_math_num_tokens": 26591735.0, |
| "eval_math_runtime": 75.0009, |
| "eval_math_samples_per_second": 6.667, |
| "eval_math_steps_per_second": 1.667, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.7797930343829975, |
| "eval_physics_entropy": 1.2302804527282716, |
| "eval_physics_loss": 1.2946749925613403, |
| "eval_physics_mean_token_accuracy": 0.7378688325881958, |
| "eval_physics_num_tokens": 26591735.0, |
| "eval_physics_runtime": 85.3338, |
| "eval_physics_samples_per_second": 5.859, |
| "eval_physics_steps_per_second": 1.465, |
| "step": 1000 |
| }, |
| { |
| "entropy": 1.1227924428880214, |
| "epoch": 1.7975965283186826, |
| "grad_norm": 20.0, |
| "learning_rate": 1.474814814814815e-05, |
| "loss": 18.6939, |
| "mean_token_accuracy": 0.7498056028038264, |
| "num_tokens": 26857071.0, |
| "step": 1010 |
| }, |
| { |
| "entropy": 1.1648254558444022, |
| "epoch": 1.8154000222543676, |
| "grad_norm": 19.25, |
| "learning_rate": 1.4674074074074076e-05, |
| "loss": 19.3387, |
| "mean_token_accuracy": 0.7423548739403486, |
| "num_tokens": 27120617.0, |
| "step": 1020 |
| }, |
| { |
| "entropy": 1.0981685355305673, |
| "epoch": 1.8332035161900522, |
| "grad_norm": 21.25, |
| "learning_rate": 1.46e-05, |
| "loss": 18.3023, |
| "mean_token_accuracy": 0.7560988407582044, |
| "num_tokens": 27391174.0, |
| "step": 1030 |
| }, |
| { |
| "entropy": 1.139582459628582, |
| "epoch": 1.8510070101257372, |
| "grad_norm": 17.0, |
| "learning_rate": 1.4525925925925927e-05, |
| "loss": 18.8114, |
| "mean_token_accuracy": 0.7490788519382476, |
| "num_tokens": 27656941.0, |
| "step": 1040 |
| }, |
| { |
| "entropy": 1.0934928126633168, |
| "epoch": 1.868810504061422, |
| "grad_norm": 19.125, |
| "learning_rate": 1.4451851851851852e-05, |
| "loss": 18.1743, |
| "mean_token_accuracy": 0.7558255009353161, |
| "num_tokens": 27927118.0, |
| "step": 1050 |
| }, |
| { |
| "entropy": 1.0726651160046459, |
| "epoch": 1.8866139979971068, |
| "grad_norm": 17.25, |
| "learning_rate": 1.4377777777777779e-05, |
| "loss": 17.8591, |
| "mean_token_accuracy": 0.7595527049154043, |
| "num_tokens": 28192620.0, |
| "step": 1060 |
| }, |
| { |
| "entropy": 1.1046127401292325, |
| "epoch": 1.9044174919327919, |
| "grad_norm": 19.875, |
| "learning_rate": 1.4303703703703703e-05, |
| "loss": 18.4902, |
| "mean_token_accuracy": 0.753314646333456, |
| "num_tokens": 28458168.0, |
| "step": 1070 |
| }, |
| { |
| "entropy": 1.1121449010446667, |
| "epoch": 1.9222209858684767, |
| "grad_norm": 20.0, |
| "learning_rate": 1.4229629629629632e-05, |
| "loss": 18.5865, |
| "mean_token_accuracy": 0.7521534610539675, |
| "num_tokens": 28722756.0, |
| "step": 1080 |
| }, |
| { |
| "entropy": 1.1442356582731008, |
| "epoch": 1.9400244798041615, |
| "grad_norm": 24.5, |
| "learning_rate": 1.4155555555555556e-05, |
| "loss": 18.8683, |
| "mean_token_accuracy": 0.7469607297331095, |
| "num_tokens": 28978375.0, |
| "step": 1090 |
| }, |
| { |
| "entropy": 1.1289800632745028, |
| "epoch": 1.9578279737398465, |
| "grad_norm": 18.875, |
| "learning_rate": 1.4081481481481483e-05, |
| "loss": 18.8067, |
| "mean_token_accuracy": 0.7480562552809715, |
| "num_tokens": 29236865.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.9578279737398465, |
| "eval_biology_entropy": 2.670331030845642, |
| "eval_biology_loss": 2.9537129402160645, |
| "eval_biology_mean_token_accuracy": 0.5207649791240692, |
| "eval_biology_num_tokens": 29236865.0, |
| "eval_biology_runtime": 57.7384, |
| "eval_biology_samples_per_second": 8.66, |
| "eval_biology_steps_per_second": 2.165, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.9578279737398465, |
| "eval_chemistry_entropy": 1.1356218423843383, |
| "eval_chemistry_loss": 1.179543375968933, |
| "eval_chemistry_mean_token_accuracy": 0.7473730354309082, |
| "eval_chemistry_num_tokens": 29236865.0, |
| "eval_chemistry_runtime": 72.0122, |
| "eval_chemistry_samples_per_second": 6.943, |
| "eval_chemistry_steps_per_second": 1.736, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.9578279737398465, |
| "eval_math_entropy": 1.0909526333808899, |
| "eval_math_loss": 1.3782899379730225, |
| "eval_math_mean_token_accuracy": 0.726540036201477, |
| "eval_math_num_tokens": 29236865.0, |
| "eval_math_runtime": 74.8523, |
| "eval_math_samples_per_second": 6.68, |
| "eval_math_steps_per_second": 1.67, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.9578279737398465, |
| "eval_physics_entropy": 1.2169800367355346, |
| "eval_physics_loss": 1.2896714210510254, |
| "eval_physics_mean_token_accuracy": 0.7383955039978027, |
| "eval_physics_num_tokens": 29236865.0, |
| "eval_physics_runtime": 85.1627, |
| "eval_physics_samples_per_second": 5.871, |
| "eval_physics_steps_per_second": 1.468, |
| "step": 1100 |
| }, |
| { |
| "entropy": 1.0923401776701211, |
| "epoch": 1.9756314676755313, |
| "grad_norm": 20.25, |
| "learning_rate": 1.400740740740741e-05, |
| "loss": 18.3288, |
| "mean_token_accuracy": 0.7545268822461366, |
| "num_tokens": 29500428.0, |
| "step": 1110 |
| }, |
| { |
| "entropy": 1.1163721553981305, |
| "epoch": 1.9934349616112161, |
| "grad_norm": 21.5, |
| "learning_rate": 1.3933333333333334e-05, |
| "loss": 18.3996, |
| "mean_token_accuracy": 0.7531708285212517, |
| "num_tokens": 29766739.0, |
| "step": 1120 |
| }, |
| { |
| "entropy": 1.1096870253163, |
| "epoch": 2.010682096361411, |
| "grad_norm": 18.75, |
| "learning_rate": 1.385925925925926e-05, |
| "loss": 17.8243, |
| "mean_token_accuracy": 0.753396144605452, |
| "num_tokens": 30025427.0, |
| "step": 1130 |
| }, |
| { |
| "entropy": 1.1090912133455277, |
| "epoch": 2.028485590297096, |
| "grad_norm": 22.25, |
| "learning_rate": 1.3785185185185186e-05, |
| "loss": 18.2931, |
| "mean_token_accuracy": 0.753648667037487, |
| "num_tokens": 30286651.0, |
| "step": 1140 |
| }, |
| { |
| "entropy": 1.1002933204174041, |
| "epoch": 2.0462890842327806, |
| "grad_norm": 19.875, |
| "learning_rate": 1.3711111111111112e-05, |
| "loss": 18.3961, |
| "mean_token_accuracy": 0.7534263014793396, |
| "num_tokens": 30552624.0, |
| "step": 1150 |
| }, |
| { |
| "entropy": 1.0910452891141176, |
| "epoch": 2.0640925781684656, |
| "grad_norm": 22.75, |
| "learning_rate": 1.3637037037037037e-05, |
| "loss": 17.9244, |
| "mean_token_accuracy": 0.7581309732049704, |
| "num_tokens": 30823746.0, |
| "step": 1160 |
| }, |
| { |
| "entropy": 1.1619377303868532, |
| "epoch": 2.0818960721041506, |
| "grad_norm": 22.25, |
| "learning_rate": 1.3562962962962965e-05, |
| "loss": 19.4184, |
| "mean_token_accuracy": 0.7404937222599983, |
| "num_tokens": 31076307.0, |
| "step": 1170 |
| }, |
| { |
| "entropy": 1.144884167984128, |
| "epoch": 2.099699566039835, |
| "grad_norm": 20.875, |
| "learning_rate": 1.3488888888888888e-05, |
| "loss": 18.8592, |
| "mean_token_accuracy": 0.7461796149611473, |
| "num_tokens": 31340763.0, |
| "step": 1180 |
| }, |
| { |
| "entropy": 1.084285356104374, |
| "epoch": 2.1175030599755202, |
| "grad_norm": 19.0, |
| "learning_rate": 1.3414814814814817e-05, |
| "loss": 18.0967, |
| "mean_token_accuracy": 0.756497149169445, |
| "num_tokens": 31607200.0, |
| "step": 1190 |
| }, |
| { |
| "entropy": 1.1388497594743967, |
| "epoch": 2.1353065539112053, |
| "grad_norm": 19.75, |
| "learning_rate": 1.3340740740740741e-05, |
| "loss": 18.82, |
| "mean_token_accuracy": 0.7471547801047563, |
| "num_tokens": 31874392.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.1353065539112053, |
| "eval_biology_entropy": 2.6481752424240113, |
| "eval_biology_loss": 2.955437183380127, |
| "eval_biology_mean_token_accuracy": 0.5212604312896728, |
| "eval_biology_num_tokens": 31874392.0, |
| "eval_biology_runtime": 57.623, |
| "eval_biology_samples_per_second": 8.677, |
| "eval_biology_steps_per_second": 2.169, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.1353065539112053, |
| "eval_chemistry_entropy": 1.1267094049453736, |
| "eval_chemistry_loss": 1.174714207649231, |
| "eval_chemistry_mean_token_accuracy": 0.7476038336753845, |
| "eval_chemistry_num_tokens": 31874392.0, |
| "eval_chemistry_runtime": 72.0197, |
| "eval_chemistry_samples_per_second": 6.943, |
| "eval_chemistry_steps_per_second": 1.736, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.1353065539112053, |
| "eval_math_entropy": 1.091456967830658, |
| "eval_math_loss": 1.376819133758545, |
| "eval_math_mean_token_accuracy": 0.7268339867591858, |
| "eval_math_num_tokens": 31874392.0, |
| "eval_math_runtime": 75.1846, |
| "eval_math_samples_per_second": 6.65, |
| "eval_math_steps_per_second": 1.663, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.1353065539112053, |
| "eval_physics_entropy": 1.2114794454574584, |
| "eval_physics_loss": 1.285851240158081, |
| "eval_physics_mean_token_accuracy": 0.7391742224693298, |
| "eval_physics_num_tokens": 31874392.0, |
| "eval_physics_runtime": 85.115, |
| "eval_physics_samples_per_second": 5.874, |
| "eval_physics_steps_per_second": 1.469, |
| "step": 1200 |
| }, |
| { |
| "entropy": 1.1082856599241495, |
| "epoch": 2.15311004784689, |
| "grad_norm": 18.625, |
| "learning_rate": 1.3266666666666668e-05, |
| "loss": 18.4609, |
| "mean_token_accuracy": 0.7532401513308287, |
| "num_tokens": 32144016.0, |
| "step": 1210 |
| }, |
| { |
| "entropy": 1.0737901078537107, |
| "epoch": 2.170913541782575, |
| "grad_norm": 18.375, |
| "learning_rate": 1.3192592592592594e-05, |
| "loss": 17.9022, |
| "mean_token_accuracy": 0.7580810189247131, |
| "num_tokens": 32410902.0, |
| "step": 1220 |
| }, |
| { |
| "entropy": 1.0895192243158818, |
| "epoch": 2.18871703571826, |
| "grad_norm": 21.375, |
| "learning_rate": 1.311851851851852e-05, |
| "loss": 18.0794, |
| "mean_token_accuracy": 0.7569272886961699, |
| "num_tokens": 32674338.0, |
| "step": 1230 |
| }, |
| { |
| "entropy": 1.1305242408066989, |
| "epoch": 2.2065205296539445, |
| "grad_norm": 16.0, |
| "learning_rate": 1.3044444444444446e-05, |
| "loss": 18.7777, |
| "mean_token_accuracy": 0.7519055023789406, |
| "num_tokens": 32930219.0, |
| "step": 1240 |
| }, |
| { |
| "entropy": 1.1194303661584855, |
| "epoch": 2.2243240235896296, |
| "grad_norm": 18.375, |
| "learning_rate": 1.297037037037037e-05, |
| "loss": 18.3991, |
| "mean_token_accuracy": 0.7505009181797504, |
| "num_tokens": 33194318.0, |
| "step": 1250 |
| }, |
| { |
| "entropy": 1.094122551381588, |
| "epoch": 2.242127517525314, |
| "grad_norm": 21.875, |
| "learning_rate": 1.2896296296296299e-05, |
| "loss": 18.4428, |
| "mean_token_accuracy": 0.7547226294875145, |
| "num_tokens": 33453292.0, |
| "step": 1260 |
| }, |
| { |
| "entropy": 1.0551138285547494, |
| "epoch": 2.259931011460999, |
| "grad_norm": 19.5, |
| "learning_rate": 1.2822222222222222e-05, |
| "loss": 17.3115, |
| "mean_token_accuracy": 0.765355784446001, |
| "num_tokens": 33728875.0, |
| "step": 1270 |
| }, |
| { |
| "entropy": 1.0934181027114391, |
| "epoch": 2.277734505396684, |
| "grad_norm": 21.25, |
| "learning_rate": 1.274814814814815e-05, |
| "loss": 18.262, |
| "mean_token_accuracy": 0.7533521149307489, |
| "num_tokens": 33999305.0, |
| "step": 1280 |
| }, |
| { |
| "entropy": 1.1094968844205142, |
| "epoch": 2.295537999332369, |
| "grad_norm": 22.0, |
| "learning_rate": 1.2674074074074075e-05, |
| "loss": 18.2857, |
| "mean_token_accuracy": 0.7538360808044672, |
| "num_tokens": 34269727.0, |
| "step": 1290 |
| }, |
| { |
| "entropy": 1.0546644374728202, |
| "epoch": 2.313341493268054, |
| "grad_norm": 18.125, |
| "learning_rate": 1.2600000000000001e-05, |
| "loss": 17.4746, |
| "mean_token_accuracy": 0.7642518579959869, |
| "num_tokens": 34537714.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.313341493268054, |
| "eval_biology_entropy": 2.597072058677673, |
| "eval_biology_loss": 2.9495482444763184, |
| "eval_biology_mean_token_accuracy": 0.5221131265163421, |
| "eval_biology_num_tokens": 34537714.0, |
| "eval_biology_runtime": 57.4015, |
| "eval_biology_samples_per_second": 8.711, |
| "eval_biology_steps_per_second": 2.178, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.313341493268054, |
| "eval_chemistry_entropy": 1.1102874970436096, |
| "eval_chemistry_loss": 1.1702996492385864, |
| "eval_chemistry_mean_token_accuracy": 0.7484057130813598, |
| "eval_chemistry_num_tokens": 34537714.0, |
| "eval_chemistry_runtime": 72.2853, |
| "eval_chemistry_samples_per_second": 6.917, |
| "eval_chemistry_steps_per_second": 1.729, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.313341493268054, |
| "eval_math_entropy": 1.0795171055793762, |
| "eval_math_loss": 1.3760371208190918, |
| "eval_math_mean_token_accuracy": 0.7270366640090943, |
| "eval_math_num_tokens": 34537714.0, |
| "eval_math_runtime": 75.1171, |
| "eval_math_samples_per_second": 6.656, |
| "eval_math_steps_per_second": 1.664, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.313341493268054, |
| "eval_physics_entropy": 1.195563290119171, |
| "eval_physics_loss": 1.2820779085159302, |
| "eval_physics_mean_token_accuracy": 0.7392076215744019, |
| "eval_physics_num_tokens": 34537714.0, |
| "eval_physics_runtime": 85.1797, |
| "eval_physics_samples_per_second": 5.87, |
| "eval_physics_steps_per_second": 1.467, |
| "step": 1300 |
| }, |
| { |
| "entropy": 1.1043903956189751, |
| "epoch": 2.331144987203739, |
| "grad_norm": 19.125, |
| "learning_rate": 1.2525925925925928e-05, |
| "loss": 18.5308, |
| "mean_token_accuracy": 0.7503409255295992, |
| "num_tokens": 34806647.0, |
| "step": 1310 |
| }, |
| { |
| "entropy": 1.0996684737503528, |
| "epoch": 2.3489484811394234, |
| "grad_norm": 18.25, |
| "learning_rate": 1.2451851851851853e-05, |
| "loss": 18.0647, |
| "mean_token_accuracy": 0.7559586096554994, |
| "num_tokens": 35076423.0, |
| "step": 1320 |
| }, |
| { |
| "entropy": 1.1198013797402382, |
| "epoch": 2.3667519750751085, |
| "grad_norm": 20.75, |
| "learning_rate": 1.237777777777778e-05, |
| "loss": 18.6202, |
| "mean_token_accuracy": 0.7494822707027197, |
| "num_tokens": 35343494.0, |
| "step": 1330 |
| }, |
| { |
| "entropy": 1.0966038379818202, |
| "epoch": 2.3845554690107935, |
| "grad_norm": 20.25, |
| "learning_rate": 1.2303703703703704e-05, |
| "loss": 18.1883, |
| "mean_token_accuracy": 0.754872740060091, |
| "num_tokens": 35614731.0, |
| "step": 1340 |
| }, |
| { |
| "entropy": 1.1123589921742678, |
| "epoch": 2.402358962946478, |
| "grad_norm": 20.875, |
| "learning_rate": 1.222962962962963e-05, |
| "loss": 18.456, |
| "mean_token_accuracy": 0.7513397440314293, |
| "num_tokens": 35877264.0, |
| "step": 1350 |
| }, |
| { |
| "entropy": 1.061964062973857, |
| "epoch": 2.420162456882163, |
| "grad_norm": 18.375, |
| "learning_rate": 1.2155555555555555e-05, |
| "loss": 17.6878, |
| "mean_token_accuracy": 0.7598675034940243, |
| "num_tokens": 36147587.0, |
| "step": 1360 |
| }, |
| { |
| "entropy": 1.1040230866521596, |
| "epoch": 2.437965950817848, |
| "grad_norm": 21.5, |
| "learning_rate": 1.2081481481481484e-05, |
| "loss": 18.3097, |
| "mean_token_accuracy": 0.7549828208982945, |
| "num_tokens": 36416125.0, |
| "step": 1370 |
| }, |
| { |
| "entropy": 1.1043301727622747, |
| "epoch": 2.4557694447535328, |
| "grad_norm": 21.5, |
| "learning_rate": 1.2007407407407408e-05, |
| "loss": 18.3398, |
| "mean_token_accuracy": 0.7537022326141596, |
| "num_tokens": 36680383.0, |
| "step": 1380 |
| }, |
| { |
| "entropy": 1.1061188193038105, |
| "epoch": 2.473572938689218, |
| "grad_norm": 19.875, |
| "learning_rate": 1.1933333333333335e-05, |
| "loss": 18.3255, |
| "mean_token_accuracy": 0.7524043101817369, |
| "num_tokens": 36946349.0, |
| "step": 1390 |
| }, |
| { |
| "entropy": 1.0568878058344127, |
| "epoch": 2.491376432624903, |
| "grad_norm": 16.875, |
| "learning_rate": 1.185925925925926e-05, |
| "loss": 17.5249, |
| "mean_token_accuracy": 0.7631021294742822, |
| "num_tokens": 37219532.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.491376432624903, |
| "eval_biology_entropy": 2.652871153831482, |
| "eval_biology_loss": 2.94612979888916, |
| "eval_biology_mean_token_accuracy": 0.5220030138492584, |
| "eval_biology_num_tokens": 37219532.0, |
| "eval_biology_runtime": 57.6473, |
| "eval_biology_samples_per_second": 8.673, |
| "eval_biology_steps_per_second": 2.168, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.491376432624903, |
| "eval_chemistry_entropy": 1.1232130107879639, |
| "eval_chemistry_loss": 1.1665852069854736, |
| "eval_chemistry_mean_token_accuracy": 0.7491925139427185, |
| "eval_chemistry_num_tokens": 37219532.0, |
| "eval_chemistry_runtime": 72.2294, |
| "eval_chemistry_samples_per_second": 6.922, |
| "eval_chemistry_steps_per_second": 1.731, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.491376432624903, |
| "eval_math_entropy": 1.0859251236915588, |
| "eval_math_loss": 1.372695803642273, |
| "eval_math_mean_token_accuracy": 0.7273387761116028, |
| "eval_math_num_tokens": 37219532.0, |
| "eval_math_runtime": 75.0046, |
| "eval_math_samples_per_second": 6.666, |
| "eval_math_steps_per_second": 1.667, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.491376432624903, |
| "eval_physics_entropy": 1.2078160109519958, |
| "eval_physics_loss": 1.2796850204467773, |
| "eval_physics_mean_token_accuracy": 0.7398202438354492, |
| "eval_physics_num_tokens": 37219532.0, |
| "eval_physics_runtime": 85.3077, |
| "eval_physics_samples_per_second": 5.861, |
| "eval_physics_steps_per_second": 1.465, |
| "step": 1400 |
| }, |
| { |
| "entropy": 1.091886579245329, |
| "epoch": 2.5091799265605874, |
| "grad_norm": 20.25, |
| "learning_rate": 1.1785185185185186e-05, |
| "loss": 18.0794, |
| "mean_token_accuracy": 0.7578685782849789, |
| "num_tokens": 37477914.0, |
| "step": 1410 |
| }, |
| { |
| "entropy": 1.0806248864158987, |
| "epoch": 2.5269834204962724, |
| "grad_norm": 18.875, |
| "learning_rate": 1.1711111111111113e-05, |
| "loss": 18.0524, |
| "mean_token_accuracy": 0.7555962100625038, |
| "num_tokens": 37746690.0, |
| "step": 1420 |
| }, |
| { |
| "entropy": 1.136592934280634, |
| "epoch": 2.544786914431957, |
| "grad_norm": 23.375, |
| "learning_rate": 1.1637037037037037e-05, |
| "loss": 18.8612, |
| "mean_token_accuracy": 0.7464782755821944, |
| "num_tokens": 38005792.0, |
| "step": 1430 |
| }, |
| { |
| "entropy": 1.117892136052251, |
| "epoch": 2.562590408367642, |
| "grad_norm": 19.875, |
| "learning_rate": 1.1562962962962964e-05, |
| "loss": 18.543, |
| "mean_token_accuracy": 0.751380517706275, |
| "num_tokens": 38269021.0, |
| "step": 1440 |
| }, |
| { |
| "entropy": 1.1022701445966958, |
| "epoch": 2.580393902303327, |
| "grad_norm": 18.625, |
| "learning_rate": 1.1488888888888889e-05, |
| "loss": 18.1605, |
| "mean_token_accuracy": 0.7542933959513902, |
| "num_tokens": 38545224.0, |
| "step": 1450 |
| }, |
| { |
| "entropy": 1.0846613895148038, |
| "epoch": 2.598197396239012, |
| "grad_norm": 18.375, |
| "learning_rate": 1.1414814814814817e-05, |
| "loss": 18.1621, |
| "mean_token_accuracy": 0.7546363666653633, |
| "num_tokens": 38802290.0, |
| "step": 1460 |
| }, |
| { |
| "entropy": 1.079243928194046, |
| "epoch": 2.6160008901746967, |
| "grad_norm": 20.125, |
| "learning_rate": 1.1340740740740742e-05, |
| "loss": 17.8188, |
| "mean_token_accuracy": 0.7600852824747563, |
| "num_tokens": 39065787.0, |
| "step": 1470 |
| }, |
| { |
| "entropy": 1.0927384681999683, |
| "epoch": 2.6338043841103818, |
| "grad_norm": 20.25, |
| "learning_rate": 1.1266666666666668e-05, |
| "loss": 18.1991, |
| "mean_token_accuracy": 0.7551762603223324, |
| "num_tokens": 39328517.0, |
| "step": 1480 |
| }, |
| { |
| "entropy": 1.0439645014703274, |
| "epoch": 2.6516078780460663, |
| "grad_norm": 19.25, |
| "learning_rate": 1.1192592592592593e-05, |
| "loss": 17.3405, |
| "mean_token_accuracy": 0.763591681048274, |
| "num_tokens": 39600641.0, |
| "step": 1490 |
| }, |
| { |
| "entropy": 1.1232297539710998, |
| "epoch": 2.6694113719817514, |
| "grad_norm": 23.5, |
| "learning_rate": 1.111851851851852e-05, |
| "loss": 18.7, |
| "mean_token_accuracy": 0.7493483603000641, |
| "num_tokens": 39857917.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.6694113719817514, |
| "eval_biology_entropy": 2.6466422786712647, |
| "eval_biology_loss": 2.9346089363098145, |
| "eval_biology_mean_token_accuracy": 0.5228305985927582, |
| "eval_biology_num_tokens": 39857917.0, |
| "eval_biology_runtime": 58.0118, |
| "eval_biology_samples_per_second": 8.619, |
| "eval_biology_steps_per_second": 2.155, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.6694113719817514, |
| "eval_chemistry_entropy": 1.118860065460205, |
| "eval_chemistry_loss": 1.1640205383300781, |
| "eval_chemistry_mean_token_accuracy": 0.7496954665184021, |
| "eval_chemistry_num_tokens": 39857917.0, |
| "eval_chemistry_runtime": 71.5035, |
| "eval_chemistry_samples_per_second": 6.993, |
| "eval_chemistry_steps_per_second": 1.748, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.6694113719817514, |
| "eval_math_entropy": 1.0849408197402954, |
| "eval_math_loss": 1.370849847793579, |
| "eval_math_mean_token_accuracy": 0.7275271005630494, |
| "eval_math_num_tokens": 39857917.0, |
| "eval_math_runtime": 74.7261, |
| "eval_math_samples_per_second": 6.691, |
| "eval_math_steps_per_second": 1.673, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.6694113719817514, |
| "eval_physics_entropy": 1.20153675699234, |
| "eval_physics_loss": 1.277133822441101, |
| "eval_physics_mean_token_accuracy": 0.7400126695632935, |
| "eval_physics_num_tokens": 39857917.0, |
| "eval_physics_runtime": 84.812, |
| "eval_physics_samples_per_second": 5.895, |
| "eval_physics_steps_per_second": 1.474, |
| "step": 1500 |
| }, |
| { |
| "entropy": 1.0860063921660186, |
| "epoch": 2.6872148659174364, |
| "grad_norm": 21.125, |
| "learning_rate": 1.1044444444444444e-05, |
| "loss": 18.0853, |
| "mean_token_accuracy": 0.7563430912792682, |
| "num_tokens": 40119174.0, |
| "step": 1510 |
| }, |
| { |
| "entropy": 1.129167440533638, |
| "epoch": 2.7050183598531214, |
| "grad_norm": 20.125, |
| "learning_rate": 1.0970370370370371e-05, |
| "loss": 18.6618, |
| "mean_token_accuracy": 0.7484573539346456, |
| "num_tokens": 40379054.0, |
| "step": 1520 |
| }, |
| { |
| "entropy": 1.07851094417274, |
| "epoch": 2.722821853788806, |
| "grad_norm": 19.0, |
| "learning_rate": 1.0896296296296298e-05, |
| "loss": 18.083, |
| "mean_token_accuracy": 0.7540540069341659, |
| "num_tokens": 40650970.0, |
| "step": 1530 |
| }, |
| { |
| "entropy": 1.1019277192652226, |
| "epoch": 2.740625347724491, |
| "grad_norm": 18.625, |
| "learning_rate": 1.0822222222222222e-05, |
| "loss": 18.2659, |
| "mean_token_accuracy": 0.7548778887838126, |
| "num_tokens": 40914339.0, |
| "step": 1540 |
| }, |
| { |
| "entropy": 1.1040710996836425, |
| "epoch": 2.7584288416601757, |
| "grad_norm": 21.0, |
| "learning_rate": 1.074814814814815e-05, |
| "loss": 18.273, |
| "mean_token_accuracy": 0.7527279917150735, |
| "num_tokens": 41181827.0, |
| "step": 1550 |
| }, |
| { |
| "entropy": 1.0668385986238718, |
| "epoch": 2.7762323355958607, |
| "grad_norm": 22.0, |
| "learning_rate": 1.0674074074074074e-05, |
| "loss": 17.8438, |
| "mean_token_accuracy": 0.7587875317782163, |
| "num_tokens": 41448961.0, |
| "step": 1560 |
| }, |
| { |
| "entropy": 1.0884086616337298, |
| "epoch": 2.7940358295315457, |
| "grad_norm": 21.75, |
| "learning_rate": 1.0600000000000002e-05, |
| "loss": 18.0262, |
| "mean_token_accuracy": 0.7560815311968326, |
| "num_tokens": 41712243.0, |
| "step": 1570 |
| }, |
| { |
| "entropy": 1.1063744578510524, |
| "epoch": 2.8118393234672303, |
| "grad_norm": 18.0, |
| "learning_rate": 1.0525925925925927e-05, |
| "loss": 18.1967, |
| "mean_token_accuracy": 0.754052161052823, |
| "num_tokens": 41981755.0, |
| "step": 1580 |
| }, |
| { |
| "entropy": 1.0421814311295747, |
| "epoch": 2.8296428174029153, |
| "grad_norm": 18.625, |
| "learning_rate": 1.0451851851851853e-05, |
| "loss": 17.409, |
| "mean_token_accuracy": 0.7653664790093899, |
| "num_tokens": 42258270.0, |
| "step": 1590 |
| }, |
| { |
| "entropy": 1.0939543709158897, |
| "epoch": 2.8474463113386, |
| "grad_norm": 18.375, |
| "learning_rate": 1.0377777777777778e-05, |
| "loss": 18.0482, |
| "mean_token_accuracy": 0.7579692296683789, |
| "num_tokens": 42529321.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.8474463113386, |
| "eval_biology_entropy": 2.637769458770752, |
| "eval_biology_loss": 2.940894603729248, |
| "eval_biology_mean_token_accuracy": 0.5229103591442108, |
| "eval_biology_num_tokens": 42529321.0, |
| "eval_biology_runtime": 57.8557, |
| "eval_biology_samples_per_second": 8.642, |
| "eval_biology_steps_per_second": 2.161, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.8474463113386, |
| "eval_chemistry_entropy": 1.1125650115013122, |
| "eval_chemistry_loss": 1.160796046257019, |
| "eval_chemistry_mean_token_accuracy": 0.7501013536453247, |
| "eval_chemistry_num_tokens": 42529321.0, |
| "eval_chemistry_runtime": 72.2026, |
| "eval_chemistry_samples_per_second": 6.925, |
| "eval_chemistry_steps_per_second": 1.731, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.8474463113386, |
| "eval_math_entropy": 1.0801175408363342, |
| "eval_math_loss": 1.3694645166397095, |
| "eval_math_mean_token_accuracy": 0.7277571864128113, |
| "eval_math_num_tokens": 42529321.0, |
| "eval_math_runtime": 75.2884, |
| "eval_math_samples_per_second": 6.641, |
| "eval_math_steps_per_second": 1.66, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.8474463113386, |
| "eval_physics_entropy": 1.198549277305603, |
| "eval_physics_loss": 1.27517569065094, |
| "eval_physics_mean_token_accuracy": 0.7403683791160584, |
| "eval_physics_num_tokens": 42529321.0, |
| "eval_physics_runtime": 85.395, |
| "eval_physics_samples_per_second": 5.855, |
| "eval_physics_steps_per_second": 1.464, |
| "step": 1600 |
| }, |
| { |
| "entropy": 1.1290572371333838, |
| "epoch": 2.865249805274285, |
| "grad_norm": 20.5, |
| "learning_rate": 1.0303703703703705e-05, |
| "loss": 18.8437, |
| "mean_token_accuracy": 0.7473030380904675, |
| "num_tokens": 42793729.0, |
| "step": 1610 |
| }, |
| { |
| "entropy": 1.0939196426421405, |
| "epoch": 2.88305329920997, |
| "grad_norm": 19.375, |
| "learning_rate": 1.0229629629629631e-05, |
| "loss": 18.0962, |
| "mean_token_accuracy": 0.7554187960922718, |
| "num_tokens": 43067006.0, |
| "step": 1620 |
| }, |
| { |
| "entropy": 1.0769088421016932, |
| "epoch": 2.900856793145655, |
| "grad_norm": 24.0, |
| "learning_rate": 1.0155555555555556e-05, |
| "loss": 17.8392, |
| "mean_token_accuracy": 0.7592465952038765, |
| "num_tokens": 43333868.0, |
| "step": 1630 |
| }, |
| { |
| "entropy": 1.0659821335226298, |
| "epoch": 2.9186602870813396, |
| "grad_norm": 20.75, |
| "learning_rate": 1.0081481481481484e-05, |
| "loss": 17.6974, |
| "mean_token_accuracy": 0.7605714075267315, |
| "num_tokens": 43605151.0, |
| "step": 1640 |
| }, |
| { |
| "entropy": 1.0646285666152835, |
| "epoch": 2.9364637810170247, |
| "grad_norm": 19.625, |
| "learning_rate": 1.0007407407407407e-05, |
| "loss": 17.7336, |
| "mean_token_accuracy": 0.7622984137386084, |
| "num_tokens": 43863034.0, |
| "step": 1650 |
| }, |
| { |
| "entropy": 1.1087765533477068, |
| "epoch": 2.9542672749527092, |
| "grad_norm": 20.25, |
| "learning_rate": 9.933333333333334e-06, |
| "loss": 18.3884, |
| "mean_token_accuracy": 0.751496770605445, |
| "num_tokens": 44126796.0, |
| "step": 1660 |
| }, |
| { |
| "entropy": 1.1063459984958173, |
| "epoch": 2.9720707688883943, |
| "grad_norm": 21.5, |
| "learning_rate": 9.85925925925926e-06, |
| "loss": 18.4278, |
| "mean_token_accuracy": 0.7521345388144255, |
| "num_tokens": 44382724.0, |
| "step": 1670 |
| }, |
| { |
| "entropy": 1.0647218599915504, |
| "epoch": 2.9898742628240793, |
| "grad_norm": 18.875, |
| "learning_rate": 9.785185185185187e-06, |
| "loss": 17.7257, |
| "mean_token_accuracy": 0.7603258349001407, |
| "num_tokens": 44651471.0, |
| "step": 1680 |
| }, |
| { |
| "entropy": 1.0653223249220078, |
| "epoch": 3.007121397574274, |
| "grad_norm": 18.5, |
| "learning_rate": 9.711111111111111e-06, |
| "loss": 16.9743, |
| "mean_token_accuracy": 0.7601908556876644, |
| "num_tokens": 44909981.0, |
| "step": 1690 |
| }, |
| { |
| "entropy": 1.1704007178544997, |
| "epoch": 3.0249248915099587, |
| "grad_norm": 22.75, |
| "learning_rate": 9.637037037037038e-06, |
| "loss": 19.3967, |
| "mean_token_accuracy": 0.7418560739606619, |
| "num_tokens": 45176547.0, |
| "step": 1700 |
| }, |
| { |
| "epoch": 3.0249248915099587, |
| "eval_biology_entropy": 2.6161900300979615, |
| "eval_biology_loss": 2.935819625854492, |
| "eval_biology_mean_token_accuracy": 0.5230246422290802, |
| "eval_biology_num_tokens": 45176547.0, |
| "eval_biology_runtime": 57.8378, |
| "eval_biology_samples_per_second": 8.645, |
| "eval_biology_steps_per_second": 2.161, |
| "step": 1700 |
| }, |
| { |
| "epoch": 3.0249248915099587, |
| "eval_chemistry_entropy": 1.1025579800605774, |
| "eval_chemistry_loss": 1.1589548587799072, |
| "eval_chemistry_mean_token_accuracy": 0.7501291260719299, |
| "eval_chemistry_num_tokens": 45176547.0, |
| "eval_chemistry_runtime": 71.8656, |
| "eval_chemistry_samples_per_second": 6.957, |
| "eval_chemistry_steps_per_second": 1.739, |
| "step": 1700 |
| }, |
| { |
| "epoch": 3.0249248915099587, |
| "eval_math_entropy": 1.0748444156646728, |
| "eval_math_loss": 1.3698667287826538, |
| "eval_math_mean_token_accuracy": 0.7280403084754944, |
| "eval_math_num_tokens": 45176547.0, |
| "eval_math_runtime": 75.1807, |
| "eval_math_samples_per_second": 6.651, |
| "eval_math_steps_per_second": 1.663, |
| "step": 1700 |
| }, |
| { |
| "epoch": 3.0249248915099587, |
| "eval_physics_entropy": 1.1882516493797302, |
| "eval_physics_loss": 1.273923397064209, |
| "eval_physics_mean_token_accuracy": 0.7405211963653564, |
| "eval_physics_num_tokens": 45176547.0, |
| "eval_physics_runtime": 85.18, |
| "eval_physics_samples_per_second": 5.87, |
| "eval_physics_steps_per_second": 1.467, |
| "step": 1700 |
| }, |
| { |
| "entropy": 1.045224749110639, |
| "epoch": 3.0427283854456437, |
| "grad_norm": 21.625, |
| "learning_rate": 9.562962962962965e-06, |
| "loss": 17.5191, |
| "mean_token_accuracy": 0.7625730182975531, |
| "num_tokens": 45437181.0, |
| "step": 1710 |
| }, |
| { |
| "entropy": 1.0588873416185378, |
| "epoch": 3.0605318793813288, |
| "grad_norm": 21.0, |
| "learning_rate": 9.48888888888889e-06, |
| "loss": 17.5556, |
| "mean_token_accuracy": 0.7609021920710802, |
| "num_tokens": 45702431.0, |
| "step": 1720 |
| }, |
| { |
| "entropy": 1.0747338887304068, |
| "epoch": 3.0783353733170133, |
| "grad_norm": 18.25, |
| "learning_rate": 9.414814814814816e-06, |
| "loss": 17.8698, |
| "mean_token_accuracy": 0.757227075099945, |
| "num_tokens": 45975788.0, |
| "step": 1730 |
| }, |
| { |
| "entropy": 1.0600129183381797, |
| "epoch": 3.0961388672526984, |
| "grad_norm": 21.0, |
| "learning_rate": 9.34074074074074e-06, |
| "loss": 17.366, |
| "mean_token_accuracy": 0.7626322463154793, |
| "num_tokens": 46251349.0, |
| "step": 1740 |
| }, |
| { |
| "entropy": 1.109250448271632, |
| "epoch": 3.1139423611883834, |
| "grad_norm": 24.75, |
| "learning_rate": 9.266666666666667e-06, |
| "loss": 18.4852, |
| "mean_token_accuracy": 0.7508084613829851, |
| "num_tokens": 46516802.0, |
| "step": 1750 |
| }, |
| { |
| "entropy": 1.056485254317522, |
| "epoch": 3.131745855124068, |
| "grad_norm": 20.125, |
| "learning_rate": 9.192592592592594e-06, |
| "loss": 17.5278, |
| "mean_token_accuracy": 0.7627970885485411, |
| "num_tokens": 46790537.0, |
| "step": 1760 |
| }, |
| { |
| "entropy": 1.0896330252289772, |
| "epoch": 3.149549349059753, |
| "grad_norm": 21.5, |
| "learning_rate": 9.118518518518518e-06, |
| "loss": 18.1303, |
| "mean_token_accuracy": 0.7566281389445066, |
| "num_tokens": 47052877.0, |
| "step": 1770 |
| }, |
| { |
| "entropy": 1.0868952518329025, |
| "epoch": 3.167352842995438, |
| "grad_norm": 20.25, |
| "learning_rate": 9.044444444444445e-06, |
| "loss": 17.9403, |
| "mean_token_accuracy": 0.7568327851593495, |
| "num_tokens": 47315587.0, |
| "step": 1780 |
| }, |
| { |
| "entropy": 1.0584388840943575, |
| "epoch": 3.1851563369311227, |
| "grad_norm": 21.875, |
| "learning_rate": 8.970370370370372e-06, |
| "loss": 17.6068, |
| "mean_token_accuracy": 0.7634176205843687, |
| "num_tokens": 47578352.0, |
| "step": 1790 |
| }, |
| { |
| "entropy": 1.0782266601920127, |
| "epoch": 3.2029598308668077, |
| "grad_norm": 21.125, |
| "learning_rate": 8.896296296296298e-06, |
| "loss": 17.9251, |
| "mean_token_accuracy": 0.7589756917208433, |
| "num_tokens": 47844623.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 3.2029598308668077, |
| "eval_biology_entropy": 2.6365171356201174, |
| "eval_biology_loss": 2.932905673980713, |
| "eval_biology_mean_token_accuracy": 0.5235939712524414, |
| "eval_biology_num_tokens": 47844623.0, |
| "eval_biology_runtime": 57.6814, |
| "eval_biology_samples_per_second": 8.668, |
| "eval_biology_steps_per_second": 2.167, |
| "step": 1800 |
| }, |
| { |
| "epoch": 3.2029598308668077, |
| "eval_chemistry_entropy": 1.1065808815956115, |
| "eval_chemistry_loss": 1.157857060432434, |
| "eval_chemistry_mean_token_accuracy": 0.7504735474586487, |
| "eval_chemistry_num_tokens": 47844623.0, |
| "eval_chemistry_runtime": 72.0749, |
| "eval_chemistry_samples_per_second": 6.937, |
| "eval_chemistry_steps_per_second": 1.734, |
| "step": 1800 |
| }, |
| { |
| "epoch": 3.2029598308668077, |
| "eval_math_entropy": 1.078178656578064, |
| "eval_math_loss": 1.3683264255523682, |
| "eval_math_mean_token_accuracy": 0.7283754692077636, |
| "eval_math_num_tokens": 47844623.0, |
| "eval_math_runtime": 75.1466, |
| "eval_math_samples_per_second": 6.654, |
| "eval_math_steps_per_second": 1.663, |
| "step": 1800 |
| }, |
| { |
| "epoch": 3.2029598308668077, |
| "eval_physics_entropy": 1.193580945968628, |
| "eval_physics_loss": 1.2739607095718384, |
| "eval_physics_mean_token_accuracy": 0.7406387696266175, |
| "eval_physics_num_tokens": 47844623.0, |
| "eval_physics_runtime": 85.1683, |
| "eval_physics_samples_per_second": 5.871, |
| "eval_physics_steps_per_second": 1.468, |
| "step": 1800 |
| }, |
| { |
| "entropy": 1.0502044271677733, |
| "epoch": 3.2207633248024923, |
| "grad_norm": 19.25, |
| "learning_rate": 8.822222222222223e-06, |
| "loss": 17.532, |
| "mean_token_accuracy": 0.762039003893733, |
| "num_tokens": 48112262.0, |
| "step": 1810 |
| }, |
| { |
| "entropy": 1.1423117272555827, |
| "epoch": 3.2385668187381773, |
| "grad_norm": 21.125, |
| "learning_rate": 8.74814814814815e-06, |
| "loss": 19.0592, |
| "mean_token_accuracy": 0.7459153685718775, |
| "num_tokens": 48369097.0, |
| "step": 1820 |
| }, |
| { |
| "entropy": 1.0870308240875601, |
| "epoch": 3.2563703126738623, |
| "grad_norm": 20.375, |
| "learning_rate": 8.674074074074074e-06, |
| "loss": 17.9442, |
| "mean_token_accuracy": 0.7594770763069392, |
| "num_tokens": 48628165.0, |
| "step": 1830 |
| }, |
| { |
| "entropy": 1.0671832324936985, |
| "epoch": 3.274173806609547, |
| "grad_norm": 19.5, |
| "learning_rate": 8.6e-06, |
| "loss": 17.6901, |
| "mean_token_accuracy": 0.759853546321392, |
| "num_tokens": 48898634.0, |
| "step": 1840 |
| }, |
| { |
| "entropy": 1.0650055054575205, |
| "epoch": 3.291977300545232, |
| "grad_norm": 18.5, |
| "learning_rate": 8.525925925925927e-06, |
| "loss": 17.6381, |
| "mean_token_accuracy": 0.7601484149694443, |
| "num_tokens": 49167839.0, |
| "step": 1850 |
| }, |
| { |
| "entropy": 1.083081658370793, |
| "epoch": 3.309780794480917, |
| "grad_norm": 20.5, |
| "learning_rate": 8.451851851851852e-06, |
| "loss": 17.9985, |
| "mean_token_accuracy": 0.7583192996680737, |
| "num_tokens": 49431378.0, |
| "step": 1860 |
| }, |
| { |
| "entropy": 1.0654306124895812, |
| "epoch": 3.3275842884166016, |
| "grad_norm": 17.375, |
| "learning_rate": 8.377777777777779e-06, |
| "loss": 17.7162, |
| "mean_token_accuracy": 0.7611102845519782, |
| "num_tokens": 49699309.0, |
| "step": 1870 |
| }, |
| { |
| "entropy": 1.086764731630683, |
| "epoch": 3.3453877823522866, |
| "grad_norm": 18.625, |
| "learning_rate": 8.303703703703705e-06, |
| "loss": 18.0623, |
| "mean_token_accuracy": 0.7570337392389774, |
| "num_tokens": 49972032.0, |
| "step": 1880 |
| }, |
| { |
| "entropy": 1.092942675575614, |
| "epoch": 3.3631912762879717, |
| "grad_norm": 22.75, |
| "learning_rate": 8.229629629629632e-06, |
| "loss": 18.1498, |
| "mean_token_accuracy": 0.7554987825453281, |
| "num_tokens": 50241367.0, |
| "step": 1890 |
| }, |
| { |
| "entropy": 1.1048477381467818, |
| "epoch": 3.3809947702236562, |
| "grad_norm": 19.75, |
| "learning_rate": 8.155555555555556e-06, |
| "loss": 18.1831, |
| "mean_token_accuracy": 0.7542665727436543, |
| "num_tokens": 50511167.0, |
| "step": 1900 |
| }, |
| { |
| "epoch": 3.3809947702236562, |
| "eval_biology_entropy": 2.608997480392456, |
| "eval_biology_loss": 2.929826259613037, |
| "eval_biology_mean_token_accuracy": 0.5243550386428834, |
| "eval_biology_num_tokens": 50511167.0, |
| "eval_biology_runtime": 58.3475, |
| "eval_biology_samples_per_second": 8.569, |
| "eval_biology_steps_per_second": 2.142, |
| "step": 1900 |
| }, |
| { |
| "epoch": 3.3809947702236562, |
| "eval_chemistry_entropy": 1.1078306741714476, |
| "eval_chemistry_loss": 1.1564204692840576, |
| "eval_chemistry_mean_token_accuracy": 0.7505966582298279, |
| "eval_chemistry_num_tokens": 50511167.0, |
| "eval_chemistry_runtime": 72.2379, |
| "eval_chemistry_samples_per_second": 6.922, |
| "eval_chemistry_steps_per_second": 1.73, |
| "step": 1900 |
| }, |
| { |
| "epoch": 3.3809947702236562, |
| "eval_math_entropy": 1.0781858038902283, |
| "eval_math_loss": 1.3685622215270996, |
| "eval_math_mean_token_accuracy": 0.7281003012657166, |
| "eval_math_num_tokens": 50511167.0, |
| "eval_math_runtime": 75.3062, |
| "eval_math_samples_per_second": 6.64, |
| "eval_math_steps_per_second": 1.66, |
| "step": 1900 |
| }, |
| { |
| "epoch": 3.3809947702236562, |
| "eval_physics_entropy": 1.193738681793213, |
| "eval_physics_loss": 1.2733585834503174, |
| "eval_physics_mean_token_accuracy": 0.7406534767150879, |
| "eval_physics_num_tokens": 50511167.0, |
| "eval_physics_runtime": 85.3447, |
| "eval_physics_samples_per_second": 5.859, |
| "eval_physics_steps_per_second": 1.465, |
| "step": 1900 |
| }, |
| { |
| "entropy": 1.1158889718353748, |
| "epoch": 3.3987982641593413, |
| "grad_norm": 22.75, |
| "learning_rate": 8.081481481481483e-06, |
| "loss": 18.4653, |
| "mean_token_accuracy": 0.7502956230193376, |
| "num_tokens": 50776842.0, |
| "step": 1910 |
| }, |
| { |
| "entropy": 1.076490705087781, |
| "epoch": 3.4166017580950263, |
| "grad_norm": 20.75, |
| "learning_rate": 8.007407407407408e-06, |
| "loss": 17.9335, |
| "mean_token_accuracy": 0.7567251056432724, |
| "num_tokens": 51040514.0, |
| "step": 1920 |
| }, |
| { |
| "entropy": 1.0964046400040388, |
| "epoch": 3.434405252030711, |
| "grad_norm": 21.625, |
| "learning_rate": 7.933333333333334e-06, |
| "loss": 18.146, |
| "mean_token_accuracy": 0.7544326152652502, |
| "num_tokens": 51300696.0, |
| "step": 1930 |
| }, |
| { |
| "entropy": 1.087927882000804, |
| "epoch": 3.452208745966396, |
| "grad_norm": 21.75, |
| "learning_rate": 7.859259259259259e-06, |
| "loss": 18.1376, |
| "mean_token_accuracy": 0.754492249712348, |
| "num_tokens": 51558606.0, |
| "step": 1940 |
| }, |
| { |
| "entropy": 1.0904299218207598, |
| "epoch": 3.470012239902081, |
| "grad_norm": 18.75, |
| "learning_rate": 7.785185185185185e-06, |
| "loss": 18.0444, |
| "mean_token_accuracy": 0.7552218366414308, |
| "num_tokens": 51827834.0, |
| "step": 1950 |
| }, |
| { |
| "entropy": 1.059173485264182, |
| "epoch": 3.4878157338377656, |
| "grad_norm": 20.0, |
| "learning_rate": 7.711111111111112e-06, |
| "loss": 17.5376, |
| "mean_token_accuracy": 0.7617850303649902, |
| "num_tokens": 52100179.0, |
| "step": 1960 |
| }, |
| { |
| "entropy": 1.1043213743716478, |
| "epoch": 3.5056192277734506, |
| "grad_norm": 20.75, |
| "learning_rate": 7.637037037037037e-06, |
| "loss": 18.2923, |
| "mean_token_accuracy": 0.7549142900854349, |
| "num_tokens": 52368055.0, |
| "step": 1970 |
| }, |
| { |
| "entropy": 1.0561675556004047, |
| "epoch": 3.523422721709135, |
| "grad_norm": 19.625, |
| "learning_rate": 7.562962962962963e-06, |
| "loss": 17.583, |
| "mean_token_accuracy": 0.7620449144393205, |
| "num_tokens": 52637505.0, |
| "step": 1980 |
| }, |
| { |
| "entropy": 1.0883250068873167, |
| "epoch": 3.54122621564482, |
| "grad_norm": 20.625, |
| "learning_rate": 7.48888888888889e-06, |
| "loss": 18.0451, |
| "mean_token_accuracy": 0.7565963264554739, |
| "num_tokens": 52906194.0, |
| "step": 1990 |
| }, |
| { |
| "entropy": 1.0408041454851626, |
| "epoch": 3.5590297095805052, |
| "grad_norm": 19.625, |
| "learning_rate": 7.4148148148148155e-06, |
| "loss": 17.3497, |
| "mean_token_accuracy": 0.7656379960477352, |
| "num_tokens": 53172376.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 3.5590297095805052, |
| "eval_biology_entropy": 2.651417718887329, |
| "eval_biology_loss": 2.9278793334960938, |
| "eval_biology_mean_token_accuracy": 0.5243545579910278, |
| "eval_biology_num_tokens": 53172376.0, |
| "eval_biology_runtime": 57.5665, |
| "eval_biology_samples_per_second": 8.686, |
| "eval_biology_steps_per_second": 2.171, |
| "step": 2000 |
| }, |
| { |
| "epoch": 3.5590297095805052, |
| "eval_chemistry_entropy": 1.111885326385498, |
| "eval_chemistry_loss": 1.1551117897033691, |
| "eval_chemistry_mean_token_accuracy": 0.7506753826141357, |
| "eval_chemistry_num_tokens": 53172376.0, |
| "eval_chemistry_runtime": 71.4351, |
| "eval_chemistry_samples_per_second": 6.999, |
| "eval_chemistry_steps_per_second": 1.75, |
| "step": 2000 |
| }, |
| { |
| "epoch": 3.5590297095805052, |
| "eval_math_entropy": 1.081713596343994, |
| "eval_math_loss": 1.3672666549682617, |
| "eval_math_mean_token_accuracy": 0.7283283696174622, |
| "eval_math_num_tokens": 53172376.0, |
| "eval_math_runtime": 74.9256, |
| "eval_math_samples_per_second": 6.673, |
| "eval_math_steps_per_second": 1.668, |
| "step": 2000 |
| }, |
| { |
| "epoch": 3.5590297095805052, |
| "eval_physics_entropy": 1.2001786155700684, |
| "eval_physics_loss": 1.2717243432998657, |
| "eval_physics_mean_token_accuracy": 0.7409325709342957, |
| "eval_physics_num_tokens": 53172376.0, |
| "eval_physics_runtime": 84.6622, |
| "eval_physics_samples_per_second": 5.906, |
| "eval_physics_steps_per_second": 1.476, |
| "step": 2000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 3000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 6, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.556064716540213e+18, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|