{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8, "eval_steps": 100, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.4832931827753781, "epoch": 0.016, "grad_norm": 145.0, "learning_rate": 6.000000000000001e-07, "loss": 35.8107, "mean_token_accuracy": 0.5946215284988284, "num_tokens": 338352.0, "step": 10 }, { "entropy": 1.4921929031610488, "epoch": 0.032, "grad_norm": 139.0, "learning_rate": 1.2666666666666669e-06, "loss": 35.6429, "mean_token_accuracy": 0.5987227037549019, "num_tokens": 671193.0, "step": 20 }, { "entropy": 1.5176027774810792, "epoch": 0.048, "grad_norm": 120.0, "learning_rate": 1.9333333333333336e-06, "loss": 34.7928, "mean_token_accuracy": 0.6018522759899497, "num_tokens": 1001008.0, "step": 30 }, { "entropy": 1.6286215644329787, "epoch": 0.064, "grad_norm": 94.0, "learning_rate": 2.6e-06, "loss": 34.3313, "mean_token_accuracy": 0.5992800608277321, "num_tokens": 1331208.0, "step": 40 }, { "entropy": 1.6960565708577633, "epoch": 0.08, "grad_norm": 70.0, "learning_rate": 3.266666666666667e-06, "loss": 31.8029, "mean_token_accuracy": 0.6178880449384451, "num_tokens": 1667532.0, "step": 50 }, { "entropy": 1.6291456781327724, "epoch": 0.096, "grad_norm": 46.25, "learning_rate": 3.9333333333333335e-06, "loss": 28.5587, "mean_token_accuracy": 0.6474138081073761, "num_tokens": 2007176.0, "step": 60 }, { "entropy": 1.6484372481703757, "epoch": 0.112, "grad_norm": 38.75, "learning_rate": 4.600000000000001e-06, "loss": 27.1023, "mean_token_accuracy": 0.6544294429942965, "num_tokens": 2342259.0, "step": 70 }, { "entropy": 1.6191057547926904, "epoch": 0.128, "grad_norm": 31.625, "learning_rate": 5.2666666666666665e-06, "loss": 25.9215, "mean_token_accuracy": 0.6600364238023758, "num_tokens": 2663987.0, "step": 80 }, { "entropy": 1.4830621395260095, "epoch": 0.144, "grad_norm": 23.25, "learning_rate": 5.933333333333335e-06, "loss": 23.6014, "mean_token_accuracy": 0.6834562920033932, "num_tokens": 2997824.0, "step": 90 }, { "entropy": 1.426737917587161, "epoch": 0.16, "grad_norm": 21.25, "learning_rate": 6.600000000000001e-06, "loss": 22.4685, "mean_token_accuracy": 0.6952182710170746, "num_tokens": 3330597.0, "step": 100 }, { "epoch": 0.16, "eval_biology_entropy": 3.0376483097076417, "eval_biology_loss": 3.875791549682617, "eval_biology_mean_token_accuracy": 0.41260506939888003, "eval_biology_num_tokens": 3330597.0, "eval_biology_runtime": 49.8433, "eval_biology_samples_per_second": 10.031, "eval_biology_steps_per_second": 2.508, "step": 100 }, { "epoch": 0.16, "eval_chemistry_entropy": 1.7663879008293153, "eval_chemistry_loss": 1.816375732421875, "eval_chemistry_mean_token_accuracy": 0.6312954430580139, "eval_chemistry_num_tokens": 3330597.0, "eval_chemistry_runtime": 61.8081, "eval_chemistry_samples_per_second": 8.09, "eval_chemistry_steps_per_second": 2.022, "step": 100 }, { "epoch": 0.16, "eval_math_entropy": 1.1983131990432738, "eval_math_loss": 1.485450029373169, "eval_math_mean_token_accuracy": 0.6917262263298035, "eval_math_num_tokens": 3330597.0, "eval_math_runtime": 63.351, "eval_math_samples_per_second": 7.893, "eval_math_steps_per_second": 1.973, "step": 100 }, { "epoch": 0.16, "eval_physics_entropy": 1.4052113437652587, "eval_physics_loss": 1.3870269060134888, "eval_physics_mean_token_accuracy": 0.69519624376297, "eval_physics_num_tokens": 3330597.0, "eval_physics_runtime": 72.2152, "eval_physics_samples_per_second": 6.924, "eval_physics_steps_per_second": 1.731, "step": 100 }, { "entropy": 1.4028880976140499, "epoch": 0.176, "grad_norm": 23.125, "learning_rate": 7.266666666666668e-06, "loss": 21.8268, "mean_token_accuracy": 0.6981963120400906, "num_tokens": 3658264.0, "step": 110 }, { "entropy": 1.3301444135606288, "epoch": 0.192, "grad_norm": 17.375, "learning_rate": 7.933333333333334e-06, "loss": 20.9681, "mean_token_accuracy": 0.7033475987613201, "num_tokens": 3995568.0, "step": 120 }, { "entropy": 1.2701803509145975, "epoch": 0.208, "grad_norm": 18.25, "learning_rate": 8.6e-06, "loss": 19.9173, "mean_token_accuracy": 0.7150526810437441, "num_tokens": 4321436.0, "step": 130 }, { "entropy": 1.2350537855178119, "epoch": 0.224, "grad_norm": 16.625, "learning_rate": 9.266666666666667e-06, "loss": 19.5032, "mean_token_accuracy": 0.717908713966608, "num_tokens": 4648491.0, "step": 140 }, { "entropy": 1.2139144621789455, "epoch": 0.24, "grad_norm": 20.0, "learning_rate": 9.933333333333334e-06, "loss": 19.3976, "mean_token_accuracy": 0.7190249029546976, "num_tokens": 4986175.0, "step": 150 }, { "entropy": 1.2034844245761633, "epoch": 0.256, "grad_norm": 18.125, "learning_rate": 1.0600000000000002e-05, "loss": 19.0035, "mean_token_accuracy": 0.7240409322082997, "num_tokens": 5329320.0, "step": 160 }, { "entropy": 1.19539747312665, "epoch": 0.272, "grad_norm": 19.125, "learning_rate": 1.1266666666666668e-05, "loss": 18.9941, "mean_token_accuracy": 0.7230293404310941, "num_tokens": 5658796.0, "step": 170 }, { "entropy": 1.1726421393454074, "epoch": 0.288, "grad_norm": 16.75, "learning_rate": 1.1933333333333335e-05, "loss": 18.6802, "mean_token_accuracy": 0.7282810874283314, "num_tokens": 5980201.0, "step": 180 }, { "entropy": 1.16094581335783, "epoch": 0.304, "grad_norm": 20.5, "learning_rate": 1.2600000000000001e-05, "loss": 18.4063, "mean_token_accuracy": 0.7304705370217561, "num_tokens": 6298858.0, "step": 190 }, { "entropy": 1.1420196149498225, "epoch": 0.32, "grad_norm": 18.375, "learning_rate": 1.3266666666666668e-05, "loss": 18.2034, "mean_token_accuracy": 0.7318542957305908, "num_tokens": 6622798.0, "step": 200 }, { "epoch": 0.32, "eval_biology_entropy": 2.920998822212219, "eval_biology_loss": 3.363135814666748, "eval_biology_mean_token_accuracy": 0.4486829369068146, "eval_biology_num_tokens": 6622798.0, "eval_biology_runtime": 49.8146, "eval_biology_samples_per_second": 10.037, "eval_biology_steps_per_second": 2.509, "step": 200 }, { "epoch": 0.32, "eval_chemistry_entropy": 1.4927990851402282, "eval_chemistry_loss": 1.5301882028579712, "eval_chemistry_mean_token_accuracy": 0.6676614291667938, "eval_chemistry_num_tokens": 6622798.0, "eval_chemistry_runtime": 61.7412, "eval_chemistry_samples_per_second": 8.098, "eval_chemistry_steps_per_second": 2.025, "step": 200 }, { "epoch": 0.32, "eval_math_entropy": 1.087874176979065, "eval_math_loss": 1.2984504699707031, "eval_math_mean_token_accuracy": 0.7132572102546691, "eval_math_num_tokens": 6622798.0, "eval_math_runtime": 63.3429, "eval_math_samples_per_second": 7.894, "eval_math_steps_per_second": 1.973, "step": 200 }, { "epoch": 0.32, "eval_physics_entropy": 1.1195718150138856, "eval_physics_loss": 1.1357909440994263, "eval_physics_mean_token_accuracy": 0.7332857141494751, "eval_physics_num_tokens": 6622798.0, "eval_physics_runtime": 72.1176, "eval_physics_samples_per_second": 6.933, "eval_physics_steps_per_second": 1.733, "step": 200 }, { "entropy": 1.1236778676509858, "epoch": 0.336, "grad_norm": 14.25, "learning_rate": 1.3933333333333334e-05, "loss": 17.9874, "mean_token_accuracy": 0.7355770241469145, "num_tokens": 6953611.0, "step": 210 }, { "entropy": 1.0880652230232954, "epoch": 0.352, "grad_norm": 18.0, "learning_rate": 1.46e-05, "loss": 17.3415, "mean_token_accuracy": 0.7432329464703799, "num_tokens": 7290378.0, "step": 220 }, { "entropy": 1.0688236601650716, "epoch": 0.368, "grad_norm": 17.875, "learning_rate": 1.5266666666666667e-05, "loss": 17.0985, "mean_token_accuracy": 0.7455301389098168, "num_tokens": 7621082.0, "step": 230 }, { "entropy": 1.1190654184669255, "epoch": 0.384, "grad_norm": 17.375, "learning_rate": 1.5933333333333336e-05, "loss": 17.8397, "mean_token_accuracy": 0.7339698273688555, "num_tokens": 7955570.0, "step": 240 }, { "entropy": 1.0598117262125015, "epoch": 0.4, "grad_norm": 18.0, "learning_rate": 1.66e-05, "loss": 16.9636, "mean_token_accuracy": 0.7449962131679058, "num_tokens": 8291049.0, "step": 250 }, { "entropy": 1.0420986939221621, "epoch": 0.416, "grad_norm": 16.625, "learning_rate": 1.726666666666667e-05, "loss": 16.6593, "mean_token_accuracy": 0.7490336105227471, "num_tokens": 8627310.0, "step": 260 }, { "entropy": 1.0523861989378929, "epoch": 0.432, "grad_norm": 19.0, "learning_rate": 1.7933333333333333e-05, "loss": 16.8565, "mean_token_accuracy": 0.7475517597049475, "num_tokens": 8958371.0, "step": 270 }, { "entropy": 1.0093163922429085, "epoch": 0.448, "grad_norm": 16.75, "learning_rate": 1.86e-05, "loss": 16.2024, "mean_token_accuracy": 0.7556655522435903, "num_tokens": 9294388.0, "step": 280 }, { "entropy": 1.006963584944606, "epoch": 0.464, "grad_norm": 17.875, "learning_rate": 1.926666666666667e-05, "loss": 16.0211, "mean_token_accuracy": 0.7555320005863905, "num_tokens": 9620366.0, "step": 290 }, { "entropy": 1.0118312349542975, "epoch": 0.48, "grad_norm": 18.25, "learning_rate": 1.9933333333333334e-05, "loss": 16.2777, "mean_token_accuracy": 0.7526854898780584, "num_tokens": 9955431.0, "step": 300 }, { "epoch": 0.48, "eval_biology_entropy": 2.8107905416488648, "eval_biology_loss": 3.2179975509643555, "eval_biology_mean_token_accuracy": 0.46178966522216797, "eval_biology_num_tokens": 9955431.0, "eval_biology_runtime": 49.8514, "eval_biology_samples_per_second": 10.03, "eval_biology_steps_per_second": 2.507, "step": 300 }, { "epoch": 0.48, "eval_chemistry_entropy": 1.406659827709198, "eval_chemistry_loss": 1.4287575483322144, "eval_chemistry_mean_token_accuracy": 0.6827028579711915, "eval_chemistry_num_tokens": 9955431.0, "eval_chemistry_runtime": 61.8676, "eval_chemistry_samples_per_second": 8.082, "eval_chemistry_steps_per_second": 2.02, "step": 300 }, { "epoch": 0.48, "eval_math_entropy": 1.0421963691711427, "eval_math_loss": 1.2314904928207397, "eval_math_mean_token_accuracy": 0.7236453113555908, "eval_math_num_tokens": 9955431.0, "eval_math_runtime": 63.3661, "eval_math_samples_per_second": 7.891, "eval_math_steps_per_second": 1.973, "step": 300 }, { "epoch": 0.48, "eval_physics_entropy": 1.0351716361045837, "eval_physics_loss": 1.0356038808822632, "eval_physics_mean_token_accuracy": 0.7498980407714844, "eval_physics_num_tokens": 9955431.0, "eval_physics_runtime": 72.1536, "eval_physics_samples_per_second": 6.93, "eval_physics_steps_per_second": 1.732, "step": 300 }, { "entropy": 1.0259150266647339, "epoch": 0.496, "grad_norm": 15.8125, "learning_rate": 1.9933333333333334e-05, "loss": 16.4439, "mean_token_accuracy": 0.7514661472290755, "num_tokens": 10289883.0, "step": 310 }, { "entropy": 1.0062489442527294, "epoch": 0.512, "grad_norm": 19.5, "learning_rate": 1.985925925925926e-05, "loss": 16.2084, "mean_token_accuracy": 0.7547577202320099, "num_tokens": 10619468.0, "step": 320 }, { "entropy": 1.0211657840758561, "epoch": 0.528, "grad_norm": 17.0, "learning_rate": 1.9785185185185187e-05, "loss": 16.3421, "mean_token_accuracy": 0.75145508274436, "num_tokens": 10944904.0, "step": 330 }, { "entropy": 1.0024307239800692, "epoch": 0.544, "grad_norm": 17.375, "learning_rate": 1.971111111111111e-05, "loss": 16.1168, "mean_token_accuracy": 0.7545490644872188, "num_tokens": 11266409.0, "step": 340 }, { "entropy": 0.9814366064965725, "epoch": 0.56, "grad_norm": 15.6875, "learning_rate": 1.963703703703704e-05, "loss": 15.6974, "mean_token_accuracy": 0.759765500202775, "num_tokens": 11605544.0, "step": 350 }, { "entropy": 0.974255072697997, "epoch": 0.576, "grad_norm": 17.625, "learning_rate": 1.9562962962962964e-05, "loss": 15.6108, "mean_token_accuracy": 0.762177936732769, "num_tokens": 11931715.0, "step": 360 }, { "entropy": 0.9886789865791797, "epoch": 0.592, "grad_norm": 17.75, "learning_rate": 1.948888888888889e-05, "loss": 15.8091, "mean_token_accuracy": 0.7574512097984553, "num_tokens": 12257243.0, "step": 370 }, { "entropy": 0.9611615493893624, "epoch": 0.608, "grad_norm": 15.5625, "learning_rate": 1.9414814814814817e-05, "loss": 15.4695, "mean_token_accuracy": 0.7621630631387234, "num_tokens": 12583154.0, "step": 380 }, { "entropy": 0.9780517250299454, "epoch": 0.624, "grad_norm": 16.25, "learning_rate": 1.9340740740740743e-05, "loss": 15.7482, "mean_token_accuracy": 0.7585811104625464, "num_tokens": 12905392.0, "step": 390 }, { "entropy": 0.9820186741650104, "epoch": 0.64, "grad_norm": 15.6875, "learning_rate": 1.926666666666667e-05, "loss": 15.7619, "mean_token_accuracy": 0.7589227013289929, "num_tokens": 13232198.0, "step": 400 }, { "epoch": 0.64, "eval_biology_entropy": 2.7366416778564453, "eval_biology_loss": 3.0968964099884033, "eval_biology_mean_token_accuracy": 0.47410999417304994, "eval_biology_num_tokens": 13232198.0, "eval_biology_runtime": 49.8326, "eval_biology_samples_per_second": 10.034, "eval_biology_steps_per_second": 2.508, "step": 400 }, { "epoch": 0.64, "eval_chemistry_entropy": 1.3320539832115172, "eval_chemistry_loss": 1.3684444427490234, "eval_chemistry_mean_token_accuracy": 0.693352394104004, "eval_chemistry_num_tokens": 13232198.0, "eval_chemistry_runtime": 61.7471, "eval_chemistry_samples_per_second": 8.098, "eval_chemistry_steps_per_second": 2.024, "step": 400 }, { "epoch": 0.64, "eval_math_entropy": 0.9903823466300964, "eval_math_loss": 1.1989392042160034, "eval_math_mean_token_accuracy": 0.7292539710998536, "eval_math_num_tokens": 13232198.0, "eval_math_runtime": 63.2924, "eval_math_samples_per_second": 7.9, "eval_math_steps_per_second": 1.975, "step": 400 }, { "epoch": 0.64, "eval_physics_entropy": 0.9698497958183289, "eval_physics_loss": 0.9801211357116699, "eval_physics_mean_token_accuracy": 0.7597138810157776, "eval_physics_num_tokens": 13232198.0, "eval_physics_runtime": 72.1491, "eval_physics_samples_per_second": 6.93, "eval_physics_steps_per_second": 1.733, "step": 400 }, { "entropy": 0.9678926695138216, "epoch": 0.656, "grad_norm": 17.625, "learning_rate": 1.9192592592592593e-05, "loss": 15.5332, "mean_token_accuracy": 0.7619860425591469, "num_tokens": 13575902.0, "step": 410 }, { "entropy": 0.9481775723397732, "epoch": 0.672, "grad_norm": 18.875, "learning_rate": 1.911851851851852e-05, "loss": 15.1784, "mean_token_accuracy": 0.7657071366906166, "num_tokens": 13895997.0, "step": 420 }, { "entropy": 0.9527117524296045, "epoch": 0.688, "grad_norm": 16.125, "learning_rate": 1.9044444444444446e-05, "loss": 15.2982, "mean_token_accuracy": 0.7649280011653901, "num_tokens": 14234888.0, "step": 430 }, { "entropy": 0.9615373708307743, "epoch": 0.704, "grad_norm": 17.25, "learning_rate": 1.8970370370370372e-05, "loss": 15.5115, "mean_token_accuracy": 0.7601466745138168, "num_tokens": 14567908.0, "step": 440 }, { "entropy": 0.9485624451190233, "epoch": 0.72, "grad_norm": 18.5, "learning_rate": 1.8896296296296295e-05, "loss": 15.1719, "mean_token_accuracy": 0.7653266172856092, "num_tokens": 14882927.0, "step": 450 }, { "entropy": 0.9433321110904217, "epoch": 0.736, "grad_norm": 18.375, "learning_rate": 1.8822222222222225e-05, "loss": 15.1447, "mean_token_accuracy": 0.765366930142045, "num_tokens": 15217342.0, "step": 460 }, { "entropy": 0.9253723874688149, "epoch": 0.752, "grad_norm": 16.75, "learning_rate": 1.874814814814815e-05, "loss": 14.7655, "mean_token_accuracy": 0.7691089898347855, "num_tokens": 15547710.0, "step": 470 }, { "entropy": 0.9615132443606853, "epoch": 0.768, "grad_norm": 17.375, "learning_rate": 1.8674074074074075e-05, "loss": 15.4996, "mean_token_accuracy": 0.7604613497853279, "num_tokens": 15877177.0, "step": 480 }, { "entropy": 0.9211561907082796, "epoch": 0.784, "grad_norm": 14.875, "learning_rate": 1.86e-05, "loss": 14.8121, "mean_token_accuracy": 0.7703294314444065, "num_tokens": 16219640.0, "step": 490 }, { "entropy": 0.939649223536253, "epoch": 0.8, "grad_norm": 16.75, "learning_rate": 1.8525925925925928e-05, "loss": 15.0708, "mean_token_accuracy": 0.7668382901698351, "num_tokens": 16548261.0, "step": 500 }, { "epoch": 0.8, "eval_biology_entropy": 2.762967661857605, "eval_biology_loss": 3.090928792953491, "eval_biology_mean_token_accuracy": 0.47848794198036193, "eval_biology_num_tokens": 16548261.0, "eval_biology_runtime": 50.0005, "eval_biology_samples_per_second": 10.0, "eval_biology_steps_per_second": 2.5, "step": 500 }, { "epoch": 0.8, "eval_chemistry_entropy": 1.333887234210968, "eval_chemistry_loss": 1.340030550956726, "eval_chemistry_mean_token_accuracy": 0.6984462032318115, "eval_chemistry_num_tokens": 16548261.0, "eval_chemistry_runtime": 61.3452, "eval_chemistry_samples_per_second": 8.151, "eval_chemistry_steps_per_second": 2.038, "step": 500 }, { "epoch": 0.8, "eval_math_entropy": 0.9834395766258239, "eval_math_loss": 1.1809327602386475, "eval_math_mean_token_accuracy": 0.7322900333404541, "eval_math_num_tokens": 16548261.0, "eval_math_runtime": 63.2493, "eval_math_samples_per_second": 7.905, "eval_math_steps_per_second": 1.976, "step": 500 }, { "epoch": 0.8, "eval_physics_entropy": 0.9553510432243347, "eval_physics_loss": 0.948321521282196, "eval_physics_mean_token_accuracy": 0.7648878183364868, "eval_physics_num_tokens": 16548261.0, "eval_physics_runtime": 72.2375, "eval_physics_samples_per_second": 6.922, "eval_physics_steps_per_second": 1.73, "step": 500 } ], "logging_steps": 10, "max_steps": 3000, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.652179086470234e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }