{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8, "eval_steps": 100, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.7975468060001731, "epoch": 0.016, "grad_norm": 48.0, "learning_rate": 1.8e-07, "loss": 13.872, "mean_token_accuracy": 0.7770325228571892, "num_tokens": 338352.0, "step": 10 }, { "entropy": 0.7971246171742677, "epoch": 0.032, "grad_norm": 38.75, "learning_rate": 3.8e-07, "loss": 13.8259, "mean_token_accuracy": 0.7767761040478944, "num_tokens": 671193.0, "step": 20 }, { "entropy": 0.7927551301196217, "epoch": 0.048, "grad_norm": 36.25, "learning_rate": 5.800000000000001e-07, "loss": 13.6952, "mean_token_accuracy": 0.7782795619219541, "num_tokens": 1001008.0, "step": 30 }, { "entropy": 0.8188681453466415, "epoch": 0.064, "grad_norm": 33.5, "learning_rate": 7.8e-07, "loss": 13.9837, "mean_token_accuracy": 0.7739029213786125, "num_tokens": 1331208.0, "step": 40 }, { "entropy": 0.8227416690438986, "epoch": 0.08, "grad_norm": 32.75, "learning_rate": 9.800000000000001e-07, "loss": 13.7971, "mean_token_accuracy": 0.7759626224637032, "num_tokens": 1667532.0, "step": 50 }, { "entropy": 0.809746851399541, "epoch": 0.096, "grad_norm": 34.0, "learning_rate": 1.1800000000000001e-06, "loss": 13.4702, "mean_token_accuracy": 0.7815078292042017, "num_tokens": 2007176.0, "step": 60 }, { "entropy": 0.8254514675587415, "epoch": 0.112, "grad_norm": 36.75, "learning_rate": 1.3800000000000001e-06, "loss": 13.7471, "mean_token_accuracy": 0.7760862343013286, "num_tokens": 2342259.0, "step": 70 }, { "entropy": 0.8527037408202887, "epoch": 0.128, "grad_norm": 27.125, "learning_rate": 1.5800000000000001e-06, "loss": 13.9233, "mean_token_accuracy": 0.7734280046075582, "num_tokens": 2663987.0, "step": 80 }, { "entropy": 0.8274104345589877, "epoch": 0.144, "grad_norm": 25.75, "learning_rate": 1.7800000000000001e-06, "loss": 13.653, "mean_token_accuracy": 0.7781244270503521, "num_tokens": 2997824.0, "step": 90 }, { "entropy": 0.8279923181980848, "epoch": 0.16, "grad_norm": 26.5, "learning_rate": 1.98e-06, "loss": 13.4841, "mean_token_accuracy": 0.7816651199012995, "num_tokens": 3330597.0, "step": 100 }, { "epoch": 0.16, "eval_biology_entropy": 1.125685109615326, "eval_biology_loss": 1.1623297929763794, "eval_biology_mean_token_accuracy": 0.7038317298889161, "eval_biology_num_tokens": 3330597.0, "eval_biology_runtime": 45.8822, "eval_biology_samples_per_second": 10.897, "eval_biology_steps_per_second": 2.724, "step": 100 }, { "epoch": 0.16, "eval_physics_entropy": 0.8375687637329101, "eval_physics_loss": 0.8513404130935669, "eval_physics_mean_token_accuracy": 0.7783295741081238, "eval_physics_num_tokens": 3330597.0, "eval_physics_runtime": 65.9246, "eval_physics_samples_per_second": 7.584, "eval_physics_steps_per_second": 1.896, "step": 100 }, { "entropy": 0.8265087101608515, "epoch": 0.176, "grad_norm": 33.0, "learning_rate": 2.1800000000000003e-06, "loss": 13.4457, "mean_token_accuracy": 0.7798256956040859, "num_tokens": 3658264.0, "step": 110 }, { "entropy": 0.8278129205107689, "epoch": 0.192, "grad_norm": 25.75, "learning_rate": 2.38e-06, "loss": 13.4175, "mean_token_accuracy": 0.7793817535042763, "num_tokens": 3995568.0, "step": 120 }, { "entropy": 0.818607559427619, "epoch": 0.208, "grad_norm": 26.125, "learning_rate": 2.5800000000000003e-06, "loss": 13.2096, "mean_token_accuracy": 0.7836348541080952, "num_tokens": 4321436.0, "step": 130 }, { "entropy": 0.8120364205911755, "epoch": 0.224, "grad_norm": 23.625, "learning_rate": 2.7800000000000005e-06, "loss": 13.0693, "mean_token_accuracy": 0.7830110590904951, "num_tokens": 4648491.0, "step": 140 }, { "entropy": 0.8133310537785292, "epoch": 0.24, "grad_norm": 23.75, "learning_rate": 2.9800000000000003e-06, "loss": 13.1184, "mean_token_accuracy": 0.7830653071403504, "num_tokens": 4986175.0, "step": 150 }, { "entropy": 0.8032114181667567, "epoch": 0.256, "grad_norm": 27.375, "learning_rate": 3.1800000000000005e-06, "loss": 13.0503, "mean_token_accuracy": 0.7851274147629738, "num_tokens": 5329320.0, "step": 160 }, { "entropy": 0.8205402866005898, "epoch": 0.272, "grad_norm": 26.125, "learning_rate": 3.3800000000000007e-06, "loss": 13.1719, "mean_token_accuracy": 0.7810409177094698, "num_tokens": 5658796.0, "step": 170 }, { "entropy": 0.8173610385507345, "epoch": 0.288, "grad_norm": 22.25, "learning_rate": 3.58e-06, "loss": 13.1428, "mean_token_accuracy": 0.7840665753930807, "num_tokens": 5980201.0, "step": 180 }, { "entropy": 0.7997741607949138, "epoch": 0.304, "grad_norm": 24.875, "learning_rate": 3.7800000000000002e-06, "loss": 12.8822, "mean_token_accuracy": 0.7862737070769071, "num_tokens": 6298858.0, "step": 190 }, { "entropy": 0.8092747095972299, "epoch": 0.32, "grad_norm": 22.875, "learning_rate": 3.980000000000001e-06, "loss": 13.0185, "mean_token_accuracy": 0.784797790274024, "num_tokens": 6622798.0, "step": 200 }, { "epoch": 0.32, "eval_biology_entropy": 1.1380381927490235, "eval_biology_loss": 1.1702971458435059, "eval_biology_mean_token_accuracy": 0.7033402571678161, "eval_biology_num_tokens": 6622798.0, "eval_biology_runtime": 175.631, "eval_biology_samples_per_second": 2.847, "eval_biology_steps_per_second": 0.712, "step": 200 }, { "epoch": 0.32, "eval_physics_entropy": 0.812020037651062, "eval_physics_loss": 0.8106491565704346, "eval_physics_mean_token_accuracy": 0.7852507462501526, "eval_physics_num_tokens": 6622798.0, "eval_physics_runtime": 200.6075, "eval_physics_samples_per_second": 2.492, "eval_physics_steps_per_second": 0.623, "step": 200 }, { "entropy": 0.797424552589655, "epoch": 0.336, "grad_norm": 21.5, "learning_rate": 4.18e-06, "loss": 12.8378, "mean_token_accuracy": 0.7871691755950451, "num_tokens": 6953611.0, "step": 210 }, { "entropy": 0.7838323166593909, "epoch": 0.352, "grad_norm": 24.0, "learning_rate": 4.38e-06, "loss": 12.6327, "mean_token_accuracy": 0.7907786477357149, "num_tokens": 7290378.0, "step": 220 }, { "entropy": 0.7731631100177765, "epoch": 0.368, "grad_norm": 23.25, "learning_rate": 4.58e-06, "loss": 12.3982, "mean_token_accuracy": 0.7930114820599556, "num_tokens": 7621082.0, "step": 230 }, { "entropy": 0.801326191239059, "epoch": 0.384, "grad_norm": 22.75, "learning_rate": 4.78e-06, "loss": 12.8917, "mean_token_accuracy": 0.7840922765433789, "num_tokens": 7955570.0, "step": 240 }, { "entropy": 0.7741426464170218, "epoch": 0.4, "grad_norm": 23.875, "learning_rate": 4.980000000000001e-06, "loss": 12.4671, "mean_token_accuracy": 0.791225866228342, "num_tokens": 8291049.0, "step": 250 }, { "entropy": 0.7628269851207733, "epoch": 0.416, "grad_norm": 22.125, "learning_rate": 5.18e-06, "loss": 12.2906, "mean_token_accuracy": 0.7947692718356848, "num_tokens": 8627310.0, "step": 260 }, { "entropy": 0.7752459084615111, "epoch": 0.432, "grad_norm": 23.125, "learning_rate": 5.380000000000001e-06, "loss": 12.4973, "mean_token_accuracy": 0.7905333787202835, "num_tokens": 8958371.0, "step": 270 }, { "entropy": 0.7545073958113789, "epoch": 0.448, "grad_norm": 22.625, "learning_rate": 5.580000000000001e-06, "loss": 12.1817, "mean_token_accuracy": 0.7961019795387984, "num_tokens": 9294388.0, "step": 280 }, { "entropy": 0.7482077127322555, "epoch": 0.464, "grad_norm": 23.375, "learning_rate": 5.78e-06, "loss": 11.9906, "mean_token_accuracy": 0.7969086967408657, "num_tokens": 9620366.0, "step": 290 }, { "entropy": 0.7531273065134882, "epoch": 0.48, "grad_norm": 21.0, "learning_rate": 5.98e-06, "loss": 12.0906, "mean_token_accuracy": 0.7956845626235008, "num_tokens": 9955431.0, "step": 300 }, { "epoch": 0.48, "eval_biology_entropy": 1.1282176275253295, "eval_biology_loss": 1.176755428314209, "eval_biology_mean_token_accuracy": 0.7018080081939697, "eval_biology_num_tokens": 9955431.0, "eval_biology_runtime": 45.9288, "eval_biology_samples_per_second": 10.886, "eval_biology_steps_per_second": 2.722, "step": 300 }, { "epoch": 0.48, "eval_physics_entropy": 0.7810471363067627, "eval_physics_loss": 0.7771082520484924, "eval_physics_mean_token_accuracy": 0.7920530920028687, "eval_physics_num_tokens": 9955431.0, "eval_physics_runtime": 66.0018, "eval_physics_samples_per_second": 7.576, "eval_physics_steps_per_second": 1.894, "step": 300 }, { "entropy": 0.7689633307978511, "epoch": 0.496, "grad_norm": 20.875, "learning_rate": 6.18e-06, "loss": 12.3289, "mean_token_accuracy": 0.7935276433825493, "num_tokens": 10289883.0, "step": 310 }, { "entropy": 0.7578680694103241, "epoch": 0.512, "grad_norm": 25.0, "learning_rate": 6.380000000000001e-06, "loss": 12.2227, "mean_token_accuracy": 0.7949301645159721, "num_tokens": 10619468.0, "step": 320 }, { "entropy": 0.7704175990074873, "epoch": 0.528, "grad_norm": 20.75, "learning_rate": 6.5800000000000005e-06, "loss": 12.3943, "mean_token_accuracy": 0.7915358774363994, "num_tokens": 10944904.0, "step": 330 }, { "entropy": 0.7533261435106396, "epoch": 0.544, "grad_norm": 23.125, "learning_rate": 6.780000000000001e-06, "loss": 12.1385, "mean_token_accuracy": 0.7953089620918036, "num_tokens": 11266409.0, "step": 340 }, { "entropy": 0.7507271701470017, "epoch": 0.56, "grad_norm": 18.875, "learning_rate": 6.98e-06, "loss": 12.0437, "mean_token_accuracy": 0.79745435975492, "num_tokens": 11605544.0, "step": 350 }, { "entropy": 0.7390375791117549, "epoch": 0.576, "grad_norm": 21.625, "learning_rate": 7.180000000000001e-06, "loss": 11.8508, "mean_token_accuracy": 0.8004897948354482, "num_tokens": 11931715.0, "step": 360 }, { "entropy": 0.7514585722237825, "epoch": 0.592, "grad_norm": 21.5, "learning_rate": 7.3800000000000005e-06, "loss": 12.0218, "mean_token_accuracy": 0.7965340100228786, "num_tokens": 12257243.0, "step": 370 }, { "entropy": 0.7325067885220051, "epoch": 0.608, "grad_norm": 20.75, "learning_rate": 7.58e-06, "loss": 11.7926, "mean_token_accuracy": 0.7993212066590786, "num_tokens": 12583154.0, "step": 380 }, { "entropy": 0.7487934850156307, "epoch": 0.624, "grad_norm": 21.0, "learning_rate": 7.78e-06, "loss": 12.053, "mean_token_accuracy": 0.7958210565149784, "num_tokens": 12905392.0, "step": 390 }, { "entropy": 0.7440792236477136, "epoch": 0.64, "grad_norm": 18.75, "learning_rate": 7.980000000000002e-06, "loss": 11.9643, "mean_token_accuracy": 0.7981198724359274, "num_tokens": 13232198.0, "step": 400 }, { "epoch": 0.64, "eval_biology_entropy": 1.0903304896354675, "eval_biology_loss": 1.182153344154358, "eval_biology_mean_token_accuracy": 0.7013925290107728, "eval_biology_num_tokens": 13232198.0, "eval_biology_runtime": 45.8492, "eval_biology_samples_per_second": 10.905, "eval_biology_steps_per_second": 2.726, "step": 400 }, { "epoch": 0.64, "eval_physics_entropy": 0.7371547894477845, "eval_physics_loss": 0.7496699094772339, "eval_physics_mean_token_accuracy": 0.7974319715499878, "eval_physics_num_tokens": 13232198.0, "eval_physics_runtime": 65.8732, "eval_physics_samples_per_second": 7.59, "eval_physics_steps_per_second": 1.898, "step": 400 }, { "entropy": 0.7319347187876701, "epoch": 0.656, "grad_norm": 18.375, "learning_rate": 8.18e-06, "loss": 11.7795, "mean_token_accuracy": 0.8000467628240585, "num_tokens": 13575902.0, "step": 410 }, { "entropy": 0.71892513576895, "epoch": 0.672, "grad_norm": 21.25, "learning_rate": 8.380000000000001e-06, "loss": 11.525, "mean_token_accuracy": 0.8044146560132504, "num_tokens": 13895997.0, "step": 420 }, { "entropy": 0.7239193903282285, "epoch": 0.688, "grad_norm": 21.375, "learning_rate": 8.580000000000001e-06, "loss": 11.6384, "mean_token_accuracy": 0.8023904841393232, "num_tokens": 14234888.0, "step": 430 }, { "entropy": 0.7392614649608731, "epoch": 0.704, "grad_norm": 21.875, "learning_rate": 8.78e-06, "loss": 11.9131, "mean_token_accuracy": 0.7982321321964264, "num_tokens": 14567908.0, "step": 440 }, { "entropy": 0.7193062495440244, "epoch": 0.72, "grad_norm": 23.5, "learning_rate": 8.98e-06, "loss": 11.5749, "mean_token_accuracy": 0.8031982038170099, "num_tokens": 14882927.0, "step": 450 }, { "entropy": 0.7137783830985427, "epoch": 0.736, "grad_norm": 22.375, "learning_rate": 9.180000000000002e-06, "loss": 11.4717, "mean_token_accuracy": 0.8033147465437651, "num_tokens": 15217342.0, "step": 460 }, { "entropy": 0.7088428331539035, "epoch": 0.752, "grad_norm": 18.75, "learning_rate": 9.38e-06, "loss": 11.392, "mean_token_accuracy": 0.8049089256674051, "num_tokens": 15547710.0, "step": 470 }, { "entropy": 0.7304569650441408, "epoch": 0.768, "grad_norm": 20.625, "learning_rate": 9.58e-06, "loss": 11.7448, "mean_token_accuracy": 0.7994368057698011, "num_tokens": 15877177.0, "step": 480 }, { "entropy": 0.7059216756373644, "epoch": 0.784, "grad_norm": 18.75, "learning_rate": 9.780000000000001e-06, "loss": 11.3295, "mean_token_accuracy": 0.806659733131528, "num_tokens": 16219640.0, "step": 490 }, { "entropy": 0.712453056499362, "epoch": 0.8, "grad_norm": 19.125, "learning_rate": 9.980000000000001e-06, "loss": 11.4991, "mean_token_accuracy": 0.8039865717291832, "num_tokens": 16548261.0, "step": 500 }, { "epoch": 0.8, "eval_biology_entropy": 1.1249133114814758, "eval_biology_loss": 1.1854252815246582, "eval_biology_mean_token_accuracy": 0.7005919570922852, "eval_biology_num_tokens": 16548261.0, "eval_biology_runtime": 45.6964, "eval_biology_samples_per_second": 10.942, "eval_biology_steps_per_second": 2.735, "step": 500 }, { "epoch": 0.8, "eval_physics_entropy": 0.730017361164093, "eval_physics_loss": 0.724478542804718, "eval_physics_mean_token_accuracy": 0.8027530283927917, "eval_physics_num_tokens": 16548261.0, "eval_physics_runtime": 65.8504, "eval_physics_samples_per_second": 7.593, "eval_physics_steps_per_second": 1.898, "step": 500 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 16, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.652179086470234e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }