| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.8, |
| "eval_steps": 100, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 0.7975468060001731, |
| "epoch": 0.016, |
| "grad_norm": 48.0, |
| "learning_rate": 1.8e-07, |
| "loss": 13.872, |
| "mean_token_accuracy": 0.7770325228571892, |
| "num_tokens": 338352.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 0.7971246171742677, |
| "epoch": 0.032, |
| "grad_norm": 38.75, |
| "learning_rate": 3.8e-07, |
| "loss": 13.8259, |
| "mean_token_accuracy": 0.7767761040478944, |
| "num_tokens": 671193.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 0.7927551301196217, |
| "epoch": 0.048, |
| "grad_norm": 36.25, |
| "learning_rate": 5.800000000000001e-07, |
| "loss": 13.6952, |
| "mean_token_accuracy": 0.7782795619219541, |
| "num_tokens": 1001008.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 0.8188681453466415, |
| "epoch": 0.064, |
| "grad_norm": 33.5, |
| "learning_rate": 7.8e-07, |
| "loss": 13.9837, |
| "mean_token_accuracy": 0.7739029213786125, |
| "num_tokens": 1331208.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.8227416690438986, |
| "epoch": 0.08, |
| "grad_norm": 32.75, |
| "learning_rate": 9.800000000000001e-07, |
| "loss": 13.7971, |
| "mean_token_accuracy": 0.7759626224637032, |
| "num_tokens": 1667532.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.809746851399541, |
| "epoch": 0.096, |
| "grad_norm": 34.0, |
| "learning_rate": 1.1800000000000001e-06, |
| "loss": 13.4702, |
| "mean_token_accuracy": 0.7815078292042017, |
| "num_tokens": 2007176.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.8254514675587415, |
| "epoch": 0.112, |
| "grad_norm": 36.75, |
| "learning_rate": 1.3800000000000001e-06, |
| "loss": 13.7471, |
| "mean_token_accuracy": 0.7760862343013286, |
| "num_tokens": 2342259.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.8527037408202887, |
| "epoch": 0.128, |
| "grad_norm": 27.125, |
| "learning_rate": 1.5800000000000001e-06, |
| "loss": 13.9233, |
| "mean_token_accuracy": 0.7734280046075582, |
| "num_tokens": 2663987.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.8274104345589877, |
| "epoch": 0.144, |
| "grad_norm": 25.75, |
| "learning_rate": 1.7800000000000001e-06, |
| "loss": 13.653, |
| "mean_token_accuracy": 0.7781244270503521, |
| "num_tokens": 2997824.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 0.8279923181980848, |
| "epoch": 0.16, |
| "grad_norm": 26.5, |
| "learning_rate": 1.98e-06, |
| "loss": 13.4841, |
| "mean_token_accuracy": 0.7816651199012995, |
| "num_tokens": 3330597.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.16, |
| "eval_biology_entropy": 1.125685109615326, |
| "eval_biology_loss": 1.1623297929763794, |
| "eval_biology_mean_token_accuracy": 0.7038317298889161, |
| "eval_biology_num_tokens": 3330597.0, |
| "eval_biology_runtime": 45.8822, |
| "eval_biology_samples_per_second": 10.897, |
| "eval_biology_steps_per_second": 2.724, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.16, |
| "eval_physics_entropy": 0.8375687637329101, |
| "eval_physics_loss": 0.8513404130935669, |
| "eval_physics_mean_token_accuracy": 0.7783295741081238, |
| "eval_physics_num_tokens": 3330597.0, |
| "eval_physics_runtime": 65.9246, |
| "eval_physics_samples_per_second": 7.584, |
| "eval_physics_steps_per_second": 1.896, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.8265087101608515, |
| "epoch": 0.176, |
| "grad_norm": 33.0, |
| "learning_rate": 2.1800000000000003e-06, |
| "loss": 13.4457, |
| "mean_token_accuracy": 0.7798256956040859, |
| "num_tokens": 3658264.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 0.8278129205107689, |
| "epoch": 0.192, |
| "grad_norm": 25.75, |
| "learning_rate": 2.38e-06, |
| "loss": 13.4175, |
| "mean_token_accuracy": 0.7793817535042763, |
| "num_tokens": 3995568.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.818607559427619, |
| "epoch": 0.208, |
| "grad_norm": 26.125, |
| "learning_rate": 2.5800000000000003e-06, |
| "loss": 13.2096, |
| "mean_token_accuracy": 0.7836348541080952, |
| "num_tokens": 4321436.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 0.8120364205911755, |
| "epoch": 0.224, |
| "grad_norm": 23.625, |
| "learning_rate": 2.7800000000000005e-06, |
| "loss": 13.0693, |
| "mean_token_accuracy": 0.7830110590904951, |
| "num_tokens": 4648491.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.8133310537785292, |
| "epoch": 0.24, |
| "grad_norm": 23.75, |
| "learning_rate": 2.9800000000000003e-06, |
| "loss": 13.1184, |
| "mean_token_accuracy": 0.7830653071403504, |
| "num_tokens": 4986175.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.8032114181667567, |
| "epoch": 0.256, |
| "grad_norm": 27.375, |
| "learning_rate": 3.1800000000000005e-06, |
| "loss": 13.0503, |
| "mean_token_accuracy": 0.7851274147629738, |
| "num_tokens": 5329320.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 0.8205402866005898, |
| "epoch": 0.272, |
| "grad_norm": 26.125, |
| "learning_rate": 3.3800000000000007e-06, |
| "loss": 13.1719, |
| "mean_token_accuracy": 0.7810409177094698, |
| "num_tokens": 5658796.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 0.8173610385507345, |
| "epoch": 0.288, |
| "grad_norm": 22.25, |
| "learning_rate": 3.58e-06, |
| "loss": 13.1428, |
| "mean_token_accuracy": 0.7840665753930807, |
| "num_tokens": 5980201.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 0.7997741607949138, |
| "epoch": 0.304, |
| "grad_norm": 24.875, |
| "learning_rate": 3.7800000000000002e-06, |
| "loss": 12.8822, |
| "mean_token_accuracy": 0.7862737070769071, |
| "num_tokens": 6298858.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 0.8092747095972299, |
| "epoch": 0.32, |
| "grad_norm": 22.875, |
| "learning_rate": 3.980000000000001e-06, |
| "loss": 13.0185, |
| "mean_token_accuracy": 0.784797790274024, |
| "num_tokens": 6622798.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.32, |
| "eval_biology_entropy": 1.1380381927490235, |
| "eval_biology_loss": 1.1702971458435059, |
| "eval_biology_mean_token_accuracy": 0.7033402571678161, |
| "eval_biology_num_tokens": 6622798.0, |
| "eval_biology_runtime": 175.631, |
| "eval_biology_samples_per_second": 2.847, |
| "eval_biology_steps_per_second": 0.712, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.32, |
| "eval_physics_entropy": 0.812020037651062, |
| "eval_physics_loss": 0.8106491565704346, |
| "eval_physics_mean_token_accuracy": 0.7852507462501526, |
| "eval_physics_num_tokens": 6622798.0, |
| "eval_physics_runtime": 200.6075, |
| "eval_physics_samples_per_second": 2.492, |
| "eval_physics_steps_per_second": 0.623, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.797424552589655, |
| "epoch": 0.336, |
| "grad_norm": 21.5, |
| "learning_rate": 4.18e-06, |
| "loss": 12.8378, |
| "mean_token_accuracy": 0.7871691755950451, |
| "num_tokens": 6953611.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 0.7838323166593909, |
| "epoch": 0.352, |
| "grad_norm": 24.0, |
| "learning_rate": 4.38e-06, |
| "loss": 12.6327, |
| "mean_token_accuracy": 0.7907786477357149, |
| "num_tokens": 7290378.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 0.7731631100177765, |
| "epoch": 0.368, |
| "grad_norm": 23.25, |
| "learning_rate": 4.58e-06, |
| "loss": 12.3982, |
| "mean_token_accuracy": 0.7930114820599556, |
| "num_tokens": 7621082.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 0.801326191239059, |
| "epoch": 0.384, |
| "grad_norm": 22.75, |
| "learning_rate": 4.78e-06, |
| "loss": 12.8917, |
| "mean_token_accuracy": 0.7840922765433789, |
| "num_tokens": 7955570.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 0.7741426464170218, |
| "epoch": 0.4, |
| "grad_norm": 23.875, |
| "learning_rate": 4.980000000000001e-06, |
| "loss": 12.4671, |
| "mean_token_accuracy": 0.791225866228342, |
| "num_tokens": 8291049.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.7628269851207733, |
| "epoch": 0.416, |
| "grad_norm": 22.125, |
| "learning_rate": 5.18e-06, |
| "loss": 12.2906, |
| "mean_token_accuracy": 0.7947692718356848, |
| "num_tokens": 8627310.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 0.7752459084615111, |
| "epoch": 0.432, |
| "grad_norm": 23.125, |
| "learning_rate": 5.380000000000001e-06, |
| "loss": 12.4973, |
| "mean_token_accuracy": 0.7905333787202835, |
| "num_tokens": 8958371.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 0.7545073958113789, |
| "epoch": 0.448, |
| "grad_norm": 22.625, |
| "learning_rate": 5.580000000000001e-06, |
| "loss": 12.1817, |
| "mean_token_accuracy": 0.7961019795387984, |
| "num_tokens": 9294388.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 0.7482077127322555, |
| "epoch": 0.464, |
| "grad_norm": 23.375, |
| "learning_rate": 5.78e-06, |
| "loss": 11.9906, |
| "mean_token_accuracy": 0.7969086967408657, |
| "num_tokens": 9620366.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 0.7531273065134882, |
| "epoch": 0.48, |
| "grad_norm": 21.0, |
| "learning_rate": 5.98e-06, |
| "loss": 12.0906, |
| "mean_token_accuracy": 0.7956845626235008, |
| "num_tokens": 9955431.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.48, |
| "eval_biology_entropy": 1.1282176275253295, |
| "eval_biology_loss": 1.176755428314209, |
| "eval_biology_mean_token_accuracy": 0.7018080081939697, |
| "eval_biology_num_tokens": 9955431.0, |
| "eval_biology_runtime": 45.9288, |
| "eval_biology_samples_per_second": 10.886, |
| "eval_biology_steps_per_second": 2.722, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.48, |
| "eval_physics_entropy": 0.7810471363067627, |
| "eval_physics_loss": 0.7771082520484924, |
| "eval_physics_mean_token_accuracy": 0.7920530920028687, |
| "eval_physics_num_tokens": 9955431.0, |
| "eval_physics_runtime": 66.0018, |
| "eval_physics_samples_per_second": 7.576, |
| "eval_physics_steps_per_second": 1.894, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.7689633307978511, |
| "epoch": 0.496, |
| "grad_norm": 20.875, |
| "learning_rate": 6.18e-06, |
| "loss": 12.3289, |
| "mean_token_accuracy": 0.7935276433825493, |
| "num_tokens": 10289883.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 0.7578680694103241, |
| "epoch": 0.512, |
| "grad_norm": 25.0, |
| "learning_rate": 6.380000000000001e-06, |
| "loss": 12.2227, |
| "mean_token_accuracy": 0.7949301645159721, |
| "num_tokens": 10619468.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 0.7704175990074873, |
| "epoch": 0.528, |
| "grad_norm": 20.75, |
| "learning_rate": 6.5800000000000005e-06, |
| "loss": 12.3943, |
| "mean_token_accuracy": 0.7915358774363994, |
| "num_tokens": 10944904.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 0.7533261435106396, |
| "epoch": 0.544, |
| "grad_norm": 23.125, |
| "learning_rate": 6.780000000000001e-06, |
| "loss": 12.1385, |
| "mean_token_accuracy": 0.7953089620918036, |
| "num_tokens": 11266409.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 0.7507271701470017, |
| "epoch": 0.56, |
| "grad_norm": 18.875, |
| "learning_rate": 6.98e-06, |
| "loss": 12.0437, |
| "mean_token_accuracy": 0.79745435975492, |
| "num_tokens": 11605544.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.7390375791117549, |
| "epoch": 0.576, |
| "grad_norm": 21.625, |
| "learning_rate": 7.180000000000001e-06, |
| "loss": 11.8508, |
| "mean_token_accuracy": 0.8004897948354482, |
| "num_tokens": 11931715.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 0.7514585722237825, |
| "epoch": 0.592, |
| "grad_norm": 21.5, |
| "learning_rate": 7.3800000000000005e-06, |
| "loss": 12.0218, |
| "mean_token_accuracy": 0.7965340100228786, |
| "num_tokens": 12257243.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 0.7325067885220051, |
| "epoch": 0.608, |
| "grad_norm": 20.75, |
| "learning_rate": 7.58e-06, |
| "loss": 11.7926, |
| "mean_token_accuracy": 0.7993212066590786, |
| "num_tokens": 12583154.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 0.7487934850156307, |
| "epoch": 0.624, |
| "grad_norm": 21.0, |
| "learning_rate": 7.78e-06, |
| "loss": 12.053, |
| "mean_token_accuracy": 0.7958210565149784, |
| "num_tokens": 12905392.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 0.7440792236477136, |
| "epoch": 0.64, |
| "grad_norm": 18.75, |
| "learning_rate": 7.980000000000002e-06, |
| "loss": 11.9643, |
| "mean_token_accuracy": 0.7981198724359274, |
| "num_tokens": 13232198.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.64, |
| "eval_biology_entropy": 1.0903304896354675, |
| "eval_biology_loss": 1.182153344154358, |
| "eval_biology_mean_token_accuracy": 0.7013925290107728, |
| "eval_biology_num_tokens": 13232198.0, |
| "eval_biology_runtime": 45.8492, |
| "eval_biology_samples_per_second": 10.905, |
| "eval_biology_steps_per_second": 2.726, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.64, |
| "eval_physics_entropy": 0.7371547894477845, |
| "eval_physics_loss": 0.7496699094772339, |
| "eval_physics_mean_token_accuracy": 0.7974319715499878, |
| "eval_physics_num_tokens": 13232198.0, |
| "eval_physics_runtime": 65.8732, |
| "eval_physics_samples_per_second": 7.59, |
| "eval_physics_steps_per_second": 1.898, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.7319347187876701, |
| "epoch": 0.656, |
| "grad_norm": 18.375, |
| "learning_rate": 8.18e-06, |
| "loss": 11.7795, |
| "mean_token_accuracy": 0.8000467628240585, |
| "num_tokens": 13575902.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 0.71892513576895, |
| "epoch": 0.672, |
| "grad_norm": 21.25, |
| "learning_rate": 8.380000000000001e-06, |
| "loss": 11.525, |
| "mean_token_accuracy": 0.8044146560132504, |
| "num_tokens": 13895997.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 0.7239193903282285, |
| "epoch": 0.688, |
| "grad_norm": 21.375, |
| "learning_rate": 8.580000000000001e-06, |
| "loss": 11.6384, |
| "mean_token_accuracy": 0.8023904841393232, |
| "num_tokens": 14234888.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 0.7392614649608731, |
| "epoch": 0.704, |
| "grad_norm": 21.875, |
| "learning_rate": 8.78e-06, |
| "loss": 11.9131, |
| "mean_token_accuracy": 0.7982321321964264, |
| "num_tokens": 14567908.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 0.7193062495440244, |
| "epoch": 0.72, |
| "grad_norm": 23.5, |
| "learning_rate": 8.98e-06, |
| "loss": 11.5749, |
| "mean_token_accuracy": 0.8031982038170099, |
| "num_tokens": 14882927.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.7137783830985427, |
| "epoch": 0.736, |
| "grad_norm": 22.375, |
| "learning_rate": 9.180000000000002e-06, |
| "loss": 11.4717, |
| "mean_token_accuracy": 0.8033147465437651, |
| "num_tokens": 15217342.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 0.7088428331539035, |
| "epoch": 0.752, |
| "grad_norm": 18.75, |
| "learning_rate": 9.38e-06, |
| "loss": 11.392, |
| "mean_token_accuracy": 0.8049089256674051, |
| "num_tokens": 15547710.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 0.7304569650441408, |
| "epoch": 0.768, |
| "grad_norm": 20.625, |
| "learning_rate": 9.58e-06, |
| "loss": 11.7448, |
| "mean_token_accuracy": 0.7994368057698011, |
| "num_tokens": 15877177.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 0.7059216756373644, |
| "epoch": 0.784, |
| "grad_norm": 18.75, |
| "learning_rate": 9.780000000000001e-06, |
| "loss": 11.3295, |
| "mean_token_accuracy": 0.806659733131528, |
| "num_tokens": 16219640.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 0.712453056499362, |
| "epoch": 0.8, |
| "grad_norm": 19.125, |
| "learning_rate": 9.980000000000001e-06, |
| "loss": 11.4991, |
| "mean_token_accuracy": 0.8039865717291832, |
| "num_tokens": 16548261.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8, |
| "eval_biology_entropy": 1.1249133114814758, |
| "eval_biology_loss": 1.1854252815246582, |
| "eval_biology_mean_token_accuracy": 0.7005919570922852, |
| "eval_biology_num_tokens": 16548261.0, |
| "eval_biology_runtime": 45.6964, |
| "eval_biology_samples_per_second": 10.942, |
| "eval_biology_steps_per_second": 2.735, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8, |
| "eval_physics_entropy": 0.730017361164093, |
| "eval_physics_loss": 0.724478542804718, |
| "eval_physics_mean_token_accuracy": 0.8027530283927917, |
| "eval_physics_num_tokens": 16548261.0, |
| "eval_physics_runtime": 65.8504, |
| "eval_physics_samples_per_second": 7.593, |
| "eval_physics_steps_per_second": 1.898, |
| "step": 500 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 10000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 16, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.652179086470234e+18, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|