| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.4, |
| "eval_steps": 100, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 1.9169743306934834, |
| "epoch": 0.008, |
| "grad_norm": 0.10595703125, |
| "learning_rate": 1.8e-07, |
| "loss": 11.4093, |
| "mean_token_accuracy": 0.16113518364727497, |
| "num_tokens": 103172.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 1.943360574543476, |
| "epoch": 0.016, |
| "grad_norm": 0.11328125, |
| "learning_rate": 3.8e-07, |
| "loss": 11.3724, |
| "mean_token_accuracy": 0.15745336734689772, |
| "num_tokens": 195524.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 1.8853021122515201, |
| "epoch": 0.024, |
| "grad_norm": 0.11376953125, |
| "learning_rate": 5.800000000000001e-07, |
| "loss": 11.4536, |
| "mean_token_accuracy": 0.1584921860601753, |
| "num_tokens": 296999.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 1.9313011743128299, |
| "epoch": 0.032, |
| "grad_norm": 0.11962890625, |
| "learning_rate": 7.8e-07, |
| "loss": 11.4176, |
| "mean_token_accuracy": 0.15730738821439444, |
| "num_tokens": 390903.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 1.886887714266777, |
| "epoch": 0.04, |
| "grad_norm": 0.11083984375, |
| "learning_rate": 9.800000000000001e-07, |
| "loss": 11.362, |
| "mean_token_accuracy": 0.15966884773224593, |
| "num_tokens": 488481.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 1.8928033113479614, |
| "epoch": 0.048, |
| "grad_norm": 0.10791015625, |
| "learning_rate": 1.1800000000000001e-06, |
| "loss": 11.4662, |
| "mean_token_accuracy": 0.1601271564140916, |
| "num_tokens": 589868.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 1.9126614801585675, |
| "epoch": 0.056, |
| "grad_norm": 0.11083984375, |
| "learning_rate": 1.3800000000000001e-06, |
| "loss": 11.4393, |
| "mean_token_accuracy": 0.15678977984935044, |
| "num_tokens": 690754.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 1.8982849732041358, |
| "epoch": 0.064, |
| "grad_norm": 0.11279296875, |
| "learning_rate": 1.5800000000000001e-06, |
| "loss": 11.428, |
| "mean_token_accuracy": 0.15833127587102352, |
| "num_tokens": 791190.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 1.9196412533521652, |
| "epoch": 0.072, |
| "grad_norm": 0.11181640625, |
| "learning_rate": 1.7800000000000001e-06, |
| "loss": 11.3783, |
| "mean_token_accuracy": 0.1575094529427588, |
| "num_tokens": 890075.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 1.8977599568665027, |
| "epoch": 0.08, |
| "grad_norm": 0.1142578125, |
| "learning_rate": 1.98e-06, |
| "loss": 11.3874, |
| "mean_token_accuracy": 0.16100947242230176, |
| "num_tokens": 989860.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.08, |
| "eval_coding_entropy": 1.000721924304962, |
| "eval_coding_loss": 8.267828941345215, |
| "eval_coding_mean_token_accuracy": 0.2908177390098572, |
| "eval_coding_num_tokens": 989860.0, |
| "eval_coding_runtime": 94.2996, |
| "eval_coding_samples_per_second": 5.302, |
| "eval_coding_steps_per_second": 2.651, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.08, |
| "eval_biology_entropy": 1.8729813146591185, |
| "eval_biology_loss": 11.396076202392578, |
| "eval_biology_mean_token_accuracy": 0.1593981314599514, |
| "eval_biology_num_tokens": 989860.0, |
| "eval_biology_runtime": 41.3672, |
| "eval_biology_samples_per_second": 12.087, |
| "eval_biology_steps_per_second": 6.043, |
| "step": 100 |
| }, |
| { |
| "entropy": 1.9323325648903846, |
| "epoch": 0.088, |
| "grad_norm": 0.109375, |
| "learning_rate": 2.1800000000000003e-06, |
| "loss": 11.43, |
| "mean_token_accuracy": 0.15889165913686157, |
| "num_tokens": 1087099.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 1.9131557896733284, |
| "epoch": 0.096, |
| "grad_norm": 0.1123046875, |
| "learning_rate": 2.38e-06, |
| "loss": 11.2832, |
| "mean_token_accuracy": 0.15686040292494, |
| "num_tokens": 1181777.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 1.8847123883664607, |
| "epoch": 0.104, |
| "grad_norm": 0.1181640625, |
| "learning_rate": 2.5800000000000003e-06, |
| "loss": 11.5591, |
| "mean_token_accuracy": 0.15738505702465772, |
| "num_tokens": 1283881.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 1.871863854676485, |
| "epoch": 0.112, |
| "grad_norm": 0.10888671875, |
| "learning_rate": 2.7800000000000005e-06, |
| "loss": 11.3822, |
| "mean_token_accuracy": 0.15778318694792687, |
| "num_tokens": 1385513.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 1.9166233658790588, |
| "epoch": 0.12, |
| "grad_norm": 0.115234375, |
| "learning_rate": 2.9800000000000003e-06, |
| "loss": 11.3895, |
| "mean_token_accuracy": 0.15918795759789645, |
| "num_tokens": 1483667.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 1.902534269541502, |
| "epoch": 0.128, |
| "grad_norm": 0.10888671875, |
| "learning_rate": 3.1800000000000005e-06, |
| "loss": 11.4745, |
| "mean_token_accuracy": 0.15784057211130859, |
| "num_tokens": 1582368.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 1.9250910609960556, |
| "epoch": 0.136, |
| "grad_norm": 0.1015625, |
| "learning_rate": 3.3800000000000007e-06, |
| "loss": 11.3645, |
| "mean_token_accuracy": 0.15877335243858398, |
| "num_tokens": 1677060.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 1.888841236382723, |
| "epoch": 0.144, |
| "grad_norm": 0.1201171875, |
| "learning_rate": 3.58e-06, |
| "loss": 11.3131, |
| "mean_token_accuracy": 0.15988174122758209, |
| "num_tokens": 1773764.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 1.923802337050438, |
| "epoch": 0.152, |
| "grad_norm": 0.10986328125, |
| "learning_rate": 3.7800000000000002e-06, |
| "loss": 11.4321, |
| "mean_token_accuracy": 0.15674821990542115, |
| "num_tokens": 1873167.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 1.913605347275734, |
| "epoch": 0.16, |
| "grad_norm": 0.10693359375, |
| "learning_rate": 3.980000000000001e-06, |
| "loss": 11.5801, |
| "mean_token_accuracy": 0.1564559088088572, |
| "num_tokens": 1970077.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.16, |
| "eval_coding_entropy": 1.000721924304962, |
| "eval_coding_loss": 8.267828941345215, |
| "eval_coding_mean_token_accuracy": 0.2908177390098572, |
| "eval_coding_num_tokens": 1970077.0, |
| "eval_coding_runtime": 93.1599, |
| "eval_coding_samples_per_second": 5.367, |
| "eval_coding_steps_per_second": 2.684, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.16, |
| "eval_biology_entropy": 1.8729813146591185, |
| "eval_biology_loss": 11.396076202392578, |
| "eval_biology_mean_token_accuracy": 0.1593981314599514, |
| "eval_biology_num_tokens": 1970077.0, |
| "eval_biology_runtime": 41.0689, |
| "eval_biology_samples_per_second": 12.175, |
| "eval_biology_steps_per_second": 6.087, |
| "step": 200 |
| }, |
| { |
| "entropy": 1.9281546145677566, |
| "epoch": 0.168, |
| "grad_norm": 0.11083984375, |
| "learning_rate": 4.18e-06, |
| "loss": 11.4462, |
| "mean_token_accuracy": 0.15564582953229547, |
| "num_tokens": 2066815.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 1.883149204403162, |
| "epoch": 0.176, |
| "grad_norm": 0.103515625, |
| "learning_rate": 4.38e-06, |
| "loss": 11.4588, |
| "mean_token_accuracy": 0.1579661637544632, |
| "num_tokens": 2168354.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 1.914576094597578, |
| "epoch": 0.184, |
| "grad_norm": 0.10400390625, |
| "learning_rate": 4.58e-06, |
| "loss": 11.3803, |
| "mean_token_accuracy": 0.15757576958276331, |
| "num_tokens": 2270617.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 1.9133810736238956, |
| "epoch": 0.192, |
| "grad_norm": 0.11083984375, |
| "learning_rate": 4.78e-06, |
| "loss": 11.5562, |
| "mean_token_accuracy": 0.15540659767575563, |
| "num_tokens": 2365822.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 1.9234349891543387, |
| "epoch": 0.2, |
| "grad_norm": 0.1142578125, |
| "learning_rate": 4.980000000000001e-06, |
| "loss": 11.5222, |
| "mean_token_accuracy": 0.15323229427449406, |
| "num_tokens": 2459876.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 1.9399859428405761, |
| "epoch": 0.208, |
| "grad_norm": 0.109375, |
| "learning_rate": 5.18e-06, |
| "loss": 11.428, |
| "mean_token_accuracy": 0.1552271238528192, |
| "num_tokens": 2558762.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 1.9029304042458535, |
| "epoch": 0.216, |
| "grad_norm": 0.10205078125, |
| "learning_rate": 5.380000000000001e-06, |
| "loss": 11.2925, |
| "mean_token_accuracy": 0.16210160069167615, |
| "num_tokens": 2654953.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 1.9189352929592132, |
| "epoch": 0.224, |
| "grad_norm": 0.09912109375, |
| "learning_rate": 5.580000000000001e-06, |
| "loss": 11.3849, |
| "mean_token_accuracy": 0.15769596104510128, |
| "num_tokens": 2755347.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 1.8914753369987012, |
| "epoch": 0.232, |
| "grad_norm": 0.1181640625, |
| "learning_rate": 5.78e-06, |
| "loss": 11.4031, |
| "mean_token_accuracy": 0.16200053002685308, |
| "num_tokens": 2852298.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 1.9092405900359153, |
| "epoch": 0.24, |
| "grad_norm": 0.10302734375, |
| "learning_rate": 5.98e-06, |
| "loss": 11.2327, |
| "mean_token_accuracy": 0.16136883310973643, |
| "num_tokens": 2947346.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.24, |
| "eval_coding_entropy": 1.000721924304962, |
| "eval_coding_loss": 8.267828941345215, |
| "eval_coding_mean_token_accuracy": 0.2908177390098572, |
| "eval_coding_num_tokens": 2947346.0, |
| "eval_coding_runtime": 93.3688, |
| "eval_coding_samples_per_second": 5.355, |
| "eval_coding_steps_per_second": 2.678, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.24, |
| "eval_biology_entropy": 1.8729813146591185, |
| "eval_biology_loss": 11.396076202392578, |
| "eval_biology_mean_token_accuracy": 0.1593981314599514, |
| "eval_biology_num_tokens": 2947346.0, |
| "eval_biology_runtime": 41.7366, |
| "eval_biology_samples_per_second": 11.98, |
| "eval_biology_steps_per_second": 5.99, |
| "step": 300 |
| }, |
| { |
| "entropy": 1.9313343465328217, |
| "epoch": 0.248, |
| "grad_norm": 0.1123046875, |
| "learning_rate": 6.18e-06, |
| "loss": 11.5425, |
| "mean_token_accuracy": 0.15317737096920608, |
| "num_tokens": 3043128.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 1.9211706645786761, |
| "epoch": 0.256, |
| "grad_norm": 0.1083984375, |
| "learning_rate": 6.380000000000001e-06, |
| "loss": 11.4729, |
| "mean_token_accuracy": 0.1547949264757335, |
| "num_tokens": 3139957.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 1.923046988248825, |
| "epoch": 0.264, |
| "grad_norm": 0.10693359375, |
| "learning_rate": 6.5800000000000005e-06, |
| "loss": 11.4159, |
| "mean_token_accuracy": 0.15573964624200015, |
| "num_tokens": 3232159.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 1.8901807874441148, |
| "epoch": 0.272, |
| "grad_norm": 0.1083984375, |
| "learning_rate": 6.780000000000001e-06, |
| "loss": 11.3543, |
| "mean_token_accuracy": 0.1617777702398598, |
| "num_tokens": 3335951.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 1.8801799044013023, |
| "epoch": 0.28, |
| "grad_norm": 0.10498046875, |
| "learning_rate": 6.98e-06, |
| "loss": 11.4879, |
| "mean_token_accuracy": 0.15775063186883925, |
| "num_tokens": 3437213.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 1.9043901436030866, |
| "epoch": 0.288, |
| "grad_norm": 0.11474609375, |
| "learning_rate": 7.180000000000001e-06, |
| "loss": 11.5366, |
| "mean_token_accuracy": 0.15751364254392683, |
| "num_tokens": 3539731.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 1.9588028222322464, |
| "epoch": 0.296, |
| "grad_norm": 0.1064453125, |
| "learning_rate": 7.3800000000000005e-06, |
| "loss": 11.3894, |
| "mean_token_accuracy": 0.153670167690143, |
| "num_tokens": 3633448.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 1.881439681351185, |
| "epoch": 0.304, |
| "grad_norm": 0.1064453125, |
| "learning_rate": 7.58e-06, |
| "loss": 11.3016, |
| "mean_token_accuracy": 0.16050010859034955, |
| "num_tokens": 3733488.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 1.9542286798357964, |
| "epoch": 0.312, |
| "grad_norm": 0.103515625, |
| "learning_rate": 7.78e-06, |
| "loss": 11.273, |
| "mean_token_accuracy": 0.15929239662364125, |
| "num_tokens": 3825212.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 1.9287514887750148, |
| "epoch": 0.32, |
| "grad_norm": 0.10107421875, |
| "learning_rate": 7.980000000000002e-06, |
| "loss": 11.3054, |
| "mean_token_accuracy": 0.1604186396114528, |
| "num_tokens": 3920545.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.32, |
| "eval_coding_entropy": 1.000721924304962, |
| "eval_coding_loss": 8.267828941345215, |
| "eval_coding_mean_token_accuracy": 0.2908177390098572, |
| "eval_coding_num_tokens": 3920545.0, |
| "eval_coding_runtime": 92.8995, |
| "eval_coding_samples_per_second": 5.382, |
| "eval_coding_steps_per_second": 2.691, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.32, |
| "eval_biology_entropy": 1.8729813146591185, |
| "eval_biology_loss": 11.396076202392578, |
| "eval_biology_mean_token_accuracy": 0.1593981314599514, |
| "eval_biology_num_tokens": 3920545.0, |
| "eval_biology_runtime": 41.088, |
| "eval_biology_samples_per_second": 12.169, |
| "eval_biology_steps_per_second": 6.085, |
| "step": 400 |
| }, |
| { |
| "entropy": 1.9058279685676098, |
| "epoch": 0.328, |
| "grad_norm": 0.11181640625, |
| "learning_rate": 8.18e-06, |
| "loss": 11.3842, |
| "mean_token_accuracy": 0.15871414174325765, |
| "num_tokens": 4016673.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 1.930207794904709, |
| "epoch": 0.336, |
| "grad_norm": 0.10888671875, |
| "learning_rate": 8.380000000000001e-06, |
| "loss": 11.3961, |
| "mean_token_accuracy": 0.1579543233383447, |
| "num_tokens": 4114077.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 1.9419952422380446, |
| "epoch": 0.344, |
| "grad_norm": 0.111328125, |
| "learning_rate": 8.580000000000001e-06, |
| "loss": 11.362, |
| "mean_token_accuracy": 0.15992560391314328, |
| "num_tokens": 4210403.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 1.913541816174984, |
| "epoch": 0.352, |
| "grad_norm": 0.1142578125, |
| "learning_rate": 8.78e-06, |
| "loss": 11.3323, |
| "mean_token_accuracy": 0.1633904448710382, |
| "num_tokens": 4306949.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 1.913567177206278, |
| "epoch": 0.36, |
| "grad_norm": 0.1240234375, |
| "learning_rate": 8.98e-06, |
| "loss": 11.458, |
| "mean_token_accuracy": 0.15837323614396154, |
| "num_tokens": 4406104.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 1.931741751730442, |
| "epoch": 0.368, |
| "grad_norm": 0.1025390625, |
| "learning_rate": 9.180000000000002e-06, |
| "loss": 11.2873, |
| "mean_token_accuracy": 0.15801281332969666, |
| "num_tokens": 4504001.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 1.904297123849392, |
| "epoch": 0.376, |
| "grad_norm": 0.1025390625, |
| "learning_rate": 9.38e-06, |
| "loss": 11.3813, |
| "mean_token_accuracy": 0.155983364302665, |
| "num_tokens": 4597559.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 1.9029462151229382, |
| "epoch": 0.384, |
| "grad_norm": 0.09716796875, |
| "learning_rate": 9.58e-06, |
| "loss": 11.3823, |
| "mean_token_accuracy": 0.15874490085989237, |
| "num_tokens": 4693812.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 1.9272562205791473, |
| "epoch": 0.392, |
| "grad_norm": 0.11474609375, |
| "learning_rate": 9.780000000000001e-06, |
| "loss": 11.2937, |
| "mean_token_accuracy": 0.1593953504692763, |
| "num_tokens": 4790965.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 1.9103233270347117, |
| "epoch": 0.4, |
| "grad_norm": 0.1083984375, |
| "learning_rate": 9.980000000000001e-06, |
| "loss": 11.3472, |
| "mean_token_accuracy": 0.15859601702541112, |
| "num_tokens": 4887094.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.4, |
| "eval_coding_entropy": 1.000721924304962, |
| "eval_coding_loss": 8.267828941345215, |
| "eval_coding_mean_token_accuracy": 0.2908177390098572, |
| "eval_coding_num_tokens": 4887094.0, |
| "eval_coding_runtime": 93.1337, |
| "eval_coding_samples_per_second": 5.369, |
| "eval_coding_steps_per_second": 2.684, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.4, |
| "eval_biology_entropy": 1.8729813146591185, |
| "eval_biology_loss": 11.396076202392578, |
| "eval_biology_mean_token_accuracy": 0.1593981314599514, |
| "eval_biology_num_tokens": 4887094.0, |
| "eval_biology_runtime": 41.2135, |
| "eval_biology_samples_per_second": 12.132, |
| "eval_biology_steps_per_second": 6.066, |
| "step": 500 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 10000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 8, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.18186854263808e+17, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|