{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4, "eval_steps": 100, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.9169743306934834, "epoch": 0.008, "grad_norm": 0.10595703125, "learning_rate": 1.8e-07, "loss": 11.4093, "mean_token_accuracy": 0.16113518364727497, "num_tokens": 103172.0, "step": 10 }, { "entropy": 1.943360574543476, "epoch": 0.016, "grad_norm": 0.11328125, "learning_rate": 3.8e-07, "loss": 11.3724, "mean_token_accuracy": 0.15745336734689772, "num_tokens": 195524.0, "step": 20 }, { "entropy": 1.8853021122515201, "epoch": 0.024, "grad_norm": 0.11376953125, "learning_rate": 5.800000000000001e-07, "loss": 11.4536, "mean_token_accuracy": 0.1584921860601753, "num_tokens": 296999.0, "step": 30 }, { "entropy": 1.9313011743128299, "epoch": 0.032, "grad_norm": 0.11962890625, "learning_rate": 7.8e-07, "loss": 11.4176, "mean_token_accuracy": 0.15730738821439444, "num_tokens": 390903.0, "step": 40 }, { "entropy": 1.886887714266777, "epoch": 0.04, "grad_norm": 0.11083984375, "learning_rate": 9.800000000000001e-07, "loss": 11.362, "mean_token_accuracy": 0.15966884773224593, "num_tokens": 488481.0, "step": 50 }, { "entropy": 1.8928033113479614, "epoch": 0.048, "grad_norm": 0.10791015625, "learning_rate": 1.1800000000000001e-06, "loss": 11.4662, "mean_token_accuracy": 0.1601271564140916, "num_tokens": 589868.0, "step": 60 }, { "entropy": 1.9126614801585675, "epoch": 0.056, "grad_norm": 0.11083984375, "learning_rate": 1.3800000000000001e-06, "loss": 11.4393, "mean_token_accuracy": 0.15678977984935044, "num_tokens": 690754.0, "step": 70 }, { "entropy": 1.8982849732041358, "epoch": 0.064, "grad_norm": 0.11279296875, "learning_rate": 1.5800000000000001e-06, "loss": 11.428, "mean_token_accuracy": 0.15833127587102352, "num_tokens": 791190.0, "step": 80 }, { "entropy": 1.9196412533521652, "epoch": 0.072, "grad_norm": 0.11181640625, "learning_rate": 1.7800000000000001e-06, "loss": 11.3783, "mean_token_accuracy": 0.1575094529427588, "num_tokens": 890075.0, "step": 90 }, { "entropy": 1.8977599568665027, "epoch": 0.08, "grad_norm": 0.1142578125, "learning_rate": 1.98e-06, "loss": 11.3874, "mean_token_accuracy": 0.16100947242230176, "num_tokens": 989860.0, "step": 100 }, { "epoch": 0.08, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 989860.0, "eval_coding_runtime": 94.2996, "eval_coding_samples_per_second": 5.302, "eval_coding_steps_per_second": 2.651, "step": 100 }, { "epoch": 0.08, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 989860.0, "eval_biology_runtime": 41.3672, "eval_biology_samples_per_second": 12.087, "eval_biology_steps_per_second": 6.043, "step": 100 }, { "entropy": 1.9323325648903846, "epoch": 0.088, "grad_norm": 0.109375, "learning_rate": 2.1800000000000003e-06, "loss": 11.43, "mean_token_accuracy": 0.15889165913686157, "num_tokens": 1087099.0, "step": 110 }, { "entropy": 1.9131557896733284, "epoch": 0.096, "grad_norm": 0.1123046875, "learning_rate": 2.38e-06, "loss": 11.2832, "mean_token_accuracy": 0.15686040292494, "num_tokens": 1181777.0, "step": 120 }, { "entropy": 1.8847123883664607, "epoch": 0.104, "grad_norm": 0.1181640625, "learning_rate": 2.5800000000000003e-06, "loss": 11.5591, "mean_token_accuracy": 0.15738505702465772, "num_tokens": 1283881.0, "step": 130 }, { "entropy": 1.871863854676485, "epoch": 0.112, "grad_norm": 0.10888671875, "learning_rate": 2.7800000000000005e-06, "loss": 11.3822, "mean_token_accuracy": 0.15778318694792687, "num_tokens": 1385513.0, "step": 140 }, { "entropy": 1.9166233658790588, "epoch": 0.12, "grad_norm": 0.115234375, "learning_rate": 2.9800000000000003e-06, "loss": 11.3895, "mean_token_accuracy": 0.15918795759789645, "num_tokens": 1483667.0, "step": 150 }, { "entropy": 1.902534269541502, "epoch": 0.128, "grad_norm": 0.10888671875, "learning_rate": 3.1800000000000005e-06, "loss": 11.4745, "mean_token_accuracy": 0.15784057211130859, "num_tokens": 1582368.0, "step": 160 }, { "entropy": 1.9250910609960556, "epoch": 0.136, "grad_norm": 0.1015625, "learning_rate": 3.3800000000000007e-06, "loss": 11.3645, "mean_token_accuracy": 0.15877335243858398, "num_tokens": 1677060.0, "step": 170 }, { "entropy": 1.888841236382723, "epoch": 0.144, "grad_norm": 0.1201171875, "learning_rate": 3.58e-06, "loss": 11.3131, "mean_token_accuracy": 0.15988174122758209, "num_tokens": 1773764.0, "step": 180 }, { "entropy": 1.923802337050438, "epoch": 0.152, "grad_norm": 0.10986328125, "learning_rate": 3.7800000000000002e-06, "loss": 11.4321, "mean_token_accuracy": 0.15674821990542115, "num_tokens": 1873167.0, "step": 190 }, { "entropy": 1.913605347275734, "epoch": 0.16, "grad_norm": 0.10693359375, "learning_rate": 3.980000000000001e-06, "loss": 11.5801, "mean_token_accuracy": 0.1564559088088572, "num_tokens": 1970077.0, "step": 200 }, { "epoch": 0.16, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 1970077.0, "eval_coding_runtime": 93.1599, "eval_coding_samples_per_second": 5.367, "eval_coding_steps_per_second": 2.684, "step": 200 }, { "epoch": 0.16, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 1970077.0, "eval_biology_runtime": 41.0689, "eval_biology_samples_per_second": 12.175, "eval_biology_steps_per_second": 6.087, "step": 200 }, { "entropy": 1.9281546145677566, "epoch": 0.168, "grad_norm": 0.11083984375, "learning_rate": 4.18e-06, "loss": 11.4462, "mean_token_accuracy": 0.15564582953229547, "num_tokens": 2066815.0, "step": 210 }, { "entropy": 1.883149204403162, "epoch": 0.176, "grad_norm": 0.103515625, "learning_rate": 4.38e-06, "loss": 11.4588, "mean_token_accuracy": 0.1579661637544632, "num_tokens": 2168354.0, "step": 220 }, { "entropy": 1.914576094597578, "epoch": 0.184, "grad_norm": 0.10400390625, "learning_rate": 4.58e-06, "loss": 11.3803, "mean_token_accuracy": 0.15757576958276331, "num_tokens": 2270617.0, "step": 230 }, { "entropy": 1.9133810736238956, "epoch": 0.192, "grad_norm": 0.11083984375, "learning_rate": 4.78e-06, "loss": 11.5562, "mean_token_accuracy": 0.15540659767575563, "num_tokens": 2365822.0, "step": 240 }, { "entropy": 1.9234349891543387, "epoch": 0.2, "grad_norm": 0.1142578125, "learning_rate": 4.980000000000001e-06, "loss": 11.5222, "mean_token_accuracy": 0.15323229427449406, "num_tokens": 2459876.0, "step": 250 }, { "entropy": 1.9399859428405761, "epoch": 0.208, "grad_norm": 0.109375, "learning_rate": 5.18e-06, "loss": 11.428, "mean_token_accuracy": 0.1552271238528192, "num_tokens": 2558762.0, "step": 260 }, { "entropy": 1.9029304042458535, "epoch": 0.216, "grad_norm": 0.10205078125, "learning_rate": 5.380000000000001e-06, "loss": 11.2925, "mean_token_accuracy": 0.16210160069167615, "num_tokens": 2654953.0, "step": 270 }, { "entropy": 1.9189352929592132, "epoch": 0.224, "grad_norm": 0.09912109375, "learning_rate": 5.580000000000001e-06, "loss": 11.3849, "mean_token_accuracy": 0.15769596104510128, "num_tokens": 2755347.0, "step": 280 }, { "entropy": 1.8914753369987012, "epoch": 0.232, "grad_norm": 0.1181640625, "learning_rate": 5.78e-06, "loss": 11.4031, "mean_token_accuracy": 0.16200053002685308, "num_tokens": 2852298.0, "step": 290 }, { "entropy": 1.9092405900359153, "epoch": 0.24, "grad_norm": 0.10302734375, "learning_rate": 5.98e-06, "loss": 11.2327, "mean_token_accuracy": 0.16136883310973643, "num_tokens": 2947346.0, "step": 300 }, { "epoch": 0.24, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 2947346.0, "eval_coding_runtime": 93.3688, "eval_coding_samples_per_second": 5.355, "eval_coding_steps_per_second": 2.678, "step": 300 }, { "epoch": 0.24, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 2947346.0, "eval_biology_runtime": 41.7366, "eval_biology_samples_per_second": 11.98, "eval_biology_steps_per_second": 5.99, "step": 300 }, { "entropy": 1.9313343465328217, "epoch": 0.248, "grad_norm": 0.1123046875, "learning_rate": 6.18e-06, "loss": 11.5425, "mean_token_accuracy": 0.15317737096920608, "num_tokens": 3043128.0, "step": 310 }, { "entropy": 1.9211706645786761, "epoch": 0.256, "grad_norm": 0.1083984375, "learning_rate": 6.380000000000001e-06, "loss": 11.4729, "mean_token_accuracy": 0.1547949264757335, "num_tokens": 3139957.0, "step": 320 }, { "entropy": 1.923046988248825, "epoch": 0.264, "grad_norm": 0.10693359375, "learning_rate": 6.5800000000000005e-06, "loss": 11.4159, "mean_token_accuracy": 0.15573964624200015, "num_tokens": 3232159.0, "step": 330 }, { "entropy": 1.8901807874441148, "epoch": 0.272, "grad_norm": 0.1083984375, "learning_rate": 6.780000000000001e-06, "loss": 11.3543, "mean_token_accuracy": 0.1617777702398598, "num_tokens": 3335951.0, "step": 340 }, { "entropy": 1.8801799044013023, "epoch": 0.28, "grad_norm": 0.10498046875, "learning_rate": 6.98e-06, "loss": 11.4879, "mean_token_accuracy": 0.15775063186883925, "num_tokens": 3437213.0, "step": 350 }, { "entropy": 1.9043901436030866, "epoch": 0.288, "grad_norm": 0.11474609375, "learning_rate": 7.180000000000001e-06, "loss": 11.5366, "mean_token_accuracy": 0.15751364254392683, "num_tokens": 3539731.0, "step": 360 }, { "entropy": 1.9588028222322464, "epoch": 0.296, "grad_norm": 0.1064453125, "learning_rate": 7.3800000000000005e-06, "loss": 11.3894, "mean_token_accuracy": 0.153670167690143, "num_tokens": 3633448.0, "step": 370 }, { "entropy": 1.881439681351185, "epoch": 0.304, "grad_norm": 0.1064453125, "learning_rate": 7.58e-06, "loss": 11.3016, "mean_token_accuracy": 0.16050010859034955, "num_tokens": 3733488.0, "step": 380 }, { "entropy": 1.9542286798357964, "epoch": 0.312, "grad_norm": 0.103515625, "learning_rate": 7.78e-06, "loss": 11.273, "mean_token_accuracy": 0.15929239662364125, "num_tokens": 3825212.0, "step": 390 }, { "entropy": 1.9287514887750148, "epoch": 0.32, "grad_norm": 0.10107421875, "learning_rate": 7.980000000000002e-06, "loss": 11.3054, "mean_token_accuracy": 0.1604186396114528, "num_tokens": 3920545.0, "step": 400 }, { "epoch": 0.32, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 3920545.0, "eval_coding_runtime": 92.8995, "eval_coding_samples_per_second": 5.382, "eval_coding_steps_per_second": 2.691, "step": 400 }, { "epoch": 0.32, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 3920545.0, "eval_biology_runtime": 41.088, "eval_biology_samples_per_second": 12.169, "eval_biology_steps_per_second": 6.085, "step": 400 }, { "entropy": 1.9058279685676098, "epoch": 0.328, "grad_norm": 0.11181640625, "learning_rate": 8.18e-06, "loss": 11.3842, "mean_token_accuracy": 0.15871414174325765, "num_tokens": 4016673.0, "step": 410 }, { "entropy": 1.930207794904709, "epoch": 0.336, "grad_norm": 0.10888671875, "learning_rate": 8.380000000000001e-06, "loss": 11.3961, "mean_token_accuracy": 0.1579543233383447, "num_tokens": 4114077.0, "step": 420 }, { "entropy": 1.9419952422380446, "epoch": 0.344, "grad_norm": 0.111328125, "learning_rate": 8.580000000000001e-06, "loss": 11.362, "mean_token_accuracy": 0.15992560391314328, "num_tokens": 4210403.0, "step": 430 }, { "entropy": 1.913541816174984, "epoch": 0.352, "grad_norm": 0.1142578125, "learning_rate": 8.78e-06, "loss": 11.3323, "mean_token_accuracy": 0.1633904448710382, "num_tokens": 4306949.0, "step": 440 }, { "entropy": 1.913567177206278, "epoch": 0.36, "grad_norm": 0.1240234375, "learning_rate": 8.98e-06, "loss": 11.458, "mean_token_accuracy": 0.15837323614396154, "num_tokens": 4406104.0, "step": 450 }, { "entropy": 1.931741751730442, "epoch": 0.368, "grad_norm": 0.1025390625, "learning_rate": 9.180000000000002e-06, "loss": 11.2873, "mean_token_accuracy": 0.15801281332969666, "num_tokens": 4504001.0, "step": 460 }, { "entropy": 1.904297123849392, "epoch": 0.376, "grad_norm": 0.1025390625, "learning_rate": 9.38e-06, "loss": 11.3813, "mean_token_accuracy": 0.155983364302665, "num_tokens": 4597559.0, "step": 470 }, { "entropy": 1.9029462151229382, "epoch": 0.384, "grad_norm": 0.09716796875, "learning_rate": 9.58e-06, "loss": 11.3823, "mean_token_accuracy": 0.15874490085989237, "num_tokens": 4693812.0, "step": 480 }, { "entropy": 1.9272562205791473, "epoch": 0.392, "grad_norm": 0.11474609375, "learning_rate": 9.780000000000001e-06, "loss": 11.2937, "mean_token_accuracy": 0.1593953504692763, "num_tokens": 4790965.0, "step": 490 }, { "entropy": 1.9103233270347117, "epoch": 0.4, "grad_norm": 0.1083984375, "learning_rate": 9.980000000000001e-06, "loss": 11.3472, "mean_token_accuracy": 0.15859601702541112, "num_tokens": 4887094.0, "step": 500 }, { "epoch": 0.4, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 4887094.0, "eval_coding_runtime": 93.1337, "eval_coding_samples_per_second": 5.369, "eval_coding_steps_per_second": 2.684, "step": 500 }, { "epoch": 0.4, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 4887094.0, "eval_biology_runtime": 41.2135, "eval_biology_samples_per_second": 12.132, "eval_biology_steps_per_second": 6.066, "step": 500 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.18186854263808e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }