{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.4, "eval_steps": 100, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.9169743306934834, "epoch": 0.008, "grad_norm": 0.10595703125, "learning_rate": 1.8e-07, "loss": 11.4093, "mean_token_accuracy": 0.16113518364727497, "num_tokens": 103172.0, "step": 10 }, { "entropy": 1.943360574543476, "epoch": 0.016, "grad_norm": 0.11328125, "learning_rate": 3.8e-07, "loss": 11.3724, "mean_token_accuracy": 0.15745336734689772, "num_tokens": 195524.0, "step": 20 }, { "entropy": 1.8853021122515201, "epoch": 0.024, "grad_norm": 0.11376953125, "learning_rate": 5.800000000000001e-07, "loss": 11.4536, "mean_token_accuracy": 0.1584921860601753, "num_tokens": 296999.0, "step": 30 }, { "entropy": 1.9313011743128299, "epoch": 0.032, "grad_norm": 0.11962890625, "learning_rate": 7.8e-07, "loss": 11.4176, "mean_token_accuracy": 0.15730738821439444, "num_tokens": 390903.0, "step": 40 }, { "entropy": 1.886887714266777, "epoch": 0.04, "grad_norm": 0.11083984375, "learning_rate": 9.800000000000001e-07, "loss": 11.362, "mean_token_accuracy": 0.15966884773224593, "num_tokens": 488481.0, "step": 50 }, { "entropy": 1.8928033113479614, "epoch": 0.048, "grad_norm": 0.10791015625, "learning_rate": 1.1800000000000001e-06, "loss": 11.4662, "mean_token_accuracy": 0.1601271564140916, "num_tokens": 589868.0, "step": 60 }, { "entropy": 1.9126614801585675, "epoch": 0.056, "grad_norm": 0.11083984375, "learning_rate": 1.3800000000000001e-06, "loss": 11.4393, "mean_token_accuracy": 0.15678977984935044, "num_tokens": 690754.0, "step": 70 }, { "entropy": 1.8982849732041358, "epoch": 0.064, "grad_norm": 0.11279296875, "learning_rate": 1.5800000000000001e-06, "loss": 11.428, "mean_token_accuracy": 0.15833127587102352, "num_tokens": 791190.0, "step": 80 }, { "entropy": 1.9196412533521652, "epoch": 0.072, "grad_norm": 0.11181640625, "learning_rate": 1.7800000000000001e-06, "loss": 11.3783, "mean_token_accuracy": 0.1575094529427588, "num_tokens": 890075.0, "step": 90 }, { "entropy": 1.8977599568665027, "epoch": 0.08, "grad_norm": 0.1142578125, "learning_rate": 1.98e-06, "loss": 11.3874, "mean_token_accuracy": 0.16100947242230176, "num_tokens": 989860.0, "step": 100 }, { "epoch": 0.08, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 989860.0, "eval_coding_runtime": 94.2996, "eval_coding_samples_per_second": 5.302, "eval_coding_steps_per_second": 2.651, "step": 100 }, { "epoch": 0.08, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 989860.0, "eval_biology_runtime": 41.3672, "eval_biology_samples_per_second": 12.087, "eval_biology_steps_per_second": 6.043, "step": 100 }, { "entropy": 1.9323325648903846, "epoch": 0.088, "grad_norm": 0.109375, "learning_rate": 2.1800000000000003e-06, "loss": 11.43, "mean_token_accuracy": 0.15889165913686157, "num_tokens": 1087099.0, "step": 110 }, { "entropy": 1.9131557896733284, "epoch": 0.096, "grad_norm": 0.1123046875, "learning_rate": 2.38e-06, "loss": 11.2832, "mean_token_accuracy": 0.15686040292494, "num_tokens": 1181777.0, "step": 120 }, { "entropy": 1.8847123883664607, "epoch": 0.104, "grad_norm": 0.1181640625, "learning_rate": 2.5800000000000003e-06, "loss": 11.5591, "mean_token_accuracy": 0.15738505702465772, "num_tokens": 1283881.0, "step": 130 }, { "entropy": 1.871863854676485, "epoch": 0.112, "grad_norm": 0.10888671875, "learning_rate": 2.7800000000000005e-06, "loss": 11.3822, "mean_token_accuracy": 0.15778318694792687, "num_tokens": 1385513.0, "step": 140 }, { "entropy": 1.9166233658790588, "epoch": 0.12, "grad_norm": 0.115234375, "learning_rate": 2.9800000000000003e-06, "loss": 11.3895, "mean_token_accuracy": 0.15918795759789645, "num_tokens": 1483667.0, "step": 150 }, { "entropy": 1.902534269541502, "epoch": 0.128, "grad_norm": 0.10888671875, "learning_rate": 3.1800000000000005e-06, "loss": 11.4745, "mean_token_accuracy": 0.15784057211130859, "num_tokens": 1582368.0, "step": 160 }, { "entropy": 1.9250910609960556, "epoch": 0.136, "grad_norm": 0.1015625, "learning_rate": 3.3800000000000007e-06, "loss": 11.3645, "mean_token_accuracy": 0.15877335243858398, "num_tokens": 1677060.0, "step": 170 }, { "entropy": 1.888841236382723, "epoch": 0.144, "grad_norm": 0.1201171875, "learning_rate": 3.58e-06, "loss": 11.3131, "mean_token_accuracy": 0.15988174122758209, "num_tokens": 1773764.0, "step": 180 }, { "entropy": 1.923802337050438, "epoch": 0.152, "grad_norm": 0.10986328125, "learning_rate": 3.7800000000000002e-06, "loss": 11.4321, "mean_token_accuracy": 0.15674821990542115, "num_tokens": 1873167.0, "step": 190 }, { "entropy": 1.913605347275734, "epoch": 0.16, "grad_norm": 0.10693359375, "learning_rate": 3.980000000000001e-06, "loss": 11.5801, "mean_token_accuracy": 0.1564559088088572, "num_tokens": 1970077.0, "step": 200 }, { "epoch": 0.16, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 1970077.0, "eval_coding_runtime": 93.1599, "eval_coding_samples_per_second": 5.367, "eval_coding_steps_per_second": 2.684, "step": 200 }, { "epoch": 0.16, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 1970077.0, "eval_biology_runtime": 41.0689, "eval_biology_samples_per_second": 12.175, "eval_biology_steps_per_second": 6.087, "step": 200 }, { "entropy": 1.9281546145677566, "epoch": 0.168, "grad_norm": 0.11083984375, "learning_rate": 4.18e-06, "loss": 11.4462, "mean_token_accuracy": 0.15564582953229547, "num_tokens": 2066815.0, "step": 210 }, { "entropy": 1.883149204403162, "epoch": 0.176, "grad_norm": 0.103515625, "learning_rate": 4.38e-06, "loss": 11.4588, "mean_token_accuracy": 0.1579661637544632, "num_tokens": 2168354.0, "step": 220 }, { "entropy": 1.914576094597578, "epoch": 0.184, "grad_norm": 0.10400390625, "learning_rate": 4.58e-06, "loss": 11.3803, "mean_token_accuracy": 0.15757576958276331, "num_tokens": 2270617.0, "step": 230 }, { "entropy": 1.9133810736238956, "epoch": 0.192, "grad_norm": 0.11083984375, "learning_rate": 4.78e-06, "loss": 11.5562, "mean_token_accuracy": 0.15540659767575563, "num_tokens": 2365822.0, "step": 240 }, { "entropy": 1.9234349891543387, "epoch": 0.2, "grad_norm": 0.1142578125, "learning_rate": 4.980000000000001e-06, "loss": 11.5222, "mean_token_accuracy": 0.15323229427449406, "num_tokens": 2459876.0, "step": 250 }, { "entropy": 1.9399859428405761, "epoch": 0.208, "grad_norm": 0.109375, "learning_rate": 5.18e-06, "loss": 11.428, "mean_token_accuracy": 0.1552271238528192, "num_tokens": 2558762.0, "step": 260 }, { "entropy": 1.9029304042458535, "epoch": 0.216, "grad_norm": 0.10205078125, "learning_rate": 5.380000000000001e-06, "loss": 11.2925, "mean_token_accuracy": 0.16210160069167615, "num_tokens": 2654953.0, "step": 270 }, { "entropy": 1.9189352929592132, "epoch": 0.224, "grad_norm": 0.09912109375, "learning_rate": 5.580000000000001e-06, "loss": 11.3849, "mean_token_accuracy": 0.15769596104510128, "num_tokens": 2755347.0, "step": 280 }, { "entropy": 1.8914753369987012, "epoch": 0.232, "grad_norm": 0.1181640625, "learning_rate": 5.78e-06, "loss": 11.4031, "mean_token_accuracy": 0.16200053002685308, "num_tokens": 2852298.0, "step": 290 }, { "entropy": 1.9092405900359153, "epoch": 0.24, "grad_norm": 0.10302734375, "learning_rate": 5.98e-06, "loss": 11.2327, "mean_token_accuracy": 0.16136883310973643, "num_tokens": 2947346.0, "step": 300 }, { "epoch": 0.24, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 2947346.0, "eval_coding_runtime": 93.3688, "eval_coding_samples_per_second": 5.355, "eval_coding_steps_per_second": 2.678, "step": 300 }, { "epoch": 0.24, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 2947346.0, "eval_biology_runtime": 41.7366, "eval_biology_samples_per_second": 11.98, "eval_biology_steps_per_second": 5.99, "step": 300 }, { "entropy": 1.9313343465328217, "epoch": 0.248, "grad_norm": 0.1123046875, "learning_rate": 6.18e-06, "loss": 11.5425, "mean_token_accuracy": 0.15317737096920608, "num_tokens": 3043128.0, "step": 310 }, { "entropy": 1.9211706645786761, "epoch": 0.256, "grad_norm": 0.1083984375, "learning_rate": 6.380000000000001e-06, "loss": 11.4729, "mean_token_accuracy": 0.1547949264757335, "num_tokens": 3139957.0, "step": 320 }, { "entropy": 1.923046988248825, "epoch": 0.264, "grad_norm": 0.10693359375, "learning_rate": 6.5800000000000005e-06, "loss": 11.4159, "mean_token_accuracy": 0.15573964624200015, "num_tokens": 3232159.0, "step": 330 }, { "entropy": 1.8901807874441148, "epoch": 0.272, "grad_norm": 0.1083984375, "learning_rate": 6.780000000000001e-06, "loss": 11.3543, "mean_token_accuracy": 0.1617777702398598, "num_tokens": 3335951.0, "step": 340 }, { "entropy": 1.8801799044013023, "epoch": 0.28, "grad_norm": 0.10498046875, "learning_rate": 6.98e-06, "loss": 11.4879, "mean_token_accuracy": 0.15775063186883925, "num_tokens": 3437213.0, "step": 350 }, { "entropy": 1.9043901436030866, "epoch": 0.288, "grad_norm": 0.11474609375, "learning_rate": 7.180000000000001e-06, "loss": 11.5366, "mean_token_accuracy": 0.15751364254392683, "num_tokens": 3539731.0, "step": 360 }, { "entropy": 1.9588028222322464, "epoch": 0.296, "grad_norm": 0.1064453125, "learning_rate": 7.3800000000000005e-06, "loss": 11.3894, "mean_token_accuracy": 0.153670167690143, "num_tokens": 3633448.0, "step": 370 }, { "entropy": 1.881439681351185, "epoch": 0.304, "grad_norm": 0.1064453125, "learning_rate": 7.58e-06, "loss": 11.3016, "mean_token_accuracy": 0.16050010859034955, "num_tokens": 3733488.0, "step": 380 }, { "entropy": 1.9542286798357964, "epoch": 0.312, "grad_norm": 0.103515625, "learning_rate": 7.78e-06, "loss": 11.273, "mean_token_accuracy": 0.15929239662364125, "num_tokens": 3825212.0, "step": 390 }, { "entropy": 1.9287514887750148, "epoch": 0.32, "grad_norm": 0.10107421875, "learning_rate": 7.980000000000002e-06, "loss": 11.3054, "mean_token_accuracy": 0.1604186396114528, "num_tokens": 3920545.0, "step": 400 }, { "epoch": 0.32, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 3920545.0, "eval_coding_runtime": 92.8995, "eval_coding_samples_per_second": 5.382, "eval_coding_steps_per_second": 2.691, "step": 400 }, { "epoch": 0.32, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 3920545.0, "eval_biology_runtime": 41.088, "eval_biology_samples_per_second": 12.169, "eval_biology_steps_per_second": 6.085, "step": 400 }, { "entropy": 1.9058279685676098, "epoch": 0.328, "grad_norm": 0.11181640625, "learning_rate": 8.18e-06, "loss": 11.3842, "mean_token_accuracy": 0.15871414174325765, "num_tokens": 4016673.0, "step": 410 }, { "entropy": 1.930207794904709, "epoch": 0.336, "grad_norm": 0.10888671875, "learning_rate": 8.380000000000001e-06, "loss": 11.3961, "mean_token_accuracy": 0.1579543233383447, "num_tokens": 4114077.0, "step": 420 }, { "entropy": 1.9419952422380446, "epoch": 0.344, "grad_norm": 0.111328125, "learning_rate": 8.580000000000001e-06, "loss": 11.362, "mean_token_accuracy": 0.15992560391314328, "num_tokens": 4210403.0, "step": 430 }, { "entropy": 1.913541816174984, "epoch": 0.352, "grad_norm": 0.1142578125, "learning_rate": 8.78e-06, "loss": 11.3323, "mean_token_accuracy": 0.1633904448710382, "num_tokens": 4306949.0, "step": 440 }, { "entropy": 1.913567177206278, "epoch": 0.36, "grad_norm": 0.1240234375, "learning_rate": 8.98e-06, "loss": 11.458, "mean_token_accuracy": 0.15837323614396154, "num_tokens": 4406104.0, "step": 450 }, { "entropy": 1.931741751730442, "epoch": 0.368, "grad_norm": 0.1025390625, "learning_rate": 9.180000000000002e-06, "loss": 11.2873, "mean_token_accuracy": 0.15801281332969666, "num_tokens": 4504001.0, "step": 460 }, { "entropy": 1.904297123849392, "epoch": 0.376, "grad_norm": 0.1025390625, "learning_rate": 9.38e-06, "loss": 11.3813, "mean_token_accuracy": 0.155983364302665, "num_tokens": 4597559.0, "step": 470 }, { "entropy": 1.9029462151229382, "epoch": 0.384, "grad_norm": 0.09716796875, "learning_rate": 9.58e-06, "loss": 11.3823, "mean_token_accuracy": 0.15874490085989237, "num_tokens": 4693812.0, "step": 480 }, { "entropy": 1.9272562205791473, "epoch": 0.392, "grad_norm": 0.11474609375, "learning_rate": 9.780000000000001e-06, "loss": 11.2937, "mean_token_accuracy": 0.1593953504692763, "num_tokens": 4790965.0, "step": 490 }, { "entropy": 1.9103233270347117, "epoch": 0.4, "grad_norm": 0.1083984375, "learning_rate": 9.980000000000001e-06, "loss": 11.3472, "mean_token_accuracy": 0.15859601702541112, "num_tokens": 4887094.0, "step": 500 }, { "epoch": 0.4, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 4887094.0, "eval_coding_runtime": 93.1337, "eval_coding_samples_per_second": 5.369, "eval_coding_steps_per_second": 2.684, "step": 500 }, { "epoch": 0.4, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 4887094.0, "eval_biology_runtime": 41.2135, "eval_biology_samples_per_second": 12.132, "eval_biology_steps_per_second": 6.066, "step": 500 }, { "entropy": 1.9290625013411045, "epoch": 0.408, "grad_norm": 0.1142578125, "learning_rate": 1.018e-05, "loss": 11.375, "mean_token_accuracy": 0.16000252263620496, "num_tokens": 4985394.0, "step": 510 }, { "entropy": 1.9234229408204555, "epoch": 0.416, "grad_norm": 0.10595703125, "learning_rate": 1.038e-05, "loss": 11.4479, "mean_token_accuracy": 0.1574494892731309, "num_tokens": 5085369.0, "step": 520 }, { "entropy": 1.9453741960227489, "epoch": 0.424, "grad_norm": 0.1162109375, "learning_rate": 1.0580000000000002e-05, "loss": 11.3764, "mean_token_accuracy": 0.15493024131283165, "num_tokens": 5176083.0, "step": 530 }, { "entropy": 1.9170791164040566, "epoch": 0.432, "grad_norm": 0.10498046875, "learning_rate": 1.0780000000000002e-05, "loss": 11.4612, "mean_token_accuracy": 0.15664362437091767, "num_tokens": 5271275.0, "step": 540 }, { "entropy": 1.9397698909044265, "epoch": 0.44, "grad_norm": 0.1044921875, "learning_rate": 1.0980000000000002e-05, "loss": 11.2801, "mean_token_accuracy": 0.15725393719039857, "num_tokens": 5362624.0, "step": 550 }, { "entropy": 1.9190262392163278, "epoch": 0.448, "grad_norm": 0.10205078125, "learning_rate": 1.1180000000000001e-05, "loss": 11.4585, "mean_token_accuracy": 0.157243634108454, "num_tokens": 5460559.0, "step": 560 }, { "entropy": 1.9073573514819144, "epoch": 0.456, "grad_norm": 0.10791015625, "learning_rate": 1.138e-05, "loss": 11.4459, "mean_token_accuracy": 0.15579957109875978, "num_tokens": 5558471.0, "step": 570 }, { "entropy": 1.92078455388546, "epoch": 0.464, "grad_norm": 0.10693359375, "learning_rate": 1.1580000000000001e-05, "loss": 11.3866, "mean_token_accuracy": 0.15736165479756892, "num_tokens": 5653809.0, "step": 580 }, { "entropy": 1.9094278007745742, "epoch": 0.472, "grad_norm": 0.10595703125, "learning_rate": 1.178e-05, "loss": 11.4047, "mean_token_accuracy": 0.15574069027788937, "num_tokens": 5751529.0, "step": 590 }, { "entropy": 1.9597051613032819, "epoch": 0.48, "grad_norm": 0.1181640625, "learning_rate": 1.198e-05, "loss": 11.4724, "mean_token_accuracy": 0.1529582162387669, "num_tokens": 5850176.0, "step": 600 }, { "epoch": 0.48, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 5850176.0, "eval_coding_runtime": 93.1046, "eval_coding_samples_per_second": 5.37, "eval_coding_steps_per_second": 2.685, "step": 600 }, { "epoch": 0.48, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 5850176.0, "eval_biology_runtime": 41.2971, "eval_biology_samples_per_second": 12.107, "eval_biology_steps_per_second": 6.054, "step": 600 }, { "entropy": 1.8962870091199875, "epoch": 0.488, "grad_norm": 0.10302734375, "learning_rate": 1.218e-05, "loss": 11.3305, "mean_token_accuracy": 0.16129443161189555, "num_tokens": 5948898.0, "step": 610 }, { "entropy": 1.9090802267193794, "epoch": 0.496, "grad_norm": 0.10009765625, "learning_rate": 1.2380000000000002e-05, "loss": 11.2714, "mean_token_accuracy": 0.16270390092395245, "num_tokens": 6046503.0, "step": 620 }, { "entropy": 1.9413189493119716, "epoch": 0.504, "grad_norm": 0.1064453125, "learning_rate": 1.2580000000000002e-05, "loss": 11.348, "mean_token_accuracy": 0.15744470418430864, "num_tokens": 6138649.0, "step": 630 }, { "entropy": 1.9004696995019912, "epoch": 0.512, "grad_norm": 0.1142578125, "learning_rate": 1.2780000000000001e-05, "loss": 11.5757, "mean_token_accuracy": 0.15675477078184485, "num_tokens": 6240456.0, "step": 640 }, { "entropy": 1.9349555149674416, "epoch": 0.52, "grad_norm": 0.109375, "learning_rate": 1.2980000000000001e-05, "loss": 11.3673, "mean_token_accuracy": 0.15686554382555187, "num_tokens": 6333958.0, "step": 650 }, { "entropy": 1.9110501252114773, "epoch": 0.528, "grad_norm": 0.111328125, "learning_rate": 1.3180000000000001e-05, "loss": 11.3261, "mean_token_accuracy": 0.15933070508763195, "num_tokens": 6430555.0, "step": 660 }, { "entropy": 1.89276999309659, "epoch": 0.536, "grad_norm": 0.1103515625, "learning_rate": 1.3380000000000002e-05, "loss": 11.3362, "mean_token_accuracy": 0.16168827014043927, "num_tokens": 6533033.0, "step": 670 }, { "entropy": 1.9395132802426815, "epoch": 0.544, "grad_norm": 0.11083984375, "learning_rate": 1.3580000000000002e-05, "loss": 11.4779, "mean_token_accuracy": 0.15581823773682119, "num_tokens": 6626006.0, "step": 680 }, { "entropy": 1.941649568080902, "epoch": 0.552, "grad_norm": 0.10986328125, "learning_rate": 1.378e-05, "loss": 11.4785, "mean_token_accuracy": 0.15422701863572003, "num_tokens": 6719864.0, "step": 690 }, { "entropy": 1.8964682638645172, "epoch": 0.56, "grad_norm": 0.109375, "learning_rate": 1.398e-05, "loss": 11.3283, "mean_token_accuracy": 0.16160819460637868, "num_tokens": 6820754.0, "step": 700 }, { "epoch": 0.56, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 6820754.0, "eval_coding_runtime": 93.0261, "eval_coding_samples_per_second": 5.375, "eval_coding_steps_per_second": 2.687, "step": 700 }, { "epoch": 0.56, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 6820754.0, "eval_biology_runtime": 40.9428, "eval_biology_samples_per_second": 12.212, "eval_biology_steps_per_second": 6.106, "step": 700 }, { "entropy": 1.8812049485743045, "epoch": 0.568, "grad_norm": 0.10986328125, "learning_rate": 1.418e-05, "loss": 11.4292, "mean_token_accuracy": 0.15740878535434605, "num_tokens": 6924942.0, "step": 710 }, { "entropy": 1.9126729860901832, "epoch": 0.576, "grad_norm": 0.10205078125, "learning_rate": 1.4380000000000001e-05, "loss": 11.4497, "mean_token_accuracy": 0.15626165526919067, "num_tokens": 7021844.0, "step": 720 }, { "entropy": 1.9178711868822576, "epoch": 0.584, "grad_norm": 0.1083984375, "learning_rate": 1.4580000000000001e-05, "loss": 11.4406, "mean_token_accuracy": 0.1565072405152023, "num_tokens": 7118443.0, "step": 730 }, { "entropy": 1.899195620045066, "epoch": 0.592, "grad_norm": 0.10498046875, "learning_rate": 1.478e-05, "loss": 11.3891, "mean_token_accuracy": 0.15970171936787664, "num_tokens": 7213951.0, "step": 740 }, { "entropy": 1.8711984746158123, "epoch": 0.6, "grad_norm": 0.107421875, "learning_rate": 1.498e-05, "loss": 11.6155, "mean_token_accuracy": 0.1571272831875831, "num_tokens": 7317984.0, "step": 750 }, { "entropy": 1.8990586549043655, "epoch": 0.608, "grad_norm": 0.10888671875, "learning_rate": 1.5180000000000002e-05, "loss": 11.5369, "mean_token_accuracy": 0.15493298289366067, "num_tokens": 7416773.0, "step": 760 }, { "entropy": 1.8844982825219632, "epoch": 0.616, "grad_norm": 0.11376953125, "learning_rate": 1.5380000000000002e-05, "loss": 11.4661, "mean_token_accuracy": 0.15754226935096086, "num_tokens": 7514031.0, "step": 770 }, { "entropy": 1.9250130340456963, "epoch": 0.624, "grad_norm": 0.1142578125, "learning_rate": 1.5580000000000003e-05, "loss": 11.5539, "mean_token_accuracy": 0.15620456612668931, "num_tokens": 7612843.0, "step": 780 }, { "entropy": 1.9027365200221538, "epoch": 0.632, "grad_norm": 0.11572265625, "learning_rate": 1.578e-05, "loss": 11.397, "mean_token_accuracy": 0.15837502111680807, "num_tokens": 7710332.0, "step": 790 }, { "entropy": 1.9388741664588451, "epoch": 0.64, "grad_norm": 0.1015625, "learning_rate": 1.5980000000000003e-05, "loss": 11.2848, "mean_token_accuracy": 0.15811930843628944, "num_tokens": 7801633.0, "step": 800 }, { "epoch": 0.64, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 7801633.0, "eval_coding_runtime": 92.9264, "eval_coding_samples_per_second": 5.381, "eval_coding_steps_per_second": 2.69, "step": 800 }, { "epoch": 0.64, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 7801633.0, "eval_biology_runtime": 40.9144, "eval_biology_samples_per_second": 12.221, "eval_biology_steps_per_second": 6.11, "step": 800 }, { "entropy": 1.8989893589168787, "epoch": 0.648, "grad_norm": 0.1123046875, "learning_rate": 1.618e-05, "loss": 11.4501, "mean_token_accuracy": 0.1592996684834361, "num_tokens": 7899038.0, "step": 810 }, { "entropy": 1.9489813253283501, "epoch": 0.656, "grad_norm": 0.1103515625, "learning_rate": 1.638e-05, "loss": 11.6214, "mean_token_accuracy": 0.15152352401055397, "num_tokens": 7995843.0, "step": 820 }, { "entropy": 1.9073718406260014, "epoch": 0.664, "grad_norm": 0.109375, "learning_rate": 1.658e-05, "loss": 11.4378, "mean_token_accuracy": 0.1551885347813368, "num_tokens": 8092148.0, "step": 830 }, { "entropy": 1.9346605040133, "epoch": 0.672, "grad_norm": 0.1142578125, "learning_rate": 1.6780000000000002e-05, "loss": 11.4991, "mean_token_accuracy": 0.15528175202198327, "num_tokens": 8183103.0, "step": 840 }, { "entropy": 1.8954717583954335, "epoch": 0.68, "grad_norm": 0.11328125, "learning_rate": 1.698e-05, "loss": 11.5802, "mean_token_accuracy": 0.15582030937075614, "num_tokens": 8283708.0, "step": 850 }, { "entropy": 1.8708562321960926, "epoch": 0.688, "grad_norm": 0.10986328125, "learning_rate": 1.718e-05, "loss": 11.3021, "mean_token_accuracy": 0.16033690557815133, "num_tokens": 8385976.0, "step": 860 }, { "entropy": 1.940383419394493, "epoch": 0.696, "grad_norm": 0.11376953125, "learning_rate": 1.7380000000000003e-05, "loss": 11.5672, "mean_token_accuracy": 0.15030423626303674, "num_tokens": 8480263.0, "step": 870 }, { "entropy": 1.8808147184550763, "epoch": 0.704, "grad_norm": 0.10986328125, "learning_rate": 1.758e-05, "loss": 11.3381, "mean_token_accuracy": 0.16111928690224886, "num_tokens": 8578431.0, "step": 880 }, { "entropy": 1.8877504222095012, "epoch": 0.712, "grad_norm": 0.10791015625, "learning_rate": 1.7780000000000003e-05, "loss": 11.3877, "mean_token_accuracy": 0.1602990841027349, "num_tokens": 8679830.0, "step": 890 }, { "entropy": 1.8855501756072044, "epoch": 0.72, "grad_norm": 0.11279296875, "learning_rate": 1.798e-05, "loss": 11.5192, "mean_token_accuracy": 0.1597036702558398, "num_tokens": 8781342.0, "step": 900 }, { "epoch": 0.72, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 8781342.0, "eval_coding_runtime": 92.7951, "eval_coding_samples_per_second": 5.388, "eval_coding_steps_per_second": 2.694, "step": 900 }, { "epoch": 0.72, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 8781342.0, "eval_biology_runtime": 40.8386, "eval_biology_samples_per_second": 12.243, "eval_biology_steps_per_second": 6.122, "step": 900 }, { "entropy": 1.8865461751818657, "epoch": 0.728, "grad_norm": 0.1083984375, "learning_rate": 1.8180000000000002e-05, "loss": 11.4372, "mean_token_accuracy": 0.15840516053140163, "num_tokens": 8876240.0, "step": 910 }, { "entropy": 1.8957439251244068, "epoch": 0.736, "grad_norm": 0.1201171875, "learning_rate": 1.8380000000000004e-05, "loss": 11.4042, "mean_token_accuracy": 0.16179822930134832, "num_tokens": 8977918.0, "step": 920 }, { "entropy": 1.9178364366292953, "epoch": 0.744, "grad_norm": 0.11279296875, "learning_rate": 1.858e-05, "loss": 11.3294, "mean_token_accuracy": 0.15907879420556129, "num_tokens": 9073936.0, "step": 930 }, { "entropy": 1.8959111988544464, "epoch": 0.752, "grad_norm": 0.11328125, "learning_rate": 1.878e-05, "loss": 11.5083, "mean_token_accuracy": 0.1573775229975581, "num_tokens": 9169322.0, "step": 940 }, { "entropy": 1.9043195970356463, "epoch": 0.76, "grad_norm": 0.10595703125, "learning_rate": 1.898e-05, "loss": 11.4335, "mean_token_accuracy": 0.15969336559064687, "num_tokens": 9267416.0, "step": 950 }, { "entropy": 1.8812848784029483, "epoch": 0.768, "grad_norm": 0.1044921875, "learning_rate": 1.918e-05, "loss": 11.4766, "mean_token_accuracy": 0.16011672262102367, "num_tokens": 9368141.0, "step": 960 }, { "entropy": 1.9347192905843258, "epoch": 0.776, "grad_norm": 0.1171875, "learning_rate": 1.938e-05, "loss": 11.4781, "mean_token_accuracy": 0.15778247057460248, "num_tokens": 9463691.0, "step": 970 }, { "entropy": 1.8791780702769756, "epoch": 0.784, "grad_norm": 0.11376953125, "learning_rate": 1.9580000000000002e-05, "loss": 11.3971, "mean_token_accuracy": 0.16152906292118133, "num_tokens": 9565236.0, "step": 980 }, { "entropy": 1.9060623526573182, "epoch": 0.792, "grad_norm": 0.11474609375, "learning_rate": 1.978e-05, "loss": 11.3989, "mean_token_accuracy": 0.15776223079301416, "num_tokens": 9664805.0, "step": 990 }, { "entropy": 1.9234035909175873, "epoch": 0.8, "grad_norm": 0.109375, "learning_rate": 1.9980000000000002e-05, "loss": 11.363, "mean_token_accuracy": 0.15743429348804056, "num_tokens": 9761227.0, "step": 1000 }, { "epoch": 0.8, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 9761227.0, "eval_coding_runtime": 92.8952, "eval_coding_samples_per_second": 5.382, "eval_coding_steps_per_second": 2.691, "step": 1000 }, { "epoch": 0.8, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 9761227.0, "eval_biology_runtime": 40.8879, "eval_biology_samples_per_second": 12.229, "eval_biology_steps_per_second": 6.114, "step": 1000 }, { "entropy": 1.872146301716566, "epoch": 0.808, "grad_norm": 0.11279296875, "learning_rate": 1.9980000000000002e-05, "loss": 11.3733, "mean_token_accuracy": 0.1633847408927977, "num_tokens": 9863410.0, "step": 1010 }, { "entropy": 1.9263332709670067, "epoch": 0.816, "grad_norm": 0.11669921875, "learning_rate": 1.995777777777778e-05, "loss": 11.4047, "mean_token_accuracy": 0.15647466196678578, "num_tokens": 9958727.0, "step": 1020 }, { "entropy": 1.9192464537918568, "epoch": 0.824, "grad_norm": 0.1162109375, "learning_rate": 1.9935555555555557e-05, "loss": 11.4102, "mean_token_accuracy": 0.15872471225447954, "num_tokens": 10059078.0, "step": 1030 }, { "entropy": 1.9027912005782128, "epoch": 0.832, "grad_norm": 0.1015625, "learning_rate": 1.9913333333333335e-05, "loss": 11.2532, "mean_token_accuracy": 0.16170275984331967, "num_tokens": 10155771.0, "step": 1040 }, { "entropy": 1.8838146448135376, "epoch": 0.84, "grad_norm": 0.10400390625, "learning_rate": 1.9891111111111112e-05, "loss": 11.3128, "mean_token_accuracy": 0.15962061467580496, "num_tokens": 10255927.0, "step": 1050 }, { "entropy": 1.8773360393941403, "epoch": 0.848, "grad_norm": 0.109375, "learning_rate": 1.986888888888889e-05, "loss": 11.4155, "mean_token_accuracy": 0.16165038282051683, "num_tokens": 10357721.0, "step": 1060 }, { "entropy": 1.9122937515377998, "epoch": 0.856, "grad_norm": 0.1162109375, "learning_rate": 1.9846666666666668e-05, "loss": 11.3491, "mean_token_accuracy": 0.15969276065006852, "num_tokens": 10454335.0, "step": 1070 }, { "entropy": 1.9112751238048076, "epoch": 0.864, "grad_norm": 0.10546875, "learning_rate": 1.9824444444444445e-05, "loss": 11.3171, "mean_token_accuracy": 0.1582150506321341, "num_tokens": 10552495.0, "step": 1080 }, { "entropy": 1.8869936399161815, "epoch": 0.872, "grad_norm": 0.11767578125, "learning_rate": 1.9802222222222226e-05, "loss": 11.3723, "mean_token_accuracy": 0.159729657554999, "num_tokens": 10651501.0, "step": 1090 }, { "entropy": 1.9142157301306724, "epoch": 0.88, "grad_norm": 0.11083984375, "learning_rate": 1.978e-05, "loss": 11.4287, "mean_token_accuracy": 0.15900661037303507, "num_tokens": 10748749.0, "step": 1100 }, { "epoch": 0.88, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 10748749.0, "eval_coding_runtime": 92.8483, "eval_coding_samples_per_second": 5.385, "eval_coding_steps_per_second": 2.693, "step": 1100 }, { "epoch": 0.88, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 10748749.0, "eval_biology_runtime": 40.7088, "eval_biology_samples_per_second": 12.282, "eval_biology_steps_per_second": 6.141, "step": 1100 }, { "entropy": 1.8793569900095464, "epoch": 0.888, "grad_norm": 0.1123046875, "learning_rate": 1.975777777777778e-05, "loss": 11.5877, "mean_token_accuracy": 0.157168534072116, "num_tokens": 10852932.0, "step": 1110 }, { "entropy": 1.9415382850915193, "epoch": 0.896, "grad_norm": 0.10546875, "learning_rate": 1.9735555555555556e-05, "loss": 11.3746, "mean_token_accuracy": 0.15685343989171088, "num_tokens": 10943319.0, "step": 1120 }, { "entropy": 1.928680495172739, "epoch": 0.904, "grad_norm": 0.1083984375, "learning_rate": 1.9713333333333337e-05, "loss": 11.3322, "mean_token_accuracy": 0.15655601797625424, "num_tokens": 11037833.0, "step": 1130 }, { "entropy": 1.8811852902173996, "epoch": 0.912, "grad_norm": 0.11865234375, "learning_rate": 1.969111111111111e-05, "loss": 11.3788, "mean_token_accuracy": 0.15932427351363004, "num_tokens": 11136935.0, "step": 1140 }, { "entropy": 1.9035414353013038, "epoch": 0.92, "grad_norm": 0.10107421875, "learning_rate": 1.9668888888888892e-05, "loss": 11.4397, "mean_token_accuracy": 0.15855442080646753, "num_tokens": 11238002.0, "step": 1150 }, { "entropy": 1.9215666435658931, "epoch": 0.928, "grad_norm": 0.1103515625, "learning_rate": 1.9646666666666666e-05, "loss": 11.3836, "mean_token_accuracy": 0.15947869322262703, "num_tokens": 11331098.0, "step": 1160 }, { "entropy": 1.8665415190160275, "epoch": 0.936, "grad_norm": 0.11181640625, "learning_rate": 1.9624444444444447e-05, "loss": 11.3554, "mean_token_accuracy": 0.163870263681747, "num_tokens": 11435812.0, "step": 1170 }, { "entropy": 1.9265005595982074, "epoch": 0.944, "grad_norm": 0.11181640625, "learning_rate": 1.9602222222222225e-05, "loss": 11.3103, "mean_token_accuracy": 0.15953028812073172, "num_tokens": 11530550.0, "step": 1180 }, { "entropy": 1.9132369719445705, "epoch": 0.952, "grad_norm": 0.10986328125, "learning_rate": 1.9580000000000002e-05, "loss": 11.2875, "mean_token_accuracy": 0.16013024668209255, "num_tokens": 11629080.0, "step": 1190 }, { "entropy": 1.9029541954398155, "epoch": 0.96, "grad_norm": 0.10595703125, "learning_rate": 1.955777777777778e-05, "loss": 11.5258, "mean_token_accuracy": 0.15739277489483355, "num_tokens": 11729645.0, "step": 1200 }, { "epoch": 0.96, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 11729645.0, "eval_coding_runtime": 92.7369, "eval_coding_samples_per_second": 5.392, "eval_coding_steps_per_second": 2.696, "step": 1200 }, { "epoch": 0.96, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 11729645.0, "eval_biology_runtime": 40.7517, "eval_biology_samples_per_second": 12.269, "eval_biology_steps_per_second": 6.135, "step": 1200 }, { "entropy": 1.924806034564972, "epoch": 0.968, "grad_norm": 0.111328125, "learning_rate": 1.9535555555555557e-05, "loss": 11.3332, "mean_token_accuracy": 0.1609709003008902, "num_tokens": 11823767.0, "step": 1210 }, { "entropy": 1.9001309793442487, "epoch": 0.976, "grad_norm": 0.1083984375, "learning_rate": 1.9513333333333335e-05, "loss": 11.3968, "mean_token_accuracy": 0.16173409800976515, "num_tokens": 11924644.0, "step": 1220 }, { "entropy": 1.926213839650154, "epoch": 0.984, "grad_norm": 0.11181640625, "learning_rate": 1.9491111111111113e-05, "loss": 11.2886, "mean_token_accuracy": 0.15953713809140027, "num_tokens": 12022090.0, "step": 1230 }, { "entropy": 1.896909598261118, "epoch": 0.992, "grad_norm": 0.11083984375, "learning_rate": 1.946888888888889e-05, "loss": 11.44, "mean_token_accuracy": 0.15784551627002658, "num_tokens": 12123059.0, "step": 1240 }, { "entropy": 1.9211212292313575, "epoch": 1.0, "grad_norm": 0.1142578125, "learning_rate": 1.9446666666666668e-05, "loss": 11.3871, "mean_token_accuracy": 0.15915677635930478, "num_tokens": 12221291.0, "step": 1250 }, { "entropy": 1.9072089441120625, "epoch": 1.008, "grad_norm": 0.1171875, "learning_rate": 1.9424444444444446e-05, "loss": 11.3831, "mean_token_accuracy": 0.1602979816030711, "num_tokens": 12319366.0, "step": 1260 }, { "entropy": 1.8810669101774693, "epoch": 1.016, "grad_norm": 0.11328125, "learning_rate": 1.9402222222222223e-05, "loss": 11.3728, "mean_token_accuracy": 0.1634572588838637, "num_tokens": 12422323.0, "step": 1270 }, { "entropy": 1.8920352309942245, "epoch": 1.024, "grad_norm": 0.10986328125, "learning_rate": 1.938e-05, "loss": 11.4291, "mean_token_accuracy": 0.1560253918170929, "num_tokens": 12524183.0, "step": 1280 }, { "entropy": 1.9242343619465827, "epoch": 1.032, "grad_norm": 0.11083984375, "learning_rate": 1.935777777777778e-05, "loss": 11.4591, "mean_token_accuracy": 0.15780404200777412, "num_tokens": 12621565.0, "step": 1290 }, { "entropy": 1.9213342539966107, "epoch": 1.04, "grad_norm": 0.1103515625, "learning_rate": 1.9335555555555556e-05, "loss": 11.4174, "mean_token_accuracy": 0.15637533548288046, "num_tokens": 12718593.0, "step": 1300 }, { "epoch": 1.04, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 12718593.0, "eval_coding_runtime": 92.7616, "eval_coding_samples_per_second": 5.39, "eval_coding_steps_per_second": 2.695, "step": 1300 }, { "epoch": 1.04, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 12718593.0, "eval_biology_runtime": 40.7637, "eval_biology_samples_per_second": 12.266, "eval_biology_steps_per_second": 6.133, "step": 1300 }, { "entropy": 1.8875514529645443, "epoch": 1.048, "grad_norm": 0.11376953125, "learning_rate": 1.9313333333333334e-05, "loss": 11.3486, "mean_token_accuracy": 0.1649646230507642, "num_tokens": 12817611.0, "step": 1310 }, { "entropy": 1.884049878269434, "epoch": 1.056, "grad_norm": 0.1171875, "learning_rate": 1.9291111111111115e-05, "loss": 11.3444, "mean_token_accuracy": 0.16324606756679713, "num_tokens": 12917803.0, "step": 1320 }, { "entropy": 1.897967816144228, "epoch": 1.064, "grad_norm": 0.11767578125, "learning_rate": 1.926888888888889e-05, "loss": 11.4133, "mean_token_accuracy": 0.1546560618560761, "num_tokens": 13013009.0, "step": 1330 }, { "entropy": 1.9011694863438606, "epoch": 1.072, "grad_norm": 0.103515625, "learning_rate": 1.924666666666667e-05, "loss": 11.4129, "mean_token_accuracy": 0.1572916334029287, "num_tokens": 13105826.0, "step": 1340 }, { "entropy": 1.9191783390939237, "epoch": 1.08, "grad_norm": 0.1103515625, "learning_rate": 1.9224444444444444e-05, "loss": 11.5042, "mean_token_accuracy": 0.15683019561693073, "num_tokens": 13201601.0, "step": 1350 }, { "entropy": 1.9041197896003723, "epoch": 1.088, "grad_norm": 0.10693359375, "learning_rate": 1.9202222222222225e-05, "loss": 11.3016, "mean_token_accuracy": 0.16032634493894876, "num_tokens": 13298619.0, "step": 1360 }, { "entropy": 1.9777653701603413, "epoch": 1.096, "grad_norm": 0.10546875, "learning_rate": 1.918e-05, "loss": 11.4078, "mean_token_accuracy": 0.1558103010058403, "num_tokens": 13390496.0, "step": 1370 }, { "entropy": 1.8881109297275542, "epoch": 1.104, "grad_norm": 0.1083984375, "learning_rate": 1.915777777777778e-05, "loss": 11.4555, "mean_token_accuracy": 0.1594717210624367, "num_tokens": 13491486.0, "step": 1380 }, { "entropy": 1.9563044786453248, "epoch": 1.112, "grad_norm": 0.11865234375, "learning_rate": 1.9135555555555555e-05, "loss": 11.3942, "mean_token_accuracy": 0.15433563021942973, "num_tokens": 13580102.0, "step": 1390 }, { "entropy": 1.9426785811781884, "epoch": 1.12, "grad_norm": 0.10302734375, "learning_rate": 1.9113333333333336e-05, "loss": 11.3608, "mean_token_accuracy": 0.15766524169594048, "num_tokens": 13674663.0, "step": 1400 }, { "epoch": 1.12, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 13674663.0, "eval_coding_runtime": 92.7614, "eval_coding_samples_per_second": 5.39, "eval_coding_steps_per_second": 2.695, "step": 1400 }, { "epoch": 1.12, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 13674663.0, "eval_biology_runtime": 40.7448, "eval_biology_samples_per_second": 12.271, "eval_biology_steps_per_second": 6.136, "step": 1400 }, { "entropy": 1.8810144498944283, "epoch": 1.1280000000000001, "grad_norm": 0.10693359375, "learning_rate": 1.9091111111111113e-05, "loss": 11.4009, "mean_token_accuracy": 0.15898247696459294, "num_tokens": 13774234.0, "step": 1410 }, { "entropy": 1.9516346745193005, "epoch": 1.1360000000000001, "grad_norm": 0.10888671875, "learning_rate": 1.906888888888889e-05, "loss": 11.3674, "mean_token_accuracy": 0.1572843327652663, "num_tokens": 13869134.0, "step": 1420 }, { "entropy": 1.9005931705236434, "epoch": 1.144, "grad_norm": 0.123046875, "learning_rate": 1.904666666666667e-05, "loss": 11.4935, "mean_token_accuracy": 0.15715239457786084, "num_tokens": 13973498.0, "step": 1430 }, { "entropy": 1.863999791443348, "epoch": 1.152, "grad_norm": 0.11083984375, "learning_rate": 1.9024444444444446e-05, "loss": 11.3597, "mean_token_accuracy": 0.16133098863065243, "num_tokens": 14078365.0, "step": 1440 }, { "entropy": 1.9424147799611091, "epoch": 1.16, "grad_norm": 0.09765625, "learning_rate": 1.9002222222222224e-05, "loss": 11.3612, "mean_token_accuracy": 0.15900293425656856, "num_tokens": 14171486.0, "step": 1450 }, { "entropy": 1.9269852563738823, "epoch": 1.168, "grad_norm": 0.1123046875, "learning_rate": 1.898e-05, "loss": 11.4704, "mean_token_accuracy": 0.15463799880817533, "num_tokens": 14266831.0, "step": 1460 }, { "entropy": 1.9132212005555629, "epoch": 1.176, "grad_norm": 0.11669921875, "learning_rate": 1.895777777777778e-05, "loss": 11.3853, "mean_token_accuracy": 0.16369018028490245, "num_tokens": 14370559.0, "step": 1470 }, { "entropy": 1.9134477652609347, "epoch": 1.184, "grad_norm": 0.11279296875, "learning_rate": 1.8935555555555556e-05, "loss": 11.4252, "mean_token_accuracy": 0.1567579856608063, "num_tokens": 14465660.0, "step": 1480 }, { "entropy": 1.9147060811519623, "epoch": 1.192, "grad_norm": 0.10546875, "learning_rate": 1.8913333333333334e-05, "loss": 11.3604, "mean_token_accuracy": 0.15956381279975176, "num_tokens": 14559282.0, "step": 1490 }, { "entropy": 1.9207824341952802, "epoch": 1.2, "grad_norm": 0.11083984375, "learning_rate": 1.8891111111111115e-05, "loss": 11.2929, "mean_token_accuracy": 0.15796949863433837, "num_tokens": 14653228.0, "step": 1500 }, { "epoch": 1.2, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 14653228.0, "eval_coding_runtime": 93.1044, "eval_coding_samples_per_second": 5.37, "eval_coding_steps_per_second": 2.685, "step": 1500 }, { "epoch": 1.2, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 14653228.0, "eval_biology_runtime": 40.7171, "eval_biology_samples_per_second": 12.28, "eval_biology_steps_per_second": 6.14, "step": 1500 }, { "entropy": 1.8704286485910415, "epoch": 1.208, "grad_norm": 0.111328125, "learning_rate": 1.886888888888889e-05, "loss": 11.4975, "mean_token_accuracy": 0.16043112352490424, "num_tokens": 14756972.0, "step": 1510 }, { "entropy": 1.9115773357450963, "epoch": 1.216, "grad_norm": 0.10888671875, "learning_rate": 1.884666666666667e-05, "loss": 11.4561, "mean_token_accuracy": 0.15903587290085852, "num_tokens": 14857782.0, "step": 1520 }, { "entropy": 1.9327999822795392, "epoch": 1.224, "grad_norm": 0.10595703125, "learning_rate": 1.8824444444444445e-05, "loss": 11.3866, "mean_token_accuracy": 0.15627011358737947, "num_tokens": 14950496.0, "step": 1530 }, { "entropy": 1.8955214098095894, "epoch": 1.232, "grad_norm": 0.1220703125, "learning_rate": 1.8802222222222226e-05, "loss": 11.3568, "mean_token_accuracy": 0.15990630425512792, "num_tokens": 15047356.0, "step": 1540 }, { "entropy": 1.9376576729118824, "epoch": 1.24, "grad_norm": 0.109375, "learning_rate": 1.878e-05, "loss": 11.3893, "mean_token_accuracy": 0.1599413577467203, "num_tokens": 15142592.0, "step": 1550 }, { "entropy": 1.9254475340247155, "epoch": 1.248, "grad_norm": 0.1162109375, "learning_rate": 1.875777777777778e-05, "loss": 11.4126, "mean_token_accuracy": 0.1606173425912857, "num_tokens": 15241098.0, "step": 1560 }, { "entropy": 1.9526181869208812, "epoch": 1.256, "grad_norm": 0.10546875, "learning_rate": 1.873555555555556e-05, "loss": 11.2484, "mean_token_accuracy": 0.15649163788184522, "num_tokens": 15335851.0, "step": 1570 }, { "entropy": 1.8829004153609277, "epoch": 1.264, "grad_norm": 0.10546875, "learning_rate": 1.8713333333333336e-05, "loss": 11.4768, "mean_token_accuracy": 0.15737718129530548, "num_tokens": 15437657.0, "step": 1580 }, { "entropy": 1.9331139042973517, "epoch": 1.272, "grad_norm": 0.11328125, "learning_rate": 1.8691111111111114e-05, "loss": 11.2952, "mean_token_accuracy": 0.15551234567537903, "num_tokens": 15530753.0, "step": 1590 }, { "entropy": 1.9040869876742363, "epoch": 1.28, "grad_norm": 0.115234375, "learning_rate": 1.866888888888889e-05, "loss": 11.3958, "mean_token_accuracy": 0.16075886362232267, "num_tokens": 15630795.0, "step": 1600 }, { "epoch": 1.28, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 15630795.0, "eval_coding_runtime": 92.7724, "eval_coding_samples_per_second": 5.39, "eval_coding_steps_per_second": 2.695, "step": 1600 }, { "epoch": 1.28, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 15630795.0, "eval_biology_runtime": 41.1103, "eval_biology_samples_per_second": 12.162, "eval_biology_steps_per_second": 6.081, "step": 1600 }, { "entropy": 1.949748583137989, "epoch": 1.288, "grad_norm": 0.1015625, "learning_rate": 1.864666666666667e-05, "loss": 11.3105, "mean_token_accuracy": 0.15701372702606023, "num_tokens": 15725428.0, "step": 1610 }, { "entropy": 1.9060676544904709, "epoch": 1.296, "grad_norm": 0.111328125, "learning_rate": 1.8624444444444446e-05, "loss": 11.4172, "mean_token_accuracy": 0.15982052269391717, "num_tokens": 15827105.0, "step": 1620 }, { "entropy": 1.9177203148603439, "epoch": 1.304, "grad_norm": 0.123046875, "learning_rate": 1.8602222222222224e-05, "loss": 11.5038, "mean_token_accuracy": 0.15568662905134262, "num_tokens": 15923902.0, "step": 1630 }, { "entropy": 1.9236695282161236, "epoch": 1.312, "grad_norm": 0.10791015625, "learning_rate": 1.858e-05, "loss": 11.3814, "mean_token_accuracy": 0.15766759933903812, "num_tokens": 16019645.0, "step": 1640 }, { "entropy": 1.9366594329476357, "epoch": 1.32, "grad_norm": 0.111328125, "learning_rate": 1.855777777777778e-05, "loss": 11.4536, "mean_token_accuracy": 0.15385150196962058, "num_tokens": 16115487.0, "step": 1650 }, { "entropy": 1.883201529085636, "epoch": 1.328, "grad_norm": 0.1123046875, "learning_rate": 1.8535555555555557e-05, "loss": 11.4721, "mean_token_accuracy": 0.15817170958034693, "num_tokens": 16221726.0, "step": 1660 }, { "entropy": 1.864897334575653, "epoch": 1.336, "grad_norm": 0.11572265625, "learning_rate": 1.8513333333333335e-05, "loss": 11.3423, "mean_token_accuracy": 0.1640611401759088, "num_tokens": 16322791.0, "step": 1670 }, { "entropy": 1.9057570204138756, "epoch": 1.3439999999999999, "grad_norm": 0.1064453125, "learning_rate": 1.8491111111111112e-05, "loss": 11.6343, "mean_token_accuracy": 0.15843453146517278, "num_tokens": 16427594.0, "step": 1680 }, { "entropy": 1.8997819542884826, "epoch": 1.3519999999999999, "grad_norm": 0.10986328125, "learning_rate": 1.846888888888889e-05, "loss": 11.4734, "mean_token_accuracy": 0.15844294875860215, "num_tokens": 16527485.0, "step": 1690 }, { "entropy": 1.9108400709927083, "epoch": 1.3599999999999999, "grad_norm": 0.1083984375, "learning_rate": 1.8446666666666667e-05, "loss": 11.3154, "mean_token_accuracy": 0.15963923330418767, "num_tokens": 16621605.0, "step": 1700 }, { "epoch": 1.3599999999999999, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 16621605.0, "eval_coding_runtime": 92.75, "eval_coding_samples_per_second": 5.391, "eval_coding_steps_per_second": 2.695, "step": 1700 }, { "epoch": 1.3599999999999999, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 16621605.0, "eval_biology_runtime": 40.7292, "eval_biology_samples_per_second": 12.276, "eval_biology_steps_per_second": 6.138, "step": 1700 }, { "entropy": 1.9072483718395232, "epoch": 1.3679999999999999, "grad_norm": 0.11083984375, "learning_rate": 1.842444444444445e-05, "loss": 11.3111, "mean_token_accuracy": 0.1618227817583829, "num_tokens": 16719005.0, "step": 1710 }, { "entropy": 1.9344917692244052, "epoch": 1.376, "grad_norm": 0.1005859375, "learning_rate": 1.8402222222222223e-05, "loss": 11.2864, "mean_token_accuracy": 0.15792911853641273, "num_tokens": 16813444.0, "step": 1720 }, { "entropy": 1.9180502645671367, "epoch": 1.384, "grad_norm": 0.111328125, "learning_rate": 1.8380000000000004e-05, "loss": 11.4819, "mean_token_accuracy": 0.15618215668946506, "num_tokens": 16908557.0, "step": 1730 }, { "entropy": 1.908074852079153, "epoch": 1.392, "grad_norm": 0.1181640625, "learning_rate": 1.8357777777777778e-05, "loss": 11.412, "mean_token_accuracy": 0.1577038482762873, "num_tokens": 17006509.0, "step": 1740 }, { "entropy": 1.9261491410434246, "epoch": 1.4, "grad_norm": 0.10205078125, "learning_rate": 1.833555555555556e-05, "loss": 11.407, "mean_token_accuracy": 0.15843819421716035, "num_tokens": 17097916.0, "step": 1750 }, { "entropy": 1.901807364076376, "epoch": 1.408, "grad_norm": 0.10888671875, "learning_rate": 1.8313333333333333e-05, "loss": 11.4551, "mean_token_accuracy": 0.1580773525405675, "num_tokens": 17197870.0, "step": 1760 }, { "entropy": 1.9068179935216905, "epoch": 1.416, "grad_norm": 0.1005859375, "learning_rate": 1.8291111111111114e-05, "loss": 11.4565, "mean_token_accuracy": 0.1564626230392605, "num_tokens": 17296950.0, "step": 1770 }, { "entropy": 1.9137524850666523, "epoch": 1.424, "grad_norm": 0.10546875, "learning_rate": 1.8268888888888888e-05, "loss": 11.319, "mean_token_accuracy": 0.1580805520527065, "num_tokens": 17394390.0, "step": 1780 }, { "entropy": 1.86898348107934, "epoch": 1.432, "grad_norm": 0.1142578125, "learning_rate": 1.824666666666667e-05, "loss": 11.3352, "mean_token_accuracy": 0.16185315563343466, "num_tokens": 17498287.0, "step": 1790 }, { "entropy": 1.9457152262330055, "epoch": 1.44, "grad_norm": 0.0986328125, "learning_rate": 1.8224444444444447e-05, "loss": 11.5085, "mean_token_accuracy": 0.1535401228815317, "num_tokens": 17587777.0, "step": 1800 }, { "epoch": 1.44, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 17587777.0, "eval_coding_runtime": 92.8619, "eval_coding_samples_per_second": 5.384, "eval_coding_steps_per_second": 2.692, "step": 1800 }, { "epoch": 1.44, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 17587777.0, "eval_biology_runtime": 40.7335, "eval_biology_samples_per_second": 12.275, "eval_biology_steps_per_second": 6.137, "step": 1800 }, { "entropy": 1.902113003283739, "epoch": 1.448, "grad_norm": 0.103515625, "learning_rate": 1.8202222222222225e-05, "loss": 11.4753, "mean_token_accuracy": 0.15963359172455965, "num_tokens": 17688697.0, "step": 1810 }, { "entropy": 1.9020785458385945, "epoch": 1.456, "grad_norm": 0.1103515625, "learning_rate": 1.8180000000000002e-05, "loss": 11.361, "mean_token_accuracy": 0.15817707288078964, "num_tokens": 17788456.0, "step": 1820 }, { "entropy": 1.9290625780820847, "epoch": 1.464, "grad_norm": 0.10546875, "learning_rate": 1.815777777777778e-05, "loss": 11.3487, "mean_token_accuracy": 0.1603408372029662, "num_tokens": 17882731.0, "step": 1830 }, { "entropy": 1.9105426162481307, "epoch": 1.472, "grad_norm": 0.111328125, "learning_rate": 1.8135555555555557e-05, "loss": 11.511, "mean_token_accuracy": 0.1596439859829843, "num_tokens": 17984063.0, "step": 1840 }, { "entropy": 1.9299705654382706, "epoch": 1.48, "grad_norm": 0.11669921875, "learning_rate": 1.8113333333333335e-05, "loss": 11.3816, "mean_token_accuracy": 0.15798658230341972, "num_tokens": 18080829.0, "step": 1850 }, { "entropy": 1.9229087643325329, "epoch": 1.488, "grad_norm": 0.11181640625, "learning_rate": 1.8091111111111113e-05, "loss": 11.5753, "mean_token_accuracy": 0.15435658004134895, "num_tokens": 18175640.0, "step": 1860 }, { "entropy": 1.937994058430195, "epoch": 1.496, "grad_norm": 0.11181640625, "learning_rate": 1.806888888888889e-05, "loss": 11.4177, "mean_token_accuracy": 0.15660293828696012, "num_tokens": 18268965.0, "step": 1870 }, { "entropy": 1.9269187480211258, "epoch": 1.504, "grad_norm": 0.10595703125, "learning_rate": 1.8046666666666668e-05, "loss": 11.4106, "mean_token_accuracy": 0.15928622940555215, "num_tokens": 18367857.0, "step": 1880 }, { "entropy": 1.92791308760643, "epoch": 1.512, "grad_norm": 0.1064453125, "learning_rate": 1.8024444444444445e-05, "loss": 11.4263, "mean_token_accuracy": 0.15667357984930277, "num_tokens": 18468841.0, "step": 1890 }, { "entropy": 1.9114290274679662, "epoch": 1.52, "grad_norm": 0.11376953125, "learning_rate": 1.8002222222222223e-05, "loss": 11.5163, "mean_token_accuracy": 0.15518842511810363, "num_tokens": 18569146.0, "step": 1900 }, { "epoch": 1.52, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 18569146.0, "eval_coding_runtime": 92.7725, "eval_coding_samples_per_second": 5.39, "eval_coding_steps_per_second": 2.695, "step": 1900 }, { "epoch": 1.52, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 18569146.0, "eval_biology_runtime": 40.763, "eval_biology_samples_per_second": 12.266, "eval_biology_steps_per_second": 6.133, "step": 1900 }, { "entropy": 1.9271638520061969, "epoch": 1.528, "grad_norm": 0.10546875, "learning_rate": 1.798e-05, "loss": 11.3061, "mean_token_accuracy": 0.15814642114564775, "num_tokens": 18661379.0, "step": 1910 }, { "entropy": 1.9256254382431508, "epoch": 1.536, "grad_norm": 0.11572265625, "learning_rate": 1.7957777777777778e-05, "loss": 11.3743, "mean_token_accuracy": 0.1560321016702801, "num_tokens": 18755079.0, "step": 1920 }, { "entropy": 1.8881871685385705, "epoch": 1.544, "grad_norm": 0.1103515625, "learning_rate": 1.7935555555555556e-05, "loss": 11.367, "mean_token_accuracy": 0.16265541026368738, "num_tokens": 18855275.0, "step": 1930 }, { "entropy": 1.9023422569036483, "epoch": 1.552, "grad_norm": 0.1142578125, "learning_rate": 1.7913333333333337e-05, "loss": 11.4127, "mean_token_accuracy": 0.15928544756025076, "num_tokens": 18956248.0, "step": 1940 }, { "entropy": 1.8970495253801345, "epoch": 1.56, "grad_norm": 0.107421875, "learning_rate": 1.789111111111111e-05, "loss": 11.3361, "mean_token_accuracy": 0.1625245711300522, "num_tokens": 19051957.0, "step": 1950 }, { "entropy": 1.9093870118260383, "epoch": 1.568, "grad_norm": 0.111328125, "learning_rate": 1.7868888888888892e-05, "loss": 11.4793, "mean_token_accuracy": 0.1557660404127091, "num_tokens": 19150315.0, "step": 1960 }, { "entropy": 1.8911843553185463, "epoch": 1.576, "grad_norm": 0.1044921875, "learning_rate": 1.7846666666666666e-05, "loss": 11.3024, "mean_token_accuracy": 0.1613072390668094, "num_tokens": 19248041.0, "step": 1970 }, { "entropy": 1.8826511427760124, "epoch": 1.584, "grad_norm": 0.10986328125, "learning_rate": 1.7824444444444447e-05, "loss": 11.2043, "mean_token_accuracy": 0.16279208194464445, "num_tokens": 19344260.0, "step": 1980 }, { "entropy": 1.9475473545491695, "epoch": 1.592, "grad_norm": 0.1044921875, "learning_rate": 1.780222222222222e-05, "loss": 11.4873, "mean_token_accuracy": 0.15342508563771845, "num_tokens": 19436040.0, "step": 1990 }, { "entropy": 1.9466259226202964, "epoch": 1.6, "grad_norm": 0.10986328125, "learning_rate": 1.7780000000000003e-05, "loss": 11.5525, "mean_token_accuracy": 0.15283841700293124, "num_tokens": 19532552.0, "step": 2000 }, { "epoch": 1.6, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 19532552.0, "eval_coding_runtime": 92.7885, "eval_coding_samples_per_second": 5.389, "eval_coding_steps_per_second": 2.694, "step": 2000 }, { "epoch": 1.6, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 19532552.0, "eval_biology_runtime": 41.0205, "eval_biology_samples_per_second": 12.189, "eval_biology_steps_per_second": 6.095, "step": 2000 }, { "entropy": 1.9087241023778916, "epoch": 1.608, "grad_norm": 0.1123046875, "learning_rate": 1.7757777777777777e-05, "loss": 11.358, "mean_token_accuracy": 0.15919128167442978, "num_tokens": 19631506.0, "step": 2010 }, { "entropy": 1.8858548194169997, "epoch": 1.616, "grad_norm": 0.1162109375, "learning_rate": 1.7735555555555558e-05, "loss": 11.4741, "mean_token_accuracy": 0.16057525095529854, "num_tokens": 19732719.0, "step": 2020 }, { "entropy": 1.8991656713187695, "epoch": 1.624, "grad_norm": 0.111328125, "learning_rate": 1.7713333333333335e-05, "loss": 11.452, "mean_token_accuracy": 0.15742587419226767, "num_tokens": 19834441.0, "step": 2030 }, { "entropy": 1.9366624042391778, "epoch": 1.6320000000000001, "grad_norm": 0.11376953125, "learning_rate": 1.7691111111111113e-05, "loss": 11.3964, "mean_token_accuracy": 0.15863695833832026, "num_tokens": 19926830.0, "step": 2040 }, { "entropy": 1.9385347999632359, "epoch": 1.6400000000000001, "grad_norm": 0.1044921875, "learning_rate": 1.766888888888889e-05, "loss": 11.3855, "mean_token_accuracy": 0.15577454604208468, "num_tokens": 20017708.0, "step": 2050 }, { "entropy": 1.8880144242197274, "epoch": 1.6480000000000001, "grad_norm": 0.115234375, "learning_rate": 1.7646666666666668e-05, "loss": 11.427, "mean_token_accuracy": 0.15919531048275531, "num_tokens": 20118800.0, "step": 2060 }, { "entropy": 1.915343378484249, "epoch": 1.6560000000000001, "grad_norm": 0.11376953125, "learning_rate": 1.7624444444444446e-05, "loss": 11.4617, "mean_token_accuracy": 0.1596326429862529, "num_tokens": 20219215.0, "step": 2070 }, { "entropy": 1.8944584995508194, "epoch": 1.6640000000000001, "grad_norm": 0.11328125, "learning_rate": 1.7602222222222223e-05, "loss": 11.3653, "mean_token_accuracy": 0.1606519581284374, "num_tokens": 20320511.0, "step": 2080 }, { "entropy": 1.9216352105140686, "epoch": 1.6720000000000002, "grad_norm": 0.107421875, "learning_rate": 1.758e-05, "loss": 11.3123, "mean_token_accuracy": 0.1593463249038905, "num_tokens": 20414832.0, "step": 2090 }, { "entropy": 1.9011931098997592, "epoch": 1.6800000000000002, "grad_norm": 0.10498046875, "learning_rate": 1.755777777777778e-05, "loss": 11.4019, "mean_token_accuracy": 0.1574652241077274, "num_tokens": 20513393.0, "step": 2100 }, { "epoch": 1.6800000000000002, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 20513393.0, "eval_coding_runtime": 94.0839, "eval_coding_samples_per_second": 5.314, "eval_coding_steps_per_second": 2.657, "step": 2100 }, { "epoch": 1.6800000000000002, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 20513393.0, "eval_biology_runtime": 40.6997, "eval_biology_samples_per_second": 12.285, "eval_biology_steps_per_second": 6.143, "step": 2100 }, { "entropy": 1.9457669362425805, "epoch": 1.688, "grad_norm": 0.10595703125, "learning_rate": 1.7535555555555556e-05, "loss": 11.4225, "mean_token_accuracy": 0.15737789403647184, "num_tokens": 20604938.0, "step": 2110 }, { "entropy": 1.9023445054888726, "epoch": 1.696, "grad_norm": 0.10107421875, "learning_rate": 1.7513333333333334e-05, "loss": 11.3753, "mean_token_accuracy": 0.16211591372266412, "num_tokens": 20707237.0, "step": 2120 }, { "entropy": 1.8882552206516265, "epoch": 1.704, "grad_norm": 0.103515625, "learning_rate": 1.749111111111111e-05, "loss": 11.4613, "mean_token_accuracy": 0.15875514289364218, "num_tokens": 20812088.0, "step": 2130 }, { "entropy": 1.8843944430351258, "epoch": 1.712, "grad_norm": 0.11083984375, "learning_rate": 1.746888888888889e-05, "loss": 11.4171, "mean_token_accuracy": 0.15933335497975348, "num_tokens": 20910419.0, "step": 2140 }, { "entropy": 1.9196000188589095, "epoch": 1.72, "grad_norm": 0.11572265625, "learning_rate": 1.7446666666666667e-05, "loss": 11.4545, "mean_token_accuracy": 0.1565834353212267, "num_tokens": 21009172.0, "step": 2150 }, { "entropy": 1.8814144264906645, "epoch": 1.728, "grad_norm": 0.1103515625, "learning_rate": 1.7424444444444444e-05, "loss": 11.4253, "mean_token_accuracy": 0.1586998355574906, "num_tokens": 21107498.0, "step": 2160 }, { "entropy": 1.9101394057273864, "epoch": 1.736, "grad_norm": 0.1044921875, "learning_rate": 1.7402222222222222e-05, "loss": 11.515, "mean_token_accuracy": 0.1576042068656534, "num_tokens": 21204959.0, "step": 2170 }, { "entropy": 1.8962420254945755, "epoch": 1.744, "grad_norm": 0.1083984375, "learning_rate": 1.7380000000000003e-05, "loss": 11.2589, "mean_token_accuracy": 0.16131992358714342, "num_tokens": 21303955.0, "step": 2180 }, { "entropy": 1.8812646329402924, "epoch": 1.752, "grad_norm": 0.1123046875, "learning_rate": 1.735777777777778e-05, "loss": 11.422, "mean_token_accuracy": 0.1588835945352912, "num_tokens": 21408903.0, "step": 2190 }, { "entropy": 1.9250748824328183, "epoch": 1.76, "grad_norm": 0.09716796875, "learning_rate": 1.7335555555555558e-05, "loss": 11.5098, "mean_token_accuracy": 0.1589009021408856, "num_tokens": 21499572.0, "step": 2200 }, { "epoch": 1.76, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 21499572.0, "eval_coding_runtime": 92.8416, "eval_coding_samples_per_second": 5.386, "eval_coding_steps_per_second": 2.693, "step": 2200 }, { "epoch": 1.76, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 21499572.0, "eval_biology_runtime": 40.9581, "eval_biology_samples_per_second": 12.208, "eval_biology_steps_per_second": 6.104, "step": 2200 }, { "entropy": 1.9176459647715092, "epoch": 1.768, "grad_norm": 0.107421875, "learning_rate": 1.7313333333333336e-05, "loss": 11.3785, "mean_token_accuracy": 0.1561640478670597, "num_tokens": 21596260.0, "step": 2210 }, { "entropy": 1.9001624204218388, "epoch": 1.776, "grad_norm": 0.11083984375, "learning_rate": 1.7291111111111113e-05, "loss": 11.3406, "mean_token_accuracy": 0.16171830627135933, "num_tokens": 21692804.0, "step": 2220 }, { "entropy": 1.890039400756359, "epoch": 1.784, "grad_norm": 0.11279296875, "learning_rate": 1.726888888888889e-05, "loss": 11.3238, "mean_token_accuracy": 0.16518823937512933, "num_tokens": 21796414.0, "step": 2230 }, { "entropy": 1.9122498795390128, "epoch": 1.792, "grad_norm": 0.1103515625, "learning_rate": 1.724666666666667e-05, "loss": 11.4882, "mean_token_accuracy": 0.15549946581013502, "num_tokens": 21894218.0, "step": 2240 }, { "entropy": 1.9429917126893996, "epoch": 1.8, "grad_norm": 0.1162109375, "learning_rate": 1.7224444444444446e-05, "loss": 11.3674, "mean_token_accuracy": 0.15660224789753557, "num_tokens": 21985664.0, "step": 2250 }, { "entropy": 1.8927513837814331, "epoch": 1.808, "grad_norm": 0.09814453125, "learning_rate": 1.7202222222222224e-05, "loss": 11.4481, "mean_token_accuracy": 0.15663987225852907, "num_tokens": 22082522.0, "step": 2260 }, { "entropy": 1.9026566654443742, "epoch": 1.8159999999999998, "grad_norm": 0.11181640625, "learning_rate": 1.718e-05, "loss": 11.368, "mean_token_accuracy": 0.15799217401072382, "num_tokens": 22180616.0, "step": 2270 }, { "entropy": 1.9114446617662906, "epoch": 1.8239999999999998, "grad_norm": 0.09912109375, "learning_rate": 1.715777777777778e-05, "loss": 11.3445, "mean_token_accuracy": 0.16169237615540624, "num_tokens": 22278933.0, "step": 2280 }, { "entropy": 1.9253177329897881, "epoch": 1.8319999999999999, "grad_norm": 0.11279296875, "learning_rate": 1.7135555555555557e-05, "loss": 11.5365, "mean_token_accuracy": 0.15368229574523867, "num_tokens": 22377874.0, "step": 2290 }, { "entropy": 1.9121784418821335, "epoch": 1.8399999999999999, "grad_norm": 0.1044921875, "learning_rate": 1.7113333333333334e-05, "loss": 11.3564, "mean_token_accuracy": 0.15802422454580664, "num_tokens": 22473801.0, "step": 2300 }, { "epoch": 1.8399999999999999, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 22473801.0, "eval_coding_runtime": 92.9508, "eval_coding_samples_per_second": 5.379, "eval_coding_steps_per_second": 2.69, "step": 2300 }, { "epoch": 1.8399999999999999, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 22473801.0, "eval_biology_runtime": 40.8871, "eval_biology_samples_per_second": 12.229, "eval_biology_steps_per_second": 6.114, "step": 2300 }, { "entropy": 1.8603787779808045, "epoch": 1.8479999999999999, "grad_norm": 0.11669921875, "learning_rate": 1.7091111111111112e-05, "loss": 11.5278, "mean_token_accuracy": 0.15900381957180798, "num_tokens": 22582148.0, "step": 2310 }, { "entropy": 1.8850210525095463, "epoch": 1.8559999999999999, "grad_norm": 0.1103515625, "learning_rate": 1.706888888888889e-05, "loss": 11.569, "mean_token_accuracy": 0.15426975889131428, "num_tokens": 22677853.0, "step": 2320 }, { "entropy": 1.8693079382181168, "epoch": 1.8639999999999999, "grad_norm": 0.10302734375, "learning_rate": 1.704666666666667e-05, "loss": 11.4324, "mean_token_accuracy": 0.16082727410830558, "num_tokens": 22775707.0, "step": 2330 }, { "entropy": 1.9007405743002892, "epoch": 1.8719999999999999, "grad_norm": 0.10400390625, "learning_rate": 1.7024444444444445e-05, "loss": 11.3858, "mean_token_accuracy": 0.16051669786684214, "num_tokens": 22874965.0, "step": 2340 }, { "entropy": 1.9155317477881908, "epoch": 1.88, "grad_norm": 0.1220703125, "learning_rate": 1.7002222222222226e-05, "loss": 11.5316, "mean_token_accuracy": 0.15344377453438937, "num_tokens": 22972554.0, "step": 2350 }, { "entropy": 1.8995164781808853, "epoch": 1.888, "grad_norm": 0.1083984375, "learning_rate": 1.698e-05, "loss": 11.3258, "mean_token_accuracy": 0.1600531129632145, "num_tokens": 23068892.0, "step": 2360 }, { "entropy": 1.9323486879467964, "epoch": 1.896, "grad_norm": 0.10986328125, "learning_rate": 1.695777777777778e-05, "loss": 11.4639, "mean_token_accuracy": 0.1554114448837936, "num_tokens": 23168907.0, "step": 2370 }, { "entropy": 1.9529826886951924, "epoch": 1.904, "grad_norm": 0.10546875, "learning_rate": 1.6935555555555555e-05, "loss": 11.3384, "mean_token_accuracy": 0.15591429970227183, "num_tokens": 23263827.0, "step": 2380 }, { "entropy": 1.8884943507611751, "epoch": 1.912, "grad_norm": 0.10888671875, "learning_rate": 1.6913333333333336e-05, "loss": 11.3368, "mean_token_accuracy": 0.15996967633254827, "num_tokens": 23361737.0, "step": 2390 }, { "entropy": 1.9062687747180462, "epoch": 1.92, "grad_norm": 0.10791015625, "learning_rate": 1.689111111111111e-05, "loss": 11.3189, "mean_token_accuracy": 0.15876830075867474, "num_tokens": 23463627.0, "step": 2400 }, { "epoch": 1.92, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 23463627.0, "eval_coding_runtime": 93.9322, "eval_coding_samples_per_second": 5.323, "eval_coding_steps_per_second": 2.661, "step": 2400 }, { "epoch": 1.92, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 23463627.0, "eval_biology_runtime": 40.818, "eval_biology_samples_per_second": 12.25, "eval_biology_steps_per_second": 6.125, "step": 2400 }, { "entropy": 1.8954006470739841, "epoch": 1.928, "grad_norm": 0.111328125, "learning_rate": 1.686888888888889e-05, "loss": 11.3369, "mean_token_accuracy": 0.1590992364101112, "num_tokens": 23558006.0, "step": 2410 }, { "entropy": 1.8887307055294513, "epoch": 1.936, "grad_norm": 0.109375, "learning_rate": 1.684666666666667e-05, "loss": 11.4566, "mean_token_accuracy": 0.16132439165376128, "num_tokens": 23660418.0, "step": 2420 }, { "entropy": 1.845245386660099, "epoch": 1.944, "grad_norm": 0.10888671875, "learning_rate": 1.6824444444444447e-05, "loss": 11.4973, "mean_token_accuracy": 0.16263166088610886, "num_tokens": 23764848.0, "step": 2430 }, { "entropy": 1.9215773575007915, "epoch": 1.952, "grad_norm": 0.1123046875, "learning_rate": 1.6802222222222224e-05, "loss": 11.3361, "mean_token_accuracy": 0.15789743876084686, "num_tokens": 23858145.0, "step": 2440 }, { "entropy": 1.914723663777113, "epoch": 1.96, "grad_norm": 0.111328125, "learning_rate": 1.6780000000000002e-05, "loss": 11.3811, "mean_token_accuracy": 0.1588454409968108, "num_tokens": 23957580.0, "step": 2450 }, { "entropy": 1.9352627888321876, "epoch": 1.968, "grad_norm": 0.12060546875, "learning_rate": 1.675777777777778e-05, "loss": 11.6394, "mean_token_accuracy": 0.15175382704474033, "num_tokens": 24053364.0, "step": 2460 }, { "entropy": 1.9044601261615752, "epoch": 1.976, "grad_norm": 0.1171875, "learning_rate": 1.6735555555555557e-05, "loss": 11.4335, "mean_token_accuracy": 0.15687427022494377, "num_tokens": 24151503.0, "step": 2470 }, { "entropy": 1.8967606857419015, "epoch": 1.984, "grad_norm": 0.111328125, "learning_rate": 1.6713333333333335e-05, "loss": 11.3002, "mean_token_accuracy": 0.1592816713731736, "num_tokens": 24249465.0, "step": 2480 }, { "entropy": 1.8964715145528317, "epoch": 1.992, "grad_norm": 0.10400390625, "learning_rate": 1.6691111111111112e-05, "loss": 11.4641, "mean_token_accuracy": 0.15451558271888644, "num_tokens": 24346850.0, "step": 2490 }, { "entropy": 1.9223815500736237, "epoch": 2.0, "grad_norm": 0.11328125, "learning_rate": 1.666888888888889e-05, "loss": 11.4094, "mean_token_accuracy": 0.1565131512004882, "num_tokens": 24442582.0, "step": 2500 }, { "epoch": 2.0, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 24442582.0, "eval_coding_runtime": 92.8543, "eval_coding_samples_per_second": 5.385, "eval_coding_steps_per_second": 2.692, "step": 2500 }, { "epoch": 2.0, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 24442582.0, "eval_biology_runtime": 41.0571, "eval_biology_samples_per_second": 12.178, "eval_biology_steps_per_second": 6.089, "step": 2500 }, { "entropy": 1.9437901653349399, "epoch": 2.008, "grad_norm": 0.11083984375, "learning_rate": 1.6646666666666668e-05, "loss": 11.4952, "mean_token_accuracy": 0.1546648852992803, "num_tokens": 24536380.0, "step": 2510 }, { "entropy": 1.900653602182865, "epoch": 2.016, "grad_norm": 0.11083984375, "learning_rate": 1.6624444444444445e-05, "loss": 11.325, "mean_token_accuracy": 0.16003339942544698, "num_tokens": 24632353.0, "step": 2520 }, { "entropy": 1.9109860993921757, "epoch": 2.024, "grad_norm": 0.109375, "learning_rate": 1.6602222222222223e-05, "loss": 11.3807, "mean_token_accuracy": 0.15942266723141074, "num_tokens": 24728292.0, "step": 2530 }, { "entropy": 1.9284051954746246, "epoch": 2.032, "grad_norm": 0.10791015625, "learning_rate": 1.658e-05, "loss": 11.527, "mean_token_accuracy": 0.1541938280686736, "num_tokens": 24822715.0, "step": 2540 }, { "entropy": 1.9049392193555832, "epoch": 2.04, "grad_norm": 0.11279296875, "learning_rate": 1.6557777777777778e-05, "loss": 11.3616, "mean_token_accuracy": 0.16025418490171434, "num_tokens": 24922642.0, "step": 2550 }, { "entropy": 1.9019584387540818, "epoch": 2.048, "grad_norm": 0.11376953125, "learning_rate": 1.6535555555555556e-05, "loss": 11.3519, "mean_token_accuracy": 0.15873378361575305, "num_tokens": 25023118.0, "step": 2560 }, { "entropy": 1.9273467533290387, "epoch": 2.056, "grad_norm": 0.1123046875, "learning_rate": 1.6513333333333333e-05, "loss": 11.426, "mean_token_accuracy": 0.15817135088145734, "num_tokens": 25120290.0, "step": 2570 }, { "entropy": 1.9145058818161487, "epoch": 2.064, "grad_norm": 0.11328125, "learning_rate": 1.6491111111111114e-05, "loss": 11.5601, "mean_token_accuracy": 0.1541168469004333, "num_tokens": 25217414.0, "step": 2580 }, { "entropy": 1.9274385668337346, "epoch": 2.072, "grad_norm": 0.1103515625, "learning_rate": 1.646888888888889e-05, "loss": 11.3818, "mean_token_accuracy": 0.16090190247632563, "num_tokens": 25311278.0, "step": 2590 }, { "entropy": 1.9054167561233044, "epoch": 2.08, "grad_norm": 0.1025390625, "learning_rate": 1.644666666666667e-05, "loss": 11.4338, "mean_token_accuracy": 0.15598403732292354, "num_tokens": 25408961.0, "step": 2600 }, { "epoch": 2.08, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 25408961.0, "eval_coding_runtime": 93.8384, "eval_coding_samples_per_second": 5.328, "eval_coding_steps_per_second": 2.664, "step": 2600 }, { "epoch": 2.08, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 25408961.0, "eval_biology_runtime": 41.3742, "eval_biology_samples_per_second": 12.085, "eval_biology_steps_per_second": 6.042, "step": 2600 }, { "entropy": 1.8700239777565002, "epoch": 2.088, "grad_norm": 0.11181640625, "learning_rate": 1.6424444444444444e-05, "loss": 11.4265, "mean_token_accuracy": 0.1610154163558036, "num_tokens": 25508270.0, "step": 2610 }, { "entropy": 1.9474636733531951, "epoch": 2.096, "grad_norm": 0.1162109375, "learning_rate": 1.6402222222222225e-05, "loss": 11.4146, "mean_token_accuracy": 0.15509584778919816, "num_tokens": 25600511.0, "step": 2620 }, { "entropy": 1.9224482253193855, "epoch": 2.104, "grad_norm": 0.10888671875, "learning_rate": 1.638e-05, "loss": 11.4784, "mean_token_accuracy": 0.15859652827493848, "num_tokens": 25700023.0, "step": 2630 }, { "entropy": 1.934036909788847, "epoch": 2.112, "grad_norm": 0.1123046875, "learning_rate": 1.635777777777778e-05, "loss": 11.378, "mean_token_accuracy": 0.1580273206345737, "num_tokens": 25796565.0, "step": 2640 }, { "entropy": 1.9118811137974263, "epoch": 2.12, "grad_norm": 0.107421875, "learning_rate": 1.6335555555555558e-05, "loss": 11.5702, "mean_token_accuracy": 0.1561312806326896, "num_tokens": 25892925.0, "step": 2650 }, { "entropy": 1.8830873943865298, "epoch": 2.128, "grad_norm": 0.11962890625, "learning_rate": 1.6313333333333335e-05, "loss": 11.3559, "mean_token_accuracy": 0.16317159850150348, "num_tokens": 25991156.0, "step": 2660 }, { "entropy": 1.8808968737721443, "epoch": 2.136, "grad_norm": 0.103515625, "learning_rate": 1.6291111111111113e-05, "loss": 11.3396, "mean_token_accuracy": 0.1611542631406337, "num_tokens": 26095179.0, "step": 2670 }, { "entropy": 1.9176495954394341, "epoch": 2.144, "grad_norm": 0.1142578125, "learning_rate": 1.626888888888889e-05, "loss": 11.4749, "mean_token_accuracy": 0.1571306322235614, "num_tokens": 26193237.0, "step": 2680 }, { "entropy": 1.908892147988081, "epoch": 2.152, "grad_norm": 0.12255859375, "learning_rate": 1.6246666666666668e-05, "loss": 11.5167, "mean_token_accuracy": 0.15779453404247762, "num_tokens": 26291850.0, "step": 2690 }, { "entropy": 1.9191628627479076, "epoch": 2.16, "grad_norm": 0.10791015625, "learning_rate": 1.6224444444444446e-05, "loss": 11.3646, "mean_token_accuracy": 0.15869081704877316, "num_tokens": 26387993.0, "step": 2700 }, { "epoch": 2.16, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 26387993.0, "eval_coding_runtime": 93.1425, "eval_coding_samples_per_second": 5.368, "eval_coding_steps_per_second": 2.684, "step": 2700 }, { "epoch": 2.16, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 26387993.0, "eval_biology_runtime": 41.1691, "eval_biology_samples_per_second": 12.145, "eval_biology_steps_per_second": 6.073, "step": 2700 }, { "entropy": 1.9261901803314685, "epoch": 2.168, "grad_norm": 0.1015625, "learning_rate": 1.6202222222222223e-05, "loss": 11.4341, "mean_token_accuracy": 0.15784642342478036, "num_tokens": 26485327.0, "step": 2710 }, { "entropy": 1.9036023139953613, "epoch": 2.176, "grad_norm": 0.10888671875, "learning_rate": 1.618e-05, "loss": 11.5446, "mean_token_accuracy": 0.15460943533107638, "num_tokens": 26585269.0, "step": 2720 }, { "entropy": 1.899263332784176, "epoch": 2.184, "grad_norm": 0.1083984375, "learning_rate": 1.615777777777778e-05, "loss": 11.3673, "mean_token_accuracy": 0.16169481696560978, "num_tokens": 26683997.0, "step": 2730 }, { "entropy": 1.9311272650957108, "epoch": 2.192, "grad_norm": 0.115234375, "learning_rate": 1.6135555555555556e-05, "loss": 11.4495, "mean_token_accuracy": 0.15821120562031865, "num_tokens": 26776318.0, "step": 2740 }, { "entropy": 1.8850563056766987, "epoch": 2.2, "grad_norm": 0.1044921875, "learning_rate": 1.6113333333333334e-05, "loss": 11.3452, "mean_token_accuracy": 0.16115122279152275, "num_tokens": 26876705.0, "step": 2750 }, { "entropy": 1.9144244894385338, "epoch": 2.208, "grad_norm": 0.1123046875, "learning_rate": 1.609111111111111e-05, "loss": 11.5625, "mean_token_accuracy": 0.1553129477892071, "num_tokens": 26974420.0, "step": 2760 }, { "entropy": 1.9013450536876917, "epoch": 2.216, "grad_norm": 0.1103515625, "learning_rate": 1.606888888888889e-05, "loss": 11.4429, "mean_token_accuracy": 0.15538255805149676, "num_tokens": 27072242.0, "step": 2770 }, { "entropy": 1.92282452955842, "epoch": 2.224, "grad_norm": 0.1103515625, "learning_rate": 1.6046666666666667e-05, "loss": 11.5022, "mean_token_accuracy": 0.15521100857295095, "num_tokens": 27167577.0, "step": 2780 }, { "entropy": 1.8980347372591495, "epoch": 2.232, "grad_norm": 0.0966796875, "learning_rate": 1.6024444444444444e-05, "loss": 11.4875, "mean_token_accuracy": 0.1572256637737155, "num_tokens": 27267457.0, "step": 2790 }, { "entropy": 1.9151273936033248, "epoch": 2.24, "grad_norm": 0.1201171875, "learning_rate": 1.6002222222222222e-05, "loss": 11.322, "mean_token_accuracy": 0.16055175149813294, "num_tokens": 27364189.0, "step": 2800 }, { "epoch": 2.24, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 27364189.0, "eval_coding_runtime": 93.3157, "eval_coding_samples_per_second": 5.358, "eval_coding_steps_per_second": 2.679, "step": 2800 }, { "epoch": 2.24, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 27364189.0, "eval_biology_runtime": 41.9503, "eval_biology_samples_per_second": 11.919, "eval_biology_steps_per_second": 5.959, "step": 2800 }, { "entropy": 1.9232811972498893, "epoch": 2.248, "grad_norm": 0.0986328125, "learning_rate": 1.5980000000000003e-05, "loss": 11.2752, "mean_token_accuracy": 0.15789530873298646, "num_tokens": 27456862.0, "step": 2810 }, { "entropy": 1.8877685502171517, "epoch": 2.2560000000000002, "grad_norm": 0.11474609375, "learning_rate": 1.5957777777777777e-05, "loss": 11.3612, "mean_token_accuracy": 0.1628992194775492, "num_tokens": 27557387.0, "step": 2820 }, { "entropy": 1.9086260199546814, "epoch": 2.2640000000000002, "grad_norm": 0.11279296875, "learning_rate": 1.5935555555555558e-05, "loss": 11.4745, "mean_token_accuracy": 0.15700925663113594, "num_tokens": 27655988.0, "step": 2830 }, { "entropy": 1.905223260819912, "epoch": 2.2720000000000002, "grad_norm": 0.095703125, "learning_rate": 1.5913333333333332e-05, "loss": 11.4469, "mean_token_accuracy": 0.15800920827314258, "num_tokens": 27755725.0, "step": 2840 }, { "entropy": 1.9008326523005963, "epoch": 2.2800000000000002, "grad_norm": 0.1162109375, "learning_rate": 1.5891111111111113e-05, "loss": 11.525, "mean_token_accuracy": 0.15418536285869777, "num_tokens": 27854353.0, "step": 2850 }, { "entropy": 1.9335841588675975, "epoch": 2.288, "grad_norm": 0.11279296875, "learning_rate": 1.5868888888888888e-05, "loss": 11.4848, "mean_token_accuracy": 0.15718822227790952, "num_tokens": 27950694.0, "step": 2860 }, { "entropy": 1.9186846047639847, "epoch": 2.296, "grad_norm": 0.103515625, "learning_rate": 1.584666666666667e-05, "loss": 11.3358, "mean_token_accuracy": 0.16270824219100177, "num_tokens": 28048799.0, "step": 2870 }, { "entropy": 1.866267541795969, "epoch": 2.304, "grad_norm": 0.11181640625, "learning_rate": 1.5824444444444446e-05, "loss": 11.421, "mean_token_accuracy": 0.16045537383761257, "num_tokens": 28150719.0, "step": 2880 }, { "entropy": 1.9044499434530735, "epoch": 2.312, "grad_norm": 0.11328125, "learning_rate": 1.5802222222222224e-05, "loss": 11.3927, "mean_token_accuracy": 0.16035234807059168, "num_tokens": 28248071.0, "step": 2890 }, { "entropy": 1.8817835420370101, "epoch": 2.32, "grad_norm": 0.11572265625, "learning_rate": 1.578e-05, "loss": 11.4988, "mean_token_accuracy": 0.16117906668223442, "num_tokens": 28348652.0, "step": 2900 }, { "epoch": 2.32, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 28348652.0, "eval_coding_runtime": 93.0783, "eval_coding_samples_per_second": 5.372, "eval_coding_steps_per_second": 2.686, "step": 2900 }, { "epoch": 2.32, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 28348652.0, "eval_biology_runtime": 41.374, "eval_biology_samples_per_second": 12.085, "eval_biology_steps_per_second": 6.042, "step": 2900 }, { "entropy": 1.9071245066821576, "epoch": 2.328, "grad_norm": 0.10693359375, "learning_rate": 1.575777777777778e-05, "loss": 11.5137, "mean_token_accuracy": 0.15565888402052225, "num_tokens": 28446430.0, "step": 2910 }, { "entropy": 1.929621510952711, "epoch": 2.336, "grad_norm": 0.10498046875, "learning_rate": 1.5735555555555557e-05, "loss": 11.2705, "mean_token_accuracy": 0.160611807461828, "num_tokens": 28542945.0, "step": 2920 }, { "entropy": 1.9237425334751606, "epoch": 2.344, "grad_norm": 0.11328125, "learning_rate": 1.5713333333333334e-05, "loss": 11.3778, "mean_token_accuracy": 0.15708167278207838, "num_tokens": 28636050.0, "step": 2930 }, { "entropy": 1.8845984295010567, "epoch": 2.352, "grad_norm": 0.115234375, "learning_rate": 1.5691111111111112e-05, "loss": 11.3105, "mean_token_accuracy": 0.16018757806159556, "num_tokens": 28731927.0, "step": 2940 }, { "entropy": 1.9016848914325237, "epoch": 2.36, "grad_norm": 0.10888671875, "learning_rate": 1.5668888888888893e-05, "loss": 11.4814, "mean_token_accuracy": 0.15679828128777445, "num_tokens": 28829775.0, "step": 2950 }, { "entropy": 1.8923688180744649, "epoch": 2.368, "grad_norm": 0.111328125, "learning_rate": 1.5646666666666667e-05, "loss": 11.2761, "mean_token_accuracy": 0.15961556532420218, "num_tokens": 28929298.0, "step": 2960 }, { "entropy": 1.9211289800703526, "epoch": 2.376, "grad_norm": 0.1064453125, "learning_rate": 1.5624444444444448e-05, "loss": 11.3615, "mean_token_accuracy": 0.15924623790197073, "num_tokens": 29027827.0, "step": 2970 }, { "entropy": 1.9433083713054657, "epoch": 2.384, "grad_norm": 0.09912109375, "learning_rate": 1.5602222222222222e-05, "loss": 11.4113, "mean_token_accuracy": 0.15409083757549524, "num_tokens": 29121142.0, "step": 2980 }, { "entropy": 1.9047025486826896, "epoch": 2.392, "grad_norm": 0.11279296875, "learning_rate": 1.5580000000000003e-05, "loss": 11.4248, "mean_token_accuracy": 0.15844591483473777, "num_tokens": 29218220.0, "step": 2990 }, { "entropy": 1.8793051794171334, "epoch": 2.4, "grad_norm": 0.109375, "learning_rate": 1.5557777777777778e-05, "loss": 11.3359, "mean_token_accuracy": 0.16226587211713195, "num_tokens": 29317919.0, "step": 3000 }, { "epoch": 2.4, "eval_coding_entropy": 1.000721924304962, "eval_coding_loss": 8.267828941345215, "eval_coding_mean_token_accuracy": 0.2908177390098572, "eval_coding_num_tokens": 29317919.0, "eval_coding_runtime": 93.1485, "eval_coding_samples_per_second": 5.368, "eval_coding_steps_per_second": 2.684, "step": 3000 }, { "epoch": 2.4, "eval_biology_entropy": 1.8729813146591185, "eval_biology_loss": 11.396076202392578, "eval_biology_mean_token_accuracy": 0.1593981314599514, "eval_biology_num_tokens": 29317919.0, "eval_biology_runtime": 40.738, "eval_biology_samples_per_second": 12.274, "eval_biology_steps_per_second": 6.137, "step": 3000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.5227694585717586e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }