{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8, "eval_steps": 100, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.6734127338975668, "epoch": 0.004, "grad_norm": 1000.0, "learning_rate": 1.8e-07, "loss": 28.5463, "mean_token_accuracy": 0.7051723279058933, "num_tokens": 89176.0, "step": 10 }, { "entropy": 0.7554282372817397, "epoch": 0.008, "grad_norm": 848.0, "learning_rate": 3.8e-07, "loss": 30.5507, "mean_token_accuracy": 0.6821797413751483, "num_tokens": 173886.0, "step": 20 }, { "entropy": 0.7452052749693394, "epoch": 0.012, "grad_norm": 700.0, "learning_rate": 5.800000000000001e-07, "loss": 28.0843, "mean_token_accuracy": 0.6968917248770594, "num_tokens": 258970.0, "step": 30 }, { "entropy": 0.8406938400119544, "epoch": 0.016, "grad_norm": 588.0, "learning_rate": 7.8e-07, "loss": 28.3624, "mean_token_accuracy": 0.6884651094675064, "num_tokens": 338352.0, "step": 40 }, { "entropy": 0.8782084910199046, "epoch": 0.02, "grad_norm": 556.0, "learning_rate": 9.800000000000001e-07, "loss": 25.7081, "mean_token_accuracy": 0.7061796801164746, "num_tokens": 424243.0, "step": 50 }, { "entropy": 0.9611807337030769, "epoch": 0.024, "grad_norm": 456.0, "learning_rate": 1.1800000000000001e-06, "loss": 25.387, "mean_token_accuracy": 0.6989632658660412, "num_tokens": 505613.0, "step": 60 }, { "entropy": 0.9614389628171921, "epoch": 0.028, "grad_norm": 290.0, "learning_rate": 1.3800000000000001e-06, "loss": 22.9903, "mean_token_accuracy": 0.7096886133775115, "num_tokens": 584157.0, "step": 70 }, { "entropy": 0.9275373946875334, "epoch": 0.032, "grad_norm": 161.0, "learning_rate": 1.5800000000000001e-06, "loss": 20.0946, "mean_token_accuracy": 0.7300519198179245, "num_tokens": 671193.0, "step": 80 }, { "entropy": 1.0108212113380433, "epoch": 0.036, "grad_norm": 217.0, "learning_rate": 1.7800000000000001e-06, "loss": 20.6725, "mean_token_accuracy": 0.7165856244042516, "num_tokens": 753947.0, "step": 90 }, { "entropy": 0.9366745728999376, "epoch": 0.04, "grad_norm": 162.0, "learning_rate": 1.98e-06, "loss": 18.0538, "mean_token_accuracy": 0.7409927822649479, "num_tokens": 835868.0, "step": 100 }, { "epoch": 0.04, "eval_biology_entropy": 1.5465301005840302, "eval_biology_loss": 1.9649326801300049, "eval_biology_mean_token_accuracy": 0.5973824737071991, "eval_biology_num_tokens": 835868.0, "eval_biology_runtime": 35.5773, "eval_biology_samples_per_second": 14.054, "eval_biology_steps_per_second": 14.054, "step": 100 }, { "epoch": 0.04, "eval_chemistry_entropy": 1.0573365586400032, "eval_chemistry_loss": 1.2894866466522217, "eval_chemistry_mean_token_accuracy": 0.7121288641691208, "eval_chemistry_num_tokens": 835868.0, "eval_chemistry_runtime": 39.6571, "eval_chemistry_samples_per_second": 12.608, "eval_chemistry_steps_per_second": 12.608, "step": 100 }, { "epoch": 0.04, "eval_math_entropy": 0.8270023443698883, "eval_math_loss": 1.1661900281906128, "eval_math_mean_token_accuracy": 0.737815534889698, "eval_math_num_tokens": 835868.0, "eval_math_runtime": 40.5371, "eval_math_samples_per_second": 12.334, "eval_math_steps_per_second": 12.334, "step": 100 }, { "epoch": 0.04, "eval_physics_entropy": 0.9262812133431435, "eval_physics_loss": 1.1149247884750366, "eval_physics_mean_token_accuracy": 0.7413808185458183, "eval_physics_num_tokens": 835868.0, "eval_physics_runtime": 45.1205, "eval_physics_samples_per_second": 11.081, "eval_physics_steps_per_second": 11.081, "step": 100 }, { "entropy": 0.8625543506816029, "epoch": 0.044, "grad_norm": 141.0, "learning_rate": 2.1800000000000003e-06, "loss": 16.1132, "mean_token_accuracy": 0.7616209197789431, "num_tokens": 917534.0, "step": 110 }, { "entropy": 0.8609532468020916, "epoch": 0.048, "grad_norm": 142.0, "learning_rate": 2.38e-06, "loss": 15.8038, "mean_token_accuracy": 0.7642258133739233, "num_tokens": 1001008.0, "step": 120 }, { "entropy": 0.8615515833720565, "epoch": 0.052, "grad_norm": 110.0, "learning_rate": 2.5800000000000003e-06, "loss": 15.6256, "mean_token_accuracy": 0.7626194018870592, "num_tokens": 1085098.0, "step": 130 }, { "entropy": 0.9037158312276006, "epoch": 0.056, "grad_norm": 122.5, "learning_rate": 2.7800000000000005e-06, "loss": 16.4938, "mean_token_accuracy": 0.7565715182572603, "num_tokens": 1166615.0, "step": 140 }, { "entropy": 0.8950596721842885, "epoch": 0.06, "grad_norm": 134.0, "learning_rate": 2.9800000000000003e-06, "loss": 14.5001, "mean_token_accuracy": 0.7665418200194836, "num_tokens": 1248387.0, "step": 150 }, { "entropy": 0.961725745536387, "epoch": 0.064, "grad_norm": 100.0, "learning_rate": 3.1800000000000005e-06, "loss": 15.127, "mean_token_accuracy": 0.7604502853006124, "num_tokens": 1331208.0, "step": 160 }, { "entropy": 0.881209403835237, "epoch": 0.068, "grad_norm": 98.5, "learning_rate": 3.3800000000000007e-06, "loss": 13.694, "mean_token_accuracy": 0.7777196705341339, "num_tokens": 1416841.0, "step": 170 }, { "entropy": 0.8587444640696049, "epoch": 0.072, "grad_norm": 109.0, "learning_rate": 3.58e-06, "loss": 13.3488, "mean_token_accuracy": 0.7776382105425, "num_tokens": 1499449.0, "step": 180 }, { "entropy": 0.8419531056657433, "epoch": 0.076, "grad_norm": 89.5, "learning_rate": 3.7800000000000002e-06, "loss": 13.184, "mean_token_accuracy": 0.7732796184718609, "num_tokens": 1585982.0, "step": 190 }, { "entropy": 0.8316561704501509, "epoch": 0.08, "grad_norm": 83.0, "learning_rate": 3.980000000000001e-06, "loss": 13.2192, "mean_token_accuracy": 0.7721541322767734, "num_tokens": 1667532.0, "step": 200 }, { "epoch": 0.08, "eval_biology_entropy": 1.3391264139413834, "eval_biology_loss": 1.495794653892517, "eval_biology_mean_token_accuracy": 0.6435496712327003, "eval_biology_num_tokens": 1667532.0, "eval_biology_runtime": 35.2419, "eval_biology_samples_per_second": 14.188, "eval_biology_steps_per_second": 14.188, "step": 200 }, { "epoch": 0.08, "eval_chemistry_entropy": 0.9186333262324333, "eval_chemistry_loss": 0.9441949725151062, "eval_chemistry_mean_token_accuracy": 0.749591316819191, "eval_chemistry_num_tokens": 1667532.0, "eval_chemistry_runtime": 39.611, "eval_chemistry_samples_per_second": 12.623, "eval_chemistry_steps_per_second": 12.623, "step": 200 }, { "epoch": 0.08, "eval_math_entropy": 0.7541183361709118, "eval_math_loss": 0.8956651091575623, "eval_math_mean_token_accuracy": 0.7658155815601349, "eval_math_num_tokens": 1667532.0, "eval_math_runtime": 40.4594, "eval_math_samples_per_second": 12.358, "eval_math_steps_per_second": 12.358, "step": 200 }, { "epoch": 0.08, "eval_physics_entropy": 0.8000644188523293, "eval_physics_loss": 0.8156371116638184, "eval_physics_mean_token_accuracy": 0.7777305005192756, "eval_physics_num_tokens": 1667532.0, "eval_physics_runtime": 45.1972, "eval_physics_samples_per_second": 11.063, "eval_physics_steps_per_second": 11.063, "step": 200 }, { "entropy": 0.7765617506578565, "epoch": 0.084, "grad_norm": 79.0, "learning_rate": 4.18e-06, "loss": 12.5271, "mean_token_accuracy": 0.7859760526567697, "num_tokens": 1750813.0, "step": 210 }, { "entropy": 0.8027046866714954, "epoch": 0.088, "grad_norm": 82.0, "learning_rate": 4.38e-06, "loss": 12.8324, "mean_token_accuracy": 0.7811258573085069, "num_tokens": 1835489.0, "step": 220 }, { "entropy": 0.8210387529805303, "epoch": 0.092, "grad_norm": 93.0, "learning_rate": 4.58e-06, "loss": 13.2166, "mean_token_accuracy": 0.7749050311744213, "num_tokens": 1915115.0, "step": 230 }, { "entropy": 0.7716038260608912, "epoch": 0.096, "grad_norm": 106.5, "learning_rate": 4.78e-06, "loss": 12.3448, "mean_token_accuracy": 0.7872315399348736, "num_tokens": 2007176.0, "step": 240 }, { "entropy": 0.7926006538793444, "epoch": 0.1, "grad_norm": 73.0, "learning_rate": 4.980000000000001e-06, "loss": 12.647, "mean_token_accuracy": 0.7829932551831007, "num_tokens": 2088696.0, "step": 250 }, { "entropy": 0.8025551496073604, "epoch": 0.104, "grad_norm": 76.5, "learning_rate": 5.18e-06, "loss": 12.8826, "mean_token_accuracy": 0.7798444323241711, "num_tokens": 2171335.0, "step": 260 }, { "entropy": 0.7691786218434572, "epoch": 0.108, "grad_norm": 90.5, "learning_rate": 5.380000000000001e-06, "loss": 12.3875, "mean_token_accuracy": 0.7863950747996569, "num_tokens": 2254168.0, "step": 270 }, { "entropy": 0.7568587090820074, "epoch": 0.112, "grad_norm": 102.5, "learning_rate": 5.580000000000001e-06, "loss": 12.0152, "mean_token_accuracy": 0.7904921755194664, "num_tokens": 2342259.0, "step": 280 }, { "entropy": 0.7691523030400276, "epoch": 0.116, "grad_norm": 81.0, "learning_rate": 5.78e-06, "loss": 12.3692, "mean_token_accuracy": 0.7872932553291321, "num_tokens": 2426289.0, "step": 290 }, { "entropy": 0.8102743584662676, "epoch": 0.12, "grad_norm": 72.5, "learning_rate": 5.98e-06, "loss": 12.869, "mean_token_accuracy": 0.7774799339473247, "num_tokens": 2504279.0, "step": 300 }, { "epoch": 0.12, "eval_biology_entropy": 1.2733513461351396, "eval_biology_loss": 1.4364651441574097, "eval_biology_mean_token_accuracy": 0.6550095283985138, "eval_biology_num_tokens": 2504279.0, "eval_biology_runtime": 35.2012, "eval_biology_samples_per_second": 14.204, "eval_biology_steps_per_second": 14.204, "step": 300 }, { "epoch": 0.12, "eval_chemistry_entropy": 0.8685060097575188, "eval_chemistry_loss": 0.891410768032074, "eval_chemistry_mean_token_accuracy": 0.7606868423819542, "eval_chemistry_num_tokens": 2504279.0, "eval_chemistry_runtime": 39.6393, "eval_chemistry_samples_per_second": 12.614, "eval_chemistry_steps_per_second": 12.614, "step": 300 }, { "epoch": 0.12, "eval_math_entropy": 0.7212547466754914, "eval_math_loss": 0.8686274886131287, "eval_math_mean_token_accuracy": 0.7728212839365005, "eval_math_num_tokens": 2504279.0, "eval_math_runtime": 40.496, "eval_math_samples_per_second": 12.347, "eval_math_steps_per_second": 12.347, "step": 300 }, { "epoch": 0.12, "eval_physics_entropy": 0.7448382738828659, "eval_physics_loss": 0.75321364402771, "eval_physics_mean_token_accuracy": 0.7901370082497596, "eval_physics_num_tokens": 2504279.0, "eval_physics_runtime": 45.1204, "eval_physics_samples_per_second": 11.081, "eval_physics_steps_per_second": 11.081, "step": 300 }, { "entropy": 0.785609095916152, "epoch": 0.124, "grad_norm": 72.5, "learning_rate": 6.18e-06, "loss": 12.587, "mean_token_accuracy": 0.7802519340068101, "num_tokens": 2583599.0, "step": 310 }, { "entropy": 0.7176813660189509, "epoch": 0.128, "grad_norm": 68.0, "learning_rate": 6.380000000000001e-06, "loss": 11.598, "mean_token_accuracy": 0.7966837007552385, "num_tokens": 2663987.0, "step": 320 }, { "entropy": 0.7790673697367311, "epoch": 0.132, "grad_norm": 80.0, "learning_rate": 6.5800000000000005e-06, "loss": 12.5265, "mean_token_accuracy": 0.7850641790777445, "num_tokens": 2743933.0, "step": 330 }, { "entropy": 0.7665272958576679, "epoch": 0.136, "grad_norm": 58.0, "learning_rate": 6.780000000000001e-06, "loss": 12.1612, "mean_token_accuracy": 0.792558753862977, "num_tokens": 2829720.0, "step": 340 }, { "entropy": 0.7513320775702595, "epoch": 0.14, "grad_norm": 63.5, "learning_rate": 6.98e-06, "loss": 12.1271, "mean_token_accuracy": 0.7889207687228919, "num_tokens": 2915814.0, "step": 350 }, { "entropy": 0.7429860392585397, "epoch": 0.144, "grad_norm": 93.0, "learning_rate": 7.180000000000001e-06, "loss": 11.8571, "mean_token_accuracy": 0.7946608085185289, "num_tokens": 2997824.0, "step": 360 }, { "entropy": 0.7470190897583961, "epoch": 0.148, "grad_norm": 78.0, "learning_rate": 7.3800000000000005e-06, "loss": 12.0544, "mean_token_accuracy": 0.7908409368246794, "num_tokens": 3076422.0, "step": 370 }, { "entropy": 0.705881585367024, "epoch": 0.152, "grad_norm": 67.0, "learning_rate": 7.58e-06, "loss": 11.3247, "mean_token_accuracy": 0.799928180873394, "num_tokens": 3159946.0, "step": 380 }, { "entropy": 0.7021555813029409, "epoch": 0.156, "grad_norm": 71.0, "learning_rate": 7.78e-06, "loss": 11.1229, "mean_token_accuracy": 0.8013802103698253, "num_tokens": 3247218.0, "step": 390 }, { "entropy": 0.7140809871256352, "epoch": 0.16, "grad_norm": 70.5, "learning_rate": 7.980000000000002e-06, "loss": 11.5837, "mean_token_accuracy": 0.7982432089745999, "num_tokens": 3330597.0, "step": 400 }, { "epoch": 0.16, "eval_biology_entropy": 1.2522204325199127, "eval_biology_loss": 1.4104059934616089, "eval_biology_mean_token_accuracy": 0.6601863117814064, "eval_biology_num_tokens": 3330597.0, "eval_biology_runtime": 35.3004, "eval_biology_samples_per_second": 14.164, "eval_biology_steps_per_second": 14.164, "step": 400 }, { "epoch": 0.16, "eval_chemistry_entropy": 0.8451869760751725, "eval_chemistry_loss": 0.8661983609199524, "eval_chemistry_mean_token_accuracy": 0.7661204301714897, "eval_chemistry_num_tokens": 3330597.0, "eval_chemistry_runtime": 39.7099, "eval_chemistry_samples_per_second": 12.591, "eval_chemistry_steps_per_second": 12.591, "step": 400 }, { "epoch": 0.16, "eval_math_entropy": 0.7037465732693672, "eval_math_loss": 0.8592318296432495, "eval_math_mean_token_accuracy": 0.7752037541866302, "eval_math_num_tokens": 3330597.0, "eval_math_runtime": 40.5913, "eval_math_samples_per_second": 12.318, "eval_math_steps_per_second": 12.318, "step": 400 }, { "epoch": 0.16, "eval_physics_entropy": 0.7166930376887322, "eval_physics_loss": 0.7220315933227539, "eval_physics_mean_token_accuracy": 0.797553953409195, "eval_physics_num_tokens": 3330597.0, "eval_physics_runtime": 45.1326, "eval_physics_samples_per_second": 11.078, "eval_physics_steps_per_second": 11.078, "step": 400 }, { "entropy": 0.7665172951295972, "epoch": 0.164, "grad_norm": 76.0, "learning_rate": 8.18e-06, "loss": 12.0954, "mean_token_accuracy": 0.7892688918858767, "num_tokens": 3414445.0, "step": 410 }, { "entropy": 0.7116625974886119, "epoch": 0.168, "grad_norm": 82.0, "learning_rate": 8.380000000000001e-06, "loss": 11.522, "mean_token_accuracy": 0.7954209111630917, "num_tokens": 3496104.0, "step": 420 }, { "entropy": 0.7325116093270481, "epoch": 0.172, "grad_norm": 63.75, "learning_rate": 8.580000000000001e-06, "loss": 11.8769, "mean_token_accuracy": 0.7929634388536215, "num_tokens": 3576979.0, "step": 430 }, { "entropy": 0.713999939430505, "epoch": 0.176, "grad_norm": 87.5, "learning_rate": 8.78e-06, "loss": 11.4054, "mean_token_accuracy": 0.7975817665457725, "num_tokens": 3658264.0, "step": 440 }, { "entropy": 0.7327057829126715, "epoch": 0.18, "grad_norm": 80.5, "learning_rate": 8.98e-06, "loss": 11.7006, "mean_token_accuracy": 0.7944537442177534, "num_tokens": 3739142.0, "step": 450 }, { "entropy": 0.7196770435199141, "epoch": 0.184, "grad_norm": 72.5, "learning_rate": 9.180000000000002e-06, "loss": 11.5975, "mean_token_accuracy": 0.7943054404109716, "num_tokens": 3821352.0, "step": 460 }, { "entropy": 0.6925434624776244, "epoch": 0.188, "grad_norm": 65.0, "learning_rate": 9.38e-06, "loss": 11.1089, "mean_token_accuracy": 0.8022922430187464, "num_tokens": 3905291.0, "step": 470 }, { "entropy": 0.6977481247857213, "epoch": 0.192, "grad_norm": 59.5, "learning_rate": 9.58e-06, "loss": 11.1687, "mean_token_accuracy": 0.803613081201911, "num_tokens": 3995568.0, "step": 480 }, { "entropy": 0.7005923493765295, "epoch": 0.196, "grad_norm": 73.5, "learning_rate": 9.780000000000001e-06, "loss": 11.3477, "mean_token_accuracy": 0.8010011687874794, "num_tokens": 4075670.0, "step": 490 }, { "entropy": 0.6876647426746786, "epoch": 0.2, "grad_norm": 70.0, "learning_rate": 9.980000000000001e-06, "loss": 10.9642, "mean_token_accuracy": 0.8049134809523821, "num_tokens": 4160918.0, "step": 500 }, { "epoch": 0.2, "eval_biology_entropy": 1.2019921021461486, "eval_biology_loss": 1.389614462852478, "eval_biology_mean_token_accuracy": 0.6654457822442055, "eval_biology_num_tokens": 4160918.0, "eval_biology_runtime": 35.0884, "eval_biology_samples_per_second": 14.25, "eval_biology_steps_per_second": 14.25, "step": 500 }, { "epoch": 0.2, "eval_chemistry_entropy": 0.8132011856138707, "eval_chemistry_loss": 0.8497000932693481, "eval_chemistry_mean_token_accuracy": 0.7708191834092141, "eval_chemistry_num_tokens": 4160918.0, "eval_chemistry_runtime": 39.8091, "eval_chemistry_samples_per_second": 12.56, "eval_chemistry_steps_per_second": 12.56, "step": 500 }, { "epoch": 0.2, "eval_math_entropy": 0.6891986537277699, "eval_math_loss": 0.8553754687309265, "eval_math_mean_token_accuracy": 0.7759535777568817, "eval_math_num_tokens": 4160918.0, "eval_math_runtime": 40.7076, "eval_math_samples_per_second": 12.283, "eval_math_steps_per_second": 12.283, "step": 500 }, { "epoch": 0.2, "eval_physics_entropy": 0.6950600004792213, "eval_physics_loss": 0.7005596160888672, "eval_physics_mean_token_accuracy": 0.8021351820230485, "eval_physics_num_tokens": 4160918.0, "eval_physics_runtime": 45.2305, "eval_physics_samples_per_second": 11.054, "eval_physics_steps_per_second": 11.054, "step": 500 }, { "entropy": 0.6887986609712243, "epoch": 0.204, "grad_norm": 61.75, "learning_rate": 1.018e-05, "loss": 11.1871, "mean_token_accuracy": 0.8051418982446193, "num_tokens": 4238810.0, "step": 510 }, { "entropy": 0.6818487482145429, "epoch": 0.208, "grad_norm": 64.0, "learning_rate": 1.038e-05, "loss": 10.9951, "mean_token_accuracy": 0.8058457836508751, "num_tokens": 4321436.0, "step": 520 }, { "entropy": 0.7008362987078727, "epoch": 0.212, "grad_norm": 67.5, "learning_rate": 1.0580000000000002e-05, "loss": 11.0955, "mean_token_accuracy": 0.8010448336601257, "num_tokens": 4401710.0, "step": 530 }, { "entropy": 0.7055313820019364, "epoch": 0.216, "grad_norm": 62.25, "learning_rate": 1.0780000000000002e-05, "loss": 11.4932, "mean_token_accuracy": 0.79723966345191, "num_tokens": 4482225.0, "step": 540 }, { "entropy": 0.6803635071963072, "epoch": 0.22, "grad_norm": 68.5, "learning_rate": 1.0980000000000002e-05, "loss": 11.0034, "mean_token_accuracy": 0.804273284226656, "num_tokens": 4564662.0, "step": 550 }, { "entropy": 0.667565680295229, "epoch": 0.224, "grad_norm": 52.0, "learning_rate": 1.1180000000000001e-05, "loss": 10.6657, "mean_token_accuracy": 0.8107417456805706, "num_tokens": 4648491.0, "step": 560 }, { "entropy": 0.6490520000457763, "epoch": 0.228, "grad_norm": 56.5, "learning_rate": 1.138e-05, "loss": 10.5546, "mean_token_accuracy": 0.8112937267869711, "num_tokens": 4731596.0, "step": 570 }, { "entropy": 0.7194359874352813, "epoch": 0.232, "grad_norm": 58.25, "learning_rate": 1.1580000000000001e-05, "loss": 11.4255, "mean_token_accuracy": 0.7990602564066649, "num_tokens": 4813449.0, "step": 580 }, { "entropy": 0.6862584982067347, "epoch": 0.236, "grad_norm": 50.25, "learning_rate": 1.178e-05, "loss": 11.0007, "mean_token_accuracy": 0.8042842660099268, "num_tokens": 4899144.0, "step": 590 }, { "entropy": 0.6895454197190702, "epoch": 0.24, "grad_norm": 58.5, "learning_rate": 1.198e-05, "loss": 11.1171, "mean_token_accuracy": 0.8028610210865736, "num_tokens": 4986175.0, "step": 600 }, { "epoch": 0.24, "eval_biology_entropy": 1.2410894528031349, "eval_biology_loss": 1.3766074180603027, "eval_biology_mean_token_accuracy": 0.6672532483935356, "eval_biology_num_tokens": 4986175.0, "eval_biology_runtime": 35.1026, "eval_biology_samples_per_second": 14.244, "eval_biology_steps_per_second": 14.244, "step": 600 }, { "epoch": 0.24, "eval_chemistry_entropy": 0.8104133826196194, "eval_chemistry_loss": 0.8397422432899475, "eval_chemistry_mean_token_accuracy": 0.7722517918944359, "eval_chemistry_num_tokens": 4986175.0, "eval_chemistry_runtime": 39.6397, "eval_chemistry_samples_per_second": 12.614, "eval_chemistry_steps_per_second": 12.614, "step": 600 }, { "epoch": 0.24, "eval_math_entropy": 0.6969810444712639, "eval_math_loss": 0.8505039215087891, "eval_math_mean_token_accuracy": 0.7772268445491791, "eval_math_num_tokens": 4986175.0, "eval_math_runtime": 40.4861, "eval_math_samples_per_second": 12.35, "eval_math_steps_per_second": 12.35, "step": 600 }, { "epoch": 0.24, "eval_physics_entropy": 0.6853992799520493, "eval_physics_loss": 0.6912311911582947, "eval_physics_mean_token_accuracy": 0.8048652278184891, "eval_physics_num_tokens": 4986175.0, "eval_physics_runtime": 45.4233, "eval_physics_samples_per_second": 11.008, "eval_physics_steps_per_second": 11.008, "step": 600 }, { "entropy": 0.6969290849752724, "epoch": 0.244, "grad_norm": 65.0, "learning_rate": 1.218e-05, "loss": 11.151, "mean_token_accuracy": 0.8016246553510428, "num_tokens": 5071994.0, "step": 610 }, { "entropy": 0.6583109201863409, "epoch": 0.248, "grad_norm": 57.5, "learning_rate": 1.2380000000000002e-05, "loss": 10.5979, "mean_token_accuracy": 0.8095750134438277, "num_tokens": 5164741.0, "step": 620 }, { "entropy": 0.7111584974452854, "epoch": 0.252, "grad_norm": 81.5, "learning_rate": 1.2580000000000002e-05, "loss": 11.3705, "mean_token_accuracy": 0.7996403712779283, "num_tokens": 5245878.0, "step": 630 }, { "entropy": 0.6798671390861273, "epoch": 0.256, "grad_norm": 65.5, "learning_rate": 1.2780000000000001e-05, "loss": 11.0375, "mean_token_accuracy": 0.802841005846858, "num_tokens": 5329320.0, "step": 640 }, { "entropy": 0.6868512833490967, "epoch": 0.26, "grad_norm": 55.0, "learning_rate": 1.2980000000000001e-05, "loss": 10.9356, "mean_token_accuracy": 0.8048643987625838, "num_tokens": 5412557.0, "step": 650 }, { "entropy": 0.660436293296516, "epoch": 0.264, "grad_norm": 65.5, "learning_rate": 1.3180000000000001e-05, "loss": 10.8415, "mean_token_accuracy": 0.8079600531607867, "num_tokens": 5497259.0, "step": 660 }, { "entropy": 0.7050647180527448, "epoch": 0.268, "grad_norm": 60.5, "learning_rate": 1.3380000000000002e-05, "loss": 11.106, "mean_token_accuracy": 0.8046324852854013, "num_tokens": 5576721.0, "step": 670 }, { "entropy": 0.7028336159884929, "epoch": 0.272, "grad_norm": 61.25, "learning_rate": 1.3580000000000002e-05, "loss": 11.3741, "mean_token_accuracy": 0.7989816222339868, "num_tokens": 5658796.0, "step": 680 }, { "entropy": 0.713030344620347, "epoch": 0.276, "grad_norm": 62.0, "learning_rate": 1.378e-05, "loss": 11.4372, "mean_token_accuracy": 0.797571299597621, "num_tokens": 5737499.0, "step": 690 }, { "entropy": 0.6879976620897651, "epoch": 0.28, "grad_norm": 78.5, "learning_rate": 1.398e-05, "loss": 11.1153, "mean_token_accuracy": 0.8056350216269493, "num_tokens": 5819027.0, "step": 700 }, { "epoch": 0.28, "eval_biology_entropy": 1.2951917058229447, "eval_biology_loss": 1.374377727508545, "eval_biology_mean_token_accuracy": 0.6666482761502266, "eval_biology_num_tokens": 5819027.0, "eval_biology_runtime": 35.3803, "eval_biology_samples_per_second": 14.132, "eval_biology_steps_per_second": 14.132, "step": 700 }, { "epoch": 0.28, "eval_chemistry_entropy": 0.8508687050342559, "eval_chemistry_loss": 0.8420527577400208, "eval_chemistry_mean_token_accuracy": 0.7695601610541344, "eval_chemistry_num_tokens": 5819027.0, "eval_chemistry_runtime": 39.6948, "eval_chemistry_samples_per_second": 12.596, "eval_chemistry_steps_per_second": 12.596, "step": 700 }, { "epoch": 0.28, "eval_math_entropy": 0.7296031129956245, "eval_math_loss": 0.8581300377845764, "eval_math_mean_token_accuracy": 0.7747315044403076, "eval_math_num_tokens": 5819027.0, "eval_math_runtime": 40.6002, "eval_math_samples_per_second": 12.315, "eval_math_steps_per_second": 12.315, "step": 700 }, { "epoch": 0.28, "eval_physics_entropy": 0.7137228677272797, "eval_physics_loss": 0.6899552941322327, "eval_physics_mean_token_accuracy": 0.804570896744728, "eval_physics_num_tokens": 5819027.0, "eval_physics_runtime": 45.1753, "eval_physics_samples_per_second": 11.068, "eval_physics_steps_per_second": 11.068, "step": 700 }, { "entropy": 0.7131443534046411, "epoch": 0.284, "grad_norm": 76.5, "learning_rate": 1.418e-05, "loss": 11.5042, "mean_token_accuracy": 0.7973460745066404, "num_tokens": 5898693.0, "step": 710 }, { "entropy": 0.6842026851139963, "epoch": 0.288, "grad_norm": 58.5, "learning_rate": 1.4380000000000001e-05, "loss": 10.9569, "mean_token_accuracy": 0.8062832679599523, "num_tokens": 5980201.0, "step": 720 }, { "entropy": 0.6580634312704206, "epoch": 0.292, "grad_norm": 58.0, "learning_rate": 1.4580000000000001e-05, "loss": 10.6812, "mean_token_accuracy": 0.8109602797776461, "num_tokens": 6059507.0, "step": 730 }, { "entropy": 0.6885316792875529, "epoch": 0.296, "grad_norm": 59.0, "learning_rate": 1.478e-05, "loss": 11.1324, "mean_token_accuracy": 0.8033004485070705, "num_tokens": 6139830.0, "step": 740 }, { "entropy": 0.6904184087179601, "epoch": 0.3, "grad_norm": 54.0, "learning_rate": 1.498e-05, "loss": 10.9676, "mean_token_accuracy": 0.8058249101042747, "num_tokens": 6218134.0, "step": 750 }, { "entropy": 0.693754238076508, "epoch": 0.304, "grad_norm": 59.5, "learning_rate": 1.5180000000000002e-05, "loss": 11.1453, "mean_token_accuracy": 0.804497017711401, "num_tokens": 6298858.0, "step": 760 }, { "entropy": 0.7230889040045441, "epoch": 0.308, "grad_norm": 66.5, "learning_rate": 1.5380000000000002e-05, "loss": 11.775, "mean_token_accuracy": 0.7913905128836631, "num_tokens": 6375761.0, "step": 770 }, { "entropy": 0.6630848026834428, "epoch": 0.312, "grad_norm": 50.0, "learning_rate": 1.5580000000000003e-05, "loss": 10.6675, "mean_token_accuracy": 0.8106919646263122, "num_tokens": 6456502.0, "step": 780 }, { "entropy": 0.6884183536283672, "epoch": 0.316, "grad_norm": 65.5, "learning_rate": 1.578e-05, "loss": 11.2036, "mean_token_accuracy": 0.8040056221187115, "num_tokens": 6537521.0, "step": 790 }, { "entropy": 0.7094144577160477, "epoch": 0.32, "grad_norm": 51.75, "learning_rate": 1.5980000000000003e-05, "loss": 11.3893, "mean_token_accuracy": 0.7988939873874188, "num_tokens": 6622798.0, "step": 800 }, { "epoch": 0.32, "eval_biology_entropy": 1.272662544965744, "eval_biology_loss": 1.3795855045318604, "eval_biology_mean_token_accuracy": 0.6654499787688255, "eval_biology_num_tokens": 6622798.0, "eval_biology_runtime": 35.1835, "eval_biology_samples_per_second": 14.211, "eval_biology_steps_per_second": 14.211, "step": 800 }, { "epoch": 0.32, "eval_chemistry_entropy": 0.8212787560224533, "eval_chemistry_loss": 0.8387787938117981, "eval_chemistry_mean_token_accuracy": 0.7721875888109208, "eval_chemistry_num_tokens": 6622798.0, "eval_chemistry_runtime": 39.5827, "eval_chemistry_samples_per_second": 12.632, "eval_chemistry_steps_per_second": 12.632, "step": 800 }, { "epoch": 0.32, "eval_math_entropy": 0.6975619449019432, "eval_math_loss": 0.8566721081733704, "eval_math_mean_token_accuracy": 0.7762556113004685, "eval_math_num_tokens": 6622798.0, "eval_math_runtime": 40.4554, "eval_math_samples_per_second": 12.359, "eval_math_steps_per_second": 12.359, "step": 800 }, { "epoch": 0.32, "eval_physics_entropy": 0.6843295809030533, "eval_physics_loss": 0.6862691640853882, "eval_physics_mean_token_accuracy": 0.8047665995955467, "eval_physics_num_tokens": 6622798.0, "eval_physics_runtime": 45.1382, "eval_physics_samples_per_second": 11.077, "eval_physics_steps_per_second": 11.077, "step": 800 }, { "entropy": 0.6872260123491287, "epoch": 0.324, "grad_norm": 61.75, "learning_rate": 1.618e-05, "loss": 11.1453, "mean_token_accuracy": 0.8037573281675577, "num_tokens": 6699373.0, "step": 810 }, { "entropy": 0.7004622181877493, "epoch": 0.328, "grad_norm": 53.0, "learning_rate": 1.638e-05, "loss": 11.2569, "mean_token_accuracy": 0.8005565378814936, "num_tokens": 6782985.0, "step": 820 }, { "entropy": 0.683347325026989, "epoch": 0.332, "grad_norm": 67.0, "learning_rate": 1.658e-05, "loss": 10.9209, "mean_token_accuracy": 0.8052662838250398, "num_tokens": 6868578.0, "step": 830 }, { "entropy": 0.6550024008378387, "epoch": 0.336, "grad_norm": 49.0, "learning_rate": 1.6780000000000002e-05, "loss": 10.4463, "mean_token_accuracy": 0.8156119327992201, "num_tokens": 6953611.0, "step": 840 }, { "entropy": 0.6940796528942883, "epoch": 0.34, "grad_norm": 60.75, "learning_rate": 1.698e-05, "loss": 11.4328, "mean_token_accuracy": 0.7994465328752994, "num_tokens": 7037831.0, "step": 850 }, { "entropy": 0.6509441762231291, "epoch": 0.344, "grad_norm": 68.0, "learning_rate": 1.718e-05, "loss": 10.5903, "mean_token_accuracy": 0.8100398641079665, "num_tokens": 7121949.0, "step": 860 }, { "entropy": 0.68049411540851, "epoch": 0.348, "grad_norm": 50.75, "learning_rate": 1.7380000000000003e-05, "loss": 10.7995, "mean_token_accuracy": 0.8092381667345763, "num_tokens": 7206056.0, "step": 870 }, { "entropy": 0.6681747000664473, "epoch": 0.352, "grad_norm": 59.0, "learning_rate": 1.758e-05, "loss": 10.7902, "mean_token_accuracy": 0.8085832469165325, "num_tokens": 7290378.0, "step": 880 }, { "entropy": 0.6910556216724217, "epoch": 0.356, "grad_norm": 46.5, "learning_rate": 1.7780000000000003e-05, "loss": 11.072, "mean_token_accuracy": 0.8025941159576178, "num_tokens": 7367952.0, "step": 890 }, { "entropy": 0.6846336699090898, "epoch": 0.36, "grad_norm": 69.0, "learning_rate": 1.798e-05, "loss": 10.9941, "mean_token_accuracy": 0.8069840248674154, "num_tokens": 7452920.0, "step": 900 }, { "epoch": 0.36, "eval_biology_entropy": 1.3079615205526351, "eval_biology_loss": 1.3845115900039673, "eval_biology_mean_token_accuracy": 0.6659645406007767, "eval_biology_num_tokens": 7452920.0, "eval_biology_runtime": 35.2058, "eval_biology_samples_per_second": 14.202, "eval_biology_steps_per_second": 14.202, "step": 900 }, { "epoch": 0.36, "eval_chemistry_entropy": 0.855868073105812, "eval_chemistry_loss": 0.8524179458618164, "eval_chemistry_mean_token_accuracy": 0.7686776904463768, "eval_chemistry_num_tokens": 7452920.0, "eval_chemistry_runtime": 39.6971, "eval_chemistry_samples_per_second": 12.595, "eval_chemistry_steps_per_second": 12.595, "step": 900 }, { "epoch": 0.36, "eval_math_entropy": 0.7238874210417271, "eval_math_loss": 0.8637869954109192, "eval_math_mean_token_accuracy": 0.774171804189682, "eval_math_num_tokens": 7452920.0, "eval_math_runtime": 40.502, "eval_math_samples_per_second": 12.345, "eval_math_steps_per_second": 12.345, "step": 900 }, { "epoch": 0.36, "eval_physics_entropy": 0.703471056163311, "eval_physics_loss": 0.6876726150512695, "eval_physics_mean_token_accuracy": 0.8049070924520493, "eval_physics_num_tokens": 7452920.0, "eval_physics_runtime": 45.1347, "eval_physics_samples_per_second": 11.078, "eval_physics_steps_per_second": 11.078, "step": 900 }, { "entropy": 0.6624960891902447, "epoch": 0.364, "grad_norm": 50.25, "learning_rate": 1.8180000000000002e-05, "loss": 10.6947, "mean_token_accuracy": 0.8089043792337179, "num_tokens": 7535961.0, "step": 910 }, { "entropy": 0.6663541577756404, "epoch": 0.368, "grad_norm": 51.0, "learning_rate": 1.8380000000000004e-05, "loss": 10.5684, "mean_token_accuracy": 0.810727647319436, "num_tokens": 7621082.0, "step": 920 }, { "entropy": 0.6552547904662788, "epoch": 0.372, "grad_norm": 51.5, "learning_rate": 1.858e-05, "loss": 10.6444, "mean_token_accuracy": 0.8090339493006468, "num_tokens": 7705651.0, "step": 930 }, { "entropy": 0.7489686900749802, "epoch": 0.376, "grad_norm": 51.5, "learning_rate": 1.878e-05, "loss": 12.0319, "mean_token_accuracy": 0.7858695086091757, "num_tokens": 7787036.0, "step": 940 }, { "entropy": 0.6922270046547055, "epoch": 0.38, "grad_norm": 63.0, "learning_rate": 1.898e-05, "loss": 11.1499, "mean_token_accuracy": 0.7991227209568024, "num_tokens": 7869437.0, "step": 950 }, { "entropy": 0.6693172385916114, "epoch": 0.384, "grad_norm": 45.75, "learning_rate": 1.918e-05, "loss": 10.7571, "mean_token_accuracy": 0.8078656267374754, "num_tokens": 7955570.0, "step": 960 }, { "entropy": 0.6446146180853247, "epoch": 0.388, "grad_norm": 43.5, "learning_rate": 1.938e-05, "loss": 10.4611, "mean_token_accuracy": 0.8115016505122185, "num_tokens": 8038818.0, "step": 970 }, { "entropy": 0.6791700626723468, "epoch": 0.392, "grad_norm": 45.0, "learning_rate": 1.9580000000000002e-05, "loss": 10.8502, "mean_token_accuracy": 0.8048314619809389, "num_tokens": 8123527.0, "step": 980 }, { "entropy": 0.684117188770324, "epoch": 0.396, "grad_norm": 48.75, "learning_rate": 1.978e-05, "loss": 10.9286, "mean_token_accuracy": 0.8045736275613308, "num_tokens": 8206501.0, "step": 990 }, { "entropy": 0.6638183539733291, "epoch": 0.4, "grad_norm": 45.25, "learning_rate": 1.9980000000000002e-05, "loss": 10.7878, "mean_token_accuracy": 0.8090838421136141, "num_tokens": 8291049.0, "step": 1000 }, { "epoch": 0.4, "eval_biology_entropy": 1.3014859555363656, "eval_biology_loss": 1.4013315439224243, "eval_biology_mean_token_accuracy": 0.6621385813355446, "eval_biology_num_tokens": 8291049.0, "eval_biology_runtime": 35.3423, "eval_biology_samples_per_second": 14.147, "eval_biology_steps_per_second": 14.147, "step": 1000 }, { "epoch": 0.4, "eval_chemistry_entropy": 0.8515431408882141, "eval_chemistry_loss": 0.8530916571617126, "eval_chemistry_mean_token_accuracy": 0.7688064026236534, "eval_chemistry_num_tokens": 8291049.0, "eval_chemistry_runtime": 39.7131, "eval_chemistry_samples_per_second": 12.59, "eval_chemistry_steps_per_second": 12.59, "step": 1000 }, { "epoch": 0.4, "eval_math_entropy": 0.7349339348077774, "eval_math_loss": 0.8596640825271606, "eval_math_mean_token_accuracy": 0.7750666145086288, "eval_math_num_tokens": 8291049.0, "eval_math_runtime": 40.5711, "eval_math_samples_per_second": 12.324, "eval_math_steps_per_second": 12.324, "step": 1000 }, { "epoch": 0.4, "eval_physics_entropy": 0.698657899916172, "eval_physics_loss": 0.690846860408783, "eval_physics_mean_token_accuracy": 0.8048570597171784, "eval_physics_num_tokens": 8291049.0, "eval_physics_runtime": 45.18, "eval_physics_samples_per_second": 11.067, "eval_physics_steps_per_second": 11.067, "step": 1000 }, { "entropy": 0.6724585631862283, "epoch": 0.404, "grad_norm": 47.0, "learning_rate": 1.9980000000000002e-05, "loss": 10.7513, "mean_token_accuracy": 0.8090190943330526, "num_tokens": 8373687.0, "step": 1010 }, { "entropy": 0.6880609506741167, "epoch": 0.408, "grad_norm": 49.0, "learning_rate": 1.995777777777778e-05, "loss": 11.0507, "mean_token_accuracy": 0.8037724006921053, "num_tokens": 8459795.0, "step": 1020 }, { "entropy": 0.6609848150983453, "epoch": 0.412, "grad_norm": 50.75, "learning_rate": 1.9935555555555557e-05, "loss": 10.803, "mean_token_accuracy": 0.8078120674937963, "num_tokens": 8544948.0, "step": 1030 }, { "entropy": 0.6650119775906205, "epoch": 0.416, "grad_norm": 49.25, "learning_rate": 1.9913333333333335e-05, "loss": 10.794, "mean_token_accuracy": 0.8062687937170268, "num_tokens": 8627310.0, "step": 1040 }, { "entropy": 0.7647646913304925, "epoch": 0.42, "grad_norm": 50.25, "learning_rate": 1.9891111111111112e-05, "loss": 12.2348, "mean_token_accuracy": 0.7883469216525555, "num_tokens": 8707798.0, "step": 1050 }, { "entropy": 0.6614232028834521, "epoch": 0.424, "grad_norm": 46.0, "learning_rate": 1.986888888888889e-05, "loss": 10.6404, "mean_token_accuracy": 0.8107621461153031, "num_tokens": 8792574.0, "step": 1060 }, { "entropy": 0.6635355862788856, "epoch": 0.428, "grad_norm": 54.25, "learning_rate": 1.9846666666666668e-05, "loss": 10.7408, "mean_token_accuracy": 0.8067193511873484, "num_tokens": 8878637.0, "step": 1070 }, { "entropy": 0.6831195020116866, "epoch": 0.432, "grad_norm": 56.75, "learning_rate": 1.9824444444444445e-05, "loss": 10.9817, "mean_token_accuracy": 0.8036571219563484, "num_tokens": 8958371.0, "step": 1080 }, { "entropy": 0.6557174487970769, "epoch": 0.436, "grad_norm": 56.25, "learning_rate": 1.9802222222222226e-05, "loss": 10.5087, "mean_token_accuracy": 0.8101301193237305, "num_tokens": 9040649.0, "step": 1090 }, { "entropy": 0.6296142281033099, "epoch": 0.44, "grad_norm": 46.25, "learning_rate": 1.978e-05, "loss": 10.1617, "mean_token_accuracy": 0.8164261173456907, "num_tokens": 9125564.0, "step": 1100 }, { "epoch": 0.44, "eval_biology_entropy": 1.2857338542938233, "eval_biology_loss": 1.4026042222976685, "eval_biology_mean_token_accuracy": 0.6611523522734642, "eval_biology_num_tokens": 9125564.0, "eval_biology_runtime": 35.3074, "eval_biology_samples_per_second": 14.161, "eval_biology_steps_per_second": 14.161, "step": 1100 }, { "epoch": 0.44, "eval_chemistry_entropy": 0.8301562021374702, "eval_chemistry_loss": 0.8592454195022583, "eval_chemistry_mean_token_accuracy": 0.7674569791555405, "eval_chemistry_num_tokens": 9125564.0, "eval_chemistry_runtime": 39.7025, "eval_chemistry_samples_per_second": 12.594, "eval_chemistry_steps_per_second": 12.594, "step": 1100 }, { "epoch": 0.44, "eval_math_entropy": 0.704224185705185, "eval_math_loss": 0.8685066103935242, "eval_math_mean_token_accuracy": 0.7726707045435905, "eval_math_num_tokens": 9125564.0, "eval_math_runtime": 40.5031, "eval_math_samples_per_second": 12.345, "eval_math_steps_per_second": 12.345, "step": 1100 }, { "epoch": 0.44, "eval_physics_entropy": 0.676539529800415, "eval_physics_loss": 0.6832760572433472, "eval_physics_mean_token_accuracy": 0.8052293303608894, "eval_physics_num_tokens": 9125564.0, "eval_physics_runtime": 45.1101, "eval_physics_samples_per_second": 11.084, "eval_physics_steps_per_second": 11.084, "step": 1100 }, { "entropy": 0.7054770017042756, "epoch": 0.444, "grad_norm": 47.75, "learning_rate": 1.975777777777778e-05, "loss": 11.3801, "mean_token_accuracy": 0.8028026439249516, "num_tokens": 9210443.0, "step": 1110 }, { "entropy": 0.6552120961248875, "epoch": 0.448, "grad_norm": 42.0, "learning_rate": 1.9735555555555556e-05, "loss": 10.7148, "mean_token_accuracy": 0.8113164242357016, "num_tokens": 9294388.0, "step": 1120 }, { "entropy": 0.6474918988533318, "epoch": 0.452, "grad_norm": 49.0, "learning_rate": 1.9713333333333337e-05, "loss": 10.1143, "mean_token_accuracy": 0.8196653757244349, "num_tokens": 9371203.0, "step": 1130 }, { "entropy": 0.6594600110314787, "epoch": 0.456, "grad_norm": 43.5, "learning_rate": 1.969111111111111e-05, "loss": 10.7859, "mean_token_accuracy": 0.806324940547347, "num_tokens": 9455428.0, "step": 1140 }, { "entropy": 0.6459505766630173, "epoch": 0.46, "grad_norm": 41.25, "learning_rate": 1.9668888888888892e-05, "loss": 10.2758, "mean_token_accuracy": 0.8145925845950842, "num_tokens": 9539605.0, "step": 1150 }, { "entropy": 0.6633252296596766, "epoch": 0.464, "grad_norm": 46.75, "learning_rate": 1.9646666666666666e-05, "loss": 10.746, "mean_token_accuracy": 0.8067185960710048, "num_tokens": 9620366.0, "step": 1160 }, { "entropy": 0.6536956983618438, "epoch": 0.468, "grad_norm": 54.75, "learning_rate": 1.9624444444444447e-05, "loss": 10.5255, "mean_token_accuracy": 0.8116643182933331, "num_tokens": 9706621.0, "step": 1170 }, { "entropy": 0.6528704442083836, "epoch": 0.472, "grad_norm": 42.5, "learning_rate": 1.9602222222222225e-05, "loss": 10.4167, "mean_token_accuracy": 0.8110667478293181, "num_tokens": 9788925.0, "step": 1180 }, { "entropy": 0.6303915046155453, "epoch": 0.476, "grad_norm": 40.75, "learning_rate": 1.9580000000000002e-05, "loss": 10.2211, "mean_token_accuracy": 0.8153577871620655, "num_tokens": 9869908.0, "step": 1190 }, { "entropy": 0.7140287417918444, "epoch": 0.48, "grad_norm": 50.5, "learning_rate": 1.955777777777778e-05, "loss": 11.4393, "mean_token_accuracy": 0.7977926902472973, "num_tokens": 9955431.0, "step": 1200 }, { "epoch": 0.48, "eval_biology_entropy": 1.2901287813782691, "eval_biology_loss": 1.4074944257736206, "eval_biology_mean_token_accuracy": 0.6630856924653054, "eval_biology_num_tokens": 9955431.0, "eval_biology_runtime": 35.1485, "eval_biology_samples_per_second": 14.225, "eval_biology_steps_per_second": 14.225, "step": 1200 }, { "epoch": 0.48, "eval_chemistry_entropy": 0.8402542336285115, "eval_chemistry_loss": 0.8475250005722046, "eval_chemistry_mean_token_accuracy": 0.7706258528828621, "eval_chemistry_num_tokens": 9955431.0, "eval_chemistry_runtime": 39.5963, "eval_chemistry_samples_per_second": 12.627, "eval_chemistry_steps_per_second": 12.627, "step": 1200 }, { "epoch": 0.48, "eval_math_entropy": 0.7095566186904907, "eval_math_loss": 0.8719534873962402, "eval_math_mean_token_accuracy": 0.7730017136335373, "eval_math_num_tokens": 9955431.0, "eval_math_runtime": 40.4499, "eval_math_samples_per_second": 12.361, "eval_math_steps_per_second": 12.361, "step": 1200 }, { "epoch": 0.48, "eval_physics_entropy": 0.6804338894784451, "eval_physics_loss": 0.6772474646568298, "eval_physics_mean_token_accuracy": 0.8075065380334854, "eval_physics_num_tokens": 9955431.0, "eval_physics_runtime": 45.0788, "eval_physics_samples_per_second": 11.092, "eval_physics_steps_per_second": 11.092, "step": 1200 }, { "entropy": 0.6472635091282427, "epoch": 0.484, "grad_norm": 50.75, "learning_rate": 1.9535555555555557e-05, "loss": 10.4858, "mean_token_accuracy": 0.8127345737069845, "num_tokens": 10038293.0, "step": 1210 }, { "entropy": 0.6718290301039815, "epoch": 0.488, "grad_norm": 43.25, "learning_rate": 1.9513333333333335e-05, "loss": 10.7595, "mean_token_accuracy": 0.8104257199913263, "num_tokens": 10119599.0, "step": 1220 }, { "entropy": 0.7013039434328675, "epoch": 0.492, "grad_norm": 49.25, "learning_rate": 1.9491111111111113e-05, "loss": 11.2505, "mean_token_accuracy": 0.8000239260494709, "num_tokens": 10204514.0, "step": 1230 }, { "entropy": 0.6948929887264967, "epoch": 0.496, "grad_norm": 42.25, "learning_rate": 1.946888888888889e-05, "loss": 11.0562, "mean_token_accuracy": 0.8022828463464975, "num_tokens": 10289883.0, "step": 1240 }, { "entropy": 0.6491774883121252, "epoch": 0.5, "grad_norm": 48.25, "learning_rate": 1.9446666666666668e-05, "loss": 10.5893, "mean_token_accuracy": 0.8121375739574432, "num_tokens": 10368762.0, "step": 1250 }, { "entropy": 0.6618938606232405, "epoch": 0.504, "grad_norm": 43.5, "learning_rate": 1.9424444444444446e-05, "loss": 10.5939, "mean_token_accuracy": 0.8118096552789211, "num_tokens": 10454683.0, "step": 1260 }, { "entropy": 0.6799033954739571, "epoch": 0.508, "grad_norm": 47.0, "learning_rate": 1.9402222222222223e-05, "loss": 11.031, "mean_token_accuracy": 0.804159976914525, "num_tokens": 10539515.0, "step": 1270 }, { "entropy": 0.7304276229813695, "epoch": 0.512, "grad_norm": 51.5, "learning_rate": 1.938e-05, "loss": 11.6009, "mean_token_accuracy": 0.7962873011827469, "num_tokens": 10619468.0, "step": 1280 }, { "entropy": 0.6799104714766144, "epoch": 0.516, "grad_norm": 45.75, "learning_rate": 1.935777777777778e-05, "loss": 11.0138, "mean_token_accuracy": 0.805281289294362, "num_tokens": 10698439.0, "step": 1290 }, { "entropy": 0.6859343230724335, "epoch": 0.52, "grad_norm": 42.25, "learning_rate": 1.9335555555555556e-05, "loss": 11.027, "mean_token_accuracy": 0.8035755015909671, "num_tokens": 10781285.0, "step": 1300 }, { "epoch": 0.52, "eval_biology_entropy": 1.2939175048470497, "eval_biology_loss": 1.4010000228881836, "eval_biology_mean_token_accuracy": 0.6629679874181748, "eval_biology_num_tokens": 10781285.0, "eval_biology_runtime": 35.2822, "eval_biology_samples_per_second": 14.171, "eval_biology_steps_per_second": 14.171, "step": 1300 }, { "epoch": 0.52, "eval_chemistry_entropy": 0.8264291672706604, "eval_chemistry_loss": 0.847792387008667, "eval_chemistry_mean_token_accuracy": 0.7705023067593575, "eval_chemistry_num_tokens": 10781285.0, "eval_chemistry_runtime": 39.6216, "eval_chemistry_samples_per_second": 12.619, "eval_chemistry_steps_per_second": 12.619, "step": 1300 }, { "epoch": 0.52, "eval_math_entropy": 0.7105307767689228, "eval_math_loss": 0.8610548973083496, "eval_math_mean_token_accuracy": 0.7741031144857407, "eval_math_num_tokens": 10781285.0, "eval_math_runtime": 40.4357, "eval_math_samples_per_second": 12.365, "eval_math_steps_per_second": 12.365, "step": 1300 }, { "epoch": 0.52, "eval_physics_entropy": 0.6724484626352787, "eval_physics_loss": 0.6681262254714966, "eval_physics_mean_token_accuracy": 0.8100028432011604, "eval_physics_num_tokens": 10781285.0, "eval_physics_runtime": 45.0631, "eval_physics_samples_per_second": 11.096, "eval_physics_steps_per_second": 11.096, "step": 1300 }, { "entropy": 0.6645524255931378, "epoch": 0.524, "grad_norm": 47.0, "learning_rate": 1.9313333333333334e-05, "loss": 10.7825, "mean_token_accuracy": 0.8092559453099966, "num_tokens": 10864773.0, "step": 1310 }, { "entropy": 0.6470350125804544, "epoch": 0.528, "grad_norm": 41.0, "learning_rate": 1.9291111111111115e-05, "loss": 10.3184, "mean_token_accuracy": 0.815019316598773, "num_tokens": 10944904.0, "step": 1320 }, { "entropy": 0.6757032671943307, "epoch": 0.532, "grad_norm": 47.75, "learning_rate": 1.926888888888889e-05, "loss": 10.8658, "mean_token_accuracy": 0.8063097450882196, "num_tokens": 11023958.0, "step": 1330 }, { "entropy": 0.6243293879553675, "epoch": 0.536, "grad_norm": 46.25, "learning_rate": 1.924666666666667e-05, "loss": 10.1411, "mean_token_accuracy": 0.8193823467940092, "num_tokens": 11106538.0, "step": 1340 }, { "entropy": 0.6877710536122322, "epoch": 0.54, "grad_norm": 54.5, "learning_rate": 1.9224444444444444e-05, "loss": 10.9827, "mean_token_accuracy": 0.8047556940466165, "num_tokens": 11187142.0, "step": 1350 }, { "entropy": 0.7018949103541672, "epoch": 0.544, "grad_norm": 55.75, "learning_rate": 1.9202222222222225e-05, "loss": 11.3787, "mean_token_accuracy": 0.7996319092810154, "num_tokens": 11266409.0, "step": 1360 }, { "entropy": 0.6630384393967688, "epoch": 0.548, "grad_norm": 50.75, "learning_rate": 1.918e-05, "loss": 10.6384, "mean_token_accuracy": 0.8089293787255883, "num_tokens": 11350094.0, "step": 1370 }, { "entropy": 0.6659614780917764, "epoch": 0.552, "grad_norm": 45.75, "learning_rate": 1.915777777777778e-05, "loss": 10.7485, "mean_token_accuracy": 0.8104357924312353, "num_tokens": 11438391.0, "step": 1380 }, { "entropy": 0.6783489585854113, "epoch": 0.556, "grad_norm": 54.0, "learning_rate": 1.9135555555555555e-05, "loss": 10.738, "mean_token_accuracy": 0.8090345006436109, "num_tokens": 11521583.0, "step": 1390 }, { "entropy": 0.6332698877900839, "epoch": 0.56, "grad_norm": 42.0, "learning_rate": 1.9113333333333336e-05, "loss": 10.1535, "mean_token_accuracy": 0.817741920426488, "num_tokens": 11605544.0, "step": 1400 }, { "epoch": 0.56, "eval_biology_entropy": 1.2966757586598396, "eval_biology_loss": 1.4016809463500977, "eval_biology_mean_token_accuracy": 0.6630619642138481, "eval_biology_num_tokens": 11605544.0, "eval_biology_runtime": 35.3173, "eval_biology_samples_per_second": 14.157, "eval_biology_steps_per_second": 14.157, "step": 1400 }, { "epoch": 0.56, "eval_chemistry_entropy": 0.8152842739224434, "eval_chemistry_loss": 0.8491419553756714, "eval_chemistry_mean_token_accuracy": 0.7716290903687477, "eval_chemistry_num_tokens": 11605544.0, "eval_chemistry_runtime": 39.6109, "eval_chemistry_samples_per_second": 12.623, "eval_chemistry_steps_per_second": 12.623, "step": 1400 }, { "epoch": 0.56, "eval_math_entropy": 0.7092237346470356, "eval_math_loss": 0.8697728514671326, "eval_math_mean_token_accuracy": 0.7739188714027405, "eval_math_num_tokens": 11605544.0, "eval_math_runtime": 40.4481, "eval_math_samples_per_second": 12.362, "eval_math_steps_per_second": 12.362, "step": 1400 }, { "epoch": 0.56, "eval_physics_entropy": 0.6652475365400314, "eval_physics_loss": 0.6614710688591003, "eval_physics_mean_token_accuracy": 0.8117705038189889, "eval_physics_num_tokens": 11605544.0, "eval_physics_runtime": 45.1827, "eval_physics_samples_per_second": 11.066, "eval_physics_steps_per_second": 11.066, "step": 1400 }, { "entropy": 0.6269684578292072, "epoch": 0.564, "grad_norm": 42.5, "learning_rate": 1.9091111111111113e-05, "loss": 10.2481, "mean_token_accuracy": 0.8174567829817534, "num_tokens": 11686173.0, "step": 1410 }, { "entropy": 0.6821280302479863, "epoch": 0.568, "grad_norm": 38.25, "learning_rate": 1.906888888888889e-05, "loss": 10.6848, "mean_token_accuracy": 0.8115028716623783, "num_tokens": 11765430.0, "step": 1420 }, { "entropy": 0.6392442825250327, "epoch": 0.572, "grad_norm": 44.0, "learning_rate": 1.904666666666667e-05, "loss": 10.5018, "mean_token_accuracy": 0.8134302724152803, "num_tokens": 11850973.0, "step": 1430 }, { "entropy": 0.6551524167880416, "epoch": 0.576, "grad_norm": 56.25, "learning_rate": 1.9024444444444446e-05, "loss": 10.4141, "mean_token_accuracy": 0.8127408139407635, "num_tokens": 11931715.0, "step": 1440 }, { "entropy": 0.6323688972741366, "epoch": 0.58, "grad_norm": 47.75, "learning_rate": 1.9002222222222224e-05, "loss": 9.9927, "mean_token_accuracy": 0.8190583731979132, "num_tokens": 12012155.0, "step": 1450 }, { "entropy": 0.6468079431913794, "epoch": 0.584, "grad_norm": 44.25, "learning_rate": 1.898e-05, "loss": 10.5496, "mean_token_accuracy": 0.8117645151913166, "num_tokens": 12093686.0, "step": 1460 }, { "entropy": 0.6574975445866584, "epoch": 0.588, "grad_norm": 46.5, "learning_rate": 1.895777777777778e-05, "loss": 10.4738, "mean_token_accuracy": 0.81148545704782, "num_tokens": 12177581.0, "step": 1470 }, { "entropy": 0.6759683048352599, "epoch": 0.592, "grad_norm": 46.25, "learning_rate": 1.8935555555555556e-05, "loss": 10.844, "mean_token_accuracy": 0.8061960492283106, "num_tokens": 12257243.0, "step": 1480 }, { "entropy": 0.6535733859054744, "epoch": 0.596, "grad_norm": 58.75, "learning_rate": 1.8913333333333334e-05, "loss": 10.5604, "mean_token_accuracy": 0.8106869570910931, "num_tokens": 12338544.0, "step": 1490 }, { "entropy": 0.635299691837281, "epoch": 0.6, "grad_norm": 45.25, "learning_rate": 1.8891111111111115e-05, "loss": 10.1003, "mean_token_accuracy": 0.8157124288380146, "num_tokens": 12422008.0, "step": 1500 }, { "epoch": 0.6, "eval_biology_entropy": 1.2644084550142287, "eval_biology_loss": 1.403598666191101, "eval_biology_mean_token_accuracy": 0.662936574101448, "eval_biology_num_tokens": 12422008.0, "eval_biology_runtime": 35.0607, "eval_biology_samples_per_second": 14.261, "eval_biology_steps_per_second": 14.261, "step": 1500 }, { "epoch": 0.6, "eval_chemistry_entropy": 0.7927452449202538, "eval_chemistry_loss": 0.8444129824638367, "eval_chemistry_mean_token_accuracy": 0.7716013901829719, "eval_chemistry_num_tokens": 12422008.0, "eval_chemistry_runtime": 39.6183, "eval_chemistry_samples_per_second": 12.62, "eval_chemistry_steps_per_second": 12.62, "step": 1500 }, { "epoch": 0.6, "eval_math_entropy": 0.6911805932819843, "eval_math_loss": 0.8791993260383606, "eval_math_mean_token_accuracy": 0.7729427699446678, "eval_math_num_tokens": 12422008.0, "eval_math_runtime": 40.5514, "eval_math_samples_per_second": 12.33, "eval_math_steps_per_second": 12.33, "step": 1500 }, { "epoch": 0.6, "eval_physics_entropy": 0.635497625797987, "eval_physics_loss": 0.6566235423088074, "eval_physics_mean_token_accuracy": 0.8124136482477188, "eval_physics_num_tokens": 12422008.0, "eval_physics_runtime": 45.1105, "eval_physics_samples_per_second": 11.084, "eval_physics_steps_per_second": 11.084, "step": 1500 }, { "entropy": 0.6438517822884023, "epoch": 0.604, "grad_norm": 46.25, "learning_rate": 1.886888888888889e-05, "loss": 10.4253, "mean_token_accuracy": 0.8152689874172211, "num_tokens": 12505134.0, "step": 1510 }, { "entropy": 0.6165852569974959, "epoch": 0.608, "grad_norm": 50.25, "learning_rate": 1.884666666666667e-05, "loss": 9.8857, "mean_token_accuracy": 0.8216613974422217, "num_tokens": 12583154.0, "step": 1520 }, { "entropy": 0.6570021582767367, "epoch": 0.612, "grad_norm": 42.75, "learning_rate": 1.8824444444444445e-05, "loss": 10.4566, "mean_token_accuracy": 0.8129478100687265, "num_tokens": 12666601.0, "step": 1530 }, { "entropy": 0.6499744065105915, "epoch": 0.616, "grad_norm": 42.5, "learning_rate": 1.8802222222222226e-05, "loss": 10.3697, "mean_token_accuracy": 0.8139124307781458, "num_tokens": 12746458.0, "step": 1540 }, { "entropy": 0.6602890061214566, "epoch": 0.62, "grad_norm": 39.0, "learning_rate": 1.878e-05, "loss": 10.7086, "mean_token_accuracy": 0.8104711420834064, "num_tokens": 12831572.0, "step": 1550 }, { "entropy": 0.6643684912472964, "epoch": 0.624, "grad_norm": 47.75, "learning_rate": 1.875777777777778e-05, "loss": 10.5879, "mean_token_accuracy": 0.8116141181439162, "num_tokens": 12905392.0, "step": 1560 }, { "entropy": 0.6584846514277161, "epoch": 0.628, "grad_norm": 43.75, "learning_rate": 1.873555555555556e-05, "loss": 10.5432, "mean_token_accuracy": 0.8107300482690334, "num_tokens": 12986513.0, "step": 1570 }, { "entropy": 0.638776770234108, "epoch": 0.632, "grad_norm": 39.5, "learning_rate": 1.8713333333333336e-05, "loss": 10.2261, "mean_token_accuracy": 0.814822031930089, "num_tokens": 13071058.0, "step": 1580 }, { "entropy": 0.6551781664602458, "epoch": 0.636, "grad_norm": 44.0, "learning_rate": 1.8691111111111114e-05, "loss": 10.5182, "mean_token_accuracy": 0.8119142647832632, "num_tokens": 13151842.0, "step": 1590 }, { "entropy": 0.7266039110720157, "epoch": 0.64, "grad_norm": 46.75, "learning_rate": 1.866888888888889e-05, "loss": 11.6713, "mean_token_accuracy": 0.7968564338982105, "num_tokens": 13232198.0, "step": 1600 }, { "epoch": 0.64, "eval_biology_entropy": 1.3351151769161225, "eval_biology_loss": 1.3962006568908691, "eval_biology_mean_token_accuracy": 0.6624402161240578, "eval_biology_num_tokens": 13232198.0, "eval_biology_runtime": 35.2387, "eval_biology_samples_per_second": 14.189, "eval_biology_steps_per_second": 14.189, "step": 1600 }, { "epoch": 0.64, "eval_chemistry_entropy": 0.8378922098278999, "eval_chemistry_loss": 0.8399096727371216, "eval_chemistry_mean_token_accuracy": 0.771498102247715, "eval_chemistry_num_tokens": 13232198.0, "eval_chemistry_runtime": 39.6463, "eval_chemistry_samples_per_second": 12.612, "eval_chemistry_steps_per_second": 12.612, "step": 1600 }, { "epoch": 0.64, "eval_math_entropy": 0.7215032837092876, "eval_math_loss": 0.8692191243171692, "eval_math_mean_token_accuracy": 0.7733984060287475, "eval_math_num_tokens": 13232198.0, "eval_math_runtime": 40.5046, "eval_math_samples_per_second": 12.344, "eval_math_steps_per_second": 12.344, "step": 1600 }, { "epoch": 0.64, "eval_physics_entropy": 0.6693824680745601, "eval_physics_loss": 0.6483619213104248, "eval_physics_mean_token_accuracy": 0.8146560984253883, "eval_physics_num_tokens": 13232198.0, "eval_physics_runtime": 45.1205, "eval_physics_samples_per_second": 11.081, "eval_physics_steps_per_second": 11.081, "step": 1600 }, { "entropy": 0.6329503210261465, "epoch": 0.644, "grad_norm": 41.5, "learning_rate": 1.864666666666667e-05, "loss": 10.1519, "mean_token_accuracy": 0.8150339797139168, "num_tokens": 13318037.0, "step": 1610 }, { "entropy": 0.6718782410956919, "epoch": 0.648, "grad_norm": 48.25, "learning_rate": 1.8624444444444446e-05, "loss": 10.6876, "mean_token_accuracy": 0.8110412795096635, "num_tokens": 13401064.0, "step": 1620 }, { "entropy": 0.6298892620950938, "epoch": 0.652, "grad_norm": 52.0, "learning_rate": 1.8602222222222224e-05, "loss": 10.2865, "mean_token_accuracy": 0.8168573569506407, "num_tokens": 13488222.0, "step": 1630 }, { "entropy": 0.6405643708072603, "epoch": 0.656, "grad_norm": 39.75, "learning_rate": 1.858e-05, "loss": 10.0944, "mean_token_accuracy": 0.8184370543807745, "num_tokens": 13575902.0, "step": 1640 }, { "entropy": 0.6571455264464021, "epoch": 0.66, "grad_norm": 41.25, "learning_rate": 1.855777777777778e-05, "loss": 10.5153, "mean_token_accuracy": 0.8104129206389189, "num_tokens": 13655826.0, "step": 1650 }, { "entropy": 0.6566264102235436, "epoch": 0.664, "grad_norm": 55.5, "learning_rate": 1.8535555555555557e-05, "loss": 10.5224, "mean_token_accuracy": 0.8120165556669235, "num_tokens": 13733192.0, "step": 1660 }, { "entropy": 0.6292093011550606, "epoch": 0.668, "grad_norm": 37.0, "learning_rate": 1.8513333333333335e-05, "loss": 10.0179, "mean_token_accuracy": 0.8215251605957746, "num_tokens": 13816716.0, "step": 1670 }, { "entropy": 0.6717162574641407, "epoch": 0.672, "grad_norm": 42.5, "learning_rate": 1.8491111111111112e-05, "loss": 10.9238, "mean_token_accuracy": 0.8044366929680109, "num_tokens": 13895997.0, "step": 1680 }, { "entropy": 0.6257681073620915, "epoch": 0.676, "grad_norm": 41.75, "learning_rate": 1.846888888888889e-05, "loss": 9.9056, "mean_token_accuracy": 0.8205471843481064, "num_tokens": 13988168.0, "step": 1690 }, { "entropy": 0.6243995727039874, "epoch": 0.68, "grad_norm": 45.5, "learning_rate": 1.8446666666666667e-05, "loss": 10.0775, "mean_token_accuracy": 0.8218194592744112, "num_tokens": 14074579.0, "step": 1700 }, { "epoch": 0.68, "eval_biology_entropy": 1.3095218172669412, "eval_biology_loss": 1.3988749980926514, "eval_biology_mean_token_accuracy": 0.6642519612908363, "eval_biology_num_tokens": 14074579.0, "eval_biology_runtime": 35.376, "eval_biology_samples_per_second": 14.134, "eval_biology_steps_per_second": 14.134, "step": 1700 }, { "epoch": 0.68, "eval_chemistry_entropy": 0.810583723127842, "eval_chemistry_loss": 0.8362264633178711, "eval_chemistry_mean_token_accuracy": 0.7733853607773781, "eval_chemistry_num_tokens": 14074579.0, "eval_chemistry_runtime": 39.7857, "eval_chemistry_samples_per_second": 12.567, "eval_chemistry_steps_per_second": 12.567, "step": 1700 }, { "epoch": 0.68, "eval_math_entropy": 0.7038493918180466, "eval_math_loss": 0.8700717687606812, "eval_math_mean_token_accuracy": 0.7738546322584152, "eval_math_num_tokens": 14074579.0, "eval_math_runtime": 40.5553, "eval_math_samples_per_second": 12.329, "eval_math_steps_per_second": 12.329, "step": 1700 }, { "epoch": 0.68, "eval_physics_entropy": 0.6500247009396553, "eval_physics_loss": 0.6439645290374756, "eval_physics_mean_token_accuracy": 0.8147145962715149, "eval_physics_num_tokens": 14074579.0, "eval_physics_runtime": 45.1157, "eval_physics_samples_per_second": 11.083, "eval_physics_steps_per_second": 11.083, "step": 1700 }, { "entropy": 0.701755482237786, "epoch": 0.684, "grad_norm": 38.0, "learning_rate": 1.842444444444445e-05, "loss": 11.2097, "mean_token_accuracy": 0.8015010356903076, "num_tokens": 14153649.0, "step": 1710 }, { "entropy": 0.6312699088826775, "epoch": 0.688, "grad_norm": 41.5, "learning_rate": 1.8402222222222223e-05, "loss": 10.0591, "mean_token_accuracy": 0.8177381068468094, "num_tokens": 14234888.0, "step": 1720 }, { "entropy": 0.6341699136421084, "epoch": 0.692, "grad_norm": 49.75, "learning_rate": 1.8380000000000004e-05, "loss": 10.2054, "mean_token_accuracy": 0.8159505747258663, "num_tokens": 14323183.0, "step": 1730 }, { "entropy": 0.6394278825260699, "epoch": 0.696, "grad_norm": 44.0, "learning_rate": 1.8357777777777778e-05, "loss": 10.2597, "mean_token_accuracy": 0.8152242347598075, "num_tokens": 14411150.0, "step": 1740 }, { "entropy": 0.6502743780612945, "epoch": 0.7, "grad_norm": 38.0, "learning_rate": 1.833555555555556e-05, "loss": 10.4132, "mean_token_accuracy": 0.8118325512856245, "num_tokens": 14490628.0, "step": 1750 }, { "entropy": 0.6663031128235162, "epoch": 0.704, "grad_norm": 45.0, "learning_rate": 1.8313333333333333e-05, "loss": 10.7733, "mean_token_accuracy": 0.8083227630704641, "num_tokens": 14567908.0, "step": 1760 }, { "entropy": 0.6487898311577738, "epoch": 0.708, "grad_norm": 40.0, "learning_rate": 1.8291111111111114e-05, "loss": 10.2458, "mean_token_accuracy": 0.81355558373034, "num_tokens": 14647291.0, "step": 1770 }, { "entropy": 0.6143384758383036, "epoch": 0.712, "grad_norm": 42.75, "learning_rate": 1.8268888888888888e-05, "loss": 9.8792, "mean_token_accuracy": 0.8224807776510715, "num_tokens": 14728975.0, "step": 1780 }, { "entropy": 0.6461778385564685, "epoch": 0.716, "grad_norm": 42.5, "learning_rate": 1.824666666666667e-05, "loss": 10.16, "mean_token_accuracy": 0.8192640010267496, "num_tokens": 14804586.0, "step": 1790 }, { "entropy": 0.6274462300352752, "epoch": 0.72, "grad_norm": 43.5, "learning_rate": 1.8224444444444447e-05, "loss": 10.2187, "mean_token_accuracy": 0.8162837028503418, "num_tokens": 14882927.0, "step": 1800 }, { "epoch": 0.72, "eval_biology_entropy": 1.26099048101902, "eval_biology_loss": 1.4029406309127808, "eval_biology_mean_token_accuracy": 0.6639119000434875, "eval_biology_num_tokens": 14882927.0, "eval_biology_runtime": 35.2293, "eval_biology_samples_per_second": 14.193, "eval_biology_steps_per_second": 14.193, "step": 1800 }, { "epoch": 0.72, "eval_chemistry_entropy": 0.793392687290907, "eval_chemistry_loss": 0.832188606262207, "eval_chemistry_mean_token_accuracy": 0.7744238600730896, "eval_chemistry_num_tokens": 14882927.0, "eval_chemistry_runtime": 39.6195, "eval_chemistry_samples_per_second": 12.62, "eval_chemistry_steps_per_second": 12.62, "step": 1800 }, { "epoch": 0.72, "eval_math_entropy": 0.6914696083068848, "eval_math_loss": 0.8720027804374695, "eval_math_mean_token_accuracy": 0.7736118195652961, "eval_math_num_tokens": 14882927.0, "eval_math_runtime": 40.5243, "eval_math_samples_per_second": 12.338, "eval_math_steps_per_second": 12.338, "step": 1800 }, { "epoch": 0.72, "eval_physics_entropy": 0.6349251216948032, "eval_physics_loss": 0.6378234624862671, "eval_physics_mean_token_accuracy": 0.8165309005975723, "eval_physics_num_tokens": 14882927.0, "eval_physics_runtime": 45.1781, "eval_physics_samples_per_second": 11.067, "eval_physics_steps_per_second": 11.067, "step": 1800 }, { "entropy": 0.6608743364922702, "epoch": 0.724, "grad_norm": 44.5, "learning_rate": 1.8202222222222225e-05, "loss": 10.4493, "mean_token_accuracy": 0.8107382688671351, "num_tokens": 14966717.0, "step": 1810 }, { "entropy": 0.5937761082313955, "epoch": 0.728, "grad_norm": 43.25, "learning_rate": 1.8180000000000002e-05, "loss": 9.6358, "mean_token_accuracy": 0.8237139344215393, "num_tokens": 15052770.0, "step": 1820 }, { "entropy": 0.6625786048360169, "epoch": 0.732, "grad_norm": 37.0, "learning_rate": 1.815777777777778e-05, "loss": 10.5736, "mean_token_accuracy": 0.8131286226212978, "num_tokens": 15135882.0, "step": 1830 }, { "entropy": 0.6253975971601904, "epoch": 0.736, "grad_norm": 49.5, "learning_rate": 1.8135555555555557e-05, "loss": 9.9433, "mean_token_accuracy": 0.8202268972992897, "num_tokens": 15217342.0, "step": 1840 }, { "entropy": 0.6260659927502275, "epoch": 0.74, "grad_norm": 45.75, "learning_rate": 1.8113333333333335e-05, "loss": 10.0495, "mean_token_accuracy": 0.819375942274928, "num_tokens": 15303623.0, "step": 1850 }, { "entropy": 0.6029338354244829, "epoch": 0.744, "grad_norm": 46.75, "learning_rate": 1.8091111111111113e-05, "loss": 9.5649, "mean_token_accuracy": 0.8268326785415411, "num_tokens": 15385161.0, "step": 1860 }, { "entropy": 0.6492635687813163, "epoch": 0.748, "grad_norm": 47.0, "learning_rate": 1.806888888888889e-05, "loss": 10.568, "mean_token_accuracy": 0.810694944486022, "num_tokens": 15466126.0, "step": 1870 }, { "entropy": 0.5932938390411436, "epoch": 0.752, "grad_norm": 43.25, "learning_rate": 1.8046666666666668e-05, "loss": 9.4157, "mean_token_accuracy": 0.8285971570760011, "num_tokens": 15547710.0, "step": 1880 }, { "entropy": 0.6472898460924625, "epoch": 0.756, "grad_norm": 47.75, "learning_rate": 1.8024444444444445e-05, "loss": 10.2805, "mean_token_accuracy": 0.8147294465452433, "num_tokens": 15625904.0, "step": 1890 }, { "entropy": 0.6560673840343952, "epoch": 0.76, "grad_norm": 43.75, "learning_rate": 1.8002222222222223e-05, "loss": 10.5391, "mean_token_accuracy": 0.8101910080760717, "num_tokens": 15704494.0, "step": 1900 }, { "epoch": 0.76, "eval_biology_entropy": 1.3165188397169114, "eval_biology_loss": 1.4028364419937134, "eval_biology_mean_token_accuracy": 0.6636078572869301, "eval_biology_num_tokens": 15704494.0, "eval_biology_runtime": 35.6648, "eval_biology_samples_per_second": 14.019, "eval_biology_steps_per_second": 14.019, "step": 1900 }, { "epoch": 0.76, "eval_chemistry_entropy": 0.8100777663588524, "eval_chemistry_loss": 0.8284242749214172, "eval_chemistry_mean_token_accuracy": 0.7751321754455567, "eval_chemistry_num_tokens": 15704494.0, "eval_chemistry_runtime": 39.604, "eval_chemistry_samples_per_second": 12.625, "eval_chemistry_steps_per_second": 12.625, "step": 1900 }, { "epoch": 0.76, "eval_math_entropy": 0.7177843990921974, "eval_math_loss": 0.8692219853401184, "eval_math_mean_token_accuracy": 0.7739469853639602, "eval_math_num_tokens": 15704494.0, "eval_math_runtime": 40.511, "eval_math_samples_per_second": 12.342, "eval_math_steps_per_second": 12.342, "step": 1900 }, { "epoch": 0.76, "eval_physics_entropy": 0.6514912424087524, "eval_physics_loss": 0.6343158483505249, "eval_physics_mean_token_accuracy": 0.8171744772791862, "eval_physics_num_tokens": 15704494.0, "eval_physics_runtime": 45.1095, "eval_physics_samples_per_second": 11.084, "eval_physics_steps_per_second": 11.084, "step": 1900 }, { "entropy": 0.6272192076779902, "epoch": 0.764, "grad_norm": 38.25, "learning_rate": 1.798e-05, "loss": 9.9781, "mean_token_accuracy": 0.8175393354147673, "num_tokens": 15792219.0, "step": 1910 }, { "entropy": 0.619711946696043, "epoch": 0.768, "grad_norm": 39.25, "learning_rate": 1.7957777777777778e-05, "loss": 10.014, "mean_token_accuracy": 0.8173364765942097, "num_tokens": 15877177.0, "step": 1920 }, { "entropy": 0.6192873228341341, "epoch": 0.772, "grad_norm": 38.0, "learning_rate": 1.7935555555555556e-05, "loss": 9.8355, "mean_token_accuracy": 0.8235324405133724, "num_tokens": 15965749.0, "step": 1930 }, { "entropy": 0.6311636022292078, "epoch": 0.776, "grad_norm": 39.0, "learning_rate": 1.7913333333333337e-05, "loss": 10.0725, "mean_token_accuracy": 0.8187215372920036, "num_tokens": 16050530.0, "step": 1940 }, { "entropy": 0.6458089170977473, "epoch": 0.78, "grad_norm": 40.25, "learning_rate": 1.789111111111111e-05, "loss": 10.399, "mean_token_accuracy": 0.8167071305215359, "num_tokens": 16136804.0, "step": 1950 }, { "entropy": 0.6231075780466199, "epoch": 0.784, "grad_norm": 43.25, "learning_rate": 1.7868888888888892e-05, "loss": 9.9612, "mean_token_accuracy": 0.8209680158644914, "num_tokens": 16219640.0, "step": 1960 }, { "entropy": 0.6302494045346976, "epoch": 0.788, "grad_norm": 39.0, "learning_rate": 1.7846666666666666e-05, "loss": 9.9731, "mean_token_accuracy": 0.8196312204003334, "num_tokens": 16299595.0, "step": 1970 }, { "entropy": 0.6404457478784025, "epoch": 0.792, "grad_norm": 51.5, "learning_rate": 1.7824444444444447e-05, "loss": 10.3109, "mean_token_accuracy": 0.8141773957759142, "num_tokens": 16378820.0, "step": 1980 }, { "entropy": 0.644137370865792, "epoch": 0.796, "grad_norm": 44.25, "learning_rate": 1.780222222222222e-05, "loss": 10.3859, "mean_token_accuracy": 0.8138818342238665, "num_tokens": 16464611.0, "step": 1990 }, { "entropy": 0.6414632762782275, "epoch": 0.8, "grad_norm": 35.0, "learning_rate": 1.7780000000000003e-05, "loss": 10.2348, "mean_token_accuracy": 0.8169484097510576, "num_tokens": 16548261.0, "step": 2000 }, { "epoch": 0.8, "eval_biology_entropy": 1.310195390045643, "eval_biology_loss": 1.3905400037765503, "eval_biology_mean_token_accuracy": 0.6651642812490464, "eval_biology_num_tokens": 16548261.0, "eval_biology_runtime": 35.3115, "eval_biology_samples_per_second": 14.16, "eval_biology_steps_per_second": 14.16, "step": 2000 }, { "epoch": 0.8, "eval_chemistry_entropy": 0.7988228353559971, "eval_chemistry_loss": 0.8260599374771118, "eval_chemistry_mean_token_accuracy": 0.7754636185765267, "eval_chemistry_num_tokens": 16548261.0, "eval_chemistry_runtime": 39.4167, "eval_chemistry_samples_per_second": 12.685, "eval_chemistry_steps_per_second": 12.685, "step": 2000 }, { "epoch": 0.8, "eval_math_entropy": 0.6841731403172017, "eval_math_loss": 0.87116539478302, "eval_math_mean_token_accuracy": 0.7759169373512268, "eval_math_num_tokens": 16548261.0, "eval_math_runtime": 40.3625, "eval_math_samples_per_second": 12.388, "eval_math_steps_per_second": 12.388, "step": 2000 }, { "epoch": 0.8, "eval_physics_entropy": 0.6220660482347011, "eval_physics_loss": 0.6307912468910217, "eval_physics_mean_token_accuracy": 0.818623253762722, "eval_physics_num_tokens": 16548261.0, "eval_physics_runtime": 45.0164, "eval_physics_samples_per_second": 11.107, "eval_physics_steps_per_second": 11.107, "step": 2000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1096331558472863e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }