{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.2, "eval_steps": 100, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.813749196380377, "epoch": 0.016, "grad_norm": 21.625, "learning_rate": 1.8e-07, "loss": 2.3527, "mean_token_accuracy": 0.5754138542339206, "num_tokens": 195510.0, "step": 10 }, { "entropy": 1.7928078502416611, "epoch": 0.032, "grad_norm": 16.75, "learning_rate": 3.8e-07, "loss": 2.3363, "mean_token_accuracy": 0.5766932301223278, "num_tokens": 391008.0, "step": 20 }, { "entropy": 1.8388511516153812, "epoch": 0.048, "grad_norm": 17.25, "learning_rate": 5.800000000000001e-07, "loss": 2.3414, "mean_token_accuracy": 0.5755490336567164, "num_tokens": 590136.0, "step": 30 }, { "entropy": 1.853544193506241, "epoch": 0.064, "grad_norm": 12.5, "learning_rate": 7.8e-07, "loss": 2.2941, "mean_token_accuracy": 0.577743387594819, "num_tokens": 791432.0, "step": 40 }, { "entropy": 1.9382900901138782, "epoch": 0.08, "grad_norm": 12.3125, "learning_rate": 9.800000000000001e-07, "loss": 2.2698, "mean_token_accuracy": 0.5786384884268045, "num_tokens": 990342.0, "step": 50 }, { "entropy": 2.012592934817076, "epoch": 0.096, "grad_norm": 7.59375, "learning_rate": 1.1800000000000001e-06, "loss": 2.2841, "mean_token_accuracy": 0.5789956836029887, "num_tokens": 1182339.0, "step": 60 }, { "entropy": 2.020173179358244, "epoch": 0.112, "grad_norm": 5.59375, "learning_rate": 1.3800000000000001e-06, "loss": 2.1881, "mean_token_accuracy": 0.5869531789794564, "num_tokens": 1386193.0, "step": 70 }, { "entropy": 2.042734081298113, "epoch": 0.128, "grad_norm": 5.75, "learning_rate": 1.5800000000000001e-06, "loss": 2.1864, "mean_token_accuracy": 0.5859880622476339, "num_tokens": 1583261.0, "step": 80 }, { "entropy": 1.9902476705610752, "epoch": 0.144, "grad_norm": 5.6875, "learning_rate": 1.7800000000000001e-06, "loss": 2.1581, "mean_token_accuracy": 0.5905994104221464, "num_tokens": 1774826.0, "step": 90 }, { "entropy": 2.038115797191858, "epoch": 0.16, "grad_norm": 5.9375, "learning_rate": 1.98e-06, "loss": 2.1863, "mean_token_accuracy": 0.5856570154428482, "num_tokens": 1971310.0, "step": 100 }, { "epoch": 0.16, "eval_biology_entropy": 1.9723965148925782, "eval_biology_loss": 2.1413300037384033, "eval_biology_mean_token_accuracy": 0.6000847628116608, "eval_biology_num_tokens": 1971310.0, "eval_biology_runtime": 38.6804, "eval_biology_samples_per_second": 12.926, "eval_biology_steps_per_second": 3.232, "step": 100 }, { "epoch": 0.16, "eval_physics_entropy": 0.6801536810398102, "eval_physics_loss": 0.7345870733261108, "eval_physics_mean_token_accuracy": 0.8230011963844299, "eval_physics_num_tokens": 1971310.0, "eval_physics_runtime": 56.9468, "eval_physics_samples_per_second": 8.78, "eval_physics_steps_per_second": 2.195, "step": 100 }, { "entropy": 1.9897258900105954, "epoch": 0.176, "grad_norm": 4.90625, "learning_rate": 2.1800000000000003e-06, "loss": 2.1203, "mean_token_accuracy": 0.5903370209038258, "num_tokens": 2169453.0, "step": 110 }, { "entropy": 1.9791964955627919, "epoch": 0.192, "grad_norm": 5.8125, "learning_rate": 2.38e-06, "loss": 2.1168, "mean_token_accuracy": 0.5923390585929156, "num_tokens": 2366844.0, "step": 120 }, { "entropy": 1.9716377504169942, "epoch": 0.208, "grad_norm": 5.1875, "learning_rate": 2.5800000000000003e-06, "loss": 2.1059, "mean_token_accuracy": 0.5896596999838948, "num_tokens": 2559708.0, "step": 130 }, { "entropy": 1.9294324100017548, "epoch": 0.224, "grad_norm": 5.53125, "learning_rate": 2.7800000000000005e-06, "loss": 2.0638, "mean_token_accuracy": 0.6001523662358522, "num_tokens": 2756533.0, "step": 140 }, { "entropy": 1.9123736470937729, "epoch": 0.24, "grad_norm": 5.0625, "learning_rate": 2.9800000000000003e-06, "loss": 2.0241, "mean_token_accuracy": 0.6036131344735622, "num_tokens": 2948600.0, "step": 150 }, { "entropy": 1.9347217477858067, "epoch": 0.256, "grad_norm": 5.15625, "learning_rate": 3.1800000000000005e-06, "loss": 2.0531, "mean_token_accuracy": 0.5974894173443317, "num_tokens": 3141473.0, "step": 160 }, { "entropy": 1.9242084234952928, "epoch": 0.272, "grad_norm": 5.09375, "learning_rate": 3.3800000000000007e-06, "loss": 2.0261, "mean_token_accuracy": 0.5972955338656902, "num_tokens": 3337285.0, "step": 170 }, { "entropy": 1.8868879064917565, "epoch": 0.288, "grad_norm": 5.375, "learning_rate": 3.58e-06, "loss": 1.9858, "mean_token_accuracy": 0.6028371442109346, "num_tokens": 3541221.0, "step": 180 }, { "entropy": 1.8624922472983598, "epoch": 0.304, "grad_norm": 4.71875, "learning_rate": 3.7800000000000002e-06, "loss": 1.943, "mean_token_accuracy": 0.6070948902517557, "num_tokens": 3734952.0, "step": 190 }, { "entropy": 1.881417414546013, "epoch": 0.32, "grad_norm": 5.21875, "learning_rate": 3.980000000000001e-06, "loss": 1.9609, "mean_token_accuracy": 0.603725228458643, "num_tokens": 3922209.0, "step": 200 }, { "epoch": 0.32, "eval_biology_entropy": 1.8500013475418091, "eval_biology_loss": 1.9382704496383667, "eval_biology_mean_token_accuracy": 0.6156538977622986, "eval_biology_num_tokens": 3922209.0, "eval_biology_runtime": 38.6714, "eval_biology_samples_per_second": 12.929, "eval_biology_steps_per_second": 3.232, "step": 200 }, { "epoch": 0.32, "eval_physics_entropy": 0.6814348955154419, "eval_physics_loss": 0.7077023386955261, "eval_physics_mean_token_accuracy": 0.8221852016448975, "eval_physics_num_tokens": 3922209.0, "eval_physics_runtime": 56.9406, "eval_physics_samples_per_second": 8.781, "eval_physics_steps_per_second": 2.195, "step": 200 }, { "entropy": 1.8236718311905862, "epoch": 0.336, "grad_norm": 5.125, "learning_rate": 4.18e-06, "loss": 1.9029, "mean_token_accuracy": 0.6111209694296121, "num_tokens": 4115941.0, "step": 210 }, { "entropy": 1.851329033076763, "epoch": 0.352, "grad_norm": 5.0, "learning_rate": 4.38e-06, "loss": 1.8904, "mean_token_accuracy": 0.6098318418487907, "num_tokens": 4308755.0, "step": 220 }, { "entropy": 1.8281462915241717, "epoch": 0.368, "grad_norm": 4.65625, "learning_rate": 4.58e-06, "loss": 1.8692, "mean_token_accuracy": 0.6131063066422939, "num_tokens": 4505910.0, "step": 230 }, { "entropy": 1.831401702016592, "epoch": 0.384, "grad_norm": 4.96875, "learning_rate": 4.78e-06, "loss": 1.8357, "mean_token_accuracy": 0.6127742934972048, "num_tokens": 4696101.0, "step": 240 }, { "entropy": 1.816836379468441, "epoch": 0.4, "grad_norm": 4.75, "learning_rate": 4.980000000000001e-06, "loss": 1.8158, "mean_token_accuracy": 0.614238902926445, "num_tokens": 4889445.0, "step": 250 }, { "entropy": 1.8351925887167453, "epoch": 0.416, "grad_norm": 4.5625, "learning_rate": 5.18e-06, "loss": 1.8225, "mean_token_accuracy": 0.6112797066569329, "num_tokens": 5087624.0, "step": 260 }, { "entropy": 1.8272491071373225, "epoch": 0.432, "grad_norm": 5.5, "learning_rate": 5.380000000000001e-06, "loss": 1.8098, "mean_token_accuracy": 0.6129346057772637, "num_tokens": 5273794.0, "step": 270 }, { "entropy": 1.8296721816062926, "epoch": 0.448, "grad_norm": 5.46875, "learning_rate": 5.580000000000001e-06, "loss": 1.7925, "mean_token_accuracy": 0.6138202898204327, "num_tokens": 5463288.0, "step": 280 }, { "entropy": 1.7533989433199166, "epoch": 0.464, "grad_norm": 5.09375, "learning_rate": 5.78e-06, "loss": 1.7277, "mean_token_accuracy": 0.625254824757576, "num_tokens": 5656827.0, "step": 290 }, { "entropy": 1.7856548748910428, "epoch": 0.48, "grad_norm": 4.75, "learning_rate": 5.98e-06, "loss": 1.7492, "mean_token_accuracy": 0.6199217233806849, "num_tokens": 5853311.0, "step": 300 }, { "epoch": 0.48, "eval_biology_entropy": 1.766937505722046, "eval_biology_loss": 1.737308144569397, "eval_biology_mean_token_accuracy": 0.6296310710906983, "eval_biology_num_tokens": 5853311.0, "eval_biology_runtime": 38.676, "eval_biology_samples_per_second": 12.928, "eval_biology_steps_per_second": 3.232, "step": 300 }, { "epoch": 0.48, "eval_physics_entropy": 0.699759515285492, "eval_physics_loss": 0.6600434184074402, "eval_physics_mean_token_accuracy": 0.8216852688789368, "eval_physics_num_tokens": 5853311.0, "eval_physics_runtime": 56.9184, "eval_physics_samples_per_second": 8.785, "eval_physics_steps_per_second": 2.196, "step": 300 }, { "entropy": 1.6893749851733446, "epoch": 0.496, "grad_norm": 5.125, "learning_rate": 6.18e-06, "loss": 1.6529, "mean_token_accuracy": 0.6324049789458513, "num_tokens": 6049788.0, "step": 310 }, { "entropy": 1.777088150382042, "epoch": 0.512, "grad_norm": 5.0625, "learning_rate": 6.380000000000001e-06, "loss": 1.7373, "mean_token_accuracy": 0.6215202666819095, "num_tokens": 6243825.0, "step": 320 }, { "entropy": 1.7654731079936028, "epoch": 0.528, "grad_norm": 4.75, "learning_rate": 6.5800000000000005e-06, "loss": 1.7104, "mean_token_accuracy": 0.6297121163457632, "num_tokens": 6434188.0, "step": 330 }, { "entropy": 1.7458826407790184, "epoch": 0.544, "grad_norm": 4.59375, "learning_rate": 6.780000000000001e-06, "loss": 1.7134, "mean_token_accuracy": 0.6295231115072966, "num_tokens": 6629832.0, "step": 340 }, { "entropy": 1.7250068325549364, "epoch": 0.56, "grad_norm": 4.375, "learning_rate": 6.98e-06, "loss": 1.6747, "mean_token_accuracy": 0.6324008051306009, "num_tokens": 6824668.0, "step": 350 }, { "entropy": 1.6957500472664833, "epoch": 0.576, "grad_norm": 4.8125, "learning_rate": 7.180000000000001e-06, "loss": 1.6528, "mean_token_accuracy": 0.6356299221515656, "num_tokens": 7025923.0, "step": 360 }, { "entropy": 1.7250152129679919, "epoch": 0.592, "grad_norm": 4.375, "learning_rate": 7.3800000000000005e-06, "loss": 1.6596, "mean_token_accuracy": 0.63132487796247, "num_tokens": 7218224.0, "step": 370 }, { "entropy": 1.7018816463649273, "epoch": 0.608, "grad_norm": 5.1875, "learning_rate": 7.58e-06, "loss": 1.664, "mean_token_accuracy": 0.6356819782406091, "num_tokens": 7421216.0, "step": 380 }, { "entropy": 1.7107378408312797, "epoch": 0.624, "grad_norm": 4.78125, "learning_rate": 7.78e-06, "loss": 1.6546, "mean_token_accuracy": 0.6328394934535027, "num_tokens": 7617258.0, "step": 390 }, { "entropy": 1.7047107808291913, "epoch": 0.64, "grad_norm": 4.65625, "learning_rate": 7.980000000000002e-06, "loss": 1.636, "mean_token_accuracy": 0.6354850973933935, "num_tokens": 7806173.0, "step": 400 }, { "epoch": 0.64, "eval_biology_entropy": 1.6343248481750487, "eval_biology_loss": 1.6292321681976318, "eval_biology_mean_token_accuracy": 0.6456177935600281, "eval_biology_num_tokens": 7806173.0, "eval_biology_runtime": 38.6524, "eval_biology_samples_per_second": 12.936, "eval_biology_steps_per_second": 3.234, "step": 400 }, { "epoch": 0.64, "eval_physics_entropy": 0.6857430481910706, "eval_physics_loss": 0.6523167490959167, "eval_physics_mean_token_accuracy": 0.8234750027656556, "eval_physics_num_tokens": 7806173.0, "eval_physics_runtime": 56.9086, "eval_physics_samples_per_second": 8.786, "eval_physics_steps_per_second": 2.197, "step": 400 }, { "entropy": 1.6953211184591055, "epoch": 0.656, "grad_norm": 4.8125, "learning_rate": 8.18e-06, "loss": 1.6492, "mean_token_accuracy": 0.635795733332634, "num_tokens": 8000514.0, "step": 410 }, { "entropy": 1.7117650181055069, "epoch": 0.672, "grad_norm": 4.90625, "learning_rate": 8.380000000000001e-06, "loss": 1.651, "mean_token_accuracy": 0.6328964576125145, "num_tokens": 8187884.0, "step": 420 }, { "entropy": 1.6630911942571402, "epoch": 0.688, "grad_norm": 5.0, "learning_rate": 8.580000000000001e-06, "loss": 1.6129, "mean_token_accuracy": 0.6399691045284271, "num_tokens": 8390643.0, "step": 430 }, { "entropy": 1.6603879615664483, "epoch": 0.704, "grad_norm": 4.8125, "learning_rate": 8.78e-06, "loss": 1.6006, "mean_token_accuracy": 0.6405742786824703, "num_tokens": 8583266.0, "step": 440 }, { "entropy": 1.5886821828782558, "epoch": 0.72, "grad_norm": 4.53125, "learning_rate": 8.98e-06, "loss": 1.5254, "mean_token_accuracy": 0.6530824847519398, "num_tokens": 8786233.0, "step": 450 }, { "entropy": 1.6262568928301335, "epoch": 0.736, "grad_norm": 5.0625, "learning_rate": 9.180000000000002e-06, "loss": 1.5597, "mean_token_accuracy": 0.6445462703704834, "num_tokens": 8982898.0, "step": 460 }, { "entropy": 1.6295693822205066, "epoch": 0.752, "grad_norm": 4.90625, "learning_rate": 9.38e-06, "loss": 1.5648, "mean_token_accuracy": 0.6450414888560771, "num_tokens": 9174361.0, "step": 470 }, { "entropy": 1.5759615428745746, "epoch": 0.768, "grad_norm": 4.4375, "learning_rate": 9.58e-06, "loss": 1.5144, "mean_token_accuracy": 0.653152122348547, "num_tokens": 9373430.0, "step": 480 }, { "entropy": 1.6083305418491363, "epoch": 0.784, "grad_norm": 4.40625, "learning_rate": 9.780000000000001e-06, "loss": 1.5488, "mean_token_accuracy": 0.6459381774067878, "num_tokens": 9570409.0, "step": 490 }, { "entropy": 1.6188334189355373, "epoch": 0.8, "grad_norm": 4.78125, "learning_rate": 9.980000000000001e-06, "loss": 1.5541, "mean_token_accuracy": 0.6456099949777127, "num_tokens": 9766360.0, "step": 500 }, { "epoch": 0.8, "eval_biology_entropy": 1.570951623916626, "eval_biology_loss": 1.5443849563598633, "eval_biology_mean_token_accuracy": 0.6565022644996643, "eval_biology_num_tokens": 9766360.0, "eval_biology_runtime": 38.6538, "eval_biology_samples_per_second": 12.935, "eval_biology_steps_per_second": 3.234, "step": 500 }, { "epoch": 0.8, "eval_physics_entropy": 0.6805566384792328, "eval_physics_loss": 0.6488668322563171, "eval_physics_mean_token_accuracy": 0.8232229099273681, "eval_physics_num_tokens": 9766360.0, "eval_physics_runtime": 56.9177, "eval_physics_samples_per_second": 8.785, "eval_physics_steps_per_second": 2.196, "step": 500 }, { "entropy": 1.5766926735639573, "epoch": 0.816, "grad_norm": 5.28125, "learning_rate": 1.018e-05, "loss": 1.5247, "mean_token_accuracy": 0.6521722257137299, "num_tokens": 9963834.0, "step": 510 }, { "entropy": 1.559133169800043, "epoch": 0.832, "grad_norm": 4.5, "learning_rate": 1.038e-05, "loss": 1.5075, "mean_token_accuracy": 0.6519753206521273, "num_tokens": 10160950.0, "step": 520 }, { "entropy": 1.5535101789981127, "epoch": 0.848, "grad_norm": 5.0, "learning_rate": 1.0580000000000002e-05, "loss": 1.496, "mean_token_accuracy": 0.6550566371530294, "num_tokens": 10363131.0, "step": 530 }, { "entropy": 1.573872973024845, "epoch": 0.864, "grad_norm": 4.375, "learning_rate": 1.0780000000000002e-05, "loss": 1.5151, "mean_token_accuracy": 0.6522478803992271, "num_tokens": 10558003.0, "step": 540 }, { "entropy": 1.5461064357310534, "epoch": 0.88, "grad_norm": 4.59375, "learning_rate": 1.0980000000000002e-05, "loss": 1.4907, "mean_token_accuracy": 0.6562161970883608, "num_tokens": 10754470.0, "step": 550 }, { "entropy": 1.5923856895416975, "epoch": 0.896, "grad_norm": 4.5, "learning_rate": 1.1180000000000001e-05, "loss": 1.5274, "mean_token_accuracy": 0.6506992731243372, "num_tokens": 10948943.0, "step": 560 }, { "entropy": 1.546384008973837, "epoch": 0.912, "grad_norm": 4.65625, "learning_rate": 1.138e-05, "loss": 1.4879, "mean_token_accuracy": 0.655264026299119, "num_tokens": 11142613.0, "step": 570 }, { "entropy": 1.5580323040485382, "epoch": 0.928, "grad_norm": 4.21875, "learning_rate": 1.1580000000000001e-05, "loss": 1.4914, "mean_token_accuracy": 0.6533704947680234, "num_tokens": 11336779.0, "step": 580 }, { "entropy": 1.4954375229775905, "epoch": 0.944, "grad_norm": 4.15625, "learning_rate": 1.178e-05, "loss": 1.4496, "mean_token_accuracy": 0.6618651241064072, "num_tokens": 11536381.0, "step": 590 }, { "entropy": 1.521157095208764, "epoch": 0.96, "grad_norm": 5.34375, "learning_rate": 1.198e-05, "loss": 1.4654, "mean_token_accuracy": 0.6575134672224522, "num_tokens": 11735527.0, "step": 600 }, { "epoch": 0.96, "eval_biology_entropy": 1.4938988003730773, "eval_biology_loss": 1.4810906648635864, "eval_biology_mean_token_accuracy": 0.6647453079223633, "eval_biology_num_tokens": 11735527.0, "eval_biology_runtime": 38.6461, "eval_biology_samples_per_second": 12.938, "eval_biology_steps_per_second": 3.234, "step": 600 }, { "epoch": 0.96, "eval_physics_entropy": 0.6846246910095215, "eval_physics_loss": 0.6483092308044434, "eval_physics_mean_token_accuracy": 0.8228992919921875, "eval_physics_num_tokens": 11735527.0, "eval_physics_runtime": 57.2247, "eval_physics_samples_per_second": 8.737, "eval_physics_steps_per_second": 2.184, "step": 600 }, { "entropy": 1.4897711608558892, "epoch": 0.976, "grad_norm": 4.34375, "learning_rate": 1.218e-05, "loss": 1.4364, "mean_token_accuracy": 0.6634432837367058, "num_tokens": 11930935.0, "step": 610 }, { "entropy": 1.5174701895564795, "epoch": 0.992, "grad_norm": 4.53125, "learning_rate": 1.2380000000000002e-05, "loss": 1.4657, "mean_token_accuracy": 0.6584253456443548, "num_tokens": 12129585.0, "step": 620 }, { "entropy": 1.4917617563158274, "epoch": 1.008, "grad_norm": 4.5, "learning_rate": 1.2580000000000002e-05, "loss": 1.4435, "mean_token_accuracy": 0.6625992625951767, "num_tokens": 12325847.0, "step": 630 }, { "entropy": 1.429138394445181, "epoch": 1.024, "grad_norm": 4.34375, "learning_rate": 1.2780000000000001e-05, "loss": 1.3693, "mean_token_accuracy": 0.6733787953853607, "num_tokens": 12530820.0, "step": 640 }, { "entropy": 1.4665595509111882, "epoch": 1.04, "grad_norm": 4.65625, "learning_rate": 1.2980000000000001e-05, "loss": 1.4151, "mean_token_accuracy": 0.6643557522445918, "num_tokens": 12725384.0, "step": 650 }, { "entropy": 1.4320495713502168, "epoch": 1.056, "grad_norm": 4.65625, "learning_rate": 1.3180000000000001e-05, "loss": 1.3639, "mean_token_accuracy": 0.6728409513831138, "num_tokens": 12924592.0, "step": 660 }, { "entropy": 1.456987388432026, "epoch": 1.072, "grad_norm": 4.65625, "learning_rate": 1.3380000000000002e-05, "loss": 1.3964, "mean_token_accuracy": 0.6672081802040338, "num_tokens": 13112848.0, "step": 670 }, { "entropy": 1.4462645407766104, "epoch": 1.088, "grad_norm": 4.21875, "learning_rate": 1.3580000000000002e-05, "loss": 1.3914, "mean_token_accuracy": 0.6701537113636732, "num_tokens": 13305725.0, "step": 680 }, { "entropy": 1.439976465329528, "epoch": 1.104, "grad_norm": 4.4375, "learning_rate": 1.378e-05, "loss": 1.3853, "mean_token_accuracy": 0.6701969560235739, "num_tokens": 13498685.0, "step": 690 }, { "entropy": 1.4542303152382374, "epoch": 1.12, "grad_norm": 4.6875, "learning_rate": 1.398e-05, "loss": 1.3835, "mean_token_accuracy": 0.6701514054089784, "num_tokens": 13682146.0, "step": 700 }, { "epoch": 1.12, "eval_biology_entropy": 1.3601415209770202, "eval_biology_loss": 1.4304583072662354, "eval_biology_mean_token_accuracy": 0.6715204911231994, "eval_biology_num_tokens": 13682146.0, "eval_biology_runtime": 38.6654, "eval_biology_samples_per_second": 12.931, "eval_biology_steps_per_second": 3.233, "step": 700 }, { "epoch": 1.12, "eval_physics_entropy": 0.65302716588974, "eval_physics_loss": 0.6474171876907349, "eval_physics_mean_token_accuracy": 0.8226029920578003, "eval_physics_num_tokens": 13682146.0, "eval_physics_runtime": 56.8929, "eval_physics_samples_per_second": 8.788, "eval_physics_steps_per_second": 2.197, "step": 700 }, { "entropy": 1.4230232119560242, "epoch": 1.1360000000000001, "grad_norm": 4.9375, "learning_rate": 1.418e-05, "loss": 1.3747, "mean_token_accuracy": 0.6713559620082379, "num_tokens": 13876656.0, "step": 710 }, { "entropy": 1.398179117962718, "epoch": 1.152, "grad_norm": 3.828125, "learning_rate": 1.4380000000000001e-05, "loss": 1.3438, "mean_token_accuracy": 0.675188298150897, "num_tokens": 14086104.0, "step": 720 }, { "entropy": 1.4270055878907442, "epoch": 1.168, "grad_norm": 4.375, "learning_rate": 1.4580000000000001e-05, "loss": 1.373, "mean_token_accuracy": 0.6712808165699243, "num_tokens": 14274520.0, "step": 730 }, { "entropy": 1.4195893313735723, "epoch": 1.184, "grad_norm": 4.0, "learning_rate": 1.478e-05, "loss": 1.3657, "mean_token_accuracy": 0.6732822258025408, "num_tokens": 14473623.0, "step": 740 }, { "entropy": 1.4137178607285024, "epoch": 1.2, "grad_norm": 4.4375, "learning_rate": 1.498e-05, "loss": 1.3527, "mean_token_accuracy": 0.6739591065794229, "num_tokens": 14661238.0, "step": 750 }, { "entropy": 1.3734467361122369, "epoch": 1.216, "grad_norm": 4.25, "learning_rate": 1.5180000000000002e-05, "loss": 1.3173, "mean_token_accuracy": 0.6782099220901727, "num_tokens": 14865915.0, "step": 760 }, { "entropy": 1.3874890733510257, "epoch": 1.232, "grad_norm": 3.953125, "learning_rate": 1.5380000000000002e-05, "loss": 1.3318, "mean_token_accuracy": 0.6802723795175553, "num_tokens": 15055682.0, "step": 770 }, { "entropy": 1.3982337843626738, "epoch": 1.248, "grad_norm": 4.28125, "learning_rate": 1.5580000000000003e-05, "loss": 1.3481, "mean_token_accuracy": 0.6750583201646805, "num_tokens": 15249576.0, "step": 780 }, { "entropy": 1.4128487702459096, "epoch": 1.264, "grad_norm": 3.78125, "learning_rate": 1.578e-05, "loss": 1.3488, "mean_token_accuracy": 0.6755638718605042, "num_tokens": 15446149.0, "step": 790 }, { "entropy": 1.4168333638459445, "epoch": 1.28, "grad_norm": 4.46875, "learning_rate": 1.5980000000000003e-05, "loss": 1.3691, "mean_token_accuracy": 0.6701620414853096, "num_tokens": 15639317.0, "step": 800 }, { "epoch": 1.28, "eval_biology_entropy": 1.3710014867782592, "eval_biology_loss": 1.3889533281326294, "eval_biology_mean_token_accuracy": 0.6776719264984131, "eval_biology_num_tokens": 15639317.0, "eval_biology_runtime": 38.6779, "eval_biology_samples_per_second": 12.927, "eval_biology_steps_per_second": 3.232, "step": 800 }, { "epoch": 1.28, "eval_physics_entropy": 0.6683794283866882, "eval_physics_loss": 0.6475260257720947, "eval_physics_mean_token_accuracy": 0.8223769798278808, "eval_physics_num_tokens": 15639317.0, "eval_physics_runtime": 56.9416, "eval_physics_samples_per_second": 8.781, "eval_physics_steps_per_second": 2.195, "step": 800 }, { "entropy": 1.415640415251255, "epoch": 1.296, "grad_norm": 3.890625, "learning_rate": 1.618e-05, "loss": 1.3559, "mean_token_accuracy": 0.6743101593106985, "num_tokens": 15835805.0, "step": 810 }, { "entropy": 1.4065573081374168, "epoch": 1.312, "grad_norm": 3.796875, "learning_rate": 1.638e-05, "loss": 1.3441, "mean_token_accuracy": 0.6742365792393684, "num_tokens": 16028375.0, "step": 820 }, { "entropy": 1.4003059070557355, "epoch": 1.328, "grad_norm": 3.9375, "learning_rate": 1.658e-05, "loss": 1.3416, "mean_token_accuracy": 0.6751531921327114, "num_tokens": 16230555.0, "step": 830 }, { "entropy": 1.3695629570633172, "epoch": 1.3439999999999999, "grad_norm": 3.46875, "learning_rate": 1.6780000000000002e-05, "loss": 1.323, "mean_token_accuracy": 0.6816294804215431, "num_tokens": 16436297.0, "step": 840 }, { "entropy": 1.3895002137869596, "epoch": 1.3599999999999999, "grad_norm": 4.15625, "learning_rate": 1.698e-05, "loss": 1.3223, "mean_token_accuracy": 0.678603322058916, "num_tokens": 16630563.0, "step": 850 }, { "entropy": 1.4141130354255438, "epoch": 1.376, "grad_norm": 4.03125, "learning_rate": 1.718e-05, "loss": 1.3514, "mean_token_accuracy": 0.6730918735265732, "num_tokens": 16822602.0, "step": 860 }, { "entropy": 1.3758564725518228, "epoch": 1.392, "grad_norm": 3.640625, "learning_rate": 1.7380000000000003e-05, "loss": 1.3266, "mean_token_accuracy": 0.6803319718688726, "num_tokens": 17015750.0, "step": 870 }, { "entropy": 1.3721271395683288, "epoch": 1.408, "grad_norm": 3.453125, "learning_rate": 1.758e-05, "loss": 1.3173, "mean_token_accuracy": 0.6792092099785805, "num_tokens": 17207248.0, "step": 880 }, { "entropy": 1.378094071522355, "epoch": 1.424, "grad_norm": 3.78125, "learning_rate": 1.7780000000000003e-05, "loss": 1.3327, "mean_token_accuracy": 0.6793139282613992, "num_tokens": 17403869.0, "step": 890 }, { "entropy": 1.3699288651347161, "epoch": 1.44, "grad_norm": 4.5, "learning_rate": 1.798e-05, "loss": 1.3147, "mean_token_accuracy": 0.6776431813836098, "num_tokens": 17597428.0, "step": 900 }, { "epoch": 1.44, "eval_biology_entropy": 1.3595392904281616, "eval_biology_loss": 1.3485888242721558, "eval_biology_mean_token_accuracy": 0.6834584150314331, "eval_biology_num_tokens": 17597428.0, "eval_biology_runtime": 38.682, "eval_biology_samples_per_second": 12.926, "eval_biology_steps_per_second": 3.231, "step": 900 }, { "epoch": 1.44, "eval_physics_entropy": 0.6735630776882172, "eval_physics_loss": 0.6497707366943359, "eval_physics_mean_token_accuracy": 0.8218984279632569, "eval_physics_num_tokens": 17597428.0, "eval_physics_runtime": 56.9341, "eval_physics_samples_per_second": 8.782, "eval_physics_steps_per_second": 2.196, "step": 900 }, { "entropy": 1.3743872378021478, "epoch": 1.456, "grad_norm": 3.65625, "learning_rate": 1.8180000000000002e-05, "loss": 1.3244, "mean_token_accuracy": 0.6799841322004795, "num_tokens": 17798083.0, "step": 910 }, { "entropy": 1.3584794465452432, "epoch": 1.472, "grad_norm": 3.65625, "learning_rate": 1.8380000000000004e-05, "loss": 1.2994, "mean_token_accuracy": 0.6824300821870566, "num_tokens": 17993802.0, "step": 920 }, { "entropy": 1.3503762260079384, "epoch": 1.488, "grad_norm": 3.796875, "learning_rate": 1.858e-05, "loss": 1.3014, "mean_token_accuracy": 0.6844853088259697, "num_tokens": 18185570.0, "step": 930 }, { "entropy": 1.3729203592985868, "epoch": 1.504, "grad_norm": 3.46875, "learning_rate": 1.878e-05, "loss": 1.3216, "mean_token_accuracy": 0.6780106417834759, "num_tokens": 18377903.0, "step": 940 }, { "entropy": 1.3659293711185456, "epoch": 1.52, "grad_norm": 3.46875, "learning_rate": 1.898e-05, "loss": 1.318, "mean_token_accuracy": 0.6781526006758213, "num_tokens": 18579284.0, "step": 950 }, { "entropy": 1.366529828310013, "epoch": 1.536, "grad_norm": 4.5, "learning_rate": 1.918e-05, "loss": 1.3038, "mean_token_accuracy": 0.6802028331905603, "num_tokens": 18765324.0, "step": 960 }, { "entropy": 1.3478929094970227, "epoch": 1.552, "grad_norm": 3.9375, "learning_rate": 1.938e-05, "loss": 1.2881, "mean_token_accuracy": 0.68250414095819, "num_tokens": 18966382.0, "step": 970 }, { "entropy": 1.339873855561018, "epoch": 1.568, "grad_norm": 3.65625, "learning_rate": 1.9580000000000002e-05, "loss": 1.2915, "mean_token_accuracy": 0.6844375465065241, "num_tokens": 19160509.0, "step": 980 }, { "entropy": 1.3251274175941945, "epoch": 1.584, "grad_norm": 3.34375, "learning_rate": 1.978e-05, "loss": 1.2752, "mean_token_accuracy": 0.6852322082966567, "num_tokens": 19354662.0, "step": 990 }, { "entropy": 1.384224559739232, "epoch": 1.6, "grad_norm": 3.484375, "learning_rate": 1.9980000000000002e-05, "loss": 1.3287, "mean_token_accuracy": 0.6757747534662485, "num_tokens": 19543088.0, "step": 1000 }, { "epoch": 1.6, "eval_biology_entropy": 1.337540853023529, "eval_biology_loss": 1.3165961503982544, "eval_biology_mean_token_accuracy": 0.6879782662391662, "eval_biology_num_tokens": 19543088.0, "eval_biology_runtime": 38.6581, "eval_biology_samples_per_second": 12.934, "eval_biology_steps_per_second": 3.233, "step": 1000 }, { "epoch": 1.6, "eval_physics_entropy": 0.6811050243377685, "eval_physics_loss": 0.6504300832748413, "eval_physics_mean_token_accuracy": 0.8209615325927735, "eval_physics_num_tokens": 19543088.0, "eval_physics_runtime": 56.8501, "eval_physics_samples_per_second": 8.795, "eval_physics_steps_per_second": 2.199, "step": 1000 }, { "entropy": 1.3300264578312635, "epoch": 1.616, "grad_norm": 3.3125, "learning_rate": 1.9980000000000002e-05, "loss": 1.2769, "mean_token_accuracy": 0.685854634270072, "num_tokens": 19743506.0, "step": 1010 }, { "entropy": 1.3689603984355927, "epoch": 1.6320000000000001, "grad_norm": 3.609375, "learning_rate": 1.995777777777778e-05, "loss": 1.3153, "mean_token_accuracy": 0.6796921446919442, "num_tokens": 19937567.0, "step": 1020 }, { "entropy": 1.3543198898434639, "epoch": 1.6480000000000001, "grad_norm": 3.109375, "learning_rate": 1.9935555555555557e-05, "loss": 1.2901, "mean_token_accuracy": 0.6821238681674003, "num_tokens": 20129743.0, "step": 1030 }, { "entropy": 1.3178342673927546, "epoch": 1.6640000000000001, "grad_norm": 3.640625, "learning_rate": 1.9913333333333335e-05, "loss": 1.2644, "mean_token_accuracy": 0.6889090698212386, "num_tokens": 20331548.0, "step": 1040 }, { "entropy": 1.3326357528567314, "epoch": 1.6800000000000002, "grad_norm": 3.71875, "learning_rate": 1.9891111111111112e-05, "loss": 1.2834, "mean_token_accuracy": 0.6844820819795132, "num_tokens": 20524247.0, "step": 1050 }, { "entropy": 1.327241437882185, "epoch": 1.696, "grad_norm": 3.078125, "learning_rate": 1.986888888888889e-05, "loss": 1.2708, "mean_token_accuracy": 0.6858115192502737, "num_tokens": 20718121.0, "step": 1060 }, { "entropy": 1.3415468953549863, "epoch": 1.712, "grad_norm": 3.40625, "learning_rate": 1.9846666666666668e-05, "loss": 1.2797, "mean_token_accuracy": 0.684544776007533, "num_tokens": 20921419.0, "step": 1070 }, { "entropy": 1.320882337912917, "epoch": 1.728, "grad_norm": 3.859375, "learning_rate": 1.9824444444444445e-05, "loss": 1.267, "mean_token_accuracy": 0.6839414596557617, "num_tokens": 21118581.0, "step": 1080 }, { "entropy": 1.2898854076862336, "epoch": 1.744, "grad_norm": 3.5625, "learning_rate": 1.9802222222222226e-05, "loss": 1.2375, "mean_token_accuracy": 0.6910523246973753, "num_tokens": 21315128.0, "step": 1090 }, { "entropy": 1.31239253282547, "epoch": 1.76, "grad_norm": 3.265625, "learning_rate": 1.978e-05, "loss": 1.2603, "mean_token_accuracy": 0.6899993035942316, "num_tokens": 21511066.0, "step": 1100 }, { "epoch": 1.76, "eval_biology_entropy": 1.2963400874137878, "eval_biology_loss": 1.2855476140975952, "eval_biology_mean_token_accuracy": 0.6923068904876709, "eval_biology_num_tokens": 21511066.0, "eval_biology_runtime": 38.6761, "eval_biology_samples_per_second": 12.928, "eval_biology_steps_per_second": 3.232, "step": 1100 }, { "epoch": 1.76, "eval_physics_entropy": 0.6632856893539428, "eval_physics_loss": 0.6495886445045471, "eval_physics_mean_token_accuracy": 0.8213236899375915, "eval_physics_num_tokens": 21511066.0, "eval_physics_runtime": 56.9409, "eval_physics_samples_per_second": 8.781, "eval_physics_steps_per_second": 2.195, "step": 1100 }, { "entropy": 1.3157803509384394, "epoch": 1.776, "grad_norm": 3.59375, "learning_rate": 1.975777777777778e-05, "loss": 1.2599, "mean_token_accuracy": 0.6879700906574726, "num_tokens": 21704368.0, "step": 1110 }, { "entropy": 1.2888959780335427, "epoch": 1.792, "grad_norm": 3.3125, "learning_rate": 1.9735555555555556e-05, "loss": 1.2331, "mean_token_accuracy": 0.6911204513162374, "num_tokens": 21905784.0, "step": 1120 }, { "entropy": 1.3017573833465577, "epoch": 1.808, "grad_norm": 3.34375, "learning_rate": 1.9713333333333337e-05, "loss": 1.2521, "mean_token_accuracy": 0.6918430212885142, "num_tokens": 22094159.0, "step": 1130 }, { "entropy": 1.329062307998538, "epoch": 1.8239999999999998, "grad_norm": 3.078125, "learning_rate": 1.969111111111111e-05, "loss": 1.2592, "mean_token_accuracy": 0.684392886608839, "num_tokens": 22290654.0, "step": 1140 }, { "entropy": 1.339531321078539, "epoch": 1.8399999999999999, "grad_norm": 3.21875, "learning_rate": 1.9668888888888892e-05, "loss": 1.2846, "mean_token_accuracy": 0.6835382975637913, "num_tokens": 22485675.0, "step": 1150 }, { "entropy": 1.295450346916914, "epoch": 1.8559999999999999, "grad_norm": 3.3125, "learning_rate": 1.9646666666666666e-05, "loss": 1.2508, "mean_token_accuracy": 0.6886295765638352, "num_tokens": 22689900.0, "step": 1160 }, { "entropy": 1.2797737825661897, "epoch": 1.8719999999999999, "grad_norm": 2.90625, "learning_rate": 1.9624444444444447e-05, "loss": 1.2204, "mean_token_accuracy": 0.6948459330946207, "num_tokens": 22887275.0, "step": 1170 }, { "entropy": 1.3289342291653157, "epoch": 1.888, "grad_norm": 3.390625, "learning_rate": 1.9602222222222225e-05, "loss": 1.2744, "mean_token_accuracy": 0.6839813269674778, "num_tokens": 23081241.0, "step": 1180 }, { "entropy": 1.2998227212578057, "epoch": 1.904, "grad_norm": 3.3125, "learning_rate": 1.9580000000000002e-05, "loss": 1.2522, "mean_token_accuracy": 0.6898551240563393, "num_tokens": 23276417.0, "step": 1190 }, { "entropy": 1.300378565862775, "epoch": 1.92, "grad_norm": 3.390625, "learning_rate": 1.955777777777778e-05, "loss": 1.2471, "mean_token_accuracy": 0.6882933590561151, "num_tokens": 23476340.0, "step": 1200 }, { "epoch": 1.92, "eval_biology_entropy": 1.2630186223983764, "eval_biology_loss": 1.2595529556274414, "eval_biology_mean_token_accuracy": 0.6963475484848023, "eval_biology_num_tokens": 23476340.0, "eval_biology_runtime": 38.6878, "eval_biology_samples_per_second": 12.924, "eval_biology_steps_per_second": 3.231, "step": 1200 }, { "epoch": 1.92, "eval_physics_entropy": 0.6685261228084565, "eval_physics_loss": 0.6503589749336243, "eval_physics_mean_token_accuracy": 0.8208199772834778, "eval_physics_num_tokens": 23476340.0, "eval_physics_runtime": 56.9337, "eval_physics_samples_per_second": 8.782, "eval_physics_steps_per_second": 2.196, "step": 1200 }, { "entropy": 1.2681011691689492, "epoch": 1.936, "grad_norm": 3.171875, "learning_rate": 1.9535555555555557e-05, "loss": 1.2121, "mean_token_accuracy": 0.6959899343550205, "num_tokens": 23673208.0, "step": 1210 }, { "entropy": 1.29563331566751, "epoch": 1.952, "grad_norm": 3.109375, "learning_rate": 1.9513333333333335e-05, "loss": 1.2335, "mean_token_accuracy": 0.6913977723568678, "num_tokens": 23870993.0, "step": 1220 }, { "entropy": 1.315883708000183, "epoch": 1.968, "grad_norm": 3.109375, "learning_rate": 1.9491111111111113e-05, "loss": 1.269, "mean_token_accuracy": 0.6871387537568807, "num_tokens": 24066234.0, "step": 1230 }, { "entropy": 1.2918797850608825, "epoch": 1.984, "grad_norm": 3.28125, "learning_rate": 1.946888888888889e-05, "loss": 1.2409, "mean_token_accuracy": 0.690430635213852, "num_tokens": 24262389.0, "step": 1240 }, { "entropy": 1.2895139936357736, "epoch": 2.0, "grad_norm": 3.171875, "learning_rate": 1.9446666666666668e-05, "loss": 1.2465, "mean_token_accuracy": 0.6922377996146679, "num_tokens": 24455544.0, "step": 1250 }, { "entropy": 1.1053394179791212, "epoch": 2.016, "grad_norm": 4.0625, "learning_rate": 1.9424444444444446e-05, "loss": 1.0366, "mean_token_accuracy": 0.7227501507848502, "num_tokens": 24645425.0, "step": 1260 }, { "entropy": 1.0759795285761355, "epoch": 2.032, "grad_norm": 3.65625, "learning_rate": 1.9402222222222223e-05, "loss": 1.0324, "mean_token_accuracy": 0.7241251844912767, "num_tokens": 24835956.0, "step": 1270 }, { "entropy": 1.065844392031431, "epoch": 2.048, "grad_norm": 3.59375, "learning_rate": 1.938e-05, "loss": 0.9998, "mean_token_accuracy": 0.7277645654976368, "num_tokens": 25036470.0, "step": 1280 }, { "entropy": 1.0796132825315, "epoch": 2.064, "grad_norm": 3.4375, "learning_rate": 1.935777777777778e-05, "loss": 1.0147, "mean_token_accuracy": 0.7255880139768124, "num_tokens": 25230959.0, "step": 1290 }, { "entropy": 1.0651828214526176, "epoch": 2.08, "grad_norm": 3.25, "learning_rate": 1.9335555555555556e-05, "loss": 1.0108, "mean_token_accuracy": 0.727525070682168, "num_tokens": 25422625.0, "step": 1300 }, { "epoch": 2.08, "eval_biology_entropy": 1.0677382831573485, "eval_biology_loss": 1.290307641029358, "eval_biology_mean_token_accuracy": 0.6951840238571168, "eval_biology_num_tokens": 25422625.0, "eval_biology_runtime": 38.6809, "eval_biology_samples_per_second": 12.926, "eval_biology_steps_per_second": 3.232, "step": 1300 }, { "epoch": 2.08, "eval_physics_entropy": 0.5783973336219788, "eval_physics_loss": 0.6678071022033691, "eval_physics_mean_token_accuracy": 0.8184197754859924, "eval_physics_num_tokens": 25422625.0, "eval_physics_runtime": 56.9183, "eval_physics_samples_per_second": 8.785, "eval_physics_steps_per_second": 2.196, "step": 1300 }, { "entropy": 1.0735113076865672, "epoch": 2.096, "grad_norm": 3.484375, "learning_rate": 1.9313333333333334e-05, "loss": 1.0185, "mean_token_accuracy": 0.7264322709292174, "num_tokens": 25614286.0, "step": 1310 }, { "entropy": 1.0468817871063947, "epoch": 2.112, "grad_norm": 3.015625, "learning_rate": 1.9291111111111115e-05, "loss": 0.9933, "mean_token_accuracy": 0.7301209557801485, "num_tokens": 25810384.0, "step": 1320 }, { "entropy": 1.057022250816226, "epoch": 2.128, "grad_norm": 3.5, "learning_rate": 1.926888888888889e-05, "loss": 1.0037, "mean_token_accuracy": 0.7284754924476147, "num_tokens": 26005047.0, "step": 1330 }, { "entropy": 1.0537616554647684, "epoch": 2.144, "grad_norm": 4.09375, "learning_rate": 1.924666666666667e-05, "loss": 1.0101, "mean_token_accuracy": 0.7281042989343405, "num_tokens": 26207169.0, "step": 1340 }, { "entropy": 1.0744788832962513, "epoch": 2.16, "grad_norm": 3.546875, "learning_rate": 1.9224444444444444e-05, "loss": 1.0203, "mean_token_accuracy": 0.7265679869800806, "num_tokens": 26402150.0, "step": 1350 }, { "entropy": 1.0806033357977867, "epoch": 2.176, "grad_norm": 3.546875, "learning_rate": 1.9202222222222225e-05, "loss": 1.0319, "mean_token_accuracy": 0.7219661567360163, "num_tokens": 26599464.0, "step": 1360 }, { "entropy": 1.0509729556739331, "epoch": 2.192, "grad_norm": 3.109375, "learning_rate": 1.918e-05, "loss": 0.9872, "mean_token_accuracy": 0.7306252598762513, "num_tokens": 26790601.0, "step": 1370 }, { "entropy": 1.068820282816887, "epoch": 2.208, "grad_norm": 3.390625, "learning_rate": 1.915777777777778e-05, "loss": 1.0224, "mean_token_accuracy": 0.7255501691251993, "num_tokens": 26988598.0, "step": 1380 }, { "entropy": 1.0967998839914799, "epoch": 2.224, "grad_norm": 3.34375, "learning_rate": 1.9135555555555555e-05, "loss": 1.0393, "mean_token_accuracy": 0.7217595972120762, "num_tokens": 27181933.0, "step": 1390 }, { "entropy": 1.0403927110135556, "epoch": 2.24, "grad_norm": 3.28125, "learning_rate": 1.9113333333333336e-05, "loss": 0.982, "mean_token_accuracy": 0.7317505519837141, "num_tokens": 27378686.0, "step": 1400 }, { "epoch": 2.24, "eval_biology_entropy": 1.053062720298767, "eval_biology_loss": 1.2769854068756104, "eval_biology_mean_token_accuracy": 0.6977940616607666, "eval_biology_num_tokens": 27378686.0, "eval_biology_runtime": 38.6188, "eval_biology_samples_per_second": 12.947, "eval_biology_steps_per_second": 3.237, "step": 1400 }, { "epoch": 2.24, "eval_physics_entropy": 0.5554806351661682, "eval_physics_loss": 0.6756051182746887, "eval_physics_mean_token_accuracy": 0.8174587740898133, "eval_physics_num_tokens": 27378686.0, "eval_physics_runtime": 56.8627, "eval_physics_samples_per_second": 8.793, "eval_physics_steps_per_second": 2.198, "step": 1400 }, { "entropy": 1.0398972254246472, "epoch": 2.2560000000000002, "grad_norm": 3.296875, "learning_rate": 1.9091111111111113e-05, "loss": 0.9991, "mean_token_accuracy": 0.7316640116274357, "num_tokens": 27571908.0, "step": 1410 }, { "entropy": 1.0447341743856668, "epoch": 2.2720000000000002, "grad_norm": 3.28125, "learning_rate": 1.906888888888889e-05, "loss": 0.9972, "mean_token_accuracy": 0.728259452059865, "num_tokens": 27770478.0, "step": 1420 }, { "entropy": 1.0823504395782948, "epoch": 2.288, "grad_norm": 3.453125, "learning_rate": 1.904666666666667e-05, "loss": 1.0252, "mean_token_accuracy": 0.7220408860594034, "num_tokens": 27965558.0, "step": 1430 }, { "entropy": 1.0753419052809476, "epoch": 2.304, "grad_norm": 3.171875, "learning_rate": 1.9024444444444446e-05, "loss": 1.0203, "mean_token_accuracy": 0.7257058396935463, "num_tokens": 28165624.0, "step": 1440 }, { "entropy": 1.0463432766497136, "epoch": 2.32, "grad_norm": 3.234375, "learning_rate": 1.9002222222222224e-05, "loss": 0.9989, "mean_token_accuracy": 0.731269496306777, "num_tokens": 28363594.0, "step": 1450 }, { "entropy": 1.0562681667506695, "epoch": 2.336, "grad_norm": 3.34375, "learning_rate": 1.898e-05, "loss": 1.0089, "mean_token_accuracy": 0.7288151986896991, "num_tokens": 28558104.0, "step": 1460 }, { "entropy": 1.07792302146554, "epoch": 2.352, "grad_norm": 3.296875, "learning_rate": 1.895777777777778e-05, "loss": 1.0219, "mean_token_accuracy": 0.7233119513839483, "num_tokens": 28747265.0, "step": 1470 }, { "entropy": 1.06618184261024, "epoch": 2.368, "grad_norm": 3.171875, "learning_rate": 1.8935555555555556e-05, "loss": 1.0027, "mean_token_accuracy": 0.7261252999305725, "num_tokens": 28944694.0, "step": 1480 }, { "entropy": 1.082535107806325, "epoch": 2.384, "grad_norm": 3.546875, "learning_rate": 1.8913333333333334e-05, "loss": 1.0256, "mean_token_accuracy": 0.7226470891386271, "num_tokens": 29136617.0, "step": 1490 }, { "entropy": 1.0529342222958804, "epoch": 2.4, "grad_norm": 3.28125, "learning_rate": 1.8891111111111115e-05, "loss": 0.9972, "mean_token_accuracy": 0.7272268772125244, "num_tokens": 29333526.0, "step": 1500 }, { "epoch": 2.4, "eval_biology_entropy": 1.046908447265625, "eval_biology_loss": 1.2635581493377686, "eval_biology_mean_token_accuracy": 0.7004145407676696, "eval_biology_num_tokens": 29333526.0, "eval_biology_runtime": 38.7807, "eval_biology_samples_per_second": 12.893, "eval_biology_steps_per_second": 3.223, "step": 1500 }, { "epoch": 2.4, "eval_physics_entropy": 0.5629296214580536, "eval_physics_loss": 0.6746402978897095, "eval_physics_mean_token_accuracy": 0.8170881199836731, "eval_physics_num_tokens": 29333526.0, "eval_physics_runtime": 56.9496, "eval_physics_samples_per_second": 8.78, "eval_physics_steps_per_second": 2.195, "step": 1500 }, { "entropy": 1.0541506320238114, "epoch": 2.416, "grad_norm": 3.65625, "learning_rate": 1.886888888888889e-05, "loss": 1.0065, "mean_token_accuracy": 0.7273991428315639, "num_tokens": 29534153.0, "step": 1510 }, { "entropy": 1.053679431974888, "epoch": 2.432, "grad_norm": 3.328125, "learning_rate": 1.884666666666667e-05, "loss": 0.9968, "mean_token_accuracy": 0.7284378662705422, "num_tokens": 29736571.0, "step": 1520 }, { "entropy": 1.0887313183397054, "epoch": 2.448, "grad_norm": 3.8125, "learning_rate": 1.8824444444444445e-05, "loss": 1.0429, "mean_token_accuracy": 0.7227712966501713, "num_tokens": 29932423.0, "step": 1530 }, { "entropy": 1.0347874112427236, "epoch": 2.464, "grad_norm": 3.640625, "learning_rate": 1.8802222222222226e-05, "loss": 0.9796, "mean_token_accuracy": 0.7323532313108444, "num_tokens": 30137125.0, "step": 1540 }, { "entropy": 1.048006160184741, "epoch": 2.48, "grad_norm": 3.390625, "learning_rate": 1.878e-05, "loss": 1.001, "mean_token_accuracy": 0.7286384627223015, "num_tokens": 30341656.0, "step": 1550 }, { "entropy": 1.052099209651351, "epoch": 2.496, "grad_norm": 3.140625, "learning_rate": 1.875777777777778e-05, "loss": 0.9888, "mean_token_accuracy": 0.7288977019488811, "num_tokens": 30533732.0, "step": 1560 }, { "entropy": 1.0728581339120864, "epoch": 2.512, "grad_norm": 3.96875, "learning_rate": 1.873555555555556e-05, "loss": 1.0181, "mean_token_accuracy": 0.7265560120344162, "num_tokens": 30730599.0, "step": 1570 }, { "entropy": 1.0553664829581977, "epoch": 2.528, "grad_norm": 3.640625, "learning_rate": 1.8713333333333336e-05, "loss": 0.9993, "mean_token_accuracy": 0.7286464177072048, "num_tokens": 30927950.0, "step": 1580 }, { "entropy": 1.0592417079955339, "epoch": 2.544, "grad_norm": 3.46875, "learning_rate": 1.8691111111111114e-05, "loss": 1.0135, "mean_token_accuracy": 0.7272635698318481, "num_tokens": 31125667.0, "step": 1590 }, { "entropy": 1.0688998382538557, "epoch": 2.56, "grad_norm": 3.578125, "learning_rate": 1.866888888888889e-05, "loss": 1.0172, "mean_token_accuracy": 0.7267541728913784, "num_tokens": 31315742.0, "step": 1600 }, { "epoch": 2.56, "eval_biology_entropy": 1.0306266536712647, "eval_biology_loss": 1.2540794610977173, "eval_biology_mean_token_accuracy": 0.7014414758682251, "eval_biology_num_tokens": 31315742.0, "eval_biology_runtime": 38.994, "eval_biology_samples_per_second": 12.822, "eval_biology_steps_per_second": 3.206, "step": 1600 }, { "epoch": 2.56, "eval_physics_entropy": 0.5480240859985351, "eval_physics_loss": 0.6785013675689697, "eval_physics_mean_token_accuracy": 0.81651726770401, "eval_physics_num_tokens": 31315742.0, "eval_physics_runtime": 56.9063, "eval_physics_samples_per_second": 8.786, "eval_physics_steps_per_second": 2.197, "step": 1600 }, { "entropy": 1.072851437330246, "epoch": 2.576, "grad_norm": 3.3125, "learning_rate": 1.864666666666667e-05, "loss": 1.0177, "mean_token_accuracy": 0.7259860239923001, "num_tokens": 31505374.0, "step": 1610 }, { "entropy": 1.0911020763218402, "epoch": 2.592, "grad_norm": 3.375, "learning_rate": 1.8624444444444446e-05, "loss": 1.0375, "mean_token_accuracy": 0.7193623151630163, "num_tokens": 31701680.0, "step": 1620 }, { "entropy": 1.0759818445891143, "epoch": 2.608, "grad_norm": 3.265625, "learning_rate": 1.8602222222222224e-05, "loss": 1.024, "mean_token_accuracy": 0.7245077144354581, "num_tokens": 31893221.0, "step": 1630 }, { "entropy": 1.0434412751346827, "epoch": 2.624, "grad_norm": 3.453125, "learning_rate": 1.858e-05, "loss": 0.9884, "mean_token_accuracy": 0.7292033292353153, "num_tokens": 32087176.0, "step": 1640 }, { "entropy": 1.077140349522233, "epoch": 2.64, "grad_norm": 3.515625, "learning_rate": 1.855777777777778e-05, "loss": 1.0218, "mean_token_accuracy": 0.7251086611300707, "num_tokens": 32280689.0, "step": 1650 }, { "entropy": 1.0449730832129718, "epoch": 2.656, "grad_norm": 3.234375, "learning_rate": 1.8535555555555557e-05, "loss": 0.9897, "mean_token_accuracy": 0.7329251658171415, "num_tokens": 32483392.0, "step": 1660 }, { "entropy": 1.0476958800107241, "epoch": 2.672, "grad_norm": 3.328125, "learning_rate": 1.8513333333333335e-05, "loss": 0.9996, "mean_token_accuracy": 0.7290864728391171, "num_tokens": 32683246.0, "step": 1670 }, { "entropy": 1.0343516409397124, "epoch": 2.6879999999999997, "grad_norm": 3.4375, "learning_rate": 1.8491111111111112e-05, "loss": 0.9794, "mean_token_accuracy": 0.7327864274382592, "num_tokens": 32884993.0, "step": 1680 }, { "entropy": 1.059150441735983, "epoch": 2.7039999999999997, "grad_norm": 3.25, "learning_rate": 1.846888888888889e-05, "loss": 1.004, "mean_token_accuracy": 0.7282994613051414, "num_tokens": 33088388.0, "step": 1690 }, { "entropy": 1.0319584112614393, "epoch": 2.7199999999999998, "grad_norm": 3.53125, "learning_rate": 1.8446666666666667e-05, "loss": 0.9858, "mean_token_accuracy": 0.7318238351494074, "num_tokens": 33287208.0, "step": 1700 }, { "epoch": 2.7199999999999998, "eval_biology_entropy": 1.0519089736938476, "eval_biology_loss": 1.2423198223114014, "eval_biology_mean_token_accuracy": 0.7037161078453064, "eval_biology_num_tokens": 33287208.0, "eval_biology_runtime": 38.8171, "eval_biology_samples_per_second": 12.881, "eval_biology_steps_per_second": 3.22, "step": 1700 }, { "epoch": 2.7199999999999998, "eval_physics_entropy": 0.5594763560295105, "eval_physics_loss": 0.677318811416626, "eval_physics_mean_token_accuracy": 0.8168268208503723, "eval_physics_num_tokens": 33287208.0, "eval_physics_runtime": 57.185, "eval_physics_samples_per_second": 8.744, "eval_physics_steps_per_second": 2.186, "step": 1700 }, { "entropy": 1.0736609652638436, "epoch": 2.7359999999999998, "grad_norm": 3.15625, "learning_rate": 1.842444444444445e-05, "loss": 1.017, "mean_token_accuracy": 0.7253531437367201, "num_tokens": 33474750.0, "step": 1710 }, { "entropy": 1.0549831412732602, "epoch": 2.752, "grad_norm": 3.625, "learning_rate": 1.8402222222222223e-05, "loss": 1.0054, "mean_token_accuracy": 0.7258293054997921, "num_tokens": 33664302.0, "step": 1720 }, { "entropy": 1.0876980781555177, "epoch": 2.768, "grad_norm": 3.65625, "learning_rate": 1.8380000000000004e-05, "loss": 1.0317, "mean_token_accuracy": 0.7200849443674088, "num_tokens": 33854081.0, "step": 1730 }, { "entropy": 1.046105058863759, "epoch": 2.784, "grad_norm": 3.09375, "learning_rate": 1.8357777777777778e-05, "loss": 0.9956, "mean_token_accuracy": 0.7304057106375694, "num_tokens": 34053149.0, "step": 1740 }, { "entropy": 1.0452948212623596, "epoch": 2.8, "grad_norm": 3.40625, "learning_rate": 1.833555555555556e-05, "loss": 0.9964, "mean_token_accuracy": 0.7318151947110891, "num_tokens": 34254105.0, "step": 1750 }, { "entropy": 1.0454461514949798, "epoch": 2.816, "grad_norm": 3.1875, "learning_rate": 1.8313333333333333e-05, "loss": 0.9864, "mean_token_accuracy": 0.7298544138669968, "num_tokens": 34453342.0, "step": 1760 }, { "entropy": 1.0891260366886855, "epoch": 2.832, "grad_norm": 3.515625, "learning_rate": 1.8291111111111114e-05, "loss": 1.0274, "mean_token_accuracy": 0.7241302687674761, "num_tokens": 34644113.0, "step": 1770 }, { "entropy": 1.0674222145229577, "epoch": 2.848, "grad_norm": 3.4375, "learning_rate": 1.8268888888888888e-05, "loss": 1.0191, "mean_token_accuracy": 0.7263091344386339, "num_tokens": 34835062.0, "step": 1780 }, { "entropy": 1.047906568646431, "epoch": 2.864, "grad_norm": 3.234375, "learning_rate": 1.824666666666667e-05, "loss": 1.0015, "mean_token_accuracy": 0.7286620266735554, "num_tokens": 35032361.0, "step": 1790 }, { "entropy": 1.0795419167727232, "epoch": 2.88, "grad_norm": 3.421875, "learning_rate": 1.8224444444444447e-05, "loss": 1.0231, "mean_token_accuracy": 0.7250057961791754, "num_tokens": 35226653.0, "step": 1800 }, { "epoch": 2.88, "eval_biology_entropy": 1.0592120084762573, "eval_biology_loss": 1.2307487726211548, "eval_biology_mean_token_accuracy": 0.7039423842430115, "eval_biology_num_tokens": 35226653.0, "eval_biology_runtime": 38.811, "eval_biology_samples_per_second": 12.883, "eval_biology_steps_per_second": 3.221, "step": 1800 }, { "epoch": 2.88, "eval_physics_entropy": 0.5607087211608887, "eval_physics_loss": 0.6777584552764893, "eval_physics_mean_token_accuracy": 0.816848436832428, "eval_physics_num_tokens": 35226653.0, "eval_physics_runtime": 56.9878, "eval_physics_samples_per_second": 8.774, "eval_physics_steps_per_second": 2.193, "step": 1800 }, { "entropy": 1.0618976611644029, "epoch": 2.896, "grad_norm": 3.3125, "learning_rate": 1.8202222222222225e-05, "loss": 1.0068, "mean_token_accuracy": 0.7253085561096668, "num_tokens": 35421873.0, "step": 1810 }, { "entropy": 1.0810394674539565, "epoch": 2.912, "grad_norm": 3.6875, "learning_rate": 1.8180000000000002e-05, "loss": 1.0287, "mean_token_accuracy": 0.7242106914520263, "num_tokens": 35614590.0, "step": 1820 }, { "entropy": 1.0641514841467141, "epoch": 2.928, "grad_norm": 3.578125, "learning_rate": 1.815777777777778e-05, "loss": 1.0163, "mean_token_accuracy": 0.7251328159123659, "num_tokens": 35806494.0, "step": 1830 }, { "entropy": 1.0547591600567103, "epoch": 2.944, "grad_norm": 3.1875, "learning_rate": 1.8135555555555557e-05, "loss": 1.0087, "mean_token_accuracy": 0.7291445326060056, "num_tokens": 36001867.0, "step": 1840 }, { "entropy": 1.0459509972482919, "epoch": 2.96, "grad_norm": 3.484375, "learning_rate": 1.8113333333333335e-05, "loss": 0.9846, "mean_token_accuracy": 0.7305242072790861, "num_tokens": 36198806.0, "step": 1850 }, { "entropy": 1.0438212916254996, "epoch": 2.976, "grad_norm": 3.296875, "learning_rate": 1.8091111111111113e-05, "loss": 0.9897, "mean_token_accuracy": 0.7307346038520336, "num_tokens": 36393098.0, "step": 1860 }, { "entropy": 1.0572807922959329, "epoch": 2.992, "grad_norm": 3.359375, "learning_rate": 1.806888888888889e-05, "loss": 1.005, "mean_token_accuracy": 0.728681518137455, "num_tokens": 36589074.0, "step": 1870 }, { "entropy": 0.9687618412077427, "epoch": 3.008, "grad_norm": 4.75, "learning_rate": 1.8046666666666668e-05, "loss": 0.8593, "mean_token_accuracy": 0.7585306584835052, "num_tokens": 36778710.0, "step": 1880 }, { "entropy": 0.7038324676454067, "epoch": 3.024, "grad_norm": 3.671875, "learning_rate": 1.8024444444444445e-05, "loss": 0.666, "mean_token_accuracy": 0.79880424849689, "num_tokens": 36966789.0, "step": 1890 }, { "entropy": 0.6823407102376222, "epoch": 3.04, "grad_norm": 4.1875, "learning_rate": 1.8002222222222223e-05, "loss": 0.621, "mean_token_accuracy": 0.8089036166667938, "num_tokens": 37156529.0, "step": 1900 }, { "epoch": 3.04, "eval_biology_entropy": 0.7511428904533386, "eval_biology_loss": 1.4531614780426025, "eval_biology_mean_token_accuracy": 0.6934510517120361, "eval_biology_num_tokens": 37156529.0, "eval_biology_runtime": 38.62, "eval_biology_samples_per_second": 12.947, "eval_biology_steps_per_second": 3.237, "step": 1900 }, { "epoch": 3.04, "eval_physics_entropy": 0.4265694625377655, "eval_physics_loss": 0.781083881855011, "eval_physics_mean_token_accuracy": 0.8090727143287658, "eval_physics_num_tokens": 37156529.0, "eval_physics_runtime": 56.8784, "eval_physics_samples_per_second": 8.791, "eval_physics_steps_per_second": 2.198, "step": 1900 }, { "entropy": 0.6869338469579815, "epoch": 3.056, "grad_norm": 4.15625, "learning_rate": 1.798e-05, "loss": 0.6247, "mean_token_accuracy": 0.8083337739109993, "num_tokens": 37351325.0, "step": 1910 }, { "entropy": 0.6717990661039949, "epoch": 3.072, "grad_norm": 4.34375, "learning_rate": 1.7957777777777778e-05, "loss": 0.6161, "mean_token_accuracy": 0.8123877976089716, "num_tokens": 37541069.0, "step": 1920 }, { "entropy": 0.6788237554952502, "epoch": 3.088, "grad_norm": 4.40625, "learning_rate": 1.7935555555555556e-05, "loss": 0.6233, "mean_token_accuracy": 0.8102868665009737, "num_tokens": 37733440.0, "step": 1930 }, { "entropy": 0.6710379313677549, "epoch": 3.104, "grad_norm": 3.8125, "learning_rate": 1.7913333333333337e-05, "loss": 0.6095, "mean_token_accuracy": 0.811876380816102, "num_tokens": 37933661.0, "step": 1940 }, { "entropy": 0.6801185900345444, "epoch": 3.12, "grad_norm": 4.09375, "learning_rate": 1.789111111111111e-05, "loss": 0.6255, "mean_token_accuracy": 0.8087910477072, "num_tokens": 38126905.0, "step": 1950 }, { "entropy": 0.6823285862803459, "epoch": 3.136, "grad_norm": 4.4375, "learning_rate": 1.7868888888888892e-05, "loss": 0.619, "mean_token_accuracy": 0.8105869103223086, "num_tokens": 38316042.0, "step": 1960 }, { "entropy": 0.6686201065778732, "epoch": 3.152, "grad_norm": 4.53125, "learning_rate": 1.7846666666666666e-05, "loss": 0.6139, "mean_token_accuracy": 0.811134846508503, "num_tokens": 38514212.0, "step": 1970 }, { "entropy": 0.696059150248766, "epoch": 3.168, "grad_norm": 4.28125, "learning_rate": 1.7824444444444447e-05, "loss": 0.6377, "mean_token_accuracy": 0.8045505676418543, "num_tokens": 38708822.0, "step": 1980 }, { "entropy": 0.6871348209679127, "epoch": 3.184, "grad_norm": 3.9375, "learning_rate": 1.780222222222222e-05, "loss": 0.632, "mean_token_accuracy": 0.8063865445554257, "num_tokens": 38900708.0, "step": 1990 }, { "entropy": 0.6824637034907937, "epoch": 3.2, "grad_norm": 4.1875, "learning_rate": 1.7780000000000003e-05, "loss": 0.626, "mean_token_accuracy": 0.8084224767982959, "num_tokens": 39102325.0, "step": 2000 }, { "epoch": 3.2, "eval_biology_entropy": 0.7477709612846375, "eval_biology_loss": 1.473409652709961, "eval_biology_mean_token_accuracy": 0.6936884145736695, "eval_biology_num_tokens": 39102325.0, "eval_biology_runtime": 38.6411, "eval_biology_samples_per_second": 12.94, "eval_biology_steps_per_second": 3.235, "step": 2000 }, { "epoch": 3.2, "eval_physics_entropy": 0.4129572913646698, "eval_physics_loss": 0.8035010695457458, "eval_physics_mean_token_accuracy": 0.8071833181381226, "eval_physics_num_tokens": 39102325.0, "eval_physics_runtime": 56.8955, "eval_physics_samples_per_second": 8.788, "eval_physics_steps_per_second": 2.197, "step": 2000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 16, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.279038413131436e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }