{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.6, "eval_steps": 100, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 2.085290367901325, "epoch": 0.008, "grad_norm": 0.0146484375, "learning_rate": 1.8e-07, "loss": 2.8737, "mean_token_accuracy": 0.48555898685008286, "num_tokens": 173886.0, "step": 10 }, { "entropy": 2.082674165070057, "epoch": 0.016, "grad_norm": 0.00750732421875, "learning_rate": 3.8e-07, "loss": 2.8766, "mean_token_accuracy": 0.48314503096044065, "num_tokens": 338352.0, "step": 20 }, { "entropy": 2.083346585184336, "epoch": 0.024, "grad_norm": 0.01116943359375, "learning_rate": 5.800000000000001e-07, "loss": 2.8206, "mean_token_accuracy": 0.4898203007876873, "num_tokens": 505613.0, "step": 30 }, { "entropy": 2.084573998302221, "epoch": 0.032, "grad_norm": 0.01361083984375, "learning_rate": 7.8e-07, "loss": 2.846, "mean_token_accuracy": 0.4879810094833374, "num_tokens": 671193.0, "step": 40 }, { "entropy": 2.112175610661507, "epoch": 0.04, "grad_norm": 0.0184326171875, "learning_rate": 9.800000000000001e-07, "loss": 2.8945, "mean_token_accuracy": 0.48235367350280284, "num_tokens": 835868.0, "step": 50 }, { "entropy": 2.004745631664991, "epoch": 0.048, "grad_norm": 0.01055908203125, "learning_rate": 1.1800000000000001e-06, "loss": 2.7943, "mean_token_accuracy": 0.4953182445839047, "num_tokens": 1001008.0, "step": 60 }, { "entropy": 2.0760968685150147, "epoch": 0.056, "grad_norm": 0.013427734375, "learning_rate": 1.3800000000000001e-06, "loss": 2.8689, "mean_token_accuracy": 0.48274643896147607, "num_tokens": 1166615.0, "step": 70 }, { "entropy": 2.1421599693596365, "epoch": 0.064, "grad_norm": 0.01226806640625, "learning_rate": 1.5800000000000001e-06, "loss": 2.927, "mean_token_accuracy": 0.47564963400363924, "num_tokens": 1331208.0, "step": 80 }, { "entropy": 2.0629068821668626, "epoch": 0.072, "grad_norm": 0.00994873046875, "learning_rate": 1.7800000000000001e-06, "loss": 2.8669, "mean_token_accuracy": 0.48655857108533385, "num_tokens": 1499449.0, "step": 90 }, { "entropy": 2.1223726868629456, "epoch": 0.08, "grad_norm": 0.0184326171875, "learning_rate": 1.98e-06, "loss": 2.8834, "mean_token_accuracy": 0.48028963077813386, "num_tokens": 1667532.0, "step": 100 }, { "epoch": 0.08, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 1667532.0, "eval_coding_runtime": 96.0471, "eval_coding_samples_per_second": 5.206, "eval_coding_steps_per_second": 2.603, "step": 100 }, { "epoch": 0.08, "eval_physics_entropy": 2.116599133968353, "eval_physics_loss": 2.9798319339752197, "eval_physics_mean_token_accuracy": 0.4795549786090851, "eval_physics_num_tokens": 1667532.0, "eval_physics_runtime": 63.6847, "eval_physics_samples_per_second": 7.851, "eval_physics_steps_per_second": 3.926, "step": 100 }, { "entropy": 2.028460328280926, "epoch": 0.088, "grad_norm": 0.00732421875, "learning_rate": 2.1800000000000003e-06, "loss": 2.7998, "mean_token_accuracy": 0.4919602179899812, "num_tokens": 1835489.0, "step": 110 }, { "entropy": 2.0500536493957044, "epoch": 0.096, "grad_norm": 0.0166015625, "learning_rate": 2.38e-06, "loss": 2.843, "mean_token_accuracy": 0.486666096188128, "num_tokens": 2007176.0, "step": 120 }, { "entropy": 2.133950074762106, "epoch": 0.104, "grad_norm": 0.00897216796875, "learning_rate": 2.5800000000000003e-06, "loss": 2.908, "mean_token_accuracy": 0.47787463404238223, "num_tokens": 2171335.0, "step": 130 }, { "entropy": 2.0415130145847797, "epoch": 0.112, "grad_norm": 0.0125732421875, "learning_rate": 2.7800000000000005e-06, "loss": 2.8213, "mean_token_accuracy": 0.48844432439655067, "num_tokens": 2342259.0, "step": 140 }, { "entropy": 2.19760585129261, "epoch": 0.12, "grad_norm": 0.01519775390625, "learning_rate": 2.9800000000000003e-06, "loss": 2.9587, "mean_token_accuracy": 0.4719714432954788, "num_tokens": 2504279.0, "step": 150 }, { "entropy": 2.133862767368555, "epoch": 0.128, "grad_norm": 0.0101318359375, "learning_rate": 3.1800000000000005e-06, "loss": 2.895, "mean_token_accuracy": 0.48206167966127395, "num_tokens": 2663987.0, "step": 160 }, { "entropy": 2.1023842833936213, "epoch": 0.136, "grad_norm": 0.00836181640625, "learning_rate": 3.3800000000000007e-06, "loss": 2.8679, "mean_token_accuracy": 0.48314376436173917, "num_tokens": 2829720.0, "step": 170 }, { "entropy": 2.096611645817757, "epoch": 0.144, "grad_norm": 0.016845703125, "learning_rate": 3.58e-06, "loss": 2.882, "mean_token_accuracy": 0.4854999572038651, "num_tokens": 2997824.0, "step": 180 }, { "entropy": 2.104377479106188, "epoch": 0.152, "grad_norm": 0.00860595703125, "learning_rate": 3.7800000000000002e-06, "loss": 2.8908, "mean_token_accuracy": 0.48179021421819923, "num_tokens": 3159946.0, "step": 190 }, { "entropy": 2.091322401165962, "epoch": 0.16, "grad_norm": 0.00927734375, "learning_rate": 3.980000000000001e-06, "loss": 2.8428, "mean_token_accuracy": 0.4882024429738522, "num_tokens": 3330597.0, "step": 200 }, { "epoch": 0.16, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 3330597.0, "eval_coding_runtime": 95.6196, "eval_coding_samples_per_second": 5.229, "eval_coding_steps_per_second": 2.615, "step": 200 }, { "epoch": 0.16, "eval_physics_entropy": 2.116599133968353, "eval_physics_loss": 2.9798319339752197, "eval_physics_mean_token_accuracy": 0.4795549786090851, "eval_physics_num_tokens": 3330597.0, "eval_physics_runtime": 63.6492, "eval_physics_samples_per_second": 7.856, "eval_physics_steps_per_second": 3.928, "step": 200 }, { "entropy": 2.142216607928276, "epoch": 0.168, "grad_norm": 0.00701904296875, "learning_rate": 4.18e-06, "loss": 2.9186, "mean_token_accuracy": 0.4779110599309206, "num_tokens": 3496104.0, "step": 210 }, { "entropy": 2.1577497258782388, "epoch": 0.176, "grad_norm": 0.010498046875, "learning_rate": 4.38e-06, "loss": 2.9356, "mean_token_accuracy": 0.4760912239551544, "num_tokens": 3658264.0, "step": 220 }, { "entropy": 2.1311790965497495, "epoch": 0.184, "grad_norm": 0.006317138671875, "learning_rate": 4.58e-06, "loss": 2.8804, "mean_token_accuracy": 0.48113634046167136, "num_tokens": 3821352.0, "step": 230 }, { "entropy": 2.110344947129488, "epoch": 0.192, "grad_norm": 0.018798828125, "learning_rate": 4.78e-06, "loss": 2.8458, "mean_token_accuracy": 0.4866057766601443, "num_tokens": 3995568.0, "step": 240 }, { "entropy": 2.1274008348584177, "epoch": 0.2, "grad_norm": 0.00970458984375, "learning_rate": 4.980000000000001e-06, "loss": 2.9325, "mean_token_accuracy": 0.47856651023030283, "num_tokens": 4160918.0, "step": 250 }, { "entropy": 2.0445496000349523, "epoch": 0.208, "grad_norm": 0.0072021484375, "learning_rate": 5.18e-06, "loss": 2.7895, "mean_token_accuracy": 0.49246302992105484, "num_tokens": 4321436.0, "step": 260 }, { "entropy": 2.118650445342064, "epoch": 0.216, "grad_norm": 0.0118408203125, "learning_rate": 5.380000000000001e-06, "loss": 2.831, "mean_token_accuracy": 0.48565028738230465, "num_tokens": 4482225.0, "step": 270 }, { "entropy": 2.0710766136646273, "epoch": 0.224, "grad_norm": 0.00823974609375, "learning_rate": 5.580000000000001e-06, "loss": 2.8524, "mean_token_accuracy": 0.4889204766601324, "num_tokens": 4648491.0, "step": 280 }, { "entropy": 2.0747790560126305, "epoch": 0.232, "grad_norm": 0.009033203125, "learning_rate": 5.78e-06, "loss": 2.8736, "mean_token_accuracy": 0.4836031161248684, "num_tokens": 4813449.0, "step": 290 }, { "entropy": 2.1116694904863835, "epoch": 0.24, "grad_norm": 0.01141357421875, "learning_rate": 5.98e-06, "loss": 2.8974, "mean_token_accuracy": 0.47815693244338037, "num_tokens": 4986175.0, "step": 300 }, { "epoch": 0.24, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 4986175.0, "eval_coding_runtime": 95.7299, "eval_coding_samples_per_second": 5.223, "eval_coding_steps_per_second": 2.612, "step": 300 }, { "epoch": 0.24, "eval_physics_entropy": 2.116599133968353, "eval_physics_loss": 2.9798319339752197, "eval_physics_mean_token_accuracy": 0.4795549786090851, "eval_physics_num_tokens": 4986175.0, "eval_physics_runtime": 63.6784, "eval_physics_samples_per_second": 7.852, "eval_physics_steps_per_second": 3.926, "step": 300 }, { "entropy": 2.085676248371601, "epoch": 0.248, "grad_norm": 0.01513671875, "learning_rate": 6.18e-06, "loss": 2.8547, "mean_token_accuracy": 0.4871716873720288, "num_tokens": 5164741.0, "step": 310 }, { "entropy": 2.111249604821205, "epoch": 0.256, "grad_norm": 0.01324462890625, "learning_rate": 6.380000000000001e-06, "loss": 2.869, "mean_token_accuracy": 0.482149850204587, "num_tokens": 5329320.0, "step": 320 }, { "entropy": 2.0735500395298003, "epoch": 0.264, "grad_norm": 0.01300048828125, "learning_rate": 6.5800000000000005e-06, "loss": 2.815, "mean_token_accuracy": 0.4895795300602913, "num_tokens": 5497259.0, "step": 330 }, { "entropy": 2.1957527093589304, "epoch": 0.272, "grad_norm": 0.0155029296875, "learning_rate": 6.780000000000001e-06, "loss": 2.9296, "mean_token_accuracy": 0.4737250735983253, "num_tokens": 5658796.0, "step": 340 }, { "entropy": 2.0978702254593373, "epoch": 0.28, "grad_norm": 0.012939453125, "learning_rate": 6.98e-06, "loss": 2.9223, "mean_token_accuracy": 0.47742239478975534, "num_tokens": 5819027.0, "step": 350 }, { "entropy": 2.126638425886631, "epoch": 0.288, "grad_norm": 0.01104736328125, "learning_rate": 7.180000000000001e-06, "loss": 2.8822, "mean_token_accuracy": 0.48392471522092817, "num_tokens": 5980201.0, "step": 360 }, { "entropy": 2.1021372735500337, "epoch": 0.296, "grad_norm": 0.0208740234375, "learning_rate": 7.3800000000000005e-06, "loss": 2.8638, "mean_token_accuracy": 0.4845348121598363, "num_tokens": 6139830.0, "step": 370 }, { "entropy": 2.1106897443532944, "epoch": 0.304, "grad_norm": 0.00787353515625, "learning_rate": 7.58e-06, "loss": 2.8357, "mean_token_accuracy": 0.4834788400679827, "num_tokens": 6298858.0, "step": 380 }, { "entropy": 2.143523923307657, "epoch": 0.312, "grad_norm": 0.00958251953125, "learning_rate": 7.78e-06, "loss": 2.9236, "mean_token_accuracy": 0.47671638112515213, "num_tokens": 6456502.0, "step": 390 }, { "entropy": 2.1132052429020405, "epoch": 0.32, "grad_norm": 0.01336669921875, "learning_rate": 7.980000000000002e-06, "loss": 2.8795, "mean_token_accuracy": 0.48184755612164737, "num_tokens": 6622798.0, "step": 400 }, { "epoch": 0.32, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 6622798.0, "eval_coding_runtime": 95.588, "eval_coding_samples_per_second": 5.231, "eval_coding_steps_per_second": 2.615, "step": 400 }, { "epoch": 0.32, "eval_physics_entropy": 2.116599133968353, "eval_physics_loss": 2.9798319339752197, "eval_physics_mean_token_accuracy": 0.4795549786090851, "eval_physics_num_tokens": 6622798.0, "eval_physics_runtime": 63.5621, "eval_physics_samples_per_second": 7.866, "eval_physics_steps_per_second": 3.933, "step": 400 }, { "entropy": 2.1359100699424745, "epoch": 0.328, "grad_norm": 0.01220703125, "learning_rate": 8.18e-06, "loss": 2.892, "mean_token_accuracy": 0.4762568075209856, "num_tokens": 6782985.0, "step": 410 }, { "entropy": 2.080799935385585, "epoch": 0.336, "grad_norm": 0.021484375, "learning_rate": 8.380000000000001e-06, "loss": 2.8705, "mean_token_accuracy": 0.4826640056446195, "num_tokens": 6953611.0, "step": 420 }, { "entropy": 2.0621030911803246, "epoch": 0.344, "grad_norm": 0.013427734375, "learning_rate": 8.580000000000001e-06, "loss": 2.777, "mean_token_accuracy": 0.48948597013950346, "num_tokens": 7121949.0, "step": 430 }, { "entropy": 2.0954283088445664, "epoch": 0.352, "grad_norm": 0.01080322265625, "learning_rate": 8.78e-06, "loss": 2.8529, "mean_token_accuracy": 0.48598715253174307, "num_tokens": 7290378.0, "step": 440 }, { "entropy": 2.1145325861871243, "epoch": 0.36, "grad_norm": 0.01611328125, "learning_rate": 8.98e-06, "loss": 2.864, "mean_token_accuracy": 0.48475142791867254, "num_tokens": 7452920.0, "step": 450 }, { "entropy": 2.0749227844178675, "epoch": 0.368, "grad_norm": 0.0255126953125, "learning_rate": 9.180000000000002e-06, "loss": 2.842, "mean_token_accuracy": 0.4888931142166257, "num_tokens": 7621082.0, "step": 460 }, { "entropy": 2.158674482256174, "epoch": 0.376, "grad_norm": 0.0189208984375, "learning_rate": 9.38e-06, "loss": 2.8965, "mean_token_accuracy": 0.47893171701580284, "num_tokens": 7787036.0, "step": 470 }, { "entropy": 2.172913119196892, "epoch": 0.384, "grad_norm": 0.0211181640625, "learning_rate": 9.58e-06, "loss": 2.9484, "mean_token_accuracy": 0.47502864580601456, "num_tokens": 7955570.0, "step": 480 }, { "entropy": 2.133708968013525, "epoch": 0.392, "grad_norm": 0.0089111328125, "learning_rate": 9.780000000000001e-06, "loss": 2.9115, "mean_token_accuracy": 0.481097811833024, "num_tokens": 8123527.0, "step": 490 }, { "entropy": 2.075808613747358, "epoch": 0.4, "grad_norm": 0.01458740234375, "learning_rate": 9.980000000000001e-06, "loss": 2.8509, "mean_token_accuracy": 0.4880217058584094, "num_tokens": 8291049.0, "step": 500 }, { "epoch": 0.4, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 8291049.0, "eval_coding_runtime": 95.8745, "eval_coding_samples_per_second": 5.215, "eval_coding_steps_per_second": 2.608, "step": 500 }, { "epoch": 0.4, "eval_physics_entropy": 2.116599133968353, "eval_physics_loss": 2.9798319339752197, "eval_physics_mean_token_accuracy": 0.4795549786090851, "eval_physics_num_tokens": 8291049.0, "eval_physics_runtime": 63.2475, "eval_physics_samples_per_second": 7.905, "eval_physics_steps_per_second": 3.953, "step": 500 }, { "entropy": 2.0808891989290714, "epoch": 0.408, "grad_norm": 0.00933837890625, "learning_rate": 1.018e-05, "loss": 2.8602, "mean_token_accuracy": 0.48418706730008126, "num_tokens": 8459795.0, "step": 510 }, { "entropy": 2.108520568162203, "epoch": 0.416, "grad_norm": 0.01300048828125, "learning_rate": 1.038e-05, "loss": 2.8709, "mean_token_accuracy": 0.4807485779747367, "num_tokens": 8627310.0, "step": 520 }, { "entropy": 2.095228637009859, "epoch": 0.424, "grad_norm": 0.0186767578125, "learning_rate": 1.0580000000000002e-05, "loss": 2.8575, "mean_token_accuracy": 0.4873119568452239, "num_tokens": 8792574.0, "step": 530 }, { "entropy": 2.0733875431120397, "epoch": 0.432, "grad_norm": 0.0152587890625, "learning_rate": 1.0780000000000002e-05, "loss": 2.8746, "mean_token_accuracy": 0.48152947407215835, "num_tokens": 8958371.0, "step": 540 }, { "entropy": 2.0743850864470006, "epoch": 0.44, "grad_norm": 0.0142822265625, "learning_rate": 1.0980000000000002e-05, "loss": 2.8417, "mean_token_accuracy": 0.48614103216677906, "num_tokens": 9125564.0, "step": 550 }, { "entropy": 2.0296142108738424, "epoch": 0.448, "grad_norm": 0.00909423828125, "learning_rate": 1.1180000000000001e-05, "loss": 2.8092, "mean_token_accuracy": 0.49211322125047446, "num_tokens": 9294388.0, "step": 560 }, { "entropy": 2.042448101937771, "epoch": 0.456, "grad_norm": 0.00897216796875, "learning_rate": 1.138e-05, "loss": 2.8196, "mean_token_accuracy": 0.4887328814715147, "num_tokens": 9455428.0, "step": 570 }, { "entropy": 2.124661027640104, "epoch": 0.464, "grad_norm": 0.0172119140625, "learning_rate": 1.1580000000000001e-05, "loss": 2.8825, "mean_token_accuracy": 0.4844032688066363, "num_tokens": 9620366.0, "step": 580 }, { "entropy": 2.0920108027756212, "epoch": 0.472, "grad_norm": 0.01458740234375, "learning_rate": 1.178e-05, "loss": 2.8011, "mean_token_accuracy": 0.49246231019496917, "num_tokens": 9788925.0, "step": 590 }, { "entropy": 2.0895667865872385, "epoch": 0.48, "grad_norm": 0.007568359375, "learning_rate": 1.198e-05, "loss": 2.8784, "mean_token_accuracy": 0.4846471020951867, "num_tokens": 9955431.0, "step": 600 }, { "epoch": 0.48, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 9955431.0, "eval_coding_runtime": 95.6269, "eval_coding_samples_per_second": 5.229, "eval_coding_steps_per_second": 2.614, "step": 600 }, { "epoch": 0.48, "eval_physics_entropy": 2.116599133968353, "eval_physics_loss": 2.9798319339752197, "eval_physics_mean_token_accuracy": 0.4795549786090851, "eval_physics_num_tokens": 9955431.0, "eval_physics_runtime": 63.6584, "eval_physics_samples_per_second": 7.854, "eval_physics_steps_per_second": 3.927, "step": 600 }, { "entropy": 2.1037806943058968, "epoch": 0.488, "grad_norm": 0.01434326171875, "learning_rate": 1.218e-05, "loss": 2.8897, "mean_token_accuracy": 0.48129043709486724, "num_tokens": 10119599.0, "step": 610 }, { "entropy": 2.1045577257871626, "epoch": 0.496, "grad_norm": 0.02294921875, "learning_rate": 1.2380000000000002e-05, "loss": 2.8986, "mean_token_accuracy": 0.4801080621778965, "num_tokens": 10289883.0, "step": 620 }, { "entropy": 2.1075518101453783, "epoch": 0.504, "grad_norm": 0.007354736328125, "learning_rate": 1.2580000000000002e-05, "loss": 2.8519, "mean_token_accuracy": 0.4833444032818079, "num_tokens": 10454683.0, "step": 630 }, { "entropy": 2.106730377674103, "epoch": 0.512, "grad_norm": 0.00823974609375, "learning_rate": 1.2780000000000001e-05, "loss": 2.9125, "mean_token_accuracy": 0.4791097413748503, "num_tokens": 10619468.0, "step": 640 }, { "entropy": 2.149070845544338, "epoch": 0.52, "grad_norm": 0.017822265625, "learning_rate": 1.2980000000000001e-05, "loss": 2.9383, "mean_token_accuracy": 0.4766066731885076, "num_tokens": 10781285.0, "step": 650 }, { "entropy": 2.109221810847521, "epoch": 0.528, "grad_norm": 0.01123046875, "learning_rate": 1.3180000000000001e-05, "loss": 2.8512, "mean_token_accuracy": 0.48691278155893086, "num_tokens": 10944904.0, "step": 660 }, { "entropy": 2.10501976236701, "epoch": 0.536, "grad_norm": 0.0079345703125, "learning_rate": 1.3380000000000002e-05, "loss": 2.8583, "mean_token_accuracy": 0.48396560084074736, "num_tokens": 11106538.0, "step": 670 }, { "entropy": 2.1422630779445173, "epoch": 0.544, "grad_norm": 0.013916015625, "learning_rate": 1.3580000000000002e-05, "loss": 2.9198, "mean_token_accuracy": 0.4756466532126069, "num_tokens": 11266409.0, "step": 680 }, { "entropy": 2.066490865498781, "epoch": 0.552, "grad_norm": 0.0120849609375, "learning_rate": 1.378e-05, "loss": 2.8749, "mean_token_accuracy": 0.48253467567265035, "num_tokens": 11438391.0, "step": 690 }, { "entropy": 2.1218110136687756, "epoch": 0.56, "grad_norm": 0.00811767578125, "learning_rate": 1.398e-05, "loss": 2.8685, "mean_token_accuracy": 0.4830782825127244, "num_tokens": 11605544.0, "step": 700 }, { "epoch": 0.56, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 11605544.0, "eval_coding_runtime": 95.6063, "eval_coding_samples_per_second": 5.23, "eval_coding_steps_per_second": 2.615, "step": 700 }, { "epoch": 0.56, "eval_physics_entropy": 2.116599133968353, "eval_physics_loss": 2.9798319339752197, "eval_physics_mean_token_accuracy": 0.4795549786090851, "eval_physics_num_tokens": 11605544.0, "eval_physics_runtime": 63.652, "eval_physics_samples_per_second": 7.855, "eval_physics_steps_per_second": 3.928, "step": 700 }, { "entropy": 2.0374118871986866, "epoch": 0.568, "grad_norm": 0.0164794921875, "learning_rate": 1.418e-05, "loss": 2.8443, "mean_token_accuracy": 0.48784226551651955, "num_tokens": 11765430.0, "step": 710 }, { "entropy": 2.079664882272482, "epoch": 0.576, "grad_norm": 0.01336669921875, "learning_rate": 1.4380000000000001e-05, "loss": 2.8436, "mean_token_accuracy": 0.4859828842803836, "num_tokens": 11931715.0, "step": 720 }, { "entropy": 2.091735219210386, "epoch": 0.584, "grad_norm": 0.00836181640625, "learning_rate": 1.4580000000000001e-05, "loss": 2.891, "mean_token_accuracy": 0.4845219163224101, "num_tokens": 12093686.0, "step": 730 }, { "entropy": 2.092084125429392, "epoch": 0.592, "grad_norm": 0.01153564453125, "learning_rate": 1.478e-05, "loss": 2.8884, "mean_token_accuracy": 0.4797184634953737, "num_tokens": 12257243.0, "step": 740 }, { "entropy": 2.138279449939728, "epoch": 0.6, "grad_norm": 0.01043701171875, "learning_rate": 1.498e-05, "loss": 2.8937, "mean_token_accuracy": 0.4808414502069354, "num_tokens": 12422008.0, "step": 750 }, { "entropy": 2.025119251012802, "epoch": 0.608, "grad_norm": 0.01422119140625, "learning_rate": 1.5180000000000002e-05, "loss": 2.801, "mean_token_accuracy": 0.4929833130910993, "num_tokens": 12583154.0, "step": 760 }, { "entropy": 2.068293011188507, "epoch": 0.616, "grad_norm": 0.018798828125, "learning_rate": 1.5380000000000002e-05, "loss": 2.8332, "mean_token_accuracy": 0.4890221064910293, "num_tokens": 12746458.0, "step": 770 }, { "entropy": 2.1028603583574297, "epoch": 0.624, "grad_norm": 0.009033203125, "learning_rate": 1.5580000000000003e-05, "loss": 2.9238, "mean_token_accuracy": 0.4755176778882742, "num_tokens": 12905392.0, "step": 780 }, { "entropy": 2.103465436398983, "epoch": 0.632, "grad_norm": 0.012939453125, "learning_rate": 1.578e-05, "loss": 2.8914, "mean_token_accuracy": 0.48264262825250626, "num_tokens": 13071058.0, "step": 790 }, { "entropy": 2.107102530449629, "epoch": 0.64, "grad_norm": 0.01373291015625, "learning_rate": 1.5980000000000003e-05, "loss": 2.8937, "mean_token_accuracy": 0.4812551226466894, "num_tokens": 13232198.0, "step": 800 }, { "epoch": 0.64, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 13232198.0, "eval_coding_runtime": 95.5879, "eval_coding_samples_per_second": 5.231, "eval_coding_steps_per_second": 2.615, "step": 800 }, { "epoch": 0.64, "eval_physics_entropy": 2.116599133968353, "eval_physics_loss": 2.9798319339752197, "eval_physics_mean_token_accuracy": 0.4795549786090851, "eval_physics_num_tokens": 13232198.0, "eval_physics_runtime": 63.5241, "eval_physics_samples_per_second": 7.871, "eval_physics_steps_per_second": 3.936, "step": 800 }, { "entropy": 2.1226571217179298, "epoch": 0.648, "grad_norm": 0.010009765625, "learning_rate": 1.618e-05, "loss": 2.8715, "mean_token_accuracy": 0.48374479077756405, "num_tokens": 13401064.0, "step": 810 }, { "entropy": 2.071519161760807, "epoch": 0.656, "grad_norm": 0.007476806640625, "learning_rate": 1.638e-05, "loss": 2.8209, "mean_token_accuracy": 0.48990428987890483, "num_tokens": 13575902.0, "step": 820 }, { "entropy": 2.1139444075524807, "epoch": 0.664, "grad_norm": 0.008544921875, "learning_rate": 1.658e-05, "loss": 2.9185, "mean_token_accuracy": 0.48024304024875164, "num_tokens": 13733192.0, "step": 830 }, { "entropy": 2.1097038336098195, "epoch": 0.672, "grad_norm": 0.0201416015625, "learning_rate": 1.6780000000000002e-05, "loss": 2.856, "mean_token_accuracy": 0.48414181135594847, "num_tokens": 13895997.0, "step": 840 }, { "entropy": 2.0350618422031403, "epoch": 0.68, "grad_norm": 0.007354736328125, "learning_rate": 1.698e-05, "loss": 2.8006, "mean_token_accuracy": 0.49249137472361326, "num_tokens": 14074579.0, "step": 850 }, { "entropy": 2.1360582523047924, "epoch": 0.688, "grad_norm": 0.012939453125, "learning_rate": 1.718e-05, "loss": 2.9135, "mean_token_accuracy": 0.48240375574678185, "num_tokens": 14234888.0, "step": 860 }, { "entropy": 2.0854096628725527, "epoch": 0.696, "grad_norm": 0.0159912109375, "learning_rate": 1.7380000000000003e-05, "loss": 2.8889, "mean_token_accuracy": 0.48132998365908863, "num_tokens": 14411150.0, "step": 870 }, { "entropy": 2.1598948813974856, "epoch": 0.704, "grad_norm": 0.0067138671875, "learning_rate": 1.758e-05, "loss": 2.941, "mean_token_accuracy": 0.47370197977870704, "num_tokens": 14567908.0, "step": 880 }, { "entropy": 2.1014511175453663, "epoch": 0.712, "grad_norm": 0.01458740234375, "learning_rate": 1.7780000000000003e-05, "loss": 2.8989, "mean_token_accuracy": 0.4840679431334138, "num_tokens": 14728975.0, "step": 890 }, { "entropy": 2.128746274113655, "epoch": 0.72, "grad_norm": 0.01361083984375, "learning_rate": 1.798e-05, "loss": 2.8664, "mean_token_accuracy": 0.4867586812004447, "num_tokens": 14882927.0, "step": 900 }, { "epoch": 0.72, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 14882927.0, "eval_coding_runtime": 95.8303, "eval_coding_samples_per_second": 5.218, "eval_coding_steps_per_second": 2.609, "step": 900 }, { "epoch": 0.72, "eval_physics_entropy": 2.116599133968353, "eval_physics_loss": 2.9798319339752197, "eval_physics_mean_token_accuracy": 0.4795549786090851, "eval_physics_num_tokens": 14882927.0, "eval_physics_runtime": 63.6666, "eval_physics_samples_per_second": 7.853, "eval_physics_steps_per_second": 3.927, "step": 900 }, { "entropy": 2.0901576526463033, "epoch": 0.728, "grad_norm": 0.0067138671875, "learning_rate": 1.8180000000000002e-05, "loss": 2.8708, "mean_token_accuracy": 0.48403808698058126, "num_tokens": 15052770.0, "step": 910 }, { "entropy": 2.122075318545103, "epoch": 0.736, "grad_norm": 0.01129150390625, "learning_rate": 1.8380000000000004e-05, "loss": 2.8633, "mean_token_accuracy": 0.4852804895490408, "num_tokens": 15217342.0, "step": 920 }, { "entropy": 2.093779005110264, "epoch": 0.744, "grad_norm": 0.0079345703125, "learning_rate": 1.858e-05, "loss": 2.8451, "mean_token_accuracy": 0.48721388857811687, "num_tokens": 15385161.0, "step": 930 }, { "entropy": 2.131568420678377, "epoch": 0.752, "grad_norm": 0.0096435546875, "learning_rate": 1.878e-05, "loss": 2.9118, "mean_token_accuracy": 0.479784238897264, "num_tokens": 15547710.0, "step": 940 }, { "entropy": 2.18331568017602, "epoch": 0.76, "grad_norm": 0.007171630859375, "learning_rate": 1.898e-05, "loss": 2.9565, "mean_token_accuracy": 0.4732973873615265, "num_tokens": 15704494.0, "step": 950 }, { "entropy": 2.1320055924355983, "epoch": 0.768, "grad_norm": 0.01177978515625, "learning_rate": 1.918e-05, "loss": 2.8867, "mean_token_accuracy": 0.4821720516309142, "num_tokens": 15877177.0, "step": 960 }, { "entropy": 2.035961114615202, "epoch": 0.776, "grad_norm": 0.008544921875, "learning_rate": 1.938e-05, "loss": 2.8155, "mean_token_accuracy": 0.4874297235161066, "num_tokens": 16050530.0, "step": 970 }, { "entropy": 2.0721310496330263, "epoch": 0.784, "grad_norm": 0.01214599609375, "learning_rate": 1.9580000000000002e-05, "loss": 2.8311, "mean_token_accuracy": 0.4866502169519663, "num_tokens": 16219640.0, "step": 980 }, { "entropy": 2.1055771827697756, "epoch": 0.792, "grad_norm": 0.00927734375, "learning_rate": 1.978e-05, "loss": 2.8748, "mean_token_accuracy": 0.4852980706840754, "num_tokens": 16378820.0, "step": 990 }, { "entropy": 2.0726777322590353, "epoch": 0.8, "grad_norm": 0.01348876953125, "learning_rate": 1.9980000000000002e-05, "loss": 2.8588, "mean_token_accuracy": 0.4857082888484001, "num_tokens": 16548261.0, "step": 1000 }, { "epoch": 0.8, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 16548261.0, "eval_coding_runtime": 95.9331, "eval_coding_samples_per_second": 5.212, "eval_coding_steps_per_second": 2.606, "step": 1000 }, { "epoch": 0.8, "eval_physics_entropy": 2.116599133968353, "eval_physics_loss": 2.9798319339752197, "eval_physics_mean_token_accuracy": 0.4795549786090851, "eval_physics_num_tokens": 16548261.0, "eval_physics_runtime": 63.1527, "eval_physics_samples_per_second": 7.917, "eval_physics_steps_per_second": 3.959, "step": 1000 }, { "entropy": 2.1674983091652393, "epoch": 0.808, "grad_norm": 0.0155029296875, "learning_rate": 1.9980000000000002e-05, "loss": 2.9179, "mean_token_accuracy": 0.4778887365013361, "num_tokens": 16711667.0, "step": 1010 }, { "entropy": 2.082773020863533, "epoch": 0.816, "grad_norm": 0.00738525390625, "learning_rate": 1.995777777777778e-05, "loss": 2.8647, "mean_token_accuracy": 0.48503445759415625, "num_tokens": 16870533.0, "step": 1020 }, { "entropy": 2.1195092521607877, "epoch": 0.824, "grad_norm": 0.0084228515625, "learning_rate": 1.9935555555555557e-05, "loss": 2.8734, "mean_token_accuracy": 0.4803088143467903, "num_tokens": 17035600.0, "step": 1030 }, { "entropy": 2.1006917230784894, "epoch": 0.832, "grad_norm": 0.01318359375, "learning_rate": 1.9913333333333335e-05, "loss": 2.8807, "mean_token_accuracy": 0.48227673936635257, "num_tokens": 17196371.0, "step": 1040 }, { "entropy": 2.1442659996449946, "epoch": 0.84, "grad_norm": 0.01171875, "learning_rate": 1.9891111111111112e-05, "loss": 2.891, "mean_token_accuracy": 0.48074845802038907, "num_tokens": 17356919.0, "step": 1050 }, { "entropy": 2.0704655081033705, "epoch": 0.848, "grad_norm": 0.0177001953125, "learning_rate": 1.986888888888889e-05, "loss": 2.8486, "mean_token_accuracy": 0.4880160320550203, "num_tokens": 17527066.0, "step": 1060 }, { "entropy": 2.125109201669693, "epoch": 0.856, "grad_norm": 0.00823974609375, "learning_rate": 1.9846666666666668e-05, "loss": 2.8795, "mean_token_accuracy": 0.4847541594877839, "num_tokens": 17689902.0, "step": 1070 }, { "entropy": 2.1320445723831654, "epoch": 0.864, "grad_norm": 0.0140380859375, "learning_rate": 1.9824444444444445e-05, "loss": 2.8809, "mean_token_accuracy": 0.480629575625062, "num_tokens": 17859605.0, "step": 1080 }, { "entropy": 2.1207774586975576, "epoch": 0.872, "grad_norm": 0.0140380859375, "learning_rate": 1.9802222222222226e-05, "loss": 2.8729, "mean_token_accuracy": 0.4811369523406029, "num_tokens": 18019924.0, "step": 1090 }, { "entropy": 2.1364660277962684, "epoch": 0.88, "grad_norm": 0.01361083984375, "learning_rate": 1.978e-05, "loss": 2.8681, "mean_token_accuracy": 0.4821885580196977, "num_tokens": 18187315.0, "step": 1100 }, { "epoch": 0.88, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 18187315.0, "eval_coding_runtime": 95.6457, "eval_coding_samples_per_second": 5.228, "eval_coding_steps_per_second": 2.614, "step": 1100 }, { "epoch": 0.88, "eval_physics_entropy": 2.116599133968353, "eval_physics_loss": 2.9798319339752197, "eval_physics_mean_token_accuracy": 0.4795549786090851, "eval_physics_num_tokens": 18187315.0, "eval_physics_runtime": 63.6627, "eval_physics_samples_per_second": 7.854, "eval_physics_steps_per_second": 3.927, "step": 1100 }, { "entropy": 1.9653398767113686, "epoch": 0.888, "grad_norm": 0.009033203125, "learning_rate": 1.975777777777778e-05, "loss": 2.7728, "mean_token_accuracy": 0.4965917535126209, "num_tokens": 18366329.0, "step": 1110 }, { "entropy": 2.0933822274208067, "epoch": 0.896, "grad_norm": 0.0191650390625, "learning_rate": 1.9735555555555556e-05, "loss": 2.8728, "mean_token_accuracy": 0.4872217344120145, "num_tokens": 18536790.0, "step": 1120 }, { "entropy": 2.1074513986706735, "epoch": 0.904, "grad_norm": 0.007049560546875, "learning_rate": 1.9713333333333337e-05, "loss": 2.9176, "mean_token_accuracy": 0.48107887227088214, "num_tokens": 18701809.0, "step": 1130 }, { "entropy": 2.160377311706543, "epoch": 0.912, "grad_norm": 0.006683349609375, "learning_rate": 1.969111111111111e-05, "loss": 2.9384, "mean_token_accuracy": 0.4739804005250335, "num_tokens": 18869344.0, "step": 1140 }, { "entropy": 2.0753089264035225, "epoch": 0.92, "grad_norm": 0.01416015625, "learning_rate": 1.9668888888888892e-05, "loss": 2.882, "mean_token_accuracy": 0.48446453278884294, "num_tokens": 19033910.0, "step": 1150 }, { "entropy": 2.1365489259362223, "epoch": 0.928, "grad_norm": 0.006988525390625, "learning_rate": 1.9646666666666666e-05, "loss": 2.8854, "mean_token_accuracy": 0.4823284359648824, "num_tokens": 19199834.0, "step": 1160 }, { "entropy": 2.0710911326110364, "epoch": 0.936, "grad_norm": 0.007080078125, "learning_rate": 1.9624444444444447e-05, "loss": 2.8127, "mean_token_accuracy": 0.48797463029623034, "num_tokens": 19361703.0, "step": 1170 }, { "entropy": 2.116609390079975, "epoch": 0.944, "grad_norm": 0.007110595703125, "learning_rate": 1.9602222222222225e-05, "loss": 2.8751, "mean_token_accuracy": 0.48552187960594895, "num_tokens": 19525385.0, "step": 1180 }, { "entropy": 2.0949166089296343, "epoch": 0.952, "grad_norm": 0.03076171875, "learning_rate": 1.9580000000000002e-05, "loss": 2.8499, "mean_token_accuracy": 0.48639670219272374, "num_tokens": 19690715.0, "step": 1190 }, { "entropy": 2.1095651358366014, "epoch": 0.96, "grad_norm": 0.015380859375, "learning_rate": 1.955777777777778e-05, "loss": 2.8706, "mean_token_accuracy": 0.48429451026022435, "num_tokens": 19856973.0, "step": 1200 }, { "epoch": 0.96, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 19856973.0, "eval_coding_runtime": 95.6684, "eval_coding_samples_per_second": 5.226, "eval_coding_steps_per_second": 2.613, "step": 1200 }, { "epoch": 0.96, "eval_physics_entropy": 2.116599133968353, "eval_physics_loss": 2.9798319339752197, "eval_physics_mean_token_accuracy": 0.4795549786090851, "eval_physics_num_tokens": 19856973.0, "eval_physics_runtime": 63.6635, "eval_physics_samples_per_second": 7.854, "eval_physics_steps_per_second": 3.927, "step": 1200 }, { "entropy": 2.0797565251588823, "epoch": 0.968, "grad_norm": 0.0091552734375, "learning_rate": 1.9535555555555557e-05, "loss": 2.8871, "mean_token_accuracy": 0.487212173268199, "num_tokens": 20021093.0, "step": 1210 }, { "entropy": 2.0395397424697874, "epoch": 0.976, "grad_norm": 0.01397705078125, "learning_rate": 1.9513333333333335e-05, "loss": 2.8149, "mean_token_accuracy": 0.4898043507710099, "num_tokens": 20190600.0, "step": 1220 }, { "entropy": 2.170710327476263, "epoch": 0.984, "grad_norm": 0.0123291015625, "learning_rate": 1.9491111111111113e-05, "loss": 2.9186, "mean_token_accuracy": 0.4756599869579077, "num_tokens": 20352255.0, "step": 1230 }, { "entropy": 2.074953277409077, "epoch": 0.992, "grad_norm": 0.01092529296875, "learning_rate": 1.946888888888889e-05, "loss": 2.8591, "mean_token_accuracy": 0.48683627638965843, "num_tokens": 20518014.0, "step": 1240 }, { "entropy": 2.1222646929323674, "epoch": 1.0, "grad_norm": 0.00909423828125, "learning_rate": 1.9446666666666668e-05, "loss": 2.8802, "mean_token_accuracy": 0.4825377007946372, "num_tokens": 20686804.0, "step": 1250 }, { "entropy": 2.071950024366379, "epoch": 1.008, "grad_norm": 0.01141357421875, "learning_rate": 1.9424444444444446e-05, "loss": 2.8335, "mean_token_accuracy": 0.4894521238282323, "num_tokens": 20859215.0, "step": 1260 }, { "entropy": 2.178350891917944, "epoch": 1.016, "grad_norm": 0.0086669921875, "learning_rate": 1.9402222222222223e-05, "loss": 2.9515, "mean_token_accuracy": 0.475351152010262, "num_tokens": 21019542.0, "step": 1270 }, { "entropy": 2.0342648893594744, "epoch": 1.024, "grad_norm": 0.01483154296875, "learning_rate": 1.938e-05, "loss": 2.8036, "mean_token_accuracy": 0.4928664604201913, "num_tokens": 21186658.0, "step": 1280 }, { "entropy": 2.0908240154385567, "epoch": 1.032, "grad_norm": 0.00897216796875, "learning_rate": 1.935777777777778e-05, "loss": 2.8868, "mean_token_accuracy": 0.483198188431561, "num_tokens": 21351715.0, "step": 1290 }, { "entropy": 2.1210865639150143, "epoch": 1.04, "grad_norm": 0.008544921875, "learning_rate": 1.9335555555555556e-05, "loss": 2.9071, "mean_token_accuracy": 0.47981371227651837, "num_tokens": 21517237.0, "step": 1300 }, { "epoch": 1.04, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 21517237.0, "eval_coding_runtime": 96.1911, "eval_coding_samples_per_second": 5.198, "eval_coding_steps_per_second": 2.599, "step": 1300 }, { "epoch": 1.04, "eval_physics_entropy": 2.116599133968353, "eval_physics_loss": 2.9798319339752197, "eval_physics_mean_token_accuracy": 0.4795549786090851, "eval_physics_num_tokens": 21517237.0, "eval_physics_runtime": 63.9611, "eval_physics_samples_per_second": 7.817, "eval_physics_steps_per_second": 3.909, "step": 1300 }, { "entropy": 2.102020622044802, "epoch": 1.048, "grad_norm": 0.007415771484375, "learning_rate": 1.9313333333333334e-05, "loss": 2.8719, "mean_token_accuracy": 0.4847179941833019, "num_tokens": 21682780.0, "step": 1310 }, { "entropy": 2.045586694777012, "epoch": 1.056, "grad_norm": 0.0086669921875, "learning_rate": 1.9291111111111115e-05, "loss": 2.8189, "mean_token_accuracy": 0.48618846032768487, "num_tokens": 21852528.0, "step": 1320 }, { "entropy": 2.0868709199130535, "epoch": 1.064, "grad_norm": 0.01226806640625, "learning_rate": 1.926888888888889e-05, "loss": 2.8313, "mean_token_accuracy": 0.4878941228613257, "num_tokens": 22021514.0, "step": 1330 }, { "entropy": 2.111675312370062, "epoch": 1.072, "grad_norm": 0.00909423828125, "learning_rate": 1.924666666666667e-05, "loss": 2.889, "mean_token_accuracy": 0.4821753781288862, "num_tokens": 22192380.0, "step": 1340 }, { "entropy": 2.080042862892151, "epoch": 1.08, "grad_norm": 0.02294921875, "learning_rate": 1.9224444444444444e-05, "loss": 2.8401, "mean_token_accuracy": 0.4857699448242784, "num_tokens": 22356835.0, "step": 1350 }, { "entropy": 2.138657582551241, "epoch": 1.088, "grad_norm": 0.006927490234375, "learning_rate": 1.9202222222222225e-05, "loss": 2.9223, "mean_token_accuracy": 0.4771748060360551, "num_tokens": 22517971.0, "step": 1360 }, { "entropy": 2.1065418414771555, "epoch": 1.096, "grad_norm": 0.01422119140625, "learning_rate": 1.918e-05, "loss": 2.8768, "mean_token_accuracy": 0.4816119741648436, "num_tokens": 22681680.0, "step": 1370 }, { "entropy": 2.087523814290762, "epoch": 1.104, "grad_norm": 0.020751953125, "learning_rate": 1.915777777777778e-05, "loss": 2.8938, "mean_token_accuracy": 0.4846269652247429, "num_tokens": 22848694.0, "step": 1380 }, { "entropy": 2.1188573092222214, "epoch": 1.112, "grad_norm": 0.0087890625, "learning_rate": 1.9135555555555555e-05, "loss": 2.8744, "mean_token_accuracy": 0.48116844836622474, "num_tokens": 23016014.0, "step": 1390 }, { "entropy": 2.071516238898039, "epoch": 1.12, "grad_norm": 0.01544189453125, "learning_rate": 1.9113333333333336e-05, "loss": 2.8453, "mean_token_accuracy": 0.48592273648828266, "num_tokens": 23183889.0, "step": 1400 }, { "epoch": 1.12, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 23183889.0, "eval_coding_runtime": 96.0999, "eval_coding_samples_per_second": 5.203, "eval_coding_steps_per_second": 2.601, "step": 1400 }, { "epoch": 1.12, "eval_physics_entropy": 2.116599133968353, "eval_physics_loss": 2.9798319339752197, "eval_physics_mean_token_accuracy": 0.4795549786090851, "eval_physics_num_tokens": 23183889.0, "eval_physics_runtime": 64.1646, "eval_physics_samples_per_second": 7.792, "eval_physics_steps_per_second": 3.896, "step": 1400 }, { "entropy": 2.0450177520513533, "epoch": 1.1280000000000001, "grad_norm": 0.00823974609375, "learning_rate": 1.9091111111111113e-05, "loss": 2.8314, "mean_token_accuracy": 0.4920926999300718, "num_tokens": 23354096.0, "step": 1410 }, { "entropy": 2.0993775948882103, "epoch": 1.1360000000000001, "grad_norm": 0.0146484375, "learning_rate": 1.906888888888889e-05, "loss": 2.8513, "mean_token_accuracy": 0.4838741825893521, "num_tokens": 23523903.0, "step": 1420 }, { "entropy": 2.0676057398319245, "epoch": 1.144, "grad_norm": 0.0135498046875, "learning_rate": 1.904666666666667e-05, "loss": 2.8661, "mean_token_accuracy": 0.4864560257643461, "num_tokens": 23684571.0, "step": 1430 }, { "entropy": 2.0999209016561506, "epoch": 1.152, "grad_norm": 0.00982666015625, "learning_rate": 1.9024444444444446e-05, "loss": 2.8681, "mean_token_accuracy": 0.4822270443663001, "num_tokens": 23848848.0, "step": 1440 }, { "entropy": 2.110722591727972, "epoch": 1.16, "grad_norm": 0.0086669921875, "learning_rate": 1.9002222222222224e-05, "loss": 2.871, "mean_token_accuracy": 0.48459589947015047, "num_tokens": 24011654.0, "step": 1450 }, { "entropy": 2.181078152358532, "epoch": 1.168, "grad_norm": 0.0098876953125, "learning_rate": 1.898e-05, "loss": 2.943, "mean_token_accuracy": 0.47790372706949713, "num_tokens": 24174589.0, "step": 1460 }, { "entropy": 2.134608647972345, "epoch": 1.176, "grad_norm": 0.0244140625, "learning_rate": 1.895777777777778e-05, "loss": 2.8963, "mean_token_accuracy": 0.4795195382088423, "num_tokens": 24336249.0, "step": 1470 }, { "entropy": 2.0590513460338116, "epoch": 1.184, "grad_norm": 0.025146484375, "learning_rate": 1.8935555555555556e-05, "loss": 2.8476, "mean_token_accuracy": 0.4868919890373945, "num_tokens": 24495981.0, "step": 1480 }, { "entropy": 2.094113527983427, "epoch": 1.192, "grad_norm": 0.021728515625, "learning_rate": 1.8913333333333334e-05, "loss": 2.8476, "mean_token_accuracy": 0.488334871083498, "num_tokens": 24658862.0, "step": 1490 }, { "entropy": 2.066910207271576, "epoch": 1.2, "grad_norm": 0.01214599609375, "learning_rate": 1.8891111111111115e-05, "loss": 2.8177, "mean_token_accuracy": 0.4844546576961875, "num_tokens": 24826245.0, "step": 1500 }, { "epoch": 1.2, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 24826245.0, "eval_coding_runtime": 95.975, "eval_coding_samples_per_second": 5.21, "eval_coding_steps_per_second": 2.605, "step": 1500 }, { "epoch": 1.2, "eval_physics_entropy": 2.116599133968353, "eval_physics_loss": 2.9798319339752197, "eval_physics_mean_token_accuracy": 0.4795549786090851, "eval_physics_num_tokens": 24826245.0, "eval_physics_runtime": 63.6317, "eval_physics_samples_per_second": 7.858, "eval_physics_steps_per_second": 3.929, "step": 1500 }, { "entropy": 2.1044005908071997, "epoch": 1.208, "grad_norm": 0.01177978515625, "learning_rate": 1.886888888888889e-05, "loss": 2.8829, "mean_token_accuracy": 0.48166574332863094, "num_tokens": 24988821.0, "step": 1510 }, { "entropy": 2.1516220718622208, "epoch": 1.216, "grad_norm": 0.012451171875, "learning_rate": 1.884666666666667e-05, "loss": 2.9433, "mean_token_accuracy": 0.47657596562057736, "num_tokens": 25147775.0, "step": 1520 }, { "entropy": 2.1071897082030775, "epoch": 1.224, "grad_norm": 0.00860595703125, "learning_rate": 1.8824444444444445e-05, "loss": 2.8592, "mean_token_accuracy": 0.479968504793942, "num_tokens": 25312652.0, "step": 1530 }, { "entropy": 2.031100772321224, "epoch": 1.232, "grad_norm": 0.006103515625, "learning_rate": 1.8802222222222226e-05, "loss": 2.8016, "mean_token_accuracy": 0.4919817751273513, "num_tokens": 25480033.0, "step": 1540 }, { "entropy": 2.0882932603359223, "epoch": 1.24, "grad_norm": 0.017822265625, "learning_rate": 1.878e-05, "loss": 2.87, "mean_token_accuracy": 0.4867149593308568, "num_tokens": 25643821.0, "step": 1550 }, { "entropy": 2.1069738253951074, "epoch": 1.248, "grad_norm": 0.01275634765625, "learning_rate": 1.875777777777778e-05, "loss": 2.9072, "mean_token_accuracy": 0.48037715051323177, "num_tokens": 25811978.0, "step": 1560 }, { "entropy": 2.1235784739255905, "epoch": 1.256, "grad_norm": 0.0098876953125, "learning_rate": 1.873555555555556e-05, "loss": 2.8361, "mean_token_accuracy": 0.4876450767740607, "num_tokens": 25980217.0, "step": 1570 }, { "entropy": 2.07255465015769, "epoch": 1.264, "grad_norm": 0.0205078125, "learning_rate": 1.8713333333333336e-05, "loss": 2.8226, "mean_token_accuracy": 0.48956795781850815, "num_tokens": 26146299.0, "step": 1580 }, { "entropy": 2.083909762650728, "epoch": 1.272, "grad_norm": 0.006805419921875, "learning_rate": 1.8691111111111114e-05, "loss": 2.8296, "mean_token_accuracy": 0.48632726091891526, "num_tokens": 26315594.0, "step": 1590 }, { "entropy": 2.1811861269176007, "epoch": 1.28, "grad_norm": 0.017333984375, "learning_rate": 1.866888888888889e-05, "loss": 2.9711, "mean_token_accuracy": 0.47178333820775153, "num_tokens": 26478207.0, "step": 1600 }, { "epoch": 1.28, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 26478207.0, "eval_coding_runtime": 95.7162, "eval_coding_samples_per_second": 5.224, "eval_coding_steps_per_second": 2.612, "step": 1600 }, { "epoch": 1.28, "eval_physics_entropy": 2.116599133968353, "eval_physics_loss": 2.9798319339752197, "eval_physics_mean_token_accuracy": 0.4795549786090851, "eval_physics_num_tokens": 26478207.0, "eval_physics_runtime": 63.6631, "eval_physics_samples_per_second": 7.854, "eval_physics_steps_per_second": 3.927, "step": 1600 }, { "entropy": 2.018047058582306, "epoch": 1.288, "grad_norm": 0.015869140625, "learning_rate": 1.864666666666667e-05, "loss": 2.7963, "mean_token_accuracy": 0.49348701871931555, "num_tokens": 26647567.0, "step": 1610 }, { "entropy": 2.1041888780891895, "epoch": 1.296, "grad_norm": 0.0081787109375, "learning_rate": 1.8624444444444446e-05, "loss": 2.8794, "mean_token_accuracy": 0.4833239272236824, "num_tokens": 26806477.0, "step": 1620 }, { "entropy": 2.0873175472021104, "epoch": 1.304, "grad_norm": 0.0169677734375, "learning_rate": 1.8602222222222224e-05, "loss": 2.8534, "mean_token_accuracy": 0.4844379436224699, "num_tokens": 26973124.0, "step": 1630 }, { "entropy": 2.184805364906788, "epoch": 1.312, "grad_norm": 0.0081787109375, "learning_rate": 1.858e-05, "loss": 2.9204, "mean_token_accuracy": 0.4806702647358179, "num_tokens": 27137528.0, "step": 1640 }, { "entropy": 2.1004837110638617, "epoch": 1.32, "grad_norm": 0.0189208984375, "learning_rate": 1.855777777777778e-05, "loss": 2.839, "mean_token_accuracy": 0.4855312593281269, "num_tokens": 27300986.0, "step": 1650 }, { "entropy": 2.1488297916948795, "epoch": 1.328, "grad_norm": 0.00677490234375, "learning_rate": 1.8535555555555557e-05, "loss": 2.876, "mean_token_accuracy": 0.48554224893450737, "num_tokens": 27465859.0, "step": 1660 }, { "entropy": 2.11965301707387, "epoch": 1.336, "grad_norm": 0.0089111328125, "learning_rate": 1.8513333333333335e-05, "loss": 2.909, "mean_token_accuracy": 0.4828299328684807, "num_tokens": 27627148.0, "step": 1670 }, { "entropy": 2.139040616154671, "epoch": 1.3439999999999999, "grad_norm": 0.01214599609375, "learning_rate": 1.8491111111111112e-05, "loss": 2.9133, "mean_token_accuracy": 0.4807636830955744, "num_tokens": 27795568.0, "step": 1680 }, { "entropy": 2.1508292332291603, "epoch": 1.3519999999999999, "grad_norm": 0.00848388671875, "learning_rate": 1.846888888888889e-05, "loss": 2.9009, "mean_token_accuracy": 0.48149400427937505, "num_tokens": 27958162.0, "step": 1690 }, { "entropy": 2.0931077137589456, "epoch": 1.3599999999999999, "grad_norm": 0.0201416015625, "learning_rate": 1.8446666666666667e-05, "loss": 2.8726, "mean_token_accuracy": 0.4807087674736977, "num_tokens": 28129988.0, "step": 1700 }, { "epoch": 1.3599999999999999, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 28129988.0, "eval_coding_runtime": 96.0975, "eval_coding_samples_per_second": 5.203, "eval_coding_steps_per_second": 2.602, "step": 1700 }, { "epoch": 1.3599999999999999, "eval_physics_entropy": 2.116599133968353, "eval_physics_loss": 2.9798319339752197, "eval_physics_mean_token_accuracy": 0.4795549786090851, "eval_physics_num_tokens": 28129988.0, "eval_physics_runtime": 63.7443, "eval_physics_samples_per_second": 7.844, "eval_physics_steps_per_second": 3.922, "step": 1700 }, { "entropy": 2.0795333731919525, "epoch": 1.3679999999999999, "grad_norm": 0.0096435546875, "learning_rate": 1.842444444444445e-05, "loss": 2.8522, "mean_token_accuracy": 0.4887755334377289, "num_tokens": 28299534.0, "step": 1710 }, { "entropy": 2.141296311467886, "epoch": 1.376, "grad_norm": 0.0157470703125, "learning_rate": 1.8402222222222223e-05, "loss": 2.9271, "mean_token_accuracy": 0.4771733907982707, "num_tokens": 28465903.0, "step": 1720 }, { "entropy": 2.0781943283975126, "epoch": 1.384, "grad_norm": 0.01055908203125, "learning_rate": 1.8380000000000004e-05, "loss": 2.8574, "mean_token_accuracy": 0.4866001008078456, "num_tokens": 28639289.0, "step": 1730 }, { "entropy": 2.0997987784445287, "epoch": 1.392, "grad_norm": 0.02099609375, "learning_rate": 1.8357777777777778e-05, "loss": 2.8585, "mean_token_accuracy": 0.48320644851773975, "num_tokens": 28801554.0, "step": 1740 }, { "entropy": 2.150196324288845, "epoch": 1.4, "grad_norm": 0.007354736328125, "learning_rate": 1.833555555555556e-05, "loss": 2.9002, "mean_token_accuracy": 0.48246240708976984, "num_tokens": 28963885.0, "step": 1750 }, { "entropy": 2.084288237988949, "epoch": 1.408, "grad_norm": 0.01806640625, "learning_rate": 1.8313333333333333e-05, "loss": 2.8911, "mean_token_accuracy": 0.4849772537127137, "num_tokens": 29125114.0, "step": 1760 }, { "entropy": 2.1347289800643923, "epoch": 1.416, "grad_norm": 0.0120849609375, "learning_rate": 1.8291111111111114e-05, "loss": 2.9129, "mean_token_accuracy": 0.477182531170547, "num_tokens": 29285272.0, "step": 1770 }, { "entropy": 2.1285498954355715, "epoch": 1.424, "grad_norm": 0.018310546875, "learning_rate": 1.8268888888888888e-05, "loss": 2.9215, "mean_token_accuracy": 0.4772155126556754, "num_tokens": 29454334.0, "step": 1780 }, { "entropy": 2.0727533496916295, "epoch": 1.432, "grad_norm": 0.01336669921875, "learning_rate": 1.824666666666667e-05, "loss": 2.8803, "mean_token_accuracy": 0.4822785761207342, "num_tokens": 29616865.0, "step": 1790 }, { "entropy": 2.0606922268867494, "epoch": 1.44, "grad_norm": 0.00860595703125, "learning_rate": 1.8224444444444447e-05, "loss": 2.8232, "mean_token_accuracy": 0.48800144270062445, "num_tokens": 29789253.0, "step": 1800 }, { "epoch": 1.44, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 29789253.0, "eval_coding_runtime": 96.3208, "eval_coding_samples_per_second": 5.191, "eval_coding_steps_per_second": 2.595, "step": 1800 }, { "epoch": 1.44, "eval_physics_entropy": 2.116599133968353, "eval_physics_loss": 2.9798319339752197, "eval_physics_mean_token_accuracy": 0.4795549786090851, "eval_physics_num_tokens": 29789253.0, "eval_physics_runtime": 63.9922, "eval_physics_samples_per_second": 7.813, "eval_physics_steps_per_second": 3.907, "step": 1800 }, { "entropy": 2.0618888407945635, "epoch": 1.448, "grad_norm": 0.007171630859375, "learning_rate": 1.8202222222222225e-05, "loss": 2.8573, "mean_token_accuracy": 0.4843838458880782, "num_tokens": 29953006.0, "step": 1810 }, { "entropy": 2.022880507260561, "epoch": 1.456, "grad_norm": 0.0194091796875, "learning_rate": 1.8180000000000002e-05, "loss": 2.8031, "mean_token_accuracy": 0.49401772283017636, "num_tokens": 30128576.0, "step": 1820 }, { "entropy": 2.0860828548669814, "epoch": 1.464, "grad_norm": 0.0294189453125, "learning_rate": 1.815777777777778e-05, "loss": 2.8756, "mean_token_accuracy": 0.4849246619269252, "num_tokens": 30288894.0, "step": 1830 }, { "entropy": 2.188418509066105, "epoch": 1.472, "grad_norm": 0.015869140625, "learning_rate": 1.8135555555555557e-05, "loss": 2.9527, "mean_token_accuracy": 0.4721253639087081, "num_tokens": 30452719.0, "step": 1840 }, { "entropy": 2.085013201087713, "epoch": 1.48, "grad_norm": 0.00946044921875, "learning_rate": 1.8113333333333335e-05, "loss": 2.8646, "mean_token_accuracy": 0.486043793335557, "num_tokens": 30620659.0, "step": 1850 }, { "entropy": 2.143638453632593, "epoch": 1.488, "grad_norm": 0.0108642578125, "learning_rate": 1.8091111111111113e-05, "loss": 2.9264, "mean_token_accuracy": 0.47975865937769413, "num_tokens": 30782343.0, "step": 1860 }, { "entropy": 2.110791251808405, "epoch": 1.496, "grad_norm": 0.00689697265625, "learning_rate": 1.806888888888889e-05, "loss": 2.8831, "mean_token_accuracy": 0.4842625314369798, "num_tokens": 30947703.0, "step": 1870 }, { "entropy": 2.1235111109912395, "epoch": 1.504, "grad_norm": 0.0181884765625, "learning_rate": 1.8046666666666668e-05, "loss": 2.8413, "mean_token_accuracy": 0.4860221529379487, "num_tokens": 31113238.0, "step": 1880 }, { "entropy": 2.0822810381650925, "epoch": 1.512, "grad_norm": 0.0125732421875, "learning_rate": 1.8024444444444445e-05, "loss": 2.864, "mean_token_accuracy": 0.48232704903930423, "num_tokens": 31283883.0, "step": 1890 }, { "entropy": 2.1699491038918497, "epoch": 1.52, "grad_norm": 0.0101318359375, "learning_rate": 1.8002222222222223e-05, "loss": 2.9505, "mean_token_accuracy": 0.4706884307786822, "num_tokens": 31444589.0, "step": 1900 }, { "epoch": 1.52, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 31444589.0, "eval_coding_runtime": 95.9376, "eval_coding_samples_per_second": 5.212, "eval_coding_steps_per_second": 2.606, "step": 1900 }, { "epoch": 1.52, "eval_physics_entropy": 2.116599133968353, "eval_physics_loss": 2.9798319339752197, "eval_physics_mean_token_accuracy": 0.4795549786090851, "eval_physics_num_tokens": 31444589.0, "eval_physics_runtime": 63.8135, "eval_physics_samples_per_second": 7.835, "eval_physics_steps_per_second": 3.918, "step": 1900 }, { "entropy": 2.1199583016335963, "epoch": 1.528, "grad_norm": 0.007598876953125, "learning_rate": 1.798e-05, "loss": 2.8928, "mean_token_accuracy": 0.4802051903679967, "num_tokens": 31608952.0, "step": 1910 }, { "entropy": 2.175958313792944, "epoch": 1.536, "grad_norm": 0.0089111328125, "learning_rate": 1.7957777777777778e-05, "loss": 2.9365, "mean_token_accuracy": 0.4747030269354582, "num_tokens": 31774137.0, "step": 1920 }, { "entropy": 2.086700049042702, "epoch": 1.544, "grad_norm": 0.0113525390625, "learning_rate": 1.7935555555555556e-05, "loss": 2.8777, "mean_token_accuracy": 0.4859625767916441, "num_tokens": 31937927.0, "step": 1930 }, { "entropy": 2.1010241121053697, "epoch": 1.552, "grad_norm": 0.0068359375, "learning_rate": 1.7913333333333337e-05, "loss": 2.8916, "mean_token_accuracy": 0.4840227223932743, "num_tokens": 32101776.0, "step": 1940 }, { "entropy": 2.1249936796724795, "epoch": 1.56, "grad_norm": 0.01446533203125, "learning_rate": 1.789111111111111e-05, "loss": 2.8819, "mean_token_accuracy": 0.48242205642163755, "num_tokens": 32271213.0, "step": 1950 }, { "entropy": 2.040369462966919, "epoch": 1.568, "grad_norm": 0.0240478515625, "learning_rate": 1.7868888888888892e-05, "loss": 2.8209, "mean_token_accuracy": 0.48918427657335994, "num_tokens": 32428021.0, "step": 1960 }, { "entropy": 2.2181196875870226, "epoch": 1.576, "grad_norm": 0.0108642578125, "learning_rate": 1.7846666666666666e-05, "loss": 2.9794, "mean_token_accuracy": 0.4697700636461377, "num_tokens": 32585407.0, "step": 1970 }, { "entropy": 2.096325629204512, "epoch": 1.584, "grad_norm": 0.0196533203125, "learning_rate": 1.7824444444444447e-05, "loss": 2.8507, "mean_token_accuracy": 0.4877478454262018, "num_tokens": 32752735.0, "step": 1980 }, { "entropy": 2.1082000881433487, "epoch": 1.592, "grad_norm": 0.0079345703125, "learning_rate": 1.780222222222222e-05, "loss": 2.8789, "mean_token_accuracy": 0.482938090339303, "num_tokens": 32912519.0, "step": 1990 }, { "entropy": 2.139867788553238, "epoch": 1.6, "grad_norm": 0.01007080078125, "learning_rate": 1.7780000000000003e-05, "loss": 2.9136, "mean_token_accuracy": 0.4764457972720265, "num_tokens": 33075822.0, "step": 2000 }, { "epoch": 1.6, "eval_coding_entropy": 1.266241003036499, "eval_coding_loss": 1.2625732421875, "eval_coding_mean_token_accuracy": 0.6838934738636017, "eval_coding_num_tokens": 33075822.0, "eval_coding_runtime": 96.1948, "eval_coding_samples_per_second": 5.198, "eval_coding_steps_per_second": 2.599, "step": 2000 }, { "epoch": 1.6, "eval_physics_entropy": 2.116599133968353, "eval_physics_loss": 2.9798319339752197, "eval_physics_mean_token_accuracy": 0.4795549786090851, "eval_physics_num_tokens": 33075822.0, "eval_physics_runtime": 63.6877, "eval_physics_samples_per_second": 7.851, "eval_physics_steps_per_second": 3.925, "step": 2000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.6802547029475983e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }