| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 4.0, |
| "eval_steps": 100, |
| "global_step": 5000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 2.085290367901325, |
| "epoch": 0.008, |
| "grad_norm": 0.0146484375, |
| "learning_rate": 1.8e-07, |
| "loss": 2.8737, |
| "mean_token_accuracy": 0.48555898685008286, |
| "num_tokens": 173886.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 2.082674165070057, |
| "epoch": 0.016, |
| "grad_norm": 0.00750732421875, |
| "learning_rate": 3.8e-07, |
| "loss": 2.8766, |
| "mean_token_accuracy": 0.48314503096044065, |
| "num_tokens": 338352.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 2.083346585184336, |
| "epoch": 0.024, |
| "grad_norm": 0.01116943359375, |
| "learning_rate": 5.800000000000001e-07, |
| "loss": 2.8206, |
| "mean_token_accuracy": 0.4898203007876873, |
| "num_tokens": 505613.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 2.084573998302221, |
| "epoch": 0.032, |
| "grad_norm": 0.01361083984375, |
| "learning_rate": 7.8e-07, |
| "loss": 2.846, |
| "mean_token_accuracy": 0.4879810094833374, |
| "num_tokens": 671193.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 2.112175610661507, |
| "epoch": 0.04, |
| "grad_norm": 0.0184326171875, |
| "learning_rate": 9.800000000000001e-07, |
| "loss": 2.8945, |
| "mean_token_accuracy": 0.48235367350280284, |
| "num_tokens": 835868.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 2.004745631664991, |
| "epoch": 0.048, |
| "grad_norm": 0.01055908203125, |
| "learning_rate": 1.1800000000000001e-06, |
| "loss": 2.7943, |
| "mean_token_accuracy": 0.4953182445839047, |
| "num_tokens": 1001008.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 2.0760968685150147, |
| "epoch": 0.056, |
| "grad_norm": 0.013427734375, |
| "learning_rate": 1.3800000000000001e-06, |
| "loss": 2.8689, |
| "mean_token_accuracy": 0.48274643896147607, |
| "num_tokens": 1166615.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 2.1421599693596365, |
| "epoch": 0.064, |
| "grad_norm": 0.01226806640625, |
| "learning_rate": 1.5800000000000001e-06, |
| "loss": 2.927, |
| "mean_token_accuracy": 0.47564963400363924, |
| "num_tokens": 1331208.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 2.0629068821668626, |
| "epoch": 0.072, |
| "grad_norm": 0.00994873046875, |
| "learning_rate": 1.7800000000000001e-06, |
| "loss": 2.8669, |
| "mean_token_accuracy": 0.48655857108533385, |
| "num_tokens": 1499449.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 2.1223726868629456, |
| "epoch": 0.08, |
| "grad_norm": 0.0184326171875, |
| "learning_rate": 1.98e-06, |
| "loss": 2.8834, |
| "mean_token_accuracy": 0.48028963077813386, |
| "num_tokens": 1667532.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.08, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 1667532.0, |
| "eval_coding_runtime": 96.0471, |
| "eval_coding_samples_per_second": 5.206, |
| "eval_coding_steps_per_second": 2.603, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.08, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 1667532.0, |
| "eval_physics_runtime": 63.6847, |
| "eval_physics_samples_per_second": 7.851, |
| "eval_physics_steps_per_second": 3.926, |
| "step": 100 |
| }, |
| { |
| "entropy": 2.028460328280926, |
| "epoch": 0.088, |
| "grad_norm": 0.00732421875, |
| "learning_rate": 2.1800000000000003e-06, |
| "loss": 2.7998, |
| "mean_token_accuracy": 0.4919602179899812, |
| "num_tokens": 1835489.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 2.0500536493957044, |
| "epoch": 0.096, |
| "grad_norm": 0.0166015625, |
| "learning_rate": 2.38e-06, |
| "loss": 2.843, |
| "mean_token_accuracy": 0.486666096188128, |
| "num_tokens": 2007176.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 2.133950074762106, |
| "epoch": 0.104, |
| "grad_norm": 0.00897216796875, |
| "learning_rate": 2.5800000000000003e-06, |
| "loss": 2.908, |
| "mean_token_accuracy": 0.47787463404238223, |
| "num_tokens": 2171335.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 2.0415130145847797, |
| "epoch": 0.112, |
| "grad_norm": 0.0125732421875, |
| "learning_rate": 2.7800000000000005e-06, |
| "loss": 2.8213, |
| "mean_token_accuracy": 0.48844432439655067, |
| "num_tokens": 2342259.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 2.19760585129261, |
| "epoch": 0.12, |
| "grad_norm": 0.01519775390625, |
| "learning_rate": 2.9800000000000003e-06, |
| "loss": 2.9587, |
| "mean_token_accuracy": 0.4719714432954788, |
| "num_tokens": 2504279.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 2.133862767368555, |
| "epoch": 0.128, |
| "grad_norm": 0.0101318359375, |
| "learning_rate": 3.1800000000000005e-06, |
| "loss": 2.895, |
| "mean_token_accuracy": 0.48206167966127395, |
| "num_tokens": 2663987.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 2.1023842833936213, |
| "epoch": 0.136, |
| "grad_norm": 0.00836181640625, |
| "learning_rate": 3.3800000000000007e-06, |
| "loss": 2.8679, |
| "mean_token_accuracy": 0.48314376436173917, |
| "num_tokens": 2829720.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 2.096611645817757, |
| "epoch": 0.144, |
| "grad_norm": 0.016845703125, |
| "learning_rate": 3.58e-06, |
| "loss": 2.882, |
| "mean_token_accuracy": 0.4854999572038651, |
| "num_tokens": 2997824.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 2.104377479106188, |
| "epoch": 0.152, |
| "grad_norm": 0.00860595703125, |
| "learning_rate": 3.7800000000000002e-06, |
| "loss": 2.8908, |
| "mean_token_accuracy": 0.48179021421819923, |
| "num_tokens": 3159946.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 2.091322401165962, |
| "epoch": 0.16, |
| "grad_norm": 0.00927734375, |
| "learning_rate": 3.980000000000001e-06, |
| "loss": 2.8428, |
| "mean_token_accuracy": 0.4882024429738522, |
| "num_tokens": 3330597.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.16, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 3330597.0, |
| "eval_coding_runtime": 95.6196, |
| "eval_coding_samples_per_second": 5.229, |
| "eval_coding_steps_per_second": 2.615, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.16, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 3330597.0, |
| "eval_physics_runtime": 63.6492, |
| "eval_physics_samples_per_second": 7.856, |
| "eval_physics_steps_per_second": 3.928, |
| "step": 200 |
| }, |
| { |
| "entropy": 2.142216607928276, |
| "epoch": 0.168, |
| "grad_norm": 0.00701904296875, |
| "learning_rate": 4.18e-06, |
| "loss": 2.9186, |
| "mean_token_accuracy": 0.4779110599309206, |
| "num_tokens": 3496104.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 2.1577497258782388, |
| "epoch": 0.176, |
| "grad_norm": 0.010498046875, |
| "learning_rate": 4.38e-06, |
| "loss": 2.9356, |
| "mean_token_accuracy": 0.4760912239551544, |
| "num_tokens": 3658264.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 2.1311790965497495, |
| "epoch": 0.184, |
| "grad_norm": 0.006317138671875, |
| "learning_rate": 4.58e-06, |
| "loss": 2.8804, |
| "mean_token_accuracy": 0.48113634046167136, |
| "num_tokens": 3821352.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 2.110344947129488, |
| "epoch": 0.192, |
| "grad_norm": 0.018798828125, |
| "learning_rate": 4.78e-06, |
| "loss": 2.8458, |
| "mean_token_accuracy": 0.4866057766601443, |
| "num_tokens": 3995568.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 2.1274008348584177, |
| "epoch": 0.2, |
| "grad_norm": 0.00970458984375, |
| "learning_rate": 4.980000000000001e-06, |
| "loss": 2.9325, |
| "mean_token_accuracy": 0.47856651023030283, |
| "num_tokens": 4160918.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 2.0445496000349523, |
| "epoch": 0.208, |
| "grad_norm": 0.0072021484375, |
| "learning_rate": 5.18e-06, |
| "loss": 2.7895, |
| "mean_token_accuracy": 0.49246302992105484, |
| "num_tokens": 4321436.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 2.118650445342064, |
| "epoch": 0.216, |
| "grad_norm": 0.0118408203125, |
| "learning_rate": 5.380000000000001e-06, |
| "loss": 2.831, |
| "mean_token_accuracy": 0.48565028738230465, |
| "num_tokens": 4482225.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 2.0710766136646273, |
| "epoch": 0.224, |
| "grad_norm": 0.00823974609375, |
| "learning_rate": 5.580000000000001e-06, |
| "loss": 2.8524, |
| "mean_token_accuracy": 0.4889204766601324, |
| "num_tokens": 4648491.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 2.0747790560126305, |
| "epoch": 0.232, |
| "grad_norm": 0.009033203125, |
| "learning_rate": 5.78e-06, |
| "loss": 2.8736, |
| "mean_token_accuracy": 0.4836031161248684, |
| "num_tokens": 4813449.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 2.1116694904863835, |
| "epoch": 0.24, |
| "grad_norm": 0.01141357421875, |
| "learning_rate": 5.98e-06, |
| "loss": 2.8974, |
| "mean_token_accuracy": 0.47815693244338037, |
| "num_tokens": 4986175.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.24, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 4986175.0, |
| "eval_coding_runtime": 95.7299, |
| "eval_coding_samples_per_second": 5.223, |
| "eval_coding_steps_per_second": 2.612, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.24, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 4986175.0, |
| "eval_physics_runtime": 63.6784, |
| "eval_physics_samples_per_second": 7.852, |
| "eval_physics_steps_per_second": 3.926, |
| "step": 300 |
| }, |
| { |
| "entropy": 2.085676248371601, |
| "epoch": 0.248, |
| "grad_norm": 0.01513671875, |
| "learning_rate": 6.18e-06, |
| "loss": 2.8547, |
| "mean_token_accuracy": 0.4871716873720288, |
| "num_tokens": 5164741.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 2.111249604821205, |
| "epoch": 0.256, |
| "grad_norm": 0.01324462890625, |
| "learning_rate": 6.380000000000001e-06, |
| "loss": 2.869, |
| "mean_token_accuracy": 0.482149850204587, |
| "num_tokens": 5329320.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 2.0735500395298003, |
| "epoch": 0.264, |
| "grad_norm": 0.01300048828125, |
| "learning_rate": 6.5800000000000005e-06, |
| "loss": 2.815, |
| "mean_token_accuracy": 0.4895795300602913, |
| "num_tokens": 5497259.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 2.1957527093589304, |
| "epoch": 0.272, |
| "grad_norm": 0.0155029296875, |
| "learning_rate": 6.780000000000001e-06, |
| "loss": 2.9296, |
| "mean_token_accuracy": 0.4737250735983253, |
| "num_tokens": 5658796.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 2.0978702254593373, |
| "epoch": 0.28, |
| "grad_norm": 0.012939453125, |
| "learning_rate": 6.98e-06, |
| "loss": 2.9223, |
| "mean_token_accuracy": 0.47742239478975534, |
| "num_tokens": 5819027.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 2.126638425886631, |
| "epoch": 0.288, |
| "grad_norm": 0.01104736328125, |
| "learning_rate": 7.180000000000001e-06, |
| "loss": 2.8822, |
| "mean_token_accuracy": 0.48392471522092817, |
| "num_tokens": 5980201.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 2.1021372735500337, |
| "epoch": 0.296, |
| "grad_norm": 0.0208740234375, |
| "learning_rate": 7.3800000000000005e-06, |
| "loss": 2.8638, |
| "mean_token_accuracy": 0.4845348121598363, |
| "num_tokens": 6139830.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 2.1106897443532944, |
| "epoch": 0.304, |
| "grad_norm": 0.00787353515625, |
| "learning_rate": 7.58e-06, |
| "loss": 2.8357, |
| "mean_token_accuracy": 0.4834788400679827, |
| "num_tokens": 6298858.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 2.143523923307657, |
| "epoch": 0.312, |
| "grad_norm": 0.00958251953125, |
| "learning_rate": 7.78e-06, |
| "loss": 2.9236, |
| "mean_token_accuracy": 0.47671638112515213, |
| "num_tokens": 6456502.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 2.1132052429020405, |
| "epoch": 0.32, |
| "grad_norm": 0.01336669921875, |
| "learning_rate": 7.980000000000002e-06, |
| "loss": 2.8795, |
| "mean_token_accuracy": 0.48184755612164737, |
| "num_tokens": 6622798.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.32, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 6622798.0, |
| "eval_coding_runtime": 95.588, |
| "eval_coding_samples_per_second": 5.231, |
| "eval_coding_steps_per_second": 2.615, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.32, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 6622798.0, |
| "eval_physics_runtime": 63.5621, |
| "eval_physics_samples_per_second": 7.866, |
| "eval_physics_steps_per_second": 3.933, |
| "step": 400 |
| }, |
| { |
| "entropy": 2.1359100699424745, |
| "epoch": 0.328, |
| "grad_norm": 0.01220703125, |
| "learning_rate": 8.18e-06, |
| "loss": 2.892, |
| "mean_token_accuracy": 0.4762568075209856, |
| "num_tokens": 6782985.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 2.080799935385585, |
| "epoch": 0.336, |
| "grad_norm": 0.021484375, |
| "learning_rate": 8.380000000000001e-06, |
| "loss": 2.8705, |
| "mean_token_accuracy": 0.4826640056446195, |
| "num_tokens": 6953611.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 2.0621030911803246, |
| "epoch": 0.344, |
| "grad_norm": 0.013427734375, |
| "learning_rate": 8.580000000000001e-06, |
| "loss": 2.777, |
| "mean_token_accuracy": 0.48948597013950346, |
| "num_tokens": 7121949.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 2.0954283088445664, |
| "epoch": 0.352, |
| "grad_norm": 0.01080322265625, |
| "learning_rate": 8.78e-06, |
| "loss": 2.8529, |
| "mean_token_accuracy": 0.48598715253174307, |
| "num_tokens": 7290378.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 2.1145325861871243, |
| "epoch": 0.36, |
| "grad_norm": 0.01611328125, |
| "learning_rate": 8.98e-06, |
| "loss": 2.864, |
| "mean_token_accuracy": 0.48475142791867254, |
| "num_tokens": 7452920.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 2.0749227844178675, |
| "epoch": 0.368, |
| "grad_norm": 0.0255126953125, |
| "learning_rate": 9.180000000000002e-06, |
| "loss": 2.842, |
| "mean_token_accuracy": 0.4888931142166257, |
| "num_tokens": 7621082.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 2.158674482256174, |
| "epoch": 0.376, |
| "grad_norm": 0.0189208984375, |
| "learning_rate": 9.38e-06, |
| "loss": 2.8965, |
| "mean_token_accuracy": 0.47893171701580284, |
| "num_tokens": 7787036.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 2.172913119196892, |
| "epoch": 0.384, |
| "grad_norm": 0.0211181640625, |
| "learning_rate": 9.58e-06, |
| "loss": 2.9484, |
| "mean_token_accuracy": 0.47502864580601456, |
| "num_tokens": 7955570.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 2.133708968013525, |
| "epoch": 0.392, |
| "grad_norm": 0.0089111328125, |
| "learning_rate": 9.780000000000001e-06, |
| "loss": 2.9115, |
| "mean_token_accuracy": 0.481097811833024, |
| "num_tokens": 8123527.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 2.075808613747358, |
| "epoch": 0.4, |
| "grad_norm": 0.01458740234375, |
| "learning_rate": 9.980000000000001e-06, |
| "loss": 2.8509, |
| "mean_token_accuracy": 0.4880217058584094, |
| "num_tokens": 8291049.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.4, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 8291049.0, |
| "eval_coding_runtime": 95.8745, |
| "eval_coding_samples_per_second": 5.215, |
| "eval_coding_steps_per_second": 2.608, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.4, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 8291049.0, |
| "eval_physics_runtime": 63.2475, |
| "eval_physics_samples_per_second": 7.905, |
| "eval_physics_steps_per_second": 3.953, |
| "step": 500 |
| }, |
| { |
| "entropy": 2.0808891989290714, |
| "epoch": 0.408, |
| "grad_norm": 0.00933837890625, |
| "learning_rate": 1.018e-05, |
| "loss": 2.8602, |
| "mean_token_accuracy": 0.48418706730008126, |
| "num_tokens": 8459795.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 2.108520568162203, |
| "epoch": 0.416, |
| "grad_norm": 0.01300048828125, |
| "learning_rate": 1.038e-05, |
| "loss": 2.8709, |
| "mean_token_accuracy": 0.4807485779747367, |
| "num_tokens": 8627310.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 2.095228637009859, |
| "epoch": 0.424, |
| "grad_norm": 0.0186767578125, |
| "learning_rate": 1.0580000000000002e-05, |
| "loss": 2.8575, |
| "mean_token_accuracy": 0.4873119568452239, |
| "num_tokens": 8792574.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 2.0733875431120397, |
| "epoch": 0.432, |
| "grad_norm": 0.0152587890625, |
| "learning_rate": 1.0780000000000002e-05, |
| "loss": 2.8746, |
| "mean_token_accuracy": 0.48152947407215835, |
| "num_tokens": 8958371.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 2.0743850864470006, |
| "epoch": 0.44, |
| "grad_norm": 0.0142822265625, |
| "learning_rate": 1.0980000000000002e-05, |
| "loss": 2.8417, |
| "mean_token_accuracy": 0.48614103216677906, |
| "num_tokens": 9125564.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 2.0296142108738424, |
| "epoch": 0.448, |
| "grad_norm": 0.00909423828125, |
| "learning_rate": 1.1180000000000001e-05, |
| "loss": 2.8092, |
| "mean_token_accuracy": 0.49211322125047446, |
| "num_tokens": 9294388.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 2.042448101937771, |
| "epoch": 0.456, |
| "grad_norm": 0.00897216796875, |
| "learning_rate": 1.138e-05, |
| "loss": 2.8196, |
| "mean_token_accuracy": 0.4887328814715147, |
| "num_tokens": 9455428.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 2.124661027640104, |
| "epoch": 0.464, |
| "grad_norm": 0.0172119140625, |
| "learning_rate": 1.1580000000000001e-05, |
| "loss": 2.8825, |
| "mean_token_accuracy": 0.4844032688066363, |
| "num_tokens": 9620366.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 2.0920108027756212, |
| "epoch": 0.472, |
| "grad_norm": 0.01458740234375, |
| "learning_rate": 1.178e-05, |
| "loss": 2.8011, |
| "mean_token_accuracy": 0.49246231019496917, |
| "num_tokens": 9788925.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 2.0895667865872385, |
| "epoch": 0.48, |
| "grad_norm": 0.007568359375, |
| "learning_rate": 1.198e-05, |
| "loss": 2.8784, |
| "mean_token_accuracy": 0.4846471020951867, |
| "num_tokens": 9955431.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.48, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 9955431.0, |
| "eval_coding_runtime": 95.6269, |
| "eval_coding_samples_per_second": 5.229, |
| "eval_coding_steps_per_second": 2.614, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.48, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 9955431.0, |
| "eval_physics_runtime": 63.6584, |
| "eval_physics_samples_per_second": 7.854, |
| "eval_physics_steps_per_second": 3.927, |
| "step": 600 |
| }, |
| { |
| "entropy": 2.1037806943058968, |
| "epoch": 0.488, |
| "grad_norm": 0.01434326171875, |
| "learning_rate": 1.218e-05, |
| "loss": 2.8897, |
| "mean_token_accuracy": 0.48129043709486724, |
| "num_tokens": 10119599.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 2.1045577257871626, |
| "epoch": 0.496, |
| "grad_norm": 0.02294921875, |
| "learning_rate": 1.2380000000000002e-05, |
| "loss": 2.8986, |
| "mean_token_accuracy": 0.4801080621778965, |
| "num_tokens": 10289883.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 2.1075518101453783, |
| "epoch": 0.504, |
| "grad_norm": 0.007354736328125, |
| "learning_rate": 1.2580000000000002e-05, |
| "loss": 2.8519, |
| "mean_token_accuracy": 0.4833444032818079, |
| "num_tokens": 10454683.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 2.106730377674103, |
| "epoch": 0.512, |
| "grad_norm": 0.00823974609375, |
| "learning_rate": 1.2780000000000001e-05, |
| "loss": 2.9125, |
| "mean_token_accuracy": 0.4791097413748503, |
| "num_tokens": 10619468.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 2.149070845544338, |
| "epoch": 0.52, |
| "grad_norm": 0.017822265625, |
| "learning_rate": 1.2980000000000001e-05, |
| "loss": 2.9383, |
| "mean_token_accuracy": 0.4766066731885076, |
| "num_tokens": 10781285.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 2.109221810847521, |
| "epoch": 0.528, |
| "grad_norm": 0.01123046875, |
| "learning_rate": 1.3180000000000001e-05, |
| "loss": 2.8512, |
| "mean_token_accuracy": 0.48691278155893086, |
| "num_tokens": 10944904.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 2.10501976236701, |
| "epoch": 0.536, |
| "grad_norm": 0.0079345703125, |
| "learning_rate": 1.3380000000000002e-05, |
| "loss": 2.8583, |
| "mean_token_accuracy": 0.48396560084074736, |
| "num_tokens": 11106538.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 2.1422630779445173, |
| "epoch": 0.544, |
| "grad_norm": 0.013916015625, |
| "learning_rate": 1.3580000000000002e-05, |
| "loss": 2.9198, |
| "mean_token_accuracy": 0.4756466532126069, |
| "num_tokens": 11266409.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 2.066490865498781, |
| "epoch": 0.552, |
| "grad_norm": 0.0120849609375, |
| "learning_rate": 1.378e-05, |
| "loss": 2.8749, |
| "mean_token_accuracy": 0.48253467567265035, |
| "num_tokens": 11438391.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 2.1218110136687756, |
| "epoch": 0.56, |
| "grad_norm": 0.00811767578125, |
| "learning_rate": 1.398e-05, |
| "loss": 2.8685, |
| "mean_token_accuracy": 0.4830782825127244, |
| "num_tokens": 11605544.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.56, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 11605544.0, |
| "eval_coding_runtime": 95.6063, |
| "eval_coding_samples_per_second": 5.23, |
| "eval_coding_steps_per_second": 2.615, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.56, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 11605544.0, |
| "eval_physics_runtime": 63.652, |
| "eval_physics_samples_per_second": 7.855, |
| "eval_physics_steps_per_second": 3.928, |
| "step": 700 |
| }, |
| { |
| "entropy": 2.0374118871986866, |
| "epoch": 0.568, |
| "grad_norm": 0.0164794921875, |
| "learning_rate": 1.418e-05, |
| "loss": 2.8443, |
| "mean_token_accuracy": 0.48784226551651955, |
| "num_tokens": 11765430.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 2.079664882272482, |
| "epoch": 0.576, |
| "grad_norm": 0.01336669921875, |
| "learning_rate": 1.4380000000000001e-05, |
| "loss": 2.8436, |
| "mean_token_accuracy": 0.4859828842803836, |
| "num_tokens": 11931715.0, |
| "step": 720 |
| }, |
| { |
| "entropy": 2.091735219210386, |
| "epoch": 0.584, |
| "grad_norm": 0.00836181640625, |
| "learning_rate": 1.4580000000000001e-05, |
| "loss": 2.891, |
| "mean_token_accuracy": 0.4845219163224101, |
| "num_tokens": 12093686.0, |
| "step": 730 |
| }, |
| { |
| "entropy": 2.092084125429392, |
| "epoch": 0.592, |
| "grad_norm": 0.01153564453125, |
| "learning_rate": 1.478e-05, |
| "loss": 2.8884, |
| "mean_token_accuracy": 0.4797184634953737, |
| "num_tokens": 12257243.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 2.138279449939728, |
| "epoch": 0.6, |
| "grad_norm": 0.01043701171875, |
| "learning_rate": 1.498e-05, |
| "loss": 2.8937, |
| "mean_token_accuracy": 0.4808414502069354, |
| "num_tokens": 12422008.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 2.025119251012802, |
| "epoch": 0.608, |
| "grad_norm": 0.01422119140625, |
| "learning_rate": 1.5180000000000002e-05, |
| "loss": 2.801, |
| "mean_token_accuracy": 0.4929833130910993, |
| "num_tokens": 12583154.0, |
| "step": 760 |
| }, |
| { |
| "entropy": 2.068293011188507, |
| "epoch": 0.616, |
| "grad_norm": 0.018798828125, |
| "learning_rate": 1.5380000000000002e-05, |
| "loss": 2.8332, |
| "mean_token_accuracy": 0.4890221064910293, |
| "num_tokens": 12746458.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 2.1028603583574297, |
| "epoch": 0.624, |
| "grad_norm": 0.009033203125, |
| "learning_rate": 1.5580000000000003e-05, |
| "loss": 2.9238, |
| "mean_token_accuracy": 0.4755176778882742, |
| "num_tokens": 12905392.0, |
| "step": 780 |
| }, |
| { |
| "entropy": 2.103465436398983, |
| "epoch": 0.632, |
| "grad_norm": 0.012939453125, |
| "learning_rate": 1.578e-05, |
| "loss": 2.8914, |
| "mean_token_accuracy": 0.48264262825250626, |
| "num_tokens": 13071058.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 2.107102530449629, |
| "epoch": 0.64, |
| "grad_norm": 0.01373291015625, |
| "learning_rate": 1.5980000000000003e-05, |
| "loss": 2.8937, |
| "mean_token_accuracy": 0.4812551226466894, |
| "num_tokens": 13232198.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.64, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 13232198.0, |
| "eval_coding_runtime": 95.5879, |
| "eval_coding_samples_per_second": 5.231, |
| "eval_coding_steps_per_second": 2.615, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.64, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 13232198.0, |
| "eval_physics_runtime": 63.5241, |
| "eval_physics_samples_per_second": 7.871, |
| "eval_physics_steps_per_second": 3.936, |
| "step": 800 |
| }, |
| { |
| "entropy": 2.1226571217179298, |
| "epoch": 0.648, |
| "grad_norm": 0.010009765625, |
| "learning_rate": 1.618e-05, |
| "loss": 2.8715, |
| "mean_token_accuracy": 0.48374479077756405, |
| "num_tokens": 13401064.0, |
| "step": 810 |
| }, |
| { |
| "entropy": 2.071519161760807, |
| "epoch": 0.656, |
| "grad_norm": 0.007476806640625, |
| "learning_rate": 1.638e-05, |
| "loss": 2.8209, |
| "mean_token_accuracy": 0.48990428987890483, |
| "num_tokens": 13575902.0, |
| "step": 820 |
| }, |
| { |
| "entropy": 2.1139444075524807, |
| "epoch": 0.664, |
| "grad_norm": 0.008544921875, |
| "learning_rate": 1.658e-05, |
| "loss": 2.9185, |
| "mean_token_accuracy": 0.48024304024875164, |
| "num_tokens": 13733192.0, |
| "step": 830 |
| }, |
| { |
| "entropy": 2.1097038336098195, |
| "epoch": 0.672, |
| "grad_norm": 0.0201416015625, |
| "learning_rate": 1.6780000000000002e-05, |
| "loss": 2.856, |
| "mean_token_accuracy": 0.48414181135594847, |
| "num_tokens": 13895997.0, |
| "step": 840 |
| }, |
| { |
| "entropy": 2.0350618422031403, |
| "epoch": 0.68, |
| "grad_norm": 0.007354736328125, |
| "learning_rate": 1.698e-05, |
| "loss": 2.8006, |
| "mean_token_accuracy": 0.49249137472361326, |
| "num_tokens": 14074579.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 2.1360582523047924, |
| "epoch": 0.688, |
| "grad_norm": 0.012939453125, |
| "learning_rate": 1.718e-05, |
| "loss": 2.9135, |
| "mean_token_accuracy": 0.48240375574678185, |
| "num_tokens": 14234888.0, |
| "step": 860 |
| }, |
| { |
| "entropy": 2.0854096628725527, |
| "epoch": 0.696, |
| "grad_norm": 0.0159912109375, |
| "learning_rate": 1.7380000000000003e-05, |
| "loss": 2.8889, |
| "mean_token_accuracy": 0.48132998365908863, |
| "num_tokens": 14411150.0, |
| "step": 870 |
| }, |
| { |
| "entropy": 2.1598948813974856, |
| "epoch": 0.704, |
| "grad_norm": 0.0067138671875, |
| "learning_rate": 1.758e-05, |
| "loss": 2.941, |
| "mean_token_accuracy": 0.47370197977870704, |
| "num_tokens": 14567908.0, |
| "step": 880 |
| }, |
| { |
| "entropy": 2.1014511175453663, |
| "epoch": 0.712, |
| "grad_norm": 0.01458740234375, |
| "learning_rate": 1.7780000000000003e-05, |
| "loss": 2.8989, |
| "mean_token_accuracy": 0.4840679431334138, |
| "num_tokens": 14728975.0, |
| "step": 890 |
| }, |
| { |
| "entropy": 2.128746274113655, |
| "epoch": 0.72, |
| "grad_norm": 0.01361083984375, |
| "learning_rate": 1.798e-05, |
| "loss": 2.8664, |
| "mean_token_accuracy": 0.4867586812004447, |
| "num_tokens": 14882927.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.72, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 14882927.0, |
| "eval_coding_runtime": 95.8303, |
| "eval_coding_samples_per_second": 5.218, |
| "eval_coding_steps_per_second": 2.609, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.72, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 14882927.0, |
| "eval_physics_runtime": 63.6666, |
| "eval_physics_samples_per_second": 7.853, |
| "eval_physics_steps_per_second": 3.927, |
| "step": 900 |
| }, |
| { |
| "entropy": 2.0901576526463033, |
| "epoch": 0.728, |
| "grad_norm": 0.0067138671875, |
| "learning_rate": 1.8180000000000002e-05, |
| "loss": 2.8708, |
| "mean_token_accuracy": 0.48403808698058126, |
| "num_tokens": 15052770.0, |
| "step": 910 |
| }, |
| { |
| "entropy": 2.122075318545103, |
| "epoch": 0.736, |
| "grad_norm": 0.01129150390625, |
| "learning_rate": 1.8380000000000004e-05, |
| "loss": 2.8633, |
| "mean_token_accuracy": 0.4852804895490408, |
| "num_tokens": 15217342.0, |
| "step": 920 |
| }, |
| { |
| "entropy": 2.093779005110264, |
| "epoch": 0.744, |
| "grad_norm": 0.0079345703125, |
| "learning_rate": 1.858e-05, |
| "loss": 2.8451, |
| "mean_token_accuracy": 0.48721388857811687, |
| "num_tokens": 15385161.0, |
| "step": 930 |
| }, |
| { |
| "entropy": 2.131568420678377, |
| "epoch": 0.752, |
| "grad_norm": 0.0096435546875, |
| "learning_rate": 1.878e-05, |
| "loss": 2.9118, |
| "mean_token_accuracy": 0.479784238897264, |
| "num_tokens": 15547710.0, |
| "step": 940 |
| }, |
| { |
| "entropy": 2.18331568017602, |
| "epoch": 0.76, |
| "grad_norm": 0.007171630859375, |
| "learning_rate": 1.898e-05, |
| "loss": 2.9565, |
| "mean_token_accuracy": 0.4732973873615265, |
| "num_tokens": 15704494.0, |
| "step": 950 |
| }, |
| { |
| "entropy": 2.1320055924355983, |
| "epoch": 0.768, |
| "grad_norm": 0.01177978515625, |
| "learning_rate": 1.918e-05, |
| "loss": 2.8867, |
| "mean_token_accuracy": 0.4821720516309142, |
| "num_tokens": 15877177.0, |
| "step": 960 |
| }, |
| { |
| "entropy": 2.035961114615202, |
| "epoch": 0.776, |
| "grad_norm": 0.008544921875, |
| "learning_rate": 1.938e-05, |
| "loss": 2.8155, |
| "mean_token_accuracy": 0.4874297235161066, |
| "num_tokens": 16050530.0, |
| "step": 970 |
| }, |
| { |
| "entropy": 2.0721310496330263, |
| "epoch": 0.784, |
| "grad_norm": 0.01214599609375, |
| "learning_rate": 1.9580000000000002e-05, |
| "loss": 2.8311, |
| "mean_token_accuracy": 0.4866502169519663, |
| "num_tokens": 16219640.0, |
| "step": 980 |
| }, |
| { |
| "entropy": 2.1055771827697756, |
| "epoch": 0.792, |
| "grad_norm": 0.00927734375, |
| "learning_rate": 1.978e-05, |
| "loss": 2.8748, |
| "mean_token_accuracy": 0.4852980706840754, |
| "num_tokens": 16378820.0, |
| "step": 990 |
| }, |
| { |
| "entropy": 2.0726777322590353, |
| "epoch": 0.8, |
| "grad_norm": 0.01348876953125, |
| "learning_rate": 1.9980000000000002e-05, |
| "loss": 2.8588, |
| "mean_token_accuracy": 0.4857082888484001, |
| "num_tokens": 16548261.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.8, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 16548261.0, |
| "eval_coding_runtime": 95.9331, |
| "eval_coding_samples_per_second": 5.212, |
| "eval_coding_steps_per_second": 2.606, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.8, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 16548261.0, |
| "eval_physics_runtime": 63.1527, |
| "eval_physics_samples_per_second": 7.917, |
| "eval_physics_steps_per_second": 3.959, |
| "step": 1000 |
| }, |
| { |
| "entropy": 2.1674983091652393, |
| "epoch": 0.808, |
| "grad_norm": 0.0155029296875, |
| "learning_rate": 1.9980000000000002e-05, |
| "loss": 2.9179, |
| "mean_token_accuracy": 0.4778887365013361, |
| "num_tokens": 16711667.0, |
| "step": 1010 |
| }, |
| { |
| "entropy": 2.082773020863533, |
| "epoch": 0.816, |
| "grad_norm": 0.00738525390625, |
| "learning_rate": 1.995777777777778e-05, |
| "loss": 2.8647, |
| "mean_token_accuracy": 0.48503445759415625, |
| "num_tokens": 16870533.0, |
| "step": 1020 |
| }, |
| { |
| "entropy": 2.1195092521607877, |
| "epoch": 0.824, |
| "grad_norm": 0.0084228515625, |
| "learning_rate": 1.9935555555555557e-05, |
| "loss": 2.8734, |
| "mean_token_accuracy": 0.4803088143467903, |
| "num_tokens": 17035600.0, |
| "step": 1030 |
| }, |
| { |
| "entropy": 2.1006917230784894, |
| "epoch": 0.832, |
| "grad_norm": 0.01318359375, |
| "learning_rate": 1.9913333333333335e-05, |
| "loss": 2.8807, |
| "mean_token_accuracy": 0.48227673936635257, |
| "num_tokens": 17196371.0, |
| "step": 1040 |
| }, |
| { |
| "entropy": 2.1442659996449946, |
| "epoch": 0.84, |
| "grad_norm": 0.01171875, |
| "learning_rate": 1.9891111111111112e-05, |
| "loss": 2.891, |
| "mean_token_accuracy": 0.48074845802038907, |
| "num_tokens": 17356919.0, |
| "step": 1050 |
| }, |
| { |
| "entropy": 2.0704655081033705, |
| "epoch": 0.848, |
| "grad_norm": 0.0177001953125, |
| "learning_rate": 1.986888888888889e-05, |
| "loss": 2.8486, |
| "mean_token_accuracy": 0.4880160320550203, |
| "num_tokens": 17527066.0, |
| "step": 1060 |
| }, |
| { |
| "entropy": 2.125109201669693, |
| "epoch": 0.856, |
| "grad_norm": 0.00823974609375, |
| "learning_rate": 1.9846666666666668e-05, |
| "loss": 2.8795, |
| "mean_token_accuracy": 0.4847541594877839, |
| "num_tokens": 17689902.0, |
| "step": 1070 |
| }, |
| { |
| "entropy": 2.1320445723831654, |
| "epoch": 0.864, |
| "grad_norm": 0.0140380859375, |
| "learning_rate": 1.9824444444444445e-05, |
| "loss": 2.8809, |
| "mean_token_accuracy": 0.480629575625062, |
| "num_tokens": 17859605.0, |
| "step": 1080 |
| }, |
| { |
| "entropy": 2.1207774586975576, |
| "epoch": 0.872, |
| "grad_norm": 0.0140380859375, |
| "learning_rate": 1.9802222222222226e-05, |
| "loss": 2.8729, |
| "mean_token_accuracy": 0.4811369523406029, |
| "num_tokens": 18019924.0, |
| "step": 1090 |
| }, |
| { |
| "entropy": 2.1364660277962684, |
| "epoch": 0.88, |
| "grad_norm": 0.01361083984375, |
| "learning_rate": 1.978e-05, |
| "loss": 2.8681, |
| "mean_token_accuracy": 0.4821885580196977, |
| "num_tokens": 18187315.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.88, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 18187315.0, |
| "eval_coding_runtime": 95.6457, |
| "eval_coding_samples_per_second": 5.228, |
| "eval_coding_steps_per_second": 2.614, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.88, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 18187315.0, |
| "eval_physics_runtime": 63.6627, |
| "eval_physics_samples_per_second": 7.854, |
| "eval_physics_steps_per_second": 3.927, |
| "step": 1100 |
| }, |
| { |
| "entropy": 1.9653398767113686, |
| "epoch": 0.888, |
| "grad_norm": 0.009033203125, |
| "learning_rate": 1.975777777777778e-05, |
| "loss": 2.7728, |
| "mean_token_accuracy": 0.4965917535126209, |
| "num_tokens": 18366329.0, |
| "step": 1110 |
| }, |
| { |
| "entropy": 2.0933822274208067, |
| "epoch": 0.896, |
| "grad_norm": 0.0191650390625, |
| "learning_rate": 1.9735555555555556e-05, |
| "loss": 2.8728, |
| "mean_token_accuracy": 0.4872217344120145, |
| "num_tokens": 18536790.0, |
| "step": 1120 |
| }, |
| { |
| "entropy": 2.1074513986706735, |
| "epoch": 0.904, |
| "grad_norm": 0.007049560546875, |
| "learning_rate": 1.9713333333333337e-05, |
| "loss": 2.9176, |
| "mean_token_accuracy": 0.48107887227088214, |
| "num_tokens": 18701809.0, |
| "step": 1130 |
| }, |
| { |
| "entropy": 2.160377311706543, |
| "epoch": 0.912, |
| "grad_norm": 0.006683349609375, |
| "learning_rate": 1.969111111111111e-05, |
| "loss": 2.9384, |
| "mean_token_accuracy": 0.4739804005250335, |
| "num_tokens": 18869344.0, |
| "step": 1140 |
| }, |
| { |
| "entropy": 2.0753089264035225, |
| "epoch": 0.92, |
| "grad_norm": 0.01416015625, |
| "learning_rate": 1.9668888888888892e-05, |
| "loss": 2.882, |
| "mean_token_accuracy": 0.48446453278884294, |
| "num_tokens": 19033910.0, |
| "step": 1150 |
| }, |
| { |
| "entropy": 2.1365489259362223, |
| "epoch": 0.928, |
| "grad_norm": 0.006988525390625, |
| "learning_rate": 1.9646666666666666e-05, |
| "loss": 2.8854, |
| "mean_token_accuracy": 0.4823284359648824, |
| "num_tokens": 19199834.0, |
| "step": 1160 |
| }, |
| { |
| "entropy": 2.0710911326110364, |
| "epoch": 0.936, |
| "grad_norm": 0.007080078125, |
| "learning_rate": 1.9624444444444447e-05, |
| "loss": 2.8127, |
| "mean_token_accuracy": 0.48797463029623034, |
| "num_tokens": 19361703.0, |
| "step": 1170 |
| }, |
| { |
| "entropy": 2.116609390079975, |
| "epoch": 0.944, |
| "grad_norm": 0.007110595703125, |
| "learning_rate": 1.9602222222222225e-05, |
| "loss": 2.8751, |
| "mean_token_accuracy": 0.48552187960594895, |
| "num_tokens": 19525385.0, |
| "step": 1180 |
| }, |
| { |
| "entropy": 2.0949166089296343, |
| "epoch": 0.952, |
| "grad_norm": 0.03076171875, |
| "learning_rate": 1.9580000000000002e-05, |
| "loss": 2.8499, |
| "mean_token_accuracy": 0.48639670219272374, |
| "num_tokens": 19690715.0, |
| "step": 1190 |
| }, |
| { |
| "entropy": 2.1095651358366014, |
| "epoch": 0.96, |
| "grad_norm": 0.015380859375, |
| "learning_rate": 1.955777777777778e-05, |
| "loss": 2.8706, |
| "mean_token_accuracy": 0.48429451026022435, |
| "num_tokens": 19856973.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.96, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 19856973.0, |
| "eval_coding_runtime": 95.6684, |
| "eval_coding_samples_per_second": 5.226, |
| "eval_coding_steps_per_second": 2.613, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.96, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 19856973.0, |
| "eval_physics_runtime": 63.6635, |
| "eval_physics_samples_per_second": 7.854, |
| "eval_physics_steps_per_second": 3.927, |
| "step": 1200 |
| }, |
| { |
| "entropy": 2.0797565251588823, |
| "epoch": 0.968, |
| "grad_norm": 0.0091552734375, |
| "learning_rate": 1.9535555555555557e-05, |
| "loss": 2.8871, |
| "mean_token_accuracy": 0.487212173268199, |
| "num_tokens": 20021093.0, |
| "step": 1210 |
| }, |
| { |
| "entropy": 2.0395397424697874, |
| "epoch": 0.976, |
| "grad_norm": 0.01397705078125, |
| "learning_rate": 1.9513333333333335e-05, |
| "loss": 2.8149, |
| "mean_token_accuracy": 0.4898043507710099, |
| "num_tokens": 20190600.0, |
| "step": 1220 |
| }, |
| { |
| "entropy": 2.170710327476263, |
| "epoch": 0.984, |
| "grad_norm": 0.0123291015625, |
| "learning_rate": 1.9491111111111113e-05, |
| "loss": 2.9186, |
| "mean_token_accuracy": 0.4756599869579077, |
| "num_tokens": 20352255.0, |
| "step": 1230 |
| }, |
| { |
| "entropy": 2.074953277409077, |
| "epoch": 0.992, |
| "grad_norm": 0.01092529296875, |
| "learning_rate": 1.946888888888889e-05, |
| "loss": 2.8591, |
| "mean_token_accuracy": 0.48683627638965843, |
| "num_tokens": 20518014.0, |
| "step": 1240 |
| }, |
| { |
| "entropy": 2.1222646929323674, |
| "epoch": 1.0, |
| "grad_norm": 0.00909423828125, |
| "learning_rate": 1.9446666666666668e-05, |
| "loss": 2.8802, |
| "mean_token_accuracy": 0.4825377007946372, |
| "num_tokens": 20686804.0, |
| "step": 1250 |
| }, |
| { |
| "entropy": 2.071950024366379, |
| "epoch": 1.008, |
| "grad_norm": 0.01141357421875, |
| "learning_rate": 1.9424444444444446e-05, |
| "loss": 2.8335, |
| "mean_token_accuracy": 0.4894521238282323, |
| "num_tokens": 20859215.0, |
| "step": 1260 |
| }, |
| { |
| "entropy": 2.178350891917944, |
| "epoch": 1.016, |
| "grad_norm": 0.0086669921875, |
| "learning_rate": 1.9402222222222223e-05, |
| "loss": 2.9515, |
| "mean_token_accuracy": 0.475351152010262, |
| "num_tokens": 21019542.0, |
| "step": 1270 |
| }, |
| { |
| "entropy": 2.0342648893594744, |
| "epoch": 1.024, |
| "grad_norm": 0.01483154296875, |
| "learning_rate": 1.938e-05, |
| "loss": 2.8036, |
| "mean_token_accuracy": 0.4928664604201913, |
| "num_tokens": 21186658.0, |
| "step": 1280 |
| }, |
| { |
| "entropy": 2.0908240154385567, |
| "epoch": 1.032, |
| "grad_norm": 0.00897216796875, |
| "learning_rate": 1.935777777777778e-05, |
| "loss": 2.8868, |
| "mean_token_accuracy": 0.483198188431561, |
| "num_tokens": 21351715.0, |
| "step": 1290 |
| }, |
| { |
| "entropy": 2.1210865639150143, |
| "epoch": 1.04, |
| "grad_norm": 0.008544921875, |
| "learning_rate": 1.9335555555555556e-05, |
| "loss": 2.9071, |
| "mean_token_accuracy": 0.47981371227651837, |
| "num_tokens": 21517237.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.04, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 21517237.0, |
| "eval_coding_runtime": 96.1911, |
| "eval_coding_samples_per_second": 5.198, |
| "eval_coding_steps_per_second": 2.599, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.04, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 21517237.0, |
| "eval_physics_runtime": 63.9611, |
| "eval_physics_samples_per_second": 7.817, |
| "eval_physics_steps_per_second": 3.909, |
| "step": 1300 |
| }, |
| { |
| "entropy": 2.102020622044802, |
| "epoch": 1.048, |
| "grad_norm": 0.007415771484375, |
| "learning_rate": 1.9313333333333334e-05, |
| "loss": 2.8719, |
| "mean_token_accuracy": 0.4847179941833019, |
| "num_tokens": 21682780.0, |
| "step": 1310 |
| }, |
| { |
| "entropy": 2.045586694777012, |
| "epoch": 1.056, |
| "grad_norm": 0.0086669921875, |
| "learning_rate": 1.9291111111111115e-05, |
| "loss": 2.8189, |
| "mean_token_accuracy": 0.48618846032768487, |
| "num_tokens": 21852528.0, |
| "step": 1320 |
| }, |
| { |
| "entropy": 2.0868709199130535, |
| "epoch": 1.064, |
| "grad_norm": 0.01226806640625, |
| "learning_rate": 1.926888888888889e-05, |
| "loss": 2.8313, |
| "mean_token_accuracy": 0.4878941228613257, |
| "num_tokens": 22021514.0, |
| "step": 1330 |
| }, |
| { |
| "entropy": 2.111675312370062, |
| "epoch": 1.072, |
| "grad_norm": 0.00909423828125, |
| "learning_rate": 1.924666666666667e-05, |
| "loss": 2.889, |
| "mean_token_accuracy": 0.4821753781288862, |
| "num_tokens": 22192380.0, |
| "step": 1340 |
| }, |
| { |
| "entropy": 2.080042862892151, |
| "epoch": 1.08, |
| "grad_norm": 0.02294921875, |
| "learning_rate": 1.9224444444444444e-05, |
| "loss": 2.8401, |
| "mean_token_accuracy": 0.4857699448242784, |
| "num_tokens": 22356835.0, |
| "step": 1350 |
| }, |
| { |
| "entropy": 2.138657582551241, |
| "epoch": 1.088, |
| "grad_norm": 0.006927490234375, |
| "learning_rate": 1.9202222222222225e-05, |
| "loss": 2.9223, |
| "mean_token_accuracy": 0.4771748060360551, |
| "num_tokens": 22517971.0, |
| "step": 1360 |
| }, |
| { |
| "entropy": 2.1065418414771555, |
| "epoch": 1.096, |
| "grad_norm": 0.01422119140625, |
| "learning_rate": 1.918e-05, |
| "loss": 2.8768, |
| "mean_token_accuracy": 0.4816119741648436, |
| "num_tokens": 22681680.0, |
| "step": 1370 |
| }, |
| { |
| "entropy": 2.087523814290762, |
| "epoch": 1.104, |
| "grad_norm": 0.020751953125, |
| "learning_rate": 1.915777777777778e-05, |
| "loss": 2.8938, |
| "mean_token_accuracy": 0.4846269652247429, |
| "num_tokens": 22848694.0, |
| "step": 1380 |
| }, |
| { |
| "entropy": 2.1188573092222214, |
| "epoch": 1.112, |
| "grad_norm": 0.0087890625, |
| "learning_rate": 1.9135555555555555e-05, |
| "loss": 2.8744, |
| "mean_token_accuracy": 0.48116844836622474, |
| "num_tokens": 23016014.0, |
| "step": 1390 |
| }, |
| { |
| "entropy": 2.071516238898039, |
| "epoch": 1.12, |
| "grad_norm": 0.01544189453125, |
| "learning_rate": 1.9113333333333336e-05, |
| "loss": 2.8453, |
| "mean_token_accuracy": 0.48592273648828266, |
| "num_tokens": 23183889.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.12, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 23183889.0, |
| "eval_coding_runtime": 96.0999, |
| "eval_coding_samples_per_second": 5.203, |
| "eval_coding_steps_per_second": 2.601, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.12, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 23183889.0, |
| "eval_physics_runtime": 64.1646, |
| "eval_physics_samples_per_second": 7.792, |
| "eval_physics_steps_per_second": 3.896, |
| "step": 1400 |
| }, |
| { |
| "entropy": 2.0450177520513533, |
| "epoch": 1.1280000000000001, |
| "grad_norm": 0.00823974609375, |
| "learning_rate": 1.9091111111111113e-05, |
| "loss": 2.8314, |
| "mean_token_accuracy": 0.4920926999300718, |
| "num_tokens": 23354096.0, |
| "step": 1410 |
| }, |
| { |
| "entropy": 2.0993775948882103, |
| "epoch": 1.1360000000000001, |
| "grad_norm": 0.0146484375, |
| "learning_rate": 1.906888888888889e-05, |
| "loss": 2.8513, |
| "mean_token_accuracy": 0.4838741825893521, |
| "num_tokens": 23523903.0, |
| "step": 1420 |
| }, |
| { |
| "entropy": 2.0676057398319245, |
| "epoch": 1.144, |
| "grad_norm": 0.0135498046875, |
| "learning_rate": 1.904666666666667e-05, |
| "loss": 2.8661, |
| "mean_token_accuracy": 0.4864560257643461, |
| "num_tokens": 23684571.0, |
| "step": 1430 |
| }, |
| { |
| "entropy": 2.0999209016561506, |
| "epoch": 1.152, |
| "grad_norm": 0.00982666015625, |
| "learning_rate": 1.9024444444444446e-05, |
| "loss": 2.8681, |
| "mean_token_accuracy": 0.4822270443663001, |
| "num_tokens": 23848848.0, |
| "step": 1440 |
| }, |
| { |
| "entropy": 2.110722591727972, |
| "epoch": 1.16, |
| "grad_norm": 0.0086669921875, |
| "learning_rate": 1.9002222222222224e-05, |
| "loss": 2.871, |
| "mean_token_accuracy": 0.48459589947015047, |
| "num_tokens": 24011654.0, |
| "step": 1450 |
| }, |
| { |
| "entropy": 2.181078152358532, |
| "epoch": 1.168, |
| "grad_norm": 0.0098876953125, |
| "learning_rate": 1.898e-05, |
| "loss": 2.943, |
| "mean_token_accuracy": 0.47790372706949713, |
| "num_tokens": 24174589.0, |
| "step": 1460 |
| }, |
| { |
| "entropy": 2.134608647972345, |
| "epoch": 1.176, |
| "grad_norm": 0.0244140625, |
| "learning_rate": 1.895777777777778e-05, |
| "loss": 2.8963, |
| "mean_token_accuracy": 0.4795195382088423, |
| "num_tokens": 24336249.0, |
| "step": 1470 |
| }, |
| { |
| "entropy": 2.0590513460338116, |
| "epoch": 1.184, |
| "grad_norm": 0.025146484375, |
| "learning_rate": 1.8935555555555556e-05, |
| "loss": 2.8476, |
| "mean_token_accuracy": 0.4868919890373945, |
| "num_tokens": 24495981.0, |
| "step": 1480 |
| }, |
| { |
| "entropy": 2.094113527983427, |
| "epoch": 1.192, |
| "grad_norm": 0.021728515625, |
| "learning_rate": 1.8913333333333334e-05, |
| "loss": 2.8476, |
| "mean_token_accuracy": 0.488334871083498, |
| "num_tokens": 24658862.0, |
| "step": 1490 |
| }, |
| { |
| "entropy": 2.066910207271576, |
| "epoch": 1.2, |
| "grad_norm": 0.01214599609375, |
| "learning_rate": 1.8891111111111115e-05, |
| "loss": 2.8177, |
| "mean_token_accuracy": 0.4844546576961875, |
| "num_tokens": 24826245.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.2, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 24826245.0, |
| "eval_coding_runtime": 95.975, |
| "eval_coding_samples_per_second": 5.21, |
| "eval_coding_steps_per_second": 2.605, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.2, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 24826245.0, |
| "eval_physics_runtime": 63.6317, |
| "eval_physics_samples_per_second": 7.858, |
| "eval_physics_steps_per_second": 3.929, |
| "step": 1500 |
| }, |
| { |
| "entropy": 2.1044005908071997, |
| "epoch": 1.208, |
| "grad_norm": 0.01177978515625, |
| "learning_rate": 1.886888888888889e-05, |
| "loss": 2.8829, |
| "mean_token_accuracy": 0.48166574332863094, |
| "num_tokens": 24988821.0, |
| "step": 1510 |
| }, |
| { |
| "entropy": 2.1516220718622208, |
| "epoch": 1.216, |
| "grad_norm": 0.012451171875, |
| "learning_rate": 1.884666666666667e-05, |
| "loss": 2.9433, |
| "mean_token_accuracy": 0.47657596562057736, |
| "num_tokens": 25147775.0, |
| "step": 1520 |
| }, |
| { |
| "entropy": 2.1071897082030775, |
| "epoch": 1.224, |
| "grad_norm": 0.00860595703125, |
| "learning_rate": 1.8824444444444445e-05, |
| "loss": 2.8592, |
| "mean_token_accuracy": 0.479968504793942, |
| "num_tokens": 25312652.0, |
| "step": 1530 |
| }, |
| { |
| "entropy": 2.031100772321224, |
| "epoch": 1.232, |
| "grad_norm": 0.006103515625, |
| "learning_rate": 1.8802222222222226e-05, |
| "loss": 2.8016, |
| "mean_token_accuracy": 0.4919817751273513, |
| "num_tokens": 25480033.0, |
| "step": 1540 |
| }, |
| { |
| "entropy": 2.0882932603359223, |
| "epoch": 1.24, |
| "grad_norm": 0.017822265625, |
| "learning_rate": 1.878e-05, |
| "loss": 2.87, |
| "mean_token_accuracy": 0.4867149593308568, |
| "num_tokens": 25643821.0, |
| "step": 1550 |
| }, |
| { |
| "entropy": 2.1069738253951074, |
| "epoch": 1.248, |
| "grad_norm": 0.01275634765625, |
| "learning_rate": 1.875777777777778e-05, |
| "loss": 2.9072, |
| "mean_token_accuracy": 0.48037715051323177, |
| "num_tokens": 25811978.0, |
| "step": 1560 |
| }, |
| { |
| "entropy": 2.1235784739255905, |
| "epoch": 1.256, |
| "grad_norm": 0.0098876953125, |
| "learning_rate": 1.873555555555556e-05, |
| "loss": 2.8361, |
| "mean_token_accuracy": 0.4876450767740607, |
| "num_tokens": 25980217.0, |
| "step": 1570 |
| }, |
| { |
| "entropy": 2.07255465015769, |
| "epoch": 1.264, |
| "grad_norm": 0.0205078125, |
| "learning_rate": 1.8713333333333336e-05, |
| "loss": 2.8226, |
| "mean_token_accuracy": 0.48956795781850815, |
| "num_tokens": 26146299.0, |
| "step": 1580 |
| }, |
| { |
| "entropy": 2.083909762650728, |
| "epoch": 1.272, |
| "grad_norm": 0.006805419921875, |
| "learning_rate": 1.8691111111111114e-05, |
| "loss": 2.8296, |
| "mean_token_accuracy": 0.48632726091891526, |
| "num_tokens": 26315594.0, |
| "step": 1590 |
| }, |
| { |
| "entropy": 2.1811861269176007, |
| "epoch": 1.28, |
| "grad_norm": 0.017333984375, |
| "learning_rate": 1.866888888888889e-05, |
| "loss": 2.9711, |
| "mean_token_accuracy": 0.47178333820775153, |
| "num_tokens": 26478207.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.28, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 26478207.0, |
| "eval_coding_runtime": 95.7162, |
| "eval_coding_samples_per_second": 5.224, |
| "eval_coding_steps_per_second": 2.612, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.28, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 26478207.0, |
| "eval_physics_runtime": 63.6631, |
| "eval_physics_samples_per_second": 7.854, |
| "eval_physics_steps_per_second": 3.927, |
| "step": 1600 |
| }, |
| { |
| "entropy": 2.018047058582306, |
| "epoch": 1.288, |
| "grad_norm": 0.015869140625, |
| "learning_rate": 1.864666666666667e-05, |
| "loss": 2.7963, |
| "mean_token_accuracy": 0.49348701871931555, |
| "num_tokens": 26647567.0, |
| "step": 1610 |
| }, |
| { |
| "entropy": 2.1041888780891895, |
| "epoch": 1.296, |
| "grad_norm": 0.0081787109375, |
| "learning_rate": 1.8624444444444446e-05, |
| "loss": 2.8794, |
| "mean_token_accuracy": 0.4833239272236824, |
| "num_tokens": 26806477.0, |
| "step": 1620 |
| }, |
| { |
| "entropy": 2.0873175472021104, |
| "epoch": 1.304, |
| "grad_norm": 0.0169677734375, |
| "learning_rate": 1.8602222222222224e-05, |
| "loss": 2.8534, |
| "mean_token_accuracy": 0.4844379436224699, |
| "num_tokens": 26973124.0, |
| "step": 1630 |
| }, |
| { |
| "entropy": 2.184805364906788, |
| "epoch": 1.312, |
| "grad_norm": 0.0081787109375, |
| "learning_rate": 1.858e-05, |
| "loss": 2.9204, |
| "mean_token_accuracy": 0.4806702647358179, |
| "num_tokens": 27137528.0, |
| "step": 1640 |
| }, |
| { |
| "entropy": 2.1004837110638617, |
| "epoch": 1.32, |
| "grad_norm": 0.0189208984375, |
| "learning_rate": 1.855777777777778e-05, |
| "loss": 2.839, |
| "mean_token_accuracy": 0.4855312593281269, |
| "num_tokens": 27300986.0, |
| "step": 1650 |
| }, |
| { |
| "entropy": 2.1488297916948795, |
| "epoch": 1.328, |
| "grad_norm": 0.00677490234375, |
| "learning_rate": 1.8535555555555557e-05, |
| "loss": 2.876, |
| "mean_token_accuracy": 0.48554224893450737, |
| "num_tokens": 27465859.0, |
| "step": 1660 |
| }, |
| { |
| "entropy": 2.11965301707387, |
| "epoch": 1.336, |
| "grad_norm": 0.0089111328125, |
| "learning_rate": 1.8513333333333335e-05, |
| "loss": 2.909, |
| "mean_token_accuracy": 0.4828299328684807, |
| "num_tokens": 27627148.0, |
| "step": 1670 |
| }, |
| { |
| "entropy": 2.139040616154671, |
| "epoch": 1.3439999999999999, |
| "grad_norm": 0.01214599609375, |
| "learning_rate": 1.8491111111111112e-05, |
| "loss": 2.9133, |
| "mean_token_accuracy": 0.4807636830955744, |
| "num_tokens": 27795568.0, |
| "step": 1680 |
| }, |
| { |
| "entropy": 2.1508292332291603, |
| "epoch": 1.3519999999999999, |
| "grad_norm": 0.00848388671875, |
| "learning_rate": 1.846888888888889e-05, |
| "loss": 2.9009, |
| "mean_token_accuracy": 0.48149400427937505, |
| "num_tokens": 27958162.0, |
| "step": 1690 |
| }, |
| { |
| "entropy": 2.0931077137589456, |
| "epoch": 1.3599999999999999, |
| "grad_norm": 0.0201416015625, |
| "learning_rate": 1.8446666666666667e-05, |
| "loss": 2.8726, |
| "mean_token_accuracy": 0.4807087674736977, |
| "num_tokens": 28129988.0, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.3599999999999999, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 28129988.0, |
| "eval_coding_runtime": 96.0975, |
| "eval_coding_samples_per_second": 5.203, |
| "eval_coding_steps_per_second": 2.602, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.3599999999999999, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 28129988.0, |
| "eval_physics_runtime": 63.7443, |
| "eval_physics_samples_per_second": 7.844, |
| "eval_physics_steps_per_second": 3.922, |
| "step": 1700 |
| }, |
| { |
| "entropy": 2.0795333731919525, |
| "epoch": 1.3679999999999999, |
| "grad_norm": 0.0096435546875, |
| "learning_rate": 1.842444444444445e-05, |
| "loss": 2.8522, |
| "mean_token_accuracy": 0.4887755334377289, |
| "num_tokens": 28299534.0, |
| "step": 1710 |
| }, |
| { |
| "entropy": 2.141296311467886, |
| "epoch": 1.376, |
| "grad_norm": 0.0157470703125, |
| "learning_rate": 1.8402222222222223e-05, |
| "loss": 2.9271, |
| "mean_token_accuracy": 0.4771733907982707, |
| "num_tokens": 28465903.0, |
| "step": 1720 |
| }, |
| { |
| "entropy": 2.0781943283975126, |
| "epoch": 1.384, |
| "grad_norm": 0.01055908203125, |
| "learning_rate": 1.8380000000000004e-05, |
| "loss": 2.8574, |
| "mean_token_accuracy": 0.4866001008078456, |
| "num_tokens": 28639289.0, |
| "step": 1730 |
| }, |
| { |
| "entropy": 2.0997987784445287, |
| "epoch": 1.392, |
| "grad_norm": 0.02099609375, |
| "learning_rate": 1.8357777777777778e-05, |
| "loss": 2.8585, |
| "mean_token_accuracy": 0.48320644851773975, |
| "num_tokens": 28801554.0, |
| "step": 1740 |
| }, |
| { |
| "entropy": 2.150196324288845, |
| "epoch": 1.4, |
| "grad_norm": 0.007354736328125, |
| "learning_rate": 1.833555555555556e-05, |
| "loss": 2.9002, |
| "mean_token_accuracy": 0.48246240708976984, |
| "num_tokens": 28963885.0, |
| "step": 1750 |
| }, |
| { |
| "entropy": 2.084288237988949, |
| "epoch": 1.408, |
| "grad_norm": 0.01806640625, |
| "learning_rate": 1.8313333333333333e-05, |
| "loss": 2.8911, |
| "mean_token_accuracy": 0.4849772537127137, |
| "num_tokens": 29125114.0, |
| "step": 1760 |
| }, |
| { |
| "entropy": 2.1347289800643923, |
| "epoch": 1.416, |
| "grad_norm": 0.0120849609375, |
| "learning_rate": 1.8291111111111114e-05, |
| "loss": 2.9129, |
| "mean_token_accuracy": 0.477182531170547, |
| "num_tokens": 29285272.0, |
| "step": 1770 |
| }, |
| { |
| "entropy": 2.1285498954355715, |
| "epoch": 1.424, |
| "grad_norm": 0.018310546875, |
| "learning_rate": 1.8268888888888888e-05, |
| "loss": 2.9215, |
| "mean_token_accuracy": 0.4772155126556754, |
| "num_tokens": 29454334.0, |
| "step": 1780 |
| }, |
| { |
| "entropy": 2.0727533496916295, |
| "epoch": 1.432, |
| "grad_norm": 0.01336669921875, |
| "learning_rate": 1.824666666666667e-05, |
| "loss": 2.8803, |
| "mean_token_accuracy": 0.4822785761207342, |
| "num_tokens": 29616865.0, |
| "step": 1790 |
| }, |
| { |
| "entropy": 2.0606922268867494, |
| "epoch": 1.44, |
| "grad_norm": 0.00860595703125, |
| "learning_rate": 1.8224444444444447e-05, |
| "loss": 2.8232, |
| "mean_token_accuracy": 0.48800144270062445, |
| "num_tokens": 29789253.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.44, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 29789253.0, |
| "eval_coding_runtime": 96.3208, |
| "eval_coding_samples_per_second": 5.191, |
| "eval_coding_steps_per_second": 2.595, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.44, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 29789253.0, |
| "eval_physics_runtime": 63.9922, |
| "eval_physics_samples_per_second": 7.813, |
| "eval_physics_steps_per_second": 3.907, |
| "step": 1800 |
| }, |
| { |
| "entropy": 2.0618888407945635, |
| "epoch": 1.448, |
| "grad_norm": 0.007171630859375, |
| "learning_rate": 1.8202222222222225e-05, |
| "loss": 2.8573, |
| "mean_token_accuracy": 0.4843838458880782, |
| "num_tokens": 29953006.0, |
| "step": 1810 |
| }, |
| { |
| "entropy": 2.022880507260561, |
| "epoch": 1.456, |
| "grad_norm": 0.0194091796875, |
| "learning_rate": 1.8180000000000002e-05, |
| "loss": 2.8031, |
| "mean_token_accuracy": 0.49401772283017636, |
| "num_tokens": 30128576.0, |
| "step": 1820 |
| }, |
| { |
| "entropy": 2.0860828548669814, |
| "epoch": 1.464, |
| "grad_norm": 0.0294189453125, |
| "learning_rate": 1.815777777777778e-05, |
| "loss": 2.8756, |
| "mean_token_accuracy": 0.4849246619269252, |
| "num_tokens": 30288894.0, |
| "step": 1830 |
| }, |
| { |
| "entropy": 2.188418509066105, |
| "epoch": 1.472, |
| "grad_norm": 0.015869140625, |
| "learning_rate": 1.8135555555555557e-05, |
| "loss": 2.9527, |
| "mean_token_accuracy": 0.4721253639087081, |
| "num_tokens": 30452719.0, |
| "step": 1840 |
| }, |
| { |
| "entropy": 2.085013201087713, |
| "epoch": 1.48, |
| "grad_norm": 0.00946044921875, |
| "learning_rate": 1.8113333333333335e-05, |
| "loss": 2.8646, |
| "mean_token_accuracy": 0.486043793335557, |
| "num_tokens": 30620659.0, |
| "step": 1850 |
| }, |
| { |
| "entropy": 2.143638453632593, |
| "epoch": 1.488, |
| "grad_norm": 0.0108642578125, |
| "learning_rate": 1.8091111111111113e-05, |
| "loss": 2.9264, |
| "mean_token_accuracy": 0.47975865937769413, |
| "num_tokens": 30782343.0, |
| "step": 1860 |
| }, |
| { |
| "entropy": 2.110791251808405, |
| "epoch": 1.496, |
| "grad_norm": 0.00689697265625, |
| "learning_rate": 1.806888888888889e-05, |
| "loss": 2.8831, |
| "mean_token_accuracy": 0.4842625314369798, |
| "num_tokens": 30947703.0, |
| "step": 1870 |
| }, |
| { |
| "entropy": 2.1235111109912395, |
| "epoch": 1.504, |
| "grad_norm": 0.0181884765625, |
| "learning_rate": 1.8046666666666668e-05, |
| "loss": 2.8413, |
| "mean_token_accuracy": 0.4860221529379487, |
| "num_tokens": 31113238.0, |
| "step": 1880 |
| }, |
| { |
| "entropy": 2.0822810381650925, |
| "epoch": 1.512, |
| "grad_norm": 0.0125732421875, |
| "learning_rate": 1.8024444444444445e-05, |
| "loss": 2.864, |
| "mean_token_accuracy": 0.48232704903930423, |
| "num_tokens": 31283883.0, |
| "step": 1890 |
| }, |
| { |
| "entropy": 2.1699491038918497, |
| "epoch": 1.52, |
| "grad_norm": 0.0101318359375, |
| "learning_rate": 1.8002222222222223e-05, |
| "loss": 2.9505, |
| "mean_token_accuracy": 0.4706884307786822, |
| "num_tokens": 31444589.0, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.52, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 31444589.0, |
| "eval_coding_runtime": 95.9376, |
| "eval_coding_samples_per_second": 5.212, |
| "eval_coding_steps_per_second": 2.606, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.52, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 31444589.0, |
| "eval_physics_runtime": 63.8135, |
| "eval_physics_samples_per_second": 7.835, |
| "eval_physics_steps_per_second": 3.918, |
| "step": 1900 |
| }, |
| { |
| "entropy": 2.1199583016335963, |
| "epoch": 1.528, |
| "grad_norm": 0.007598876953125, |
| "learning_rate": 1.798e-05, |
| "loss": 2.8928, |
| "mean_token_accuracy": 0.4802051903679967, |
| "num_tokens": 31608952.0, |
| "step": 1910 |
| }, |
| { |
| "entropy": 2.175958313792944, |
| "epoch": 1.536, |
| "grad_norm": 0.0089111328125, |
| "learning_rate": 1.7957777777777778e-05, |
| "loss": 2.9365, |
| "mean_token_accuracy": 0.4747030269354582, |
| "num_tokens": 31774137.0, |
| "step": 1920 |
| }, |
| { |
| "entropy": 2.086700049042702, |
| "epoch": 1.544, |
| "grad_norm": 0.0113525390625, |
| "learning_rate": 1.7935555555555556e-05, |
| "loss": 2.8777, |
| "mean_token_accuracy": 0.4859625767916441, |
| "num_tokens": 31937927.0, |
| "step": 1930 |
| }, |
| { |
| "entropy": 2.1010241121053697, |
| "epoch": 1.552, |
| "grad_norm": 0.0068359375, |
| "learning_rate": 1.7913333333333337e-05, |
| "loss": 2.8916, |
| "mean_token_accuracy": 0.4840227223932743, |
| "num_tokens": 32101776.0, |
| "step": 1940 |
| }, |
| { |
| "entropy": 2.1249936796724795, |
| "epoch": 1.56, |
| "grad_norm": 0.01446533203125, |
| "learning_rate": 1.789111111111111e-05, |
| "loss": 2.8819, |
| "mean_token_accuracy": 0.48242205642163755, |
| "num_tokens": 32271213.0, |
| "step": 1950 |
| }, |
| { |
| "entropy": 2.040369462966919, |
| "epoch": 1.568, |
| "grad_norm": 0.0240478515625, |
| "learning_rate": 1.7868888888888892e-05, |
| "loss": 2.8209, |
| "mean_token_accuracy": 0.48918427657335994, |
| "num_tokens": 32428021.0, |
| "step": 1960 |
| }, |
| { |
| "entropy": 2.2181196875870226, |
| "epoch": 1.576, |
| "grad_norm": 0.0108642578125, |
| "learning_rate": 1.7846666666666666e-05, |
| "loss": 2.9794, |
| "mean_token_accuracy": 0.4697700636461377, |
| "num_tokens": 32585407.0, |
| "step": 1970 |
| }, |
| { |
| "entropy": 2.096325629204512, |
| "epoch": 1.584, |
| "grad_norm": 0.0196533203125, |
| "learning_rate": 1.7824444444444447e-05, |
| "loss": 2.8507, |
| "mean_token_accuracy": 0.4877478454262018, |
| "num_tokens": 32752735.0, |
| "step": 1980 |
| }, |
| { |
| "entropy": 2.1082000881433487, |
| "epoch": 1.592, |
| "grad_norm": 0.0079345703125, |
| "learning_rate": 1.780222222222222e-05, |
| "loss": 2.8789, |
| "mean_token_accuracy": 0.482938090339303, |
| "num_tokens": 32912519.0, |
| "step": 1990 |
| }, |
| { |
| "entropy": 2.139867788553238, |
| "epoch": 1.6, |
| "grad_norm": 0.01007080078125, |
| "learning_rate": 1.7780000000000003e-05, |
| "loss": 2.9136, |
| "mean_token_accuracy": 0.4764457972720265, |
| "num_tokens": 33075822.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.6, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 33075822.0, |
| "eval_coding_runtime": 96.1948, |
| "eval_coding_samples_per_second": 5.198, |
| "eval_coding_steps_per_second": 2.599, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.6, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 33075822.0, |
| "eval_physics_runtime": 63.6877, |
| "eval_physics_samples_per_second": 7.851, |
| "eval_physics_steps_per_second": 3.925, |
| "step": 2000 |
| }, |
| { |
| "entropy": 2.0446168780326843, |
| "epoch": 1.608, |
| "grad_norm": 0.00750732421875, |
| "learning_rate": 1.7757777777777777e-05, |
| "loss": 2.8161, |
| "mean_token_accuracy": 0.49365365616977214, |
| "num_tokens": 33250394.0, |
| "step": 2010 |
| }, |
| { |
| "entropy": 2.0819836497306823, |
| "epoch": 1.616, |
| "grad_norm": 0.023681640625, |
| "learning_rate": 1.7735555555555558e-05, |
| "loss": 2.8419, |
| "mean_token_accuracy": 0.4879772139713168, |
| "num_tokens": 33411850.0, |
| "step": 2020 |
| }, |
| { |
| "entropy": 2.041516558080912, |
| "epoch": 1.624, |
| "grad_norm": 0.01708984375, |
| "learning_rate": 1.7713333333333335e-05, |
| "loss": 2.7918, |
| "mean_token_accuracy": 0.4918495800346136, |
| "num_tokens": 33577978.0, |
| "step": 2030 |
| }, |
| { |
| "entropy": 2.1720810759812594, |
| "epoch": 1.6320000000000001, |
| "grad_norm": 0.016357421875, |
| "learning_rate": 1.7691111111111113e-05, |
| "loss": 2.93, |
| "mean_token_accuracy": 0.47755391960963606, |
| "num_tokens": 33739456.0, |
| "step": 2040 |
| }, |
| { |
| "entropy": 2.0735519513487817, |
| "epoch": 1.6400000000000001, |
| "grad_norm": 0.0087890625, |
| "learning_rate": 1.766888888888889e-05, |
| "loss": 2.8702, |
| "mean_token_accuracy": 0.4865173580124974, |
| "num_tokens": 33904537.0, |
| "step": 2050 |
| }, |
| { |
| "entropy": 2.049781569093466, |
| "epoch": 1.6480000000000001, |
| "grad_norm": 0.017822265625, |
| "learning_rate": 1.7646666666666668e-05, |
| "loss": 2.8312, |
| "mean_token_accuracy": 0.4884044425562024, |
| "num_tokens": 34072022.0, |
| "step": 2060 |
| }, |
| { |
| "entropy": 2.0407406061887743, |
| "epoch": 1.6560000000000001, |
| "grad_norm": 0.011962890625, |
| "learning_rate": 1.7624444444444446e-05, |
| "loss": 2.8257, |
| "mean_token_accuracy": 0.48913145195692775, |
| "num_tokens": 34236954.0, |
| "step": 2070 |
| }, |
| { |
| "entropy": 2.0800452411174772, |
| "epoch": 1.6640000000000001, |
| "grad_norm": 0.00634765625, |
| "learning_rate": 1.7602222222222223e-05, |
| "loss": 2.8431, |
| "mean_token_accuracy": 0.4870519321411848, |
| "num_tokens": 34398959.0, |
| "step": 2080 |
| }, |
| { |
| "entropy": 2.0950917959213258, |
| "epoch": 1.6720000000000002, |
| "grad_norm": 0.00811767578125, |
| "learning_rate": 1.758e-05, |
| "loss": 2.8336, |
| "mean_token_accuracy": 0.4887974336743355, |
| "num_tokens": 34564364.0, |
| "step": 2090 |
| }, |
| { |
| "entropy": 2.0823982164263724, |
| "epoch": 1.6800000000000002, |
| "grad_norm": 0.0123291015625, |
| "learning_rate": 1.755777777777778e-05, |
| "loss": 2.8866, |
| "mean_token_accuracy": 0.4803567361086607, |
| "num_tokens": 34732284.0, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.6800000000000002, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 34732284.0, |
| "eval_coding_runtime": 96.096, |
| "eval_coding_samples_per_second": 5.203, |
| "eval_coding_steps_per_second": 2.602, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.6800000000000002, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 34732284.0, |
| "eval_physics_runtime": 64.047, |
| "eval_physics_samples_per_second": 7.807, |
| "eval_physics_steps_per_second": 3.903, |
| "step": 2100 |
| }, |
| { |
| "entropy": 2.108163347095251, |
| "epoch": 1.688, |
| "grad_norm": 0.0255126953125, |
| "learning_rate": 1.7535555555555556e-05, |
| "loss": 2.8721, |
| "mean_token_accuracy": 0.4795815791934729, |
| "num_tokens": 34895374.0, |
| "step": 2110 |
| }, |
| { |
| "entropy": 2.094745473563671, |
| "epoch": 1.696, |
| "grad_norm": 0.01904296875, |
| "learning_rate": 1.7513333333333334e-05, |
| "loss": 2.8655, |
| "mean_token_accuracy": 0.48117484580725434, |
| "num_tokens": 35053712.0, |
| "step": 2120 |
| }, |
| { |
| "entropy": 2.1672711707651615, |
| "epoch": 1.704, |
| "grad_norm": 0.0238037109375, |
| "learning_rate": 1.749111111111111e-05, |
| "loss": 2.8962, |
| "mean_token_accuracy": 0.47733605969697235, |
| "num_tokens": 35216941.0, |
| "step": 2130 |
| }, |
| { |
| "entropy": 2.076349521800876, |
| "epoch": 1.712, |
| "grad_norm": 0.007171630859375, |
| "learning_rate": 1.746888888888889e-05, |
| "loss": 2.8409, |
| "mean_token_accuracy": 0.48228554893285036, |
| "num_tokens": 35393088.0, |
| "step": 2140 |
| }, |
| { |
| "entropy": 2.086632777005434, |
| "epoch": 1.72, |
| "grad_norm": 0.01416015625, |
| "learning_rate": 1.7446666666666667e-05, |
| "loss": 2.8509, |
| "mean_token_accuracy": 0.48758579082787035, |
| "num_tokens": 35559385.0, |
| "step": 2150 |
| }, |
| { |
| "entropy": 2.116019304841757, |
| "epoch": 1.728, |
| "grad_norm": 0.006744384765625, |
| "learning_rate": 1.7424444444444444e-05, |
| "loss": 2.8813, |
| "mean_token_accuracy": 0.482978693023324, |
| "num_tokens": 35729007.0, |
| "step": 2160 |
| }, |
| { |
| "entropy": 2.0990616902709007, |
| "epoch": 1.736, |
| "grad_norm": 0.0087890625, |
| "learning_rate": 1.7402222222222222e-05, |
| "loss": 2.8565, |
| "mean_token_accuracy": 0.4873128758743405, |
| "num_tokens": 35905199.0, |
| "step": 2170 |
| }, |
| { |
| "entropy": 2.0888281360268595, |
| "epoch": 1.744, |
| "grad_norm": 0.0086669921875, |
| "learning_rate": 1.7380000000000003e-05, |
| "loss": 2.8736, |
| "mean_token_accuracy": 0.4849411312490702, |
| "num_tokens": 36072489.0, |
| "step": 2180 |
| }, |
| { |
| "entropy": 2.1180737152695657, |
| "epoch": 1.752, |
| "grad_norm": 0.007720947265625, |
| "learning_rate": 1.735777777777778e-05, |
| "loss": 2.9049, |
| "mean_token_accuracy": 0.4808773329481483, |
| "num_tokens": 36240600.0, |
| "step": 2190 |
| }, |
| { |
| "entropy": 2.12109714448452, |
| "epoch": 1.76, |
| "grad_norm": 0.009765625, |
| "learning_rate": 1.7335555555555558e-05, |
| "loss": 2.8884, |
| "mean_token_accuracy": 0.4813131859526038, |
| "num_tokens": 36398285.0, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.76, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 36398285.0, |
| "eval_coding_runtime": 96.2637, |
| "eval_coding_samples_per_second": 5.194, |
| "eval_coding_steps_per_second": 2.597, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.76, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 36398285.0, |
| "eval_physics_runtime": 64.2412, |
| "eval_physics_samples_per_second": 7.783, |
| "eval_physics_steps_per_second": 3.892, |
| "step": 2200 |
| }, |
| { |
| "entropy": 2.155783172696829, |
| "epoch": 1.768, |
| "grad_norm": 0.00872802734375, |
| "learning_rate": 1.7313333333333336e-05, |
| "loss": 2.9112, |
| "mean_token_accuracy": 0.4770881399512291, |
| "num_tokens": 36559891.0, |
| "step": 2210 |
| }, |
| { |
| "entropy": 2.1637654595077036, |
| "epoch": 1.776, |
| "grad_norm": 0.015625, |
| "learning_rate": 1.7291111111111113e-05, |
| "loss": 2.9176, |
| "mean_token_accuracy": 0.47640001866966486, |
| "num_tokens": 36730479.0, |
| "step": 2220 |
| }, |
| { |
| "entropy": 2.0420110248029233, |
| "epoch": 1.784, |
| "grad_norm": 0.01531982421875, |
| "learning_rate": 1.726888888888889e-05, |
| "loss": 2.7908, |
| "mean_token_accuracy": 0.49196922313421965, |
| "num_tokens": 36897454.0, |
| "step": 2230 |
| }, |
| { |
| "entropy": 2.110601053386927, |
| "epoch": 1.792, |
| "grad_norm": 0.01129150390625, |
| "learning_rate": 1.724666666666667e-05, |
| "loss": 2.8796, |
| "mean_token_accuracy": 0.4833569750189781, |
| "num_tokens": 37057425.0, |
| "step": 2240 |
| }, |
| { |
| "entropy": 2.044808383285999, |
| "epoch": 1.8, |
| "grad_norm": 0.02099609375, |
| "learning_rate": 1.7224444444444446e-05, |
| "loss": 2.8188, |
| "mean_token_accuracy": 0.489076080173254, |
| "num_tokens": 37225440.0, |
| "step": 2250 |
| }, |
| { |
| "entropy": 2.0545787177979946, |
| "epoch": 1.808, |
| "grad_norm": 0.0186767578125, |
| "learning_rate": 1.7202222222222224e-05, |
| "loss": 2.7854, |
| "mean_token_accuracy": 0.4944944025948644, |
| "num_tokens": 37395449.0, |
| "step": 2260 |
| }, |
| { |
| "entropy": 2.108742152154446, |
| "epoch": 1.8159999999999998, |
| "grad_norm": 0.009033203125, |
| "learning_rate": 1.718e-05, |
| "loss": 2.8803, |
| "mean_token_accuracy": 0.48428055848926305, |
| "num_tokens": 37565463.0, |
| "step": 2270 |
| }, |
| { |
| "entropy": 2.066716094315052, |
| "epoch": 1.8239999999999998, |
| "grad_norm": 0.0076904296875, |
| "learning_rate": 1.715777777777778e-05, |
| "loss": 2.8542, |
| "mean_token_accuracy": 0.487205614708364, |
| "num_tokens": 37729535.0, |
| "step": 2280 |
| }, |
| { |
| "entropy": 2.115487913042307, |
| "epoch": 1.8319999999999999, |
| "grad_norm": 0.019775390625, |
| "learning_rate": 1.7135555555555557e-05, |
| "loss": 2.9002, |
| "mean_token_accuracy": 0.48101999796926975, |
| "num_tokens": 37888632.0, |
| "step": 2290 |
| }, |
| { |
| "entropy": 2.091165804117918, |
| "epoch": 1.8399999999999999, |
| "grad_norm": 0.00921630859375, |
| "learning_rate": 1.7113333333333334e-05, |
| "loss": 2.8162, |
| "mean_token_accuracy": 0.4885393580421805, |
| "num_tokens": 38057839.0, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.8399999999999999, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 38057839.0, |
| "eval_coding_runtime": 96.1927, |
| "eval_coding_samples_per_second": 5.198, |
| "eval_coding_steps_per_second": 2.599, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.8399999999999999, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 38057839.0, |
| "eval_physics_runtime": 63.966, |
| "eval_physics_samples_per_second": 7.817, |
| "eval_physics_steps_per_second": 3.908, |
| "step": 2300 |
| }, |
| { |
| "entropy": 2.0943633928894996, |
| "epoch": 1.8479999999999999, |
| "grad_norm": 0.012451171875, |
| "learning_rate": 1.7091111111111112e-05, |
| "loss": 2.8744, |
| "mean_token_accuracy": 0.4854456914588809, |
| "num_tokens": 38228321.0, |
| "step": 2310 |
| }, |
| { |
| "entropy": 2.1238606452941893, |
| "epoch": 1.8559999999999999, |
| "grad_norm": 0.01806640625, |
| "learning_rate": 1.706888888888889e-05, |
| "loss": 2.8687, |
| "mean_token_accuracy": 0.4809235129505396, |
| "num_tokens": 38398033.0, |
| "step": 2320 |
| }, |
| { |
| "entropy": 2.118824549764395, |
| "epoch": 1.8639999999999999, |
| "grad_norm": 0.007720947265625, |
| "learning_rate": 1.704666666666667e-05, |
| "loss": 2.8771, |
| "mean_token_accuracy": 0.4786581486463547, |
| "num_tokens": 38568933.0, |
| "step": 2330 |
| }, |
| { |
| "entropy": 2.1477391608059406, |
| "epoch": 1.8719999999999999, |
| "grad_norm": 0.0157470703125, |
| "learning_rate": 1.7024444444444445e-05, |
| "loss": 2.9297, |
| "mean_token_accuracy": 0.4786318261176348, |
| "num_tokens": 38729203.0, |
| "step": 2340 |
| }, |
| { |
| "entropy": 2.1359975136816503, |
| "epoch": 1.88, |
| "grad_norm": 0.00677490234375, |
| "learning_rate": 1.7002222222222226e-05, |
| "loss": 2.9197, |
| "mean_token_accuracy": 0.47710791788995266, |
| "num_tokens": 38895784.0, |
| "step": 2350 |
| }, |
| { |
| "entropy": 2.1358583688735964, |
| "epoch": 1.888, |
| "grad_norm": 0.018310546875, |
| "learning_rate": 1.698e-05, |
| "loss": 2.9045, |
| "mean_token_accuracy": 0.47970328964293, |
| "num_tokens": 39058791.0, |
| "step": 2360 |
| }, |
| { |
| "entropy": 2.1071565486490726, |
| "epoch": 1.896, |
| "grad_norm": 0.008056640625, |
| "learning_rate": 1.695777777777778e-05, |
| "loss": 2.8754, |
| "mean_token_accuracy": 0.4833732729777694, |
| "num_tokens": 39224201.0, |
| "step": 2370 |
| }, |
| { |
| "entropy": 2.1035716101527213, |
| "epoch": 1.904, |
| "grad_norm": 0.0091552734375, |
| "learning_rate": 1.6935555555555555e-05, |
| "loss": 2.8949, |
| "mean_token_accuracy": 0.47980596460402014, |
| "num_tokens": 39386170.0, |
| "step": 2380 |
| }, |
| { |
| "entropy": 2.1066678419709204, |
| "epoch": 1.912, |
| "grad_norm": 0.0213623046875, |
| "learning_rate": 1.6913333333333336e-05, |
| "loss": 2.8792, |
| "mean_token_accuracy": 0.4800190070644021, |
| "num_tokens": 39545061.0, |
| "step": 2390 |
| }, |
| { |
| "entropy": 2.1327901519834995, |
| "epoch": 1.92, |
| "grad_norm": 0.01446533203125, |
| "learning_rate": 1.689111111111111e-05, |
| "loss": 2.8996, |
| "mean_token_accuracy": 0.4832611232995987, |
| "num_tokens": 39710381.0, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.92, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 39710381.0, |
| "eval_coding_runtime": 95.9878, |
| "eval_coding_samples_per_second": 5.209, |
| "eval_coding_steps_per_second": 2.604, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.92, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 39710381.0, |
| "eval_physics_runtime": 63.8325, |
| "eval_physics_samples_per_second": 7.833, |
| "eval_physics_steps_per_second": 3.916, |
| "step": 2400 |
| }, |
| { |
| "entropy": 2.1592532977461816, |
| "epoch": 1.928, |
| "grad_norm": 0.00714111328125, |
| "learning_rate": 1.686888888888889e-05, |
| "loss": 2.9007, |
| "mean_token_accuracy": 0.48193860817700623, |
| "num_tokens": 39878408.0, |
| "step": 2410 |
| }, |
| { |
| "entropy": 2.0319110229611397, |
| "epoch": 1.936, |
| "grad_norm": 0.0159912109375, |
| "learning_rate": 1.684666666666667e-05, |
| "loss": 2.8144, |
| "mean_token_accuracy": 0.49194381963461636, |
| "num_tokens": 40043354.0, |
| "step": 2420 |
| }, |
| { |
| "entropy": 2.0290094792842863, |
| "epoch": 1.944, |
| "grad_norm": 0.0108642578125, |
| "learning_rate": 1.6824444444444447e-05, |
| "loss": 2.8173, |
| "mean_token_accuracy": 0.490530570037663, |
| "num_tokens": 40206607.0, |
| "step": 2430 |
| }, |
| { |
| "entropy": 2.001203689724207, |
| "epoch": 1.952, |
| "grad_norm": 0.0157470703125, |
| "learning_rate": 1.6802222222222224e-05, |
| "loss": 2.775, |
| "mean_token_accuracy": 0.4963298514485359, |
| "num_tokens": 40375790.0, |
| "step": 2440 |
| }, |
| { |
| "entropy": 2.078115231543779, |
| "epoch": 1.96, |
| "grad_norm": 0.01220703125, |
| "learning_rate": 1.6780000000000002e-05, |
| "loss": 2.8797, |
| "mean_token_accuracy": 0.4827138794586062, |
| "num_tokens": 40539856.0, |
| "step": 2450 |
| }, |
| { |
| "entropy": 2.1310867570340633, |
| "epoch": 1.968, |
| "grad_norm": 0.01446533203125, |
| "learning_rate": 1.675777777777778e-05, |
| "loss": 2.896, |
| "mean_token_accuracy": 0.4785461273044348, |
| "num_tokens": 40705451.0, |
| "step": 2460 |
| }, |
| { |
| "entropy": 2.1071126513183116, |
| "epoch": 1.976, |
| "grad_norm": 0.0184326171875, |
| "learning_rate": 1.6735555555555557e-05, |
| "loss": 2.8625, |
| "mean_token_accuracy": 0.4837299410253763, |
| "num_tokens": 40873446.0, |
| "step": 2470 |
| }, |
| { |
| "entropy": 2.1266873709857466, |
| "epoch": 1.984, |
| "grad_norm": 0.01470947265625, |
| "learning_rate": 1.6713333333333335e-05, |
| "loss": 2.9035, |
| "mean_token_accuracy": 0.4816863130778074, |
| "num_tokens": 41040313.0, |
| "step": 2480 |
| }, |
| { |
| "entropy": 2.0827384896576406, |
| "epoch": 1.992, |
| "grad_norm": 0.00640869140625, |
| "learning_rate": 1.6691111111111112e-05, |
| "loss": 2.8539, |
| "mean_token_accuracy": 0.4890477364882827, |
| "num_tokens": 41205940.0, |
| "step": 2490 |
| }, |
| { |
| "entropy": 2.1126816742122174, |
| "epoch": 2.0, |
| "grad_norm": 0.01556396484375, |
| "learning_rate": 1.666888888888889e-05, |
| "loss": 2.8749, |
| "mean_token_accuracy": 0.48269832525402306, |
| "num_tokens": 41373608.0, |
| "step": 2500 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 41373608.0, |
| "eval_coding_runtime": 96.1355, |
| "eval_coding_samples_per_second": 5.201, |
| "eval_coding_steps_per_second": 2.6, |
| "step": 2500 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 41373608.0, |
| "eval_physics_runtime": 63.6152, |
| "eval_physics_samples_per_second": 7.86, |
| "eval_physics_steps_per_second": 3.93, |
| "step": 2500 |
| }, |
| { |
| "entropy": 2.109695591032505, |
| "epoch": 2.008, |
| "grad_norm": 0.015625, |
| "learning_rate": 1.6646666666666668e-05, |
| "loss": 2.896, |
| "mean_token_accuracy": 0.479962002299726, |
| "num_tokens": 41541054.0, |
| "step": 2510 |
| }, |
| { |
| "entropy": 2.0923699125647546, |
| "epoch": 2.016, |
| "grad_norm": 0.00848388671875, |
| "learning_rate": 1.6624444444444445e-05, |
| "loss": 2.8569, |
| "mean_token_accuracy": 0.48771672397851945, |
| "num_tokens": 41708226.0, |
| "step": 2520 |
| }, |
| { |
| "entropy": 2.055105470865965, |
| "epoch": 2.024, |
| "grad_norm": 0.00982666015625, |
| "learning_rate": 1.6602222222222223e-05, |
| "loss": 2.8172, |
| "mean_token_accuracy": 0.492337409965694, |
| "num_tokens": 41874459.0, |
| "step": 2530 |
| }, |
| { |
| "entropy": 2.0631272219121457, |
| "epoch": 2.032, |
| "grad_norm": 0.009765625, |
| "learning_rate": 1.658e-05, |
| "loss": 2.8367, |
| "mean_token_accuracy": 0.489038916118443, |
| "num_tokens": 42045433.0, |
| "step": 2540 |
| }, |
| { |
| "entropy": 2.1083615384995937, |
| "epoch": 2.04, |
| "grad_norm": 0.01141357421875, |
| "learning_rate": 1.6557777777777778e-05, |
| "loss": 2.8541, |
| "mean_token_accuracy": 0.4863636655732989, |
| "num_tokens": 42208657.0, |
| "step": 2550 |
| }, |
| { |
| "entropy": 2.1512265764176846, |
| "epoch": 2.048, |
| "grad_norm": 0.010986328125, |
| "learning_rate": 1.6535555555555556e-05, |
| "loss": 2.9351, |
| "mean_token_accuracy": 0.4792114529758692, |
| "num_tokens": 42372694.0, |
| "step": 2560 |
| }, |
| { |
| "entropy": 2.0822809383273126, |
| "epoch": 2.056, |
| "grad_norm": 0.01007080078125, |
| "learning_rate": 1.6513333333333333e-05, |
| "loss": 2.8577, |
| "mean_token_accuracy": 0.4852941369637847, |
| "num_tokens": 42541126.0, |
| "step": 2570 |
| }, |
| { |
| "entropy": 2.078305044025183, |
| "epoch": 2.064, |
| "grad_norm": 0.01422119140625, |
| "learning_rate": 1.6491111111111114e-05, |
| "loss": 2.8323, |
| "mean_token_accuracy": 0.48723302837461235, |
| "num_tokens": 42710240.0, |
| "step": 2580 |
| }, |
| { |
| "entropy": 2.096334061771631, |
| "epoch": 2.072, |
| "grad_norm": 0.009765625, |
| "learning_rate": 1.646888888888889e-05, |
| "loss": 2.8596, |
| "mean_token_accuracy": 0.48555029947310685, |
| "num_tokens": 42874781.0, |
| "step": 2590 |
| }, |
| { |
| "entropy": 2.1294494703412057, |
| "epoch": 2.08, |
| "grad_norm": 0.0235595703125, |
| "learning_rate": 1.644666666666667e-05, |
| "loss": 2.918, |
| "mean_token_accuracy": 0.4746216956526041, |
| "num_tokens": 43037844.0, |
| "step": 2600 |
| }, |
| { |
| "epoch": 2.08, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 43037844.0, |
| "eval_coding_runtime": 96.2051, |
| "eval_coding_samples_per_second": 5.197, |
| "eval_coding_steps_per_second": 2.599, |
| "step": 2600 |
| }, |
| { |
| "epoch": 2.08, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 43037844.0, |
| "eval_physics_runtime": 64.2843, |
| "eval_physics_samples_per_second": 7.778, |
| "eval_physics_steps_per_second": 3.889, |
| "step": 2600 |
| }, |
| { |
| "entropy": 2.1377002507448197, |
| "epoch": 2.088, |
| "grad_norm": 0.0062255859375, |
| "learning_rate": 1.6424444444444444e-05, |
| "loss": 2.9047, |
| "mean_token_accuracy": 0.47883504033088686, |
| "num_tokens": 43202298.0, |
| "step": 2610 |
| }, |
| { |
| "entropy": 2.049978656321764, |
| "epoch": 2.096, |
| "grad_norm": 0.007171630859375, |
| "learning_rate": 1.6402222222222225e-05, |
| "loss": 2.8012, |
| "mean_token_accuracy": 0.4908132331445813, |
| "num_tokens": 43368675.0, |
| "step": 2620 |
| }, |
| { |
| "entropy": 2.0891571439802648, |
| "epoch": 2.104, |
| "grad_norm": 0.0211181640625, |
| "learning_rate": 1.638e-05, |
| "loss": 2.858, |
| "mean_token_accuracy": 0.4863886846229434, |
| "num_tokens": 43528580.0, |
| "step": 2630 |
| }, |
| { |
| "entropy": 2.084837993234396, |
| "epoch": 2.112, |
| "grad_norm": 0.00885009765625, |
| "learning_rate": 1.635777777777778e-05, |
| "loss": 2.8258, |
| "mean_token_accuracy": 0.49133954383432865, |
| "num_tokens": 43699544.0, |
| "step": 2640 |
| }, |
| { |
| "entropy": 2.1012887261807918, |
| "epoch": 2.12, |
| "grad_norm": 0.0191650390625, |
| "learning_rate": 1.6335555555555558e-05, |
| "loss": 2.8533, |
| "mean_token_accuracy": 0.4835510091856122, |
| "num_tokens": 43858502.0, |
| "step": 2650 |
| }, |
| { |
| "entropy": 2.126673661917448, |
| "epoch": 2.128, |
| "grad_norm": 0.00885009765625, |
| "learning_rate": 1.6313333333333335e-05, |
| "loss": 2.9162, |
| "mean_token_accuracy": 0.48049711883068086, |
| "num_tokens": 44012632.0, |
| "step": 2660 |
| }, |
| { |
| "entropy": 2.110631703585386, |
| "epoch": 2.136, |
| "grad_norm": 0.0084228515625, |
| "learning_rate": 1.6291111111111113e-05, |
| "loss": 2.8524, |
| "mean_token_accuracy": 0.48574508912861347, |
| "num_tokens": 44179301.0, |
| "step": 2670 |
| }, |
| { |
| "entropy": 2.0693719699978828, |
| "epoch": 2.144, |
| "grad_norm": 0.00933837890625, |
| "learning_rate": 1.626888888888889e-05, |
| "loss": 2.8489, |
| "mean_token_accuracy": 0.486761187389493, |
| "num_tokens": 44349732.0, |
| "step": 2680 |
| }, |
| { |
| "entropy": 2.1314582899212837, |
| "epoch": 2.152, |
| "grad_norm": 0.0064697265625, |
| "learning_rate": 1.6246666666666668e-05, |
| "loss": 2.8933, |
| "mean_token_accuracy": 0.47988808769732716, |
| "num_tokens": 44514865.0, |
| "step": 2690 |
| }, |
| { |
| "entropy": 2.146127165853977, |
| "epoch": 2.16, |
| "grad_norm": 0.0191650390625, |
| "learning_rate": 1.6224444444444446e-05, |
| "loss": 2.9227, |
| "mean_token_accuracy": 0.47669907119125127, |
| "num_tokens": 44680410.0, |
| "step": 2700 |
| }, |
| { |
| "epoch": 2.16, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 44680410.0, |
| "eval_coding_runtime": 96.1226, |
| "eval_coding_samples_per_second": 5.202, |
| "eval_coding_steps_per_second": 2.601, |
| "step": 2700 |
| }, |
| { |
| "epoch": 2.16, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 44680410.0, |
| "eval_physics_runtime": 63.8308, |
| "eval_physics_samples_per_second": 7.833, |
| "eval_physics_steps_per_second": 3.917, |
| "step": 2700 |
| }, |
| { |
| "entropy": 2.1261583000421522, |
| "epoch": 2.168, |
| "grad_norm": 0.009033203125, |
| "learning_rate": 1.6202222222222223e-05, |
| "loss": 2.8919, |
| "mean_token_accuracy": 0.4785753648728132, |
| "num_tokens": 44846515.0, |
| "step": 2710 |
| }, |
| { |
| "entropy": 2.0676582753658295, |
| "epoch": 2.176, |
| "grad_norm": 0.00927734375, |
| "learning_rate": 1.618e-05, |
| "loss": 2.8569, |
| "mean_token_accuracy": 0.4861806998029351, |
| "num_tokens": 45017922.0, |
| "step": 2720 |
| }, |
| { |
| "entropy": 2.1044729210436346, |
| "epoch": 2.184, |
| "grad_norm": 0.007293701171875, |
| "learning_rate": 1.615777777777778e-05, |
| "loss": 2.8779, |
| "mean_token_accuracy": 0.48611082304269077, |
| "num_tokens": 45181823.0, |
| "step": 2730 |
| }, |
| { |
| "entropy": 2.0987057738006114, |
| "epoch": 2.192, |
| "grad_norm": 0.014892578125, |
| "learning_rate": 1.6135555555555556e-05, |
| "loss": 2.8557, |
| "mean_token_accuracy": 0.4868852591142058, |
| "num_tokens": 45348179.0, |
| "step": 2740 |
| }, |
| { |
| "entropy": 2.093054236471653, |
| "epoch": 2.2, |
| "grad_norm": 0.008056640625, |
| "learning_rate": 1.6113333333333334e-05, |
| "loss": 2.8597, |
| "mean_token_accuracy": 0.48510244842618705, |
| "num_tokens": 45514208.0, |
| "step": 2750 |
| }, |
| { |
| "entropy": 2.070825570821762, |
| "epoch": 2.208, |
| "grad_norm": 0.0074462890625, |
| "learning_rate": 1.609111111111111e-05, |
| "loss": 2.8633, |
| "mean_token_accuracy": 0.486143646389246, |
| "num_tokens": 45678397.0, |
| "step": 2760 |
| }, |
| { |
| "entropy": 2.0762874506413938, |
| "epoch": 2.216, |
| "grad_norm": 0.01348876953125, |
| "learning_rate": 1.606888888888889e-05, |
| "loss": 2.8764, |
| "mean_token_accuracy": 0.4872116198763251, |
| "num_tokens": 45844632.0, |
| "step": 2770 |
| }, |
| { |
| "entropy": 2.0525411412119867, |
| "epoch": 2.224, |
| "grad_norm": 0.00848388671875, |
| "learning_rate": 1.6046666666666667e-05, |
| "loss": 2.8189, |
| "mean_token_accuracy": 0.4916418811306357, |
| "num_tokens": 46010052.0, |
| "step": 2780 |
| }, |
| { |
| "entropy": 2.0773925125598907, |
| "epoch": 2.232, |
| "grad_norm": 0.00738525390625, |
| "learning_rate": 1.6024444444444444e-05, |
| "loss": 2.8449, |
| "mean_token_accuracy": 0.4843680987134576, |
| "num_tokens": 46176730.0, |
| "step": 2790 |
| }, |
| { |
| "entropy": 2.1263207592070104, |
| "epoch": 2.24, |
| "grad_norm": 0.00982666015625, |
| "learning_rate": 1.6002222222222222e-05, |
| "loss": 2.918, |
| "mean_token_accuracy": 0.476583500020206, |
| "num_tokens": 46347116.0, |
| "step": 2800 |
| }, |
| { |
| "epoch": 2.24, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 46347116.0, |
| "eval_coding_runtime": 95.9232, |
| "eval_coding_samples_per_second": 5.213, |
| "eval_coding_steps_per_second": 2.606, |
| "step": 2800 |
| }, |
| { |
| "epoch": 2.24, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 46347116.0, |
| "eval_physics_runtime": 63.9756, |
| "eval_physics_samples_per_second": 7.815, |
| "eval_physics_steps_per_second": 3.908, |
| "step": 2800 |
| }, |
| { |
| "entropy": 2.1278377093374727, |
| "epoch": 2.248, |
| "grad_norm": 0.01080322265625, |
| "learning_rate": 1.5980000000000003e-05, |
| "loss": 2.9003, |
| "mean_token_accuracy": 0.4784318573772907, |
| "num_tokens": 46513696.0, |
| "step": 2810 |
| }, |
| { |
| "entropy": 2.128158251941204, |
| "epoch": 2.2560000000000002, |
| "grad_norm": 0.01312255859375, |
| "learning_rate": 1.5957777777777777e-05, |
| "loss": 2.8978, |
| "mean_token_accuracy": 0.48350818566977977, |
| "num_tokens": 46673074.0, |
| "step": 2820 |
| }, |
| { |
| "entropy": 2.07526678070426, |
| "epoch": 2.2640000000000002, |
| "grad_norm": 0.0081787109375, |
| "learning_rate": 1.5935555555555558e-05, |
| "loss": 2.8253, |
| "mean_token_accuracy": 0.48843313101679087, |
| "num_tokens": 46841474.0, |
| "step": 2830 |
| }, |
| { |
| "entropy": 2.098488190770149, |
| "epoch": 2.2720000000000002, |
| "grad_norm": 0.011962890625, |
| "learning_rate": 1.5913333333333332e-05, |
| "loss": 2.8562, |
| "mean_token_accuracy": 0.4841760952025652, |
| "num_tokens": 47009386.0, |
| "step": 2840 |
| }, |
| { |
| "entropy": 2.1084106832742693, |
| "epoch": 2.2800000000000002, |
| "grad_norm": 0.0140380859375, |
| "learning_rate": 1.5891111111111113e-05, |
| "loss": 2.8686, |
| "mean_token_accuracy": 0.48397026900202034, |
| "num_tokens": 47175754.0, |
| "step": 2850 |
| }, |
| { |
| "entropy": 2.165386701375246, |
| "epoch": 2.288, |
| "grad_norm": 0.0179443359375, |
| "learning_rate": 1.5868888888888888e-05, |
| "loss": 2.9623, |
| "mean_token_accuracy": 0.47135423719882963, |
| "num_tokens": 47344231.0, |
| "step": 2860 |
| }, |
| { |
| "entropy": 2.062075611203909, |
| "epoch": 2.296, |
| "grad_norm": 0.015869140625, |
| "learning_rate": 1.584666666666667e-05, |
| "loss": 2.8116, |
| "mean_token_accuracy": 0.49171398133039473, |
| "num_tokens": 47507052.0, |
| "step": 2870 |
| }, |
| { |
| "entropy": 2.126021772623062, |
| "epoch": 2.304, |
| "grad_norm": 0.006072998046875, |
| "learning_rate": 1.5824444444444446e-05, |
| "loss": 2.8995, |
| "mean_token_accuracy": 0.482550716586411, |
| "num_tokens": 47658392.0, |
| "step": 2880 |
| }, |
| { |
| "entropy": 2.0740631029009817, |
| "epoch": 2.312, |
| "grad_norm": 0.01104736328125, |
| "learning_rate": 1.5802222222222224e-05, |
| "loss": 2.8559, |
| "mean_token_accuracy": 0.4810700535774231, |
| "num_tokens": 47827802.0, |
| "step": 2890 |
| }, |
| { |
| "entropy": 2.1145732060074804, |
| "epoch": 2.32, |
| "grad_norm": 0.01116943359375, |
| "learning_rate": 1.578e-05, |
| "loss": 2.8727, |
| "mean_token_accuracy": 0.4866561494767666, |
| "num_tokens": 47995015.0, |
| "step": 2900 |
| }, |
| { |
| "epoch": 2.32, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 47995015.0, |
| "eval_coding_runtime": 95.7936, |
| "eval_coding_samples_per_second": 5.22, |
| "eval_coding_steps_per_second": 2.61, |
| "step": 2900 |
| }, |
| { |
| "epoch": 2.32, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 47995015.0, |
| "eval_physics_runtime": 64.2171, |
| "eval_physics_samples_per_second": 7.786, |
| "eval_physics_steps_per_second": 3.893, |
| "step": 2900 |
| }, |
| { |
| "entropy": 2.0761554703116416, |
| "epoch": 2.328, |
| "grad_norm": 0.01141357421875, |
| "learning_rate": 1.575777777777778e-05, |
| "loss": 2.8017, |
| "mean_token_accuracy": 0.4926915055140853, |
| "num_tokens": 48163933.0, |
| "step": 2910 |
| }, |
| { |
| "entropy": 2.0599563673138617, |
| "epoch": 2.336, |
| "grad_norm": 0.0076904296875, |
| "learning_rate": 1.5735555555555557e-05, |
| "loss": 2.852, |
| "mean_token_accuracy": 0.4864024940878153, |
| "num_tokens": 48336708.0, |
| "step": 2920 |
| }, |
| { |
| "entropy": 2.0758877620100975, |
| "epoch": 2.344, |
| "grad_norm": 0.00958251953125, |
| "learning_rate": 1.5713333333333334e-05, |
| "loss": 2.8046, |
| "mean_token_accuracy": 0.4939649226143956, |
| "num_tokens": 48501684.0, |
| "step": 2930 |
| }, |
| { |
| "entropy": 2.1009161733090878, |
| "epoch": 2.352, |
| "grad_norm": 0.01092529296875, |
| "learning_rate": 1.5691111111111112e-05, |
| "loss": 2.8784, |
| "mean_token_accuracy": 0.48454554490745066, |
| "num_tokens": 48665311.0, |
| "step": 2940 |
| }, |
| { |
| "entropy": 2.0380428835749624, |
| "epoch": 2.36, |
| "grad_norm": 0.01251220703125, |
| "learning_rate": 1.5668888888888893e-05, |
| "loss": 2.8256, |
| "mean_token_accuracy": 0.49210381004959347, |
| "num_tokens": 48842066.0, |
| "step": 2950 |
| }, |
| { |
| "entropy": 2.174724444001913, |
| "epoch": 2.368, |
| "grad_norm": 0.010498046875, |
| "learning_rate": 1.5646666666666667e-05, |
| "loss": 2.9176, |
| "mean_token_accuracy": 0.476856386102736, |
| "num_tokens": 49005334.0, |
| "step": 2960 |
| }, |
| { |
| "entropy": 2.1461206674575806, |
| "epoch": 2.376, |
| "grad_norm": 0.00750732421875, |
| "learning_rate": 1.5624444444444448e-05, |
| "loss": 2.9246, |
| "mean_token_accuracy": 0.48218597918748857, |
| "num_tokens": 49166287.0, |
| "step": 2970 |
| }, |
| { |
| "entropy": 2.1835753597319125, |
| "epoch": 2.384, |
| "grad_norm": 0.00634765625, |
| "learning_rate": 1.5602222222222222e-05, |
| "loss": 2.945, |
| "mean_token_accuracy": 0.47245176918804643, |
| "num_tokens": 49331936.0, |
| "step": 2980 |
| }, |
| { |
| "entropy": 2.1323873437941074, |
| "epoch": 2.392, |
| "grad_norm": 0.01556396484375, |
| "learning_rate": 1.5580000000000003e-05, |
| "loss": 2.9147, |
| "mean_token_accuracy": 0.4816423388198018, |
| "num_tokens": 49500119.0, |
| "step": 2990 |
| }, |
| { |
| "entropy": 2.0695614576339723, |
| "epoch": 2.4, |
| "grad_norm": 0.0078125, |
| "learning_rate": 1.5557777777777778e-05, |
| "loss": 2.8376, |
| "mean_token_accuracy": 0.48881256151944397, |
| "num_tokens": 49671048.0, |
| "step": 3000 |
| }, |
| { |
| "epoch": 2.4, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 49671048.0, |
| "eval_coding_runtime": 95.7733, |
| "eval_coding_samples_per_second": 5.221, |
| "eval_coding_steps_per_second": 2.61, |
| "step": 3000 |
| }, |
| { |
| "epoch": 2.4, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 49671048.0, |
| "eval_physics_runtime": 63.5578, |
| "eval_physics_samples_per_second": 7.867, |
| "eval_physics_steps_per_second": 3.933, |
| "step": 3000 |
| }, |
| { |
| "entropy": 2.048779007047415, |
| "epoch": 2.408, |
| "grad_norm": 0.0101318359375, |
| "learning_rate": 1.553555555555556e-05, |
| "loss": 2.8455, |
| "mean_token_accuracy": 0.4872598424553871, |
| "num_tokens": 49845815.0, |
| "step": 3010 |
| }, |
| { |
| "entropy": 2.053139239549637, |
| "epoch": 2.416, |
| "grad_norm": 0.0084228515625, |
| "learning_rate": 1.5513333333333333e-05, |
| "loss": 2.8561, |
| "mean_token_accuracy": 0.4852673104032874, |
| "num_tokens": 50018182.0, |
| "step": 3020 |
| }, |
| { |
| "entropy": 2.070374865829945, |
| "epoch": 2.424, |
| "grad_norm": 0.018798828125, |
| "learning_rate": 1.5491111111111114e-05, |
| "loss": 2.8782, |
| "mean_token_accuracy": 0.4815656444057822, |
| "num_tokens": 50176877.0, |
| "step": 3030 |
| }, |
| { |
| "entropy": 2.0859608858823777, |
| "epoch": 2.432, |
| "grad_norm": 0.0137939453125, |
| "learning_rate": 1.546888888888889e-05, |
| "loss": 2.8833, |
| "mean_token_accuracy": 0.4850707145407796, |
| "num_tokens": 50341187.0, |
| "step": 3040 |
| }, |
| { |
| "entropy": 2.13402646407485, |
| "epoch": 2.44, |
| "grad_norm": 0.006805419921875, |
| "learning_rate": 1.544666666666667e-05, |
| "loss": 2.8688, |
| "mean_token_accuracy": 0.48171408735215665, |
| "num_tokens": 50505772.0, |
| "step": 3050 |
| }, |
| { |
| "entropy": 2.146274097263813, |
| "epoch": 2.448, |
| "grad_norm": 0.00994873046875, |
| "learning_rate": 1.5424444444444447e-05, |
| "loss": 2.8915, |
| "mean_token_accuracy": 0.48076413553208114, |
| "num_tokens": 50667019.0, |
| "step": 3060 |
| }, |
| { |
| "entropy": 2.117669067531824, |
| "epoch": 2.456, |
| "grad_norm": 0.025634765625, |
| "learning_rate": 1.5402222222222224e-05, |
| "loss": 2.9298, |
| "mean_token_accuracy": 0.4758553016930819, |
| "num_tokens": 50829652.0, |
| "step": 3070 |
| }, |
| { |
| "entropy": 2.121635789424181, |
| "epoch": 2.464, |
| "grad_norm": 0.00750732421875, |
| "learning_rate": 1.5380000000000002e-05, |
| "loss": 2.8609, |
| "mean_token_accuracy": 0.48166232127696273, |
| "num_tokens": 50991113.0, |
| "step": 3080 |
| }, |
| { |
| "entropy": 2.1312516391277314, |
| "epoch": 2.472, |
| "grad_norm": 0.00799560546875, |
| "learning_rate": 1.535777777777778e-05, |
| "loss": 2.8987, |
| "mean_token_accuracy": 0.4824211601167917, |
| "num_tokens": 51159212.0, |
| "step": 3090 |
| }, |
| { |
| "entropy": 2.0914432048797607, |
| "epoch": 2.48, |
| "grad_norm": 0.00714111328125, |
| "learning_rate": 1.5335555555555557e-05, |
| "loss": 2.8976, |
| "mean_token_accuracy": 0.47798026613891126, |
| "num_tokens": 51319624.0, |
| "step": 3100 |
| }, |
| { |
| "epoch": 2.48, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 51319624.0, |
| "eval_coding_runtime": 95.9767, |
| "eval_coding_samples_per_second": 5.21, |
| "eval_coding_steps_per_second": 2.605, |
| "step": 3100 |
| }, |
| { |
| "epoch": 2.48, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 51319624.0, |
| "eval_physics_runtime": 63.679, |
| "eval_physics_samples_per_second": 7.852, |
| "eval_physics_steps_per_second": 3.926, |
| "step": 3100 |
| }, |
| { |
| "entropy": 2.136239843815565, |
| "epoch": 2.488, |
| "grad_norm": 0.007080078125, |
| "learning_rate": 1.5313333333333335e-05, |
| "loss": 2.894, |
| "mean_token_accuracy": 0.48326832950115206, |
| "num_tokens": 51488050.0, |
| "step": 3110 |
| }, |
| { |
| "entropy": 2.0875415176153185, |
| "epoch": 2.496, |
| "grad_norm": 0.01300048828125, |
| "learning_rate": 1.5291111111111112e-05, |
| "loss": 2.8606, |
| "mean_token_accuracy": 0.4845199853181839, |
| "num_tokens": 51654633.0, |
| "step": 3120 |
| }, |
| { |
| "entropy": 2.1257060185074805, |
| "epoch": 2.504, |
| "grad_norm": 0.0172119140625, |
| "learning_rate": 1.526888888888889e-05, |
| "loss": 2.8722, |
| "mean_token_accuracy": 0.4818729363381863, |
| "num_tokens": 51818044.0, |
| "step": 3130 |
| }, |
| { |
| "entropy": 2.0669797226786613, |
| "epoch": 2.512, |
| "grad_norm": 0.015625, |
| "learning_rate": 1.5246666666666668e-05, |
| "loss": 2.8189, |
| "mean_token_accuracy": 0.48912298064678905, |
| "num_tokens": 51980051.0, |
| "step": 3140 |
| }, |
| { |
| "entropy": 2.0928348623216153, |
| "epoch": 2.52, |
| "grad_norm": 0.01019287109375, |
| "learning_rate": 1.5224444444444447e-05, |
| "loss": 2.8648, |
| "mean_token_accuracy": 0.4851804681122303, |
| "num_tokens": 52146049.0, |
| "step": 3150 |
| }, |
| { |
| "entropy": 2.0820957243442537, |
| "epoch": 2.528, |
| "grad_norm": 0.00726318359375, |
| "learning_rate": 1.5202222222222223e-05, |
| "loss": 2.8607, |
| "mean_token_accuracy": 0.48901640996336937, |
| "num_tokens": 52316109.0, |
| "step": 3160 |
| }, |
| { |
| "entropy": 2.0938249841332435, |
| "epoch": 2.536, |
| "grad_norm": 0.0120849609375, |
| "learning_rate": 1.5180000000000002e-05, |
| "loss": 2.8368, |
| "mean_token_accuracy": 0.4879815449938178, |
| "num_tokens": 52475120.0, |
| "step": 3170 |
| }, |
| { |
| "entropy": 2.153928484022617, |
| "epoch": 2.544, |
| "grad_norm": 0.0145263671875, |
| "learning_rate": 1.5157777777777778e-05, |
| "loss": 2.9383, |
| "mean_token_accuracy": 0.4752116585150361, |
| "num_tokens": 52634528.0, |
| "step": 3180 |
| }, |
| { |
| "entropy": 2.090732706338167, |
| "epoch": 2.552, |
| "grad_norm": 0.00823974609375, |
| "learning_rate": 1.5135555555555557e-05, |
| "loss": 2.9029, |
| "mean_token_accuracy": 0.48127470947802065, |
| "num_tokens": 52797762.0, |
| "step": 3190 |
| }, |
| { |
| "entropy": 2.102490375936031, |
| "epoch": 2.56, |
| "grad_norm": 0.007537841796875, |
| "learning_rate": 1.5113333333333335e-05, |
| "loss": 2.8376, |
| "mean_token_accuracy": 0.4870879840105772, |
| "num_tokens": 52964035.0, |
| "step": 3200 |
| }, |
| { |
| "epoch": 2.56, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 52964035.0, |
| "eval_coding_runtime": 96.1669, |
| "eval_coding_samples_per_second": 5.199, |
| "eval_coding_steps_per_second": 2.6, |
| "step": 3200 |
| }, |
| { |
| "epoch": 2.56, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 52964035.0, |
| "eval_physics_runtime": 63.882, |
| "eval_physics_samples_per_second": 7.827, |
| "eval_physics_steps_per_second": 3.913, |
| "step": 3200 |
| }, |
| { |
| "entropy": 2.1070725575089453, |
| "epoch": 2.568, |
| "grad_norm": 0.01275634765625, |
| "learning_rate": 1.5091111111111113e-05, |
| "loss": 2.8546, |
| "mean_token_accuracy": 0.4875574728474021, |
| "num_tokens": 53127106.0, |
| "step": 3210 |
| }, |
| { |
| "entropy": 2.15907968506217, |
| "epoch": 2.576, |
| "grad_norm": 0.0157470703125, |
| "learning_rate": 1.506888888888889e-05, |
| "loss": 2.9158, |
| "mean_token_accuracy": 0.477414826862514, |
| "num_tokens": 53293345.0, |
| "step": 3220 |
| }, |
| { |
| "entropy": 2.132044891268015, |
| "epoch": 2.584, |
| "grad_norm": 0.01544189453125, |
| "learning_rate": 1.5046666666666668e-05, |
| "loss": 2.8982, |
| "mean_token_accuracy": 0.4789698511362076, |
| "num_tokens": 53454336.0, |
| "step": 3230 |
| }, |
| { |
| "entropy": 2.0777774840593337, |
| "epoch": 2.592, |
| "grad_norm": 0.02099609375, |
| "learning_rate": 1.5024444444444445e-05, |
| "loss": 2.8415, |
| "mean_token_accuracy": 0.48330534659326074, |
| "num_tokens": 53620896.0, |
| "step": 3240 |
| }, |
| { |
| "entropy": 2.0706359148025513, |
| "epoch": 2.6, |
| "grad_norm": 0.01519775390625, |
| "learning_rate": 1.5002222222222223e-05, |
| "loss": 2.84, |
| "mean_token_accuracy": 0.4818432040512562, |
| "num_tokens": 53784544.0, |
| "step": 3250 |
| }, |
| { |
| "entropy": 2.0699372351169587, |
| "epoch": 2.608, |
| "grad_norm": 0.00897216796875, |
| "learning_rate": 1.498e-05, |
| "loss": 2.8468, |
| "mean_token_accuracy": 0.48775772508233783, |
| "num_tokens": 53950989.0, |
| "step": 3260 |
| }, |
| { |
| "entropy": 2.0821156427264214, |
| "epoch": 2.616, |
| "grad_norm": 0.0150146484375, |
| "learning_rate": 1.495777777777778e-05, |
| "loss": 2.8652, |
| "mean_token_accuracy": 0.48366980664432047, |
| "num_tokens": 54116097.0, |
| "step": 3270 |
| }, |
| { |
| "entropy": 2.1099216148257254, |
| "epoch": 2.624, |
| "grad_norm": 0.00885009765625, |
| "learning_rate": 1.4935555555555556e-05, |
| "loss": 2.8907, |
| "mean_token_accuracy": 0.4797281926497817, |
| "num_tokens": 54282732.0, |
| "step": 3280 |
| }, |
| { |
| "entropy": 2.1004807710647584, |
| "epoch": 2.632, |
| "grad_norm": 0.01953125, |
| "learning_rate": 1.4913333333333335e-05, |
| "loss": 2.8485, |
| "mean_token_accuracy": 0.48381266705691817, |
| "num_tokens": 54442780.0, |
| "step": 3290 |
| }, |
| { |
| "entropy": 2.1202649302780627, |
| "epoch": 2.64, |
| "grad_norm": 0.008056640625, |
| "learning_rate": 1.4891111111111111e-05, |
| "loss": 2.8615, |
| "mean_token_accuracy": 0.48509005140513184, |
| "num_tokens": 54609748.0, |
| "step": 3300 |
| }, |
| { |
| "epoch": 2.64, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 54609748.0, |
| "eval_coding_runtime": 96.1735, |
| "eval_coding_samples_per_second": 5.199, |
| "eval_coding_steps_per_second": 2.599, |
| "step": 3300 |
| }, |
| { |
| "epoch": 2.64, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 54609748.0, |
| "eval_physics_runtime": 63.9233, |
| "eval_physics_samples_per_second": 7.822, |
| "eval_physics_steps_per_second": 3.911, |
| "step": 3300 |
| }, |
| { |
| "entropy": 2.122569767385721, |
| "epoch": 2.648, |
| "grad_norm": 0.0079345703125, |
| "learning_rate": 1.486888888888889e-05, |
| "loss": 2.8805, |
| "mean_token_accuracy": 0.4804168799892068, |
| "num_tokens": 54783193.0, |
| "step": 3310 |
| }, |
| { |
| "entropy": 2.163311902433634, |
| "epoch": 2.656, |
| "grad_norm": 0.009521484375, |
| "learning_rate": 1.4846666666666666e-05, |
| "loss": 2.9588, |
| "mean_token_accuracy": 0.4717843746766448, |
| "num_tokens": 54953583.0, |
| "step": 3320 |
| }, |
| { |
| "entropy": 2.131222031265497, |
| "epoch": 2.664, |
| "grad_norm": 0.0184326171875, |
| "learning_rate": 1.4824444444444446e-05, |
| "loss": 2.9142, |
| "mean_token_accuracy": 0.47831583991646764, |
| "num_tokens": 55117144.0, |
| "step": 3330 |
| }, |
| { |
| "entropy": 2.118143031746149, |
| "epoch": 2.672, |
| "grad_norm": 0.00970458984375, |
| "learning_rate": 1.4802222222222225e-05, |
| "loss": 2.9115, |
| "mean_token_accuracy": 0.47941809725016354, |
| "num_tokens": 55281945.0, |
| "step": 3340 |
| }, |
| { |
| "entropy": 2.0971298292279243, |
| "epoch": 2.68, |
| "grad_norm": 0.01483154296875, |
| "learning_rate": 1.478e-05, |
| "loss": 2.8689, |
| "mean_token_accuracy": 0.48324434272944927, |
| "num_tokens": 55448136.0, |
| "step": 3350 |
| }, |
| { |
| "entropy": 2.063984639942646, |
| "epoch": 2.6879999999999997, |
| "grad_norm": 0.009765625, |
| "learning_rate": 1.475777777777778e-05, |
| "loss": 2.8728, |
| "mean_token_accuracy": 0.48411469515413047, |
| "num_tokens": 55612391.0, |
| "step": 3360 |
| }, |
| { |
| "entropy": 2.0723772957921027, |
| "epoch": 2.6959999999999997, |
| "grad_norm": 0.0113525390625, |
| "learning_rate": 1.4735555555555556e-05, |
| "loss": 2.8513, |
| "mean_token_accuracy": 0.4875372413545847, |
| "num_tokens": 55785705.0, |
| "step": 3370 |
| }, |
| { |
| "entropy": 2.0496960267424584, |
| "epoch": 2.7039999999999997, |
| "grad_norm": 0.01397705078125, |
| "learning_rate": 1.4713333333333335e-05, |
| "loss": 2.8275, |
| "mean_token_accuracy": 0.4882535219192505, |
| "num_tokens": 55949791.0, |
| "step": 3380 |
| }, |
| { |
| "entropy": 2.086968547850847, |
| "epoch": 2.7119999999999997, |
| "grad_norm": 0.00787353515625, |
| "learning_rate": 1.4691111111111111e-05, |
| "loss": 2.8755, |
| "mean_token_accuracy": 0.4829007627442479, |
| "num_tokens": 56116561.0, |
| "step": 3390 |
| }, |
| { |
| "entropy": 2.1156416177749633, |
| "epoch": 2.7199999999999998, |
| "grad_norm": 0.00860595703125, |
| "learning_rate": 1.466888888888889e-05, |
| "loss": 2.8348, |
| "mean_token_accuracy": 0.4874603316187859, |
| "num_tokens": 56282204.0, |
| "step": 3400 |
| }, |
| { |
| "epoch": 2.7199999999999998, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 56282204.0, |
| "eval_coding_runtime": 96.09, |
| "eval_coding_samples_per_second": 5.203, |
| "eval_coding_steps_per_second": 2.602, |
| "step": 3400 |
| }, |
| { |
| "epoch": 2.7199999999999998, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 56282204.0, |
| "eval_physics_runtime": 63.9083, |
| "eval_physics_samples_per_second": 7.824, |
| "eval_physics_steps_per_second": 3.912, |
| "step": 3400 |
| }, |
| { |
| "entropy": 2.115247245877981, |
| "epoch": 2.7279999999999998, |
| "grad_norm": 0.0164794921875, |
| "learning_rate": 1.4646666666666666e-05, |
| "loss": 2.9011, |
| "mean_token_accuracy": 0.48062342032790184, |
| "num_tokens": 56441829.0, |
| "step": 3410 |
| }, |
| { |
| "entropy": 2.1556713484227656, |
| "epoch": 2.7359999999999998, |
| "grad_norm": 0.00811767578125, |
| "learning_rate": 1.4624444444444446e-05, |
| "loss": 2.9192, |
| "mean_token_accuracy": 0.4784844558686018, |
| "num_tokens": 56605478.0, |
| "step": 3420 |
| }, |
| { |
| "entropy": 2.0934521816670895, |
| "epoch": 2.7439999999999998, |
| "grad_norm": 0.020751953125, |
| "learning_rate": 1.4602222222222225e-05, |
| "loss": 2.8716, |
| "mean_token_accuracy": 0.48414743393659593, |
| "num_tokens": 56764430.0, |
| "step": 3430 |
| }, |
| { |
| "entropy": 2.0943986624479294, |
| "epoch": 2.752, |
| "grad_norm": 0.0087890625, |
| "learning_rate": 1.4580000000000001e-05, |
| "loss": 2.8474, |
| "mean_token_accuracy": 0.48711673989892007, |
| "num_tokens": 56935874.0, |
| "step": 3440 |
| }, |
| { |
| "entropy": 2.1071030378341673, |
| "epoch": 2.76, |
| "grad_norm": 0.00836181640625, |
| "learning_rate": 1.455777777777778e-05, |
| "loss": 2.8753, |
| "mean_token_accuracy": 0.48405872862786054, |
| "num_tokens": 57104270.0, |
| "step": 3450 |
| }, |
| { |
| "entropy": 2.062966075539589, |
| "epoch": 2.768, |
| "grad_norm": 0.00775146484375, |
| "learning_rate": 1.4535555555555556e-05, |
| "loss": 2.8382, |
| "mean_token_accuracy": 0.4887760190293193, |
| "num_tokens": 57270552.0, |
| "step": 3460 |
| }, |
| { |
| "entropy": 2.0992351293563845, |
| "epoch": 2.776, |
| "grad_norm": 0.006591796875, |
| "learning_rate": 1.4513333333333336e-05, |
| "loss": 2.865, |
| "mean_token_accuracy": 0.4836490359157324, |
| "num_tokens": 57434347.0, |
| "step": 3470 |
| }, |
| { |
| "entropy": 2.1229082576930525, |
| "epoch": 2.784, |
| "grad_norm": 0.01239013671875, |
| "learning_rate": 1.4491111111111111e-05, |
| "loss": 2.9044, |
| "mean_token_accuracy": 0.47999848127365113, |
| "num_tokens": 57602968.0, |
| "step": 3480 |
| }, |
| { |
| "entropy": 2.1613018564879893, |
| "epoch": 2.792, |
| "grad_norm": 0.01287841796875, |
| "learning_rate": 1.446888888888889e-05, |
| "loss": 2.9421, |
| "mean_token_accuracy": 0.47603240981698036, |
| "num_tokens": 57760844.0, |
| "step": 3490 |
| }, |
| { |
| "entropy": 2.184716771543026, |
| "epoch": 2.8, |
| "grad_norm": 0.01953125, |
| "learning_rate": 1.4446666666666668e-05, |
| "loss": 2.9371, |
| "mean_token_accuracy": 0.47578484620898964, |
| "num_tokens": 57925886.0, |
| "step": 3500 |
| }, |
| { |
| "epoch": 2.8, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 57925886.0, |
| "eval_coding_runtime": 95.8176, |
| "eval_coding_samples_per_second": 5.218, |
| "eval_coding_steps_per_second": 2.609, |
| "step": 3500 |
| }, |
| { |
| "epoch": 2.8, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 57925886.0, |
| "eval_physics_runtime": 63.2558, |
| "eval_physics_samples_per_second": 7.904, |
| "eval_physics_steps_per_second": 3.952, |
| "step": 3500 |
| }, |
| { |
| "entropy": 2.103351902216673, |
| "epoch": 2.808, |
| "grad_norm": 0.007415771484375, |
| "learning_rate": 1.4424444444444446e-05, |
| "loss": 2.8631, |
| "mean_token_accuracy": 0.4846200209110975, |
| "num_tokens": 58090738.0, |
| "step": 3510 |
| }, |
| { |
| "entropy": 2.0398203767836094, |
| "epoch": 2.816, |
| "grad_norm": 0.011474609375, |
| "learning_rate": 1.4402222222222224e-05, |
| "loss": 2.8505, |
| "mean_token_accuracy": 0.4874549536034465, |
| "num_tokens": 58262960.0, |
| "step": 3520 |
| }, |
| { |
| "entropy": 2.1221997044980525, |
| "epoch": 2.824, |
| "grad_norm": 0.0225830078125, |
| "learning_rate": 1.4380000000000001e-05, |
| "loss": 2.9097, |
| "mean_token_accuracy": 0.47787830252200364, |
| "num_tokens": 58419404.0, |
| "step": 3530 |
| }, |
| { |
| "entropy": 2.1137804381549357, |
| "epoch": 2.832, |
| "grad_norm": 0.016845703125, |
| "learning_rate": 1.4357777777777779e-05, |
| "loss": 2.8485, |
| "mean_token_accuracy": 0.4841687433421612, |
| "num_tokens": 58586324.0, |
| "step": 3540 |
| }, |
| { |
| "entropy": 2.141510935127735, |
| "epoch": 2.84, |
| "grad_norm": 0.01226806640625, |
| "learning_rate": 1.4335555555555556e-05, |
| "loss": 2.9084, |
| "mean_token_accuracy": 0.4797965111210942, |
| "num_tokens": 58753501.0, |
| "step": 3550 |
| }, |
| { |
| "entropy": 2.051307424902916, |
| "epoch": 2.848, |
| "grad_norm": 0.01129150390625, |
| "learning_rate": 1.4313333333333334e-05, |
| "loss": 2.8878, |
| "mean_token_accuracy": 0.4822594778612256, |
| "num_tokens": 58912949.0, |
| "step": 3560 |
| }, |
| { |
| "entropy": 2.07161338403821, |
| "epoch": 2.856, |
| "grad_norm": 0.0145263671875, |
| "learning_rate": 1.4291111111111113e-05, |
| "loss": 2.8904, |
| "mean_token_accuracy": 0.48354612961411475, |
| "num_tokens": 59082635.0, |
| "step": 3570 |
| }, |
| { |
| "entropy": 2.0775130078196526, |
| "epoch": 2.864, |
| "grad_norm": 0.0133056640625, |
| "learning_rate": 1.426888888888889e-05, |
| "loss": 2.8257, |
| "mean_token_accuracy": 0.48698422852903606, |
| "num_tokens": 59252494.0, |
| "step": 3580 |
| }, |
| { |
| "entropy": 2.098304682970047, |
| "epoch": 2.872, |
| "grad_norm": 0.0186767578125, |
| "learning_rate": 1.4246666666666669e-05, |
| "loss": 2.8601, |
| "mean_token_accuracy": 0.48505131993442774, |
| "num_tokens": 59415783.0, |
| "step": 3590 |
| }, |
| { |
| "entropy": 2.1120006777346134, |
| "epoch": 2.88, |
| "grad_norm": 0.00860595703125, |
| "learning_rate": 1.4224444444444445e-05, |
| "loss": 2.9037, |
| "mean_token_accuracy": 0.47901488821953536, |
| "num_tokens": 59574951.0, |
| "step": 3600 |
| }, |
| { |
| "epoch": 2.88, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 59574951.0, |
| "eval_coding_runtime": 95.9323, |
| "eval_coding_samples_per_second": 5.212, |
| "eval_coding_steps_per_second": 2.606, |
| "step": 3600 |
| }, |
| { |
| "epoch": 2.88, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 59574951.0, |
| "eval_physics_runtime": 63.9284, |
| "eval_physics_samples_per_second": 7.821, |
| "eval_physics_steps_per_second": 3.911, |
| "step": 3600 |
| }, |
| { |
| "entropy": 2.0330657340586185, |
| "epoch": 2.888, |
| "grad_norm": 0.006866455078125, |
| "learning_rate": 1.4202222222222224e-05, |
| "loss": 2.8195, |
| "mean_token_accuracy": 0.4911785379052162, |
| "num_tokens": 59742110.0, |
| "step": 3610 |
| }, |
| { |
| "entropy": 2.1034228295087813, |
| "epoch": 2.896, |
| "grad_norm": 0.01385498046875, |
| "learning_rate": 1.418e-05, |
| "loss": 2.9084, |
| "mean_token_accuracy": 0.47603141590952874, |
| "num_tokens": 59903274.0, |
| "step": 3620 |
| }, |
| { |
| "entropy": 2.0917329132556914, |
| "epoch": 2.904, |
| "grad_norm": 0.0115966796875, |
| "learning_rate": 1.4157777777777779e-05, |
| "loss": 2.8871, |
| "mean_token_accuracy": 0.48319196961820127, |
| "num_tokens": 60064633.0, |
| "step": 3630 |
| }, |
| { |
| "entropy": 2.1128187902271747, |
| "epoch": 2.912, |
| "grad_norm": 0.016845703125, |
| "learning_rate": 1.4135555555555555e-05, |
| "loss": 2.8765, |
| "mean_token_accuracy": 0.4844615634530783, |
| "num_tokens": 60234203.0, |
| "step": 3640 |
| }, |
| { |
| "entropy": 2.0910258486866953, |
| "epoch": 2.92, |
| "grad_norm": 0.025390625, |
| "learning_rate": 1.4113333333333334e-05, |
| "loss": 2.8796, |
| "mean_token_accuracy": 0.48172583151608706, |
| "num_tokens": 60392231.0, |
| "step": 3650 |
| }, |
| { |
| "entropy": 2.08348820656538, |
| "epoch": 2.928, |
| "grad_norm": 0.0133056640625, |
| "learning_rate": 1.4091111111111114e-05, |
| "loss": 2.8514, |
| "mean_token_accuracy": 0.4873508451506495, |
| "num_tokens": 60559123.0, |
| "step": 3660 |
| }, |
| { |
| "entropy": 2.125985510647297, |
| "epoch": 2.936, |
| "grad_norm": 0.0113525390625, |
| "learning_rate": 1.406888888888889e-05, |
| "loss": 2.8648, |
| "mean_token_accuracy": 0.48422206435352566, |
| "num_tokens": 60723827.0, |
| "step": 3670 |
| }, |
| { |
| "entropy": 2.055304131656885, |
| "epoch": 2.944, |
| "grad_norm": 0.01165771484375, |
| "learning_rate": 1.4046666666666669e-05, |
| "loss": 2.8182, |
| "mean_token_accuracy": 0.4933670725673437, |
| "num_tokens": 60893423.0, |
| "step": 3680 |
| }, |
| { |
| "entropy": 2.130630461126566, |
| "epoch": 2.952, |
| "grad_norm": 0.0146484375, |
| "learning_rate": 1.4024444444444445e-05, |
| "loss": 2.9199, |
| "mean_token_accuracy": 0.47805153056979177, |
| "num_tokens": 61061168.0, |
| "step": 3690 |
| }, |
| { |
| "entropy": 2.0945800617337227, |
| "epoch": 2.96, |
| "grad_norm": 0.00732421875, |
| "learning_rate": 1.4002222222222224e-05, |
| "loss": 2.8165, |
| "mean_token_accuracy": 0.49219137877225877, |
| "num_tokens": 61228692.0, |
| "step": 3700 |
| }, |
| { |
| "epoch": 2.96, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 61228692.0, |
| "eval_coding_runtime": 96.0964, |
| "eval_coding_samples_per_second": 5.203, |
| "eval_coding_steps_per_second": 2.602, |
| "step": 3700 |
| }, |
| { |
| "epoch": 2.96, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 61228692.0, |
| "eval_physics_runtime": 63.9227, |
| "eval_physics_samples_per_second": 7.822, |
| "eval_physics_steps_per_second": 3.911, |
| "step": 3700 |
| }, |
| { |
| "entropy": 2.108620697259903, |
| "epoch": 2.968, |
| "grad_norm": 0.012451171875, |
| "learning_rate": 1.398e-05, |
| "loss": 2.8812, |
| "mean_token_accuracy": 0.4813636614009738, |
| "num_tokens": 61391712.0, |
| "step": 3710 |
| }, |
| { |
| "entropy": 2.1721582867205145, |
| "epoch": 2.976, |
| "grad_norm": 0.01312255859375, |
| "learning_rate": 1.395777777777778e-05, |
| "loss": 2.8848, |
| "mean_token_accuracy": 0.4803122842684388, |
| "num_tokens": 61560852.0, |
| "step": 3720 |
| }, |
| { |
| "entropy": 2.070528745651245, |
| "epoch": 2.984, |
| "grad_norm": 0.0091552734375, |
| "learning_rate": 1.3935555555555557e-05, |
| "loss": 2.8335, |
| "mean_token_accuracy": 0.48847730122506616, |
| "num_tokens": 61730032.0, |
| "step": 3730 |
| }, |
| { |
| "entropy": 2.0762289818376303, |
| "epoch": 2.992, |
| "grad_norm": 0.00885009765625, |
| "learning_rate": 1.3913333333333335e-05, |
| "loss": 2.8461, |
| "mean_token_accuracy": 0.4850864246487617, |
| "num_tokens": 61898951.0, |
| "step": 3740 |
| }, |
| { |
| "entropy": 2.0912881292402745, |
| "epoch": 3.0, |
| "grad_norm": 0.009765625, |
| "learning_rate": 1.3891111111111114e-05, |
| "loss": 2.837, |
| "mean_token_accuracy": 0.48912574239075185, |
| "num_tokens": 62060412.0, |
| "step": 3750 |
| }, |
| { |
| "entropy": 2.0631627537310124, |
| "epoch": 3.008, |
| "grad_norm": 0.007568359375, |
| "learning_rate": 1.386888888888889e-05, |
| "loss": 2.8415, |
| "mean_token_accuracy": 0.48977078720927236, |
| "num_tokens": 62223875.0, |
| "step": 3760 |
| }, |
| { |
| "entropy": 2.096564035117626, |
| "epoch": 3.016, |
| "grad_norm": 0.0113525390625, |
| "learning_rate": 1.3846666666666669e-05, |
| "loss": 2.9114, |
| "mean_token_accuracy": 0.4801721587777138, |
| "num_tokens": 62384589.0, |
| "step": 3770 |
| }, |
| { |
| "entropy": 2.0905525363981723, |
| "epoch": 3.024, |
| "grad_norm": 0.012939453125, |
| "learning_rate": 1.3824444444444445e-05, |
| "loss": 2.8798, |
| "mean_token_accuracy": 0.48243289571255443, |
| "num_tokens": 62553166.0, |
| "step": 3780 |
| }, |
| { |
| "entropy": 2.0739366367459295, |
| "epoch": 3.032, |
| "grad_norm": 0.0106201171875, |
| "learning_rate": 1.3802222222222224e-05, |
| "loss": 2.8417, |
| "mean_token_accuracy": 0.4866447998210788, |
| "num_tokens": 62725683.0, |
| "step": 3790 |
| }, |
| { |
| "entropy": 2.123819203674793, |
| "epoch": 3.04, |
| "grad_norm": 0.0142822265625, |
| "learning_rate": 1.378e-05, |
| "loss": 2.8806, |
| "mean_token_accuracy": 0.4823303589597344, |
| "num_tokens": 62893113.0, |
| "step": 3800 |
| }, |
| { |
| "epoch": 3.04, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 62893113.0, |
| "eval_coding_runtime": 96.0924, |
| "eval_coding_samples_per_second": 5.203, |
| "eval_coding_steps_per_second": 2.602, |
| "step": 3800 |
| }, |
| { |
| "epoch": 3.04, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 62893113.0, |
| "eval_physics_runtime": 64.019, |
| "eval_physics_samples_per_second": 7.81, |
| "eval_physics_steps_per_second": 3.905, |
| "step": 3800 |
| }, |
| { |
| "entropy": 2.126609382778406, |
| "epoch": 3.048, |
| "grad_norm": 0.007598876953125, |
| "learning_rate": 1.375777777777778e-05, |
| "loss": 2.8639, |
| "mean_token_accuracy": 0.483892584964633, |
| "num_tokens": 63052246.0, |
| "step": 3810 |
| }, |
| { |
| "entropy": 2.10702953748405, |
| "epoch": 3.056, |
| "grad_norm": 0.00927734375, |
| "learning_rate": 1.3735555555555557e-05, |
| "loss": 2.9187, |
| "mean_token_accuracy": 0.47841427810490134, |
| "num_tokens": 63218088.0, |
| "step": 3820 |
| }, |
| { |
| "entropy": 2.0819733083248138, |
| "epoch": 3.064, |
| "grad_norm": 0.0203857421875, |
| "learning_rate": 1.3713333333333335e-05, |
| "loss": 2.8582, |
| "mean_token_accuracy": 0.4865171581506729, |
| "num_tokens": 63382267.0, |
| "step": 3830 |
| }, |
| { |
| "entropy": 2.1380323246121407, |
| "epoch": 3.072, |
| "grad_norm": 0.0146484375, |
| "learning_rate": 1.3691111111111112e-05, |
| "loss": 2.9185, |
| "mean_token_accuracy": 0.47557642199099065, |
| "num_tokens": 63551201.0, |
| "step": 3840 |
| }, |
| { |
| "entropy": 2.1633306980133056, |
| "epoch": 3.08, |
| "grad_norm": 0.013916015625, |
| "learning_rate": 1.366888888888889e-05, |
| "loss": 2.9549, |
| "mean_token_accuracy": 0.4695492934435606, |
| "num_tokens": 63712393.0, |
| "step": 3850 |
| }, |
| { |
| "entropy": 2.1138852924108504, |
| "epoch": 3.088, |
| "grad_norm": 0.012939453125, |
| "learning_rate": 1.3646666666666668e-05, |
| "loss": 2.8836, |
| "mean_token_accuracy": 0.48224593289196493, |
| "num_tokens": 63876362.0, |
| "step": 3860 |
| }, |
| { |
| "entropy": 2.065829519927502, |
| "epoch": 3.096, |
| "grad_norm": 0.0084228515625, |
| "learning_rate": 1.3624444444444445e-05, |
| "loss": 2.8553, |
| "mean_token_accuracy": 0.48577627055346967, |
| "num_tokens": 64048468.0, |
| "step": 3870 |
| }, |
| { |
| "entropy": 2.069417446106672, |
| "epoch": 3.104, |
| "grad_norm": 0.0177001953125, |
| "learning_rate": 1.3602222222222223e-05, |
| "loss": 2.8875, |
| "mean_token_accuracy": 0.48573009874671697, |
| "num_tokens": 64208171.0, |
| "step": 3880 |
| }, |
| { |
| "entropy": 2.0596209660172464, |
| "epoch": 3.112, |
| "grad_norm": 0.0157470703125, |
| "learning_rate": 1.3580000000000002e-05, |
| "loss": 2.8312, |
| "mean_token_accuracy": 0.4890388745814562, |
| "num_tokens": 64374523.0, |
| "step": 3890 |
| }, |
| { |
| "entropy": 2.1080878011882307, |
| "epoch": 3.12, |
| "grad_norm": 0.018798828125, |
| "learning_rate": 1.3557777777777778e-05, |
| "loss": 2.8625, |
| "mean_token_accuracy": 0.48503108602017164, |
| "num_tokens": 64544493.0, |
| "step": 3900 |
| }, |
| { |
| "epoch": 3.12, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 64544493.0, |
| "eval_coding_runtime": 95.9386, |
| "eval_coding_samples_per_second": 5.212, |
| "eval_coding_steps_per_second": 2.606, |
| "step": 3900 |
| }, |
| { |
| "epoch": 3.12, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 64544493.0, |
| "eval_physics_runtime": 63.7638, |
| "eval_physics_samples_per_second": 7.841, |
| "eval_physics_steps_per_second": 3.921, |
| "step": 3900 |
| }, |
| { |
| "entropy": 2.173774953186512, |
| "epoch": 3.128, |
| "grad_norm": 0.006378173828125, |
| "learning_rate": 1.3535555555555557e-05, |
| "loss": 2.9569, |
| "mean_token_accuracy": 0.4744618350639939, |
| "num_tokens": 64700791.0, |
| "step": 3910 |
| }, |
| { |
| "entropy": 2.0850121699273587, |
| "epoch": 3.136, |
| "grad_norm": 0.01806640625, |
| "learning_rate": 1.3513333333333333e-05, |
| "loss": 2.8541, |
| "mean_token_accuracy": 0.48463378231972454, |
| "num_tokens": 64872488.0, |
| "step": 3920 |
| }, |
| { |
| "entropy": 2.05157615467906, |
| "epoch": 3.144, |
| "grad_norm": 0.0120849609375, |
| "learning_rate": 1.3491111111111113e-05, |
| "loss": 2.8004, |
| "mean_token_accuracy": 0.4967340124770999, |
| "num_tokens": 65038809.0, |
| "step": 3930 |
| }, |
| { |
| "entropy": 2.1126288808882236, |
| "epoch": 3.152, |
| "grad_norm": 0.00787353515625, |
| "learning_rate": 1.3468888888888888e-05, |
| "loss": 2.8859, |
| "mean_token_accuracy": 0.48235713597387075, |
| "num_tokens": 65201124.0, |
| "step": 3940 |
| }, |
| { |
| "entropy": 2.1096727564930915, |
| "epoch": 3.16, |
| "grad_norm": 0.018798828125, |
| "learning_rate": 1.3446666666666668e-05, |
| "loss": 2.8681, |
| "mean_token_accuracy": 0.48350467029958966, |
| "num_tokens": 65362969.0, |
| "step": 3950 |
| }, |
| { |
| "entropy": 2.051737867295742, |
| "epoch": 3.168, |
| "grad_norm": 0.00982666015625, |
| "learning_rate": 1.3424444444444447e-05, |
| "loss": 2.8291, |
| "mean_token_accuracy": 0.49016483388841153, |
| "num_tokens": 65525868.0, |
| "step": 3960 |
| }, |
| { |
| "entropy": 2.045945072174072, |
| "epoch": 3.176, |
| "grad_norm": 0.0152587890625, |
| "learning_rate": 1.3402222222222223e-05, |
| "loss": 2.8157, |
| "mean_token_accuracy": 0.49007453080266716, |
| "num_tokens": 65699210.0, |
| "step": 3970 |
| }, |
| { |
| "entropy": 2.1140901155769827, |
| "epoch": 3.184, |
| "grad_norm": 0.0150146484375, |
| "learning_rate": 1.3380000000000002e-05, |
| "loss": 2.8657, |
| "mean_token_accuracy": 0.48202042542397977, |
| "num_tokens": 65861603.0, |
| "step": 3980 |
| }, |
| { |
| "entropy": 2.1720369420945644, |
| "epoch": 3.192, |
| "grad_norm": 0.0147705078125, |
| "learning_rate": 1.3357777777777778e-05, |
| "loss": 2.9377, |
| "mean_token_accuracy": 0.4760190242901444, |
| "num_tokens": 66029358.0, |
| "step": 3990 |
| }, |
| { |
| "entropy": 2.119066651165485, |
| "epoch": 3.2, |
| "grad_norm": 0.00921630859375, |
| "learning_rate": 1.3335555555555558e-05, |
| "loss": 2.8737, |
| "mean_token_accuracy": 0.4842309094965458, |
| "num_tokens": 66193675.0, |
| "step": 4000 |
| }, |
| { |
| "epoch": 3.2, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 66193675.0, |
| "eval_coding_runtime": 96.2098, |
| "eval_coding_samples_per_second": 5.197, |
| "eval_coding_steps_per_second": 2.598, |
| "step": 4000 |
| }, |
| { |
| "epoch": 3.2, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 66193675.0, |
| "eval_physics_runtime": 63.2033, |
| "eval_physics_samples_per_second": 7.911, |
| "eval_physics_steps_per_second": 3.955, |
| "step": 4000 |
| }, |
| { |
| "entropy": 2.0837896250188352, |
| "epoch": 3.208, |
| "grad_norm": 0.0067138671875, |
| "learning_rate": 1.3313333333333333e-05, |
| "loss": 2.8407, |
| "mean_token_accuracy": 0.48807795345783234, |
| "num_tokens": 66358798.0, |
| "step": 4010 |
| }, |
| { |
| "entropy": 2.0642104752361776, |
| "epoch": 3.216, |
| "grad_norm": 0.01068115234375, |
| "learning_rate": 1.3291111111111113e-05, |
| "loss": 2.8505, |
| "mean_token_accuracy": 0.48781331330537797, |
| "num_tokens": 66522835.0, |
| "step": 4020 |
| }, |
| { |
| "entropy": 2.152095118165016, |
| "epoch": 3.224, |
| "grad_norm": 0.010986328125, |
| "learning_rate": 1.3268888888888889e-05, |
| "loss": 2.8995, |
| "mean_token_accuracy": 0.4772783122956753, |
| "num_tokens": 66683941.0, |
| "step": 4030 |
| }, |
| { |
| "entropy": 2.158082576841116, |
| "epoch": 3.232, |
| "grad_norm": 0.01220703125, |
| "learning_rate": 1.3246666666666668e-05, |
| "loss": 2.9012, |
| "mean_token_accuracy": 0.47906074915081265, |
| "num_tokens": 66853693.0, |
| "step": 4040 |
| }, |
| { |
| "entropy": 2.0139299370348454, |
| "epoch": 3.24, |
| "grad_norm": 0.018798828125, |
| "learning_rate": 1.3224444444444446e-05, |
| "loss": 2.7856, |
| "mean_token_accuracy": 0.496611525118351, |
| "num_tokens": 67023270.0, |
| "step": 4050 |
| }, |
| { |
| "entropy": 2.047669659554958, |
| "epoch": 3.248, |
| "grad_norm": 0.0155029296875, |
| "learning_rate": 1.3202222222222223e-05, |
| "loss": 2.8252, |
| "mean_token_accuracy": 0.49155114889144896, |
| "num_tokens": 67192299.0, |
| "step": 4060 |
| }, |
| { |
| "entropy": 2.086980938911438, |
| "epoch": 3.2560000000000002, |
| "grad_norm": 0.0166015625, |
| "learning_rate": 1.3180000000000001e-05, |
| "loss": 2.8327, |
| "mean_token_accuracy": 0.48563261032104493, |
| "num_tokens": 67354858.0, |
| "step": 4070 |
| }, |
| { |
| "entropy": 2.0698634557425977, |
| "epoch": 3.2640000000000002, |
| "grad_norm": 0.0126953125, |
| "learning_rate": 1.3157777777777778e-05, |
| "loss": 2.8554, |
| "mean_token_accuracy": 0.4893287578597665, |
| "num_tokens": 67517068.0, |
| "step": 4080 |
| }, |
| { |
| "entropy": 2.117519376426935, |
| "epoch": 3.2720000000000002, |
| "grad_norm": 0.0135498046875, |
| "learning_rate": 1.3135555555555558e-05, |
| "loss": 2.8507, |
| "mean_token_accuracy": 0.48337089773267505, |
| "num_tokens": 67682839.0, |
| "step": 4090 |
| }, |
| { |
| "entropy": 2.08798957914114, |
| "epoch": 3.2800000000000002, |
| "grad_norm": 0.0155029296875, |
| "learning_rate": 1.3113333333333334e-05, |
| "loss": 2.8881, |
| "mean_token_accuracy": 0.4841751741245389, |
| "num_tokens": 67841380.0, |
| "step": 4100 |
| }, |
| { |
| "epoch": 3.2800000000000002, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 67841380.0, |
| "eval_coding_runtime": 95.9623, |
| "eval_coding_samples_per_second": 5.21, |
| "eval_coding_steps_per_second": 2.605, |
| "step": 4100 |
| }, |
| { |
| "epoch": 3.2800000000000002, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 67841380.0, |
| "eval_physics_runtime": 64.0377, |
| "eval_physics_samples_per_second": 7.808, |
| "eval_physics_steps_per_second": 3.904, |
| "step": 4100 |
| }, |
| { |
| "entropy": 2.106385115534067, |
| "epoch": 3.288, |
| "grad_norm": 0.00714111328125, |
| "learning_rate": 1.3091111111111113e-05, |
| "loss": 2.848, |
| "mean_token_accuracy": 0.4901309922337532, |
| "num_tokens": 68009796.0, |
| "step": 4110 |
| }, |
| { |
| "entropy": 2.091083314269781, |
| "epoch": 3.296, |
| "grad_norm": 0.0142822265625, |
| "learning_rate": 1.306888888888889e-05, |
| "loss": 2.9206, |
| "mean_token_accuracy": 0.4773440737277269, |
| "num_tokens": 68175287.0, |
| "step": 4120 |
| }, |
| { |
| "entropy": 2.1063731901347635, |
| "epoch": 3.304, |
| "grad_norm": 0.01416015625, |
| "learning_rate": 1.3046666666666668e-05, |
| "loss": 2.8551, |
| "mean_token_accuracy": 0.4847688987851143, |
| "num_tokens": 68343118.0, |
| "step": 4130 |
| }, |
| { |
| "entropy": 2.0561120234429837, |
| "epoch": 3.312, |
| "grad_norm": 0.01397705078125, |
| "learning_rate": 1.3024444444444446e-05, |
| "loss": 2.8074, |
| "mean_token_accuracy": 0.48743151165544985, |
| "num_tokens": 68513030.0, |
| "step": 4140 |
| }, |
| { |
| "entropy": 2.136690790951252, |
| "epoch": 3.32, |
| "grad_norm": 0.01190185546875, |
| "learning_rate": 1.3002222222222223e-05, |
| "loss": 2.9183, |
| "mean_token_accuracy": 0.4794544292613864, |
| "num_tokens": 68680267.0, |
| "step": 4150 |
| }, |
| { |
| "entropy": 2.1080239094793796, |
| "epoch": 3.328, |
| "grad_norm": 0.0101318359375, |
| "learning_rate": 1.2980000000000001e-05, |
| "loss": 2.8969, |
| "mean_token_accuracy": 0.48164221588522194, |
| "num_tokens": 68839883.0, |
| "step": 4160 |
| }, |
| { |
| "entropy": 2.1078054189682005, |
| "epoch": 3.336, |
| "grad_norm": 0.007537841796875, |
| "learning_rate": 1.2957777777777779e-05, |
| "loss": 2.9034, |
| "mean_token_accuracy": 0.48090961929410697, |
| "num_tokens": 69005121.0, |
| "step": 4170 |
| }, |
| { |
| "entropy": 2.0919542267918585, |
| "epoch": 3.344, |
| "grad_norm": 0.008544921875, |
| "learning_rate": 1.2935555555555556e-05, |
| "loss": 2.8543, |
| "mean_token_accuracy": 0.48435192797333004, |
| "num_tokens": 69173450.0, |
| "step": 4180 |
| }, |
| { |
| "entropy": 2.0624096959829332, |
| "epoch": 3.352, |
| "grad_norm": 0.021484375, |
| "learning_rate": 1.2913333333333336e-05, |
| "loss": 2.8425, |
| "mean_token_accuracy": 0.48494853284209966, |
| "num_tokens": 69343066.0, |
| "step": 4190 |
| }, |
| { |
| "entropy": 2.067618714272976, |
| "epoch": 3.36, |
| "grad_norm": 0.01177978515625, |
| "learning_rate": 1.2891111111111112e-05, |
| "loss": 2.8428, |
| "mean_token_accuracy": 0.4874798431992531, |
| "num_tokens": 69510512.0, |
| "step": 4200 |
| }, |
| { |
| "epoch": 3.36, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 69510512.0, |
| "eval_coding_runtime": 96.2288, |
| "eval_coding_samples_per_second": 5.196, |
| "eval_coding_steps_per_second": 2.598, |
| "step": 4200 |
| }, |
| { |
| "epoch": 3.36, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 69510512.0, |
| "eval_physics_runtime": 64.0247, |
| "eval_physics_samples_per_second": 7.809, |
| "eval_physics_steps_per_second": 3.905, |
| "step": 4200 |
| }, |
| { |
| "entropy": 2.0581852816045285, |
| "epoch": 3.368, |
| "grad_norm": 0.00909423828125, |
| "learning_rate": 1.2868888888888891e-05, |
| "loss": 2.8097, |
| "mean_token_accuracy": 0.4904202425852418, |
| "num_tokens": 69684041.0, |
| "step": 4210 |
| }, |
| { |
| "entropy": 2.0333582244813444, |
| "epoch": 3.376, |
| "grad_norm": 0.0078125, |
| "learning_rate": 1.2846666666666667e-05, |
| "loss": 2.8192, |
| "mean_token_accuracy": 0.4936717139557004, |
| "num_tokens": 69853992.0, |
| "step": 4220 |
| }, |
| { |
| "entropy": 2.168538150936365, |
| "epoch": 3.384, |
| "grad_norm": 0.0162353515625, |
| "learning_rate": 1.2824444444444446e-05, |
| "loss": 2.9049, |
| "mean_token_accuracy": 0.4765908816829324, |
| "num_tokens": 70018276.0, |
| "step": 4230 |
| }, |
| { |
| "entropy": 2.168168669193983, |
| "epoch": 3.392, |
| "grad_norm": 0.0108642578125, |
| "learning_rate": 1.2802222222222222e-05, |
| "loss": 2.9258, |
| "mean_token_accuracy": 0.4788417614996433, |
| "num_tokens": 70178388.0, |
| "step": 4240 |
| }, |
| { |
| "entropy": 2.0856957830488683, |
| "epoch": 3.4, |
| "grad_norm": 0.01318359375, |
| "learning_rate": 1.2780000000000001e-05, |
| "loss": 2.8073, |
| "mean_token_accuracy": 0.4932461641728878, |
| "num_tokens": 70341232.0, |
| "step": 4250 |
| }, |
| { |
| "entropy": 2.068350695818663, |
| "epoch": 3.408, |
| "grad_norm": 0.01031494140625, |
| "learning_rate": 1.2757777777777777e-05, |
| "loss": 2.8485, |
| "mean_token_accuracy": 0.48950101863592865, |
| "num_tokens": 70507903.0, |
| "step": 4260 |
| }, |
| { |
| "entropy": 2.110171654820442, |
| "epoch": 3.416, |
| "grad_norm": 0.01416015625, |
| "learning_rate": 1.2735555555555557e-05, |
| "loss": 2.848, |
| "mean_token_accuracy": 0.4902174774557352, |
| "num_tokens": 70668601.0, |
| "step": 4270 |
| }, |
| { |
| "entropy": 2.070227029919624, |
| "epoch": 3.424, |
| "grad_norm": 0.01251220703125, |
| "learning_rate": 1.2713333333333336e-05, |
| "loss": 2.8453, |
| "mean_token_accuracy": 0.4847742343321443, |
| "num_tokens": 70835982.0, |
| "step": 4280 |
| }, |
| { |
| "entropy": 2.0945966966450213, |
| "epoch": 3.432, |
| "grad_norm": 0.006561279296875, |
| "learning_rate": 1.2691111111111112e-05, |
| "loss": 2.8637, |
| "mean_token_accuracy": 0.48577702064067124, |
| "num_tokens": 71003177.0, |
| "step": 4290 |
| }, |
| { |
| "entropy": 2.1412690974771977, |
| "epoch": 3.44, |
| "grad_norm": 0.0081787109375, |
| "learning_rate": 1.2668888888888891e-05, |
| "loss": 2.9252, |
| "mean_token_accuracy": 0.4771595584228635, |
| "num_tokens": 71172338.0, |
| "step": 4300 |
| }, |
| { |
| "epoch": 3.44, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 71172338.0, |
| "eval_coding_runtime": 96.2342, |
| "eval_coding_samples_per_second": 5.196, |
| "eval_coding_steps_per_second": 2.598, |
| "step": 4300 |
| }, |
| { |
| "epoch": 3.44, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 71172338.0, |
| "eval_physics_runtime": 64.0711, |
| "eval_physics_samples_per_second": 7.804, |
| "eval_physics_steps_per_second": 3.902, |
| "step": 4300 |
| }, |
| { |
| "entropy": 2.109636855870485, |
| "epoch": 3.448, |
| "grad_norm": 0.00994873046875, |
| "learning_rate": 1.2646666666666667e-05, |
| "loss": 2.8686, |
| "mean_token_accuracy": 0.48347108364105223, |
| "num_tokens": 71341354.0, |
| "step": 4310 |
| }, |
| { |
| "entropy": 2.1264806307852266, |
| "epoch": 3.456, |
| "grad_norm": 0.00714111328125, |
| "learning_rate": 1.2624444444444446e-05, |
| "loss": 2.8904, |
| "mean_token_accuracy": 0.48083948735147714, |
| "num_tokens": 71506904.0, |
| "step": 4320 |
| }, |
| { |
| "entropy": 2.1252997659146784, |
| "epoch": 3.464, |
| "grad_norm": 0.018310546875, |
| "learning_rate": 1.2602222222222222e-05, |
| "loss": 2.8975, |
| "mean_token_accuracy": 0.47978228740394113, |
| "num_tokens": 71664931.0, |
| "step": 4330 |
| }, |
| { |
| "entropy": 2.1598515301942824, |
| "epoch": 3.472, |
| "grad_norm": 0.0118408203125, |
| "learning_rate": 1.2580000000000002e-05, |
| "loss": 2.9, |
| "mean_token_accuracy": 0.4783101208508015, |
| "num_tokens": 71830111.0, |
| "step": 4340 |
| }, |
| { |
| "entropy": 2.0683537125587463, |
| "epoch": 3.48, |
| "grad_norm": 0.0118408203125, |
| "learning_rate": 1.2557777777777779e-05, |
| "loss": 2.8126, |
| "mean_token_accuracy": 0.4885444832965732, |
| "num_tokens": 71995325.0, |
| "step": 4350 |
| }, |
| { |
| "entropy": 2.0984449461102486, |
| "epoch": 3.488, |
| "grad_norm": 0.0196533203125, |
| "learning_rate": 1.2535555555555557e-05, |
| "loss": 2.8636, |
| "mean_token_accuracy": 0.48451535440981386, |
| "num_tokens": 72162307.0, |
| "step": 4360 |
| }, |
| { |
| "entropy": 2.0970567755401133, |
| "epoch": 3.496, |
| "grad_norm": 0.0115966796875, |
| "learning_rate": 1.2513333333333334e-05, |
| "loss": 2.8887, |
| "mean_token_accuracy": 0.4860412361100316, |
| "num_tokens": 72325173.0, |
| "step": 4370 |
| }, |
| { |
| "entropy": 2.137071938067675, |
| "epoch": 3.504, |
| "grad_norm": 0.01409912109375, |
| "learning_rate": 1.2491111111111112e-05, |
| "loss": 2.9094, |
| "mean_token_accuracy": 0.4761716028675437, |
| "num_tokens": 72492324.0, |
| "step": 4380 |
| }, |
| { |
| "entropy": 2.0667936712503434, |
| "epoch": 3.512, |
| "grad_norm": 0.00830078125, |
| "learning_rate": 1.246888888888889e-05, |
| "loss": 2.8455, |
| "mean_token_accuracy": 0.48641248401254417, |
| "num_tokens": 72657433.0, |
| "step": 4390 |
| }, |
| { |
| "entropy": 2.107886756956577, |
| "epoch": 3.52, |
| "grad_norm": 0.0120849609375, |
| "learning_rate": 1.2446666666666667e-05, |
| "loss": 2.9048, |
| "mean_token_accuracy": 0.48285721000283954, |
| "num_tokens": 72814868.0, |
| "step": 4400 |
| }, |
| { |
| "epoch": 3.52, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 72814868.0, |
| "eval_coding_runtime": 96.291, |
| "eval_coding_samples_per_second": 5.193, |
| "eval_coding_steps_per_second": 2.596, |
| "step": 4400 |
| }, |
| { |
| "epoch": 3.52, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 72814868.0, |
| "eval_physics_runtime": 64.0553, |
| "eval_physics_samples_per_second": 7.806, |
| "eval_physics_steps_per_second": 3.903, |
| "step": 4400 |
| }, |
| { |
| "entropy": 2.0622735850512983, |
| "epoch": 3.528, |
| "grad_norm": 0.01153564453125, |
| "learning_rate": 1.2424444444444445e-05, |
| "loss": 2.867, |
| "mean_token_accuracy": 0.4835289839655161, |
| "num_tokens": 72981043.0, |
| "step": 4410 |
| }, |
| { |
| "entropy": 2.107982115447521, |
| "epoch": 3.536, |
| "grad_norm": 0.01171875, |
| "learning_rate": 1.2402222222222222e-05, |
| "loss": 2.8809, |
| "mean_token_accuracy": 0.48394747320562603, |
| "num_tokens": 73144676.0, |
| "step": 4420 |
| }, |
| { |
| "entropy": 2.113698351383209, |
| "epoch": 3.544, |
| "grad_norm": 0.00775146484375, |
| "learning_rate": 1.2380000000000002e-05, |
| "loss": 2.8634, |
| "mean_token_accuracy": 0.48486993331462147, |
| "num_tokens": 73309118.0, |
| "step": 4430 |
| }, |
| { |
| "entropy": 2.0850420869886874, |
| "epoch": 3.552, |
| "grad_norm": 0.01434326171875, |
| "learning_rate": 1.235777777777778e-05, |
| "loss": 2.8099, |
| "mean_token_accuracy": 0.48757751416414974, |
| "num_tokens": 73474150.0, |
| "step": 4440 |
| }, |
| { |
| "entropy": 2.0784714840352536, |
| "epoch": 3.56, |
| "grad_norm": 0.020751953125, |
| "learning_rate": 1.2335555555555557e-05, |
| "loss": 2.8407, |
| "mean_token_accuracy": 0.4856953978538513, |
| "num_tokens": 73641373.0, |
| "step": 4450 |
| }, |
| { |
| "entropy": 2.054875613749027, |
| "epoch": 3.568, |
| "grad_norm": 0.00836181640625, |
| "learning_rate": 1.2313333333333335e-05, |
| "loss": 2.8258, |
| "mean_token_accuracy": 0.488472581282258, |
| "num_tokens": 73814337.0, |
| "step": 4460 |
| }, |
| { |
| "entropy": 2.18665976151824, |
| "epoch": 3.576, |
| "grad_norm": 0.01361083984375, |
| "learning_rate": 1.2291111111111112e-05, |
| "loss": 2.9774, |
| "mean_token_accuracy": 0.47147147078067064, |
| "num_tokens": 73975753.0, |
| "step": 4470 |
| }, |
| { |
| "entropy": 2.1290567815303802, |
| "epoch": 3.584, |
| "grad_norm": 0.025146484375, |
| "learning_rate": 1.226888888888889e-05, |
| "loss": 2.8775, |
| "mean_token_accuracy": 0.48378261309117077, |
| "num_tokens": 74139916.0, |
| "step": 4480 |
| }, |
| { |
| "entropy": 2.046985600143671, |
| "epoch": 3.592, |
| "grad_norm": 0.01318359375, |
| "learning_rate": 1.2246666666666667e-05, |
| "loss": 2.7691, |
| "mean_token_accuracy": 0.4934337234124541, |
| "num_tokens": 74319118.0, |
| "step": 4490 |
| }, |
| { |
| "entropy": 2.143827559798956, |
| "epoch": 3.6, |
| "grad_norm": 0.00836181640625, |
| "learning_rate": 1.2224444444444445e-05, |
| "loss": 2.9267, |
| "mean_token_accuracy": 0.47726704850792884, |
| "num_tokens": 74481410.0, |
| "step": 4500 |
| }, |
| { |
| "epoch": 3.6, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 74481410.0, |
| "eval_coding_runtime": 96.5913, |
| "eval_coding_samples_per_second": 5.176, |
| "eval_coding_steps_per_second": 2.588, |
| "step": 4500 |
| }, |
| { |
| "epoch": 3.6, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 74481410.0, |
| "eval_physics_runtime": 63.9746, |
| "eval_physics_samples_per_second": 7.816, |
| "eval_physics_steps_per_second": 3.908, |
| "step": 4500 |
| }, |
| { |
| "entropy": 2.1184289254248143, |
| "epoch": 3.608, |
| "grad_norm": 0.01318359375, |
| "learning_rate": 1.2202222222222224e-05, |
| "loss": 2.8798, |
| "mean_token_accuracy": 0.48143869936466216, |
| "num_tokens": 74648297.0, |
| "step": 4510 |
| }, |
| { |
| "entropy": 2.02643828317523, |
| "epoch": 3.616, |
| "grad_norm": 0.0162353515625, |
| "learning_rate": 1.218e-05, |
| "loss": 2.8212, |
| "mean_token_accuracy": 0.49379726257175205, |
| "num_tokens": 74818399.0, |
| "step": 4520 |
| }, |
| { |
| "entropy": 2.068718433380127, |
| "epoch": 3.624, |
| "grad_norm": 0.01434326171875, |
| "learning_rate": 1.215777777777778e-05, |
| "loss": 2.8464, |
| "mean_token_accuracy": 0.489543769787997, |
| "num_tokens": 74986438.0, |
| "step": 4530 |
| }, |
| { |
| "entropy": 2.111594308167696, |
| "epoch": 3.632, |
| "grad_norm": 0.0179443359375, |
| "learning_rate": 1.2135555555555556e-05, |
| "loss": 2.8478, |
| "mean_token_accuracy": 0.48541087526828053, |
| "num_tokens": 75148445.0, |
| "step": 4540 |
| }, |
| { |
| "entropy": 2.127137565612793, |
| "epoch": 3.64, |
| "grad_norm": 0.01141357421875, |
| "learning_rate": 1.2113333333333335e-05, |
| "loss": 2.9224, |
| "mean_token_accuracy": 0.47523438911885024, |
| "num_tokens": 75309638.0, |
| "step": 4550 |
| }, |
| { |
| "entropy": 2.1118921637535095, |
| "epoch": 3.648, |
| "grad_norm": 0.02197265625, |
| "learning_rate": 1.209111111111111e-05, |
| "loss": 2.848, |
| "mean_token_accuracy": 0.48891634624451397, |
| "num_tokens": 75475448.0, |
| "step": 4560 |
| }, |
| { |
| "entropy": 2.140853127092123, |
| "epoch": 3.656, |
| "grad_norm": 0.01055908203125, |
| "learning_rate": 1.206888888888889e-05, |
| "loss": 2.8685, |
| "mean_token_accuracy": 0.4791616892442107, |
| "num_tokens": 75639974.0, |
| "step": 4570 |
| }, |
| { |
| "entropy": 2.0595723688602448, |
| "epoch": 3.664, |
| "grad_norm": 0.01165771484375, |
| "learning_rate": 1.204666666666667e-05, |
| "loss": 2.8164, |
| "mean_token_accuracy": 0.4857981903478503, |
| "num_tokens": 75811516.0, |
| "step": 4580 |
| }, |
| { |
| "entropy": 2.1883110869675875, |
| "epoch": 3.672, |
| "grad_norm": 0.01214599609375, |
| "learning_rate": 1.2024444444444445e-05, |
| "loss": 2.9329, |
| "mean_token_accuracy": 0.47467537205666305, |
| "num_tokens": 75970707.0, |
| "step": 4590 |
| }, |
| { |
| "entropy": 2.1031787514686586, |
| "epoch": 3.68, |
| "grad_norm": 0.00823974609375, |
| "learning_rate": 1.2002222222222225e-05, |
| "loss": 2.8995, |
| "mean_token_accuracy": 0.4794537903741002, |
| "num_tokens": 76131750.0, |
| "step": 4600 |
| }, |
| { |
| "epoch": 3.68, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 76131750.0, |
| "eval_coding_runtime": 96.3114, |
| "eval_coding_samples_per_second": 5.191, |
| "eval_coding_steps_per_second": 2.596, |
| "step": 4600 |
| }, |
| { |
| "epoch": 3.68, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 76131750.0, |
| "eval_physics_runtime": 64.1076, |
| "eval_physics_samples_per_second": 7.799, |
| "eval_physics_steps_per_second": 3.9, |
| "step": 4600 |
| }, |
| { |
| "entropy": 2.0964153692126275, |
| "epoch": 3.6879999999999997, |
| "grad_norm": 0.00958251953125, |
| "learning_rate": 1.198e-05, |
| "loss": 2.8864, |
| "mean_token_accuracy": 0.48329670932143926, |
| "num_tokens": 76298810.0, |
| "step": 4610 |
| }, |
| { |
| "entropy": 2.1300269193947314, |
| "epoch": 3.6959999999999997, |
| "grad_norm": 0.00830078125, |
| "learning_rate": 1.195777777777778e-05, |
| "loss": 2.9344, |
| "mean_token_accuracy": 0.47541601955890656, |
| "num_tokens": 76460225.0, |
| "step": 4620 |
| }, |
| { |
| "entropy": 2.0679371163249014, |
| "epoch": 3.7039999999999997, |
| "grad_norm": 0.0169677734375, |
| "learning_rate": 1.1935555555555556e-05, |
| "loss": 2.8278, |
| "mean_token_accuracy": 0.491782989539206, |
| "num_tokens": 76631291.0, |
| "step": 4630 |
| }, |
| { |
| "entropy": 2.1284476265311243, |
| "epoch": 3.7119999999999997, |
| "grad_norm": 0.00823974609375, |
| "learning_rate": 1.1913333333333335e-05, |
| "loss": 2.8983, |
| "mean_token_accuracy": 0.47942620553076265, |
| "num_tokens": 76793614.0, |
| "step": 4640 |
| }, |
| { |
| "entropy": 2.2345407247543334, |
| "epoch": 3.7199999999999998, |
| "grad_norm": 0.006591796875, |
| "learning_rate": 1.1891111111111111e-05, |
| "loss": 2.9871, |
| "mean_token_accuracy": 0.4687922740355134, |
| "num_tokens": 76956311.0, |
| "step": 4650 |
| }, |
| { |
| "entropy": 2.088286682218313, |
| "epoch": 3.7279999999999998, |
| "grad_norm": 0.00799560546875, |
| "learning_rate": 1.186888888888889e-05, |
| "loss": 2.8911, |
| "mean_token_accuracy": 0.4833483690395951, |
| "num_tokens": 77121009.0, |
| "step": 4660 |
| }, |
| { |
| "entropy": 2.1162419065833094, |
| "epoch": 3.7359999999999998, |
| "grad_norm": 0.0101318359375, |
| "learning_rate": 1.1846666666666668e-05, |
| "loss": 2.9061, |
| "mean_token_accuracy": 0.4805751595646143, |
| "num_tokens": 77285031.0, |
| "step": 4670 |
| }, |
| { |
| "entropy": 2.109429658949375, |
| "epoch": 3.7439999999999998, |
| "grad_norm": 0.01416015625, |
| "learning_rate": 1.1824444444444445e-05, |
| "loss": 2.8624, |
| "mean_token_accuracy": 0.48205650951713325, |
| "num_tokens": 77446311.0, |
| "step": 4680 |
| }, |
| { |
| "entropy": 2.145034124702215, |
| "epoch": 3.752, |
| "grad_norm": 0.01513671875, |
| "learning_rate": 1.1802222222222223e-05, |
| "loss": 2.9137, |
| "mean_token_accuracy": 0.48031143862754105, |
| "num_tokens": 77611656.0, |
| "step": 4690 |
| }, |
| { |
| "entropy": 2.0881564006209374, |
| "epoch": 3.76, |
| "grad_norm": 0.0091552734375, |
| "learning_rate": 1.178e-05, |
| "loss": 2.8644, |
| "mean_token_accuracy": 0.4838464429602027, |
| "num_tokens": 77781488.0, |
| "step": 4700 |
| }, |
| { |
| "epoch": 3.76, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 77781488.0, |
| "eval_coding_runtime": 96.4001, |
| "eval_coding_samples_per_second": 5.187, |
| "eval_coding_steps_per_second": 2.593, |
| "step": 4700 |
| }, |
| { |
| "epoch": 3.76, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 77781488.0, |
| "eval_physics_runtime": 64.2998, |
| "eval_physics_samples_per_second": 7.776, |
| "eval_physics_steps_per_second": 3.888, |
| "step": 4700 |
| }, |
| { |
| "entropy": 2.149661224335432, |
| "epoch": 3.768, |
| "grad_norm": 0.00958251953125, |
| "learning_rate": 1.1757777777777778e-05, |
| "loss": 2.9199, |
| "mean_token_accuracy": 0.4811420265585184, |
| "num_tokens": 77946801.0, |
| "step": 4710 |
| }, |
| { |
| "entropy": 2.117289527505636, |
| "epoch": 3.776, |
| "grad_norm": 0.00579833984375, |
| "learning_rate": 1.1735555555555556e-05, |
| "loss": 2.8854, |
| "mean_token_accuracy": 0.481607536226511, |
| "num_tokens": 78108138.0, |
| "step": 4720 |
| }, |
| { |
| "entropy": 2.0819385163486004, |
| "epoch": 3.784, |
| "grad_norm": 0.010986328125, |
| "learning_rate": 1.1713333333333334e-05, |
| "loss": 2.8811, |
| "mean_token_accuracy": 0.48273362312465906, |
| "num_tokens": 78277029.0, |
| "step": 4730 |
| }, |
| { |
| "entropy": 2.0590057149529457, |
| "epoch": 3.792, |
| "grad_norm": 0.0101318359375, |
| "learning_rate": 1.1691111111111113e-05, |
| "loss": 2.8452, |
| "mean_token_accuracy": 0.4871557403355837, |
| "num_tokens": 78449177.0, |
| "step": 4740 |
| }, |
| { |
| "entropy": 2.097750276327133, |
| "epoch": 3.8, |
| "grad_norm": 0.0167236328125, |
| "learning_rate": 1.1668888888888889e-05, |
| "loss": 2.8818, |
| "mean_token_accuracy": 0.48112841956317426, |
| "num_tokens": 78612331.0, |
| "step": 4750 |
| }, |
| { |
| "entropy": 2.1645462520420553, |
| "epoch": 3.808, |
| "grad_norm": 0.01031494140625, |
| "learning_rate": 1.1646666666666668e-05, |
| "loss": 2.8845, |
| "mean_token_accuracy": 0.48126564230769875, |
| "num_tokens": 78778761.0, |
| "step": 4760 |
| }, |
| { |
| "entropy": 2.188579352200031, |
| "epoch": 3.816, |
| "grad_norm": 0.00830078125, |
| "learning_rate": 1.1624444444444446e-05, |
| "loss": 2.943, |
| "mean_token_accuracy": 0.4771542547270656, |
| "num_tokens": 78938941.0, |
| "step": 4770 |
| }, |
| { |
| "entropy": 2.0487091943621634, |
| "epoch": 3.824, |
| "grad_norm": 0.0108642578125, |
| "learning_rate": 1.1602222222222223e-05, |
| "loss": 2.8339, |
| "mean_token_accuracy": 0.4885430796071887, |
| "num_tokens": 79102912.0, |
| "step": 4780 |
| }, |
| { |
| "entropy": 2.070402644574642, |
| "epoch": 3.832, |
| "grad_norm": 0.01025390625, |
| "learning_rate": 1.1580000000000001e-05, |
| "loss": 2.8468, |
| "mean_token_accuracy": 0.48718465957790613, |
| "num_tokens": 79265096.0, |
| "step": 4790 |
| }, |
| { |
| "entropy": 2.1173644840717314, |
| "epoch": 3.84, |
| "grad_norm": 0.0164794921875, |
| "learning_rate": 1.1557777777777779e-05, |
| "loss": 2.8836, |
| "mean_token_accuracy": 0.4810946863144636, |
| "num_tokens": 79436236.0, |
| "step": 4800 |
| }, |
| { |
| "epoch": 3.84, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 79436236.0, |
| "eval_coding_runtime": 96.3659, |
| "eval_coding_samples_per_second": 5.189, |
| "eval_coding_steps_per_second": 2.594, |
| "step": 4800 |
| }, |
| { |
| "epoch": 3.84, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 79436236.0, |
| "eval_physics_runtime": 64.0966, |
| "eval_physics_samples_per_second": 7.801, |
| "eval_physics_steps_per_second": 3.9, |
| "step": 4800 |
| }, |
| { |
| "entropy": 2.054683540761471, |
| "epoch": 3.848, |
| "grad_norm": 0.006744384765625, |
| "learning_rate": 1.1535555555555556e-05, |
| "loss": 2.8568, |
| "mean_token_accuracy": 0.48323863800615074, |
| "num_tokens": 79607776.0, |
| "step": 4810 |
| }, |
| { |
| "entropy": 2.0291363187134266, |
| "epoch": 3.856, |
| "grad_norm": 0.00994873046875, |
| "learning_rate": 1.1513333333333334e-05, |
| "loss": 2.8152, |
| "mean_token_accuracy": 0.4893066372722387, |
| "num_tokens": 79772163.0, |
| "step": 4820 |
| }, |
| { |
| "entropy": 2.0947953447699548, |
| "epoch": 3.864, |
| "grad_norm": 0.0166015625, |
| "learning_rate": 1.1491111111111113e-05, |
| "loss": 2.8396, |
| "mean_token_accuracy": 0.48406948503106834, |
| "num_tokens": 79937921.0, |
| "step": 4830 |
| }, |
| { |
| "entropy": 2.1082398749887945, |
| "epoch": 3.872, |
| "grad_norm": 0.0194091796875, |
| "learning_rate": 1.1468888888888889e-05, |
| "loss": 2.894, |
| "mean_token_accuracy": 0.48280362226068974, |
| "num_tokens": 80099967.0, |
| "step": 4840 |
| }, |
| { |
| "entropy": 2.0668815061450005, |
| "epoch": 3.88, |
| "grad_norm": 0.0107421875, |
| "learning_rate": 1.1446666666666668e-05, |
| "loss": 2.873, |
| "mean_token_accuracy": 0.48297429028898475, |
| "num_tokens": 80264711.0, |
| "step": 4850 |
| }, |
| { |
| "entropy": 2.10538115054369, |
| "epoch": 3.888, |
| "grad_norm": 0.006988525390625, |
| "learning_rate": 1.1424444444444444e-05, |
| "loss": 2.8718, |
| "mean_token_accuracy": 0.48427290078252555, |
| "num_tokens": 80430055.0, |
| "step": 4860 |
| }, |
| { |
| "entropy": 2.1708266600966453, |
| "epoch": 3.896, |
| "grad_norm": 0.00848388671875, |
| "learning_rate": 1.1402222222222224e-05, |
| "loss": 2.9515, |
| "mean_token_accuracy": 0.47322598174214364, |
| "num_tokens": 80598176.0, |
| "step": 4870 |
| }, |
| { |
| "entropy": 2.1749521791934967, |
| "epoch": 3.904, |
| "grad_norm": 0.009033203125, |
| "learning_rate": 1.138e-05, |
| "loss": 2.9558, |
| "mean_token_accuracy": 0.47180029209703206, |
| "num_tokens": 80763923.0, |
| "step": 4880 |
| }, |
| { |
| "entropy": 2.058152811229229, |
| "epoch": 3.912, |
| "grad_norm": 0.0164794921875, |
| "learning_rate": 1.1357777777777779e-05, |
| "loss": 2.8218, |
| "mean_token_accuracy": 0.4886314647272229, |
| "num_tokens": 80929452.0, |
| "step": 4890 |
| }, |
| { |
| "entropy": 2.07114285081625, |
| "epoch": 3.92, |
| "grad_norm": 0.00909423828125, |
| "learning_rate": 1.1335555555555558e-05, |
| "loss": 2.8476, |
| "mean_token_accuracy": 0.4865548949688673, |
| "num_tokens": 81098568.0, |
| "step": 4900 |
| }, |
| { |
| "epoch": 3.92, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 81098568.0, |
| "eval_coding_runtime": 95.8984, |
| "eval_coding_samples_per_second": 5.214, |
| "eval_coding_steps_per_second": 2.607, |
| "step": 4900 |
| }, |
| { |
| "epoch": 3.92, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 81098568.0, |
| "eval_physics_runtime": 64.04, |
| "eval_physics_samples_per_second": 7.808, |
| "eval_physics_steps_per_second": 3.904, |
| "step": 4900 |
| }, |
| { |
| "entropy": 2.1175915651023387, |
| "epoch": 3.928, |
| "grad_norm": 0.0198974609375, |
| "learning_rate": 1.1313333333333334e-05, |
| "loss": 2.8916, |
| "mean_token_accuracy": 0.4806467808783054, |
| "num_tokens": 81258231.0, |
| "step": 4910 |
| }, |
| { |
| "entropy": 2.0774527341127396, |
| "epoch": 3.936, |
| "grad_norm": 0.01202392578125, |
| "learning_rate": 1.1291111111111113e-05, |
| "loss": 2.8713, |
| "mean_token_accuracy": 0.4864236503839493, |
| "num_tokens": 81423109.0, |
| "step": 4920 |
| }, |
| { |
| "entropy": 2.1824303403496743, |
| "epoch": 3.944, |
| "grad_norm": 0.02294921875, |
| "learning_rate": 1.126888888888889e-05, |
| "loss": 2.888, |
| "mean_token_accuracy": 0.4796422986313701, |
| "num_tokens": 81586268.0, |
| "step": 4930 |
| }, |
| { |
| "entropy": 2.093172822892666, |
| "epoch": 3.952, |
| "grad_norm": 0.0120849609375, |
| "learning_rate": 1.1246666666666669e-05, |
| "loss": 2.8452, |
| "mean_token_accuracy": 0.4844025122001767, |
| "num_tokens": 81753214.0, |
| "step": 4940 |
| }, |
| { |
| "entropy": 2.1183643460273744, |
| "epoch": 3.96, |
| "grad_norm": 0.0087890625, |
| "learning_rate": 1.1224444444444444e-05, |
| "loss": 2.893, |
| "mean_token_accuracy": 0.4826692482456565, |
| "num_tokens": 81918071.0, |
| "step": 4950 |
| }, |
| { |
| "entropy": 2.1083862885832785, |
| "epoch": 3.968, |
| "grad_norm": 0.0289306640625, |
| "learning_rate": 1.1202222222222224e-05, |
| "loss": 2.8873, |
| "mean_token_accuracy": 0.48046484012156726, |
| "num_tokens": 82076472.0, |
| "step": 4960 |
| }, |
| { |
| "entropy": 2.0946729026734827, |
| "epoch": 3.976, |
| "grad_norm": 0.00897216796875, |
| "learning_rate": 1.1180000000000001e-05, |
| "loss": 2.8512, |
| "mean_token_accuracy": 0.4866802429780364, |
| "num_tokens": 82247992.0, |
| "step": 4970 |
| }, |
| { |
| "entropy": 2.0790536925196648, |
| "epoch": 3.984, |
| "grad_norm": 0.021728515625, |
| "learning_rate": 1.1157777777777779e-05, |
| "loss": 2.8292, |
| "mean_token_accuracy": 0.4870005436241627, |
| "num_tokens": 82418472.0, |
| "step": 4980 |
| }, |
| { |
| "entropy": 2.086121869832277, |
| "epoch": 3.992, |
| "grad_norm": 0.0186767578125, |
| "learning_rate": 1.1135555555555557e-05, |
| "loss": 2.8427, |
| "mean_token_accuracy": 0.48974594939500093, |
| "num_tokens": 82582854.0, |
| "step": 4990 |
| }, |
| { |
| "entropy": 2.0794210851192476, |
| "epoch": 4.0, |
| "grad_norm": 0.0203857421875, |
| "learning_rate": 1.1113333333333334e-05, |
| "loss": 2.8479, |
| "mean_token_accuracy": 0.4871451547369361, |
| "num_tokens": 82747216.0, |
| "step": 5000 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 82747216.0, |
| "eval_coding_runtime": 95.9103, |
| "eval_coding_samples_per_second": 5.213, |
| "eval_coding_steps_per_second": 2.607, |
| "step": 5000 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 82747216.0, |
| "eval_physics_runtime": 63.8373, |
| "eval_physics_samples_per_second": 7.832, |
| "eval_physics_steps_per_second": 3.916, |
| "step": 5000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 10000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 8, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 6.713055577109017e+18, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|