| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.6, |
| "eval_steps": 100, |
| "global_step": 2000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 2.085290367901325, |
| "epoch": 0.008, |
| "grad_norm": 0.0146484375, |
| "learning_rate": 1.8e-07, |
| "loss": 2.8737, |
| "mean_token_accuracy": 0.48555898685008286, |
| "num_tokens": 173886.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 2.082674165070057, |
| "epoch": 0.016, |
| "grad_norm": 0.00750732421875, |
| "learning_rate": 3.8e-07, |
| "loss": 2.8766, |
| "mean_token_accuracy": 0.48314503096044065, |
| "num_tokens": 338352.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 2.083346585184336, |
| "epoch": 0.024, |
| "grad_norm": 0.01116943359375, |
| "learning_rate": 5.800000000000001e-07, |
| "loss": 2.8206, |
| "mean_token_accuracy": 0.4898203007876873, |
| "num_tokens": 505613.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 2.084573998302221, |
| "epoch": 0.032, |
| "grad_norm": 0.01361083984375, |
| "learning_rate": 7.8e-07, |
| "loss": 2.846, |
| "mean_token_accuracy": 0.4879810094833374, |
| "num_tokens": 671193.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 2.112175610661507, |
| "epoch": 0.04, |
| "grad_norm": 0.0184326171875, |
| "learning_rate": 9.800000000000001e-07, |
| "loss": 2.8945, |
| "mean_token_accuracy": 0.48235367350280284, |
| "num_tokens": 835868.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 2.004745631664991, |
| "epoch": 0.048, |
| "grad_norm": 0.01055908203125, |
| "learning_rate": 1.1800000000000001e-06, |
| "loss": 2.7943, |
| "mean_token_accuracy": 0.4953182445839047, |
| "num_tokens": 1001008.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 2.0760968685150147, |
| "epoch": 0.056, |
| "grad_norm": 0.013427734375, |
| "learning_rate": 1.3800000000000001e-06, |
| "loss": 2.8689, |
| "mean_token_accuracy": 0.48274643896147607, |
| "num_tokens": 1166615.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 2.1421599693596365, |
| "epoch": 0.064, |
| "grad_norm": 0.01226806640625, |
| "learning_rate": 1.5800000000000001e-06, |
| "loss": 2.927, |
| "mean_token_accuracy": 0.47564963400363924, |
| "num_tokens": 1331208.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 2.0629068821668626, |
| "epoch": 0.072, |
| "grad_norm": 0.00994873046875, |
| "learning_rate": 1.7800000000000001e-06, |
| "loss": 2.8669, |
| "mean_token_accuracy": 0.48655857108533385, |
| "num_tokens": 1499449.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 2.1223726868629456, |
| "epoch": 0.08, |
| "grad_norm": 0.0184326171875, |
| "learning_rate": 1.98e-06, |
| "loss": 2.8834, |
| "mean_token_accuracy": 0.48028963077813386, |
| "num_tokens": 1667532.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.08, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 1667532.0, |
| "eval_coding_runtime": 96.0471, |
| "eval_coding_samples_per_second": 5.206, |
| "eval_coding_steps_per_second": 2.603, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.08, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 1667532.0, |
| "eval_physics_runtime": 63.6847, |
| "eval_physics_samples_per_second": 7.851, |
| "eval_physics_steps_per_second": 3.926, |
| "step": 100 |
| }, |
| { |
| "entropy": 2.028460328280926, |
| "epoch": 0.088, |
| "grad_norm": 0.00732421875, |
| "learning_rate": 2.1800000000000003e-06, |
| "loss": 2.7998, |
| "mean_token_accuracy": 0.4919602179899812, |
| "num_tokens": 1835489.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 2.0500536493957044, |
| "epoch": 0.096, |
| "grad_norm": 0.0166015625, |
| "learning_rate": 2.38e-06, |
| "loss": 2.843, |
| "mean_token_accuracy": 0.486666096188128, |
| "num_tokens": 2007176.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 2.133950074762106, |
| "epoch": 0.104, |
| "grad_norm": 0.00897216796875, |
| "learning_rate": 2.5800000000000003e-06, |
| "loss": 2.908, |
| "mean_token_accuracy": 0.47787463404238223, |
| "num_tokens": 2171335.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 2.0415130145847797, |
| "epoch": 0.112, |
| "grad_norm": 0.0125732421875, |
| "learning_rate": 2.7800000000000005e-06, |
| "loss": 2.8213, |
| "mean_token_accuracy": 0.48844432439655067, |
| "num_tokens": 2342259.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 2.19760585129261, |
| "epoch": 0.12, |
| "grad_norm": 0.01519775390625, |
| "learning_rate": 2.9800000000000003e-06, |
| "loss": 2.9587, |
| "mean_token_accuracy": 0.4719714432954788, |
| "num_tokens": 2504279.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 2.133862767368555, |
| "epoch": 0.128, |
| "grad_norm": 0.0101318359375, |
| "learning_rate": 3.1800000000000005e-06, |
| "loss": 2.895, |
| "mean_token_accuracy": 0.48206167966127395, |
| "num_tokens": 2663987.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 2.1023842833936213, |
| "epoch": 0.136, |
| "grad_norm": 0.00836181640625, |
| "learning_rate": 3.3800000000000007e-06, |
| "loss": 2.8679, |
| "mean_token_accuracy": 0.48314376436173917, |
| "num_tokens": 2829720.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 2.096611645817757, |
| "epoch": 0.144, |
| "grad_norm": 0.016845703125, |
| "learning_rate": 3.58e-06, |
| "loss": 2.882, |
| "mean_token_accuracy": 0.4854999572038651, |
| "num_tokens": 2997824.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 2.104377479106188, |
| "epoch": 0.152, |
| "grad_norm": 0.00860595703125, |
| "learning_rate": 3.7800000000000002e-06, |
| "loss": 2.8908, |
| "mean_token_accuracy": 0.48179021421819923, |
| "num_tokens": 3159946.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 2.091322401165962, |
| "epoch": 0.16, |
| "grad_norm": 0.00927734375, |
| "learning_rate": 3.980000000000001e-06, |
| "loss": 2.8428, |
| "mean_token_accuracy": 0.4882024429738522, |
| "num_tokens": 3330597.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.16, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 3330597.0, |
| "eval_coding_runtime": 95.6196, |
| "eval_coding_samples_per_second": 5.229, |
| "eval_coding_steps_per_second": 2.615, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.16, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 3330597.0, |
| "eval_physics_runtime": 63.6492, |
| "eval_physics_samples_per_second": 7.856, |
| "eval_physics_steps_per_second": 3.928, |
| "step": 200 |
| }, |
| { |
| "entropy": 2.142216607928276, |
| "epoch": 0.168, |
| "grad_norm": 0.00701904296875, |
| "learning_rate": 4.18e-06, |
| "loss": 2.9186, |
| "mean_token_accuracy": 0.4779110599309206, |
| "num_tokens": 3496104.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 2.1577497258782388, |
| "epoch": 0.176, |
| "grad_norm": 0.010498046875, |
| "learning_rate": 4.38e-06, |
| "loss": 2.9356, |
| "mean_token_accuracy": 0.4760912239551544, |
| "num_tokens": 3658264.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 2.1311790965497495, |
| "epoch": 0.184, |
| "grad_norm": 0.006317138671875, |
| "learning_rate": 4.58e-06, |
| "loss": 2.8804, |
| "mean_token_accuracy": 0.48113634046167136, |
| "num_tokens": 3821352.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 2.110344947129488, |
| "epoch": 0.192, |
| "grad_norm": 0.018798828125, |
| "learning_rate": 4.78e-06, |
| "loss": 2.8458, |
| "mean_token_accuracy": 0.4866057766601443, |
| "num_tokens": 3995568.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 2.1274008348584177, |
| "epoch": 0.2, |
| "grad_norm": 0.00970458984375, |
| "learning_rate": 4.980000000000001e-06, |
| "loss": 2.9325, |
| "mean_token_accuracy": 0.47856651023030283, |
| "num_tokens": 4160918.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 2.0445496000349523, |
| "epoch": 0.208, |
| "grad_norm": 0.0072021484375, |
| "learning_rate": 5.18e-06, |
| "loss": 2.7895, |
| "mean_token_accuracy": 0.49246302992105484, |
| "num_tokens": 4321436.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 2.118650445342064, |
| "epoch": 0.216, |
| "grad_norm": 0.0118408203125, |
| "learning_rate": 5.380000000000001e-06, |
| "loss": 2.831, |
| "mean_token_accuracy": 0.48565028738230465, |
| "num_tokens": 4482225.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 2.0710766136646273, |
| "epoch": 0.224, |
| "grad_norm": 0.00823974609375, |
| "learning_rate": 5.580000000000001e-06, |
| "loss": 2.8524, |
| "mean_token_accuracy": 0.4889204766601324, |
| "num_tokens": 4648491.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 2.0747790560126305, |
| "epoch": 0.232, |
| "grad_norm": 0.009033203125, |
| "learning_rate": 5.78e-06, |
| "loss": 2.8736, |
| "mean_token_accuracy": 0.4836031161248684, |
| "num_tokens": 4813449.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 2.1116694904863835, |
| "epoch": 0.24, |
| "grad_norm": 0.01141357421875, |
| "learning_rate": 5.98e-06, |
| "loss": 2.8974, |
| "mean_token_accuracy": 0.47815693244338037, |
| "num_tokens": 4986175.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.24, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 4986175.0, |
| "eval_coding_runtime": 95.7299, |
| "eval_coding_samples_per_second": 5.223, |
| "eval_coding_steps_per_second": 2.612, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.24, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 4986175.0, |
| "eval_physics_runtime": 63.6784, |
| "eval_physics_samples_per_second": 7.852, |
| "eval_physics_steps_per_second": 3.926, |
| "step": 300 |
| }, |
| { |
| "entropy": 2.085676248371601, |
| "epoch": 0.248, |
| "grad_norm": 0.01513671875, |
| "learning_rate": 6.18e-06, |
| "loss": 2.8547, |
| "mean_token_accuracy": 0.4871716873720288, |
| "num_tokens": 5164741.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 2.111249604821205, |
| "epoch": 0.256, |
| "grad_norm": 0.01324462890625, |
| "learning_rate": 6.380000000000001e-06, |
| "loss": 2.869, |
| "mean_token_accuracy": 0.482149850204587, |
| "num_tokens": 5329320.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 2.0735500395298003, |
| "epoch": 0.264, |
| "grad_norm": 0.01300048828125, |
| "learning_rate": 6.5800000000000005e-06, |
| "loss": 2.815, |
| "mean_token_accuracy": 0.4895795300602913, |
| "num_tokens": 5497259.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 2.1957527093589304, |
| "epoch": 0.272, |
| "grad_norm": 0.0155029296875, |
| "learning_rate": 6.780000000000001e-06, |
| "loss": 2.9296, |
| "mean_token_accuracy": 0.4737250735983253, |
| "num_tokens": 5658796.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 2.0978702254593373, |
| "epoch": 0.28, |
| "grad_norm": 0.012939453125, |
| "learning_rate": 6.98e-06, |
| "loss": 2.9223, |
| "mean_token_accuracy": 0.47742239478975534, |
| "num_tokens": 5819027.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 2.126638425886631, |
| "epoch": 0.288, |
| "grad_norm": 0.01104736328125, |
| "learning_rate": 7.180000000000001e-06, |
| "loss": 2.8822, |
| "mean_token_accuracy": 0.48392471522092817, |
| "num_tokens": 5980201.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 2.1021372735500337, |
| "epoch": 0.296, |
| "grad_norm": 0.0208740234375, |
| "learning_rate": 7.3800000000000005e-06, |
| "loss": 2.8638, |
| "mean_token_accuracy": 0.4845348121598363, |
| "num_tokens": 6139830.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 2.1106897443532944, |
| "epoch": 0.304, |
| "grad_norm": 0.00787353515625, |
| "learning_rate": 7.58e-06, |
| "loss": 2.8357, |
| "mean_token_accuracy": 0.4834788400679827, |
| "num_tokens": 6298858.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 2.143523923307657, |
| "epoch": 0.312, |
| "grad_norm": 0.00958251953125, |
| "learning_rate": 7.78e-06, |
| "loss": 2.9236, |
| "mean_token_accuracy": 0.47671638112515213, |
| "num_tokens": 6456502.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 2.1132052429020405, |
| "epoch": 0.32, |
| "grad_norm": 0.01336669921875, |
| "learning_rate": 7.980000000000002e-06, |
| "loss": 2.8795, |
| "mean_token_accuracy": 0.48184755612164737, |
| "num_tokens": 6622798.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.32, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 6622798.0, |
| "eval_coding_runtime": 95.588, |
| "eval_coding_samples_per_second": 5.231, |
| "eval_coding_steps_per_second": 2.615, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.32, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 6622798.0, |
| "eval_physics_runtime": 63.5621, |
| "eval_physics_samples_per_second": 7.866, |
| "eval_physics_steps_per_second": 3.933, |
| "step": 400 |
| }, |
| { |
| "entropy": 2.1359100699424745, |
| "epoch": 0.328, |
| "grad_norm": 0.01220703125, |
| "learning_rate": 8.18e-06, |
| "loss": 2.892, |
| "mean_token_accuracy": 0.4762568075209856, |
| "num_tokens": 6782985.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 2.080799935385585, |
| "epoch": 0.336, |
| "grad_norm": 0.021484375, |
| "learning_rate": 8.380000000000001e-06, |
| "loss": 2.8705, |
| "mean_token_accuracy": 0.4826640056446195, |
| "num_tokens": 6953611.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 2.0621030911803246, |
| "epoch": 0.344, |
| "grad_norm": 0.013427734375, |
| "learning_rate": 8.580000000000001e-06, |
| "loss": 2.777, |
| "mean_token_accuracy": 0.48948597013950346, |
| "num_tokens": 7121949.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 2.0954283088445664, |
| "epoch": 0.352, |
| "grad_norm": 0.01080322265625, |
| "learning_rate": 8.78e-06, |
| "loss": 2.8529, |
| "mean_token_accuracy": 0.48598715253174307, |
| "num_tokens": 7290378.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 2.1145325861871243, |
| "epoch": 0.36, |
| "grad_norm": 0.01611328125, |
| "learning_rate": 8.98e-06, |
| "loss": 2.864, |
| "mean_token_accuracy": 0.48475142791867254, |
| "num_tokens": 7452920.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 2.0749227844178675, |
| "epoch": 0.368, |
| "grad_norm": 0.0255126953125, |
| "learning_rate": 9.180000000000002e-06, |
| "loss": 2.842, |
| "mean_token_accuracy": 0.4888931142166257, |
| "num_tokens": 7621082.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 2.158674482256174, |
| "epoch": 0.376, |
| "grad_norm": 0.0189208984375, |
| "learning_rate": 9.38e-06, |
| "loss": 2.8965, |
| "mean_token_accuracy": 0.47893171701580284, |
| "num_tokens": 7787036.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 2.172913119196892, |
| "epoch": 0.384, |
| "grad_norm": 0.0211181640625, |
| "learning_rate": 9.58e-06, |
| "loss": 2.9484, |
| "mean_token_accuracy": 0.47502864580601456, |
| "num_tokens": 7955570.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 2.133708968013525, |
| "epoch": 0.392, |
| "grad_norm": 0.0089111328125, |
| "learning_rate": 9.780000000000001e-06, |
| "loss": 2.9115, |
| "mean_token_accuracy": 0.481097811833024, |
| "num_tokens": 8123527.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 2.075808613747358, |
| "epoch": 0.4, |
| "grad_norm": 0.01458740234375, |
| "learning_rate": 9.980000000000001e-06, |
| "loss": 2.8509, |
| "mean_token_accuracy": 0.4880217058584094, |
| "num_tokens": 8291049.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.4, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 8291049.0, |
| "eval_coding_runtime": 95.8745, |
| "eval_coding_samples_per_second": 5.215, |
| "eval_coding_steps_per_second": 2.608, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.4, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 8291049.0, |
| "eval_physics_runtime": 63.2475, |
| "eval_physics_samples_per_second": 7.905, |
| "eval_physics_steps_per_second": 3.953, |
| "step": 500 |
| }, |
| { |
| "entropy": 2.0808891989290714, |
| "epoch": 0.408, |
| "grad_norm": 0.00933837890625, |
| "learning_rate": 1.018e-05, |
| "loss": 2.8602, |
| "mean_token_accuracy": 0.48418706730008126, |
| "num_tokens": 8459795.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 2.108520568162203, |
| "epoch": 0.416, |
| "grad_norm": 0.01300048828125, |
| "learning_rate": 1.038e-05, |
| "loss": 2.8709, |
| "mean_token_accuracy": 0.4807485779747367, |
| "num_tokens": 8627310.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 2.095228637009859, |
| "epoch": 0.424, |
| "grad_norm": 0.0186767578125, |
| "learning_rate": 1.0580000000000002e-05, |
| "loss": 2.8575, |
| "mean_token_accuracy": 0.4873119568452239, |
| "num_tokens": 8792574.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 2.0733875431120397, |
| "epoch": 0.432, |
| "grad_norm": 0.0152587890625, |
| "learning_rate": 1.0780000000000002e-05, |
| "loss": 2.8746, |
| "mean_token_accuracy": 0.48152947407215835, |
| "num_tokens": 8958371.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 2.0743850864470006, |
| "epoch": 0.44, |
| "grad_norm": 0.0142822265625, |
| "learning_rate": 1.0980000000000002e-05, |
| "loss": 2.8417, |
| "mean_token_accuracy": 0.48614103216677906, |
| "num_tokens": 9125564.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 2.0296142108738424, |
| "epoch": 0.448, |
| "grad_norm": 0.00909423828125, |
| "learning_rate": 1.1180000000000001e-05, |
| "loss": 2.8092, |
| "mean_token_accuracy": 0.49211322125047446, |
| "num_tokens": 9294388.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 2.042448101937771, |
| "epoch": 0.456, |
| "grad_norm": 0.00897216796875, |
| "learning_rate": 1.138e-05, |
| "loss": 2.8196, |
| "mean_token_accuracy": 0.4887328814715147, |
| "num_tokens": 9455428.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 2.124661027640104, |
| "epoch": 0.464, |
| "grad_norm": 0.0172119140625, |
| "learning_rate": 1.1580000000000001e-05, |
| "loss": 2.8825, |
| "mean_token_accuracy": 0.4844032688066363, |
| "num_tokens": 9620366.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 2.0920108027756212, |
| "epoch": 0.472, |
| "grad_norm": 0.01458740234375, |
| "learning_rate": 1.178e-05, |
| "loss": 2.8011, |
| "mean_token_accuracy": 0.49246231019496917, |
| "num_tokens": 9788925.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 2.0895667865872385, |
| "epoch": 0.48, |
| "grad_norm": 0.007568359375, |
| "learning_rate": 1.198e-05, |
| "loss": 2.8784, |
| "mean_token_accuracy": 0.4846471020951867, |
| "num_tokens": 9955431.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.48, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 9955431.0, |
| "eval_coding_runtime": 95.6269, |
| "eval_coding_samples_per_second": 5.229, |
| "eval_coding_steps_per_second": 2.614, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.48, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 9955431.0, |
| "eval_physics_runtime": 63.6584, |
| "eval_physics_samples_per_second": 7.854, |
| "eval_physics_steps_per_second": 3.927, |
| "step": 600 |
| }, |
| { |
| "entropy": 2.1037806943058968, |
| "epoch": 0.488, |
| "grad_norm": 0.01434326171875, |
| "learning_rate": 1.218e-05, |
| "loss": 2.8897, |
| "mean_token_accuracy": 0.48129043709486724, |
| "num_tokens": 10119599.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 2.1045577257871626, |
| "epoch": 0.496, |
| "grad_norm": 0.02294921875, |
| "learning_rate": 1.2380000000000002e-05, |
| "loss": 2.8986, |
| "mean_token_accuracy": 0.4801080621778965, |
| "num_tokens": 10289883.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 2.1075518101453783, |
| "epoch": 0.504, |
| "grad_norm": 0.007354736328125, |
| "learning_rate": 1.2580000000000002e-05, |
| "loss": 2.8519, |
| "mean_token_accuracy": 0.4833444032818079, |
| "num_tokens": 10454683.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 2.106730377674103, |
| "epoch": 0.512, |
| "grad_norm": 0.00823974609375, |
| "learning_rate": 1.2780000000000001e-05, |
| "loss": 2.9125, |
| "mean_token_accuracy": 0.4791097413748503, |
| "num_tokens": 10619468.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 2.149070845544338, |
| "epoch": 0.52, |
| "grad_norm": 0.017822265625, |
| "learning_rate": 1.2980000000000001e-05, |
| "loss": 2.9383, |
| "mean_token_accuracy": 0.4766066731885076, |
| "num_tokens": 10781285.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 2.109221810847521, |
| "epoch": 0.528, |
| "grad_norm": 0.01123046875, |
| "learning_rate": 1.3180000000000001e-05, |
| "loss": 2.8512, |
| "mean_token_accuracy": 0.48691278155893086, |
| "num_tokens": 10944904.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 2.10501976236701, |
| "epoch": 0.536, |
| "grad_norm": 0.0079345703125, |
| "learning_rate": 1.3380000000000002e-05, |
| "loss": 2.8583, |
| "mean_token_accuracy": 0.48396560084074736, |
| "num_tokens": 11106538.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 2.1422630779445173, |
| "epoch": 0.544, |
| "grad_norm": 0.013916015625, |
| "learning_rate": 1.3580000000000002e-05, |
| "loss": 2.9198, |
| "mean_token_accuracy": 0.4756466532126069, |
| "num_tokens": 11266409.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 2.066490865498781, |
| "epoch": 0.552, |
| "grad_norm": 0.0120849609375, |
| "learning_rate": 1.378e-05, |
| "loss": 2.8749, |
| "mean_token_accuracy": 0.48253467567265035, |
| "num_tokens": 11438391.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 2.1218110136687756, |
| "epoch": 0.56, |
| "grad_norm": 0.00811767578125, |
| "learning_rate": 1.398e-05, |
| "loss": 2.8685, |
| "mean_token_accuracy": 0.4830782825127244, |
| "num_tokens": 11605544.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.56, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 11605544.0, |
| "eval_coding_runtime": 95.6063, |
| "eval_coding_samples_per_second": 5.23, |
| "eval_coding_steps_per_second": 2.615, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.56, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 11605544.0, |
| "eval_physics_runtime": 63.652, |
| "eval_physics_samples_per_second": 7.855, |
| "eval_physics_steps_per_second": 3.928, |
| "step": 700 |
| }, |
| { |
| "entropy": 2.0374118871986866, |
| "epoch": 0.568, |
| "grad_norm": 0.0164794921875, |
| "learning_rate": 1.418e-05, |
| "loss": 2.8443, |
| "mean_token_accuracy": 0.48784226551651955, |
| "num_tokens": 11765430.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 2.079664882272482, |
| "epoch": 0.576, |
| "grad_norm": 0.01336669921875, |
| "learning_rate": 1.4380000000000001e-05, |
| "loss": 2.8436, |
| "mean_token_accuracy": 0.4859828842803836, |
| "num_tokens": 11931715.0, |
| "step": 720 |
| }, |
| { |
| "entropy": 2.091735219210386, |
| "epoch": 0.584, |
| "grad_norm": 0.00836181640625, |
| "learning_rate": 1.4580000000000001e-05, |
| "loss": 2.891, |
| "mean_token_accuracy": 0.4845219163224101, |
| "num_tokens": 12093686.0, |
| "step": 730 |
| }, |
| { |
| "entropy": 2.092084125429392, |
| "epoch": 0.592, |
| "grad_norm": 0.01153564453125, |
| "learning_rate": 1.478e-05, |
| "loss": 2.8884, |
| "mean_token_accuracy": 0.4797184634953737, |
| "num_tokens": 12257243.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 2.138279449939728, |
| "epoch": 0.6, |
| "grad_norm": 0.01043701171875, |
| "learning_rate": 1.498e-05, |
| "loss": 2.8937, |
| "mean_token_accuracy": 0.4808414502069354, |
| "num_tokens": 12422008.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 2.025119251012802, |
| "epoch": 0.608, |
| "grad_norm": 0.01422119140625, |
| "learning_rate": 1.5180000000000002e-05, |
| "loss": 2.801, |
| "mean_token_accuracy": 0.4929833130910993, |
| "num_tokens": 12583154.0, |
| "step": 760 |
| }, |
| { |
| "entropy": 2.068293011188507, |
| "epoch": 0.616, |
| "grad_norm": 0.018798828125, |
| "learning_rate": 1.5380000000000002e-05, |
| "loss": 2.8332, |
| "mean_token_accuracy": 0.4890221064910293, |
| "num_tokens": 12746458.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 2.1028603583574297, |
| "epoch": 0.624, |
| "grad_norm": 0.009033203125, |
| "learning_rate": 1.5580000000000003e-05, |
| "loss": 2.9238, |
| "mean_token_accuracy": 0.4755176778882742, |
| "num_tokens": 12905392.0, |
| "step": 780 |
| }, |
| { |
| "entropy": 2.103465436398983, |
| "epoch": 0.632, |
| "grad_norm": 0.012939453125, |
| "learning_rate": 1.578e-05, |
| "loss": 2.8914, |
| "mean_token_accuracy": 0.48264262825250626, |
| "num_tokens": 13071058.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 2.107102530449629, |
| "epoch": 0.64, |
| "grad_norm": 0.01373291015625, |
| "learning_rate": 1.5980000000000003e-05, |
| "loss": 2.8937, |
| "mean_token_accuracy": 0.4812551226466894, |
| "num_tokens": 13232198.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.64, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 13232198.0, |
| "eval_coding_runtime": 95.5879, |
| "eval_coding_samples_per_second": 5.231, |
| "eval_coding_steps_per_second": 2.615, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.64, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 13232198.0, |
| "eval_physics_runtime": 63.5241, |
| "eval_physics_samples_per_second": 7.871, |
| "eval_physics_steps_per_second": 3.936, |
| "step": 800 |
| }, |
| { |
| "entropy": 2.1226571217179298, |
| "epoch": 0.648, |
| "grad_norm": 0.010009765625, |
| "learning_rate": 1.618e-05, |
| "loss": 2.8715, |
| "mean_token_accuracy": 0.48374479077756405, |
| "num_tokens": 13401064.0, |
| "step": 810 |
| }, |
| { |
| "entropy": 2.071519161760807, |
| "epoch": 0.656, |
| "grad_norm": 0.007476806640625, |
| "learning_rate": 1.638e-05, |
| "loss": 2.8209, |
| "mean_token_accuracy": 0.48990428987890483, |
| "num_tokens": 13575902.0, |
| "step": 820 |
| }, |
| { |
| "entropy": 2.1139444075524807, |
| "epoch": 0.664, |
| "grad_norm": 0.008544921875, |
| "learning_rate": 1.658e-05, |
| "loss": 2.9185, |
| "mean_token_accuracy": 0.48024304024875164, |
| "num_tokens": 13733192.0, |
| "step": 830 |
| }, |
| { |
| "entropy": 2.1097038336098195, |
| "epoch": 0.672, |
| "grad_norm": 0.0201416015625, |
| "learning_rate": 1.6780000000000002e-05, |
| "loss": 2.856, |
| "mean_token_accuracy": 0.48414181135594847, |
| "num_tokens": 13895997.0, |
| "step": 840 |
| }, |
| { |
| "entropy": 2.0350618422031403, |
| "epoch": 0.68, |
| "grad_norm": 0.007354736328125, |
| "learning_rate": 1.698e-05, |
| "loss": 2.8006, |
| "mean_token_accuracy": 0.49249137472361326, |
| "num_tokens": 14074579.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 2.1360582523047924, |
| "epoch": 0.688, |
| "grad_norm": 0.012939453125, |
| "learning_rate": 1.718e-05, |
| "loss": 2.9135, |
| "mean_token_accuracy": 0.48240375574678185, |
| "num_tokens": 14234888.0, |
| "step": 860 |
| }, |
| { |
| "entropy": 2.0854096628725527, |
| "epoch": 0.696, |
| "grad_norm": 0.0159912109375, |
| "learning_rate": 1.7380000000000003e-05, |
| "loss": 2.8889, |
| "mean_token_accuracy": 0.48132998365908863, |
| "num_tokens": 14411150.0, |
| "step": 870 |
| }, |
| { |
| "entropy": 2.1598948813974856, |
| "epoch": 0.704, |
| "grad_norm": 0.0067138671875, |
| "learning_rate": 1.758e-05, |
| "loss": 2.941, |
| "mean_token_accuracy": 0.47370197977870704, |
| "num_tokens": 14567908.0, |
| "step": 880 |
| }, |
| { |
| "entropy": 2.1014511175453663, |
| "epoch": 0.712, |
| "grad_norm": 0.01458740234375, |
| "learning_rate": 1.7780000000000003e-05, |
| "loss": 2.8989, |
| "mean_token_accuracy": 0.4840679431334138, |
| "num_tokens": 14728975.0, |
| "step": 890 |
| }, |
| { |
| "entropy": 2.128746274113655, |
| "epoch": 0.72, |
| "grad_norm": 0.01361083984375, |
| "learning_rate": 1.798e-05, |
| "loss": 2.8664, |
| "mean_token_accuracy": 0.4867586812004447, |
| "num_tokens": 14882927.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.72, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 14882927.0, |
| "eval_coding_runtime": 95.8303, |
| "eval_coding_samples_per_second": 5.218, |
| "eval_coding_steps_per_second": 2.609, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.72, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 14882927.0, |
| "eval_physics_runtime": 63.6666, |
| "eval_physics_samples_per_second": 7.853, |
| "eval_physics_steps_per_second": 3.927, |
| "step": 900 |
| }, |
| { |
| "entropy": 2.0901576526463033, |
| "epoch": 0.728, |
| "grad_norm": 0.0067138671875, |
| "learning_rate": 1.8180000000000002e-05, |
| "loss": 2.8708, |
| "mean_token_accuracy": 0.48403808698058126, |
| "num_tokens": 15052770.0, |
| "step": 910 |
| }, |
| { |
| "entropy": 2.122075318545103, |
| "epoch": 0.736, |
| "grad_norm": 0.01129150390625, |
| "learning_rate": 1.8380000000000004e-05, |
| "loss": 2.8633, |
| "mean_token_accuracy": 0.4852804895490408, |
| "num_tokens": 15217342.0, |
| "step": 920 |
| }, |
| { |
| "entropy": 2.093779005110264, |
| "epoch": 0.744, |
| "grad_norm": 0.0079345703125, |
| "learning_rate": 1.858e-05, |
| "loss": 2.8451, |
| "mean_token_accuracy": 0.48721388857811687, |
| "num_tokens": 15385161.0, |
| "step": 930 |
| }, |
| { |
| "entropy": 2.131568420678377, |
| "epoch": 0.752, |
| "grad_norm": 0.0096435546875, |
| "learning_rate": 1.878e-05, |
| "loss": 2.9118, |
| "mean_token_accuracy": 0.479784238897264, |
| "num_tokens": 15547710.0, |
| "step": 940 |
| }, |
| { |
| "entropy": 2.18331568017602, |
| "epoch": 0.76, |
| "grad_norm": 0.007171630859375, |
| "learning_rate": 1.898e-05, |
| "loss": 2.9565, |
| "mean_token_accuracy": 0.4732973873615265, |
| "num_tokens": 15704494.0, |
| "step": 950 |
| }, |
| { |
| "entropy": 2.1320055924355983, |
| "epoch": 0.768, |
| "grad_norm": 0.01177978515625, |
| "learning_rate": 1.918e-05, |
| "loss": 2.8867, |
| "mean_token_accuracy": 0.4821720516309142, |
| "num_tokens": 15877177.0, |
| "step": 960 |
| }, |
| { |
| "entropy": 2.035961114615202, |
| "epoch": 0.776, |
| "grad_norm": 0.008544921875, |
| "learning_rate": 1.938e-05, |
| "loss": 2.8155, |
| "mean_token_accuracy": 0.4874297235161066, |
| "num_tokens": 16050530.0, |
| "step": 970 |
| }, |
| { |
| "entropy": 2.0721310496330263, |
| "epoch": 0.784, |
| "grad_norm": 0.01214599609375, |
| "learning_rate": 1.9580000000000002e-05, |
| "loss": 2.8311, |
| "mean_token_accuracy": 0.4866502169519663, |
| "num_tokens": 16219640.0, |
| "step": 980 |
| }, |
| { |
| "entropy": 2.1055771827697756, |
| "epoch": 0.792, |
| "grad_norm": 0.00927734375, |
| "learning_rate": 1.978e-05, |
| "loss": 2.8748, |
| "mean_token_accuracy": 0.4852980706840754, |
| "num_tokens": 16378820.0, |
| "step": 990 |
| }, |
| { |
| "entropy": 2.0726777322590353, |
| "epoch": 0.8, |
| "grad_norm": 0.01348876953125, |
| "learning_rate": 1.9980000000000002e-05, |
| "loss": 2.8588, |
| "mean_token_accuracy": 0.4857082888484001, |
| "num_tokens": 16548261.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.8, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 16548261.0, |
| "eval_coding_runtime": 95.9331, |
| "eval_coding_samples_per_second": 5.212, |
| "eval_coding_steps_per_second": 2.606, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.8, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 16548261.0, |
| "eval_physics_runtime": 63.1527, |
| "eval_physics_samples_per_second": 7.917, |
| "eval_physics_steps_per_second": 3.959, |
| "step": 1000 |
| }, |
| { |
| "entropy": 2.1674983091652393, |
| "epoch": 0.808, |
| "grad_norm": 0.0155029296875, |
| "learning_rate": 1.9980000000000002e-05, |
| "loss": 2.9179, |
| "mean_token_accuracy": 0.4778887365013361, |
| "num_tokens": 16711667.0, |
| "step": 1010 |
| }, |
| { |
| "entropy": 2.082773020863533, |
| "epoch": 0.816, |
| "grad_norm": 0.00738525390625, |
| "learning_rate": 1.995777777777778e-05, |
| "loss": 2.8647, |
| "mean_token_accuracy": 0.48503445759415625, |
| "num_tokens": 16870533.0, |
| "step": 1020 |
| }, |
| { |
| "entropy": 2.1195092521607877, |
| "epoch": 0.824, |
| "grad_norm": 0.0084228515625, |
| "learning_rate": 1.9935555555555557e-05, |
| "loss": 2.8734, |
| "mean_token_accuracy": 0.4803088143467903, |
| "num_tokens": 17035600.0, |
| "step": 1030 |
| }, |
| { |
| "entropy": 2.1006917230784894, |
| "epoch": 0.832, |
| "grad_norm": 0.01318359375, |
| "learning_rate": 1.9913333333333335e-05, |
| "loss": 2.8807, |
| "mean_token_accuracy": 0.48227673936635257, |
| "num_tokens": 17196371.0, |
| "step": 1040 |
| }, |
| { |
| "entropy": 2.1442659996449946, |
| "epoch": 0.84, |
| "grad_norm": 0.01171875, |
| "learning_rate": 1.9891111111111112e-05, |
| "loss": 2.891, |
| "mean_token_accuracy": 0.48074845802038907, |
| "num_tokens": 17356919.0, |
| "step": 1050 |
| }, |
| { |
| "entropy": 2.0704655081033705, |
| "epoch": 0.848, |
| "grad_norm": 0.0177001953125, |
| "learning_rate": 1.986888888888889e-05, |
| "loss": 2.8486, |
| "mean_token_accuracy": 0.4880160320550203, |
| "num_tokens": 17527066.0, |
| "step": 1060 |
| }, |
| { |
| "entropy": 2.125109201669693, |
| "epoch": 0.856, |
| "grad_norm": 0.00823974609375, |
| "learning_rate": 1.9846666666666668e-05, |
| "loss": 2.8795, |
| "mean_token_accuracy": 0.4847541594877839, |
| "num_tokens": 17689902.0, |
| "step": 1070 |
| }, |
| { |
| "entropy": 2.1320445723831654, |
| "epoch": 0.864, |
| "grad_norm": 0.0140380859375, |
| "learning_rate": 1.9824444444444445e-05, |
| "loss": 2.8809, |
| "mean_token_accuracy": 0.480629575625062, |
| "num_tokens": 17859605.0, |
| "step": 1080 |
| }, |
| { |
| "entropy": 2.1207774586975576, |
| "epoch": 0.872, |
| "grad_norm": 0.0140380859375, |
| "learning_rate": 1.9802222222222226e-05, |
| "loss": 2.8729, |
| "mean_token_accuracy": 0.4811369523406029, |
| "num_tokens": 18019924.0, |
| "step": 1090 |
| }, |
| { |
| "entropy": 2.1364660277962684, |
| "epoch": 0.88, |
| "grad_norm": 0.01361083984375, |
| "learning_rate": 1.978e-05, |
| "loss": 2.8681, |
| "mean_token_accuracy": 0.4821885580196977, |
| "num_tokens": 18187315.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.88, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 18187315.0, |
| "eval_coding_runtime": 95.6457, |
| "eval_coding_samples_per_second": 5.228, |
| "eval_coding_steps_per_second": 2.614, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.88, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 18187315.0, |
| "eval_physics_runtime": 63.6627, |
| "eval_physics_samples_per_second": 7.854, |
| "eval_physics_steps_per_second": 3.927, |
| "step": 1100 |
| }, |
| { |
| "entropy": 1.9653398767113686, |
| "epoch": 0.888, |
| "grad_norm": 0.009033203125, |
| "learning_rate": 1.975777777777778e-05, |
| "loss": 2.7728, |
| "mean_token_accuracy": 0.4965917535126209, |
| "num_tokens": 18366329.0, |
| "step": 1110 |
| }, |
| { |
| "entropy": 2.0933822274208067, |
| "epoch": 0.896, |
| "grad_norm": 0.0191650390625, |
| "learning_rate": 1.9735555555555556e-05, |
| "loss": 2.8728, |
| "mean_token_accuracy": 0.4872217344120145, |
| "num_tokens": 18536790.0, |
| "step": 1120 |
| }, |
| { |
| "entropy": 2.1074513986706735, |
| "epoch": 0.904, |
| "grad_norm": 0.007049560546875, |
| "learning_rate": 1.9713333333333337e-05, |
| "loss": 2.9176, |
| "mean_token_accuracy": 0.48107887227088214, |
| "num_tokens": 18701809.0, |
| "step": 1130 |
| }, |
| { |
| "entropy": 2.160377311706543, |
| "epoch": 0.912, |
| "grad_norm": 0.006683349609375, |
| "learning_rate": 1.969111111111111e-05, |
| "loss": 2.9384, |
| "mean_token_accuracy": 0.4739804005250335, |
| "num_tokens": 18869344.0, |
| "step": 1140 |
| }, |
| { |
| "entropy": 2.0753089264035225, |
| "epoch": 0.92, |
| "grad_norm": 0.01416015625, |
| "learning_rate": 1.9668888888888892e-05, |
| "loss": 2.882, |
| "mean_token_accuracy": 0.48446453278884294, |
| "num_tokens": 19033910.0, |
| "step": 1150 |
| }, |
| { |
| "entropy": 2.1365489259362223, |
| "epoch": 0.928, |
| "grad_norm": 0.006988525390625, |
| "learning_rate": 1.9646666666666666e-05, |
| "loss": 2.8854, |
| "mean_token_accuracy": 0.4823284359648824, |
| "num_tokens": 19199834.0, |
| "step": 1160 |
| }, |
| { |
| "entropy": 2.0710911326110364, |
| "epoch": 0.936, |
| "grad_norm": 0.007080078125, |
| "learning_rate": 1.9624444444444447e-05, |
| "loss": 2.8127, |
| "mean_token_accuracy": 0.48797463029623034, |
| "num_tokens": 19361703.0, |
| "step": 1170 |
| }, |
| { |
| "entropy": 2.116609390079975, |
| "epoch": 0.944, |
| "grad_norm": 0.007110595703125, |
| "learning_rate": 1.9602222222222225e-05, |
| "loss": 2.8751, |
| "mean_token_accuracy": 0.48552187960594895, |
| "num_tokens": 19525385.0, |
| "step": 1180 |
| }, |
| { |
| "entropy": 2.0949166089296343, |
| "epoch": 0.952, |
| "grad_norm": 0.03076171875, |
| "learning_rate": 1.9580000000000002e-05, |
| "loss": 2.8499, |
| "mean_token_accuracy": 0.48639670219272374, |
| "num_tokens": 19690715.0, |
| "step": 1190 |
| }, |
| { |
| "entropy": 2.1095651358366014, |
| "epoch": 0.96, |
| "grad_norm": 0.015380859375, |
| "learning_rate": 1.955777777777778e-05, |
| "loss": 2.8706, |
| "mean_token_accuracy": 0.48429451026022435, |
| "num_tokens": 19856973.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.96, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 19856973.0, |
| "eval_coding_runtime": 95.6684, |
| "eval_coding_samples_per_second": 5.226, |
| "eval_coding_steps_per_second": 2.613, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.96, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 19856973.0, |
| "eval_physics_runtime": 63.6635, |
| "eval_physics_samples_per_second": 7.854, |
| "eval_physics_steps_per_second": 3.927, |
| "step": 1200 |
| }, |
| { |
| "entropy": 2.0797565251588823, |
| "epoch": 0.968, |
| "grad_norm": 0.0091552734375, |
| "learning_rate": 1.9535555555555557e-05, |
| "loss": 2.8871, |
| "mean_token_accuracy": 0.487212173268199, |
| "num_tokens": 20021093.0, |
| "step": 1210 |
| }, |
| { |
| "entropy": 2.0395397424697874, |
| "epoch": 0.976, |
| "grad_norm": 0.01397705078125, |
| "learning_rate": 1.9513333333333335e-05, |
| "loss": 2.8149, |
| "mean_token_accuracy": 0.4898043507710099, |
| "num_tokens": 20190600.0, |
| "step": 1220 |
| }, |
| { |
| "entropy": 2.170710327476263, |
| "epoch": 0.984, |
| "grad_norm": 0.0123291015625, |
| "learning_rate": 1.9491111111111113e-05, |
| "loss": 2.9186, |
| "mean_token_accuracy": 0.4756599869579077, |
| "num_tokens": 20352255.0, |
| "step": 1230 |
| }, |
| { |
| "entropy": 2.074953277409077, |
| "epoch": 0.992, |
| "grad_norm": 0.01092529296875, |
| "learning_rate": 1.946888888888889e-05, |
| "loss": 2.8591, |
| "mean_token_accuracy": 0.48683627638965843, |
| "num_tokens": 20518014.0, |
| "step": 1240 |
| }, |
| { |
| "entropy": 2.1222646929323674, |
| "epoch": 1.0, |
| "grad_norm": 0.00909423828125, |
| "learning_rate": 1.9446666666666668e-05, |
| "loss": 2.8802, |
| "mean_token_accuracy": 0.4825377007946372, |
| "num_tokens": 20686804.0, |
| "step": 1250 |
| }, |
| { |
| "entropy": 2.071950024366379, |
| "epoch": 1.008, |
| "grad_norm": 0.01141357421875, |
| "learning_rate": 1.9424444444444446e-05, |
| "loss": 2.8335, |
| "mean_token_accuracy": 0.4894521238282323, |
| "num_tokens": 20859215.0, |
| "step": 1260 |
| }, |
| { |
| "entropy": 2.178350891917944, |
| "epoch": 1.016, |
| "grad_norm": 0.0086669921875, |
| "learning_rate": 1.9402222222222223e-05, |
| "loss": 2.9515, |
| "mean_token_accuracy": 0.475351152010262, |
| "num_tokens": 21019542.0, |
| "step": 1270 |
| }, |
| { |
| "entropy": 2.0342648893594744, |
| "epoch": 1.024, |
| "grad_norm": 0.01483154296875, |
| "learning_rate": 1.938e-05, |
| "loss": 2.8036, |
| "mean_token_accuracy": 0.4928664604201913, |
| "num_tokens": 21186658.0, |
| "step": 1280 |
| }, |
| { |
| "entropy": 2.0908240154385567, |
| "epoch": 1.032, |
| "grad_norm": 0.00897216796875, |
| "learning_rate": 1.935777777777778e-05, |
| "loss": 2.8868, |
| "mean_token_accuracy": 0.483198188431561, |
| "num_tokens": 21351715.0, |
| "step": 1290 |
| }, |
| { |
| "entropy": 2.1210865639150143, |
| "epoch": 1.04, |
| "grad_norm": 0.008544921875, |
| "learning_rate": 1.9335555555555556e-05, |
| "loss": 2.9071, |
| "mean_token_accuracy": 0.47981371227651837, |
| "num_tokens": 21517237.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.04, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 21517237.0, |
| "eval_coding_runtime": 96.1911, |
| "eval_coding_samples_per_second": 5.198, |
| "eval_coding_steps_per_second": 2.599, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.04, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 21517237.0, |
| "eval_physics_runtime": 63.9611, |
| "eval_physics_samples_per_second": 7.817, |
| "eval_physics_steps_per_second": 3.909, |
| "step": 1300 |
| }, |
| { |
| "entropy": 2.102020622044802, |
| "epoch": 1.048, |
| "grad_norm": 0.007415771484375, |
| "learning_rate": 1.9313333333333334e-05, |
| "loss": 2.8719, |
| "mean_token_accuracy": 0.4847179941833019, |
| "num_tokens": 21682780.0, |
| "step": 1310 |
| }, |
| { |
| "entropy": 2.045586694777012, |
| "epoch": 1.056, |
| "grad_norm": 0.0086669921875, |
| "learning_rate": 1.9291111111111115e-05, |
| "loss": 2.8189, |
| "mean_token_accuracy": 0.48618846032768487, |
| "num_tokens": 21852528.0, |
| "step": 1320 |
| }, |
| { |
| "entropy": 2.0868709199130535, |
| "epoch": 1.064, |
| "grad_norm": 0.01226806640625, |
| "learning_rate": 1.926888888888889e-05, |
| "loss": 2.8313, |
| "mean_token_accuracy": 0.4878941228613257, |
| "num_tokens": 22021514.0, |
| "step": 1330 |
| }, |
| { |
| "entropy": 2.111675312370062, |
| "epoch": 1.072, |
| "grad_norm": 0.00909423828125, |
| "learning_rate": 1.924666666666667e-05, |
| "loss": 2.889, |
| "mean_token_accuracy": 0.4821753781288862, |
| "num_tokens": 22192380.0, |
| "step": 1340 |
| }, |
| { |
| "entropy": 2.080042862892151, |
| "epoch": 1.08, |
| "grad_norm": 0.02294921875, |
| "learning_rate": 1.9224444444444444e-05, |
| "loss": 2.8401, |
| "mean_token_accuracy": 0.4857699448242784, |
| "num_tokens": 22356835.0, |
| "step": 1350 |
| }, |
| { |
| "entropy": 2.138657582551241, |
| "epoch": 1.088, |
| "grad_norm": 0.006927490234375, |
| "learning_rate": 1.9202222222222225e-05, |
| "loss": 2.9223, |
| "mean_token_accuracy": 0.4771748060360551, |
| "num_tokens": 22517971.0, |
| "step": 1360 |
| }, |
| { |
| "entropy": 2.1065418414771555, |
| "epoch": 1.096, |
| "grad_norm": 0.01422119140625, |
| "learning_rate": 1.918e-05, |
| "loss": 2.8768, |
| "mean_token_accuracy": 0.4816119741648436, |
| "num_tokens": 22681680.0, |
| "step": 1370 |
| }, |
| { |
| "entropy": 2.087523814290762, |
| "epoch": 1.104, |
| "grad_norm": 0.020751953125, |
| "learning_rate": 1.915777777777778e-05, |
| "loss": 2.8938, |
| "mean_token_accuracy": 0.4846269652247429, |
| "num_tokens": 22848694.0, |
| "step": 1380 |
| }, |
| { |
| "entropy": 2.1188573092222214, |
| "epoch": 1.112, |
| "grad_norm": 0.0087890625, |
| "learning_rate": 1.9135555555555555e-05, |
| "loss": 2.8744, |
| "mean_token_accuracy": 0.48116844836622474, |
| "num_tokens": 23016014.0, |
| "step": 1390 |
| }, |
| { |
| "entropy": 2.071516238898039, |
| "epoch": 1.12, |
| "grad_norm": 0.01544189453125, |
| "learning_rate": 1.9113333333333336e-05, |
| "loss": 2.8453, |
| "mean_token_accuracy": 0.48592273648828266, |
| "num_tokens": 23183889.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.12, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 23183889.0, |
| "eval_coding_runtime": 96.0999, |
| "eval_coding_samples_per_second": 5.203, |
| "eval_coding_steps_per_second": 2.601, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.12, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 23183889.0, |
| "eval_physics_runtime": 64.1646, |
| "eval_physics_samples_per_second": 7.792, |
| "eval_physics_steps_per_second": 3.896, |
| "step": 1400 |
| }, |
| { |
| "entropy": 2.0450177520513533, |
| "epoch": 1.1280000000000001, |
| "grad_norm": 0.00823974609375, |
| "learning_rate": 1.9091111111111113e-05, |
| "loss": 2.8314, |
| "mean_token_accuracy": 0.4920926999300718, |
| "num_tokens": 23354096.0, |
| "step": 1410 |
| }, |
| { |
| "entropy": 2.0993775948882103, |
| "epoch": 1.1360000000000001, |
| "grad_norm": 0.0146484375, |
| "learning_rate": 1.906888888888889e-05, |
| "loss": 2.8513, |
| "mean_token_accuracy": 0.4838741825893521, |
| "num_tokens": 23523903.0, |
| "step": 1420 |
| }, |
| { |
| "entropy": 2.0676057398319245, |
| "epoch": 1.144, |
| "grad_norm": 0.0135498046875, |
| "learning_rate": 1.904666666666667e-05, |
| "loss": 2.8661, |
| "mean_token_accuracy": 0.4864560257643461, |
| "num_tokens": 23684571.0, |
| "step": 1430 |
| }, |
| { |
| "entropy": 2.0999209016561506, |
| "epoch": 1.152, |
| "grad_norm": 0.00982666015625, |
| "learning_rate": 1.9024444444444446e-05, |
| "loss": 2.8681, |
| "mean_token_accuracy": 0.4822270443663001, |
| "num_tokens": 23848848.0, |
| "step": 1440 |
| }, |
| { |
| "entropy": 2.110722591727972, |
| "epoch": 1.16, |
| "grad_norm": 0.0086669921875, |
| "learning_rate": 1.9002222222222224e-05, |
| "loss": 2.871, |
| "mean_token_accuracy": 0.48459589947015047, |
| "num_tokens": 24011654.0, |
| "step": 1450 |
| }, |
| { |
| "entropy": 2.181078152358532, |
| "epoch": 1.168, |
| "grad_norm": 0.0098876953125, |
| "learning_rate": 1.898e-05, |
| "loss": 2.943, |
| "mean_token_accuracy": 0.47790372706949713, |
| "num_tokens": 24174589.0, |
| "step": 1460 |
| }, |
| { |
| "entropy": 2.134608647972345, |
| "epoch": 1.176, |
| "grad_norm": 0.0244140625, |
| "learning_rate": 1.895777777777778e-05, |
| "loss": 2.8963, |
| "mean_token_accuracy": 0.4795195382088423, |
| "num_tokens": 24336249.0, |
| "step": 1470 |
| }, |
| { |
| "entropy": 2.0590513460338116, |
| "epoch": 1.184, |
| "grad_norm": 0.025146484375, |
| "learning_rate": 1.8935555555555556e-05, |
| "loss": 2.8476, |
| "mean_token_accuracy": 0.4868919890373945, |
| "num_tokens": 24495981.0, |
| "step": 1480 |
| }, |
| { |
| "entropy": 2.094113527983427, |
| "epoch": 1.192, |
| "grad_norm": 0.021728515625, |
| "learning_rate": 1.8913333333333334e-05, |
| "loss": 2.8476, |
| "mean_token_accuracy": 0.488334871083498, |
| "num_tokens": 24658862.0, |
| "step": 1490 |
| }, |
| { |
| "entropy": 2.066910207271576, |
| "epoch": 1.2, |
| "grad_norm": 0.01214599609375, |
| "learning_rate": 1.8891111111111115e-05, |
| "loss": 2.8177, |
| "mean_token_accuracy": 0.4844546576961875, |
| "num_tokens": 24826245.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.2, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 24826245.0, |
| "eval_coding_runtime": 95.975, |
| "eval_coding_samples_per_second": 5.21, |
| "eval_coding_steps_per_second": 2.605, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.2, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 24826245.0, |
| "eval_physics_runtime": 63.6317, |
| "eval_physics_samples_per_second": 7.858, |
| "eval_physics_steps_per_second": 3.929, |
| "step": 1500 |
| }, |
| { |
| "entropy": 2.1044005908071997, |
| "epoch": 1.208, |
| "grad_norm": 0.01177978515625, |
| "learning_rate": 1.886888888888889e-05, |
| "loss": 2.8829, |
| "mean_token_accuracy": 0.48166574332863094, |
| "num_tokens": 24988821.0, |
| "step": 1510 |
| }, |
| { |
| "entropy": 2.1516220718622208, |
| "epoch": 1.216, |
| "grad_norm": 0.012451171875, |
| "learning_rate": 1.884666666666667e-05, |
| "loss": 2.9433, |
| "mean_token_accuracy": 0.47657596562057736, |
| "num_tokens": 25147775.0, |
| "step": 1520 |
| }, |
| { |
| "entropy": 2.1071897082030775, |
| "epoch": 1.224, |
| "grad_norm": 0.00860595703125, |
| "learning_rate": 1.8824444444444445e-05, |
| "loss": 2.8592, |
| "mean_token_accuracy": 0.479968504793942, |
| "num_tokens": 25312652.0, |
| "step": 1530 |
| }, |
| { |
| "entropy": 2.031100772321224, |
| "epoch": 1.232, |
| "grad_norm": 0.006103515625, |
| "learning_rate": 1.8802222222222226e-05, |
| "loss": 2.8016, |
| "mean_token_accuracy": 0.4919817751273513, |
| "num_tokens": 25480033.0, |
| "step": 1540 |
| }, |
| { |
| "entropy": 2.0882932603359223, |
| "epoch": 1.24, |
| "grad_norm": 0.017822265625, |
| "learning_rate": 1.878e-05, |
| "loss": 2.87, |
| "mean_token_accuracy": 0.4867149593308568, |
| "num_tokens": 25643821.0, |
| "step": 1550 |
| }, |
| { |
| "entropy": 2.1069738253951074, |
| "epoch": 1.248, |
| "grad_norm": 0.01275634765625, |
| "learning_rate": 1.875777777777778e-05, |
| "loss": 2.9072, |
| "mean_token_accuracy": 0.48037715051323177, |
| "num_tokens": 25811978.0, |
| "step": 1560 |
| }, |
| { |
| "entropy": 2.1235784739255905, |
| "epoch": 1.256, |
| "grad_norm": 0.0098876953125, |
| "learning_rate": 1.873555555555556e-05, |
| "loss": 2.8361, |
| "mean_token_accuracy": 0.4876450767740607, |
| "num_tokens": 25980217.0, |
| "step": 1570 |
| }, |
| { |
| "entropy": 2.07255465015769, |
| "epoch": 1.264, |
| "grad_norm": 0.0205078125, |
| "learning_rate": 1.8713333333333336e-05, |
| "loss": 2.8226, |
| "mean_token_accuracy": 0.48956795781850815, |
| "num_tokens": 26146299.0, |
| "step": 1580 |
| }, |
| { |
| "entropy": 2.083909762650728, |
| "epoch": 1.272, |
| "grad_norm": 0.006805419921875, |
| "learning_rate": 1.8691111111111114e-05, |
| "loss": 2.8296, |
| "mean_token_accuracy": 0.48632726091891526, |
| "num_tokens": 26315594.0, |
| "step": 1590 |
| }, |
| { |
| "entropy": 2.1811861269176007, |
| "epoch": 1.28, |
| "grad_norm": 0.017333984375, |
| "learning_rate": 1.866888888888889e-05, |
| "loss": 2.9711, |
| "mean_token_accuracy": 0.47178333820775153, |
| "num_tokens": 26478207.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.28, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 26478207.0, |
| "eval_coding_runtime": 95.7162, |
| "eval_coding_samples_per_second": 5.224, |
| "eval_coding_steps_per_second": 2.612, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.28, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 26478207.0, |
| "eval_physics_runtime": 63.6631, |
| "eval_physics_samples_per_second": 7.854, |
| "eval_physics_steps_per_second": 3.927, |
| "step": 1600 |
| }, |
| { |
| "entropy": 2.018047058582306, |
| "epoch": 1.288, |
| "grad_norm": 0.015869140625, |
| "learning_rate": 1.864666666666667e-05, |
| "loss": 2.7963, |
| "mean_token_accuracy": 0.49348701871931555, |
| "num_tokens": 26647567.0, |
| "step": 1610 |
| }, |
| { |
| "entropy": 2.1041888780891895, |
| "epoch": 1.296, |
| "grad_norm": 0.0081787109375, |
| "learning_rate": 1.8624444444444446e-05, |
| "loss": 2.8794, |
| "mean_token_accuracy": 0.4833239272236824, |
| "num_tokens": 26806477.0, |
| "step": 1620 |
| }, |
| { |
| "entropy": 2.0873175472021104, |
| "epoch": 1.304, |
| "grad_norm": 0.0169677734375, |
| "learning_rate": 1.8602222222222224e-05, |
| "loss": 2.8534, |
| "mean_token_accuracy": 0.4844379436224699, |
| "num_tokens": 26973124.0, |
| "step": 1630 |
| }, |
| { |
| "entropy": 2.184805364906788, |
| "epoch": 1.312, |
| "grad_norm": 0.0081787109375, |
| "learning_rate": 1.858e-05, |
| "loss": 2.9204, |
| "mean_token_accuracy": 0.4806702647358179, |
| "num_tokens": 27137528.0, |
| "step": 1640 |
| }, |
| { |
| "entropy": 2.1004837110638617, |
| "epoch": 1.32, |
| "grad_norm": 0.0189208984375, |
| "learning_rate": 1.855777777777778e-05, |
| "loss": 2.839, |
| "mean_token_accuracy": 0.4855312593281269, |
| "num_tokens": 27300986.0, |
| "step": 1650 |
| }, |
| { |
| "entropy": 2.1488297916948795, |
| "epoch": 1.328, |
| "grad_norm": 0.00677490234375, |
| "learning_rate": 1.8535555555555557e-05, |
| "loss": 2.876, |
| "mean_token_accuracy": 0.48554224893450737, |
| "num_tokens": 27465859.0, |
| "step": 1660 |
| }, |
| { |
| "entropy": 2.11965301707387, |
| "epoch": 1.336, |
| "grad_norm": 0.0089111328125, |
| "learning_rate": 1.8513333333333335e-05, |
| "loss": 2.909, |
| "mean_token_accuracy": 0.4828299328684807, |
| "num_tokens": 27627148.0, |
| "step": 1670 |
| }, |
| { |
| "entropy": 2.139040616154671, |
| "epoch": 1.3439999999999999, |
| "grad_norm": 0.01214599609375, |
| "learning_rate": 1.8491111111111112e-05, |
| "loss": 2.9133, |
| "mean_token_accuracy": 0.4807636830955744, |
| "num_tokens": 27795568.0, |
| "step": 1680 |
| }, |
| { |
| "entropy": 2.1508292332291603, |
| "epoch": 1.3519999999999999, |
| "grad_norm": 0.00848388671875, |
| "learning_rate": 1.846888888888889e-05, |
| "loss": 2.9009, |
| "mean_token_accuracy": 0.48149400427937505, |
| "num_tokens": 27958162.0, |
| "step": 1690 |
| }, |
| { |
| "entropy": 2.0931077137589456, |
| "epoch": 1.3599999999999999, |
| "grad_norm": 0.0201416015625, |
| "learning_rate": 1.8446666666666667e-05, |
| "loss": 2.8726, |
| "mean_token_accuracy": 0.4807087674736977, |
| "num_tokens": 28129988.0, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.3599999999999999, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 28129988.0, |
| "eval_coding_runtime": 96.0975, |
| "eval_coding_samples_per_second": 5.203, |
| "eval_coding_steps_per_second": 2.602, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.3599999999999999, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 28129988.0, |
| "eval_physics_runtime": 63.7443, |
| "eval_physics_samples_per_second": 7.844, |
| "eval_physics_steps_per_second": 3.922, |
| "step": 1700 |
| }, |
| { |
| "entropy": 2.0795333731919525, |
| "epoch": 1.3679999999999999, |
| "grad_norm": 0.0096435546875, |
| "learning_rate": 1.842444444444445e-05, |
| "loss": 2.8522, |
| "mean_token_accuracy": 0.4887755334377289, |
| "num_tokens": 28299534.0, |
| "step": 1710 |
| }, |
| { |
| "entropy": 2.141296311467886, |
| "epoch": 1.376, |
| "grad_norm": 0.0157470703125, |
| "learning_rate": 1.8402222222222223e-05, |
| "loss": 2.9271, |
| "mean_token_accuracy": 0.4771733907982707, |
| "num_tokens": 28465903.0, |
| "step": 1720 |
| }, |
| { |
| "entropy": 2.0781943283975126, |
| "epoch": 1.384, |
| "grad_norm": 0.01055908203125, |
| "learning_rate": 1.8380000000000004e-05, |
| "loss": 2.8574, |
| "mean_token_accuracy": 0.4866001008078456, |
| "num_tokens": 28639289.0, |
| "step": 1730 |
| }, |
| { |
| "entropy": 2.0997987784445287, |
| "epoch": 1.392, |
| "grad_norm": 0.02099609375, |
| "learning_rate": 1.8357777777777778e-05, |
| "loss": 2.8585, |
| "mean_token_accuracy": 0.48320644851773975, |
| "num_tokens": 28801554.0, |
| "step": 1740 |
| }, |
| { |
| "entropy": 2.150196324288845, |
| "epoch": 1.4, |
| "grad_norm": 0.007354736328125, |
| "learning_rate": 1.833555555555556e-05, |
| "loss": 2.9002, |
| "mean_token_accuracy": 0.48246240708976984, |
| "num_tokens": 28963885.0, |
| "step": 1750 |
| }, |
| { |
| "entropy": 2.084288237988949, |
| "epoch": 1.408, |
| "grad_norm": 0.01806640625, |
| "learning_rate": 1.8313333333333333e-05, |
| "loss": 2.8911, |
| "mean_token_accuracy": 0.4849772537127137, |
| "num_tokens": 29125114.0, |
| "step": 1760 |
| }, |
| { |
| "entropy": 2.1347289800643923, |
| "epoch": 1.416, |
| "grad_norm": 0.0120849609375, |
| "learning_rate": 1.8291111111111114e-05, |
| "loss": 2.9129, |
| "mean_token_accuracy": 0.477182531170547, |
| "num_tokens": 29285272.0, |
| "step": 1770 |
| }, |
| { |
| "entropy": 2.1285498954355715, |
| "epoch": 1.424, |
| "grad_norm": 0.018310546875, |
| "learning_rate": 1.8268888888888888e-05, |
| "loss": 2.9215, |
| "mean_token_accuracy": 0.4772155126556754, |
| "num_tokens": 29454334.0, |
| "step": 1780 |
| }, |
| { |
| "entropy": 2.0727533496916295, |
| "epoch": 1.432, |
| "grad_norm": 0.01336669921875, |
| "learning_rate": 1.824666666666667e-05, |
| "loss": 2.8803, |
| "mean_token_accuracy": 0.4822785761207342, |
| "num_tokens": 29616865.0, |
| "step": 1790 |
| }, |
| { |
| "entropy": 2.0606922268867494, |
| "epoch": 1.44, |
| "grad_norm": 0.00860595703125, |
| "learning_rate": 1.8224444444444447e-05, |
| "loss": 2.8232, |
| "mean_token_accuracy": 0.48800144270062445, |
| "num_tokens": 29789253.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.44, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 29789253.0, |
| "eval_coding_runtime": 96.3208, |
| "eval_coding_samples_per_second": 5.191, |
| "eval_coding_steps_per_second": 2.595, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.44, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 29789253.0, |
| "eval_physics_runtime": 63.9922, |
| "eval_physics_samples_per_second": 7.813, |
| "eval_physics_steps_per_second": 3.907, |
| "step": 1800 |
| }, |
| { |
| "entropy": 2.0618888407945635, |
| "epoch": 1.448, |
| "grad_norm": 0.007171630859375, |
| "learning_rate": 1.8202222222222225e-05, |
| "loss": 2.8573, |
| "mean_token_accuracy": 0.4843838458880782, |
| "num_tokens": 29953006.0, |
| "step": 1810 |
| }, |
| { |
| "entropy": 2.022880507260561, |
| "epoch": 1.456, |
| "grad_norm": 0.0194091796875, |
| "learning_rate": 1.8180000000000002e-05, |
| "loss": 2.8031, |
| "mean_token_accuracy": 0.49401772283017636, |
| "num_tokens": 30128576.0, |
| "step": 1820 |
| }, |
| { |
| "entropy": 2.0860828548669814, |
| "epoch": 1.464, |
| "grad_norm": 0.0294189453125, |
| "learning_rate": 1.815777777777778e-05, |
| "loss": 2.8756, |
| "mean_token_accuracy": 0.4849246619269252, |
| "num_tokens": 30288894.0, |
| "step": 1830 |
| }, |
| { |
| "entropy": 2.188418509066105, |
| "epoch": 1.472, |
| "grad_norm": 0.015869140625, |
| "learning_rate": 1.8135555555555557e-05, |
| "loss": 2.9527, |
| "mean_token_accuracy": 0.4721253639087081, |
| "num_tokens": 30452719.0, |
| "step": 1840 |
| }, |
| { |
| "entropy": 2.085013201087713, |
| "epoch": 1.48, |
| "grad_norm": 0.00946044921875, |
| "learning_rate": 1.8113333333333335e-05, |
| "loss": 2.8646, |
| "mean_token_accuracy": 0.486043793335557, |
| "num_tokens": 30620659.0, |
| "step": 1850 |
| }, |
| { |
| "entropy": 2.143638453632593, |
| "epoch": 1.488, |
| "grad_norm": 0.0108642578125, |
| "learning_rate": 1.8091111111111113e-05, |
| "loss": 2.9264, |
| "mean_token_accuracy": 0.47975865937769413, |
| "num_tokens": 30782343.0, |
| "step": 1860 |
| }, |
| { |
| "entropy": 2.110791251808405, |
| "epoch": 1.496, |
| "grad_norm": 0.00689697265625, |
| "learning_rate": 1.806888888888889e-05, |
| "loss": 2.8831, |
| "mean_token_accuracy": 0.4842625314369798, |
| "num_tokens": 30947703.0, |
| "step": 1870 |
| }, |
| { |
| "entropy": 2.1235111109912395, |
| "epoch": 1.504, |
| "grad_norm": 0.0181884765625, |
| "learning_rate": 1.8046666666666668e-05, |
| "loss": 2.8413, |
| "mean_token_accuracy": 0.4860221529379487, |
| "num_tokens": 31113238.0, |
| "step": 1880 |
| }, |
| { |
| "entropy": 2.0822810381650925, |
| "epoch": 1.512, |
| "grad_norm": 0.0125732421875, |
| "learning_rate": 1.8024444444444445e-05, |
| "loss": 2.864, |
| "mean_token_accuracy": 0.48232704903930423, |
| "num_tokens": 31283883.0, |
| "step": 1890 |
| }, |
| { |
| "entropy": 2.1699491038918497, |
| "epoch": 1.52, |
| "grad_norm": 0.0101318359375, |
| "learning_rate": 1.8002222222222223e-05, |
| "loss": 2.9505, |
| "mean_token_accuracy": 0.4706884307786822, |
| "num_tokens": 31444589.0, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.52, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 31444589.0, |
| "eval_coding_runtime": 95.9376, |
| "eval_coding_samples_per_second": 5.212, |
| "eval_coding_steps_per_second": 2.606, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.52, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 31444589.0, |
| "eval_physics_runtime": 63.8135, |
| "eval_physics_samples_per_second": 7.835, |
| "eval_physics_steps_per_second": 3.918, |
| "step": 1900 |
| }, |
| { |
| "entropy": 2.1199583016335963, |
| "epoch": 1.528, |
| "grad_norm": 0.007598876953125, |
| "learning_rate": 1.798e-05, |
| "loss": 2.8928, |
| "mean_token_accuracy": 0.4802051903679967, |
| "num_tokens": 31608952.0, |
| "step": 1910 |
| }, |
| { |
| "entropy": 2.175958313792944, |
| "epoch": 1.536, |
| "grad_norm": 0.0089111328125, |
| "learning_rate": 1.7957777777777778e-05, |
| "loss": 2.9365, |
| "mean_token_accuracy": 0.4747030269354582, |
| "num_tokens": 31774137.0, |
| "step": 1920 |
| }, |
| { |
| "entropy": 2.086700049042702, |
| "epoch": 1.544, |
| "grad_norm": 0.0113525390625, |
| "learning_rate": 1.7935555555555556e-05, |
| "loss": 2.8777, |
| "mean_token_accuracy": 0.4859625767916441, |
| "num_tokens": 31937927.0, |
| "step": 1930 |
| }, |
| { |
| "entropy": 2.1010241121053697, |
| "epoch": 1.552, |
| "grad_norm": 0.0068359375, |
| "learning_rate": 1.7913333333333337e-05, |
| "loss": 2.8916, |
| "mean_token_accuracy": 0.4840227223932743, |
| "num_tokens": 32101776.0, |
| "step": 1940 |
| }, |
| { |
| "entropy": 2.1249936796724795, |
| "epoch": 1.56, |
| "grad_norm": 0.01446533203125, |
| "learning_rate": 1.789111111111111e-05, |
| "loss": 2.8819, |
| "mean_token_accuracy": 0.48242205642163755, |
| "num_tokens": 32271213.0, |
| "step": 1950 |
| }, |
| { |
| "entropy": 2.040369462966919, |
| "epoch": 1.568, |
| "grad_norm": 0.0240478515625, |
| "learning_rate": 1.7868888888888892e-05, |
| "loss": 2.8209, |
| "mean_token_accuracy": 0.48918427657335994, |
| "num_tokens": 32428021.0, |
| "step": 1960 |
| }, |
| { |
| "entropy": 2.2181196875870226, |
| "epoch": 1.576, |
| "grad_norm": 0.0108642578125, |
| "learning_rate": 1.7846666666666666e-05, |
| "loss": 2.9794, |
| "mean_token_accuracy": 0.4697700636461377, |
| "num_tokens": 32585407.0, |
| "step": 1970 |
| }, |
| { |
| "entropy": 2.096325629204512, |
| "epoch": 1.584, |
| "grad_norm": 0.0196533203125, |
| "learning_rate": 1.7824444444444447e-05, |
| "loss": 2.8507, |
| "mean_token_accuracy": 0.4877478454262018, |
| "num_tokens": 32752735.0, |
| "step": 1980 |
| }, |
| { |
| "entropy": 2.1082000881433487, |
| "epoch": 1.592, |
| "grad_norm": 0.0079345703125, |
| "learning_rate": 1.780222222222222e-05, |
| "loss": 2.8789, |
| "mean_token_accuracy": 0.482938090339303, |
| "num_tokens": 32912519.0, |
| "step": 1990 |
| }, |
| { |
| "entropy": 2.139867788553238, |
| "epoch": 1.6, |
| "grad_norm": 0.01007080078125, |
| "learning_rate": 1.7780000000000003e-05, |
| "loss": 2.9136, |
| "mean_token_accuracy": 0.4764457972720265, |
| "num_tokens": 33075822.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.6, |
| "eval_coding_entropy": 1.266241003036499, |
| "eval_coding_loss": 1.2625732421875, |
| "eval_coding_mean_token_accuracy": 0.6838934738636017, |
| "eval_coding_num_tokens": 33075822.0, |
| "eval_coding_runtime": 96.1948, |
| "eval_coding_samples_per_second": 5.198, |
| "eval_coding_steps_per_second": 2.599, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.6, |
| "eval_physics_entropy": 2.116599133968353, |
| "eval_physics_loss": 2.9798319339752197, |
| "eval_physics_mean_token_accuracy": 0.4795549786090851, |
| "eval_physics_num_tokens": 33075822.0, |
| "eval_physics_runtime": 63.6877, |
| "eval_physics_samples_per_second": 7.851, |
| "eval_physics_steps_per_second": 3.925, |
| "step": 2000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 10000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 8, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.6802547029475983e+18, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|