{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 554.109375, "epoch": 0.0008, "grad_norm": 1.106962696508615, "kl": 0.0, "learning_rate": 4.999999122701883e-06, "loss": 0.0, "reward": 1.375, "reward_std": 0.4996672570705414, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.875, "step": 1 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 423.8125, "epoch": 0.0016, "grad_norm": 1.314941487556923, "kl": 0.0002727508544921875, "learning_rate": 4.999996490808146e-06, "loss": 0.0, "reward": 1.53125, "reward_std": 0.5366887450218201, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.859375, "step": 2 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 472.328125, "epoch": 0.0024, "grad_norm": 1.2349312957323104, "kl": 0.00128936767578125, "learning_rate": 4.9999921043206356e-06, "loss": 0.0001, "reward": 1.46875, "reward_std": 0.4995177984237671, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.890625, "step": 3 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 504.171875, "epoch": 0.0032, "grad_norm": 0.6879182927151376, "kl": 0.00153350830078125, "learning_rate": 4.999985963242432e-06, "loss": 0.0001, "reward": 1.75, "reward_std": 0.20266614854335785, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.984375, "step": 4 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 415.375, "epoch": 0.004, "grad_norm": 1.0634938924971327, "kl": 0.006317138671875, "learning_rate": 4.999978067577844e-06, "loss": 0.0003, "reward": 1.59375, "reward_std": 0.26409149169921875, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.984375, "step": 5 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 421.15625, "epoch": 0.0048, "grad_norm": 1.0266108197177204, "kl": 0.009521484375, "learning_rate": 4.999968417332415e-06, "loss": 0.0004, "reward": 1.625, "reward_std": 0.31300368905067444, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.984375, "step": 6 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 428.140625, "epoch": 0.0056, "grad_norm": 4.588923240071208, "kl": 0.0113525390625, "learning_rate": 4.999957012512916e-06, "loss": 0.0005, "reward": 1.71875, "reward_std": 0.2845909595489502, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 7 }, { "all_correct": 0.0, "all_wrong": 0.125, "completion_length": 400.9375, "epoch": 0.0064, "grad_norm": 1.2374348718708577, "kl": 0.014892578125, "learning_rate": 4.999943853127351e-06, "loss": 0.0006, "reward": 1.3125, "reward_std": 0.34352827072143555, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 1.0, "step": 8 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 385.46875, "epoch": 0.0072, "grad_norm": 1.5154801241797453, "kl": 0.020263671875, "learning_rate": 4.999928939184958e-06, "loss": 0.0008, "reward": 1.703125, "reward_std": 0.3492845892906189, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 9 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 467.640625, "epoch": 0.008, "grad_norm": 2.8984064250207613, "kl": 0.0242919921875, "learning_rate": 4.999912270696202e-06, "loss": 0.001, "reward": 1.546875, "reward_std": 0.2993341088294983, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.96875, "step": 10 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 457.203125, "epoch": 0.0088, "grad_norm": 0.997494152734877, "kl": 0.027099609375, "learning_rate": 4.999893847672783e-06, "loss": 0.0011, "reward": 1.765625, "reward_std": 0.17782479524612427, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 11 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 459.328125, "epoch": 0.0096, "grad_norm": 0.8699166450034578, "kl": 0.0281982421875, "learning_rate": 4.99987367012763e-06, "loss": 0.0011, "reward": 1.65625, "reward_std": 0.19727616012096405, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 12 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 425.515625, "epoch": 0.0104, "grad_norm": 1.312698419751893, "kl": 0.034423828125, "learning_rate": 4.999851738074904e-06, "loss": 0.0014, "reward": 1.5, "reward_std": 0.30509042739868164, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.984375, "step": 13 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 421.1875, "epoch": 0.0112, "grad_norm": 1.0983638696965168, "kl": 0.038330078125, "learning_rate": 4.9998280515300006e-06, "loss": 0.0015, "reward": 1.625, "reward_std": 0.26409146189689636, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 14 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 388.53125, "epoch": 0.012, "grad_norm": 1.8508794118515073, "kl": 0.046630859375, "learning_rate": 4.999802610509541e-06, "loss": 0.0019, "reward": 1.6875, "reward_std": 0.2845909595489502, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 15 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 457.9375, "epoch": 0.0128, "grad_norm": 0.8599607884439239, "kl": 0.046875, "learning_rate": 4.999775415031381e-06, "loss": 0.0019, "reward": 1.921875, "reward_std": 0.19044628739356995, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 16 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 420.953125, "epoch": 0.0136, "grad_norm": 1.2918338302283918, "kl": 0.055419921875, "learning_rate": 4.999746465114609e-06, "loss": 0.0022, "reward": 1.59375, "reward_std": 0.3335031569004059, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 17 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 442.765625, "epoch": 0.0144, "grad_norm": 0.9977107372357084, "kl": 0.0546875, "learning_rate": 4.999715760779541e-06, "loss": 0.0022, "reward": 1.578125, "reward_std": 0.2688094973564148, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 18 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 438.546875, "epoch": 0.0152, "grad_norm": 0.8113982563207017, "kl": 0.05908203125, "learning_rate": 4.999683302047729e-06, "loss": 0.0024, "reward": 1.46875, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 19 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 446.578125, "epoch": 0.016, "grad_norm": 1.2752550143906236, "kl": 0.05419921875, "learning_rate": 4.999649088941951e-06, "loss": 0.0022, "reward": 1.5625, "reward_std": 0.22201895713806152, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.984375, "step": 20 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 433.015625, "epoch": 0.0168, "grad_norm": 1.6448308314454292, "kl": 0.0625, "learning_rate": 4.999613121486222e-06, "loss": 0.0025, "reward": 1.65625, "reward_std": 0.24359199404716492, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 21 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 486.875, "epoch": 0.0176, "grad_norm": 1.5079682066474072, "kl": 0.060546875, "learning_rate": 4.999575399705782e-06, "loss": 0.0024, "reward": 1.609375, "reward_std": 0.19939783215522766, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 22 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 421.21875, "epoch": 0.0184, "grad_norm": 1.1243397903925967, "kl": 0.06201171875, "learning_rate": 4.9995359236271094e-06, "loss": 0.0025, "reward": 1.53125, "reward_std": 0.3029785752296448, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 23 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 484.265625, "epoch": 0.0192, "grad_norm": 0.9576568461832615, "kl": 0.0654296875, "learning_rate": 4.9994946932779076e-06, "loss": 0.0026, "reward": 1.53125, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 24 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 488.375, "epoch": 0.02, "grad_norm": 0.9539507807360903, "kl": 0.05859375, "learning_rate": 4.999451708687114e-06, "loss": 0.0023, "reward": 1.6875, "reward_std": 0.26409146189689636, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.984375, "step": 25 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 478.6875, "epoch": 0.0208, "grad_norm": 1.584300461540275, "kl": 0.064453125, "learning_rate": 4.999406969884897e-06, "loss": 0.0026, "reward": 1.265625, "reward_std": 0.2993341088294983, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 1.0, "step": 26 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 466.859375, "epoch": 0.0216, "grad_norm": 1.0007005937653206, "kl": 0.06884765625, "learning_rate": 4.999360476902656e-06, "loss": 0.0027, "reward": 1.5625, "reward_std": 0.2708277702331543, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.984375, "step": 27 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 432.71875, "epoch": 0.0224, "grad_norm": 1.2573243271025918, "kl": 0.07421875, "learning_rate": 4.999312229773022e-06, "loss": 0.003, "reward": 1.578125, "reward_std": 0.3266732692718506, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 28 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 508.546875, "epoch": 0.0232, "grad_norm": 0.9794499459288435, "kl": 0.0751953125, "learning_rate": 4.999262228529855e-06, "loss": 0.003, "reward": 1.703125, "reward_std": 0.40822190046310425, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.96875, "step": 29 }, { "all_correct": 0.0, "all_wrong": 0.125, "completion_length": 455.328125, "epoch": 0.024, "grad_norm": 1.169183523278842, "kl": 0.08056640625, "learning_rate": 4.99921047320825e-06, "loss": 0.0032, "reward": 1.453125, "reward_std": 0.5022729635238647, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.96875, "step": 30 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 494.53125, "epoch": 0.0248, "grad_norm": 1.785017154453716, "kl": 0.0771484375, "learning_rate": 4.99915696384453e-06, "loss": 0.0031, "reward": 1.640625, "reward_std": 0.4545377492904663, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.953125, "step": 31 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 491.71875, "epoch": 0.0256, "grad_norm": 0.9898909426994473, "kl": 0.072265625, "learning_rate": 4.99910170047625e-06, "loss": 0.0029, "reward": 1.75, "reward_std": 0.26409149169921875, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.984375, "step": 32 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 472.171875, "epoch": 0.0264, "grad_norm": 0.8838784425314528, "kl": 0.08203125, "learning_rate": 4.999044683142196e-06, "loss": 0.0033, "reward": 1.734375, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 33 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 472.25, "epoch": 0.0272, "grad_norm": 1.2255748101481239, "kl": 0.0849609375, "learning_rate": 4.998985911882383e-06, "loss": 0.0034, "reward": 1.453125, "reward_std": 0.3403330445289612, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 1.0, "step": 34 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 457.578125, "epoch": 0.028, "grad_norm": 0.9181810045745323, "kl": 0.087890625, "learning_rate": 4.998925386738063e-06, "loss": 0.0035, "reward": 1.765625, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 35 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 493.90625, "epoch": 0.0288, "grad_norm": 0.9492186816581748, "kl": 0.08447265625, "learning_rate": 4.998863107751711e-06, "loss": 0.0034, "reward": 1.359375, "reward_std": 0.25726157426834106, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.984375, "step": 36 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 433.828125, "epoch": 0.0296, "grad_norm": 0.7312117876501095, "kl": 0.0869140625, "learning_rate": 4.99879907496704e-06, "loss": 0.0035, "reward": 1.734375, "reward_std": 0.1315089464187622, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 37 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 511.15625, "epoch": 0.0304, "grad_norm": 3.226160403793959, "kl": 0.09423828125, "learning_rate": 4.998733288428987e-06, "loss": 0.0038, "reward": 1.5, "reward_std": 0.5134496092796326, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.953125, "step": 38 }, { "all_correct": 0.0, "all_wrong": 0.375, "completion_length": 427.625, "epoch": 0.0312, "grad_norm": 1.4155213994713876, "kl": 0.0927734375, "learning_rate": 4.998665748183727e-06, "loss": 0.0037, "reward": 1.28125, "reward_std": 0.25513991713523865, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 1.0, "step": 39 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 440.984375, "epoch": 0.032, "grad_norm": 1.2465987661295874, "kl": 0.10302734375, "learning_rate": 4.998596454278661e-06, "loss": 0.0041, "reward": 1.421875, "reward_std": 0.30617380142211914, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.984375, "step": 40 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 519.984375, "epoch": 0.0328, "grad_norm": 0.9485069789727323, "kl": 0.08837890625, "learning_rate": 4.998525406762422e-06, "loss": 0.0035, "reward": 1.84375, "reward_std": 0.26409146189689636, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.984375, "step": 41 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 473.8125, "epoch": 0.0336, "grad_norm": 1.1552791242133729, "kl": 0.10107421875, "learning_rate": 4.998452605684874e-06, "loss": 0.004, "reward": 1.765625, "reward_std": 0.3492845892906189, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 42 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 432.265625, "epoch": 0.0344, "grad_norm": 0.8775955518065937, "kl": 0.1005859375, "learning_rate": 4.998378051097111e-06, "loss": 0.004, "reward": 1.765625, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 43 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 509.265625, "epoch": 0.0352, "grad_norm": 1.9400186881457824, "kl": 0.099609375, "learning_rate": 4.998301743051459e-06, "loss": 0.004, "reward": 1.71875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 44 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 459.78125, "epoch": 0.036, "grad_norm": 1.2807091748853519, "kl": 0.099609375, "learning_rate": 4.9982236816014735e-06, "loss": 0.004, "reward": 1.609375, "reward_std": 0.38380008935928345, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.984375, "step": 45 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 538.15625, "epoch": 0.0368, "grad_norm": 3.0208592385584594, "kl": 0.0947265625, "learning_rate": 4.998143866801941e-06, "loss": 0.0038, "reward": 1.359375, "reward_std": 0.31983357667922974, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.984375, "step": 46 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 498.3125, "epoch": 0.0376, "grad_norm": 0.9409790694547606, "kl": 0.10693359375, "learning_rate": 4.99806229870888e-06, "loss": 0.0043, "reward": 1.5, "reward_std": 0.26409149169921875, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.984375, "step": 47 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 424.578125, "epoch": 0.0384, "grad_norm": 1.361137450227213, "kl": 0.10595703125, "learning_rate": 4.9979789773795365e-06, "loss": 0.0042, "reward": 1.59375, "reward_std": 0.19149437546730042, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 48 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 471.046875, "epoch": 0.0392, "grad_norm": 1.3917125320283434, "kl": 0.1083984375, "learning_rate": 4.997893902872389e-06, "loss": 0.0043, "reward": 1.59375, "reward_std": 0.32195523381233215, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.984375, "step": 49 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 489.34375, "epoch": 0.04, "grad_norm": 0.6143978746860285, "kl": 0.09765625, "learning_rate": 4.997807075247147e-06, "loss": 0.0039, "reward": 1.921875, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 50 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 496.46875, "epoch": 0.0408, "grad_norm": 1.6577824499312852, "kl": 0.10205078125, "learning_rate": 4.997718494564747e-06, "loss": 0.0041, "reward": 1.546875, "reward_std": 0.3266732692718506, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 51 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 509.3125, "epoch": 0.0416, "grad_norm": 1.398115575311656, "kl": 0.09375, "learning_rate": 4.997628160887361e-06, "loss": 0.0038, "reward": 1.65625, "reward_std": 0.31300365924835205, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.984375, "step": 52 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 475.9375, "epoch": 0.0424, "grad_norm": 1.289505457669401, "kl": 0.099609375, "learning_rate": 4.997536074278388e-06, "loss": 0.004, "reward": 1.78125, "reward_std": 0.3377464711666107, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.984375, "step": 53 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 477.140625, "epoch": 0.0432, "grad_norm": 1.0274251674462114, "kl": 0.0927734375, "learning_rate": 4.9974422348024565e-06, "loss": 0.0037, "reward": 1.59375, "reward_std": 0.30509042739868164, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 54 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 457.828125, "epoch": 0.044, "grad_norm": 1.2328865824867006, "kl": 0.09521484375, "learning_rate": 4.997346642525429e-06, "loss": 0.0038, "reward": 1.6875, "reward_std": 0.33090677857398987, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 55 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 393.84375, "epoch": 0.0448, "grad_norm": 2.649900512145904, "kl": 0.1171875, "learning_rate": 4.9972492975143936e-06, "loss": 0.0047, "reward": 1.609375, "reward_std": 0.34717273712158203, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 56 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 435.984375, "epoch": 0.0456, "grad_norm": 1.5429448769370444, "kl": 0.10107421875, "learning_rate": 4.997150199837671e-06, "loss": 0.004, "reward": 1.640625, "reward_std": 0.2688094973564148, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 57 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 476.75, "epoch": 0.0464, "grad_norm": 0.7750728694989101, "kl": 0.10009765625, "learning_rate": 4.997049349564814e-06, "loss": 0.004, "reward": 1.453125, "reward_std": 0.1893727034330368, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.96875, "step": 58 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 416.84375, "epoch": 0.0472, "grad_norm": 1.1808569185472115, "kl": 0.10302734375, "learning_rate": 4.996946746766602e-06, "loss": 0.0041, "reward": 1.6875, "reward_std": 0.3061639666557312, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 59 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 436.359375, "epoch": 0.048, "grad_norm": 0.87560419549391, "kl": 0.10107421875, "learning_rate": 4.996842391515045e-06, "loss": 0.004, "reward": 1.703125, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.984375, "step": 60 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 444.109375, "epoch": 0.0488, "grad_norm": 0.9983118033206102, "kl": 0.10595703125, "learning_rate": 4.996736283883382e-06, "loss": 0.0042, "reward": 1.46875, "reward_std": 0.28247910737991333, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 61 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 434.140625, "epoch": 0.0496, "grad_norm": 1.2522486907935333, "kl": 0.1044921875, "learning_rate": 4.9966284239460875e-06, "loss": 0.0042, "reward": 1.6875, "reward_std": 0.3697938621044159, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 62 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 456.828125, "epoch": 0.0504, "grad_norm": 1.9128830955616545, "kl": 0.0966796875, "learning_rate": 4.996518811778858e-06, "loss": 0.0039, "reward": 1.59375, "reward_std": 0.3104073107242584, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.984375, "step": 63 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 442.734375, "epoch": 0.0512, "grad_norm": 1.5421042944955659, "kl": 0.10205078125, "learning_rate": 4.996407447458626e-06, "loss": 0.0041, "reward": 1.734375, "reward_std": 0.3845370411872864, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 64 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 413.796875, "epoch": 0.052, "grad_norm": 1.0558375983386774, "kl": 0.10302734375, "learning_rate": 4.99629433106355e-06, "loss": 0.0041, "reward": 1.59375, "reward_std": 0.2404065728187561, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 65 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 518.75, "epoch": 0.0528, "grad_norm": 0.6372842511275832, "kl": 0.08837890625, "learning_rate": 4.99617946267302e-06, "loss": 0.0035, "reward": 1.640625, "reward_std": 0.22673700749874115, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.96875, "step": 66 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 444.46875, "epoch": 0.0536, "grad_norm": 0.9039550027984934, "kl": 0.11181640625, "learning_rate": 4.996062842367655e-06, "loss": 0.0045, "reward": 1.734375, "reward_std": 0.2468603253364563, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 67 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 411.734375, "epoch": 0.0544, "grad_norm": 0.823442167835391, "kl": 0.10986328125, "learning_rate": 4.9959444702293025e-06, "loss": 0.0044, "reward": 1.625, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 68 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 440.40625, "epoch": 0.0552, "grad_norm": 1.7659596418323726, "kl": 0.111328125, "learning_rate": 4.995824346341041e-06, "loss": 0.0045, "reward": 1.453125, "reward_std": 0.2993341088294983, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 1.0, "step": 69 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 419.703125, "epoch": 0.056, "grad_norm": 1.207292808967077, "kl": 0.107421875, "learning_rate": 4.99570247078718e-06, "loss": 0.0043, "reward": 1.703125, "reward_std": 0.26196980476379395, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.984375, "step": 70 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 402.140625, "epoch": 0.0568, "grad_norm": 1.3768610203470437, "kl": 0.11328125, "learning_rate": 4.995578843653255e-06, "loss": 0.0045, "reward": 1.421875, "reward_std": 0.3403330445289612, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 1.0, "step": 71 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 432.078125, "epoch": 0.0576, "grad_norm": 1.7876726764336355, "kl": 0.1083984375, "learning_rate": 4.995453465026033e-06, "loss": 0.0043, "reward": 1.5, "reward_std": 0.2845909595489502, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.984375, "step": 72 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 440.9375, "epoch": 0.0584, "grad_norm": 1.20618885021594, "kl": 0.1044921875, "learning_rate": 4.995326334993508e-06, "loss": 0.0042, "reward": 1.640625, "reward_std": 0.24039676785469055, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 73 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 481.453125, "epoch": 0.0592, "grad_norm": 1.0694999127985507, "kl": 0.10888671875, "learning_rate": 4.9951974536449055e-06, "loss": 0.0044, "reward": 1.453125, "reward_std": 0.34259048104286194, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.9375, "step": 74 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 563.21875, "epoch": 0.06, "grad_norm": 0.6675463248539277, "kl": 0.091796875, "learning_rate": 4.9950668210706795e-06, "loss": 0.0037, "reward": 1.828125, "reward_std": 0.22340349853038788, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.96875, "step": 75 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 418.25, "epoch": 0.0608, "grad_norm": 0.7716067192758925, "kl": 0.10791015625, "learning_rate": 4.994934437362513e-06, "loss": 0.0043, "reward": 1.671875, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 76 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 465.265625, "epoch": 0.0616, "grad_norm": 0.9137739572414142, "kl": 0.107421875, "learning_rate": 4.994800302613318e-06, "loss": 0.0043, "reward": 1.6875, "reward_std": 0.24251842498779297, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 77 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 468.03125, "epoch": 0.0624, "grad_norm": 1.4439553758293866, "kl": 0.10302734375, "learning_rate": 4.994664416917236e-06, "loss": 0.0041, "reward": 1.703125, "reward_std": 0.2688094973564148, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.984375, "step": 78 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 495.625, "epoch": 0.0632, "grad_norm": 1.5610476959939437, "kl": 0.099609375, "learning_rate": 4.994526780369636e-06, "loss": 0.004, "reward": 1.5625, "reward_std": 0.4433611333370209, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.953125, "step": 79 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 461.71875, "epoch": 0.064, "grad_norm": 1.4117659211308278, "kl": 0.10400390625, "learning_rate": 4.9943873930671175e-06, "loss": 0.0042, "reward": 1.78125, "reward_std": 0.2177756428718567, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.984375, "step": 80 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 468.828125, "epoch": 0.0648, "grad_norm": 1.1585357618323595, "kl": 0.1103515625, "learning_rate": 4.994246255107506e-06, "loss": 0.0044, "reward": 1.609375, "reward_std": 0.2472364604473114, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 81 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 473.859375, "epoch": 0.0656, "grad_norm": 0.650633536981878, "kl": 0.1142578125, "learning_rate": 4.994103366589859e-06, "loss": 0.0046, "reward": 1.671875, "reward_std": 0.2289944440126419, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.96875, "step": 82 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 542.234375, "epoch": 0.0664, "grad_norm": 0.9189496334102202, "kl": 0.1005859375, "learning_rate": 4.993958727614462e-06, "loss": 0.004, "reward": 1.65625, "reward_std": 0.2041158676147461, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.984375, "step": 83 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 432.5, "epoch": 0.0672, "grad_norm": 0.9665160891937301, "kl": 0.11328125, "learning_rate": 4.993812338282826e-06, "loss": 0.0045, "reward": 1.3125, "reward_std": 0.2346404492855072, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 1.0, "step": 84 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 491.40625, "epoch": 0.068, "grad_norm": 1.3058592035065932, "kl": 0.126953125, "learning_rate": 4.993664198697694e-06, "loss": 0.0051, "reward": 1.65625, "reward_std": 0.2893187999725342, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 85 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 549.328125, "epoch": 0.0688, "grad_norm": 0.9355477411550364, "kl": 0.1064453125, "learning_rate": 4.993514308963037e-06, "loss": 0.0043, "reward": 1.5625, "reward_std": 0.2346404492855072, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 86 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 468.53125, "epoch": 0.0696, "grad_norm": 2.916067462057221, "kl": 0.11572265625, "learning_rate": 4.993362669184051e-06, "loss": 0.0046, "reward": 1.53125, "reward_std": 0.25513991713523865, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.984375, "step": 87 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 502.3125, "epoch": 0.0704, "grad_norm": 1.4264894854376802, "kl": 0.109375, "learning_rate": 4.993209279467164e-06, "loss": 0.0044, "reward": 1.78125, "reward_std": 0.3682710528373718, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.984375, "step": 88 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 489.5625, "epoch": 0.0712, "grad_norm": 1.014116772780785, "kl": 0.107421875, "learning_rate": 4.993054139920031e-06, "loss": 0.0043, "reward": 1.625, "reward_std": 0.2130674123764038, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 89 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 502.875, "epoch": 0.072, "grad_norm": 1.0276948195058366, "kl": 0.11767578125, "learning_rate": 4.992897250651535e-06, "loss": 0.0047, "reward": 1.578125, "reward_std": 0.29826050996780396, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 90 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 435.234375, "epoch": 0.0728, "grad_norm": 2.4748299765258666, "kl": 0.1201171875, "learning_rate": 4.992738611771787e-06, "loss": 0.0048, "reward": 1.375, "reward_std": 0.3335031569004059, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "step": 91 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 440.265625, "epoch": 0.0736, "grad_norm": 1.1630505917313283, "kl": 0.1142578125, "learning_rate": 4.992578223392124e-06, "loss": 0.0046, "reward": 1.65625, "reward_std": 0.3608423173427582, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 92 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 431.890625, "epoch": 0.0744, "grad_norm": 0.671672316043001, "kl": 0.10546875, "learning_rate": 4.992416085625115e-06, "loss": 0.0042, "reward": 1.84375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 93 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 429.9375, "epoch": 0.0752, "grad_norm": 0.9166302480771739, "kl": 0.12060546875, "learning_rate": 4.992252198584554e-06, "loss": 0.0048, "reward": 1.75, "reward_std": 0.2177756428718567, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 94 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 427.390625, "epoch": 0.076, "grad_norm": 0.8938241675508535, "kl": 0.09521484375, "learning_rate": 4.992086562385462e-06, "loss": 0.0038, "reward": 1.609375, "reward_std": 0.24831004440784454, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 95 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 431.40625, "epoch": 0.0768, "grad_norm": 0.9472168820065903, "kl": 0.1064453125, "learning_rate": 4.9919191771440905e-06, "loss": 0.0043, "reward": 1.890625, "reward_std": 0.16887325048446655, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 96 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 486.171875, "epoch": 0.0776, "grad_norm": 1.53466631135331, "kl": 0.1123046875, "learning_rate": 4.9917500429779165e-06, "loss": 0.0045, "reward": 1.71875, "reward_std": 0.23827511072158813, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 97 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 425.484375, "epoch": 0.0784, "grad_norm": 0.9843285642666483, "kl": 0.107421875, "learning_rate": 4.991579160005644e-06, "loss": 0.0043, "reward": 1.734375, "reward_std": 0.1804211586713791, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 98 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 426.4375, "epoch": 0.0792, "grad_norm": 0.6754793866432567, "kl": 0.10546875, "learning_rate": 4.991406528347206e-06, "loss": 0.0042, "reward": 1.875, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 99 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 461.84375, "epoch": 0.08, "grad_norm": 0.7474505512475345, "kl": 0.11669921875, "learning_rate": 4.9912321481237616e-06, "loss": 0.0047, "reward": 1.5, "reward_std": 0.17570312321186066, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 100 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 427.0, "epoch": 0.0808, "grad_norm": 1.0264334044052603, "kl": 0.11279296875, "learning_rate": 4.991056019457697e-06, "loss": 0.0045, "reward": 1.75, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 101 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 440.75, "epoch": 0.0816, "grad_norm": 1.30829061939001, "kl": 0.12255859375, "learning_rate": 4.990878142472628e-06, "loss": 0.0049, "reward": 1.53125, "reward_std": 0.25513991713523865, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 102 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 411.921875, "epoch": 0.0824, "grad_norm": 0.5577946664774175, "kl": 0.1181640625, "learning_rate": 4.990698517293394e-06, "loss": 0.0047, "reward": 1.546875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 103 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 425.9375, "epoch": 0.0832, "grad_norm": 0.8685769518063424, "kl": 0.1083984375, "learning_rate": 4.9905171440460645e-06, "loss": 0.0043, "reward": 1.78125, "reward_std": 0.1962026059627533, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 104 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 466.546875, "epoch": 0.084, "grad_norm": 0.5092956743369942, "kl": 0.11181640625, "learning_rate": 4.990334022857932e-06, "loss": 0.0045, "reward": 1.828125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 105 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 406.65625, "epoch": 0.0848, "grad_norm": 1.2897529204623364, "kl": 0.11474609375, "learning_rate": 4.990149153857519e-06, "loss": 0.0046, "reward": 1.421875, "reward_std": 0.3934885859489441, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 1.0, "step": 106 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 406.921875, "epoch": 0.0856, "grad_norm": 1.1754549401230758, "kl": 0.11572265625, "learning_rate": 4.989962537174573e-06, "loss": 0.0046, "reward": 1.75, "reward_std": 0.2961388826370239, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 107 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 440.75, "epoch": 0.0864, "grad_norm": 0.8386331634897316, "kl": 0.11572265625, "learning_rate": 4.989774172940071e-06, "loss": 0.0046, "reward": 1.625, "reward_std": 0.17570312321186066, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.984375, "step": 108 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 448.296875, "epoch": 0.0872, "grad_norm": 0.7450142523211052, "kl": 0.10498046875, "learning_rate": 4.989584061286211e-06, "loss": 0.0042, "reward": 1.609375, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 109 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 427.640625, "epoch": 0.088, "grad_norm": 0.08383802239107432, "kl": 0.0986328125, "learning_rate": 4.989392202346423e-06, "loss": 0.004, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 110 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 428.46875, "epoch": 0.0888, "grad_norm": 1.2199438535090528, "kl": 0.12109375, "learning_rate": 4.989198596255361e-06, "loss": 0.0048, "reward": 1.75, "reward_std": 0.38452720642089844, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.984375, "step": 111 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 571.71875, "epoch": 0.0896, "grad_norm": 0.9646884627154972, "kl": 0.103515625, "learning_rate": 4.989003243148904e-06, "loss": 0.0041, "reward": 1.5, "reward_std": 0.33090677857398987, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.96875, "step": 112 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 431.6875, "epoch": 0.0904, "grad_norm": 0.8033337673718765, "kl": 0.103515625, "learning_rate": 4.988806143164159e-06, "loss": 0.0041, "reward": 1.796875, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 113 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 447.421875, "epoch": 0.0912, "grad_norm": 0.46704702690421707, "kl": 0.1083984375, "learning_rate": 4.988607296439459e-06, "loss": 0.0043, "reward": 1.84375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 114 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 487.40625, "epoch": 0.092, "grad_norm": 0.9691362664902082, "kl": 0.103515625, "learning_rate": 4.98840670311436e-06, "loss": 0.0041, "reward": 1.671875, "reward_std": 0.2688094973564148, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 115 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 436.5625, "epoch": 0.0928, "grad_norm": 0.7765141393387954, "kl": 0.1005859375, "learning_rate": 4.988204363329648e-06, "loss": 0.004, "reward": 1.84375, "reward_std": 0.22201895713806152, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.984375, "step": 116 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 495.34375, "epoch": 0.0936, "grad_norm": 0.7147590654651254, "kl": 0.099609375, "learning_rate": 4.988000277227334e-06, "loss": 0.004, "reward": 1.65625, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.984375, "step": 117 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 495.984375, "epoch": 0.0944, "grad_norm": 1.677681916051708, "kl": 0.1103515625, "learning_rate": 4.987794444950651e-06, "loss": 0.0044, "reward": 1.546875, "reward_std": 0.3047240972518921, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.984375, "step": 118 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 415.78125, "epoch": 0.0952, "grad_norm": 0.7635954540557363, "kl": 0.134765625, "learning_rate": 4.987586866644061e-06, "loss": 0.0054, "reward": 1.515625, "reward_std": 0.189372718334198, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 1.0, "step": 119 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 464.953125, "epoch": 0.096, "grad_norm": 0.7514932681425285, "kl": 0.1103515625, "learning_rate": 4.9873775424532515e-06, "loss": 0.0044, "reward": 1.59375, "reward_std": 0.17570313811302185, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 120 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 424.84375, "epoch": 0.0968, "grad_norm": 1.0456069680305644, "kl": 0.12353515625, "learning_rate": 4.9871664725251314e-06, "loss": 0.0049, "reward": 1.46875, "reward_std": 0.2709311842918396, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 121 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 441.234375, "epoch": 0.0976, "grad_norm": 1.3131456197984361, "kl": 0.09912109375, "learning_rate": 4.986953657007841e-06, "loss": 0.004, "reward": 1.828125, "reward_std": 0.19044627249240875, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 122 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 468.234375, "epoch": 0.0984, "grad_norm": 4.478005441665192, "kl": 0.10791015625, "learning_rate": 4.98673909605074e-06, "loss": 0.0043, "reward": 1.734375, "reward_std": 0.2198973000049591, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 123 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 433.21875, "epoch": 0.0992, "grad_norm": 1.006704967535853, "kl": 0.103515625, "learning_rate": 4.986522789804417e-06, "loss": 0.0041, "reward": 1.71875, "reward_std": 0.24251843988895416, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 124 }, { "all_correct": 0.0, "all_wrong": 0.125, "completion_length": 411.03125, "epoch": 0.1, "grad_norm": 1.5202176841365798, "kl": 0.130859375, "learning_rate": 4.986304738420684e-06, "loss": 0.0052, "reward": 1.328125, "reward_std": 0.42256343364715576, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.984375, "step": 125 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 451.96875, "epoch": 0.1008, "grad_norm": 0.691520829690945, "kl": 0.1064453125, "learning_rate": 4.986084942052577e-06, "loss": 0.0043, "reward": 1.8125, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 126 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 493.453125, "epoch": 0.1016, "grad_norm": 0.6394818378327461, "kl": 0.0966796875, "learning_rate": 4.9858634008543574e-06, "loss": 0.0039, "reward": 1.859375, "reward_std": 0.1315089464187622, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 127 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 420.140625, "epoch": 0.1024, "grad_norm": 1.122230685100362, "kl": 0.10595703125, "learning_rate": 4.985640114981513e-06, "loss": 0.0042, "reward": 1.6875, "reward_std": 0.24359199404716492, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 128 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 397.125, "epoch": 0.1032, "grad_norm": 1.0414783708435302, "kl": 0.103515625, "learning_rate": 4.985415084590752e-06, "loss": 0.0041, "reward": 1.84375, "reward_std": 0.1735912710428238, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 129 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 466.3125, "epoch": 0.104, "grad_norm": 1.2035770384664692, "kl": 0.09228515625, "learning_rate": 4.985188309840012e-06, "loss": 0.0037, "reward": 1.84375, "reward_std": 0.22461533546447754, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 130 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 383.203125, "epoch": 0.1048, "grad_norm": 1.0365072299262117, "kl": 0.1240234375, "learning_rate": 4.984959790888451e-06, "loss": 0.005, "reward": 1.5625, "reward_std": 0.22461533546447754, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 131 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 443.65625, "epoch": 0.1056, "grad_norm": 14.647926893861055, "kl": 0.1357421875, "learning_rate": 4.984729527896451e-06, "loss": 0.0054, "reward": 1.6875, "reward_std": 0.1962026059627533, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 132 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.140625, "epoch": 0.1064, "grad_norm": 11.978453161980177, "kl": 0.263671875, "learning_rate": 4.984497521025622e-06, "loss": 0.0105, "reward": 1.734375, "reward_std": 0.2198973000049591, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 133 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 499.890625, "epoch": 0.1072, "grad_norm": 1.2891133348471002, "kl": 0.498046875, "learning_rate": 4.984263770438793e-06, "loss": 0.0199, "reward": 1.640625, "reward_std": 0.2198973000049591, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 134 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 596.953125, "epoch": 0.108, "grad_norm": 5.990687084060201, "kl": 1.0234375, "learning_rate": 4.984028276300021e-06, "loss": 0.0407, "reward": 1.8125, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 135 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 495.765625, "epoch": 0.1088, "grad_norm": 0.7532120814342459, "kl": 0.103515625, "learning_rate": 4.983791038774585e-06, "loss": 0.0041, "reward": 1.890625, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 136 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 442.171875, "epoch": 0.1096, "grad_norm": 0.866752394198012, "kl": 0.11572265625, "learning_rate": 4.983552058028985e-06, "loss": 0.0046, "reward": 1.734375, "reward_std": 0.17358146607875824, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 137 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 476.703125, "epoch": 0.1104, "grad_norm": 0.5921378125196634, "kl": 0.1103515625, "learning_rate": 4.9833113342309495e-06, "loss": 0.0044, "reward": 1.75, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.984375, "step": 138 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 495.859375, "epoch": 0.1112, "grad_norm": 3.8570119454857057, "kl": 0.11328125, "learning_rate": 4.983068867549427e-06, "loss": 0.0045, "reward": 1.765625, "reward_std": 0.2824692726135254, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 139 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 432.078125, "epoch": 0.112, "grad_norm": 0.7479618451959271, "kl": 0.123046875, "learning_rate": 4.982824658154589e-06, "loss": 0.0049, "reward": 1.859375, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 140 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 477.375, "epoch": 0.1128, "grad_norm": 0.7128326137294528, "kl": 0.10009765625, "learning_rate": 4.9825787062178315e-06, "loss": 0.004, "reward": 1.75, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 141 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 445.53125, "epoch": 0.1136, "grad_norm": 1.6602439851000796, "kl": 0.115234375, "learning_rate": 4.982331011911774e-06, "loss": 0.0046, "reward": 1.6875, "reward_std": 0.22461533546447754, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 142 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 475.25, "epoch": 0.1144, "grad_norm": 0.946041800084751, "kl": 0.1064453125, "learning_rate": 4.982081575410256e-06, "loss": 0.0043, "reward": 1.6875, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 143 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 473.34375, "epoch": 0.1152, "grad_norm": 0.997362591524624, "kl": 0.1064453125, "learning_rate": 4.9818303968883445e-06, "loss": 0.0043, "reward": 1.78125, "reward_std": 0.1872510462999344, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 144 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 473.625, "epoch": 0.116, "grad_norm": 0.9153829155861837, "kl": 0.1083984375, "learning_rate": 4.981577476522323e-06, "loss": 0.0043, "reward": 1.65625, "reward_std": 0.3208816647529602, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 145 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 520.578125, "epoch": 0.1168, "grad_norm": 0.6533677207164628, "kl": 0.10107421875, "learning_rate": 4.981322814489703e-06, "loss": 0.004, "reward": 1.765625, "reward_std": 0.23925508558750153, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.984375, "step": 146 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 480.828125, "epoch": 0.1176, "grad_norm": 0.93405014645264, "kl": 0.11181640625, "learning_rate": 4.981066410969215e-06, "loss": 0.0045, "reward": 1.609375, "reward_std": 0.2688094973564148, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 147 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 508.25, "epoch": 0.1184, "grad_norm": 0.9990112151974295, "kl": 0.1015625, "learning_rate": 4.980808266140813e-06, "loss": 0.0041, "reward": 1.25, "reward_std": 0.31300365924835205, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.984375, "step": 148 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 480.359375, "epoch": 0.1192, "grad_norm": 0.9937737169099482, "kl": 0.15625, "learning_rate": 4.9805483801856744e-06, "loss": 0.0062, "reward": 1.78125, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 149 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 483.484375, "epoch": 0.12, "grad_norm": 0.9316226268565391, "kl": 0.10302734375, "learning_rate": 4.980286753286196e-06, "loss": 0.0041, "reward": 1.671875, "reward_std": 0.2414703369140625, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.984375, "step": 150 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 490.984375, "epoch": 0.1208, "grad_norm": 0.9497020028783308, "kl": 0.130859375, "learning_rate": 4.980023385625996e-06, "loss": 0.0052, "reward": 1.671875, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 151 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 473.8125, "epoch": 0.1216, "grad_norm": 0.33921544153730604, "kl": 0.10888671875, "learning_rate": 4.979758277389919e-06, "loss": 0.0044, "reward": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 152 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 435.734375, "epoch": 0.1224, "grad_norm": 1.0254651580844363, "kl": 0.1171875, "learning_rate": 4.9794914287640264e-06, "loss": 0.0047, "reward": 1.421875, "reward_std": 0.19044627249240875, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 1.0, "step": 153 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 465.703125, "epoch": 0.1232, "grad_norm": 0.7503411726500947, "kl": 0.1435546875, "learning_rate": 4.979222839935602e-06, "loss": 0.0057, "reward": 1.640625, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 154 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 444.78125, "epoch": 0.124, "grad_norm": 0.6925732949036683, "kl": 0.10009765625, "learning_rate": 4.9789525110931545e-06, "loss": 0.004, "reward": 1.828125, "reward_std": 0.17358146607875824, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 155 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 438.28125, "epoch": 0.1248, "grad_norm": 0.6444485434393429, "kl": 0.1123046875, "learning_rate": 4.978680442426409e-06, "loss": 0.0045, "reward": 1.921875, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 156 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 510.25, "epoch": 0.1256, "grad_norm": 0.7238164485839524, "kl": 0.09765625, "learning_rate": 4.978406634126315e-06, "loss": 0.0039, "reward": 1.828125, "reward_std": 0.23144522309303284, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 157 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 405.875, "epoch": 0.1264, "grad_norm": 0.7341274999066865, "kl": 0.1220703125, "learning_rate": 4.978131086385041e-06, "loss": 0.0049, "reward": 1.65625, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.984375, "step": 158 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 441.984375, "epoch": 0.1272, "grad_norm": 0.632450681610908, "kl": 0.12109375, "learning_rate": 4.977853799395976e-06, "loss": 0.0048, "reward": 1.625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 159 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 400.453125, "epoch": 0.128, "grad_norm": 0.9113883106967559, "kl": 0.109375, "learning_rate": 4.977574773353732e-06, "loss": 0.0044, "reward": 1.8125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 160 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 500.09375, "epoch": 0.1288, "grad_norm": 0.7766519579442204, "kl": 0.111328125, "learning_rate": 4.97729400845414e-06, "loss": 0.0044, "reward": 1.796875, "reward_std": 0.23144522309303284, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 161 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 503.734375, "epoch": 0.1296, "grad_norm": 0.8484901654440711, "kl": 0.10107421875, "learning_rate": 4.977011504894253e-06, "loss": 0.004, "reward": 1.71875, "reward_std": 0.23356689512729645, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 162 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.140625, "epoch": 0.1304, "grad_norm": 1.694602433143206, "kl": 0.11279296875, "learning_rate": 4.97672726287234e-06, "loss": 0.0045, "reward": 1.84375, "reward_std": 0.22461533546447754, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 163 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 431.734375, "epoch": 0.1312, "grad_norm": 1.9691320474006948, "kl": 0.1103515625, "learning_rate": 4.976441282587894e-06, "loss": 0.0044, "reward": 1.734375, "reward_std": 0.26196980476379395, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 164 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 436.3125, "epoch": 0.132, "grad_norm": 0.8571450813349311, "kl": 0.11083984375, "learning_rate": 4.9761535642416284e-06, "loss": 0.0044, "reward": 1.765625, "reward_std": 0.2472364604473114, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 165 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 422.296875, "epoch": 0.1328, "grad_norm": 0.8401543733494647, "kl": 0.10546875, "learning_rate": 4.9758641080354745e-06, "loss": 0.0042, "reward": 1.578125, "reward_std": 0.17358146607875824, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 166 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 454.875, "epoch": 0.1336, "grad_norm": 0.7715139058569292, "kl": 0.10546875, "learning_rate": 4.975572914172581e-06, "loss": 0.0042, "reward": 1.890625, "reward_std": 0.21778544783592224, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 167 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 360.328125, "epoch": 0.1344, "grad_norm": 1.033415950988869, "kl": 0.11474609375, "learning_rate": 4.975279982857324e-06, "loss": 0.0046, "reward": 1.71875, "reward_std": 0.1872510462999344, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 168 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 354.640625, "epoch": 0.1352, "grad_norm": 1.3592179501596968, "kl": 0.12060546875, "learning_rate": 4.97498531429529e-06, "loss": 0.0048, "reward": 1.46875, "reward_std": 0.25513994693756104, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 169 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 402.28125, "epoch": 0.136, "grad_norm": 1.0245659937809795, "kl": 0.1162109375, "learning_rate": 4.97468890869329e-06, "loss": 0.0047, "reward": 1.734375, "reward_std": 0.19044628739356995, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 170 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 441.65625, "epoch": 0.1368, "grad_norm": 0.8540055181158542, "kl": 0.1103515625, "learning_rate": 4.974390766259353e-06, "loss": 0.0044, "reward": 1.703125, "reward_std": 0.2597545385360718, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.96875, "step": 171 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 454.734375, "epoch": 0.1376, "grad_norm": 0.5927539427136594, "kl": 0.1162109375, "learning_rate": 4.974090887202726e-06, "loss": 0.0047, "reward": 1.765625, "reward_std": 0.1315089464187622, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 172 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 446.171875, "epoch": 0.1384, "grad_norm": 0.8255796715852812, "kl": 0.119140625, "learning_rate": 4.973789271733877e-06, "loss": 0.0048, "reward": 1.796875, "reward_std": 0.1530819833278656, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 173 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 464.390625, "epoch": 0.1392, "grad_norm": 0.08165960439046371, "kl": 0.10302734375, "learning_rate": 4.973485920064491e-06, "loss": 0.0041, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 174 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 430.578125, "epoch": 0.14, "grad_norm": 1.2656076828941412, "kl": 0.12353515625, "learning_rate": 4.973180832407471e-06, "loss": 0.0049, "reward": 1.703125, "reward_std": 0.38664889335632324, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 175 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 395.46875, "epoch": 0.1408, "grad_norm": 0.9967437669996866, "kl": 0.130859375, "learning_rate": 4.97287400897694e-06, "loss": 0.0052, "reward": 1.59375, "reward_std": 0.2709311842918396, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.984375, "step": 176 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 408.203125, "epoch": 0.1416, "grad_norm": 1.2284270222943057, "kl": 0.1376953125, "learning_rate": 4.972565449988238e-06, "loss": 0.0055, "reward": 1.640625, "reward_std": 0.2687061131000519, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 177 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 415.90625, "epoch": 0.1424, "grad_norm": 1.3762326715144548, "kl": 0.140625, "learning_rate": 4.972255155657925e-06, "loss": 0.0056, "reward": 1.328125, "reward_std": 0.2824692726135254, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 1.0, "step": 178 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 459.703125, "epoch": 0.1432, "grad_norm": 0.6842166715194086, "kl": 0.111328125, "learning_rate": 4.9719431262037755e-06, "loss": 0.0044, "reward": 1.6875, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 179 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 460.546875, "epoch": 0.144, "grad_norm": 0.8638732688380146, "kl": 0.12890625, "learning_rate": 4.971629361844785e-06, "loss": 0.0052, "reward": 1.65625, "reward_std": 0.2130674123764038, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 180 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 422.921875, "epoch": 0.1448, "grad_norm": 1.1470878317233275, "kl": 0.123046875, "learning_rate": 4.971313862801166e-06, "loss": 0.0049, "reward": 1.640625, "reward_std": 0.24831004440784454, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 181 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 467.40625, "epoch": 0.1456, "grad_norm": 1.2298228677872869, "kl": 0.1259765625, "learning_rate": 4.9709966292943455e-06, "loss": 0.005, "reward": 1.8125, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 182 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 502.609375, "epoch": 0.1464, "grad_norm": 5.045132796818451, "kl": 0.11767578125, "learning_rate": 4.970677661546972e-06, "loss": 0.0047, "reward": 1.71875, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 183 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 520.8125, "epoch": 0.1472, "grad_norm": 0.9572797889763173, "kl": 0.109375, "learning_rate": 4.970356959782909e-06, "loss": 0.0044, "reward": 1.859375, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 184 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 558.609375, "epoch": 0.148, "grad_norm": 0.8604921513908739, "kl": 0.111328125, "learning_rate": 4.970034524227239e-06, "loss": 0.0044, "reward": 1.84375, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 185 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 510.625, "epoch": 0.1488, "grad_norm": 1.1377042956515393, "kl": 0.130859375, "learning_rate": 4.969710355106256e-06, "loss": 0.0052, "reward": 1.4375, "reward_std": 0.3424547016620636, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 186 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 545.859375, "epoch": 0.1496, "grad_norm": 0.7689950103002073, "kl": 0.11572265625, "learning_rate": 4.969384452647477e-06, "loss": 0.0046, "reward": 1.734375, "reward_std": 0.22097086906433105, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.96875, "step": 187 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 536.71875, "epoch": 0.1504, "grad_norm": 0.8912646596770333, "kl": 0.107421875, "learning_rate": 4.969056817079633e-06, "loss": 0.0043, "reward": 1.84375, "reward_std": 0.19149437546730042, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 188 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 509.21875, "epoch": 0.1512, "grad_norm": 1.002615987813076, "kl": 0.1298828125, "learning_rate": 4.968727448632669e-06, "loss": 0.0052, "reward": 1.53125, "reward_std": 0.25513994693756104, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.984375, "step": 189 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 555.1875, "epoch": 0.152, "grad_norm": 1.6278003157000074, "kl": 0.123046875, "learning_rate": 4.968396347537751e-06, "loss": 0.0049, "reward": 1.765625, "reward_std": 0.17358146607875824, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.984375, "step": 190 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 518.546875, "epoch": 0.1528, "grad_norm": 0.8831829921163867, "kl": 0.1142578125, "learning_rate": 4.968063514027258e-06, "loss": 0.0046, "reward": 1.78125, "reward_std": 0.1872510462999344, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 191 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 570.671875, "epoch": 0.1536, "grad_norm": 0.7708673303650967, "kl": 0.12451171875, "learning_rate": 4.967728948334784e-06, "loss": 0.005, "reward": 1.734375, "reward_std": 0.20873048901557922, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 192 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 599.734375, "epoch": 0.1544, "grad_norm": 0.9116648653068451, "kl": 0.10546875, "learning_rate": 4.967392650695141e-06, "loss": 0.0042, "reward": 1.625, "reward_std": 0.2756394147872925, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.953125, "step": 193 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 482.828125, "epoch": 0.1552, "grad_norm": 1.185885422942044, "kl": 0.126953125, "learning_rate": 4.967054621344356e-06, "loss": 0.0051, "reward": 1.625, "reward_std": 0.3514062762260437, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.96875, "step": 194 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 499.515625, "epoch": 0.156, "grad_norm": 0.8173946853176801, "kl": 0.115234375, "learning_rate": 4.96671486051967e-06, "loss": 0.0046, "reward": 1.65625, "reward_std": 0.1962025910615921, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 195 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 553.515625, "epoch": 0.1568, "grad_norm": 0.7455450159086521, "kl": 0.11865234375, "learning_rate": 4.966373368459542e-06, "loss": 0.0048, "reward": 1.578125, "reward_std": 0.26735979318618774, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.984375, "step": 196 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 459.140625, "epoch": 0.1576, "grad_norm": 0.9292556255485958, "kl": 0.12158203125, "learning_rate": 4.966030145403642e-06, "loss": 0.0049, "reward": 1.59375, "reward_std": 0.27019423246383667, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.984375, "step": 197 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 562.765625, "epoch": 0.1584, "grad_norm": 0.7768499393593932, "kl": 0.1298828125, "learning_rate": 4.965685191592859e-06, "loss": 0.0052, "reward": 1.796875, "reward_std": 0.2902791500091553, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.96875, "step": 198 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 431.5625, "epoch": 0.1592, "grad_norm": 0.6450068913255618, "kl": 0.11328125, "learning_rate": 4.9653385072692935e-06, "loss": 0.0045, "reward": 1.78125, "reward_std": 0.14480239152908325, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.96875, "step": 199 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 447.375, "epoch": 0.16, "grad_norm": 2.1890081897816867, "kl": 0.12890625, "learning_rate": 4.964990092676263e-06, "loss": 0.0051, "reward": 1.640625, "reward_std": 0.38664889335632324, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 200 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 408.921875, "epoch": 0.1608, "grad_norm": 0.9503745916839271, "kl": 0.1220703125, "learning_rate": 4.964639948058297e-06, "loss": 0.0049, "reward": 1.84375, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 201 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 426.609375, "epoch": 0.1616, "grad_norm": 0.6309582728790826, "kl": 0.107421875, "learning_rate": 4.964288073661142e-06, "loss": 0.0043, "reward": 1.84375, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.984375, "step": 202 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 488.328125, "epoch": 0.1624, "grad_norm": 0.6938703856880744, "kl": 0.119140625, "learning_rate": 4.963934469731756e-06, "loss": 0.0048, "reward": 1.765625, "reward_std": 0.1893727034330368, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 203 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 471.640625, "epoch": 0.1632, "grad_norm": 1.0495615953561261, "kl": 0.11181640625, "learning_rate": 4.963579136518312e-06, "loss": 0.0045, "reward": 1.71875, "reward_std": 0.26409149169921875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 204 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 461.875, "epoch": 0.164, "grad_norm": 0.8342885507359887, "kl": 0.1162109375, "learning_rate": 4.963222074270197e-06, "loss": 0.0046, "reward": 1.59375, "reward_std": 0.2130674123764038, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 205 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 466.0, "epoch": 0.1648, "grad_norm": 0.5996858178874089, "kl": 0.130859375, "learning_rate": 4.962863283238011e-06, "loss": 0.0052, "reward": 1.734375, "reward_std": 0.16887323558330536, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 206 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 462.984375, "epoch": 0.1656, "grad_norm": 15.684679168053783, "kl": 0.11474609375, "learning_rate": 4.962502763673566e-06, "loss": 0.0046, "reward": 1.953125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 207 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 411.453125, "epoch": 0.1664, "grad_norm": 1.2234997917416828, "kl": 0.1240234375, "learning_rate": 4.96214051582989e-06, "loss": 0.005, "reward": 1.609375, "reward_std": 0.2519446909427643, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 208 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 469.046875, "epoch": 0.1672, "grad_norm": 3.6075728764664015, "kl": 0.12158203125, "learning_rate": 4.961776539961222e-06, "loss": 0.0049, "reward": 1.71875, "reward_std": 0.22461533546447754, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 209 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 467.609375, "epoch": 0.168, "grad_norm": 1.0093347119497476, "kl": 0.12109375, "learning_rate": 4.961410836323014e-06, "loss": 0.0048, "reward": 1.546875, "reward_std": 0.2777610421180725, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 210 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 481.53125, "epoch": 0.1688, "grad_norm": 1.4890072026116983, "kl": 0.11279296875, "learning_rate": 4.961043405171931e-06, "loss": 0.0045, "reward": 1.671875, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 211 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 477.5625, "epoch": 0.1696, "grad_norm": 0.9493457601594587, "kl": 0.10888671875, "learning_rate": 4.9606742467658505e-06, "loss": 0.0044, "reward": 1.5625, "reward_std": 0.2540663480758667, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 212 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.40625, "epoch": 0.1704, "grad_norm": 1.2679381611265614, "kl": 0.1240234375, "learning_rate": 4.960303361363863e-06, "loss": 0.0049, "reward": 1.78125, "reward_std": 0.2540663480758667, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 213 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 485.53125, "epoch": 0.1712, "grad_norm": 0.9454250792077608, "kl": 0.11083984375, "learning_rate": 4.959930749226269e-06, "loss": 0.0044, "reward": 1.640625, "reward_std": 0.3352486789226532, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.953125, "step": 214 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 541.421875, "epoch": 0.172, "grad_norm": 0.7956760671345361, "kl": 0.1083984375, "learning_rate": 4.9595564106145825e-06, "loss": 0.0043, "reward": 1.796875, "reward_std": 0.25726157426834106, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.984375, "step": 215 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 485.796875, "epoch": 0.1728, "grad_norm": 0.9430121875636792, "kl": 0.109375, "learning_rate": 4.959180345791528e-06, "loss": 0.0044, "reward": 1.828125, "reward_std": 0.28460076451301575, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 216 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 480.90625, "epoch": 0.1736, "grad_norm": 0.7171185322557593, "kl": 0.0986328125, "learning_rate": 4.958802555021042e-06, "loss": 0.004, "reward": 1.875, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 217 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 473.796875, "epoch": 0.1744, "grad_norm": 0.5353140087390248, "kl": 0.115234375, "learning_rate": 4.958423038568274e-06, "loss": 0.0046, "reward": 1.8125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 218 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 485.4375, "epoch": 0.1752, "grad_norm": 0.5570596760557941, "kl": 0.103515625, "learning_rate": 4.958041796699583e-06, "loss": 0.0041, "reward": 1.734375, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 219 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 448.890625, "epoch": 0.176, "grad_norm": 0.7544119083860452, "kl": 0.11328125, "learning_rate": 4.957658829682539e-06, "loss": 0.0045, "reward": 1.71875, "reward_std": 0.17570312321186066, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 220 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 437.34375, "epoch": 0.1768, "grad_norm": 1.1584516801614175, "kl": 0.126953125, "learning_rate": 4.9572741377859225e-06, "loss": 0.0051, "reward": 1.78125, "reward_std": 0.2041158676147461, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 221 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 511.46875, "epoch": 0.1776, "grad_norm": 1.0657478691247289, "kl": 0.10595703125, "learning_rate": 4.956887721279726e-06, "loss": 0.0042, "reward": 1.734375, "reward_std": 0.1804211586713791, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 222 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 480.546875, "epoch": 0.1784, "grad_norm": 0.6005686701039716, "kl": 0.10888671875, "learning_rate": 4.95649958043515e-06, "loss": 0.0043, "reward": 1.765625, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 223 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 557.609375, "epoch": 0.1792, "grad_norm": 0.7167623609538366, "kl": 0.09912109375, "learning_rate": 4.956109715524609e-06, "loss": 0.004, "reward": 1.828125, "reward_std": 0.16887323558330536, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 224 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 445.765625, "epoch": 0.18, "grad_norm": 1.1325882965070015, "kl": 0.1083984375, "learning_rate": 4.9557181268217225e-06, "loss": 0.0043, "reward": 1.71875, "reward_std": 0.3104073107242584, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 225 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 462.71875, "epoch": 0.1808, "grad_norm": 1.1803208487766932, "kl": 0.119140625, "learning_rate": 4.955324814601324e-06, "loss": 0.0048, "reward": 1.6875, "reward_std": 0.23356688022613525, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 226 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 446.671875, "epoch": 0.1816, "grad_norm": 0.6396309044211909, "kl": 0.10791015625, "learning_rate": 4.954929779139455e-06, "loss": 0.0043, "reward": 1.671875, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 227 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 485.78125, "epoch": 0.1824, "grad_norm": 0.9122035852929374, "kl": 0.10400390625, "learning_rate": 4.954533020713367e-06, "loss": 0.0042, "reward": 1.828125, "reward_std": 0.23144522309303284, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 228 }, { "all_correct": 0.25, "all_wrong": 0.625, "completion_length": 443.296875, "epoch": 0.1832, "grad_norm": 0.43387655485080506, "kl": 0.123046875, "learning_rate": 4.954134539601519e-06, "loss": 0.0049, "reward": 1.359375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 1.0, "step": 229 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 452.015625, "epoch": 0.184, "grad_norm": 0.8096554413694654, "kl": 0.1298828125, "learning_rate": 4.953734336083582e-06, "loss": 0.0052, "reward": 1.71875, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 230 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 426.5625, "epoch": 0.1848, "grad_norm": 0.6413121537106756, "kl": 0.1142578125, "learning_rate": 4.953332410440434e-06, "loss": 0.0046, "reward": 1.6875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 231 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 524.484375, "epoch": 0.1856, "grad_norm": 1.03331732968214, "kl": 0.1015625, "learning_rate": 4.952928762954161e-06, "loss": 0.0041, "reward": 1.515625, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 1.0, "step": 232 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 475.0625, "epoch": 0.1864, "grad_norm": 0.37423563486247585, "kl": 0.10107421875, "learning_rate": 4.952523393908059e-06, "loss": 0.004, "reward": 1.5625, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 233 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 415.375, "epoch": 0.1872, "grad_norm": 0.778531578943487, "kl": 0.115234375, "learning_rate": 4.952116303586631e-06, "loss": 0.0046, "reward": 1.734375, "reward_std": 0.189372718334198, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 234 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 536.109375, "epoch": 0.188, "grad_norm": 0.7253260385102834, "kl": 0.0966796875, "learning_rate": 4.951707492275589e-06, "loss": 0.0039, "reward": 1.875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 235 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 507.40625, "epoch": 0.1888, "grad_norm": 0.778897690244807, "kl": 0.10986328125, "learning_rate": 4.951296960261853e-06, "loss": 0.0044, "reward": 1.796875, "reward_std": 0.1804211586713791, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 236 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 484.21875, "epoch": 0.1896, "grad_norm": 0.9712714231037451, "kl": 0.1123046875, "learning_rate": 4.95088470783355e-06, "loss": 0.0045, "reward": 1.578125, "reward_std": 0.2777610421180725, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 237 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 543.5, "epoch": 0.1904, "grad_norm": 0.7958523077567334, "kl": 0.10595703125, "learning_rate": 4.950470735280013e-06, "loss": 0.0042, "reward": 1.6875, "reward_std": 0.2130674123764038, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 238 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 507.09375, "epoch": 0.1912, "grad_norm": 0.09715496428711683, "kl": 0.0986328125, "learning_rate": 4.950055042891786e-06, "loss": 0.004, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 239 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 536.703125, "epoch": 0.192, "grad_norm": 1.0485606281171582, "kl": 0.111328125, "learning_rate": 4.949637630960618e-06, "loss": 0.0045, "reward": 1.71875, "reward_std": 0.17570312321186066, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 240 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 506.53125, "epoch": 0.1928, "grad_norm": 0.8799111721141154, "kl": 0.11767578125, "learning_rate": 4.949218499779462e-06, "loss": 0.0047, "reward": 1.609375, "reward_std": 0.2198973000049591, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.984375, "step": 241 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 503.9375, "epoch": 0.1936, "grad_norm": 1.1528474685863739, "kl": 0.115234375, "learning_rate": 4.948797649642484e-06, "loss": 0.0046, "reward": 1.53125, "reward_std": 0.19149437546730042, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 242 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 583.390625, "epoch": 0.1944, "grad_norm": 0.635291825122285, "kl": 0.11279296875, "learning_rate": 4.94837508084505e-06, "loss": 0.0045, "reward": 1.71875, "reward_std": 0.26124268770217896, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 243 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 556.90625, "epoch": 0.1952, "grad_norm": 0.708666658482898, "kl": 0.11669921875, "learning_rate": 4.9479507936837364e-06, "loss": 0.0047, "reward": 1.84375, "reward_std": 0.19149437546730042, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 244 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 474.171875, "epoch": 0.196, "grad_norm": 0.843492251351777, "kl": 0.11572265625, "learning_rate": 4.947524788456325e-06, "loss": 0.0046, "reward": 1.640625, "reward_std": 0.19044628739356995, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 245 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 515.984375, "epoch": 0.1968, "grad_norm": 1.128475107879419, "kl": 0.1142578125, "learning_rate": 4.947097065461801e-06, "loss": 0.0046, "reward": 1.8125, "reward_std": 0.22558549046516418, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.96875, "step": 246 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 478.703125, "epoch": 0.1976, "grad_norm": 1.3156818737169922, "kl": 0.12255859375, "learning_rate": 4.946667625000358e-06, "loss": 0.0049, "reward": 1.578125, "reward_std": 0.2472364604473114, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 247 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 480.96875, "epoch": 0.1984, "grad_norm": 0.5624094172009639, "kl": 0.1064453125, "learning_rate": 4.946236467373392e-06, "loss": 0.0043, "reward": 1.828125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 248 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 470.671875, "epoch": 0.1992, "grad_norm": 0.7739082371087889, "kl": 0.11865234375, "learning_rate": 4.945803592883509e-06, "loss": 0.0047, "reward": 1.78125, "reward_std": 0.17570312321186066, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 249 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 480.828125, "epoch": 0.2, "grad_norm": 0.6262767300134252, "kl": 0.1005859375, "learning_rate": 4.9453690018345144e-06, "loss": 0.004, "reward": 1.8125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 250 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 539.578125, "epoch": 0.2008, "grad_norm": 0.5808817933264756, "kl": 0.09375, "learning_rate": 4.944932694531423e-06, "loss": 0.0037, "reward": 1.8125, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.96875, "step": 251 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 436.484375, "epoch": 0.2016, "grad_norm": 1.002185621612387, "kl": 0.10107421875, "learning_rate": 4.94449467128045e-06, "loss": 0.0041, "reward": 1.859375, "reward_std": 0.26196980476379395, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 252 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 474.703125, "epoch": 0.2024, "grad_norm": 0.7756433117997087, "kl": 0.09912109375, "learning_rate": 4.944054932389018e-06, "loss": 0.004, "reward": 1.6875, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 253 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 477.34375, "epoch": 0.2032, "grad_norm": 0.976580739419774, "kl": 0.10546875, "learning_rate": 4.943613478165753e-06, "loss": 0.0042, "reward": 1.71875, "reward_std": 0.34929442405700684, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 254 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 457.78125, "epoch": 0.204, "grad_norm": 1.0544302690718284, "kl": 0.1142578125, "learning_rate": 4.943170308920484e-06, "loss": 0.0046, "reward": 1.546875, "reward_std": 0.34208837151527405, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.984375, "step": 255 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 471.0625, "epoch": 0.2048, "grad_norm": 0.9717316604564764, "kl": 0.1005859375, "learning_rate": 4.9427254249642445e-06, "loss": 0.004, "reward": 1.515625, "reward_std": 0.3021277189254761, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.96875, "step": 256 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 519.953125, "epoch": 0.2056, "grad_norm": 0.7824221426335534, "kl": 0.1064453125, "learning_rate": 4.942278826609272e-06, "loss": 0.0043, "reward": 1.59375, "reward_std": 0.28883427381515503, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.984375, "step": 257 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 446.75, "epoch": 0.2064, "grad_norm": 0.8664602612014396, "kl": 0.10009765625, "learning_rate": 4.9418305141690045e-06, "loss": 0.004, "reward": 1.671875, "reward_std": 0.23568853735923767, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 258 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 417.265625, "epoch": 0.2072, "grad_norm": 0.5245421561960985, "kl": 0.09716796875, "learning_rate": 4.9413804879580865e-06, "loss": 0.0039, "reward": 1.78125, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 259 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 378.28125, "epoch": 0.208, "grad_norm": 0.5647553609893893, "kl": 0.11376953125, "learning_rate": 4.940928748292363e-06, "loss": 0.0046, "reward": 1.734375, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 260 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 446.234375, "epoch": 0.2088, "grad_norm": 0.6984961037767852, "kl": 0.1025390625, "learning_rate": 4.940475295488882e-06, "loss": 0.0041, "reward": 1.765625, "reward_std": 0.2198973000049591, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.984375, "step": 261 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 484.65625, "epoch": 0.2096, "grad_norm": 0.6968168736839139, "kl": 0.099609375, "learning_rate": 4.940020129865895e-06, "loss": 0.004, "reward": 1.78125, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 262 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 426.328125, "epoch": 0.2104, "grad_norm": 0.9782982162086745, "kl": 0.103515625, "learning_rate": 4.9395632517428546e-06, "loss": 0.0041, "reward": 1.578125, "reward_std": 0.3608325123786926, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.984375, "step": 263 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 398.625, "epoch": 0.2112, "grad_norm": 0.4489442102308692, "kl": 0.09912109375, "learning_rate": 4.939104661440415e-06, "loss": 0.004, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 264 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 414.53125, "epoch": 0.212, "grad_norm": 0.8169469654738788, "kl": 0.1171875, "learning_rate": 4.938644359280433e-06, "loss": 0.0047, "reward": 1.71875, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 265 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 411.359375, "epoch": 0.2128, "grad_norm": 1.2032151789560201, "kl": 0.11181640625, "learning_rate": 4.938182345585967e-06, "loss": 0.0045, "reward": 1.546875, "reward_std": 0.32407689094543457, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 266 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 465.46875, "epoch": 0.2136, "grad_norm": 0.5442903107655314, "kl": 0.10009765625, "learning_rate": 4.937718620681273e-06, "loss": 0.004, "reward": 1.8125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 267 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 418.90625, "epoch": 0.2144, "grad_norm": 0.9270822595209119, "kl": 0.1064453125, "learning_rate": 4.9372531848918145e-06, "loss": 0.0043, "reward": 1.5, "reward_std": 0.2573973536491394, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.96875, "step": 268 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 442.75, "epoch": 0.2152, "grad_norm": 0.5784340447468568, "kl": 0.10498046875, "learning_rate": 4.936786038544251e-06, "loss": 0.0042, "reward": 1.59375, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 269 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 493.53125, "epoch": 0.216, "grad_norm": 0.5053126414260094, "kl": 0.10009765625, "learning_rate": 4.9363171819664434e-06, "loss": 0.004, "reward": 1.953125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 270 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 443.9375, "epoch": 0.2168, "grad_norm": 0.7760369870867544, "kl": 0.10107421875, "learning_rate": 4.9358466154874535e-06, "loss": 0.004, "reward": 1.734375, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 271 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 420.125, "epoch": 0.2176, "grad_norm": 1.14786779748916, "kl": 0.11669921875, "learning_rate": 4.935374339437543e-06, "loss": 0.0047, "reward": 1.5, "reward_std": 0.2540663480758667, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.984375, "step": 272 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 481.609375, "epoch": 0.2184, "grad_norm": 1.0288369680601825, "kl": 0.103515625, "learning_rate": 4.934900354148173e-06, "loss": 0.0041, "reward": 1.8125, "reward_std": 0.2708277702331543, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.984375, "step": 273 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 499.609375, "epoch": 0.2192, "grad_norm": 0.9289192454082107, "kl": 0.0966796875, "learning_rate": 4.934424659952006e-06, "loss": 0.0039, "reward": 1.515625, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.984375, "step": 274 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 468.28125, "epoch": 0.22, "grad_norm": 0.6845760409425516, "kl": 0.09765625, "learning_rate": 4.933947257182901e-06, "loss": 0.0039, "reward": 1.734375, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 275 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 439.25, "epoch": 0.2208, "grad_norm": 0.5347567869972326, "kl": 0.10205078125, "learning_rate": 4.933468146175918e-06, "loss": 0.0041, "reward": 1.65625, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 276 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 444.65625, "epoch": 0.2216, "grad_norm": 0.5296343921953094, "kl": 0.09716796875, "learning_rate": 4.932987327267317e-06, "loss": 0.0039, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 277 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 458.546875, "epoch": 0.2224, "grad_norm": 0.7431199028703543, "kl": 0.171875, "learning_rate": 4.932504800794553e-06, "loss": 0.0069, "reward": 1.671875, "reward_std": 0.26735979318618774, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.984375, "step": 278 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 420.46875, "epoch": 0.2232, "grad_norm": 0.7454584011248475, "kl": 0.10205078125, "learning_rate": 4.9320205670962815e-06, "loss": 0.0041, "reward": 1.828125, "reward_std": 0.17358146607875824, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 279 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 462.09375, "epoch": 0.224, "grad_norm": 0.9510667076624594, "kl": 0.1103515625, "learning_rate": 4.931534626512359e-06, "loss": 0.0044, "reward": 1.578125, "reward_std": 0.2824692726135254, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 280 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 400.28125, "epoch": 0.2248, "grad_norm": 0.7687280891983295, "kl": 0.11083984375, "learning_rate": 4.931046979383836e-06, "loss": 0.0044, "reward": 1.859375, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 281 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 411.84375, "epoch": 0.2256, "grad_norm": 0.7574221662954138, "kl": 0.111328125, "learning_rate": 4.930557626052961e-06, "loss": 0.0045, "reward": 1.65625, "reward_std": 0.1962026059627533, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 282 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 418.75, "epoch": 0.2264, "grad_norm": 0.8783907034453455, "kl": 0.1044921875, "learning_rate": 4.930066566863182e-06, "loss": 0.0042, "reward": 1.78125, "reward_std": 0.2540663480758667, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 283 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 389.96875, "epoch": 0.2272, "grad_norm": 0.9774193184682641, "kl": 0.115234375, "learning_rate": 4.929573802159143e-06, "loss": 0.0046, "reward": 1.640625, "reward_std": 0.34717273712158203, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 284 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 447.40625, "epoch": 0.228, "grad_norm": 1.0777899832373057, "kl": 0.115234375, "learning_rate": 4.929079332286685e-06, "loss": 0.0046, "reward": 1.546875, "reward_std": 0.40822190046310425, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.984375, "step": 285 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 442.828125, "epoch": 0.2288, "grad_norm": 0.9185695394739847, "kl": 0.1298828125, "learning_rate": 4.928583157592846e-06, "loss": 0.0052, "reward": 1.71875, "reward_std": 0.32195520401000977, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 286 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 413.1875, "epoch": 0.2296, "grad_norm": 1.0338072709985884, "kl": 0.10693359375, "learning_rate": 4.928085278425862e-06, "loss": 0.0043, "reward": 1.59375, "reward_std": 0.23356688022613525, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 287 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 461.390625, "epoch": 0.2304, "grad_norm": 0.8021566880229952, "kl": 0.1103515625, "learning_rate": 4.927585695135162e-06, "loss": 0.0044, "reward": 1.6875, "reward_std": 0.19727616012096405, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.984375, "step": 288 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 387.6875, "epoch": 0.2312, "grad_norm": 0.9297237542138639, "kl": 0.1044921875, "learning_rate": 4.9270844080713735e-06, "loss": 0.0042, "reward": 1.8125, "reward_std": 0.25513991713523865, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 289 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 435.71875, "epoch": 0.232, "grad_norm": 0.8541926001316015, "kl": 0.09765625, "learning_rate": 4.926581417586319e-06, "loss": 0.0039, "reward": 1.609375, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 290 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 364.390625, "epoch": 0.2328, "grad_norm": 1.124317503956043, "kl": 0.109375, "learning_rate": 4.926076724033016e-06, "loss": 0.0044, "reward": 1.65625, "reward_std": 0.26409149169921875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 291 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 410.328125, "epoch": 0.2336, "grad_norm": 0.4502816614200057, "kl": 0.11279296875, "learning_rate": 4.925570327765678e-06, "loss": 0.0045, "reward": 1.609375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 292 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 448.171875, "epoch": 0.2344, "grad_norm": 0.45357608231069996, "kl": 0.087890625, "learning_rate": 4.9250622291397144e-06, "loss": 0.0035, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 293 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 437.8125, "epoch": 0.2352, "grad_norm": 0.9337827458070511, "kl": 0.10888671875, "learning_rate": 4.924552428511727e-06, "loss": 0.0044, "reward": 1.703125, "reward_std": 0.24039676785469055, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 294 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 429.875, "epoch": 0.236, "grad_norm": 0.9199067291128734, "kl": 0.11083984375, "learning_rate": 4.924040926239515e-06, "loss": 0.0044, "reward": 1.71875, "reward_std": 0.28247910737991333, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 295 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 423.578125, "epoch": 0.2368, "grad_norm": 1.6806458457179179, "kl": 0.095703125, "learning_rate": 4.92352772268207e-06, "loss": 0.0038, "reward": 1.6875, "reward_std": 0.1825428307056427, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 296 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 426.359375, "epoch": 0.2376, "grad_norm": 0.46539335382214253, "kl": 0.11865234375, "learning_rate": 4.923012818199576e-06, "loss": 0.0047, "reward": 1.890625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 297 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 437.6875, "epoch": 0.2384, "grad_norm": 0.7415496459961164, "kl": 0.10302734375, "learning_rate": 4.922496213153416e-06, "loss": 0.0041, "reward": 1.78125, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 298 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.328125, "epoch": 0.2392, "grad_norm": 0.8651793057898831, "kl": 0.10693359375, "learning_rate": 4.921977907906161e-06, "loss": 0.0043, "reward": 1.75, "reward_std": 0.22201895713806152, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 299 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 455.734375, "epoch": 0.24, "grad_norm": 1.0135729622759693, "kl": 0.1298828125, "learning_rate": 4.921457902821578e-06, "loss": 0.0052, "reward": 1.5, "reward_std": 0.29143065214157104, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.96875, "step": 300 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 414.984375, "epoch": 0.2408, "grad_norm": 0.8119903503196184, "kl": 0.1181640625, "learning_rate": 4.9209361982646275e-06, "loss": 0.0047, "reward": 1.640625, "reward_std": 0.1315089464187622, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 301 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 502.09375, "epoch": 0.2416, "grad_norm": 0.6331152922884491, "kl": 0.10009765625, "learning_rate": 4.920412794601461e-06, "loss": 0.004, "reward": 1.546875, "reward_std": 0.17358146607875824, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 302 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 480.53125, "epoch": 0.2424, "grad_norm": 0.36110170160289956, "kl": 0.10693359375, "learning_rate": 4.919887692199423e-06, "loss": 0.0043, "reward": 1.890625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 303 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 454.59375, "epoch": 0.2432, "grad_norm": 0.650522167585543, "kl": 0.1171875, "learning_rate": 4.9193608914270515e-06, "loss": 0.0047, "reward": 1.546875, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 304 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 478.546875, "epoch": 0.244, "grad_norm": 0.40448868486799067, "kl": 0.1083984375, "learning_rate": 4.918832392654075e-06, "loss": 0.0043, "reward": 1.96875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 305 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 463.84375, "epoch": 0.2448, "grad_norm": 0.38978931826325497, "kl": 0.11962890625, "learning_rate": 4.9183021962514145e-06, "loss": 0.0048, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 306 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 415.375, "epoch": 0.2456, "grad_norm": 1.0986331700929117, "kl": 0.1162109375, "learning_rate": 4.917770302591183e-06, "loss": 0.0046, "reward": 1.59375, "reward_std": 0.3198433816432953, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 307 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 591.125, "epoch": 0.2464, "grad_norm": 1.1125510217790837, "kl": 0.11572265625, "learning_rate": 4.917236712046682e-06, "loss": 0.0046, "reward": 1.828125, "reward_std": 0.21320319175720215, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.96875, "step": 308 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 512.453125, "epoch": 0.2472, "grad_norm": 1.0540625145093168, "kl": 0.10595703125, "learning_rate": 4.9167014249924075e-06, "loss": 0.0042, "reward": 1.578125, "reward_std": 0.2561880052089691, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.984375, "step": 309 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 462.84375, "epoch": 0.248, "grad_norm": 0.8686096717928228, "kl": 0.162109375, "learning_rate": 4.916164441804044e-06, "loss": 0.0065, "reward": 1.640625, "reward_std": 0.2198973000049591, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 310 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 486.40625, "epoch": 0.2488, "grad_norm": 0.7270541313642246, "kl": 0.1103515625, "learning_rate": 4.915625762858467e-06, "loss": 0.0044, "reward": 1.71875, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 311 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 463.75, "epoch": 0.2496, "grad_norm": 1.0498750119950635, "kl": 0.1318359375, "learning_rate": 4.915085388533743e-06, "loss": 0.0053, "reward": 1.609375, "reward_std": 0.31983357667922974, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 312 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 494.828125, "epoch": 0.2504, "grad_norm": 0.7317483564705757, "kl": 0.12353515625, "learning_rate": 4.914543319209126e-06, "loss": 0.0049, "reward": 1.640625, "reward_std": 0.1804211586713791, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 313 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 458.15625, "epoch": 0.2512, "grad_norm": 0.9310724758442198, "kl": 0.12255859375, "learning_rate": 4.913999555265062e-06, "loss": 0.0049, "reward": 1.59375, "reward_std": 0.32805800437927246, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.984375, "step": 314 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 545.65625, "epoch": 0.252, "grad_norm": 18.925354049851833, "kl": 0.0947265625, "learning_rate": 4.913454097083185e-06, "loss": 0.0038, "reward": 1.71875, "reward_std": 0.240030437707901, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 315 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 480.4375, "epoch": 0.2528, "grad_norm": 0.6723579513353247, "kl": 0.10498046875, "learning_rate": 4.912906945046319e-06, "loss": 0.0042, "reward": 1.78125, "reward_std": 0.17570313811302185, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 316 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 542.640625, "epoch": 0.2536, "grad_norm": 0.7046884052855756, "kl": 0.09912109375, "learning_rate": 4.912358099538476e-06, "loss": 0.004, "reward": 1.6875, "reward_std": 0.22461533546447754, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 317 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 506.96875, "epoch": 0.2544, "grad_norm": 0.7943111076995014, "kl": 0.11474609375, "learning_rate": 4.911807560944858e-06, "loss": 0.0046, "reward": 1.453125, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 1.0, "step": 318 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 509.59375, "epoch": 0.2552, "grad_norm": 0.7505085892345437, "kl": 0.09375, "learning_rate": 4.911255329651852e-06, "loss": 0.0038, "reward": 1.796875, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.96875, "step": 319 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 467.328125, "epoch": 0.256, "grad_norm": 1.3011643228506782, "kl": 0.1162109375, "learning_rate": 4.910701406047037e-06, "loss": 0.0047, "reward": 1.578125, "reward_std": 0.41388463973999023, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.984375, "step": 320 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 445.453125, "epoch": 0.2568, "grad_norm": 0.6195217573777856, "kl": 0.111328125, "learning_rate": 4.910145790519177e-06, "loss": 0.0045, "reward": 1.703125, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 321 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 565.296875, "epoch": 0.2576, "grad_norm": 1.4057130532982316, "kl": 0.09326171875, "learning_rate": 4.9095884834582256e-06, "loss": 0.0037, "reward": 1.734375, "reward_std": 0.22636085748672485, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 322 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 506.890625, "epoch": 0.2584, "grad_norm": 0.9573420494635505, "kl": 0.10791015625, "learning_rate": 4.909029485255321e-06, "loss": 0.0043, "reward": 1.859375, "reward_std": 0.2688094973564148, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 323 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 469.40625, "epoch": 0.2592, "grad_norm": 1.0911521485399704, "kl": 0.10546875, "learning_rate": 4.90846879630279e-06, "loss": 0.0042, "reward": 1.578125, "reward_std": 0.2867126166820526, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 324 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 511.453125, "epoch": 0.26, "grad_norm": 0.40648896945998414, "kl": 0.08837890625, "learning_rate": 4.907906416994146e-06, "loss": 0.0035, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 325 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 483.5625, "epoch": 0.2608, "grad_norm": 1.0194468410692472, "kl": 0.10009765625, "learning_rate": 4.907342347724088e-06, "loss": 0.004, "reward": 1.671875, "reward_std": 0.23531240224838257, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.984375, "step": 326 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 423.328125, "epoch": 0.2616, "grad_norm": 0.6452948851174004, "kl": 0.11328125, "learning_rate": 4.906776588888502e-06, "loss": 0.0045, "reward": 1.71875, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 327 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 531.59375, "epoch": 0.2624, "grad_norm": 0.9072527178998807, "kl": 0.08935546875, "learning_rate": 4.906209140884459e-06, "loss": 0.0036, "reward": 1.71875, "reward_std": 0.2130674123764038, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 328 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 496.84375, "epoch": 0.2632, "grad_norm": 1.0696754259727737, "kl": 0.09716796875, "learning_rate": 4.905640004110216e-06, "loss": 0.0039, "reward": 1.515625, "reward_std": 0.255811870098114, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.984375, "step": 329 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 495.578125, "epoch": 0.264, "grad_norm": 0.8335515525216668, "kl": 0.1142578125, "learning_rate": 4.905069178965215e-06, "loss": 0.0046, "reward": 1.5, "reward_std": 0.25513991713523865, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.96875, "step": 330 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 488.734375, "epoch": 0.2648, "grad_norm": 0.9957266368937032, "kl": 0.1064453125, "learning_rate": 4.904496665850083e-06, "loss": 0.0043, "reward": 1.515625, "reward_std": 0.34208837151527405, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.96875, "step": 331 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 453.734375, "epoch": 0.2656, "grad_norm": 1.1223237724051203, "kl": 0.11669921875, "learning_rate": 4.903922465166633e-06, "loss": 0.0047, "reward": 1.59375, "reward_std": 0.35400262475013733, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 332 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 511.5, "epoch": 0.2664, "grad_norm": 1.2579776360335044, "kl": 0.10595703125, "learning_rate": 4.903346577317859e-06, "loss": 0.0042, "reward": 1.546875, "reward_std": 0.34564992785453796, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.96875, "step": 333 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 477.71875, "epoch": 0.2672, "grad_norm": 1.2349668070290505, "kl": 0.125, "learning_rate": 4.902769002707942e-06, "loss": 0.005, "reward": 1.8125, "reward_std": 0.3039487302303314, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.96875, "step": 334 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 450.390625, "epoch": 0.268, "grad_norm": 0.8960974727205545, "kl": 0.12109375, "learning_rate": 4.902189741742247e-06, "loss": 0.0049, "reward": 1.6875, "reward_std": 0.3139738440513611, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.953125, "step": 335 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 514.34375, "epoch": 0.2688, "grad_norm": 0.706708681387227, "kl": 0.11865234375, "learning_rate": 4.901608794827321e-06, "loss": 0.0047, "reward": 1.6875, "reward_std": 0.2407432198524475, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.96875, "step": 336 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 468.75, "epoch": 0.2696, "grad_norm": 1.5034128585970563, "kl": 0.1259765625, "learning_rate": 4.9010261623708945e-06, "loss": 0.005, "reward": 1.90625, "reward_std": 0.15769661962985992, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.96875, "step": 337 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 469.15625, "epoch": 0.2704, "grad_norm": 0.9957358517939294, "kl": 0.28515625, "learning_rate": 4.900441844781882e-06, "loss": 0.0114, "reward": 1.53125, "reward_std": 0.27281248569488525, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.921875, "step": 338 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 442.921875, "epoch": 0.2712, "grad_norm": 1.03838448773179, "kl": 0.126953125, "learning_rate": 4.89985584247038e-06, "loss": 0.0051, "reward": 1.546875, "reward_std": 0.26977968215942383, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.96875, "step": 339 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 463.515625, "epoch": 0.272, "grad_norm": 1.1184747989813613, "kl": 0.232421875, "learning_rate": 4.899268155847667e-06, "loss": 0.0093, "reward": 1.59375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.953125, "step": 340 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 452.140625, "epoch": 0.2728, "grad_norm": 1.446286599289767, "kl": 0.20703125, "learning_rate": 4.898678785326205e-06, "loss": 0.0083, "reward": 1.609375, "reward_std": 0.22097086906433105, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.96875, "step": 341 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 406.5, "epoch": 0.2736, "grad_norm": 1.0667490276319753, "kl": 0.1337890625, "learning_rate": 4.898087731319637e-06, "loss": 0.0053, "reward": 1.6875, "reward_std": 0.2346404492855072, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.96875, "step": 342 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 414.25, "epoch": 0.2744, "grad_norm": 1.8075013197728957, "kl": 0.56640625, "learning_rate": 4.8974949942427854e-06, "loss": 0.0226, "reward": 1.3125, "reward_std": 0.7984259128570557, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.703125, "step": 343 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 409.453125, "epoch": 0.2752, "grad_norm": 2.0612645399792253, "kl": 0.78125, "learning_rate": 4.896900574511657e-06, "loss": 0.0313, "reward": 1.1875, "reward_std": 0.6475251913070679, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.734375, "step": 344 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 351.328125, "epoch": 0.276, "grad_norm": 5.117351232455169, "kl": 0.54296875, "learning_rate": 4.89630447254344e-06, "loss": 0.0217, "reward": 1.25, "reward_std": 0.5729714632034302, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.765625, "step": 345 }, { "all_correct": 0.0, "all_wrong": 1.0, "completion_length": 247.0, "epoch": 0.2768, "grad_norm": 24.876604888296548, "kl": 6.625, "learning_rate": 4.8957066887565005e-06, "loss": 0.2649, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 346 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 640.671875, "epoch": 0.2776, "grad_norm": 201.1175403256699, "kl": 4.625, "learning_rate": 4.895107223570386e-06, "loss": 0.1846, "reward": 0.4375, "reward_std": 0.6013320684432983, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.1875, "step": 347 }, { "all_correct": 0.0, "all_wrong": 0.125, "completion_length": 353.6875, "epoch": 0.2784, "grad_norm": 33.61976097623809, "kl": 2.4375, "learning_rate": 4.894506077405824e-06, "loss": 0.0977, "reward": 0.859375, "reward_std": 0.8206709027290344, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.546875, "step": 348 }, { "all_correct": 0.0, "all_wrong": 0.875, "completion_length": 451.484375, "epoch": 0.2792, "grad_norm": 626.7174881514043, "kl": 5.09375, "learning_rate": 4.893903250684723e-06, "loss": 0.2029, "reward": 0.171875, "reward_std": 0.3107786178588867, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.03125, "step": 349 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 332.546875, "epoch": 0.28, "grad_norm": 28.91852693203601, "kl": 0.498046875, "learning_rate": 4.893298743830168e-06, "loss": 0.0199, "reward": 1.640625, "reward_std": 0.3841980993747711, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.90625, "step": 350 }, { "all_correct": 0.0, "all_wrong": 0.125, "completion_length": 343.0625, "epoch": 0.2808, "grad_norm": 7.171144824263768, "kl": 1.40625, "learning_rate": 4.892692557266429e-06, "loss": 0.0561, "reward": 1.1875, "reward_std": 0.7015436887741089, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.65625, "step": 351 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 255.765625, "epoch": 0.2816, "grad_norm": 7.446387142941203, "kl": 2.46875, "learning_rate": 4.8920846914189465e-06, "loss": 0.0987, "reward": 1.0, "reward_std": 1.0010052919387817, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.484375, "step": 352 }, { "all_correct": 0.0, "all_wrong": 0.125, "completion_length": 279.5, "epoch": 0.2824, "grad_norm": 7.482555980879127, "kl": 2.46875, "learning_rate": 4.891475146714348e-06, "loss": 0.0987, "reward": 1.0, "reward_std": 0.9171967506408691, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.546875, "step": 353 }, { "all_correct": 0.0, "all_wrong": 0.125, "completion_length": 314.140625, "epoch": 0.2832, "grad_norm": 186.2241805566282, "kl": 4.84375, "learning_rate": 4.8908639235804324e-06, "loss": 0.1935, "reward": 0.625, "reward_std": 0.7367569804191589, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.34375, "step": 354 }, { "all_correct": 0.0, "all_wrong": 1.0, "completion_length": 531.34375, "epoch": 0.284, "grad_norm": 2163.964903161328, "kl": 58.0, "learning_rate": 4.890251022446181e-06, "loss": 2.3272, "reward": 0.09375, "reward_std": 0.2346404492855072, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.046875, "step": 355 }, { "all_correct": 0.0, "all_wrong": 1.0, "completion_length": 599.875, "epoch": 0.2848, "grad_norm": 30.636793031414754, "kl": 0.79296875, "learning_rate": 4.889636443741752e-06, "loss": 0.0317, "reward": 0.109375, "reward_std": 0.19939783215522766, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.03125, "step": 356 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 208.96875, "epoch": 0.2856, "grad_norm": 5.97530784882832, "kl": 2.375, "learning_rate": 4.88902018789848e-06, "loss": 0.0949, "reward": 0.640625, "reward_std": 0.7411497831344604, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.28125, "step": 357 }, { "all_correct": 0.0, "all_wrong": 0.75, "completion_length": 176.25, "epoch": 0.2864, "grad_norm": 19.34351483851466, "kl": 7.4375, "learning_rate": 4.888402255348877e-06, "loss": 0.297, "reward": 0.234375, "reward_std": 0.4060066342353821, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.109375, "step": 358 }, { "all_correct": 0.0, "all_wrong": 0.375, "completion_length": 304.015625, "epoch": 0.2872, "grad_norm": 12.216881362937261, "kl": 2.375, "learning_rate": 4.887782646526631e-06, "loss": 0.0951, "reward": 0.90625, "reward_std": 0.5779029130935669, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.34375, "step": 359 }, { "all_correct": 0.0, "all_wrong": 0.75, "completion_length": 293.453125, "epoch": 0.288, "grad_norm": 1.8055399813270507, "kl": 0.435546875, "learning_rate": 4.887161361866608e-06, "loss": 0.0175, "reward": 0.546875, "reward_std": 0.4071483612060547, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.140625, "step": 360 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 294.875, "epoch": 0.2888, "grad_norm": 2.2464320445436123, "kl": 0.2353515625, "learning_rate": 4.8865384018048494e-06, "loss": 0.0094, "reward": 1.046875, "reward_std": 0.6361694931983948, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.4375, "step": 361 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 277.6875, "epoch": 0.2896, "grad_norm": 1.5168721375251013, "kl": 0.2001953125, "learning_rate": 4.8859137667785735e-06, "loss": 0.008, "reward": 1.390625, "reward_std": 0.39560043811798096, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.890625, "step": 362 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 369.484375, "epoch": 0.2904, "grad_norm": 2.9135374435837598, "kl": 0.1845703125, "learning_rate": 4.8852874572261715e-06, "loss": 0.0074, "reward": 1.4375, "reward_std": 0.4606751799583435, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.671875, "step": 363 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 317.03125, "epoch": 0.2912, "grad_norm": 1.6946308584406236, "kl": 0.1884765625, "learning_rate": 4.884659473587213e-06, "loss": 0.0075, "reward": 1.515625, "reward_std": 0.4728219509124756, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.78125, "step": 364 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 409.3125, "epoch": 0.292, "grad_norm": 1.9890104705809388, "kl": 0.205078125, "learning_rate": 4.884029816302441e-06, "loss": 0.0082, "reward": 1.546875, "reward_std": 0.4024401307106018, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.78125, "step": 365 }, { "all_correct": 0.0, "all_wrong": 0.375, "completion_length": 447.046875, "epoch": 0.2928, "grad_norm": 3.4786297634905567, "kl": 0.302734375, "learning_rate": 4.883398485813772e-06, "loss": 0.0121, "reward": 1.15625, "reward_std": 0.4918765723705292, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.59375, "step": 366 }, { "all_correct": 0.0, "all_wrong": 0.125, "completion_length": 599.75, "epoch": 0.2936, "grad_norm": 16.311891900467852, "kl": 0.89453125, "learning_rate": 4.8827654825642984e-06, "loss": 0.0358, "reward": 1.078125, "reward_std": 0.6761397123336792, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.421875, "step": 367 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 553.78125, "epoch": 0.2944, "grad_norm": 11.37452807904692, "kl": 1.8125, "learning_rate": 4.882130806998287e-06, "loss": 0.0726, "reward": 0.984375, "reward_std": 0.7113250494003296, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.484375, "step": 368 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 397.421875, "epoch": 0.2952, "grad_norm": 8.426860834578568, "kl": 1.640625, "learning_rate": 4.881494459561177e-06, "loss": 0.0656, "reward": 1.21875, "reward_std": 0.7671608924865723, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.78125, "step": 369 }, { "all_correct": 0.0, "all_wrong": 0.125, "completion_length": 386.421875, "epoch": 0.296, "grad_norm": 5.772641810637551, "kl": 4.125, "learning_rate": 4.880856440699582e-06, "loss": 0.1649, "reward": 1.375, "reward_std": 0.708383321762085, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.8125, "step": 370 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 293.734375, "epoch": 0.2968, "grad_norm": 22.04847857153818, "kl": 11.5, "learning_rate": 4.880216750861288e-06, "loss": 0.4595, "reward": 1.078125, "reward_std": 0.9076728820800781, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.640625, "step": 371 }, { "all_correct": 0.0, "all_wrong": 0.125, "completion_length": 305.09375, "epoch": 0.2976, "grad_norm": 16.139049061859996, "kl": 12.875, "learning_rate": 4.879575390495254e-06, "loss": 0.5157, "reward": 1.140625, "reward_std": 0.8692222833633423, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.65625, "step": 372 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 439.140625, "epoch": 0.2984, "grad_norm": 5.483376733053643, "kl": 4.28125, "learning_rate": 4.878932360051611e-06, "loss": 0.1714, "reward": 1.734375, "reward_std": 0.5622838735580444, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.890625, "step": 373 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 385.96875, "epoch": 0.2992, "grad_norm": 22.744631135693638, "kl": 17.125, "learning_rate": 4.878287659981663e-06, "loss": 0.6852, "reward": 0.984375, "reward_std": 0.9501945972442627, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.53125, "step": 374 }, { "all_correct": 0.0, "all_wrong": 0.125, "completion_length": 631.65625, "epoch": 0.3, "grad_norm": 10.663644117055918, "kl": 5.25, "learning_rate": 4.8776412907378845e-06, "loss": 0.2108, "reward": 1.25, "reward_std": 0.7919073700904846, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.734375, "step": 375 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 591.59375, "epoch": 0.3008, "grad_norm": 5.836849178969912, "kl": 7.875, "learning_rate": 4.876993252773923e-06, "loss": 0.315, "reward": 1.078125, "reward_std": 0.5978821516036987, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.65625, "step": 376 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 640.234375, "epoch": 0.3016, "grad_norm": 4.0425819468893645, "kl": 5.3125, "learning_rate": 4.876343546544596e-06, "loss": 0.2119, "reward": 1.25, "reward_std": 0.662531316280365, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.734375, "step": 377 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 606.0, "epoch": 0.3024, "grad_norm": 4.353961170477641, "kl": 5.625, "learning_rate": 4.8756921725058935e-06, "loss": 0.2251, "reward": 1.40625, "reward_std": 0.7833096981048584, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.765625, "step": 378 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 520.671875, "epoch": 0.3032, "grad_norm": 3.907345388335812, "kl": 7.9375, "learning_rate": 4.875039131114975e-06, "loss": 0.3178, "reward": 1.46875, "reward_std": 0.7728353142738342, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.796875, "step": 379 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 609.09375, "epoch": 0.304, "grad_norm": 3.0197647012699838, "kl": 7.0625, "learning_rate": 4.8743844228301676e-06, "loss": 0.282, "reward": 1.34375, "reward_std": 0.7501465082168579, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.8125, "step": 380 }, { "all_correct": 0.0, "all_wrong": 0.125, "completion_length": 630.890625, "epoch": 0.3048, "grad_norm": 3.328362838541217, "kl": 4.75, "learning_rate": 4.873728048110973e-06, "loss": 0.1908, "reward": 1.34375, "reward_std": 0.669731855392456, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.828125, "step": 381 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 553.75, "epoch": 0.3056, "grad_norm": 4.545235460323257, "kl": 6.78125, "learning_rate": 4.873070007418059e-06, "loss": 0.2708, "reward": 1.296875, "reward_std": 0.7942386269569397, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.78125, "step": 382 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 543.34375, "epoch": 0.3064, "grad_norm": 11.300499138026463, "kl": 2.296875, "learning_rate": 4.872410301213265e-06, "loss": 0.092, "reward": 1.5, "reward_std": 0.5112500190734863, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.859375, "step": 383 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 479.453125, "epoch": 0.3072, "grad_norm": 2.5224974439131205, "kl": 1.7421875, "learning_rate": 4.871748929959598e-06, "loss": 0.0695, "reward": 1.546875, "reward_std": 0.30617380142211914, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.9375, "step": 384 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 522.390625, "epoch": 0.308, "grad_norm": 8.861165553607586, "kl": 2.40625, "learning_rate": 4.871085894121234e-06, "loss": 0.096, "reward": 1.515625, "reward_std": 0.37378886342048645, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.90625, "step": 385 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 415.375, "epoch": 0.3088, "grad_norm": 156.19365589168777, "kl": 6.8125, "learning_rate": 4.870421194163515e-06, "loss": 0.2727, "reward": 1.578125, "reward_std": 0.3775939345359802, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.96875, "step": 386 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 406.75, "epoch": 0.3096, "grad_norm": 27.153773633031346, "kl": 0.345703125, "learning_rate": 4.869754830552956e-06, "loss": 0.0138, "reward": 1.796875, "reward_std": 0.22097086906433105, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.96875, "step": 387 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 408.453125, "epoch": 0.3104, "grad_norm": 1030.1027037042702, "kl": 111.5, "learning_rate": 4.869086803757235e-06, "loss": 4.4696, "reward": 1.46875, "reward_std": 0.34669801592826843, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.9375, "step": 388 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 450.984375, "epoch": 0.3112, "grad_norm": 7.507009527485788, "kl": 0.34375, "learning_rate": 4.868417114245199e-06, "loss": 0.0137, "reward": 1.75, "reward_std": 0.25513991713523865, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.9375, "step": 389 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 479.765625, "epoch": 0.312, "grad_norm": 0.4155073762935163, "kl": 0.216796875, "learning_rate": 4.867745762486862e-06, "loss": 0.0087, "reward": 1.703125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 390 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 454.40625, "epoch": 0.3128, "grad_norm": 17.285174184609716, "kl": 0.6171875, "learning_rate": 4.8670727489534035e-06, "loss": 0.0247, "reward": 1.609375, "reward_std": 0.40701988339424133, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.90625, "step": 391 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 393.25, "epoch": 0.3136, "grad_norm": 876.5609922977326, "kl": 134.0, "learning_rate": 4.866398074117173e-06, "loss": 5.3539, "reward": 1.515625, "reward_std": 0.3132994771003723, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.9375, "step": 392 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 506.859375, "epoch": 0.3144, "grad_norm": 0.8228708456368266, "kl": 0.248046875, "learning_rate": 4.86572173845168e-06, "loss": 0.0099, "reward": 1.484375, "reward_std": 0.35352790355682373, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.984375, "step": 393 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 419.296875, "epoch": 0.3152, "grad_norm": 35.19059370326572, "kl": 0.306640625, "learning_rate": 4.865043742431605e-06, "loss": 0.0122, "reward": 1.671875, "reward_std": 0.39809340238571167, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.9375, "step": 394 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 371.359375, "epoch": 0.316, "grad_norm": 0.8201978988422712, "kl": 0.228515625, "learning_rate": 4.864364086532792e-06, "loss": 0.0091, "reward": 1.640625, "reward_std": 0.22707363963127136, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 395 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 454.890625, "epoch": 0.3168, "grad_norm": 111.8832716959935, "kl": 5.0, "learning_rate": 4.863682771232249e-06, "loss": 0.1995, "reward": 1.453125, "reward_std": 0.5471649169921875, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.875, "step": 396 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 401.109375, "epoch": 0.3176, "grad_norm": 0.5286369705491609, "kl": 0.197265625, "learning_rate": 4.862999797008149e-06, "loss": 0.0079, "reward": 1.75, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 397 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 479.84375, "epoch": 0.3184, "grad_norm": 22.95638431876077, "kl": 1.453125, "learning_rate": 4.862315164339829e-06, "loss": 0.0581, "reward": 1.734375, "reward_std": 0.38578349351882935, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.90625, "step": 398 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 466.390625, "epoch": 0.3192, "grad_norm": 4.65125203900269, "kl": 1.6640625, "learning_rate": 4.861628873707792e-06, "loss": 0.0664, "reward": 1.46875, "reward_std": 0.5292564630508423, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.890625, "step": 399 }, { "all_correct": 0.25, "all_wrong": 0.625, "completion_length": 393.796875, "epoch": 0.32, "grad_norm": 1.085220273805104, "kl": 0.2265625, "learning_rate": 4.860940925593703e-06, "loss": 0.009, "reward": 1.34375, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 1.0, "step": 400 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 470.21875, "epoch": 0.3208, "grad_norm": 0.6245969728254261, "kl": 0.1767578125, "learning_rate": 4.86025132048039e-06, "loss": 0.0071, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 401 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 455.5, "epoch": 0.3216, "grad_norm": 0.7126066128293126, "kl": 0.201171875, "learning_rate": 4.859560058851844e-06, "loss": 0.0081, "reward": 1.75, "reward_std": 0.2177756428718567, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.984375, "step": 402 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.4375, "epoch": 0.3224, "grad_norm": 0.761201915189821, "kl": 0.201171875, "learning_rate": 4.8588671411932195e-06, "loss": 0.008, "reward": 1.671875, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 403 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 447.75, "epoch": 0.3232, "grad_norm": 1.1718892639708978, "kl": 0.1865234375, "learning_rate": 4.858172567990832e-06, "loss": 0.0075, "reward": 1.734375, "reward_std": 0.29788440465927124, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 404 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 445.3125, "epoch": 0.324, "grad_norm": 0.8366164839673635, "kl": 0.185546875, "learning_rate": 4.857476339732162e-06, "loss": 0.0074, "reward": 1.78125, "reward_std": 0.2756393849849701, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 405 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 398.6875, "epoch": 0.3248, "grad_norm": 0.6147289990534973, "kl": 0.1923828125, "learning_rate": 4.856778456905846e-06, "loss": 0.0077, "reward": 1.6875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 406 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 388.484375, "epoch": 0.3256, "grad_norm": 0.7855381422211676, "kl": 0.21875, "learning_rate": 4.856078920001689e-06, "loss": 0.0087, "reward": 1.421875, "reward_std": 0.22636085748672485, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.984375, "step": 407 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 417.15625, "epoch": 0.3264, "grad_norm": 2.275516154603678, "kl": 2.3125, "learning_rate": 4.855377729510648e-06, "loss": 0.0929, "reward": 1.40625, "reward_std": 0.2924008071422577, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.953125, "step": 408 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 406.921875, "epoch": 0.3272, "grad_norm": 9.369765022864346, "kl": 1.4921875, "learning_rate": 4.8546748859248504e-06, "loss": 0.06, "reward": 1.609375, "reward_std": 0.2993340790271759, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.96875, "step": 409 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 401.140625, "epoch": 0.328, "grad_norm": 1.4444623821966434, "kl": 2.265625, "learning_rate": 4.853970389737576e-06, "loss": 0.0904, "reward": 1.515625, "reward_std": 0.33351296186447144, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.953125, "step": 410 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 429.09375, "epoch": 0.3288, "grad_norm": 2.549204946324485, "kl": 3.0, "learning_rate": 4.8532642414432675e-06, "loss": 0.1202, "reward": 1.609375, "reward_std": 0.4376729428768158, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.921875, "step": 411 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 417.71875, "epoch": 0.3296, "grad_norm": 2.656722449352898, "kl": 3.34375, "learning_rate": 4.852556441537528e-06, "loss": 0.1337, "reward": 1.25, "reward_std": 0.5111948251724243, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.90625, "step": 412 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 423.28125, "epoch": 0.3304, "grad_norm": 0.9276053292737674, "kl": 1.3359375, "learning_rate": 4.851846990517118e-06, "loss": 0.0534, "reward": 1.65625, "reward_std": 0.4088938534259796, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.96875, "step": 413 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 475.5, "epoch": 0.3312, "grad_norm": 1.2562446558868838, "kl": 1.5, "learning_rate": 4.851135888879958e-06, "loss": 0.0601, "reward": 1.71875, "reward_std": 0.2709311842918396, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.953125, "step": 414 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 457.390625, "epoch": 0.332, "grad_norm": 1.5340077204292817, "kl": 1.0, "learning_rate": 4.850423137125126e-06, "loss": 0.0399, "reward": 1.78125, "reward_std": 0.3061639666557312, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.96875, "step": 415 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 434.6875, "epoch": 0.3328, "grad_norm": 0.9409909612683872, "kl": 0.5859375, "learning_rate": 4.8497087357528585e-06, "loss": 0.0234, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 0.984375, "step": 416 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 425.046875, "epoch": 0.3336, "grad_norm": 0.545076811233695, "kl": 0.1669921875, "learning_rate": 4.8489926852645505e-06, "loss": 0.0067, "reward": 1.703125, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 417 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 409.515625, "epoch": 0.3344, "grad_norm": 0.9556631606557033, "kl": 0.796875, "learning_rate": 4.848274986162754e-06, "loss": 0.0318, "reward": 1.546875, "reward_std": 0.28778618574142456, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.96875, "step": 418 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 420.34375, "epoch": 0.3352, "grad_norm": 0.7867848801233414, "kl": 0.36328125, "learning_rate": 4.847555638951177e-06, "loss": 0.0145, "reward": 1.828125, "reward_std": 0.2662131190299988, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.984375, "step": 419 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 368.890625, "epoch": 0.336, "grad_norm": 0.5755910960305857, "kl": 0.1533203125, "learning_rate": 4.846834644134686e-06, "loss": 0.0061, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 420 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 396.03125, "epoch": 0.3368, "grad_norm": 0.5315593964989055, "kl": 0.1552734375, "learning_rate": 4.846112002219301e-06, "loss": 0.0062, "reward": 1.53125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 421 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 423.734375, "epoch": 0.3376, "grad_norm": 0.545127285092993, "kl": 0.201171875, "learning_rate": 4.845387713712203e-06, "loss": 0.0081, "reward": 1.890625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 422 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 364.78125, "epoch": 0.3384, "grad_norm": 1.312387736907392, "kl": 0.1533203125, "learning_rate": 4.844661779121723e-06, "loss": 0.0061, "reward": 1.546875, "reward_std": 0.28930896520614624, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 423 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 400.71875, "epoch": 0.3392, "grad_norm": 2.1562952500677324, "kl": 0.8515625, "learning_rate": 4.843934198957351e-06, "loss": 0.034, "reward": 1.578125, "reward_std": 0.2805546522140503, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.984375, "step": 424 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 362.40625, "epoch": 0.34, "grad_norm": 3.3751659314634805, "kl": 0.171875, "learning_rate": 4.84320497372973e-06, "loss": 0.0069, "reward": 1.625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 425 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.171875, "epoch": 0.3408, "grad_norm": 0.8741894143777079, "kl": 0.13671875, "learning_rate": 4.842474103950658e-06, "loss": 0.0055, "reward": 1.921875, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 426 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 396.1875, "epoch": 0.3416, "grad_norm": 0.8174260769621657, "kl": 0.494140625, "learning_rate": 4.841741590133089e-06, "loss": 0.0197, "reward": 1.59375, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.984375, "step": 427 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 369.109375, "epoch": 0.3424, "grad_norm": 0.5209696523338851, "kl": 0.1396484375, "learning_rate": 4.841007432791129e-06, "loss": 0.0056, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 428 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 428.875, "epoch": 0.3432, "grad_norm": 0.8153419848070561, "kl": 0.169921875, "learning_rate": 4.8402716324400375e-06, "loss": 0.0068, "reward": 1.8125, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.984375, "step": 429 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 415.6875, "epoch": 0.344, "grad_norm": 1.0666154124765679, "kl": 0.78125, "learning_rate": 4.839534189596228e-06, "loss": 0.0311, "reward": 1.59375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.96875, "step": 430 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 395.859375, "epoch": 0.3448, "grad_norm": 0.44393856411551624, "kl": 0.169921875, "learning_rate": 4.8387951047772656e-06, "loss": 0.0068, "reward": 1.828125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 431 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 419.28125, "epoch": 0.3456, "grad_norm": 1.1161358537344634, "kl": 2.203125, "learning_rate": 4.838054378501868e-06, "loss": 0.0883, "reward": 1.890625, "reward_std": 0.20873048901557922, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.953125, "step": 432 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 376.890625, "epoch": 0.3464, "grad_norm": 0.4075823163043336, "kl": 0.1435546875, "learning_rate": 4.837312011289907e-06, "loss": 0.0057, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 433 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 359.421875, "epoch": 0.3472, "grad_norm": 1.0816196211554479, "kl": 0.1474609375, "learning_rate": 4.836568003662403e-06, "loss": 0.0059, "reward": 1.609375, "reward_std": 0.1804211586713791, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 434 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 420.015625, "epoch": 0.348, "grad_norm": 0.7898993144441228, "kl": 1.5, "learning_rate": 4.8358223561415304e-06, "loss": 0.06, "reward": 1.765625, "reward_std": 0.20290403068065643, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.953125, "step": 435 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 407.921875, "epoch": 0.3488, "grad_norm": 1.578568176968985, "kl": 0.130859375, "learning_rate": 4.835075069250613e-06, "loss": 0.0053, "reward": 1.640625, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 436 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.953125, "epoch": 0.3496, "grad_norm": 0.8569826158727925, "kl": 0.1328125, "learning_rate": 4.8343261435141245e-06, "loss": 0.0053, "reward": 1.78125, "reward_std": 0.23356688022613525, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 437 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 381.625, "epoch": 0.3504, "grad_norm": 0.5041244227154292, "kl": 0.203125, "learning_rate": 4.833575579457691e-06, "loss": 0.0081, "reward": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 438 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 439.984375, "epoch": 0.3512, "grad_norm": 0.544866660057303, "kl": 0.1357421875, "learning_rate": 4.832823377608088e-06, "loss": 0.0054, "reward": 1.4375, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 439 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.140625, "epoch": 0.352, "grad_norm": 1.0531205514657995, "kl": 0.1396484375, "learning_rate": 4.832069538493237e-06, "loss": 0.0056, "reward": 1.78125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 440 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 425.703125, "epoch": 0.3528, "grad_norm": 0.6199752182002176, "kl": 0.2041015625, "learning_rate": 4.831314062642213e-06, "loss": 0.0082, "reward": 1.640625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 441 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 453.0625, "epoch": 0.3536, "grad_norm": 1.4048917434449617, "kl": 0.2890625, "learning_rate": 4.830556950585239e-06, "loss": 0.0116, "reward": 1.65625, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.984375, "step": 442 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 441.734375, "epoch": 0.3544, "grad_norm": 0.4946040299292862, "kl": 0.328125, "learning_rate": 4.829798202853683e-06, "loss": 0.0131, "reward": 1.75, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.984375, "step": 443 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 432.59375, "epoch": 0.3552, "grad_norm": 0.7484184221012916, "kl": 0.365234375, "learning_rate": 4.829037819980065e-06, "loss": 0.0146, "reward": 1.546875, "reward_std": 0.2198973000049591, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.984375, "step": 444 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 372.46875, "epoch": 0.356, "grad_norm": 0.6506183134020523, "kl": 0.1669921875, "learning_rate": 4.828275802498051e-06, "loss": 0.0067, "reward": 1.625, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 445 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 430.9375, "epoch": 0.3568, "grad_norm": 0.9208137570875184, "kl": 0.138671875, "learning_rate": 4.827512150942454e-06, "loss": 0.0056, "reward": 1.609375, "reward_std": 0.16887323558330536, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 446 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 428.296875, "epoch": 0.3576, "grad_norm": 1.2098713467531075, "kl": 0.4921875, "learning_rate": 4.8267468658492335e-06, "loss": 0.0197, "reward": 1.5, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.984375, "step": 447 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 467.296875, "epoch": 0.3584, "grad_norm": 0.7150741939065385, "kl": 0.1220703125, "learning_rate": 4.825979947755496e-06, "loss": 0.0049, "reward": 1.84375, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 448 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 421.4375, "epoch": 0.3592, "grad_norm": 1.1223282700977706, "kl": 0.1416015625, "learning_rate": 4.8252113971994955e-06, "loss": 0.0057, "reward": 1.765625, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 449 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 383.265625, "epoch": 0.36, "grad_norm": 0.5208765265104575, "kl": 0.515625, "learning_rate": 4.824441214720629e-06, "loss": 0.0207, "reward": 1.703125, "reward_std": 0.13868528604507446, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.984375, "step": 450 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 479.421875, "epoch": 0.3608, "grad_norm": 1.1245698395617494, "kl": 0.85546875, "learning_rate": 4.823669400859441e-06, "loss": 0.0342, "reward": 1.71875, "reward_std": 0.3618124723434448, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.96875, "step": 451 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 496.6875, "epoch": 0.3616, "grad_norm": 0.7035661119729969, "kl": 0.357421875, "learning_rate": 4.8228959561576195e-06, "loss": 0.0143, "reward": 1.578125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.984375, "step": 452 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 409.140625, "epoch": 0.3624, "grad_norm": 0.8392469138583998, "kl": 1.3125, "learning_rate": 4.822120881157998e-06, "loss": 0.0526, "reward": 1.6875, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.953125, "step": 453 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 478.75, "epoch": 0.3632, "grad_norm": 1.173091688779587, "kl": 0.93359375, "learning_rate": 4.821344176404554e-06, "loss": 0.0374, "reward": 1.421875, "reward_std": 0.2867125868797302, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.953125, "step": 454 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 397.53125, "epoch": 0.364, "grad_norm": 1.002096287669717, "kl": 0.416015625, "learning_rate": 4.820565842442408e-06, "loss": 0.0167, "reward": 1.703125, "reward_std": 0.26196980476379395, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.984375, "step": 455 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 510.15625, "epoch": 0.3648, "grad_norm": 0.9178416021368806, "kl": 0.58984375, "learning_rate": 4.819785879817827e-06, "loss": 0.0236, "reward": 1.34375, "reward_std": 0.35786980390548706, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.9375, "step": 456 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 490.609375, "epoch": 0.3656, "grad_norm": 2.3311408484899823, "kl": 1.8046875, "learning_rate": 4.819004289078217e-06, "loss": 0.072, "reward": 1.546875, "reward_std": 0.3183838725090027, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.890625, "step": 457 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 534.90625, "epoch": 0.3664, "grad_norm": 1.4680518444754158, "kl": 1.4609375, "learning_rate": 4.818221070772129e-06, "loss": 0.0584, "reward": 1.40625, "reward_std": 0.292504221200943, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.9375, "step": 458 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 485.171875, "epoch": 0.3672, "grad_norm": 1.0154822113166795, "kl": 0.39453125, "learning_rate": 4.8174362254492555e-06, "loss": 0.0159, "reward": 1.8125, "reward_std": 0.1825428307056427, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.984375, "step": 459 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 447.453125, "epoch": 0.368, "grad_norm": 1.6372782134672146, "kl": 1.78125, "learning_rate": 4.816649753660431e-06, "loss": 0.0712, "reward": 1.5, "reward_std": 0.5082703232765198, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.921875, "step": 460 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 475.1875, "epoch": 0.3688, "grad_norm": 1.5660653871347994, "kl": 0.8828125, "learning_rate": 4.815861655957632e-06, "loss": 0.0353, "reward": 1.671875, "reward_std": 0.3624844253063202, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.96875, "step": 461 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 489.21875, "epoch": 0.3696, "grad_norm": 0.9808014696511161, "kl": 1.6484375, "learning_rate": 4.815071932893976e-06, "loss": 0.0661, "reward": 1.65625, "reward_std": 0.3139738440513611, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.953125, "step": 462 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 444.0625, "epoch": 0.3704, "grad_norm": 0.7216301646582304, "kl": 1.2578125, "learning_rate": 4.81428058502372e-06, "loss": 0.0504, "reward": 1.84375, "reward_std": 0.3049619495868683, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 0.953125, "step": 463 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 432.796875, "epoch": 0.3712, "grad_norm": 0.8390030730948433, "kl": 0.1484375, "learning_rate": 4.813487612902265e-06, "loss": 0.0059, "reward": 1.671875, "reward_std": 0.1893727034330368, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 464 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 476.640625, "epoch": 0.372, "grad_norm": 0.5245197613735183, "kl": 0.1748046875, "learning_rate": 4.812693017086145e-06, "loss": 0.007, "reward": 1.53125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 465 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 448.40625, "epoch": 0.3728, "grad_norm": 1.5094922067340875, "kl": 1.078125, "learning_rate": 4.811896798133042e-06, "loss": 0.043, "reward": 1.71875, "reward_std": 0.23356688022613525, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.96875, "step": 466 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 519.328125, "epoch": 0.3736, "grad_norm": 0.9716355528132773, "kl": 2.328125, "learning_rate": 4.811098956601772e-06, "loss": 0.0934, "reward": 1.671875, "reward_std": 0.44749903678894043, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.890625, "step": 467 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 397.421875, "epoch": 0.3744, "grad_norm": 1.1380947906135268, "kl": 0.341796875, "learning_rate": 4.810299493052289e-06, "loss": 0.0137, "reward": 1.46875, "reward_std": 0.2756394147872925, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 468 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 388.015625, "epoch": 0.3752, "grad_norm": 1.6899780677744536, "kl": 0.56640625, "learning_rate": 4.809498408045691e-06, "loss": 0.0227, "reward": 1.90625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 469 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 359.234375, "epoch": 0.376, "grad_norm": 0.6075789265258572, "kl": 0.3515625, "learning_rate": 4.808695702144206e-06, "loss": 0.0141, "reward": 1.515625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 1.0, "step": 470 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 345.421875, "epoch": 0.3768, "grad_norm": 0.8841111221529462, "kl": 0.1650390625, "learning_rate": 4.807891375911207e-06, "loss": 0.0066, "reward": 1.859375, "reward_std": 0.19044628739356995, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.984375, "step": 471 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 407.25, "epoch": 0.3776, "grad_norm": 5.8807950669550415, "kl": 0.5625, "learning_rate": 4.8070854299112e-06, "loss": 0.0225, "reward": 1.6875, "reward_std": 0.3715250492095947, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.953125, "step": 472 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 304.453125, "epoch": 0.3784, "grad_norm": 1.2945351267840253, "kl": 0.48046875, "learning_rate": 4.806277864709828e-06, "loss": 0.0193, "reward": 1.734375, "reward_std": 0.24831002950668335, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 473 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 375.546875, "epoch": 0.3792, "grad_norm": 1.3149418973552802, "kl": 1.4453125, "learning_rate": 4.805468680873874e-06, "loss": 0.0581, "reward": 1.375, "reward_std": 0.27439430356025696, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.9375, "step": 474 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 379.71875, "epoch": 0.38, "grad_norm": 2.7455582598650285, "kl": 1.09375, "learning_rate": 4.804657878971252e-06, "loss": 0.0439, "reward": 1.71875, "reward_std": 0.19506090879440308, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.953125, "step": 475 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 366.921875, "epoch": 0.3808, "grad_norm": 3.0891644648564505, "kl": 2.015625, "learning_rate": 4.803845459571014e-06, "loss": 0.0806, "reward": 1.6875, "reward_std": 0.5475407242774963, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.890625, "step": 476 }, { "all_correct": 0.0, "all_wrong": 0.125, "completion_length": 445.15625, "epoch": 0.3816, "grad_norm": 5.880200757491912, "kl": 7.6875, "learning_rate": 4.803031423243349e-06, "loss": 0.309, "reward": 1.125, "reward_std": 0.6859990358352661, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.671875, "step": 477 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 451.234375, "epoch": 0.3824, "grad_norm": 10.03130773454054, "kl": 5.34375, "learning_rate": 4.802215770559578e-06, "loss": 0.2142, "reward": 1.484375, "reward_std": 0.4931156635284424, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.8125, "step": 478 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 455.046875, "epoch": 0.3832, "grad_norm": 2.1507203179173766, "kl": 2.703125, "learning_rate": 4.801398502092156e-06, "loss": 0.108, "reward": 1.578125, "reward_std": 0.3764522075653076, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.84375, "step": 479 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 321.890625, "epoch": 0.384, "grad_norm": 1.021505949210174, "kl": 0.318359375, "learning_rate": 4.800579618414677e-06, "loss": 0.0128, "reward": 1.796875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 480 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 359.53125, "epoch": 0.3848, "grad_norm": 0.6694409753576069, "kl": 0.1943359375, "learning_rate": 4.799759120101861e-06, "loss": 0.0077, "reward": 1.765625, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 481 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 311.375, "epoch": 0.3856, "grad_norm": 0.4359363974042517, "kl": 0.1357421875, "learning_rate": 4.798937007729568e-06, "loss": 0.0054, "reward": 1.71875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 482 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 350.828125, "epoch": 0.3864, "grad_norm": 1.5162481213831498, "kl": 0.134765625, "learning_rate": 4.798113281874788e-06, "loss": 0.0054, "reward": 1.671875, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.984375, "step": 483 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 321.4375, "epoch": 0.3872, "grad_norm": 0.4597374200983244, "kl": 0.142578125, "learning_rate": 4.797287943115642e-06, "loss": 0.0057, "reward": 1.71875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 484 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 396.8125, "epoch": 0.388, "grad_norm": 0.7607387695800587, "kl": 0.1201171875, "learning_rate": 4.796460992031386e-06, "loss": 0.0048, "reward": 1.65625, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 485 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 322.0625, "epoch": 0.3888, "grad_norm": 0.4831613708707655, "kl": 0.1416015625, "learning_rate": 4.7956324292024045e-06, "loss": 0.0057, "reward": 1.65625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 486 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 383.40625, "epoch": 0.3896, "grad_norm": 0.4081897198615544, "kl": 0.12255859375, "learning_rate": 4.794802255210217e-06, "loss": 0.0049, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 487 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 373.765625, "epoch": 0.3904, "grad_norm": 0.5795253658984987, "kl": 0.1298828125, "learning_rate": 4.793970470637469e-06, "loss": 0.0052, "reward": 1.65625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 488 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 379.296875, "epoch": 0.3912, "grad_norm": 0.38536641281872774, "kl": 0.140625, "learning_rate": 4.7931370760679415e-06, "loss": 0.0056, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 489 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 389.265625, "epoch": 0.392, "grad_norm": 0.884095503790402, "kl": 0.1240234375, "learning_rate": 4.792302072086542e-06, "loss": 0.005, "reward": 1.46875, "reward_std": 0.2130674123764038, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 490 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 387.1875, "epoch": 0.3928, "grad_norm": 0.9184220362440718, "kl": 0.1279296875, "learning_rate": 4.7914654592793065e-06, "loss": 0.0051, "reward": 1.71875, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 491 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 379.71875, "epoch": 0.3936, "grad_norm": 0.9116583207306123, "kl": 0.1298828125, "learning_rate": 4.790627238233405e-06, "loss": 0.0052, "reward": 1.609375, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 492 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 410.296875, "epoch": 0.3944, "grad_norm": 0.42981223850322453, "kl": 0.109375, "learning_rate": 4.789787409537131e-06, "loss": 0.0044, "reward": 1.953125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 493 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 408.34375, "epoch": 0.3952, "grad_norm": 0.3384378532478862, "kl": 0.1240234375, "learning_rate": 4.7889459737799105e-06, "loss": 0.0049, "reward": 1.640625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 494 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 404.125, "epoch": 0.396, "grad_norm": 0.9063049215223403, "kl": 0.12890625, "learning_rate": 4.788102931552294e-06, "loss": 0.0052, "reward": 1.703125, "reward_std": 0.28930896520614624, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 495 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 434.78125, "epoch": 0.3968, "grad_norm": 0.7924641044749103, "kl": 0.11328125, "learning_rate": 4.787258283445962e-06, "loss": 0.0045, "reward": 1.53125, "reward_std": 0.22461533546447754, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 496 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 376.15625, "epoch": 0.3976, "grad_norm": 0.5730328890520717, "kl": 0.1318359375, "learning_rate": 4.786412030053721e-06, "loss": 0.0053, "reward": 1.796875, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 497 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 433.03125, "epoch": 0.3984, "grad_norm": 0.4940836808042512, "kl": 0.12109375, "learning_rate": 4.785564171969503e-06, "loss": 0.0048, "reward": 1.796875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 498 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 384.609375, "epoch": 0.3992, "grad_norm": 1.014219373765608, "kl": 0.140625, "learning_rate": 4.784714709788368e-06, "loss": 0.0056, "reward": 1.796875, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 499 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 403.859375, "epoch": 0.4, "grad_norm": 0.543273438917351, "kl": 0.12158203125, "learning_rate": 4.783863644106502e-06, "loss": 0.0049, "reward": 1.75, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 500 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 398.46875, "epoch": 0.4008, "grad_norm": 0.09114864037290937, "kl": 0.12890625, "learning_rate": 4.783010975521216e-06, "loss": 0.0051, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 501 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 408.96875, "epoch": 0.4016, "grad_norm": 0.17449315291315534, "kl": 0.1328125, "learning_rate": 4.782156704630944e-06, "loss": 0.0053, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 502 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 433.734375, "epoch": 0.4024, "grad_norm": 0.5055626056608999, "kl": 0.115234375, "learning_rate": 4.7813008320352475e-06, "loss": 0.0046, "reward": 1.875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 503 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 364.359375, "epoch": 0.4032, "grad_norm": 0.7827907023189481, "kl": 0.1474609375, "learning_rate": 4.78044335833481e-06, "loss": 0.0059, "reward": 1.59375, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 504 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 426.484375, "epoch": 0.404, "grad_norm": 0.4589464961908402, "kl": 0.115234375, "learning_rate": 4.77958428413144e-06, "loss": 0.0046, "reward": 1.921875, "reward_std": 0.13868528604507446, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.984375, "step": 505 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 462.15625, "epoch": 0.4048, "grad_norm": 0.6544346911739349, "kl": 0.12353515625, "learning_rate": 4.7787236100280685e-06, "loss": 0.0049, "reward": 1.828125, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 506 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 452.515625, "epoch": 0.4056, "grad_norm": 1.3105254135159445, "kl": 0.1240234375, "learning_rate": 4.777861336628751e-06, "loss": 0.005, "reward": 1.78125, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 507 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 449.171875, "epoch": 0.4064, "grad_norm": 0.8220089849713779, "kl": 0.1416015625, "learning_rate": 4.7769974645386616e-06, "loss": 0.0057, "reward": 1.5, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 508 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 432.546875, "epoch": 0.4072, "grad_norm": 0.7873270142931792, "kl": 0.140625, "learning_rate": 4.776131994364102e-06, "loss": 0.0056, "reward": 1.671875, "reward_std": 0.1983242630958557, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 509 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 452.078125, "epoch": 0.408, "grad_norm": 1.2832406872401583, "kl": 0.12890625, "learning_rate": 4.775264926712489e-06, "loss": 0.0052, "reward": 1.75, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 510 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 437.03125, "epoch": 0.4088, "grad_norm": 0.5124697706361917, "kl": 0.12451171875, "learning_rate": 4.774396262192368e-06, "loss": 0.005, "reward": 1.671875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 511 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 488.8125, "epoch": 0.4096, "grad_norm": 0.4250072391435047, "kl": 0.1201171875, "learning_rate": 4.7735260014133986e-06, "loss": 0.0048, "reward": 1.921875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 512 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 417.34375, "epoch": 0.4104, "grad_norm": 0.40503428079574494, "kl": 0.14453125, "learning_rate": 4.772654144986364e-06, "loss": 0.0058, "reward": 1.546875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 513 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 462.703125, "epoch": 0.4112, "grad_norm": 0.7881543976365165, "kl": 0.1279296875, "learning_rate": 4.7717806935231665e-06, "loss": 0.0051, "reward": 1.8125, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 514 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 424.65625, "epoch": 0.412, "grad_norm": 0.5766837892479305, "kl": 0.154296875, "learning_rate": 4.770905647636828e-06, "loss": 0.0062, "reward": 1.53125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 515 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 419.375, "epoch": 0.4128, "grad_norm": 0.44407401013076075, "kl": 0.138671875, "learning_rate": 4.77002900794149e-06, "loss": 0.0055, "reward": 1.640625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 516 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 470.59375, "epoch": 0.4136, "grad_norm": 0.5235926160400373, "kl": 0.12890625, "learning_rate": 4.769150775052411e-06, "loss": 0.0051, "reward": 1.734375, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 517 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 444.578125, "epoch": 0.4144, "grad_norm": 0.9967752126166552, "kl": 0.1279296875, "learning_rate": 4.768270949585968e-06, "loss": 0.0051, "reward": 1.78125, "reward_std": 0.23356689512729645, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.984375, "step": 518 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 410.09375, "epoch": 0.4152, "grad_norm": 0.8675042446430157, "kl": 0.1376953125, "learning_rate": 4.767389532159659e-06, "loss": 0.0055, "reward": 1.5, "reward_std": 0.2630178928375244, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 519 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 396.46875, "epoch": 0.416, "grad_norm": 0.8224038768492201, "kl": 0.1494140625, "learning_rate": 4.766506523392095e-06, "loss": 0.006, "reward": 1.359375, "reward_std": 0.16887323558330536, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 1.0, "step": 520 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 448.890625, "epoch": 0.4168, "grad_norm": 0.5425410755487962, "kl": 0.125, "learning_rate": 4.765621923903005e-06, "loss": 0.005, "reward": 1.75, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 521 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 486.34375, "epoch": 0.4176, "grad_norm": 1.128073055749155, "kl": 0.130859375, "learning_rate": 4.764735734313236e-06, "loss": 0.0052, "reward": 1.75, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 522 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 426.1875, "epoch": 0.4184, "grad_norm": 0.4031109956060146, "kl": 0.11669921875, "learning_rate": 4.763847955244749e-06, "loss": 0.0047, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 523 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 480.203125, "epoch": 0.4192, "grad_norm": 0.90433099369881, "kl": 0.11474609375, "learning_rate": 4.762958587320623e-06, "loss": 0.0046, "reward": 1.734375, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 524 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 388.640625, "epoch": 0.42, "grad_norm": 0.6687933553064049, "kl": 0.1865234375, "learning_rate": 4.762067631165049e-06, "loss": 0.0075, "reward": 1.6875, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.984375, "step": 525 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 445.078125, "epoch": 0.4208, "grad_norm": 0.6786375875150067, "kl": 0.1181640625, "learning_rate": 4.761175087403336e-06, "loss": 0.0047, "reward": 1.84375, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.984375, "step": 526 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 438.96875, "epoch": 0.4216, "grad_norm": 0.7303677491642302, "kl": 0.1240234375, "learning_rate": 4.760280956661904e-06, "loss": 0.005, "reward": 1.953125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 527 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.53125, "epoch": 0.4224, "grad_norm": 0.7593473495539002, "kl": 0.1318359375, "learning_rate": 4.75938523956829e-06, "loss": 0.0053, "reward": 1.703125, "reward_std": 0.2519446909427643, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 528 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 410.75, "epoch": 0.4232, "grad_norm": 0.330189456706158, "kl": 0.1298828125, "learning_rate": 4.75848793675114e-06, "loss": 0.0052, "reward": 1.609375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 529 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.1875, "epoch": 0.424, "grad_norm": 0.6018398606145906, "kl": 0.1396484375, "learning_rate": 4.757589048840219e-06, "loss": 0.0056, "reward": 1.78125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 530 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 456.4375, "epoch": 0.4248, "grad_norm": 0.9966756944312201, "kl": 0.142578125, "learning_rate": 4.756688576466398e-06, "loss": 0.0057, "reward": 1.75, "reward_std": 0.19727617502212524, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.984375, "step": 531 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 410.59375, "epoch": 0.4256, "grad_norm": 0.5972720922208659, "kl": 0.1328125, "learning_rate": 4.755786520261666e-06, "loss": 0.0053, "reward": 1.765625, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 532 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 464.234375, "epoch": 0.4264, "grad_norm": 0.9903609430647151, "kl": 0.15234375, "learning_rate": 4.75488288085912e-06, "loss": 0.0061, "reward": 1.890625, "reward_std": 0.22707363963127136, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.953125, "step": 533 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 502.03125, "epoch": 0.4272, "grad_norm": 0.7985987687798536, "kl": 0.146484375, "learning_rate": 4.753977658892967e-06, "loss": 0.0058, "reward": 1.65625, "reward_std": 0.2404065728187561, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.984375, "step": 534 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 454.25, "epoch": 0.428, "grad_norm": 0.7620615509609708, "kl": 0.13671875, "learning_rate": 4.753070854998529e-06, "loss": 0.0055, "reward": 1.6875, "reward_std": 0.19727617502212524, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 535 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 466.875, "epoch": 0.4288, "grad_norm": 1.2767021083048151, "kl": 0.134765625, "learning_rate": 4.752162469812234e-06, "loss": 0.0054, "reward": 1.703125, "reward_std": 0.27883461117744446, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.9375, "step": 536 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 382.40625, "epoch": 0.4296, "grad_norm": 0.6046278635344334, "kl": 0.1806640625, "learning_rate": 4.751252503971624e-06, "loss": 0.0072, "reward": 1.8125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 537 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 485.984375, "epoch": 0.4304, "grad_norm": 1.0901821078716176, "kl": 0.19140625, "learning_rate": 4.750340958115346e-06, "loss": 0.0077, "reward": 1.5, "reward_std": 0.2798827290534973, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.953125, "step": 538 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 429.78125, "epoch": 0.4312, "grad_norm": 1.1995065520907704, "kl": 0.220703125, "learning_rate": 4.749427832883158e-06, "loss": 0.0088, "reward": 1.796875, "reward_std": 0.19044628739356995, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.96875, "step": 539 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 443.078125, "epoch": 0.432, "grad_norm": 1.9491122287555656, "kl": 0.484375, "learning_rate": 4.748513128915928e-06, "loss": 0.0194, "reward": 1.609375, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.921875, "step": 540 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 474.0625, "epoch": 0.4328, "grad_norm": 3.1890070394788874, "kl": 0.6171875, "learning_rate": 4.747596846855629e-06, "loss": 0.0246, "reward": 1.59375, "reward_std": 0.26409149169921875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.9375, "step": 541 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 486.125, "epoch": 0.4336, "grad_norm": 2.0043216789347746, "kl": 0.60546875, "learning_rate": 4.7466789873453446e-06, "loss": 0.0242, "reward": 1.53125, "reward_std": 0.3412526845932007, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.953125, "step": 542 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 490.640625, "epoch": 0.4344, "grad_norm": 2.580392206072138, "kl": 2.140625, "learning_rate": 4.7457595510292615e-06, "loss": 0.0856, "reward": 1.671875, "reward_std": 0.4963078498840332, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.859375, "step": 543 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 398.0625, "epoch": 0.4352, "grad_norm": 2.597169689699224, "kl": 1.1328125, "learning_rate": 4.744838538552678e-06, "loss": 0.0454, "reward": 1.625, "reward_std": 0.22558549046516418, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.921875, "step": 544 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.46875, "epoch": 0.436, "grad_norm": 1.0571142153194588, "kl": 1.546875, "learning_rate": 4.7439159505619946e-06, "loss": 0.0618, "reward": 1.890625, "reward_std": 0.20290401577949524, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.953125, "step": 545 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 407.71875, "epoch": 0.4368, "grad_norm": 2.0231863556745306, "kl": 1.6875, "learning_rate": 4.74299178770472e-06, "loss": 0.0675, "reward": 1.65625, "reward_std": 0.3596561551094055, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.921875, "step": 546 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 424.0625, "epoch": 0.4376, "grad_norm": 1.5551994849968498, "kl": 3.046875, "learning_rate": 4.742066050629465e-06, "loss": 0.1219, "reward": 1.484375, "reward_std": 0.39774754643440247, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.921875, "step": 547 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 360.84375, "epoch": 0.4384, "grad_norm": 3.919915861139782, "kl": 3.1875, "learning_rate": 4.741138739985951e-06, "loss": 0.1278, "reward": 1.421875, "reward_std": 0.45242589712142944, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.90625, "step": 548 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 389.46875, "epoch": 0.4392, "grad_norm": 23.12505474924314, "kl": 14.625, "learning_rate": 4.740209856424998e-06, "loss": 0.5854, "reward": 1.21875, "reward_std": 0.712020993232727, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.578125, "step": 549 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 339.640625, "epoch": 0.44, "grad_norm": 1.9920077078374177, "kl": 2.578125, "learning_rate": 4.7392794005985324e-06, "loss": 0.1031, "reward": 1.59375, "reward_std": 0.4238043427467346, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.90625, "step": 550 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 404.171875, "epoch": 0.4408, "grad_norm": 2.8098958853055827, "kl": 4.75, "learning_rate": 4.738347373159585e-06, "loss": 0.1898, "reward": 1.578125, "reward_std": 0.5108742117881775, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.828125, "step": 551 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 311.0625, "epoch": 0.4416, "grad_norm": 1.752527803564222, "kl": 0.435546875, "learning_rate": 4.737413774762287e-06, "loss": 0.0175, "reward": 1.71875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.96875, "step": 552 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 324.328125, "epoch": 0.4424, "grad_norm": 0.623290617592855, "kl": 0.1298828125, "learning_rate": 4.736478606061876e-06, "loss": 0.0052, "reward": 1.640625, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 553 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 382.984375, "epoch": 0.4432, "grad_norm": 0.9636215109224563, "kl": 0.373046875, "learning_rate": 4.735541867714687e-06, "loss": 0.0149, "reward": 1.65625, "reward_std": 0.21917018294334412, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.984375, "step": 554 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 372.0625, "epoch": 0.444, "grad_norm": 0.4643488621551407, "kl": 0.1220703125, "learning_rate": 4.73460356037816e-06, "loss": 0.0049, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 555 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 349.78125, "epoch": 0.4448, "grad_norm": 0.4106290847161274, "kl": 0.10888671875, "learning_rate": 4.733663684710835e-06, "loss": 0.0043, "reward": 1.796875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 556 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 357.859375, "epoch": 0.4456, "grad_norm": 0.8089250709426993, "kl": 0.12109375, "learning_rate": 4.732722241372354e-06, "loss": 0.0048, "reward": 1.6875, "reward_std": 0.17570313811302185, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.984375, "step": 557 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 363.03125, "epoch": 0.4464, "grad_norm": 0.46368035433701, "kl": 0.10498046875, "learning_rate": 4.731779231023456e-06, "loss": 0.0042, "reward": 1.953125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 558 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 301.84375, "epoch": 0.4472, "grad_norm": 0.9322080245234828, "kl": 0.13671875, "learning_rate": 4.730834654325984e-06, "loss": 0.0055, "reward": 1.671875, "reward_std": 0.19939783215522766, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.984375, "step": 559 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 358.171875, "epoch": 0.448, "grad_norm": 0.39997720342036575, "kl": 0.125, "learning_rate": 4.729888511942877e-06, "loss": 0.005, "reward": 1.5625, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 560 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 287.71875, "epoch": 0.4488, "grad_norm": 0.9548420650515719, "kl": 0.126953125, "learning_rate": 4.728940804538176e-06, "loss": 0.0051, "reward": 1.625, "reward_std": 0.2177756428718567, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.984375, "step": 561 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 361.078125, "epoch": 0.4496, "grad_norm": 3.617701918943756, "kl": 0.1240234375, "learning_rate": 4.727991532777016e-06, "loss": 0.005, "reward": 1.8125, "reward_std": 0.22461533546447754, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 562 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 333.625, "epoch": 0.4504, "grad_norm": 0.7436500355949118, "kl": 0.130859375, "learning_rate": 4.727040697325634e-06, "loss": 0.0052, "reward": 1.59375, "reward_std": 0.17570312321186066, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.984375, "step": 563 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 359.921875, "epoch": 0.4512, "grad_norm": 0.8053192115050734, "kl": 0.12109375, "learning_rate": 4.726088298851362e-06, "loss": 0.0049, "reward": 1.59375, "reward_std": 0.2177756428718567, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 564 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 376.40625, "epoch": 0.452, "grad_norm": 0.8159629873340042, "kl": 0.1123046875, "learning_rate": 4.725134338022631e-06, "loss": 0.0045, "reward": 1.65625, "reward_std": 0.19727617502212524, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.984375, "step": 565 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 322.578125, "epoch": 0.4528, "grad_norm": 4.563543622614359, "kl": 0.134765625, "learning_rate": 4.724178815508967e-06, "loss": 0.0054, "reward": 1.6875, "reward_std": 0.1872510462999344, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 566 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 368.265625, "epoch": 0.4536, "grad_norm": 0.3786388782574309, "kl": 0.1171875, "learning_rate": 4.723221731980993e-06, "loss": 0.0047, "reward": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 567 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 357.828125, "epoch": 0.4544, "grad_norm": 0.8553088933063837, "kl": 0.1201171875, "learning_rate": 4.722263088110426e-06, "loss": 0.0048, "reward": 1.53125, "reward_std": 0.2177756428718567, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 568 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 368.6875, "epoch": 0.4552, "grad_norm": 0.4770339948697236, "kl": 0.130859375, "learning_rate": 4.721302884570079e-06, "loss": 0.0052, "reward": 1.625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 569 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 352.921875, "epoch": 0.456, "grad_norm": 1.0804985777028955, "kl": 0.1171875, "learning_rate": 4.720341122033862e-06, "loss": 0.0047, "reward": 1.78125, "reward_std": 0.2041158676147461, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.984375, "step": 570 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 386.78125, "epoch": 0.4568, "grad_norm": 0.35084366597904426, "kl": 0.1162109375, "learning_rate": 4.719377801176774e-06, "loss": 0.0046, "reward": 1.703125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 571 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 327.75, "epoch": 0.4576, "grad_norm": 0.4010624118550041, "kl": 0.140625, "learning_rate": 4.718412922674913e-06, "loss": 0.0056, "reward": 1.59375, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 572 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 361.28125, "epoch": 0.4584, "grad_norm": 0.5170960923198824, "kl": 0.1298828125, "learning_rate": 4.717446487205466e-06, "loss": 0.0052, "reward": 1.6875, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 573 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 383.25, "epoch": 0.4592, "grad_norm": 1.9945218987513769, "kl": 0.15234375, "learning_rate": 4.716478495446717e-06, "loss": 0.0061, "reward": 1.828125, "reward_std": 0.17358146607875824, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.984375, "step": 574 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 410.40625, "epoch": 0.46, "grad_norm": 2.895751664424082, "kl": 0.19140625, "learning_rate": 4.715508948078037e-06, "loss": 0.0076, "reward": 1.546875, "reward_std": 0.19044628739356995, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.9375, "step": 575 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 376.1875, "epoch": 0.4608, "grad_norm": 1.638065505747679, "kl": 0.400390625, "learning_rate": 4.714537845779894e-06, "loss": 0.0161, "reward": 1.890625, "reward_std": 0.19654905796051025, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 0.921875, "step": 576 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 349.734375, "epoch": 0.4616, "grad_norm": 2.5247233544787684, "kl": 0.54296875, "learning_rate": 4.7135651892338445e-06, "loss": 0.0217, "reward": 1.609375, "reward_std": 0.38194066286087036, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.921875, "step": 577 }, { "all_correct": 0.125, "all_wrong": 0.5, "completion_length": 338.265625, "epoch": 0.4624, "grad_norm": 2.7984982639925433, "kl": 0.75390625, "learning_rate": 4.712590979122534e-06, "loss": 0.0301, "reward": 1.328125, "reward_std": 0.26196980476379395, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.90625, "step": 578 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 381.15625, "epoch": 0.4632, "grad_norm": 2.735569317045563, "kl": 1.375, "learning_rate": 4.7116152161297045e-06, "loss": 0.055, "reward": 1.390625, "reward_std": 0.39916694164276123, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.875, "step": 579 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 442.53125, "epoch": 0.464, "grad_norm": 3.1388133340699715, "kl": 4.5625, "learning_rate": 4.710637900940181e-06, "loss": 0.1821, "reward": 1.25, "reward_std": 0.6596025228500366, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.734375, "step": 580 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 365.3125, "epoch": 0.4648, "grad_norm": 2.807083376416559, "kl": 2.328125, "learning_rate": 4.7096590342398825e-06, "loss": 0.0932, "reward": 1.703125, "reward_std": 0.44655635952949524, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.921875, "step": 581 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 387.5, "epoch": 0.4656, "grad_norm": 2.8132496897579014, "kl": 4.125, "learning_rate": 4.708678616715815e-06, "loss": 0.1651, "reward": 1.578125, "reward_std": 0.4879796802997589, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.875, "step": 582 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 344.5, "epoch": 0.4664, "grad_norm": 1.8018781601902947, "kl": 1.046875, "learning_rate": 4.707696649056073e-06, "loss": 0.0419, "reward": 1.5, "reward_std": 0.3648670017719269, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.96875, "step": 583 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 373.484375, "epoch": 0.4672, "grad_norm": 0.760952215794593, "kl": 1.140625, "learning_rate": 4.706713131949839e-06, "loss": 0.0456, "reward": 1.796875, "reward_std": 0.22097086906433105, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.96875, "step": 584 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 365.390625, "epoch": 0.468, "grad_norm": 0.58701159191976, "kl": 0.69921875, "learning_rate": 4.705728066087384e-06, "loss": 0.028, "reward": 1.765625, "reward_std": 0.1530819833278656, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.984375, "step": 585 }, { "all_correct": 0.375, "all_wrong": 0.5, "completion_length": 359.875, "epoch": 0.4688, "grad_norm": 0.5590360711726854, "kl": 0.87109375, "learning_rate": 4.704741452160064e-06, "loss": 0.0347, "reward": 1.375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 0.984375, "step": 586 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 336.140625, "epoch": 0.4696, "grad_norm": 0.37962287263134975, "kl": 0.13671875, "learning_rate": 4.703753290860323e-06, "loss": 0.0055, "reward": 1.609375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 587 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 321.515625, "epoch": 0.4704, "grad_norm": 0.7498115796865782, "kl": 0.77734375, "learning_rate": 4.702763582881692e-06, "loss": 0.0311, "reward": 1.859375, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.984375, "step": 588 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 359.125, "epoch": 0.4712, "grad_norm": 1.149421698917399, "kl": 1.75, "learning_rate": 4.701772328918784e-06, "loss": 0.07, "reward": 1.78125, "reward_std": 0.2346404492855072, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.96875, "step": 589 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 365.5625, "epoch": 0.472, "grad_norm": 0.4679289139806567, "kl": 0.1318359375, "learning_rate": 4.700779529667301e-06, "loss": 0.0053, "reward": 1.546875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 590 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 341.84375, "epoch": 0.4728, "grad_norm": 1.4087039371104781, "kl": 0.12890625, "learning_rate": 4.699785185824026e-06, "loss": 0.0052, "reward": 1.53125, "reward_std": 0.1872510462999344, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 591 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 348.71875, "epoch": 0.4736, "grad_norm": 0.7139348028378941, "kl": 0.1474609375, "learning_rate": 4.69878929808683e-06, "loss": 0.0059, "reward": 1.8125, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 592 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 348.796875, "epoch": 0.4744, "grad_norm": 0.09219704742523589, "kl": 0.11962890625, "learning_rate": 4.6977918671546635e-06, "loss": 0.0048, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 593 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 332.84375, "epoch": 0.4752, "grad_norm": 0.45624501772723347, "kl": 0.1435546875, "learning_rate": 4.696792893727562e-06, "loss": 0.0057, "reward": 1.546875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 594 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 304.5, "epoch": 0.476, "grad_norm": 0.9202164978785126, "kl": 0.1650390625, "learning_rate": 4.695792378506645e-06, "loss": 0.0066, "reward": 1.421875, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 1.0, "step": 595 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 332.828125, "epoch": 0.4768, "grad_norm": 0.6614322723537502, "kl": 0.140625, "learning_rate": 4.694790322194111e-06, "loss": 0.0056, "reward": 1.8125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 596 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 394.515625, "epoch": 0.4776, "grad_norm": 0.2163278343277934, "kl": 0.123046875, "learning_rate": 4.693786725493242e-06, "loss": 0.0049, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 597 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 353.6875, "epoch": 0.4784, "grad_norm": 1.0025384969215045, "kl": 0.146484375, "learning_rate": 4.692781589108402e-06, "loss": 0.0059, "reward": 1.6875, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 598 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 304.203125, "epoch": 0.4792, "grad_norm": 0.8555086375496709, "kl": 0.1474609375, "learning_rate": 4.691774913745033e-06, "loss": 0.0059, "reward": 1.59375, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 599 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 388.4375, "epoch": 0.48, "grad_norm": 0.6762760084546305, "kl": 0.12890625, "learning_rate": 4.690766700109659e-06, "loss": 0.0052, "reward": 1.71875, "reward_std": 0.1872510462999344, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 600 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 323.890625, "epoch": 0.4808, "grad_norm": 0.8763816444356921, "kl": 0.146484375, "learning_rate": 4.689756948909884e-06, "loss": 0.0059, "reward": 1.65625, "reward_std": 0.1872510462999344, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 601 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 351.65625, "epoch": 0.4816, "grad_norm": 0.8698407297285081, "kl": 0.158203125, "learning_rate": 4.688745660854388e-06, "loss": 0.0063, "reward": 1.515625, "reward_std": 0.189372718334198, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 1.0, "step": 602 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 359.6875, "epoch": 0.4824, "grad_norm": 0.7746532613310444, "kl": 0.1435546875, "learning_rate": 4.687732836652935e-06, "loss": 0.0058, "reward": 1.875, "reward_std": 0.17570312321186066, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 0.984375, "step": 603 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 342.328125, "epoch": 0.4832, "grad_norm": 0.9476309548652774, "kl": 0.138671875, "learning_rate": 4.686718477016361e-06, "loss": 0.0056, "reward": 1.625, "reward_std": 0.2177756428718567, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 604 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 372.625, "epoch": 0.484, "grad_norm": 0.6907800503076698, "kl": 0.1416015625, "learning_rate": 4.6857025826565845e-06, "loss": 0.0056, "reward": 1.390625, "reward_std": 0.17358146607875824, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 1.0, "step": 605 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 333.875, "epoch": 0.4848, "grad_norm": 0.7036792594591179, "kl": 0.1474609375, "learning_rate": 4.684685154286599e-06, "loss": 0.0059, "reward": 1.78125, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 606 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 334.3125, "epoch": 0.4856, "grad_norm": 0.7784029972183542, "kl": 0.150390625, "learning_rate": 4.683666192620474e-06, "loss": 0.006, "reward": 1.640625, "reward_std": 0.189372718334198, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 607 }, { "all_correct": 0.375, "all_wrong": 0.5, "completion_length": 381.234375, "epoch": 0.4864, "grad_norm": 0.3276931766459097, "kl": 0.1337890625, "learning_rate": 4.682645698373357e-06, "loss": 0.0053, "reward": 1.390625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 1.0, "step": 608 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 385.4375, "epoch": 0.4872, "grad_norm": 1.0244585337473353, "kl": 0.15234375, "learning_rate": 4.6816236722614694e-06, "loss": 0.0061, "reward": 1.578125, "reward_std": 0.16887323558330536, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 609 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 384.28125, "epoch": 0.488, "grad_norm": 0.6623725278278989, "kl": 0.1298828125, "learning_rate": 4.680600115002109e-06, "loss": 0.0052, "reward": 1.734375, "reward_std": 0.17358146607875824, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 610 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 410.140625, "epoch": 0.4888, "grad_norm": 1.1182525515241812, "kl": 0.345703125, "learning_rate": 4.679575027313649e-06, "loss": 0.0139, "reward": 1.75, "reward_std": 0.19727616012096405, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.984375, "step": 611 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 402.421875, "epoch": 0.4896, "grad_norm": 1.1473152123947696, "kl": 0.12890625, "learning_rate": 4.6785484099155324e-06, "loss": 0.0052, "reward": 1.765625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 612 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 403.703125, "epoch": 0.4904, "grad_norm": 0.37850839844547285, "kl": 0.1484375, "learning_rate": 4.67752026352828e-06, "loss": 0.0059, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 613 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 431.734375, "epoch": 0.4912, "grad_norm": 0.680113184405759, "kl": 0.1357421875, "learning_rate": 4.676490588873486e-06, "loss": 0.0054, "reward": 1.765625, "reward_std": 0.189372718334198, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 614 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 463.28125, "epoch": 0.492, "grad_norm": 0.9847351909602291, "kl": 0.13671875, "learning_rate": 4.675459386673815e-06, "loss": 0.0055, "reward": 1.625, "reward_std": 0.2756393849849701, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 615 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 466.578125, "epoch": 0.4928, "grad_norm": 0.45510120379863056, "kl": 0.150390625, "learning_rate": 4.674426657653003e-06, "loss": 0.006, "reward": 1.734375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 616 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 347.984375, "epoch": 0.4936, "grad_norm": 0.7723889384366126, "kl": 0.1396484375, "learning_rate": 4.67339240253586e-06, "loss": 0.0056, "reward": 1.453125, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 1.0, "step": 617 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 421.75, "epoch": 0.4944, "grad_norm": 0.537734608716655, "kl": 0.140625, "learning_rate": 4.672356622048266e-06, "loss": 0.0056, "reward": 1.703125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 618 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 356.71875, "epoch": 0.4952, "grad_norm": 0.671441619255682, "kl": 0.140625, "learning_rate": 4.671319316917172e-06, "loss": 0.0056, "reward": 1.46875, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 619 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 395.765625, "epoch": 0.496, "grad_norm": 0.7721416598091413, "kl": 0.7734375, "learning_rate": 4.670280487870599e-06, "loss": 0.0308, "reward": 1.6875, "reward_std": 0.21556037664413452, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.984375, "step": 620 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 390.46875, "epoch": 0.4968, "grad_norm": 0.9680757407929496, "kl": 0.169921875, "learning_rate": 4.669240135637635e-06, "loss": 0.0068, "reward": 1.53125, "reward_std": 0.25513991713523865, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.984375, "step": 621 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 378.234375, "epoch": 0.4976, "grad_norm": 0.4582522192089543, "kl": 0.1396484375, "learning_rate": 4.668198260948442e-06, "loss": 0.0056, "reward": 1.6875, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 622 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 448.0625, "epoch": 0.4984, "grad_norm": 2.4735346905444797, "kl": 2.546875, "learning_rate": 4.667154864534245e-06, "loss": 0.1017, "reward": 1.625, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.9375, "step": 623 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 453.34375, "epoch": 0.4992, "grad_norm": 2.3522533719660346, "kl": 1.1328125, "learning_rate": 4.666109947127343e-06, "loss": 0.0452, "reward": 1.875, "reward_std": 0.1825428307056427, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.96875, "step": 624 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 371.96875, "epoch": 0.5, "grad_norm": 0.67998421474587, "kl": 0.134765625, "learning_rate": 4.665063509461098e-06, "loss": 0.0054, "reward": 1.53125, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 625 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 399.28125, "epoch": 0.5008, "grad_norm": 0.6331284378912473, "kl": 1.7265625, "learning_rate": 4.664015552269938e-06, "loss": 0.069, "reward": 1.5625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.96875, "step": 626 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 433.71875, "epoch": 0.5016, "grad_norm": 0.9447568979983324, "kl": 0.6015625, "learning_rate": 4.662966076289363e-06, "loss": 0.0241, "reward": 1.9375, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 0.984375, "step": 627 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 408.09375, "epoch": 0.5024, "grad_norm": 0.448770492187416, "kl": 0.1298828125, "learning_rate": 4.661915082255932e-06, "loss": 0.0052, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 628 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 372.140625, "epoch": 0.5032, "grad_norm": 1.0622563594989127, "kl": 0.150390625, "learning_rate": 4.6608625709072766e-06, "loss": 0.006, "reward": 1.515625, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 1.0, "step": 629 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 430.5625, "epoch": 0.504, "grad_norm": 0.7294440628390886, "kl": 0.1328125, "learning_rate": 4.659808542982089e-06, "loss": 0.0053, "reward": 1.65625, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 630 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 394.90625, "epoch": 0.5048, "grad_norm": 0.7076799509181766, "kl": 0.890625, "learning_rate": 4.658752999220125e-06, "loss": 0.0356, "reward": 1.828125, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.984375, "step": 631 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 434.0, "epoch": 0.5056, "grad_norm": 0.7658790385590288, "kl": 0.251953125, "learning_rate": 4.657695940362207e-06, "loss": 0.0101, "reward": 1.625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 632 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 394.765625, "epoch": 0.5064, "grad_norm": 0.8367046057258256, "kl": 0.66796875, "learning_rate": 4.65663736715022e-06, "loss": 0.0268, "reward": 1.421875, "reward_std": 0.3054368495941162, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.984375, "step": 633 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 363.171875, "epoch": 0.5072, "grad_norm": 0.7394620159296568, "kl": 0.21875, "learning_rate": 4.65557728032711e-06, "loss": 0.0087, "reward": 1.828125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 634 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 438.46875, "epoch": 0.508, "grad_norm": 0.9007138649455658, "kl": 0.388671875, "learning_rate": 4.654515680636888e-06, "loss": 0.0155, "reward": 1.9375, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 0.984375, "step": 635 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 407.859375, "epoch": 0.5088, "grad_norm": 0.8812664025932062, "kl": 1.3046875, "learning_rate": 4.653452568824625e-06, "loss": 0.0525, "reward": 1.78125, "reward_std": 0.3319803476333618, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.96875, "step": 636 }, { "all_correct": 0.25, "all_wrong": 0.625, "completion_length": 371.84375, "epoch": 0.5096, "grad_norm": 1.9274762594921737, "kl": 0.494140625, "learning_rate": 4.652387945636454e-06, "loss": 0.0198, "reward": 1.28125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.984375, "step": 637 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 475.71875, "epoch": 0.5104, "grad_norm": 0.5049969313742863, "kl": 0.12158203125, "learning_rate": 4.651321811819568e-06, "loss": 0.0049, "reward": 1.765625, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 638 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 363.796875, "epoch": 0.5112, "grad_norm": 0.683187960721247, "kl": 0.1357421875, "learning_rate": 4.650254168122222e-06, "loss": 0.0054, "reward": 1.78125, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 639 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 468.65625, "epoch": 0.512, "grad_norm": 2.3577243207279666, "kl": 0.828125, "learning_rate": 4.649185015293728e-06, "loss": 0.0331, "reward": 1.53125, "reward_std": 0.22451192140579224, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.984375, "step": 640 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 389.734375, "epoch": 0.5128, "grad_norm": 0.8012985914421464, "kl": 1.2109375, "learning_rate": 4.64811435408446e-06, "loss": 0.0483, "reward": 1.6875, "reward_std": 0.22558549046516418, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.96875, "step": 641 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 449.625, "epoch": 0.5136, "grad_norm": 1.2406518104534963, "kl": 1.65625, "learning_rate": 4.647042185245848e-06, "loss": 0.0664, "reward": 1.640625, "reward_std": 0.22707363963127136, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.953125, "step": 642 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 409.09375, "epoch": 0.5144, "grad_norm": 0.7995409575096425, "kl": 0.734375, "learning_rate": 4.645968509530381e-06, "loss": 0.0293, "reward": 1.5625, "reward_std": 0.23356688022613525, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.984375, "step": 643 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 405.140625, "epoch": 0.5152, "grad_norm": 0.9728174896744552, "kl": 0.123046875, "learning_rate": 4.644893327691608e-06, "loss": 0.0049, "reward": 1.71875, "reward_std": 0.3208816647529602, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 644 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 355.171875, "epoch": 0.516, "grad_norm": 1.2186157898665686, "kl": 1.5078125, "learning_rate": 4.6438166404841316e-06, "loss": 0.0603, "reward": 1.53125, "reward_std": 0.47431010007858276, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.96875, "step": 645 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 469.75, "epoch": 0.5168, "grad_norm": 1.2018790641173411, "kl": 0.86328125, "learning_rate": 4.6427384486636115e-06, "loss": 0.0347, "reward": 1.640625, "reward_std": 0.3187600076198578, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.953125, "step": 646 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 415.734375, "epoch": 0.5176, "grad_norm": 0.5962496013183518, "kl": 0.11181640625, "learning_rate": 4.6416587529867665e-06, "loss": 0.0045, "reward": 1.8125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 647 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 362.5625, "epoch": 0.5184, "grad_norm": 0.5950301304457719, "kl": 0.1083984375, "learning_rate": 4.640577554211366e-06, "loss": 0.0043, "reward": 1.875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 648 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.78125, "epoch": 0.5192, "grad_norm": 0.7874157693561777, "kl": 0.1103515625, "learning_rate": 4.63949485309624e-06, "loss": 0.0044, "reward": 1.890625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 649 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 355.5, "epoch": 0.52, "grad_norm": 1.5109255828936672, "kl": 0.1181640625, "learning_rate": 4.638410650401267e-06, "loss": 0.0047, "reward": 1.5, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 650 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 452.46875, "epoch": 0.5208, "grad_norm": 0.6834046639715636, "kl": 0.11767578125, "learning_rate": 4.637324946887384e-06, "loss": 0.0047, "reward": 1.8125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 651 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 348.1875, "epoch": 0.5216, "grad_norm": 3.4529482216950655, "kl": 0.87109375, "learning_rate": 4.636237743316578e-06, "loss": 0.0348, "reward": 1.609375, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.984375, "step": 652 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 424.84375, "epoch": 0.5224, "grad_norm": 0.731927864044106, "kl": 0.125, "learning_rate": 4.635149040451891e-06, "loss": 0.005, "reward": 1.453125, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 1.0, "step": 653 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 418.328125, "epoch": 0.5232, "grad_norm": 0.7038942934042214, "kl": 0.11328125, "learning_rate": 4.634058839057417e-06, "loss": 0.0045, "reward": 1.640625, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 654 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 391.9375, "epoch": 0.524, "grad_norm": 0.7724484209864683, "kl": 0.1103515625, "learning_rate": 4.632967139898301e-06, "loss": 0.0044, "reward": 1.734375, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 655 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 419.609375, "epoch": 0.5248, "grad_norm": 0.7184964328574942, "kl": 0.11767578125, "learning_rate": 4.63187394374074e-06, "loss": 0.0047, "reward": 1.78125, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 656 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 415.90625, "epoch": 0.5256, "grad_norm": 0.3646902862710937, "kl": 0.10595703125, "learning_rate": 4.63077925135198e-06, "loss": 0.0042, "reward": 1.65625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 657 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 432.1875, "epoch": 0.5264, "grad_norm": 0.670708203135066, "kl": 0.10009765625, "learning_rate": 4.629683063500319e-06, "loss": 0.004, "reward": 1.53125, "reward_std": 0.17570312321186066, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 658 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 372.828125, "epoch": 0.5272, "grad_norm": 0.7104430342537692, "kl": 0.98828125, "learning_rate": 4.628585380955104e-06, "loss": 0.0394, "reward": 1.625, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.984375, "step": 659 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 402.796875, "epoch": 0.528, "grad_norm": 0.8181530916132829, "kl": 0.91796875, "learning_rate": 4.62748620448673e-06, "loss": 0.0368, "reward": 1.640625, "reward_std": 0.2109457403421402, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 660 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 384.015625, "epoch": 0.5288, "grad_norm": 0.6460679965046602, "kl": 0.81640625, "learning_rate": 4.626385534866642e-06, "loss": 0.0327, "reward": 1.765625, "reward_std": 0.15981829166412354, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.984375, "step": 661 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 428.0625, "epoch": 0.5296, "grad_norm": 0.8947737055186511, "kl": 0.11279296875, "learning_rate": 4.625283372867333e-06, "loss": 0.0045, "reward": 1.734375, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 662 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.65625, "epoch": 0.5304, "grad_norm": 0.6340310799448744, "kl": 0.10595703125, "learning_rate": 4.624179719262342e-06, "loss": 0.0042, "reward": 1.859375, "reward_std": 0.1315089464187622, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 663 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 397.96875, "epoch": 0.5312, "grad_norm": 0.9445756454310534, "kl": 0.890625, "learning_rate": 4.623074574826254e-06, "loss": 0.0355, "reward": 1.765625, "reward_std": 0.16887323558330536, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.984375, "step": 664 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 366.71875, "epoch": 0.532, "grad_norm": 1.6832785858802943, "kl": 0.12353515625, "learning_rate": 4.621967940334705e-06, "loss": 0.0049, "reward": 1.609375, "reward_std": 0.2651395797729492, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 665 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 436.90625, "epoch": 0.5328, "grad_norm": 0.3725953073806499, "kl": 1.0859375, "learning_rate": 4.620859816564371e-06, "loss": 0.0434, "reward": 1.84375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.984375, "step": 666 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 369.609375, "epoch": 0.5336, "grad_norm": 2.10160684646745, "kl": 0.11572265625, "learning_rate": 4.619750204292978e-06, "loss": 0.0046, "reward": 1.59375, "reward_std": 0.1962026059627533, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 667 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 396.53125, "epoch": 0.5344, "grad_norm": 1.0811850376721042, "kl": 0.12060546875, "learning_rate": 4.618639104299294e-06, "loss": 0.0048, "reward": 1.71875, "reward_std": 0.28247907757759094, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 668 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 372.96875, "epoch": 0.5352, "grad_norm": 4.040796775648103, "kl": 0.365234375, "learning_rate": 4.6175265173631304e-06, "loss": 0.0147, "reward": 1.609375, "reward_std": 0.33351296186447144, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 669 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 367.046875, "epoch": 0.536, "grad_norm": 0.9364156193477713, "kl": 0.84765625, "learning_rate": 4.616412444265344e-06, "loss": 0.0338, "reward": 1.671875, "reward_std": 0.23531240224838257, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.984375, "step": 670 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 377.015625, "epoch": 0.5368, "grad_norm": 0.7278856518684674, "kl": 0.11328125, "learning_rate": 4.6152968857878365e-06, "loss": 0.0045, "reward": 1.765625, "reward_std": 0.17358146607875824, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 671 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 367.828125, "epoch": 0.5376, "grad_norm": 1.907213125846056, "kl": 0.119140625, "learning_rate": 4.6141798427135475e-06, "loss": 0.0048, "reward": 1.6875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 672 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 419.390625, "epoch": 0.5384, "grad_norm": 0.6939537294722774, "kl": 0.115234375, "learning_rate": 4.6130613158264605e-06, "loss": 0.0046, "reward": 1.859375, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 673 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 462.15625, "epoch": 0.5392, "grad_norm": 2.732658890054957, "kl": 1.125, "learning_rate": 4.611941305911602e-06, "loss": 0.0452, "reward": 1.875, "reward_std": 0.1243029236793518, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.96875, "step": 674 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 388.640625, "epoch": 0.54, "grad_norm": 0.6753912416589343, "kl": 0.8515625, "learning_rate": 4.610819813755038e-06, "loss": 0.0339, "reward": 1.71875, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 675 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 404.625, "epoch": 0.5408, "grad_norm": 1.9874445396659512, "kl": 0.1298828125, "learning_rate": 4.609696840143875e-06, "loss": 0.0052, "reward": 1.546875, "reward_std": 0.25726157426834106, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 676 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 396.46875, "epoch": 0.5416, "grad_norm": 0.7510630767554702, "kl": 0.9609375, "learning_rate": 4.6085723858662575e-06, "loss": 0.0383, "reward": 1.515625, "reward_std": 0.19939783215522766, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.984375, "step": 677 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 455.5, "epoch": 0.5424, "grad_norm": 1.6421316050848016, "kl": 1.6953125, "learning_rate": 4.607446451711372e-06, "loss": 0.0678, "reward": 1.765625, "reward_std": 0.2414703369140625, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.96875, "step": 678 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 378.921875, "epoch": 0.5432, "grad_norm": 1.1696105580122462, "kl": 0.224609375, "learning_rate": 4.606319038469443e-06, "loss": 0.009, "reward": 1.5, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 679 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 421.375, "epoch": 0.544, "grad_norm": 0.6993453019710336, "kl": 0.84765625, "learning_rate": 4.605190146931731e-06, "loss": 0.0337, "reward": 1.71875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 680 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 364.59375, "epoch": 0.5448, "grad_norm": 1.0028918860428877, "kl": 0.55078125, "learning_rate": 4.604059777890537e-06, "loss": 0.0221, "reward": 1.515625, "reward_std": 0.189372718334198, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.984375, "step": 681 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 371.484375, "epoch": 0.5456, "grad_norm": 0.9951682898291854, "kl": 0.703125, "learning_rate": 4.602927932139197e-06, "loss": 0.0282, "reward": 1.671875, "reward_std": 0.20189079642295837, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.984375, "step": 682 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 409.0625, "epoch": 0.5464, "grad_norm": 0.5659555810744075, "kl": 0.59765625, "learning_rate": 4.601794610472083e-06, "loss": 0.024, "reward": 1.609375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.984375, "step": 683 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 345.3125, "epoch": 0.5472, "grad_norm": 0.9820024220569756, "kl": 1.9453125, "learning_rate": 4.6006598136846056e-06, "loss": 0.0779, "reward": 1.703125, "reward_std": 0.3012469410896301, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.953125, "step": 684 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 398.5, "epoch": 0.548, "grad_norm": 1.6200290850707568, "kl": 1.46875, "learning_rate": 4.599523542573207e-06, "loss": 0.0586, "reward": 1.59375, "reward_std": 0.2346404492855072, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.96875, "step": 685 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 369.875, "epoch": 0.5488, "grad_norm": 1.1585081024064825, "kl": 0.404296875, "learning_rate": 4.598385797935368e-06, "loss": 0.0162, "reward": 1.53125, "reward_std": 0.2346404492855072, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.984375, "step": 686 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 389.828125, "epoch": 0.5496, "grad_norm": 0.5636989474360153, "kl": 0.12353515625, "learning_rate": 4.5972465805696e-06, "loss": 0.0049, "reward": 1.703125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 687 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 343.234375, "epoch": 0.5504, "grad_norm": 0.7489029107882763, "kl": 0.125, "learning_rate": 4.596105891275449e-06, "loss": 0.005, "reward": 1.65625, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 688 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 366.140625, "epoch": 0.5512, "grad_norm": 0.07564483123034887, "kl": 0.11474609375, "learning_rate": 4.594963730853497e-06, "loss": 0.0046, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 689 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 329.015625, "epoch": 0.552, "grad_norm": 0.7715043962696903, "kl": 0.1318359375, "learning_rate": 4.593820100105355e-06, "loss": 0.0053, "reward": 1.8125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 690 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 408.4375, "epoch": 0.5528, "grad_norm": 1.5903541288745378, "kl": 0.13671875, "learning_rate": 4.5926749998336665e-06, "loss": 0.0054, "reward": 1.6875, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 691 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 384.09375, "epoch": 0.5536, "grad_norm": 0.8409980617832177, "kl": 0.12890625, "learning_rate": 4.5915284308421075e-06, "loss": 0.0051, "reward": 1.71875, "reward_std": 0.1872510462999344, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 692 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 407.84375, "epoch": 0.5544, "grad_norm": 0.11505513163004885, "kl": 0.11767578125, "learning_rate": 4.590380393935383e-06, "loss": 0.0047, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 693 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 337.8125, "epoch": 0.5552, "grad_norm": 1.0015722524145403, "kl": 0.134765625, "learning_rate": 4.589230889919232e-06, "loss": 0.0054, "reward": 1.796875, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 694 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 407.96875, "epoch": 0.556, "grad_norm": 0.7181094305585296, "kl": 0.7421875, "learning_rate": 4.588079919600419e-06, "loss": 0.0297, "reward": 1.578125, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.984375, "step": 695 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 383.515625, "epoch": 0.5568, "grad_norm": 1.032777656021463, "kl": 0.80859375, "learning_rate": 4.586927483786739e-06, "loss": 0.0324, "reward": 1.65625, "reward_std": 0.20266614854335785, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.984375, "step": 696 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 399.96875, "epoch": 0.5576, "grad_norm": 0.7345604816782041, "kl": 0.30859375, "learning_rate": 4.585773583287017e-06, "loss": 0.0124, "reward": 1.6875, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 697 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 417.484375, "epoch": 0.5584, "grad_norm": 1.0895463669750132, "kl": 0.73828125, "learning_rate": 4.584618218911104e-06, "loss": 0.0296, "reward": 1.578125, "reward_std": 0.25726157426834106, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.984375, "step": 698 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 351.46875, "epoch": 0.5592, "grad_norm": 1.597638201896758, "kl": 0.16015625, "learning_rate": 4.583461391469879e-06, "loss": 0.0064, "reward": 1.546875, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.984375, "step": 699 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 374.21875, "epoch": 0.56, "grad_norm": 0.5662371958432129, "kl": 0.119140625, "learning_rate": 4.582303101775249e-06, "loss": 0.0048, "reward": 1.65625, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 700 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 388.984375, "epoch": 0.5608, "grad_norm": 2.0291530281532735, "kl": 0.87890625, "learning_rate": 4.581143350640146e-06, "loss": 0.0352, "reward": 1.828125, "reward_std": 0.2993340790271759, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.96875, "step": 701 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 401.328125, "epoch": 0.5616, "grad_norm": 0.6348448126263817, "kl": 0.5625, "learning_rate": 4.579982138878527e-06, "loss": 0.0224, "reward": 1.84375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.984375, "step": 702 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 416.28125, "epoch": 0.5624, "grad_norm": 0.9368449426361695, "kl": 2.140625, "learning_rate": 4.578819467305375e-06, "loss": 0.0855, "reward": 1.625, "reward_std": 0.3049619197845459, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.953125, "step": 703 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 421.015625, "epoch": 0.5632, "grad_norm": 0.913545916055359, "kl": 2.15625, "learning_rate": 4.5776553367367e-06, "loss": 0.086, "reward": 1.5625, "reward_std": 0.2651650309562683, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.953125, "step": 704 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 418.625, "epoch": 0.564, "grad_norm": 0.6110781288263009, "kl": 0.12353515625, "learning_rate": 4.576489747989532e-06, "loss": 0.0049, "reward": 1.671875, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 705 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 383.5625, "epoch": 0.5648, "grad_norm": 0.755254769811337, "kl": 0.74609375, "learning_rate": 4.575322701881926e-06, "loss": 0.0297, "reward": 1.46875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.984375, "step": 706 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 425.125, "epoch": 0.5656, "grad_norm": 1.247168348852021, "kl": 0.953125, "learning_rate": 4.57415419923296e-06, "loss": 0.0381, "reward": 1.578125, "reward_std": 0.2414703369140625, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.984375, "step": 707 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 424.328125, "epoch": 0.5664, "grad_norm": 0.6235588189660318, "kl": 0.1259765625, "learning_rate": 4.572984240862733e-06, "loss": 0.0051, "reward": 1.671875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 708 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 388.453125, "epoch": 0.5672, "grad_norm": 1.547770321678808, "kl": 4.03125, "learning_rate": 4.57181282759237e-06, "loss": 0.1616, "reward": 1.46875, "reward_std": 0.5385891795158386, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.890625, "step": 709 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 406.375, "epoch": 0.568, "grad_norm": 0.7482968349890383, "kl": 1.0, "learning_rate": 4.570639960244011e-06, "loss": 0.0401, "reward": 1.546875, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.984375, "step": 710 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 440.59375, "epoch": 0.5688, "grad_norm": 0.8559596436138244, "kl": 0.111328125, "learning_rate": 4.56946563964082e-06, "loss": 0.0045, "reward": 1.859375, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 711 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 409.96875, "epoch": 0.5696, "grad_norm": 1.0352416021885698, "kl": 0.126953125, "learning_rate": 4.5682898666069815e-06, "loss": 0.0051, "reward": 1.671875, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 712 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 403.21875, "epoch": 0.5704, "grad_norm": 1.0665806428938223, "kl": 0.6875, "learning_rate": 4.567112641967697e-06, "loss": 0.0274, "reward": 1.578125, "reward_std": 0.2662131190299988, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.984375, "step": 713 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.5625, "epoch": 0.5712, "grad_norm": 1.4321423266016051, "kl": 0.1279296875, "learning_rate": 4.5659339665491894e-06, "loss": 0.0051, "reward": 1.6875, "reward_std": 0.22461533546447754, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 714 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 435.9375, "epoch": 0.572, "grad_norm": 0.6344378789069257, "kl": 0.11328125, "learning_rate": 4.5647538411786965e-06, "loss": 0.0045, "reward": 1.546875, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 715 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.0, "epoch": 0.5728, "grad_norm": 0.6581202971018658, "kl": 0.1259765625, "learning_rate": 4.563572266684478e-06, "loss": 0.005, "reward": 1.84375, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 716 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 442.890625, "epoch": 0.5736, "grad_norm": 1.1452932072382693, "kl": 0.478515625, "learning_rate": 4.562389243895807e-06, "loss": 0.0191, "reward": 1.546875, "reward_std": 0.19044627249240875, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.96875, "step": 717 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 432.90625, "epoch": 0.5744, "grad_norm": 0.5898405189255272, "kl": 0.703125, "learning_rate": 4.561204773642974e-06, "loss": 0.0282, "reward": 1.703125, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.984375, "step": 718 }, { "all_correct": 0.125, "all_wrong": 0.5, "completion_length": 388.796875, "epoch": 0.5752, "grad_norm": 0.7806688656878049, "kl": 0.1455078125, "learning_rate": 4.5600188567572874e-06, "loss": 0.0058, "reward": 1.375, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "step": 719 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 430.78125, "epoch": 0.576, "grad_norm": 0.8776508767164352, "kl": 2.28125, "learning_rate": 4.558831494071069e-06, "loss": 0.0909, "reward": 1.640625, "reward_std": 0.329858660697937, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.953125, "step": 720 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 399.203125, "epoch": 0.5768, "grad_norm": 1.0941455692436421, "kl": 1.046875, "learning_rate": 4.557642686417654e-06, "loss": 0.0418, "reward": 1.6875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.96875, "step": 721 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 457.28125, "epoch": 0.5776, "grad_norm": 0.6712978048188565, "kl": 1.9140625, "learning_rate": 4.556452434631396e-06, "loss": 0.0764, "reward": 1.671875, "reward_std": 0.22097086906433105, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.953125, "step": 722 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 443.734375, "epoch": 0.5784, "grad_norm": 0.7381385221923685, "kl": 0.134765625, "learning_rate": 4.555260739547657e-06, "loss": 0.0054, "reward": 1.5625, "reward_std": 0.2041158676147461, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 723 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 423.421875, "epoch": 0.5792, "grad_norm": 0.688014727599496, "kl": 0.6484375, "learning_rate": 4.554067602002815e-06, "loss": 0.0259, "reward": 1.640625, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 724 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 419.796875, "epoch": 0.58, "grad_norm": 1.8472595817554485, "kl": 0.84375, "learning_rate": 4.55287302283426e-06, "loss": 0.0335, "reward": 1.609375, "reward_std": 0.19044627249240875, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.96875, "step": 725 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 460.46875, "epoch": 0.5808, "grad_norm": 0.8264857423169704, "kl": 2.21875, "learning_rate": 4.551677002880395e-06, "loss": 0.0891, "reward": 1.875, "reward_std": 0.2619796395301819, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 0.953125, "step": 726 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 417.484375, "epoch": 0.5816, "grad_norm": 0.785843803400338, "kl": 0.1181640625, "learning_rate": 4.550479542980632e-06, "loss": 0.0047, "reward": 1.859375, "reward_std": 0.17358146607875824, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 727 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 424.0, "epoch": 0.5824, "grad_norm": 0.8086127282101715, "kl": 0.69921875, "learning_rate": 4.549280643975394e-06, "loss": 0.0279, "reward": 1.71875, "reward_std": 0.2177756279706955, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 728 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 429.9375, "epoch": 0.5832, "grad_norm": 0.761908604882218, "kl": 1.2890625, "learning_rate": 4.548080306706114e-06, "loss": 0.0516, "reward": 1.578125, "reward_std": 0.22097086906433105, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.96875, "step": 729 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 383.625, "epoch": 0.584, "grad_norm": 0.9352748214370604, "kl": 0.453125, "learning_rate": 4.5468785320152365e-06, "loss": 0.0182, "reward": 1.703125, "reward_std": 0.25726157426834106, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.984375, "step": 730 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 369.546875, "epoch": 0.5848, "grad_norm": 1.199317088782103, "kl": 0.359375, "learning_rate": 4.545675320746212e-06, "loss": 0.0144, "reward": 1.75, "reward_std": 0.3197399973869324, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.984375, "step": 731 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 386.796875, "epoch": 0.5856, "grad_norm": 1.0920792282092384, "kl": 0.11962890625, "learning_rate": 4.544470673743502e-06, "loss": 0.0048, "reward": 1.328125, "reward_std": 0.2472364753484726, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 1.0, "step": 732 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 476.21875, "epoch": 0.5864, "grad_norm": 4.64403574438647, "kl": 0.11767578125, "learning_rate": 4.543264591852572e-06, "loss": 0.0047, "reward": 1.65625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 733 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 404.21875, "epoch": 0.5872, "grad_norm": 0.5808856463061708, "kl": 0.671875, "learning_rate": 4.542057075919898e-06, "loss": 0.0269, "reward": 1.796875, "reward_std": 0.1530819833278656, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.984375, "step": 734 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 408.96875, "epoch": 0.588, "grad_norm": 0.9289541622106935, "kl": 0.88671875, "learning_rate": 4.54084812679296e-06, "loss": 0.0353, "reward": 1.484375, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.984375, "step": 735 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 399.09375, "epoch": 0.5888, "grad_norm": 0.5923941542606462, "kl": 0.1337890625, "learning_rate": 4.539637745320247e-06, "loss": 0.0053, "reward": 1.59375, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 736 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 420.046875, "epoch": 0.5896, "grad_norm": 0.5623431689697669, "kl": 0.11474609375, "learning_rate": 4.53842593235125e-06, "loss": 0.0046, "reward": 1.71875, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 737 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 397.015625, "epoch": 0.5904, "grad_norm": 0.7521836983481913, "kl": 0.77734375, "learning_rate": 4.537212688736466e-06, "loss": 0.031, "reward": 1.578125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.984375, "step": 738 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 431.0, "epoch": 0.5912, "grad_norm": 1.1721982513328153, "kl": 0.54296875, "learning_rate": 4.535998015327396e-06, "loss": 0.0217, "reward": 1.546875, "reward_std": 0.22097086906433105, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.96875, "step": 739 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 388.234375, "epoch": 0.592, "grad_norm": 1.1851939620543772, "kl": 0.54296875, "learning_rate": 4.534781912976546e-06, "loss": 0.0217, "reward": 1.53125, "reward_std": 0.24251843988895416, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.984375, "step": 740 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 385.71875, "epoch": 0.5928, "grad_norm": 1.1869725036453096, "kl": 0.455078125, "learning_rate": 4.533564382537421e-06, "loss": 0.0181, "reward": 1.6875, "reward_std": 0.23356689512729645, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.984375, "step": 741 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 384.046875, "epoch": 0.5936, "grad_norm": 0.5718438675844124, "kl": 0.12060546875, "learning_rate": 4.532345424864533e-06, "loss": 0.0048, "reward": 1.96875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 742 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 455.265625, "epoch": 0.5944, "grad_norm": 5.584850455407113, "kl": 0.431640625, "learning_rate": 4.531125040813392e-06, "loss": 0.0172, "reward": 1.921875, "reward_std": 0.19044627249240875, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 0.96875, "step": 743 }, { "all_correct": 0.0, "all_wrong": 0.5, "completion_length": 477.796875, "epoch": 0.5952, "grad_norm": 51.22100161387199, "kl": 30.0, "learning_rate": 4.529903231240511e-06, "loss": 1.1987, "reward": 0.5, "reward_std": 0.46577346324920654, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.3125, "step": 744 }, { "all_correct": 0.0, "all_wrong": 0.875, "completion_length": 517.625, "epoch": 0.596, "grad_norm": 39.20600833098527, "kl": 29.0, "learning_rate": 4.528679997003403e-06, "loss": 1.1599, "reward": 0.328125, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.25, "step": 745 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 529.28125, "epoch": 0.5968, "grad_norm": 20.64166517450329, "kl": 15.25, "learning_rate": 4.52745533896058e-06, "loss": 0.6117, "reward": 0.90625, "reward_std": 0.3945944905281067, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.484375, "step": 746 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 613.9375, "epoch": 0.5976, "grad_norm": 12.692577062543846, "kl": 11.1875, "learning_rate": 4.526229257971556e-06, "loss": 0.4475, "reward": 0.90625, "reward_std": 0.7484728097915649, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.40625, "step": 747 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 614.953125, "epoch": 0.5984, "grad_norm": 10.667956388663862, "kl": 4.0625, "learning_rate": 4.52500175489684e-06, "loss": 0.1625, "reward": 1.0, "reward_std": 0.6321523189544678, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.484375, "step": 748 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 466.03125, "epoch": 0.5992, "grad_norm": 3.5404744863131983, "kl": 0.5859375, "learning_rate": 4.523772830597942e-06, "loss": 0.0234, "reward": 1.1875, "reward_std": 0.3434731066226959, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.671875, "step": 749 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 378.0, "epoch": 0.6, "grad_norm": 1.7548650503096686, "kl": 0.1298828125, "learning_rate": 4.522542485937369e-06, "loss": 0.0052, "reward": 1.625, "reward_std": 0.2619796395301819, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.96875, "step": 750 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 468.15625, "epoch": 0.6008, "grad_norm": 1.6651771330256788, "kl": 0.169921875, "learning_rate": 4.521310721778622e-06, "loss": 0.0068, "reward": 1.15625, "reward_std": 0.33090677857398987, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.65625, "step": 751 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 449.609375, "epoch": 0.6016, "grad_norm": 1.0959913148107947, "kl": 0.142578125, "learning_rate": 4.520077538986203e-06, "loss": 0.0057, "reward": 1.640625, "reward_std": 0.23925508558750153, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.796875, "step": 752 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 442.046875, "epoch": 0.6024, "grad_norm": 0.6686004200543701, "kl": 0.1259765625, "learning_rate": 4.518842938425606e-06, "loss": 0.005, "reward": 1.78125, "reward_std": 0.1872510462999344, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 753 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 396.015625, "epoch": 0.6032, "grad_norm": 0.5521083003632641, "kl": 0.134765625, "learning_rate": 4.51760692096332e-06, "loss": 0.0054, "reward": 1.515625, "reward_std": 0.1315089464187622, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 1.0, "step": 754 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 418.421875, "epoch": 0.604, "grad_norm": 0.576389331229294, "kl": 0.119140625, "learning_rate": 4.516369487466832e-06, "loss": 0.0048, "reward": 1.625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 755 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.640625, "epoch": 0.6048, "grad_norm": 0.9091605258035641, "kl": 0.1337890625, "learning_rate": 4.5151306388046175e-06, "loss": 0.0053, "reward": 1.578125, "reward_std": 0.2956691384315491, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.921875, "step": 756 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 370.828125, "epoch": 0.6056, "grad_norm": 0.7456635203896337, "kl": 0.1318359375, "learning_rate": 4.513890375846152e-06, "loss": 0.0053, "reward": 1.546875, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.984375, "step": 757 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 374.53125, "epoch": 0.6064, "grad_norm": 0.4054554163375172, "kl": 0.138671875, "learning_rate": 4.512648699461897e-06, "loss": 0.0055, "reward": 1.78125, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 758 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 353.203125, "epoch": 0.6072, "grad_norm": 0.5153817014977968, "kl": 0.1416015625, "learning_rate": 4.511405610523309e-06, "loss": 0.0057, "reward": 1.453125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 1.0, "step": 759 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 348.015625, "epoch": 0.608, "grad_norm": 0.9645685741953072, "kl": 0.13671875, "learning_rate": 4.510161109902837e-06, "loss": 0.0055, "reward": 1.6875, "reward_std": 0.23319074511528015, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.96875, "step": 760 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 352.625, "epoch": 0.6088, "grad_norm": 0.6561053834688735, "kl": 0.1357421875, "learning_rate": 4.508915198473919e-06, "loss": 0.0055, "reward": 1.71875, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 761 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 371.453125, "epoch": 0.6096, "grad_norm": 0.4154680865776334, "kl": 0.1328125, "learning_rate": 4.507667877110982e-06, "loss": 0.0053, "reward": 1.703125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 762 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 334.875, "epoch": 0.6104, "grad_norm": 1.0690327606009025, "kl": 0.1357421875, "learning_rate": 4.506419146689445e-06, "loss": 0.0054, "reward": 1.484375, "reward_std": 0.2198973000049591, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 1.0, "step": 763 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 357.59375, "epoch": 0.6112, "grad_norm": 0.09845757431172224, "kl": 0.12109375, "learning_rate": 4.505169008085717e-06, "loss": 0.0049, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 764 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 352.3125, "epoch": 0.612, "grad_norm": 0.9511197576575076, "kl": 0.12890625, "learning_rate": 4.503917462177192e-06, "loss": 0.0051, "reward": 1.6875, "reward_std": 0.2041158676147461, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 765 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 354.78125, "epoch": 0.6128, "grad_norm": 0.3822778374443259, "kl": 0.12890625, "learning_rate": 4.5026645098422515e-06, "loss": 0.0051, "reward": 1.734375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 766 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 368.890625, "epoch": 0.6136, "grad_norm": 0.8657068830504863, "kl": 0.1328125, "learning_rate": 4.5014101519602684e-06, "loss": 0.0053, "reward": 1.515625, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 1.0, "step": 767 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 350.25, "epoch": 0.6144, "grad_norm": 0.6166122855295121, "kl": 0.1328125, "learning_rate": 4.500154389411598e-06, "loss": 0.0053, "reward": 1.703125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 768 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 360.21875, "epoch": 0.6152, "grad_norm": 0.8540974873902282, "kl": 0.126953125, "learning_rate": 4.498897223077582e-06, "loss": 0.0051, "reward": 1.6875, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 769 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 371.90625, "epoch": 0.616, "grad_norm": 1.2215144813460077, "kl": 0.134765625, "learning_rate": 4.49763865384055e-06, "loss": 0.0054, "reward": 1.59375, "reward_std": 0.2040124535560608, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.96875, "step": 770 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 378.484375, "epoch": 0.6168, "grad_norm": 0.8932655228788702, "kl": 0.12890625, "learning_rate": 4.496378682583813e-06, "loss": 0.0052, "reward": 1.796875, "reward_std": 0.22673699259757996, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.984375, "step": 771 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 311.90625, "epoch": 0.6176, "grad_norm": 0.8671336585205175, "kl": 0.1416015625, "learning_rate": 4.495117310191667e-06, "loss": 0.0057, "reward": 1.4375, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 772 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 321.75, "epoch": 0.6184, "grad_norm": 0.8212597306559153, "kl": 0.140625, "learning_rate": 4.493854537549393e-06, "loss": 0.0056, "reward": 1.546875, "reward_std": 0.17358146607875824, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 773 }, { "all_correct": 0.375, "all_wrong": 0.5, "completion_length": 301.171875, "epoch": 0.6192, "grad_norm": 1.5452424345377327, "kl": 0.1328125, "learning_rate": 4.492590365543253e-06, "loss": 0.0053, "reward": 1.453125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 1.0, "step": 774 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 347.703125, "epoch": 0.62, "grad_norm": 0.8303947701075123, "kl": 0.119140625, "learning_rate": 4.491324795060491e-06, "loss": 0.0048, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 775 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 363.9375, "epoch": 0.6208, "grad_norm": 0.7069099324776755, "kl": 0.11376953125, "learning_rate": 4.490057826989333e-06, "loss": 0.0045, "reward": 1.375, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "step": 776 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 319.453125, "epoch": 0.6216, "grad_norm": 2.817713219267624, "kl": 0.12158203125, "learning_rate": 4.488789462218988e-06, "loss": 0.0049, "reward": 1.734375, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 777 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 366.40625, "epoch": 0.6224, "grad_norm": 0.47685500021164945, "kl": 0.1171875, "learning_rate": 4.487519701639641e-06, "loss": 0.0047, "reward": 1.90625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 778 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 399.75, "epoch": 0.6232, "grad_norm": 0.566943640083845, "kl": 0.1083984375, "learning_rate": 4.486248546142459e-06, "loss": 0.0043, "reward": 1.703125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 779 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 323.515625, "epoch": 0.624, "grad_norm": 0.9569549841774113, "kl": 0.12353515625, "learning_rate": 4.4849759966195885e-06, "loss": 0.0049, "reward": 1.4375, "reward_std": 0.24251842498779297, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.984375, "step": 780 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 393.875, "epoch": 0.6248, "grad_norm": 0.6087250698429332, "kl": 0.1279296875, "learning_rate": 4.483702053964154e-06, "loss": 0.0051, "reward": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 781 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 409.75, "epoch": 0.6256, "grad_norm": 0.7636360924647048, "kl": 0.11328125, "learning_rate": 4.482426719070258e-06, "loss": 0.0045, "reward": 1.75, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 782 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 388.59375, "epoch": 0.6264, "grad_norm": 0.9792797804278635, "kl": 0.125, "learning_rate": 4.4811499928329775e-06, "loss": 0.005, "reward": 1.4375, "reward_std": 0.2734241485595703, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.96875, "step": 783 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 425.796875, "epoch": 0.6272, "grad_norm": 76.4412477834923, "kl": 2.5, "learning_rate": 4.479871876148368e-06, "loss": 0.0995, "reward": 1.78125, "reward_std": 0.19506090879440308, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.96875, "step": 784 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 420.34375, "epoch": 0.628, "grad_norm": 0.5444322017275038, "kl": 0.125, "learning_rate": 4.478592369913464e-06, "loss": 0.005, "reward": 1.53125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.984375, "step": 785 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 458.703125, "epoch": 0.6288, "grad_norm": 0.08333092753592482, "kl": 0.12353515625, "learning_rate": 4.477311475026271e-06, "loss": 0.0049, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 786 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 441.546875, "epoch": 0.6296, "grad_norm": 0.32721957273627983, "kl": 0.115234375, "learning_rate": 4.476029192385769e-06, "loss": 0.0046, "reward": 1.90625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 787 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 452.875, "epoch": 0.6304, "grad_norm": 0.565466382848136, "kl": 0.11181640625, "learning_rate": 4.474745522891915e-06, "loss": 0.0045, "reward": 1.6875, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 788 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 377.546875, "epoch": 0.6312, "grad_norm": 0.8991346765306769, "kl": 0.126953125, "learning_rate": 4.473460467445637e-06, "loss": 0.0051, "reward": 1.65625, "reward_std": 0.2346404492855072, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.953125, "step": 789 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 452.40625, "epoch": 0.632, "grad_norm": 0.7590185646436524, "kl": 0.11669921875, "learning_rate": 4.472174026948836e-06, "loss": 0.0047, "reward": 1.71875, "reward_std": 0.23356688022613525, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 790 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 434.125, "epoch": 0.6328, "grad_norm": 0.6733327069903833, "kl": 0.123046875, "learning_rate": 4.470886202304385e-06, "loss": 0.0049, "reward": 1.609375, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.984375, "step": 791 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 352.796875, "epoch": 0.6336, "grad_norm": 0.8714933056678619, "kl": 0.140625, "learning_rate": 4.469596994416131e-06, "loss": 0.0056, "reward": 1.65625, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 792 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 369.28125, "epoch": 0.6344, "grad_norm": 2.659537158653402, "kl": 0.1396484375, "learning_rate": 4.468306404188887e-06, "loss": 0.0056, "reward": 1.484375, "reward_std": 0.26196980476379395, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.984375, "step": 793 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 447.296875, "epoch": 0.6352, "grad_norm": 24.563601751105438, "kl": 0.224609375, "learning_rate": 4.467014432528441e-06, "loss": 0.009, "reward": 1.484375, "reward_std": 0.2414703369140625, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.96875, "step": 794 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 393.484375, "epoch": 0.636, "grad_norm": 1.4850003247494263, "kl": 0.134765625, "learning_rate": 4.465721080341547e-06, "loss": 0.0054, "reward": 1.796875, "reward_std": 0.1530819833278656, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 795 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 373.96875, "epoch": 0.6368, "grad_norm": 1.44494639151913, "kl": 0.12890625, "learning_rate": 4.4644263485359316e-06, "loss": 0.0052, "reward": 1.515625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.984375, "step": 796 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 392.328125, "epoch": 0.6376, "grad_norm": 0.4537455588808983, "kl": 0.134765625, "learning_rate": 4.463130238020284e-06, "loss": 0.0054, "reward": 1.53125, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.984375, "step": 797 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 439.4375, "epoch": 0.6384, "grad_norm": 0.4835674647328229, "kl": 0.109375, "learning_rate": 4.4618327497042676e-06, "loss": 0.0044, "reward": 1.625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.984375, "step": 798 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 361.4375, "epoch": 0.6392, "grad_norm": 1.2639303070692236, "kl": 0.134765625, "learning_rate": 4.460533884498509e-06, "loss": 0.0054, "reward": 1.421875, "reward_std": 0.32878512144088745, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.96875, "step": 799 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 386.453125, "epoch": 0.64, "grad_norm": 1.0922796218789752, "kl": 0.123046875, "learning_rate": 4.4592336433146e-06, "loss": 0.0049, "reward": 1.609375, "reward_std": 0.26196980476379395, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.984375, "step": 800 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 390.453125, "epoch": 0.6408, "grad_norm": 1.0156191846007012, "kl": 0.1240234375, "learning_rate": 4.457932027065102e-06, "loss": 0.005, "reward": 1.53125, "reward_std": 0.2346404492855072, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.96875, "step": 801 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 351.078125, "epoch": 0.6416, "grad_norm": 0.8198981311230296, "kl": 0.12255859375, "learning_rate": 4.456629036663537e-06, "loss": 0.0049, "reward": 1.65625, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 802 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 468.921875, "epoch": 0.6424, "grad_norm": 0.5991400033831288, "kl": 0.1318359375, "learning_rate": 4.455324673024396e-06, "loss": 0.0053, "reward": 1.9375, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 0.96875, "step": 803 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 422.796875, "epoch": 0.6432, "grad_norm": 0.6692124112556407, "kl": 0.1318359375, "learning_rate": 4.4540189370631315e-06, "loss": 0.0053, "reward": 1.515625, "reward_std": 0.24831004440784454, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.984375, "step": 804 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 438.140625, "epoch": 0.644, "grad_norm": 6.832094630975682, "kl": 250.0, "learning_rate": 4.452711829696158e-06, "loss": 10.0082, "reward": 1.390625, "reward_std": 0.1889965683221817, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.96875, "step": 805 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 374.90625, "epoch": 0.6448, "grad_norm": 0.9717012307975434, "kl": 0.1376953125, "learning_rate": 4.451403351840855e-06, "loss": 0.0055, "reward": 1.609375, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.953125, "step": 806 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 449.40625, "epoch": 0.6456, "grad_norm": 0.956411793550628, "kl": 0.130859375, "learning_rate": 4.450093504415562e-06, "loss": 0.0052, "reward": 1.578125, "reward_std": 0.24820663034915924, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.953125, "step": 807 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 381.015625, "epoch": 0.6464, "grad_norm": 0.9250986580827515, "kl": 0.125, "learning_rate": 4.44878228833958e-06, "loss": 0.005, "reward": 1.546875, "reward_std": 0.1530819833278656, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.984375, "step": 808 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 407.9375, "epoch": 0.6472, "grad_norm": 7.617951227193584, "kl": 215.0, "learning_rate": 4.447469704533172e-06, "loss": 8.5714, "reward": 1.515625, "reward_std": 0.29317614436149597, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.921875, "step": 809 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 430.40625, "epoch": 0.648, "grad_norm": 1.5574071141834538, "kl": 0.1591796875, "learning_rate": 4.446155753917559e-06, "loss": 0.0064, "reward": 1.328125, "reward_std": 0.3298586905002594, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.921875, "step": 810 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 395.078125, "epoch": 0.6488, "grad_norm": 1.160622746543496, "kl": 71.5, "learning_rate": 4.444840437414923e-06, "loss": 2.8578, "reward": 1.5, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.953125, "step": 811 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 393.78125, "epoch": 0.6496, "grad_norm": 1.2594757986222074, "kl": 0.12890625, "learning_rate": 4.443523755948401e-06, "loss": 0.0052, "reward": 1.828125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.984375, "step": 812 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 460.046875, "epoch": 0.6504, "grad_norm": 1.1971191096520515, "kl": 0.142578125, "learning_rate": 4.442205710442095e-06, "loss": 0.0057, "reward": 1.453125, "reward_std": 0.2877861559391022, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.984375, "step": 813 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 453.546875, "epoch": 0.6512, "grad_norm": 0.09472697828466468, "kl": 0.1083984375, "learning_rate": 4.4408863018210564e-06, "loss": 0.0043, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 814 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 432.6875, "epoch": 0.652, "grad_norm": 2.7013955744300415, "kl": 0.2412109375, "learning_rate": 4.439565531011299e-06, "loss": 0.0096, "reward": 1.671875, "reward_std": 0.46608567237854004, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.890625, "step": 815 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 365.703125, "epoch": 0.6528, "grad_norm": 1.44901882627503, "kl": 0.328125, "learning_rate": 4.43824339893979e-06, "loss": 0.0131, "reward": 1.640625, "reward_std": 0.2472364604473114, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.9375, "step": 816 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 485.84375, "epoch": 0.6536, "grad_norm": 3.6030275380795476, "kl": 0.431640625, "learning_rate": 4.436919906534452e-06, "loss": 0.0172, "reward": 1.515625, "reward_std": 0.4057711362838745, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.875, "step": 817 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 451.875, "epoch": 0.6544, "grad_norm": 3.9299901435845896, "kl": 0.72265625, "learning_rate": 4.4355950547241645e-06, "loss": 0.0289, "reward": 1.6875, "reward_std": 0.4538106322288513, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.90625, "step": 818 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 495.71875, "epoch": 0.6552, "grad_norm": 4.264532891042201, "kl": 169.0, "learning_rate": 4.434268844438758e-06, "loss": 6.7844, "reward": 1.265625, "reward_std": 0.4659822881221771, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.8125, "step": 819 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 416.625, "epoch": 0.656, "grad_norm": 10.862042633192596, "kl": 316.0, "learning_rate": 4.432941276609018e-06, "loss": 12.6175, "reward": 1.578125, "reward_std": 0.5162889957427979, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.890625, "step": 820 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 420.71875, "epoch": 0.6568, "grad_norm": 9.047241686718142, "kl": 248.0, "learning_rate": 4.431612352166684e-06, "loss": 9.9225, "reward": 1.4375, "reward_std": 0.5500767827033997, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.859375, "step": 821 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 518.984375, "epoch": 0.6576, "grad_norm": 12.760883469416578, "kl": 496.0, "learning_rate": 4.4302820720444454e-06, "loss": 19.7786, "reward": 1.21875, "reward_std": 0.7390350699424744, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.734375, "step": 822 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 437.6875, "epoch": 0.6584, "grad_norm": 72.80389942581843, "kl": 528.0, "learning_rate": 4.428950437175944e-06, "loss": 21.0454, "reward": 1.59375, "reward_std": 0.633817195892334, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.859375, "step": 823 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 397.421875, "epoch": 0.6592, "grad_norm": 3.522463920196615, "kl": 4.1875, "learning_rate": 4.427617448495772e-06, "loss": 0.1671, "reward": 1.421875, "reward_std": 0.589083194732666, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.84375, "step": 824 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 336.109375, "epoch": 0.66, "grad_norm": 1.248801220246186, "kl": 1.7109375, "learning_rate": 4.426283106939474e-06, "loss": 0.0683, "reward": 1.78125, "reward_std": 0.33225804567337036, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.921875, "step": 825 }, { "all_correct": 0.0, "all_wrong": 0.375, "completion_length": 411.671875, "epoch": 0.6608, "grad_norm": 5.90413161193254, "kl": 4.90625, "learning_rate": 4.424947413443539e-06, "loss": 0.196, "reward": 1.265625, "reward_std": 0.46100130677223206, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.890625, "step": 826 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 306.03125, "epoch": 0.6616, "grad_norm": 2.5172821672381973, "kl": 1.609375, "learning_rate": 4.423610368945411e-06, "loss": 0.0646, "reward": 1.5625, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.9375, "step": 827 }, { "all_correct": 0.125, "all_wrong": 0.5, "completion_length": 382.203125, "epoch": 0.6624, "grad_norm": 4.785093259770447, "kl": 1.109375, "learning_rate": 4.422271974383479e-06, "loss": 0.0445, "reward": 1.265625, "reward_std": 0.31761831045150757, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.9375, "step": 828 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 323.203125, "epoch": 0.6632, "grad_norm": 2.3084817675871903, "kl": 0.9296875, "learning_rate": 4.420932230697079e-06, "loss": 0.0373, "reward": 1.40625, "reward_std": 0.2705550193786621, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.96875, "step": 829 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 330.15625, "epoch": 0.664, "grad_norm": 0.4928413062199495, "kl": 0.1650390625, "learning_rate": 4.419591138826495e-06, "loss": 0.0066, "reward": 1.734375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 830 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 311.53125, "epoch": 0.6648, "grad_norm": 0.923167756488755, "kl": 0.1640625, "learning_rate": 4.418248699712955e-06, "loss": 0.0065, "reward": 1.703125, "reward_std": 0.23568855226039886, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 831 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 296.578125, "epoch": 0.6656, "grad_norm": 0.6476239243055839, "kl": 0.79296875, "learning_rate": 4.416904914298637e-06, "loss": 0.0317, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 832 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 322.5625, "epoch": 0.6664, "grad_norm": 0.5599649961332619, "kl": 0.1552734375, "learning_rate": 4.415559783526661e-06, "loss": 0.0062, "reward": 1.890625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 833 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 297.875, "epoch": 0.6672, "grad_norm": 0.8868100745745057, "kl": 0.1611328125, "learning_rate": 4.414213308341092e-06, "loss": 0.0064, "reward": 1.75, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.96875, "step": 834 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 301.125, "epoch": 0.668, "grad_norm": 1.1584434850079184, "kl": 0.16796875, "learning_rate": 4.412865489686936e-06, "loss": 0.0067, "reward": 1.828125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.96875, "step": 835 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 275.0625, "epoch": 0.6688, "grad_norm": 0.8900494786989672, "kl": 0.1513671875, "learning_rate": 4.411516328510145e-06, "loss": 0.0061, "reward": 1.875, "reward_std": 0.17570312321186066, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 836 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 300.328125, "epoch": 0.6696, "grad_norm": 0.8115495134521837, "kl": 0.146484375, "learning_rate": 4.410165825757613e-06, "loss": 0.0059, "reward": 1.78125, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 837 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 330.328125, "epoch": 0.6704, "grad_norm": 0.896029068850484, "kl": 0.14453125, "learning_rate": 4.408813982377175e-06, "loss": 0.0058, "reward": 1.625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.984375, "step": 838 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 314.421875, "epoch": 0.6712, "grad_norm": 0.5187846213444184, "kl": 0.1328125, "learning_rate": 4.407460799317605e-06, "loss": 0.0053, "reward": 1.609375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 839 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 310.734375, "epoch": 0.672, "grad_norm": 1.1498330504928742, "kl": 0.86328125, "learning_rate": 4.40610627752862e-06, "loss": 0.0347, "reward": 1.53125, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.984375, "step": 840 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 347.21875, "epoch": 0.6728, "grad_norm": 0.7601751639149329, "kl": 0.1494140625, "learning_rate": 4.404750417960876e-06, "loss": 0.006, "reward": 1.546875, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 841 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 351.421875, "epoch": 0.6736, "grad_norm": 0.734199264480647, "kl": 0.1650390625, "learning_rate": 4.403393221565966e-06, "loss": 0.0066, "reward": 1.765625, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 842 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 361.609375, "epoch": 0.6744, "grad_norm": 1.4623768306458316, "kl": 1.1328125, "learning_rate": 4.402034689296425e-06, "loss": 0.0453, "reward": 1.765625, "reward_std": 0.20873048901557922, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.984375, "step": 843 }, { "all_correct": 0.375, "all_wrong": 0.625, "completion_length": 384.984375, "epoch": 0.6752, "grad_norm": 0.49041687670330963, "kl": 0.84375, "learning_rate": 4.400674822105721e-06, "loss": 0.0339, "reward": 1.359375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.984375, "step": 844 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 447.078125, "epoch": 0.676, "grad_norm": 3.8286635673292326, "kl": 2.78125, "learning_rate": 4.399313620948262e-06, "loss": 0.1114, "reward": 1.421875, "reward_std": 0.3187599778175354, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.953125, "step": 845 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 402.71875, "epoch": 0.6768, "grad_norm": 1.39392373494802, "kl": 0.5546875, "learning_rate": 4.397951086779392e-06, "loss": 0.0222, "reward": 1.484375, "reward_std": 0.2198973000049591, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.96875, "step": 846 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 350.546875, "epoch": 0.6776, "grad_norm": 0.8760598270294438, "kl": 0.130859375, "learning_rate": 4.396587220555389e-06, "loss": 0.0052, "reward": 1.640625, "reward_std": 0.189372718334198, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 847 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 416.59375, "epoch": 0.6784, "grad_norm": 0.8117302602451169, "kl": 0.6015625, "learning_rate": 4.395222023233467e-06, "loss": 0.0241, "reward": 1.796875, "reward_std": 0.22600007057189941, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.984375, "step": 848 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 354.140625, "epoch": 0.6792, "grad_norm": 0.7893827765546036, "kl": 0.130859375, "learning_rate": 4.393855495771774e-06, "loss": 0.0053, "reward": 1.65625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 849 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 348.09375, "epoch": 0.68, "grad_norm": 1.1273192375292231, "kl": 0.1435546875, "learning_rate": 4.3924876391293915e-06, "loss": 0.0057, "reward": 1.75, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.984375, "step": 850 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 386.171875, "epoch": 0.6808, "grad_norm": 1.6848210227745468, "kl": 0.134765625, "learning_rate": 4.391118454266335e-06, "loss": 0.0054, "reward": 1.65625, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 851 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 425.09375, "epoch": 0.6816, "grad_norm": 0.5722804310063825, "kl": 0.12060546875, "learning_rate": 4.389747942143549e-06, "loss": 0.0048, "reward": 1.71875, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 852 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 427.328125, "epoch": 0.6824, "grad_norm": 1.7251835603654744, "kl": 0.1201171875, "learning_rate": 4.388376103722914e-06, "loss": 0.0048, "reward": 1.671875, "reward_std": 0.22673699259757996, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 853 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 406.390625, "epoch": 0.6832, "grad_norm": 0.4098808078432349, "kl": 0.1318359375, "learning_rate": 4.387002939967237e-06, "loss": 0.0053, "reward": 1.65625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 854 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 462.953125, "epoch": 0.684, "grad_norm": 1.8043388097299795, "kl": 0.353515625, "learning_rate": 4.38562845184026e-06, "loss": 0.0142, "reward": 1.828125, "reward_std": 0.26196980476379395, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.984375, "step": 855 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 481.984375, "epoch": 0.6848, "grad_norm": 1.6393645313605443, "kl": 0.361328125, "learning_rate": 4.384252640306649e-06, "loss": 0.0144, "reward": 1.65625, "reward_std": 0.2041158527135849, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.984375, "step": 856 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 461.46875, "epoch": 0.6856, "grad_norm": 2.221226937529482, "kl": 0.3828125, "learning_rate": 4.382875506332002e-06, "loss": 0.0153, "reward": 1.640625, "reward_std": 0.1530819833278656, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 857 }, { "all_correct": 0.375, "all_wrong": 0.5, "completion_length": 357.734375, "epoch": 0.6864, "grad_norm": 0.47531247002388066, "kl": 0.1298828125, "learning_rate": 4.381497050882845e-06, "loss": 0.0052, "reward": 1.4375, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 858 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 431.609375, "epoch": 0.6872, "grad_norm": 0.9210820085839544, "kl": 0.484375, "learning_rate": 4.380117274926632e-06, "loss": 0.0195, "reward": 1.703125, "reward_std": 0.17782479524612427, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.984375, "step": 859 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 402.34375, "epoch": 0.688, "grad_norm": 1.3840191855749242, "kl": 1.078125, "learning_rate": 4.3787361794317405e-06, "loss": 0.0433, "reward": 1.703125, "reward_std": 0.19939783215522766, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.96875, "step": 860 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 433.671875, "epoch": 0.6888, "grad_norm": 0.8237080651822265, "kl": 0.1220703125, "learning_rate": 4.377353765367479e-06, "loss": 0.0049, "reward": 1.578125, "reward_std": 0.19939783215522766, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 861 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 433.328125, "epoch": 0.6896, "grad_norm": 0.8167935126873793, "kl": 0.1240234375, "learning_rate": 4.375970033704078e-06, "loss": 0.005, "reward": 1.625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 862 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 430.328125, "epoch": 0.6904, "grad_norm": 0.6034458812927239, "kl": 0.11865234375, "learning_rate": 4.374584985412692e-06, "loss": 0.0048, "reward": 1.703125, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 863 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 444.0625, "epoch": 0.6912, "grad_norm": 2.037211644377725, "kl": 2.96875, "learning_rate": 4.373198621465405e-06, "loss": 0.1192, "reward": 1.453125, "reward_std": 0.43403828144073486, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.9375, "step": 864 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 404.515625, "epoch": 0.692, "grad_norm": 0.942355756417357, "kl": 0.1162109375, "learning_rate": 4.3718109428352155e-06, "loss": 0.0046, "reward": 1.546875, "reward_std": 0.21778544783592224, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 865 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 480.65625, "epoch": 0.6928, "grad_norm": 0.819499283019901, "kl": 0.1455078125, "learning_rate": 4.370421950496055e-06, "loss": 0.0058, "reward": 1.65625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 866 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 399.265625, "epoch": 0.6936, "grad_norm": 0.8418814238510208, "kl": 0.119140625, "learning_rate": 4.369031645422768e-06, "loss": 0.0048, "reward": 1.671875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 867 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 429.265625, "epoch": 0.6944, "grad_norm": 0.6174620957043189, "kl": 0.7421875, "learning_rate": 4.367640028591126e-06, "loss": 0.0297, "reward": 1.546875, "reward_std": 0.09300297498703003, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.984375, "step": 868 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 465.25, "epoch": 0.6952, "grad_norm": 0.5750532827732322, "kl": 0.1201171875, "learning_rate": 4.366247100977818e-06, "loss": 0.0048, "reward": 1.53125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 869 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 464.40625, "epoch": 0.696, "grad_norm": 1.7061165580059114, "kl": 2.65625, "learning_rate": 4.364852863560456e-06, "loss": 0.106, "reward": 1.796875, "reward_std": 0.22097086906433105, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.953125, "step": 870 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 442.515625, "epoch": 0.6968, "grad_norm": 0.985137653664329, "kl": 1.71875, "learning_rate": 4.363457317317568e-06, "loss": 0.0687, "reward": 1.5625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.953125, "step": 871 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 423.40625, "epoch": 0.6976, "grad_norm": 0.8717810235148079, "kl": 0.75, "learning_rate": 4.362060463228603e-06, "loss": 0.03, "reward": 1.78125, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.984375, "step": 872 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.8125, "epoch": 0.6984, "grad_norm": 1.8094845589703843, "kl": 1.296875, "learning_rate": 4.360662302273926e-06, "loss": 0.0517, "reward": 1.640625, "reward_std": 0.30296874046325684, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.953125, "step": 873 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 420.5625, "epoch": 0.6992, "grad_norm": 0.6926124145060691, "kl": 0.115234375, "learning_rate": 4.35926283543482e-06, "loss": 0.0046, "reward": 1.671875, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 874 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 479.53125, "epoch": 0.7, "grad_norm": 3.0898708500563794, "kl": 0.94140625, "learning_rate": 4.357862063693486e-06, "loss": 0.0376, "reward": 1.875, "reward_std": 0.24359199404716492, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.96875, "step": 875 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 431.890625, "epoch": 0.7008, "grad_norm": 0.6974488775760038, "kl": 0.1162109375, "learning_rate": 4.356459988033039e-06, "loss": 0.0046, "reward": 1.484375, "reward_std": 0.19408093392848969, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 1.0, "step": 876 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 444.078125, "epoch": 0.7016, "grad_norm": 3.037156735334447, "kl": 2.078125, "learning_rate": 4.355056609437509e-06, "loss": 0.0829, "reward": 1.5625, "reward_std": 0.21968847513198853, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.9375, "step": 877 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 455.84375, "epoch": 0.7024, "grad_norm": 0.40572337908482486, "kl": 0.119140625, "learning_rate": 4.353651928891842e-06, "loss": 0.0047, "reward": 1.9375, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 878 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 442.0, "epoch": 0.7032, "grad_norm": 1.4903271791393706, "kl": 0.83984375, "learning_rate": 4.352245947381897e-06, "loss": 0.0336, "reward": 1.65625, "reward_std": 0.19918900728225708, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.96875, "step": 879 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 448.578125, "epoch": 0.704, "grad_norm": 2.0545542691137295, "kl": 0.37890625, "learning_rate": 4.3508386658944455e-06, "loss": 0.0152, "reward": 1.6875, "reward_std": 0.19727616012096405, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.984375, "step": 880 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 447.203125, "epoch": 0.7048, "grad_norm": 1.0909881391793068, "kl": 0.3671875, "learning_rate": 4.349430085417171e-06, "loss": 0.0148, "reward": 1.796875, "reward_std": 0.1315089464187622, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.984375, "step": 881 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 403.828125, "epoch": 0.7056, "grad_norm": 1.5553388649827689, "kl": 0.1064453125, "learning_rate": 4.348020206938672e-06, "loss": 0.0043, "reward": 1.8125, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 882 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 395.734375, "epoch": 0.7064, "grad_norm": 0.4552842879173502, "kl": 0.1142578125, "learning_rate": 4.3466090314484526e-06, "loss": 0.0046, "reward": 1.84375, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 883 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 417.0625, "epoch": 0.7072, "grad_norm": 0.3362833858620528, "kl": 0.1064453125, "learning_rate": 4.345196559936931e-06, "loss": 0.0043, "reward": 1.703125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 884 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.265625, "epoch": 0.708, "grad_norm": 1.6585334236065075, "kl": 0.1875, "learning_rate": 4.343782793395435e-06, "loss": 0.0075, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 0.984375, "step": 885 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 490.6875, "epoch": 0.7088, "grad_norm": 0.5715755935163027, "kl": 0.1064453125, "learning_rate": 4.3423677328162e-06, "loss": 0.0043, "reward": 1.546875, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 886 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 450.1875, "epoch": 0.7096, "grad_norm": 0.5989269164215718, "kl": 0.10693359375, "learning_rate": 4.340951379192369e-06, "loss": 0.0043, "reward": 1.71875, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 887 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 447.09375, "epoch": 0.7104, "grad_norm": 0.340813035742099, "kl": 0.10205078125, "learning_rate": 4.3395337335179945e-06, "loss": 0.0041, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 888 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 420.78125, "epoch": 0.7112, "grad_norm": 0.4025425134963079, "kl": 0.1162109375, "learning_rate": 4.338114796788035e-06, "loss": 0.0047, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 889 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 371.484375, "epoch": 0.712, "grad_norm": 1.4984277973866036, "kl": 0.12109375, "learning_rate": 4.336694569998354e-06, "loss": 0.0048, "reward": 1.796875, "reward_std": 0.1530819833278656, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 890 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 479.71875, "epoch": 0.7128, "grad_norm": 1.5903143900453378, "kl": 0.4765625, "learning_rate": 4.3352730541457215e-06, "loss": 0.0191, "reward": 1.921875, "reward_std": 0.18139132857322693, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 0.96875, "step": 891 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 487.484375, "epoch": 0.7136, "grad_norm": 1.040021665372247, "kl": 0.72265625, "learning_rate": 4.333850250227814e-06, "loss": 0.0289, "reward": 1.859375, "reward_std": 0.2414703369140625, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 0.96875, "step": 892 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 356.828125, "epoch": 0.7144, "grad_norm": 0.8145869982087229, "kl": 0.94140625, "learning_rate": 4.332426159243206e-06, "loss": 0.0377, "reward": 1.65625, "reward_std": 0.2041158527135849, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.984375, "step": 893 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 485.140625, "epoch": 0.7152, "grad_norm": 1.097727906062679, "kl": 2.0, "learning_rate": 4.331000782191384e-06, "loss": 0.0801, "reward": 1.609375, "reward_std": 0.22673699259757996, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.953125, "step": 894 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 515.203125, "epoch": 0.716, "grad_norm": 3.025969830890034, "kl": 3.578125, "learning_rate": 4.329574120072728e-06, "loss": 0.1432, "reward": 1.6875, "reward_std": 0.3413130044937134, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.921875, "step": 895 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 549.359375, "epoch": 0.7168, "grad_norm": 0.7902604584398955, "kl": 2.96875, "learning_rate": 4.328146173888528e-06, "loss": 0.1187, "reward": 1.484375, "reward_std": 0.33669838309288025, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.921875, "step": 896 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 498.9375, "epoch": 0.7176, "grad_norm": 1.3940284999410812, "kl": 1.546875, "learning_rate": 4.32671694464097e-06, "loss": 0.062, "reward": 1.484375, "reward_std": 0.36355799436569214, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.921875, "step": 897 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 516.40625, "epoch": 0.7184, "grad_norm": 1.9310527726746134, "kl": 3.59375, "learning_rate": 4.3252864333331424e-06, "loss": 0.1439, "reward": 1.328125, "reward_std": 0.41361188888549805, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.875, "step": 898 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 480.234375, "epoch": 0.7192, "grad_norm": 4.6985717880758395, "kl": 2.84375, "learning_rate": 4.323854640969033e-06, "loss": 0.1137, "reward": 1.703125, "reward_std": 0.45561131834983826, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.921875, "step": 899 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 362.921875, "epoch": 0.72, "grad_norm": 0.7749778063374342, "kl": 0.76171875, "learning_rate": 4.322421568553529e-06, "loss": 0.0304, "reward": 1.5, "reward_std": 0.23356689512729645, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.984375, "step": 900 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 536.71875, "epoch": 0.7208, "grad_norm": 1.801671379190005, "kl": 3.0625, "learning_rate": 4.320987217092416e-06, "loss": 0.1229, "reward": 1.59375, "reward_std": 0.2651650309562683, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.9375, "step": 901 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 540.734375, "epoch": 0.7216, "grad_norm": 2.3304463660704235, "kl": 3.578125, "learning_rate": 4.319551587592377e-06, "loss": 0.1429, "reward": 1.515625, "reward_std": 0.40822190046310425, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.921875, "step": 902 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 470.0625, "epoch": 0.7224, "grad_norm": 1.7944162093678502, "kl": 4.46875, "learning_rate": 4.318114681060989e-06, "loss": 0.1795, "reward": 1.703125, "reward_std": 0.42508673667907715, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.90625, "step": 903 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 587.203125, "epoch": 0.7232, "grad_norm": 1.6222665192833228, "kl": 1.9375, "learning_rate": 4.316676498506735e-06, "loss": 0.0775, "reward": 1.53125, "reward_std": 0.40749478340148926, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.921875, "step": 904 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 505.015625, "epoch": 0.724, "grad_norm": 1.1143695214080347, "kl": 1.8671875, "learning_rate": 4.3152370409389795e-06, "loss": 0.0749, "reward": 1.53125, "reward_std": 0.3014557659626007, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.953125, "step": 905 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 489.875, "epoch": 0.7248, "grad_norm": 1.5889331546013732, "kl": 2.03125, "learning_rate": 4.3137963093679945e-06, "loss": 0.0814, "reward": 1.71875, "reward_std": 0.23689788579940796, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.9375, "step": 906 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 564.921875, "epoch": 0.7256, "grad_norm": 1.0491259327278086, "kl": 2.015625, "learning_rate": 4.3123543048049395e-06, "loss": 0.0804, "reward": 1.65625, "reward_std": 0.3010796010494232, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.921875, "step": 907 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 486.375, "epoch": 0.7264, "grad_norm": 1.1311641700877069, "kl": 1.2421875, "learning_rate": 4.310911028261867e-06, "loss": 0.0496, "reward": 1.8125, "reward_std": 0.24608495831489563, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.96875, "step": 908 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 495.1875, "epoch": 0.7272, "grad_norm": 1.0562690996200423, "kl": 0.5703125, "learning_rate": 4.309466480751726e-06, "loss": 0.0228, "reward": 1.78125, "reward_std": 0.19727617502212524, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.984375, "step": 909 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 455.0625, "epoch": 0.728, "grad_norm": 1.5715659468664727, "kl": 1.7265625, "learning_rate": 4.308020663288356e-06, "loss": 0.0691, "reward": 1.578125, "reward_std": 0.24039676785469055, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.9375, "step": 910 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 532.40625, "epoch": 0.7288, "grad_norm": 0.6750795964090044, "kl": 0.10888671875, "learning_rate": 4.306573576886485e-06, "loss": 0.0044, "reward": 1.640625, "reward_std": 0.2198973000049591, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 911 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 434.671875, "epoch": 0.7296, "grad_norm": 0.6186247071856164, "kl": 0.11865234375, "learning_rate": 4.305125222561736e-06, "loss": 0.0047, "reward": 1.796875, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 912 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 469.046875, "epoch": 0.7304, "grad_norm": 1.228159699553349, "kl": 0.427734375, "learning_rate": 4.303675601330618e-06, "loss": 0.0171, "reward": 1.640625, "reward_std": 0.31983357667922974, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 913 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 470.859375, "epoch": 0.7312, "grad_norm": 1.3058863651653687, "kl": 0.63671875, "learning_rate": 4.302224714210532e-06, "loss": 0.0255, "reward": 1.671875, "reward_std": 0.19939783215522766, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.984375, "step": 914 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 424.21875, "epoch": 0.732, "grad_norm": 0.6512093483515646, "kl": 0.1162109375, "learning_rate": 4.3007725622197675e-06, "loss": 0.0047, "reward": 1.84375, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 915 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 512.078125, "epoch": 0.7328, "grad_norm": 1.229335381648818, "kl": 2.65625, "learning_rate": 4.2993191463775e-06, "loss": 0.1065, "reward": 1.71875, "reward_std": 0.3868919014930725, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.921875, "step": 916 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 380.0625, "epoch": 0.7336, "grad_norm": 1.3564010145791259, "kl": 2.296875, "learning_rate": 4.29786446770379e-06, "loss": 0.092, "reward": 1.40625, "reward_std": 0.46144071221351624, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.921875, "step": 917 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 473.984375, "epoch": 0.7344, "grad_norm": 0.8627534279098009, "kl": 2.140625, "learning_rate": 4.296408527219592e-06, "loss": 0.0855, "reward": 1.5, "reward_std": 0.287577360868454, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.9375, "step": 918 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 469.734375, "epoch": 0.7352, "grad_norm": 0.7801019710985146, "kl": 2.46875, "learning_rate": 4.294951325946737e-06, "loss": 0.0988, "reward": 1.34375, "reward_std": 0.40489843487739563, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.9375, "step": 919 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 403.359375, "epoch": 0.736, "grad_norm": 1.5718654200701025, "kl": 0.828125, "learning_rate": 4.293492864907947e-06, "loss": 0.0332, "reward": 1.609375, "reward_std": 0.34564992785453796, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.96875, "step": 920 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 534.484375, "epoch": 0.7368, "grad_norm": 2.161536986347826, "kl": 2.546875, "learning_rate": 4.2920331451268246e-06, "loss": 0.102, "reward": 1.515625, "reward_std": 0.5058395266532898, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.890625, "step": 921 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 512.09375, "epoch": 0.7376, "grad_norm": 1.7295118963451739, "kl": 3.421875, "learning_rate": 4.2905721676278585e-06, "loss": 0.1369, "reward": 1.3125, "reward_std": 0.3742702603340149, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.90625, "step": 922 }, { "all_correct": 0.0, "all_wrong": 0.125, "completion_length": 519.265625, "epoch": 0.7384, "grad_norm": 2.1540118335096063, "kl": 4.3125, "learning_rate": 4.28910993343642e-06, "loss": 0.1727, "reward": 1.375, "reward_std": 0.5602303147315979, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.875, "step": 923 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 450.78125, "epoch": 0.7392, "grad_norm": 0.9790998139016089, "kl": 2.421875, "learning_rate": 4.2876464435787576e-06, "loss": 0.0965, "reward": 1.53125, "reward_std": 0.3380831182003021, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.9375, "step": 924 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 494.484375, "epoch": 0.74, "grad_norm": 2.399007793907817, "kl": 2.25, "learning_rate": 4.286181699082008e-06, "loss": 0.0899, "reward": 1.359375, "reward_std": 0.26196980476379395, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.921875, "step": 925 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 385.15625, "epoch": 0.7408, "grad_norm": 1.1416933071007234, "kl": 0.93359375, "learning_rate": 4.284715700974186e-06, "loss": 0.0373, "reward": 1.578125, "reward_std": 0.2971188426017761, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.96875, "step": 926 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 521.203125, "epoch": 0.7416, "grad_norm": 9.194586955494502, "kl": 260.0, "learning_rate": 4.283248450284182e-06, "loss": 10.4072, "reward": 1.390625, "reward_std": 0.5807557106018066, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.796875, "step": 927 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 553.21875, "epoch": 0.7424, "grad_norm": 4.26876911233488, "kl": 7.28125, "learning_rate": 4.281779948041772e-06, "loss": 0.2909, "reward": 1.265625, "reward_std": 0.6378883123397827, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.765625, "step": 928 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 538.5, "epoch": 0.7432, "grad_norm": 2.353768064399416, "kl": 5.875, "learning_rate": 4.280310195277606e-06, "loss": 0.2358, "reward": 1.265625, "reward_std": 0.48670461773872375, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.765625, "step": 929 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 400.03125, "epoch": 0.744, "grad_norm": 1.5584445115662269, "kl": 0.66796875, "learning_rate": 4.278839193023214e-06, "loss": 0.0267, "reward": 1.5625, "reward_std": 0.29176726937294006, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.96875, "step": 930 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 397.390625, "epoch": 0.7448, "grad_norm": 0.6210216856962318, "kl": 0.1171875, "learning_rate": 4.277366942311001e-06, "loss": 0.0047, "reward": 1.59375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 931 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 366.84375, "epoch": 0.7456, "grad_norm": 0.30790529190005606, "kl": 0.11474609375, "learning_rate": 4.2758934441742494e-06, "loss": 0.0046, "reward": 1.9375, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 932 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.296875, "epoch": 0.7464, "grad_norm": 0.6533715565725364, "kl": 0.1171875, "learning_rate": 4.274418699647117e-06, "loss": 0.0047, "reward": 1.890625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 933 }, { "all_correct": 0.0, "all_wrong": 0.625, "completion_length": 347.0, "epoch": 0.7472, "grad_norm": 0.6429134637471882, "kl": 0.1220703125, "learning_rate": 4.272942709764638e-06, "loss": 0.0049, "reward": 1.140625, "reward_std": 0.1804211586713791, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 1.0, "step": 934 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 409.3125, "epoch": 0.748, "grad_norm": 0.7219361422305827, "kl": 0.150390625, "learning_rate": 4.271465475562716e-06, "loss": 0.006, "reward": 1.765625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.96875, "step": 935 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 385.421875, "epoch": 0.7488, "grad_norm": 0.9652043265766445, "kl": 0.12060546875, "learning_rate": 4.269986998078132e-06, "loss": 0.0048, "reward": 1.6875, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 936 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 439.1875, "epoch": 0.7496, "grad_norm": 1.0733356267908236, "kl": 0.1171875, "learning_rate": 4.268507278348539e-06, "loss": 0.0047, "reward": 1.71875, "reward_std": 0.17570312321186066, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 937 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 416.90625, "epoch": 0.7504, "grad_norm": 0.37362274771795734, "kl": 0.10595703125, "learning_rate": 4.2670263174124615e-06, "loss": 0.0042, "reward": 1.734375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 938 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 404.390625, "epoch": 0.7512, "grad_norm": 0.09190962754036436, "kl": 0.11376953125, "learning_rate": 4.265544116309294e-06, "loss": 0.0045, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 939 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 410.171875, "epoch": 0.752, "grad_norm": 0.9576930951379631, "kl": 0.1259765625, "learning_rate": 4.264060676079302e-06, "loss": 0.0051, "reward": 1.53125, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 940 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 423.234375, "epoch": 0.7528, "grad_norm": 0.3486492469895735, "kl": 0.11474609375, "learning_rate": 4.262575997763622e-06, "loss": 0.0046, "reward": 1.796875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 941 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 411.390625, "epoch": 0.7536, "grad_norm": 0.42797705325285396, "kl": 0.11669921875, "learning_rate": 4.2610900824042575e-06, "loss": 0.0047, "reward": 1.953125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 942 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.640625, "epoch": 0.7544, "grad_norm": 0.5371471332126149, "kl": 0.10791015625, "learning_rate": 4.2596029310440826e-06, "loss": 0.0043, "reward": 1.875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 943 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 386.515625, "epoch": 0.7552, "grad_norm": 0.516151719265387, "kl": 0.11328125, "learning_rate": 4.258114544726835e-06, "loss": 0.0045, "reward": 1.765625, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 944 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 396.53125, "epoch": 0.756, "grad_norm": 1.6891573571293064, "kl": 0.123046875, "learning_rate": 4.256624924497124e-06, "loss": 0.0049, "reward": 1.640625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 945 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 392.703125, "epoch": 0.7568, "grad_norm": 1.0182792334186193, "kl": 0.12109375, "learning_rate": 4.25513407140042e-06, "loss": 0.0048, "reward": 1.71875, "reward_std": 0.23356688022613525, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 946 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 400.703125, "epoch": 0.7576, "grad_norm": 0.420747136044565, "kl": 0.11669921875, "learning_rate": 4.253641986483063e-06, "loss": 0.0047, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 947 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 386.0, "epoch": 0.7584, "grad_norm": 0.9703799860470895, "kl": 0.1572265625, "learning_rate": 4.2521486707922545e-06, "loss": 0.0063, "reward": 1.5625, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.984375, "step": 948 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 414.265625, "epoch": 0.7592, "grad_norm": 0.5790431184943604, "kl": 0.1171875, "learning_rate": 4.250654125376062e-06, "loss": 0.0047, "reward": 1.703125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 949 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 356.109375, "epoch": 0.76, "grad_norm": 0.784741344714855, "kl": 0.130859375, "learning_rate": 4.249158351283414e-06, "loss": 0.0052, "reward": 1.546875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 950 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 365.96875, "epoch": 0.7608, "grad_norm": 0.3682297319192504, "kl": 0.12060546875, "learning_rate": 4.247661349564103e-06, "loss": 0.0048, "reward": 1.671875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 951 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 425.4375, "epoch": 0.7616, "grad_norm": 0.542369647139902, "kl": 0.1181640625, "learning_rate": 4.246163121268782e-06, "loss": 0.0047, "reward": 1.90625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 952 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 356.765625, "epoch": 0.7624, "grad_norm": 0.6686418492448905, "kl": 0.126953125, "learning_rate": 4.244663667448965e-06, "loss": 0.0051, "reward": 1.796875, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 953 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 465.984375, "epoch": 0.7632, "grad_norm": 0.5241841620615234, "kl": 0.1259765625, "learning_rate": 4.243162989157027e-06, "loss": 0.005, "reward": 1.484375, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.984375, "step": 954 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 414.03125, "epoch": 0.764, "grad_norm": 0.6609924834357154, "kl": 0.138671875, "learning_rate": 4.241661087446202e-06, "loss": 0.0055, "reward": 1.765625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 955 }, { "all_correct": 0.0, "all_wrong": 0.375, "completion_length": 418.1875, "epoch": 0.7648, "grad_norm": 2.635363041629432, "kl": 0.2216796875, "learning_rate": 4.240157963370583e-06, "loss": 0.0089, "reward": 1.296875, "reward_std": 0.4558286964893341, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.921875, "step": 956 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 379.171875, "epoch": 0.7656, "grad_norm": 1.4448050764201095, "kl": 0.1611328125, "learning_rate": 4.2386536179851175e-06, "loss": 0.0064, "reward": 1.5625, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.984375, "step": 957 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 446.421875, "epoch": 0.7664, "grad_norm": 1.000212044079561, "kl": 0.12451171875, "learning_rate": 4.2371480523456156e-06, "loss": 0.005, "reward": 1.765625, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 958 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 448.609375, "epoch": 0.7672, "grad_norm": 0.8981210410115046, "kl": 0.1298828125, "learning_rate": 4.235641267508741e-06, "loss": 0.0052, "reward": 1.78125, "reward_std": 0.22461533546447754, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 959 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 380.765625, "epoch": 0.768, "grad_norm": 0.8232337759805564, "kl": 0.12890625, "learning_rate": 4.234133264532012e-06, "loss": 0.0052, "reward": 1.484375, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 1.0, "step": 960 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 417.203125, "epoch": 0.7688, "grad_norm": 5.143963317662924, "kl": 0.29296875, "learning_rate": 4.232624044473805e-06, "loss": 0.0117, "reward": 1.71875, "reward_std": 0.2346404492855072, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.953125, "step": 961 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 389.5, "epoch": 0.7696, "grad_norm": 1.1019180106834146, "kl": 0.2197265625, "learning_rate": 4.231113608393348e-06, "loss": 0.0088, "reward": 1.515625, "reward_std": 0.23925507068634033, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.984375, "step": 962 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 409.78125, "epoch": 0.7704, "grad_norm": 1.8150596559083683, "kl": 0.353515625, "learning_rate": 4.229601957350722e-06, "loss": 0.0142, "reward": 1.703125, "reward_std": 0.20189079642295837, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.984375, "step": 963 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 429.234375, "epoch": 0.7712, "grad_norm": 0.5328641427292242, "kl": 0.1513671875, "learning_rate": 4.228089092406863e-06, "loss": 0.0061, "reward": 1.796875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 964 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 475.234375, "epoch": 0.772, "grad_norm": 0.7284892417847438, "kl": 0.61328125, "learning_rate": 4.226575014623557e-06, "loss": 0.0245, "reward": 1.671875, "reward_std": 0.1530819833278656, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.953125, "step": 965 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 409.828125, "epoch": 0.7728, "grad_norm": 2.672494169486163, "kl": 0.83984375, "learning_rate": 4.225059725063444e-06, "loss": 0.0336, "reward": 1.65625, "reward_std": 0.2346404492855072, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.953125, "step": 966 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 425.15625, "epoch": 0.7736, "grad_norm": 0.8381645589499661, "kl": 0.138671875, "learning_rate": 4.22354322479001e-06, "loss": 0.0056, "reward": 1.59375, "reward_std": 0.2177756428718567, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 967 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 388.28125, "epoch": 0.7744, "grad_norm": 1.5610358286513286, "kl": 0.2060546875, "learning_rate": 4.222025514867596e-06, "loss": 0.0083, "reward": 1.671875, "reward_std": 0.189372718334198, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.984375, "step": 968 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 349.265625, "epoch": 0.7752, "grad_norm": 1.3621580253801306, "kl": 0.16015625, "learning_rate": 4.220506596361387e-06, "loss": 0.0064, "reward": 1.453125, "reward_std": 0.35612428188323975, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 1.0, "step": 969 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.953125, "epoch": 0.776, "grad_norm": 0.7268537039641286, "kl": 0.130859375, "learning_rate": 4.218986470337419e-06, "loss": 0.0052, "reward": 1.828125, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 970 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 419.234375, "epoch": 0.7768, "grad_norm": 0.6318043018805877, "kl": 0.1298828125, "learning_rate": 4.217465137862575e-06, "loss": 0.0052, "reward": 1.703125, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 971 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 413.625, "epoch": 0.7776, "grad_norm": 0.5299685435492659, "kl": 0.12158203125, "learning_rate": 4.215942600004586e-06, "loss": 0.0049, "reward": 1.75, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.984375, "step": 972 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 396.90625, "epoch": 0.7784, "grad_norm": 0.1458544807785085, "kl": 0.140625, "learning_rate": 4.214418857832025e-06, "loss": 0.0056, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 973 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 355.703125, "epoch": 0.7792, "grad_norm": 0.9539917998314951, "kl": 0.13671875, "learning_rate": 4.212893912414316e-06, "loss": 0.0055, "reward": 1.6875, "reward_std": 0.19727616012096405, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.984375, "step": 974 }, { "all_correct": 0.0, "all_wrong": 0.625, "completion_length": 399.453125, "epoch": 0.78, "grad_norm": 1.7882583607879647, "kl": 0.158203125, "learning_rate": 4.211367764821722e-06, "loss": 0.0063, "reward": 1.15625, "reward_std": 0.2130674123764038, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.984375, "step": 975 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 420.9375, "epoch": 0.7808, "grad_norm": 1.8457448576517015, "kl": 1.578125, "learning_rate": 4.209840416125353e-06, "loss": 0.0633, "reward": 1.515625, "reward_std": 0.3847702443599701, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.90625, "step": 976 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 450.15625, "epoch": 0.7816, "grad_norm": 0.8375536084250883, "kl": 1.859375, "learning_rate": 4.208311867397162e-06, "loss": 0.0743, "reward": 1.65625, "reward_std": 0.3535533845424652, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.9375, "step": 977 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 496.421875, "epoch": 0.7824, "grad_norm": 2.695559553833614, "kl": 5.46875, "learning_rate": 4.206782119709942e-06, "loss": 0.2185, "reward": 1.46875, "reward_std": 0.4887067973613739, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.78125, "step": 978 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 399.75, "epoch": 0.7832, "grad_norm": 0.6598807469508163, "kl": 0.8828125, "learning_rate": 4.205251174137329e-06, "loss": 0.0354, "reward": 1.484375, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.96875, "step": 979 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 420.53125, "epoch": 0.784, "grad_norm": 0.9743811025961808, "kl": 1.2734375, "learning_rate": 4.2037190317538e-06, "loss": 0.0508, "reward": 1.65625, "reward_std": 0.43191659450531006, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.921875, "step": 980 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 413.984375, "epoch": 0.7848, "grad_norm": 0.842145948104087, "kl": 0.296875, "learning_rate": 4.202185693634671e-06, "loss": 0.0119, "reward": 1.71875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 981 }, { "all_correct": 0.125, "all_wrong": 0.5, "completion_length": 419.078125, "epoch": 0.7856, "grad_norm": 1.3977982611011994, "kl": 1.5546875, "learning_rate": 4.200651160856099e-06, "loss": 0.062, "reward": 1.296875, "reward_std": 0.33669838309288025, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.921875, "step": 982 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 473.46875, "epoch": 0.7864, "grad_norm": 0.28050828036992015, "kl": 0.5859375, "learning_rate": 4.1991154344950755e-06, "loss": 0.0234, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 983 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 403.359375, "epoch": 0.7872, "grad_norm": 0.9636221712471047, "kl": 0.390625, "learning_rate": 4.197578515629435e-06, "loss": 0.0156, "reward": 1.4375, "reward_std": 0.2925041913986206, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.984375, "step": 984 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 460.03125, "epoch": 0.788, "grad_norm": 0.342473944432868, "kl": 0.271484375, "learning_rate": 4.196040405337846e-06, "loss": 0.0109, "reward": 1.84375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.984375, "step": 985 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 364.203125, "epoch": 0.7888, "grad_norm": 1.7808786921240929, "kl": 0.9609375, "learning_rate": 4.194501104699813e-06, "loss": 0.0385, "reward": 1.59375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.96875, "step": 986 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 420.75, "epoch": 0.7896, "grad_norm": 0.8194480679343348, "kl": 0.71875, "learning_rate": 4.192960614795676e-06, "loss": 0.0287, "reward": 1.640625, "reward_std": 0.2773849070072174, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.96875, "step": 987 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 361.515625, "epoch": 0.7904, "grad_norm": 0.9579962256217175, "kl": 0.1337890625, "learning_rate": 4.19141893670661e-06, "loss": 0.0053, "reward": 1.453125, "reward_std": 0.23568853735923767, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 1.0, "step": 988 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 387.6875, "epoch": 0.7912, "grad_norm": 0.9229492984968124, "kl": 0.9765625, "learning_rate": 4.189876071514624e-06, "loss": 0.0392, "reward": 1.640625, "reward_std": 0.19044628739356995, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 989 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 396.078125, "epoch": 0.792, "grad_norm": 0.9196193490960514, "kl": 0.126953125, "learning_rate": 4.188332020302561e-06, "loss": 0.0051, "reward": 1.734375, "reward_std": 0.3051002323627472, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 990 }, { "all_correct": 0.375, "all_wrong": 0.5, "completion_length": 373.609375, "epoch": 0.7928, "grad_norm": 0.4291672668439854, "kl": 0.123046875, "learning_rate": 4.186786784154096e-06, "loss": 0.0049, "reward": 1.46875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 991 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 401.03125, "epoch": 0.7936, "grad_norm": 0.9173605611044626, "kl": 0.58203125, "learning_rate": 4.1852403641537344e-06, "loss": 0.0233, "reward": 1.578125, "reward_std": 0.22097086906433105, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.96875, "step": 992 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 421.625, "epoch": 0.7944, "grad_norm": 0.8235714182946214, "kl": 0.48828125, "learning_rate": 4.183692761386813e-06, "loss": 0.0196, "reward": 1.5625, "reward_std": 0.26409146189689636, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.984375, "step": 993 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 401.5625, "epoch": 0.7952, "grad_norm": 0.5304761774438076, "kl": 0.12890625, "learning_rate": 4.1821439769395e-06, "loss": 0.0052, "reward": 1.390625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 1.0, "step": 994 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 387.140625, "epoch": 0.796, "grad_norm": 0.6290673111443011, "kl": 0.42578125, "learning_rate": 4.180594011898791e-06, "loss": 0.0171, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 995 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 351.84375, "epoch": 0.7968, "grad_norm": 0.6991693797366741, "kl": 0.1318359375, "learning_rate": 4.1790428673525104e-06, "loss": 0.0053, "reward": 1.8125, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 996 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 460.28125, "epoch": 0.7976, "grad_norm": 2.638023572486133, "kl": 0.8046875, "learning_rate": 4.177490544389313e-06, "loss": 0.0322, "reward": 1.578125, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.96875, "step": 997 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 407.90625, "epoch": 0.7984, "grad_norm": 1.7128764247694277, "kl": 0.11962890625, "learning_rate": 4.175937044098678e-06, "loss": 0.0048, "reward": 1.59375, "reward_std": 0.19727617502212524, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 998 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 327.734375, "epoch": 0.7992, "grad_norm": 1.0275671648896874, "kl": 0.1484375, "learning_rate": 4.1743823675709115e-06, "loss": 0.0059, "reward": 1.46875, "reward_std": 0.3119301199913025, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 999 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 361.578125, "epoch": 0.8, "grad_norm": 0.9927971728000622, "kl": 0.89453125, "learning_rate": 4.172826515897146e-06, "loss": 0.0358, "reward": 1.4375, "reward_std": 0.26409146189689636, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.96875, "step": 1000 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 432.828125, "epoch": 0.8008, "grad_norm": 0.3048605528477739, "kl": 0.12451171875, "learning_rate": 4.171269490169337e-06, "loss": 0.005, "reward": 1.734375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1001 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 361.703125, "epoch": 0.8016, "grad_norm": 0.09732700080514368, "kl": 0.1240234375, "learning_rate": 4.1697112914802665e-06, "loss": 0.005, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 1002 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 383.140625, "epoch": 0.8024, "grad_norm": 0.6311599518634863, "kl": 0.376953125, "learning_rate": 4.168151920923536e-06, "loss": 0.015, "reward": 1.609375, "reward_std": 0.17782479524612427, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 1003 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 447.625, "epoch": 0.8032, "grad_norm": 0.7021377784094579, "kl": 1.5078125, "learning_rate": 4.1665913795935755e-06, "loss": 0.0604, "reward": 1.5, "reward_std": 0.2651650309562683, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.96875, "step": 1004 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 393.5, "epoch": 0.804, "grad_norm": 1.171767142404209, "kl": 0.74609375, "learning_rate": 4.16502966858563e-06, "loss": 0.0298, "reward": 1.59375, "reward_std": 0.3093337416648865, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.984375, "step": 1005 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 417.84375, "epoch": 0.8048, "grad_norm": 0.32791847154584824, "kl": 0.119140625, "learning_rate": 4.163466788995768e-06, "loss": 0.0048, "reward": 1.6875, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 1006 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 482.203125, "epoch": 0.8056, "grad_norm": 0.8809647797227202, "kl": 1.1640625, "learning_rate": 4.161902741920881e-06, "loss": 0.0465, "reward": 1.84375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.96875, "step": 1007 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 408.09375, "epoch": 0.8064, "grad_norm": 0.9255991702128258, "kl": 1.375, "learning_rate": 4.160337528458676e-06, "loss": 0.055, "reward": 1.78125, "reward_std": 0.3014557659626007, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.953125, "step": 1008 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 463.078125, "epoch": 0.8072, "grad_norm": 0.8992639035654059, "kl": 1.609375, "learning_rate": 4.15877114970768e-06, "loss": 0.0642, "reward": 1.78125, "reward_std": 0.28566449880599976, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.96875, "step": 1009 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 450.265625, "epoch": 0.808, "grad_norm": 1.119342046783512, "kl": 1.875, "learning_rate": 4.1572036067672386e-06, "loss": 0.0749, "reward": 1.75, "reward_std": 0.19727617502212524, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.96875, "step": 1010 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 434.8125, "epoch": 0.8088, "grad_norm": 0.9183180237329227, "kl": 0.130859375, "learning_rate": 4.155634900737513e-06, "loss": 0.0052, "reward": 1.796875, "reward_std": 0.189372718334198, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1011 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 396.03125, "epoch": 0.8096, "grad_norm": 1.332965526658357, "kl": 1.265625, "learning_rate": 4.154065032719482e-06, "loss": 0.0508, "reward": 1.421875, "reward_std": 0.30617380142211914, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.96875, "step": 1012 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 421.046875, "epoch": 0.8104, "grad_norm": 1.1436703759975864, "kl": 1.2578125, "learning_rate": 4.152494003814939e-06, "loss": 0.0503, "reward": 1.53125, "reward_std": 0.2651650309562683, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.953125, "step": 1013 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 416.796875, "epoch": 0.8112, "grad_norm": 6.194941679937528, "kl": 0.1201171875, "learning_rate": 4.150921815126493e-06, "loss": 0.0048, "reward": 1.609375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.984375, "step": 1014 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.640625, "epoch": 0.812, "grad_norm": 0.736162386203766, "kl": 0.224609375, "learning_rate": 4.149348467757566e-06, "loss": 0.009, "reward": 1.828125, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1015 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 414.03125, "epoch": 0.8128, "grad_norm": 1.2415159824198114, "kl": 0.2001953125, "learning_rate": 4.147773962812393e-06, "loss": 0.008, "reward": 1.59375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.984375, "step": 1016 }, { "all_correct": 0.375, "all_wrong": 0.5, "completion_length": 405.625, "epoch": 0.8136, "grad_norm": 1.160255873423496, "kl": 0.5390625, "learning_rate": 4.146198301396025e-06, "loss": 0.0217, "reward": 1.40625, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.984375, "step": 1017 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 427.015625, "epoch": 0.8144, "grad_norm": 0.8885955478672967, "kl": 1.7734375, "learning_rate": 4.14462148461432e-06, "loss": 0.0709, "reward": 1.828125, "reward_std": 0.22097086906433105, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.953125, "step": 1018 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 389.96875, "epoch": 0.8152, "grad_norm": 0.9742868988911085, "kl": 0.3046875, "learning_rate": 4.143043513573949e-06, "loss": 0.0122, "reward": 1.875, "reward_std": 0.21556037664413452, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 0.984375, "step": 1019 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 439.859375, "epoch": 0.816, "grad_norm": 0.9084803679903584, "kl": 1.484375, "learning_rate": 4.141464389382392e-06, "loss": 0.0593, "reward": 1.46875, "reward_std": 0.213067427277565, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.96875, "step": 1020 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 423.1875, "epoch": 0.8168, "grad_norm": 1.095865842968603, "kl": 0.82421875, "learning_rate": 4.13988411314794e-06, "loss": 0.0329, "reward": 1.421875, "reward_std": 0.3298586905002594, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.96875, "step": 1021 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 550.21875, "epoch": 0.8176, "grad_norm": 0.9089747180881477, "kl": 0.984375, "learning_rate": 4.13830268597969e-06, "loss": 0.0391, "reward": 1.515625, "reward_std": 0.2773849070072174, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.96875, "step": 1022 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 416.953125, "epoch": 0.8184, "grad_norm": 0.8960698067226799, "kl": 0.34375, "learning_rate": 4.136720108987552e-06, "loss": 0.0138, "reward": 1.625, "reward_std": 0.3014557659626007, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.984375, "step": 1023 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 450.75, "epoch": 0.8192, "grad_norm": 1.0249080707929057, "kl": 0.5, "learning_rate": 4.1351363832822364e-06, "loss": 0.0201, "reward": 1.5, "reward_std": 0.2346404492855072, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.96875, "step": 1024 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 372.90625, "epoch": 0.82, "grad_norm": 0.851722960815373, "kl": 0.7265625, "learning_rate": 4.133551509975264e-06, "loss": 0.0291, "reward": 1.546875, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.984375, "step": 1025 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 523.890625, "epoch": 0.8208, "grad_norm": 1.5004102169250988, "kl": 2.90625, "learning_rate": 4.13196549017896e-06, "loss": 0.1165, "reward": 1.75, "reward_std": 0.2177756428718567, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.9375, "step": 1026 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 437.0, "epoch": 0.8216, "grad_norm": 1.7522783445835257, "kl": 1.4296875, "learning_rate": 4.130378325006453e-06, "loss": 0.0573, "reward": 1.4375, "reward_std": 0.23356688022613525, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.96875, "step": 1027 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 464.09375, "epoch": 0.8224, "grad_norm": 1.2636399601831858, "kl": 1.109375, "learning_rate": 4.128790015571679e-06, "loss": 0.0445, "reward": 1.46875, "reward_std": 0.22461533546447754, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.96875, "step": 1028 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 447.28125, "epoch": 0.8232, "grad_norm": 0.665300889424532, "kl": 1.5390625, "learning_rate": 4.127200562989372e-06, "loss": 0.0618, "reward": 1.703125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.953125, "step": 1029 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 413.09375, "epoch": 0.824, "grad_norm": 0.49619710604110157, "kl": 0.78515625, "learning_rate": 4.125609968375073e-06, "loss": 0.0314, "reward": 1.734375, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 1030 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 392.21875, "epoch": 0.8248, "grad_norm": 0.8067248471828645, "kl": 0.671875, "learning_rate": 4.12401823284512e-06, "loss": 0.0267, "reward": 1.578125, "reward_std": 0.2867125868797302, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.984375, "step": 1031 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 434.671875, "epoch": 0.8256, "grad_norm": 0.07466081667756266, "kl": 0.10302734375, "learning_rate": 4.122425357516658e-06, "loss": 0.0041, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1032 }, { "all_correct": 0.375, "all_wrong": 0.5, "completion_length": 406.875, "epoch": 0.8264, "grad_norm": 0.8089643320442237, "kl": 1.1171875, "learning_rate": 4.1208313435076255e-06, "loss": 0.0445, "reward": 1.40625, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.96875, "step": 1033 }, { "all_correct": 0.375, "all_wrong": 0.5, "completion_length": 449.1875, "epoch": 0.8272, "grad_norm": 0.42636839515317615, "kl": 0.64453125, "learning_rate": 4.119236191936764e-06, "loss": 0.0258, "reward": 1.453125, "reward_std": 0.09300297498703003, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.984375, "step": 1034 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 400.84375, "epoch": 0.828, "grad_norm": 0.6555441663508101, "kl": 0.251953125, "learning_rate": 4.117639903923611e-06, "loss": 0.0101, "reward": 1.625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1035 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 412.21875, "epoch": 0.8288, "grad_norm": 3.028610298530577, "kl": 0.2578125, "learning_rate": 4.116042480588505e-06, "loss": 0.0103, "reward": 1.578125, "reward_std": 0.2414703369140625, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.984375, "step": 1036 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 451.34375, "epoch": 0.8296, "grad_norm": 0.7112208658020002, "kl": 1.1875, "learning_rate": 4.114443923052577e-06, "loss": 0.0475, "reward": 1.484375, "reward_std": 0.18133097887039185, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.96875, "step": 1037 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 456.03125, "epoch": 0.8304, "grad_norm": 0.9702026427863657, "kl": 0.10986328125, "learning_rate": 4.112844232437757e-06, "loss": 0.0044, "reward": 1.84375, "reward_std": 0.1872510462999344, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1038 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 420.0, "epoch": 0.8312, "grad_norm": 1.0720073881012364, "kl": 1.0703125, "learning_rate": 4.11124340986677e-06, "loss": 0.0427, "reward": 1.578125, "reward_std": 0.4240131676197052, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.96875, "step": 1039 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 417.703125, "epoch": 0.832, "grad_norm": 1.2995545112867135, "kl": 0.58203125, "learning_rate": 4.109641456463135e-06, "loss": 0.0233, "reward": 1.625, "reward_std": 0.31290027499198914, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.984375, "step": 1040 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 442.75, "epoch": 0.8328, "grad_norm": 0.6877738385337426, "kl": 0.2080078125, "learning_rate": 4.108038373351163e-06, "loss": 0.0083, "reward": 1.4375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 1041 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 448.5625, "epoch": 0.8336, "grad_norm": 0.7300017276955302, "kl": 0.24609375, "learning_rate": 4.106434161655962e-06, "loss": 0.0099, "reward": 1.421875, "reward_std": 0.23144522309303284, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 1.0, "step": 1042 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 460.828125, "epoch": 0.8344, "grad_norm": 0.6772855946508758, "kl": 1.609375, "learning_rate": 4.104828822503427e-06, "loss": 0.0648, "reward": 1.484375, "reward_std": 0.2723158895969391, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.953125, "step": 1043 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 444.421875, "epoch": 0.8352, "grad_norm": 0.7554695451244722, "kl": 0.1240234375, "learning_rate": 4.103222357020248e-06, "loss": 0.005, "reward": 1.484375, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 1.0, "step": 1044 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 375.703125, "epoch": 0.836, "grad_norm": 0.8594837473654137, "kl": 0.1875, "learning_rate": 4.101614766333904e-06, "loss": 0.0075, "reward": 1.65625, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 1045 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 492.578125, "epoch": 0.8368, "grad_norm": 1.5062552579782205, "kl": 2.125, "learning_rate": 4.100006051572664e-06, "loss": 0.0848, "reward": 1.421875, "reward_std": 0.2790825366973877, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.9375, "step": 1046 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 451.21875, "epoch": 0.8376, "grad_norm": 0.655316319652123, "kl": 0.66796875, "learning_rate": 4.098396213865587e-06, "loss": 0.0268, "reward": 1.515625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.984375, "step": 1047 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 394.546875, "epoch": 0.8384, "grad_norm": 1.2043472474942574, "kl": 0.12255859375, "learning_rate": 4.096785254342518e-06, "loss": 0.0049, "reward": 1.703125, "reward_std": 0.28460076451301575, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 1048 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 427.515625, "epoch": 0.8392, "grad_norm": 0.9783731884645066, "kl": 2.234375, "learning_rate": 4.095173174134091e-06, "loss": 0.0897, "reward": 1.59375, "reward_std": 0.3618125021457672, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.921875, "step": 1049 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 459.9375, "epoch": 0.84, "grad_norm": 0.3746688535860715, "kl": 0.095703125, "learning_rate": 4.093559974371725e-06, "loss": 0.0038, "reward": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1050 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 482.0, "epoch": 0.8408, "grad_norm": 2.6628157148567038, "kl": 2.78125, "learning_rate": 4.091945656187626e-06, "loss": 0.1114, "reward": 1.78125, "reward_std": 0.42674916982650757, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.921875, "step": 1051 }, { "all_correct": 0.0, "all_wrong": 0.375, "completion_length": 430.953125, "epoch": 0.8416, "grad_norm": 5.829316587830681, "kl": 1.140625, "learning_rate": 4.090330220714785e-06, "loss": 0.0456, "reward": 1.34375, "reward_std": 0.34352827072143555, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 0.953125, "step": 1052 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.296875, "epoch": 0.8424, "grad_norm": 0.9181553115126243, "kl": 2.953125, "learning_rate": 4.0887136690869774e-06, "loss": 0.1181, "reward": 1.6875, "reward_std": 0.3714011013507843, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.90625, "step": 1053 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 444.453125, "epoch": 0.8432, "grad_norm": 0.7858814990477181, "kl": 1.890625, "learning_rate": 4.08709600243876e-06, "loss": 0.0758, "reward": 1.515625, "reward_std": 0.26977968215942383, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.9375, "step": 1054 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 437.28125, "epoch": 0.844, "grad_norm": 1.7486694303914967, "kl": 2.078125, "learning_rate": 4.0854772219054735e-06, "loss": 0.0834, "reward": 1.6875, "reward_std": 0.3807891607284546, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.921875, "step": 1055 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 533.75, "epoch": 0.8448, "grad_norm": 0.8953239164155159, "kl": 3.671875, "learning_rate": 4.083857328623243e-06, "loss": 0.1468, "reward": 1.640625, "reward_std": 0.4467737078666687, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.875, "step": 1056 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 422.921875, "epoch": 0.8456, "grad_norm": 1.298852110723827, "kl": 2.53125, "learning_rate": 4.082236323728969e-06, "loss": 0.1011, "reward": 1.71875, "reward_std": 0.3654222786426544, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.90625, "step": 1057 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 510.84375, "epoch": 0.8464, "grad_norm": 4.666457472223463, "kl": 7.84375, "learning_rate": 4.0806142083603365e-06, "loss": 0.3131, "reward": 1.4375, "reward_std": 0.6150861978530884, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.765625, "step": 1058 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 470.5625, "epoch": 0.8472, "grad_norm": 1.1720509995194253, "kl": 4.15625, "learning_rate": 4.078990983655807e-06, "loss": 0.1665, "reward": 1.546875, "reward_std": 0.4772983193397522, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.859375, "step": 1059 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 389.375, "epoch": 0.848, "grad_norm": 1.1323759184229216, "kl": 1.6328125, "learning_rate": 4.077366650754624e-06, "loss": 0.0653, "reward": 1.765625, "reward_std": 0.3095765709877014, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.9375, "step": 1060 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 494.640625, "epoch": 0.8488, "grad_norm": 0.8141453768595988, "kl": 1.4453125, "learning_rate": 4.075741210796806e-06, "loss": 0.0578, "reward": 1.671875, "reward_std": 0.2834876775741577, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.9375, "step": 1061 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 391.921875, "epoch": 0.8496, "grad_norm": 1.537521701564151, "kl": 2.5, "learning_rate": 4.07411466492315e-06, "loss": 0.1001, "reward": 1.515625, "reward_std": 0.46376699209213257, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.90625, "step": 1062 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 383.0625, "epoch": 0.8504, "grad_norm": 2.096686728399835, "kl": 1.3203125, "learning_rate": 4.072487014275228e-06, "loss": 0.0527, "reward": 1.765625, "reward_std": 0.4027767479419708, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.953125, "step": 1063 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 445.1875, "epoch": 0.8512, "grad_norm": 0.8970506248741964, "kl": 0.62109375, "learning_rate": 4.070858259995388e-06, "loss": 0.0248, "reward": 1.515625, "reward_std": 0.2198973000049591, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.96875, "step": 1064 }, { "all_correct": 0.0, "all_wrong": 0.125, "completion_length": 407.015625, "epoch": 0.852, "grad_norm": 1.889448913986524, "kl": 0.9296875, "learning_rate": 4.069228403226751e-06, "loss": 0.0374, "reward": 1.546875, "reward_std": 0.3776973485946655, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.953125, "step": 1065 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 417.859375, "epoch": 0.8528, "grad_norm": 1.8209643815758436, "kl": 1.03125, "learning_rate": 4.067597445113216e-06, "loss": 0.0413, "reward": 1.859375, "reward_std": 0.25726157426834106, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 0.96875, "step": 1066 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 422.9375, "epoch": 0.8536, "grad_norm": 1.138793212784693, "kl": 0.326171875, "learning_rate": 4.06596538679945e-06, "loss": 0.013, "reward": 1.546875, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.984375, "step": 1067 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 351.546875, "epoch": 0.8544, "grad_norm": 1.319715729705474, "kl": 0.86328125, "learning_rate": 4.064332229430895e-06, "loss": 0.0344, "reward": 1.5, "reward_std": 0.2346404492855072, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.96875, "step": 1068 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 428.375, "epoch": 0.8552, "grad_norm": 1.0128713148904005, "kl": 1.2421875, "learning_rate": 4.062697974153764e-06, "loss": 0.0499, "reward": 1.640625, "reward_std": 0.24831002950668335, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.953125, "step": 1069 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 403.71875, "epoch": 0.856, "grad_norm": 1.2929267924245809, "kl": 0.7578125, "learning_rate": 4.06106262211504e-06, "loss": 0.0302, "reward": 1.6875, "reward_std": 0.2130674123764038, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.984375, "step": 1070 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 405.484375, "epoch": 0.8568, "grad_norm": 1.6960882081379258, "kl": 0.21484375, "learning_rate": 4.059426174462476e-06, "loss": 0.0086, "reward": 1.546875, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 1071 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 412.40625, "epoch": 0.8576, "grad_norm": 0.9100252003673931, "kl": 0.62890625, "learning_rate": 4.057788632344594e-06, "loss": 0.0252, "reward": 1.546875, "reward_std": 0.2993341088294983, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.984375, "step": 1072 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 427.671875, "epoch": 0.8584, "grad_norm": 1.0317845091800215, "kl": 1.28125, "learning_rate": 4.056149996910683e-06, "loss": 0.0513, "reward": 1.6875, "reward_std": 0.23356688022613525, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.984375, "step": 1073 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 398.21875, "epoch": 0.8592, "grad_norm": 1.9072878508109186, "kl": 2.296875, "learning_rate": 4.054510269310803e-06, "loss": 0.0921, "reward": 1.53125, "reward_std": 0.3230288028717041, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.921875, "step": 1074 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 387.25, "epoch": 0.86, "grad_norm": 1.0221055564406814, "kl": 0.126953125, "learning_rate": 4.052869450695776e-06, "loss": 0.0051, "reward": 1.453125, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 1.0, "step": 1075 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 425.875, "epoch": 0.8608, "grad_norm": 0.8024075102370594, "kl": 1.21875, "learning_rate": 4.051227542217192e-06, "loss": 0.0487, "reward": 1.421875, "reward_std": 0.28460076451301575, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.96875, "step": 1076 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 397.484375, "epoch": 0.8616, "grad_norm": 1.0949966164463376, "kl": 0.251953125, "learning_rate": 4.049584545027406e-06, "loss": 0.0101, "reward": 1.625, "reward_std": 0.2756394147872925, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1077 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 460.3125, "epoch": 0.8624, "grad_norm": 1.4031116868993465, "kl": 1.4921875, "learning_rate": 4.047940460279537e-06, "loss": 0.0597, "reward": 1.671875, "reward_std": 0.22673699259757996, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.9375, "step": 1078 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 429.421875, "epoch": 0.8632, "grad_norm": 1.05338284339803, "kl": 0.82421875, "learning_rate": 4.046295289127466e-06, "loss": 0.033, "reward": 1.640625, "reward_std": 0.29826053977012634, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 1079 }, { "all_correct": 0.375, "all_wrong": 0.5, "completion_length": 378.84375, "epoch": 0.864, "grad_norm": 0.5837210666821087, "kl": 0.4296875, "learning_rate": 4.044649032725836e-06, "loss": 0.0172, "reward": 1.421875, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.984375, "step": 1080 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 399.9375, "epoch": 0.8648, "grad_norm": 1.2920251508082814, "kl": 0.890625, "learning_rate": 4.0430016922300566e-06, "loss": 0.0356, "reward": 1.828125, "reward_std": 0.31983357667922974, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.984375, "step": 1081 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 455.78125, "epoch": 0.8656, "grad_norm": 2.0959285204173743, "kl": 1.8359375, "learning_rate": 4.0413532687962926e-06, "loss": 0.0733, "reward": 1.5625, "reward_std": 0.16130642592906952, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.953125, "step": 1082 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 381.296875, "epoch": 0.8664, "grad_norm": 0.6957391888221854, "kl": 0.61328125, "learning_rate": 4.039703763581472e-06, "loss": 0.0245, "reward": 1.53125, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.984375, "step": 1083 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 375.390625, "epoch": 0.8672, "grad_norm": 0.9464861405251479, "kl": 1.21875, "learning_rate": 4.038053177743279e-06, "loss": 0.0489, "reward": 1.453125, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.96875, "step": 1084 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 484.1875, "epoch": 0.868, "grad_norm": 0.622831183756262, "kl": 1.2265625, "learning_rate": 4.036401512440161e-06, "loss": 0.0493, "reward": 1.78125, "reward_std": 0.1735912710428238, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.96875, "step": 1085 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 418.25, "epoch": 0.8688, "grad_norm": 0.8347975194065455, "kl": 2.21875, "learning_rate": 4.034748768831319e-06, "loss": 0.0889, "reward": 1.640625, "reward_std": 0.26977968215942383, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.9375, "step": 1086 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 421.375, "epoch": 0.8696, "grad_norm": 0.7875842265559289, "kl": 0.1220703125, "learning_rate": 4.033094948076713e-06, "loss": 0.0049, "reward": 1.671875, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 1087 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 379.453125, "epoch": 0.8704, "grad_norm": 0.6386806171987647, "kl": 0.126953125, "learning_rate": 4.031440051337056e-06, "loss": 0.0051, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1088 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 414.65625, "epoch": 0.8712, "grad_norm": 2.838155575700541, "kl": 1.78125, "learning_rate": 4.02978407977382e-06, "loss": 0.0713, "reward": 1.5625, "reward_std": 0.2346404492855072, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.953125, "step": 1089 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 409.578125, "epoch": 0.872, "grad_norm": 1.508732730579972, "kl": 1.7578125, "learning_rate": 4.02812703454923e-06, "loss": 0.0701, "reward": 1.5625, "reward_std": 0.1828794628381729, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.953125, "step": 1090 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 472.84375, "epoch": 0.8728, "grad_norm": 1.1816446654777986, "kl": 0.12109375, "learning_rate": 4.026468916826262e-06, "loss": 0.0048, "reward": 1.359375, "reward_std": 0.28930896520614624, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 1.0, "step": 1091 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 454.3125, "epoch": 0.8736, "grad_norm": 0.9523527525690174, "kl": 1.0390625, "learning_rate": 4.024809727768648e-06, "loss": 0.0414, "reward": 1.78125, "reward_std": 0.25513991713523865, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.96875, "step": 1092 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 358.671875, "epoch": 0.8744, "grad_norm": 1.3600026041030495, "kl": 0.140625, "learning_rate": 4.023149468540871e-06, "loss": 0.0056, "reward": 1.625, "reward_std": 0.3104073107242584, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1093 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 376.578125, "epoch": 0.8752, "grad_norm": 0.76887038816068, "kl": 0.57421875, "learning_rate": 4.021488140308165e-06, "loss": 0.0229, "reward": 1.703125, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.984375, "step": 1094 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 448.9375, "epoch": 0.876, "grad_norm": 0.8105513072622308, "kl": 1.0703125, "learning_rate": 4.019825744236514e-06, "loss": 0.0429, "reward": 1.765625, "reward_std": 0.19939783215522766, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.96875, "step": 1095 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 440.015625, "epoch": 0.8768, "grad_norm": 0.7716018449654013, "kl": 0.91796875, "learning_rate": 4.018162281492651e-06, "loss": 0.0367, "reward": 1.84375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.984375, "step": 1096 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 435.078125, "epoch": 0.8776, "grad_norm": 0.6501186507228375, "kl": 1.390625, "learning_rate": 4.016497753244058e-06, "loss": 0.0557, "reward": 1.65625, "reward_std": 0.26658445596694946, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.953125, "step": 1097 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 528.0, "epoch": 0.8784, "grad_norm": 0.5794337582028911, "kl": 0.6875, "learning_rate": 4.014832160658966e-06, "loss": 0.0276, "reward": 1.65625, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.984375, "step": 1098 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 501.40625, "epoch": 0.8792, "grad_norm": 0.8259051532628278, "kl": 0.58984375, "learning_rate": 4.013165504906352e-06, "loss": 0.0235, "reward": 1.640625, "reward_std": 0.2414703369140625, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 1099 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 470.9375, "epoch": 0.88, "grad_norm": 1.1016484370480908, "kl": 0.7578125, "learning_rate": 4.011497787155938e-06, "loss": 0.0303, "reward": 1.765625, "reward_std": 0.19939783215522766, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.96875, "step": 1100 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 442.0, "epoch": 0.8808, "grad_norm": 0.5794692144971305, "kl": 0.123046875, "learning_rate": 4.009829008578192e-06, "loss": 0.0049, "reward": 1.84375, "reward_std": 0.17570312321186066, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1101 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 467.859375, "epoch": 0.8816, "grad_norm": 0.3343264060635989, "kl": 0.12060546875, "learning_rate": 4.00815917034433e-06, "loss": 0.0048, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 1102 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 493.53125, "epoch": 0.8824, "grad_norm": 0.6032848408012575, "kl": 1.4921875, "learning_rate": 4.006488273626307e-06, "loss": 0.0596, "reward": 1.734375, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 1103 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 447.625, "epoch": 0.8832, "grad_norm": 2.1882872072718604, "kl": 0.91796875, "learning_rate": 4.004816319596822e-06, "loss": 0.0368, "reward": 1.703125, "reward_std": 0.27554580569267273, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.96875, "step": 1104 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 440.578125, "epoch": 0.884, "grad_norm": 2.0923907225823064, "kl": 2.28125, "learning_rate": 4.003143309429317e-06, "loss": 0.0909, "reward": 1.515625, "reward_std": 0.48076385259628296, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.890625, "step": 1105 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 458.203125, "epoch": 0.8848, "grad_norm": 1.978736290913115, "kl": 3.390625, "learning_rate": 4.0014692442979756e-06, "loss": 0.1358, "reward": 1.3125, "reward_std": 0.436609148979187, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.90625, "step": 1106 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 461.1875, "epoch": 0.8856, "grad_norm": 6.48888169484247, "kl": 1.234375, "learning_rate": 3.999794125377721e-06, "loss": 0.0494, "reward": 1.640625, "reward_std": 0.3481428921222687, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.96875, "step": 1107 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 484.0625, "epoch": 0.8864, "grad_norm": 2.906234421867136, "kl": 4.71875, "learning_rate": 3.998117953844215e-06, "loss": 0.1884, "reward": 1.34375, "reward_std": 0.4506639540195465, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.84375, "step": 1108 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 436.0625, "epoch": 0.8872, "grad_norm": 1.4255548194518288, "kl": 1.65625, "learning_rate": 3.996440730873861e-06, "loss": 0.0664, "reward": 1.65625, "reward_std": 0.3335031270980835, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.953125, "step": 1109 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 444.609375, "epoch": 0.888, "grad_norm": 1.234066141392723, "kl": 0.76953125, "learning_rate": 3.9947624576437975e-06, "loss": 0.0306, "reward": 1.875, "reward_std": 0.17570312321186066, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1110 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 432.40625, "epoch": 0.8888, "grad_norm": 1.4277767028325805, "kl": 0.90234375, "learning_rate": 3.9930831353319025e-06, "loss": 0.0361, "reward": 1.59375, "reward_std": 0.2346404492855072, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.953125, "step": 1111 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 450.796875, "epoch": 0.8896, "grad_norm": 1.2248055853199278, "kl": 0.369140625, "learning_rate": 3.9914027651167866e-06, "loss": 0.0148, "reward": 1.71875, "reward_std": 0.391263484954834, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 1112 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 488.015625, "epoch": 0.8904, "grad_norm": 1.109925095383779, "kl": 1.1875, "learning_rate": 3.989721348177801e-06, "loss": 0.0477, "reward": 1.765625, "reward_std": 0.19939783215522766, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.953125, "step": 1113 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 391.234375, "epoch": 0.8912, "grad_norm": 1.0836512915420775, "kl": 0.138671875, "learning_rate": 3.988038885695028e-06, "loss": 0.0055, "reward": 1.765625, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1114 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 476.578125, "epoch": 0.892, "grad_norm": 1.9829381449184804, "kl": 1.1015625, "learning_rate": 3.986355378849284e-06, "loss": 0.0439, "reward": 1.671875, "reward_std": 0.27883464097976685, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.953125, "step": 1115 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 400.734375, "epoch": 0.8928, "grad_norm": 0.9301515106093913, "kl": 0.1201171875, "learning_rate": 3.984670828822118e-06, "loss": 0.0048, "reward": 1.65625, "reward_std": 0.19727617502212524, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 1116 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 474.0625, "epoch": 0.8936, "grad_norm": 2.258156989865185, "kl": 1.796875, "learning_rate": 3.982985236795815e-06, "loss": 0.0721, "reward": 1.546875, "reward_std": 0.3276434540748596, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.921875, "step": 1117 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 431.703125, "epoch": 0.8944, "grad_norm": 0.6029594087123232, "kl": 0.130859375, "learning_rate": 3.981298603953385e-06, "loss": 0.0052, "reward": 1.703125, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 1118 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 464.78125, "epoch": 0.8952, "grad_norm": 1.0329750184692497, "kl": 1.6015625, "learning_rate": 3.979610931478574e-06, "loss": 0.0642, "reward": 1.703125, "reward_std": 0.30607038736343384, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.9375, "step": 1119 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 446.453125, "epoch": 0.896, "grad_norm": 0.8947904659942725, "kl": 0.484375, "learning_rate": 3.977922220555855e-06, "loss": 0.0194, "reward": 1.765625, "reward_std": 0.3025038540363312, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.984375, "step": 1120 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 356.34375, "epoch": 0.8968, "grad_norm": 0.8427329686967138, "kl": 1.5546875, "learning_rate": 3.976232472370431e-06, "loss": 0.0622, "reward": 1.46875, "reward_std": 0.25292468070983887, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.953125, "step": 1121 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 447.9375, "epoch": 0.8976, "grad_norm": 2.9962251111735, "kl": 1.3046875, "learning_rate": 3.97454168810823e-06, "loss": 0.0521, "reward": 1.609375, "reward_std": 0.36722296476364136, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.9375, "step": 1122 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 447.40625, "epoch": 0.8984, "grad_norm": 1.6896760435640983, "kl": 2.203125, "learning_rate": 3.972849868955913e-06, "loss": 0.0882, "reward": 1.8125, "reward_std": 0.3945523500442505, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 0.921875, "step": 1123 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 494.34375, "epoch": 0.8992, "grad_norm": 4.102207391039708, "kl": 7.21875, "learning_rate": 3.97115701610086e-06, "loss": 0.2888, "reward": 1.21875, "reward_std": 0.5415065288543701, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.78125, "step": 1124 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 435.03125, "epoch": 0.9, "grad_norm": 0.7286601798437264, "kl": 0.98828125, "learning_rate": 3.969463130731183e-06, "loss": 0.0396, "reward": 1.5625, "reward_std": 0.26409146189689636, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.96875, "step": 1125 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 512.15625, "epoch": 0.9008, "grad_norm": 1.7493609069325222, "kl": 3.3125, "learning_rate": 3.967768214035716e-06, "loss": 0.1325, "reward": 1.421875, "reward_std": 0.4455862045288086, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.84375, "step": 1126 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 483.546875, "epoch": 0.9016, "grad_norm": 0.9227670533354237, "kl": 2.46875, "learning_rate": 3.966072267204014e-06, "loss": 0.099, "reward": 1.703125, "reward_std": 0.4069768190383911, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.921875, "step": 1127 }, { "all_correct": 0.0, "all_wrong": 0.125, "completion_length": 504.484375, "epoch": 0.9024, "grad_norm": 1.7716691686347417, "kl": 4.40625, "learning_rate": 3.964375291426361e-06, "loss": 0.1756, "reward": 1.53125, "reward_std": 0.6183469295501709, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.84375, "step": 1128 }, { "all_correct": 0.0, "all_wrong": 0.125, "completion_length": 435.25, "epoch": 0.9032, "grad_norm": 2.445301463312812, "kl": 5.28125, "learning_rate": 3.962677287893758e-06, "loss": 0.2111, "reward": 1.28125, "reward_std": 0.6274019479751587, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.796875, "step": 1129 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 548.484375, "epoch": 0.904, "grad_norm": 5.123589632294507, "kl": 8.4375, "learning_rate": 3.9609782577979305e-06, "loss": 0.3384, "reward": 1.328125, "reward_std": 0.5813445448875427, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.734375, "step": 1130 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 517.125, "epoch": 0.9048, "grad_norm": 0.9773982278643543, "kl": 2.421875, "learning_rate": 3.959278202331323e-06, "loss": 0.0966, "reward": 1.53125, "reward_std": 0.4171832799911499, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.90625, "step": 1131 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 606.265625, "epoch": 0.9056, "grad_norm": 1.8726346922242276, "kl": 2.734375, "learning_rate": 3.9575771226870986e-06, "loss": 0.1091, "reward": 1.484375, "reward_std": 0.37261295318603516, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.890625, "step": 1132 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 461.75, "epoch": 0.9064, "grad_norm": 1.1884237859088511, "kl": 1.203125, "learning_rate": 3.955875020059141e-06, "loss": 0.0481, "reward": 1.734375, "reward_std": 0.22673699259757996, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.953125, "step": 1133 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 536.765625, "epoch": 0.9072, "grad_norm": 5.028142425015897, "kl": 2.0625, "learning_rate": 3.954171895642052e-06, "loss": 0.0822, "reward": 1.546875, "reward_std": 0.5079944133758545, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.875, "step": 1134 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 481.765625, "epoch": 0.908, "grad_norm": 2.6416702453276626, "kl": 1.671875, "learning_rate": 3.9524677506311505e-06, "loss": 0.067, "reward": 1.59375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.96875, "step": 1135 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 490.15625, "epoch": 0.9088, "grad_norm": 0.9754178084529734, "kl": 1.3046875, "learning_rate": 3.950762586222469e-06, "loss": 0.0521, "reward": 1.5625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.953125, "step": 1136 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 478.46875, "epoch": 0.9096, "grad_norm": 2.0716244414818745, "kl": 0.6484375, "learning_rate": 3.949056403612758e-06, "loss": 0.026, "reward": 1.390625, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.984375, "step": 1137 }, { "all_correct": 0.0, "all_wrong": 0.5, "completion_length": 415.828125, "epoch": 0.9104, "grad_norm": 1.4351468584193703, "kl": 1.078125, "learning_rate": 3.947349203999485e-06, "loss": 0.0431, "reward": 1.21875, "reward_std": 0.2536902129650116, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.96875, "step": 1138 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 393.84375, "epoch": 0.9112, "grad_norm": 1.0885993983624644, "kl": 0.25, "learning_rate": 3.945640988580824e-06, "loss": 0.01, "reward": 1.78125, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1139 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 497.140625, "epoch": 0.912, "grad_norm": 1.481975895271482, "kl": 0.734375, "learning_rate": 3.943931758555669e-06, "loss": 0.0293, "reward": 1.734375, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.96875, "step": 1140 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 488.171875, "epoch": 0.9128, "grad_norm": 1.831247403573637, "kl": 1.484375, "learning_rate": 3.942221515123624e-06, "loss": 0.0595, "reward": 1.625, "reward_std": 0.42873120307922363, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.90625, "step": 1141 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 466.890625, "epoch": 0.9136, "grad_norm": 3.348503762208204, "kl": 0.60546875, "learning_rate": 3.940510259485002e-06, "loss": 0.0242, "reward": 1.5, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.96875, "step": 1142 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 375.828125, "epoch": 0.9144, "grad_norm": 0.6862518615346115, "kl": 0.82421875, "learning_rate": 3.938797992840828e-06, "loss": 0.0328, "reward": 1.53125, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.953125, "step": 1143 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 428.375, "epoch": 0.9152, "grad_norm": 0.8253381131921054, "kl": 0.58984375, "learning_rate": 3.937084716392839e-06, "loss": 0.0236, "reward": 1.5625, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.984375, "step": 1144 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 416.40625, "epoch": 0.916, "grad_norm": 1.4288078021535953, "kl": 1.5234375, "learning_rate": 3.935370431343475e-06, "loss": 0.0611, "reward": 1.59375, "reward_std": 0.28247910737991333, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.953125, "step": 1145 }, { "all_correct": 0.125, "all_wrong": 0.5, "completion_length": 426.421875, "epoch": 0.9168, "grad_norm": 1.3095918335689882, "kl": 1.296875, "learning_rate": 3.933655138895889e-06, "loss": 0.0518, "reward": 1.1875, "reward_std": 0.3014557361602783, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9375, "step": 1146 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 449.078125, "epoch": 0.9176, "grad_norm": 0.5376006623531796, "kl": 0.107421875, "learning_rate": 3.9319388402539395e-06, "loss": 0.0043, "reward": 1.890625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 1147 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 406.859375, "epoch": 0.9184, "grad_norm": 0.9998521561750789, "kl": 0.91796875, "learning_rate": 3.930221536622192e-06, "loss": 0.0366, "reward": 1.625, "reward_std": 0.2346404492855072, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.96875, "step": 1148 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 411.171875, "epoch": 0.9192, "grad_norm": 1.0514708714206056, "kl": 1.484375, "learning_rate": 3.928503229205913e-06, "loss": 0.0595, "reward": 1.796875, "reward_std": 0.22097086906433105, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.9375, "step": 1149 }, { "all_correct": 0.375, "all_wrong": 0.5, "completion_length": 384.34375, "epoch": 0.92, "grad_norm": 1.0901882090105197, "kl": 0.79296875, "learning_rate": 3.92678391921108e-06, "loss": 0.0318, "reward": 1.390625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.984375, "step": 1150 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 388.921875, "epoch": 0.9208, "grad_norm": 0.7060251409446954, "kl": 0.9765625, "learning_rate": 3.92506360784437e-06, "loss": 0.0392, "reward": 1.671875, "reward_std": 0.19044627249240875, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.96875, "step": 1151 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 343.203125, "epoch": 0.9216, "grad_norm": 1.3515674063755896, "kl": 1.1484375, "learning_rate": 3.923342296313162e-06, "loss": 0.0459, "reward": 1.515625, "reward_std": 0.24039676785469055, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.984375, "step": 1152 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 373.1875, "epoch": 0.9224, "grad_norm": 1.4739617557166975, "kl": 2.0625, "learning_rate": 3.92161998582554e-06, "loss": 0.0827, "reward": 1.6875, "reward_std": 0.2470981925725937, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.9375, "step": 1153 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 378.109375, "epoch": 0.9232, "grad_norm": 1.0225942609340664, "kl": 1.671875, "learning_rate": 3.919896677590289e-06, "loss": 0.067, "reward": 1.5625, "reward_std": 0.2925041913986206, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.9375, "step": 1154 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 489.203125, "epoch": 0.924, "grad_norm": 1.2150982355587674, "kl": 1.8359375, "learning_rate": 3.918172372816892e-06, "loss": 0.0734, "reward": 1.59375, "reward_std": 0.4230673909187317, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.953125, "step": 1155 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.546875, "epoch": 0.9248, "grad_norm": 0.8533076568013679, "kl": 1.234375, "learning_rate": 3.916447072715531e-06, "loss": 0.0493, "reward": 1.84375, "reward_std": 0.24608497321605682, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 0.953125, "step": 1156 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 366.921875, "epoch": 0.9256, "grad_norm": 0.6901681181296133, "kl": 1.4453125, "learning_rate": 3.914720778497091e-06, "loss": 0.0578, "reward": 1.625, "reward_std": 0.2130674123764038, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.953125, "step": 1157 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 446.59375, "epoch": 0.9264, "grad_norm": 0.9450905375866617, "kl": 1.484375, "learning_rate": 3.91299349137315e-06, "loss": 0.059, "reward": 1.546875, "reward_std": 0.22707363963127136, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.96875, "step": 1158 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 421.390625, "epoch": 0.9272, "grad_norm": 0.8093811017020308, "kl": 3.0625, "learning_rate": 3.9112652125559845e-06, "loss": 0.122, "reward": 1.390625, "reward_std": 0.329900860786438, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.890625, "step": 1159 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 406.15625, "epoch": 0.928, "grad_norm": 1.2667773493866263, "kl": 1.234375, "learning_rate": 3.909535943258567e-06, "loss": 0.0494, "reward": 1.515625, "reward_std": 0.2777610719203949, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.96875, "step": 1160 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 374.65625, "epoch": 0.9288, "grad_norm": 2.229480230522335, "kl": 2.25, "learning_rate": 3.907805684694567e-06, "loss": 0.0902, "reward": 1.390625, "reward_std": 0.33374619483947754, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.921875, "step": 1161 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 420.296875, "epoch": 0.9296, "grad_norm": 0.9551605600836488, "kl": 3.046875, "learning_rate": 3.906074438078343e-06, "loss": 0.1216, "reward": 1.625, "reward_std": 0.3139738440513611, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.921875, "step": 1162 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 416.234375, "epoch": 0.9304, "grad_norm": 2.1055650505222054, "kl": 1.7578125, "learning_rate": 3.904342204624955e-06, "loss": 0.07, "reward": 1.875, "reward_std": 0.28566452860832214, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.9375, "step": 1163 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 458.703125, "epoch": 0.9312, "grad_norm": 0.5495339289985234, "kl": 0.9453125, "learning_rate": 3.9026089855501475e-06, "loss": 0.0377, "reward": 1.78125, "reward_std": 0.15870985388755798, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.9375, "step": 1164 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 389.625, "epoch": 0.932, "grad_norm": 0.3289399541161126, "kl": 0.12353515625, "learning_rate": 3.900874782070362e-06, "loss": 0.0049, "reward": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1165 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 403.015625, "epoch": 0.9328, "grad_norm": 0.9725077327143433, "kl": 0.68359375, "learning_rate": 3.899139595402729e-06, "loss": 0.0273, "reward": 1.453125, "reward_std": 0.24831002950668335, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.984375, "step": 1166 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 397.109375, "epoch": 0.9336, "grad_norm": 0.5044647084513043, "kl": 0.259765625, "learning_rate": 3.8974034267650695e-06, "loss": 0.0103, "reward": 1.75, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1167 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 390.6875, "epoch": 0.9344, "grad_norm": 0.6203756574718026, "kl": 0.306640625, "learning_rate": 3.895666277375892e-06, "loss": 0.0122, "reward": 1.6875, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.984375, "step": 1168 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 383.328125, "epoch": 0.9352, "grad_norm": 0.8361423511027497, "kl": 0.61328125, "learning_rate": 3.893928148454398e-06, "loss": 0.0245, "reward": 1.59375, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 1169 }, { "all_correct": 0.375, "all_wrong": 0.5, "completion_length": 354.515625, "epoch": 0.936, "grad_norm": 0.39279756261166693, "kl": 0.16796875, "learning_rate": 3.89218904122047e-06, "loss": 0.0067, "reward": 1.4375, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 1170 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 363.59375, "epoch": 0.9368, "grad_norm": 1.205513888683326, "kl": 1.578125, "learning_rate": 3.890448956894682e-06, "loss": 0.063, "reward": 1.453125, "reward_std": 0.22636085748672485, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.921875, "step": 1171 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 342.65625, "epoch": 0.9376, "grad_norm": 1.6243501485485625, "kl": 0.87890625, "learning_rate": 3.888707896698293e-06, "loss": 0.0352, "reward": 1.640625, "reward_std": 0.3359614610671997, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.953125, "step": 1172 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 387.859375, "epoch": 0.9384, "grad_norm": 0.8566248245185764, "kl": 0.89453125, "learning_rate": 3.886965861853243e-06, "loss": 0.0357, "reward": 1.5625, "reward_std": 0.2651650309562683, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.953125, "step": 1173 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 377.796875, "epoch": 0.9392, "grad_norm": 0.7413774914060944, "kl": 0.30859375, "learning_rate": 3.885222853582163e-06, "loss": 0.0123, "reward": 1.6875, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 1174 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 429.1875, "epoch": 0.94, "grad_norm": 0.8688677070068936, "kl": 1.1796875, "learning_rate": 3.88347887310836e-06, "loss": 0.0472, "reward": 1.734375, "reward_std": 0.2902791500091553, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.96875, "step": 1175 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 402.859375, "epoch": 0.9408, "grad_norm": 1.9388823393178904, "kl": 1.1015625, "learning_rate": 3.881733921655829e-06, "loss": 0.0444, "reward": 1.5, "reward_std": 0.2435920089483261, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.953125, "step": 1176 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 433.453125, "epoch": 0.9416, "grad_norm": 1.1395263747176578, "kl": 0.546875, "learning_rate": 3.879988000449243e-06, "loss": 0.0218, "reward": 1.546875, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.984375, "step": 1177 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 399.03125, "epoch": 0.9424, "grad_norm": 1.772426608201281, "kl": 3.0, "learning_rate": 3.878241110713957e-06, "loss": 0.1196, "reward": 1.53125, "reward_std": 0.34023943543434143, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.90625, "step": 1178 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 426.734375, "epoch": 0.9432, "grad_norm": 0.8069962534616799, "kl": 1.015625, "learning_rate": 3.876493253676004e-06, "loss": 0.0408, "reward": 1.734375, "reward_std": 0.28778618574142456, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.96875, "step": 1179 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 394.890625, "epoch": 0.944, "grad_norm": 0.717350065603133, "kl": 0.89453125, "learning_rate": 3.8747444305621e-06, "loss": 0.0358, "reward": 1.8125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.953125, "step": 1180 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 394.625, "epoch": 0.9448, "grad_norm": 0.6645661864927441, "kl": 0.92578125, "learning_rate": 3.872994642599635e-06, "loss": 0.0369, "reward": 1.84375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.96875, "step": 1181 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 444.84375, "epoch": 0.9456, "grad_norm": 0.7144934656162855, "kl": 1.171875, "learning_rate": 3.871243891016676e-06, "loss": 0.0467, "reward": 1.703125, "reward_std": 0.24831002950668335, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.96875, "step": 1182 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 382.265625, "epoch": 0.9464, "grad_norm": 1.5216401243548279, "kl": 2.484375, "learning_rate": 3.869492177041971e-06, "loss": 0.0997, "reward": 1.71875, "reward_std": 0.2924008071422577, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.9375, "step": 1183 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 383.28125, "epoch": 0.9472, "grad_norm": 0.873052809745924, "kl": 1.0, "learning_rate": 3.867739501904938e-06, "loss": 0.0399, "reward": 1.734375, "reward_std": 0.2902791500091553, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.96875, "step": 1184 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 384.640625, "epoch": 0.948, "grad_norm": 0.5752901880466262, "kl": 1.140625, "learning_rate": 3.8659858668356735e-06, "loss": 0.0458, "reward": 1.578125, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.96875, "step": 1185 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 408.046875, "epoch": 0.9488, "grad_norm": 1.9259470342314413, "kl": 2.5, "learning_rate": 3.864231273064944e-06, "loss": 0.0997, "reward": 1.484375, "reward_std": 0.25726157426834106, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.9375, "step": 1186 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.40625, "epoch": 0.9496, "grad_norm": 0.7736666345822933, "kl": 0.1123046875, "learning_rate": 3.862475721824193e-06, "loss": 0.0045, "reward": 1.921875, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 1187 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 365.296875, "epoch": 0.9504, "grad_norm": 0.4368484960802902, "kl": 0.1650390625, "learning_rate": 3.8607192143455325e-06, "loss": 0.0066, "reward": 1.90625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 1188 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 403.5625, "epoch": 0.9512, "grad_norm": 1.2212503251152664, "kl": 0.5859375, "learning_rate": 3.858961751861748e-06, "loss": 0.0236, "reward": 1.53125, "reward_std": 0.25513991713523865, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.984375, "step": 1189 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 431.703125, "epoch": 0.952, "grad_norm": 0.8227885381713197, "kl": 0.52734375, "learning_rate": 3.857203335606294e-06, "loss": 0.0211, "reward": 1.640625, "reward_std": 0.32407689094543457, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.96875, "step": 1190 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 448.828125, "epoch": 0.9528, "grad_norm": 0.3710378790969126, "kl": 0.796875, "learning_rate": 3.855443966813295e-06, "loss": 0.0319, "reward": 1.84375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.984375, "step": 1191 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 373.046875, "epoch": 0.9536, "grad_norm": 0.8855073880304769, "kl": 0.412109375, "learning_rate": 3.853683646717543e-06, "loss": 0.0165, "reward": 1.703125, "reward_std": 0.1530819833278656, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 1192 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 400.296875, "epoch": 0.9544, "grad_norm": 0.7794909961979359, "kl": 1.2890625, "learning_rate": 3.8519223765544985e-06, "loss": 0.0517, "reward": 1.625, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.96875, "step": 1193 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 395.9375, "epoch": 0.9552, "grad_norm": 0.6807966086632076, "kl": 1.25, "learning_rate": 3.85016015756029e-06, "loss": 0.0499, "reward": 1.4375, "reward_std": 0.2130674123764038, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.96875, "step": 1194 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 369.875, "epoch": 0.956, "grad_norm": 0.8135046009324578, "kl": 0.62109375, "learning_rate": 3.848396990971709e-06, "loss": 0.0249, "reward": 1.453125, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.984375, "step": 1195 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 397.5, "epoch": 0.9568, "grad_norm": 4.325337691463792, "kl": 0.8125, "learning_rate": 3.846632878026214e-06, "loss": 0.0326, "reward": 1.453125, "reward_std": 0.1889965683221817, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.96875, "step": 1196 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 404.125, "epoch": 0.9576, "grad_norm": 1.0605527119395879, "kl": 1.8125, "learning_rate": 3.844867819961928e-06, "loss": 0.0726, "reward": 1.671875, "reward_std": 0.24831002950668335, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.953125, "step": 1197 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 356.5625, "epoch": 0.9584, "grad_norm": 2.5066496244472956, "kl": 0.11865234375, "learning_rate": 3.843101818017637e-06, "loss": 0.0047, "reward": 1.703125, "reward_std": 0.1804211586713791, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 1198 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 372.375, "epoch": 0.9592, "grad_norm": 0.3707204148434657, "kl": 0.11376953125, "learning_rate": 3.841334873432789e-06, "loss": 0.0046, "reward": 1.734375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1199 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 416.296875, "epoch": 0.96, "grad_norm": 1.3171464037408456, "kl": 1.359375, "learning_rate": 3.839566987447492e-06, "loss": 0.0543, "reward": 1.609375, "reward_std": 0.28778618574142456, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.96875, "step": 1200 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 396.109375, "epoch": 0.9608, "grad_norm": 0.7324569740101239, "kl": 0.1220703125, "learning_rate": 3.837798161302518e-06, "loss": 0.0049, "reward": 1.421875, "reward_std": 0.1804211586713791, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 1.0, "step": 1201 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 443.875, "epoch": 0.9616, "grad_norm": 0.4050516138647568, "kl": 0.7578125, "learning_rate": 3.836028396239297e-06, "loss": 0.0303, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 1202 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 340.453125, "epoch": 0.9624, "grad_norm": 2.007072847142048, "kl": 0.453125, "learning_rate": 3.8342576934999184e-06, "loss": 0.0182, "reward": 1.390625, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 1.0, "step": 1203 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 410.234375, "epoch": 0.9632, "grad_norm": 0.6148437659824656, "kl": 0.30859375, "learning_rate": 3.832486054327131e-06, "loss": 0.0124, "reward": 1.609375, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.984375, "step": 1204 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 410.921875, "epoch": 0.964, "grad_norm": 1.613091575087136, "kl": 0.91796875, "learning_rate": 3.830713479964335e-06, "loss": 0.0367, "reward": 1.796875, "reward_std": 0.1530819833278656, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.984375, "step": 1205 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 390.65625, "epoch": 0.9648, "grad_norm": 1.0370096843423675, "kl": 1.890625, "learning_rate": 3.828939971655595e-06, "loss": 0.0757, "reward": 1.609375, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.953125, "step": 1206 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 424.71875, "epoch": 0.9656, "grad_norm": 0.7326482459614472, "kl": 1.9375, "learning_rate": 3.827165530645627e-06, "loss": 0.0773, "reward": 1.59375, "reward_std": 0.2651650309562683, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.953125, "step": 1207 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 397.375, "epoch": 0.9664, "grad_norm": 0.5215922072444026, "kl": 0.384765625, "learning_rate": 3.825390158179802e-06, "loss": 0.0153, "reward": 1.515625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 1.0, "step": 1208 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 391.109375, "epoch": 0.9672, "grad_norm": 0.9092113219446152, "kl": 0.6171875, "learning_rate": 3.823613855504144e-06, "loss": 0.0247, "reward": 1.59375, "reward_std": 0.2041158527135849, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.984375, "step": 1209 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.453125, "epoch": 0.968, "grad_norm": 0.10844602723473387, "kl": 0.1103515625, "learning_rate": 3.82183662386533e-06, "loss": 0.0044, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1210 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 365.171875, "epoch": 0.9688, "grad_norm": 0.5927214106596028, "kl": 0.1943359375, "learning_rate": 3.82005846451069e-06, "loss": 0.0078, "reward": 1.5625, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 1211 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 398.21875, "epoch": 0.9696, "grad_norm": 4.644003560394865, "kl": 0.29296875, "learning_rate": 3.8182793786882065e-06, "loss": 0.0117, "reward": 1.65625, "reward_std": 0.25513994693756104, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.984375, "step": 1212 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 369.546875, "epoch": 0.9704, "grad_norm": 0.7984385845301895, "kl": 0.11767578125, "learning_rate": 3.816499367646508e-06, "loss": 0.0047, "reward": 1.75, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1213 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 409.078125, "epoch": 0.9712, "grad_norm": 0.5037814885514016, "kl": 0.18359375, "learning_rate": 3.814718432634877e-06, "loss": 0.0074, "reward": 1.578125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 1214 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 386.921875, "epoch": 0.972, "grad_norm": 0.5793986664724105, "kl": 0.10400390625, "learning_rate": 3.8129365749032398e-06, "loss": 0.0042, "reward": 1.5625, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 1215 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 420.125, "epoch": 0.9728, "grad_norm": 1.0724754037429123, "kl": 1.2421875, "learning_rate": 3.8111537957021736e-06, "loss": 0.0499, "reward": 1.625, "reward_std": 0.37722259759902954, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.96875, "step": 1216 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 450.171875, "epoch": 0.9736, "grad_norm": 2.094546656882904, "kl": 1.375, "learning_rate": 3.809370096282903e-06, "loss": 0.0551, "reward": 1.703125, "reward_std": 0.23925508558750153, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.96875, "step": 1217 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 444.296875, "epoch": 0.9744, "grad_norm": 0.8850136039652126, "kl": 0.5546875, "learning_rate": 3.807585477897296e-06, "loss": 0.0222, "reward": 1.640625, "reward_std": 0.22636085748672485, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 1218 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 411.0, "epoch": 0.9752, "grad_norm": 1.0741412183704828, "kl": 0.1201171875, "learning_rate": 3.8057999417978654e-06, "loss": 0.0048, "reward": 1.59375, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 1219 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 322.921875, "epoch": 0.976, "grad_norm": 0.7900524943733731, "kl": 0.1328125, "learning_rate": 3.8040134892377702e-06, "loss": 0.0053, "reward": 1.546875, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 1220 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 387.71875, "epoch": 0.9768, "grad_norm": 0.082133830199952, "kl": 0.1162109375, "learning_rate": 3.802226121470811e-06, "loss": 0.0046, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1221 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 386.6875, "epoch": 0.9776, "grad_norm": 0.7853953238612952, "kl": 0.11572265625, "learning_rate": 3.800437839751432e-06, "loss": 0.0046, "reward": 1.734375, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1222 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 419.125, "epoch": 0.9784, "grad_norm": 0.7374526921125558, "kl": 0.1201171875, "learning_rate": 3.7986486453347183e-06, "loss": 0.0048, "reward": 1.8125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1223 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 425.6875, "epoch": 0.9792, "grad_norm": 1.971344763280152, "kl": 0.140625, "learning_rate": 3.796858539476394e-06, "loss": 0.0056, "reward": 1.65625, "reward_std": 0.25513991713523865, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.984375, "step": 1224 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 356.96875, "epoch": 0.98, "grad_norm": 0.43392110846331944, "kl": 0.1240234375, "learning_rate": 3.795067523432826e-06, "loss": 0.005, "reward": 1.84375, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1225 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 373.59375, "epoch": 0.9808, "grad_norm": 2.595408737762125, "kl": 0.228515625, "learning_rate": 3.793275598461017e-06, "loss": 0.0091, "reward": 1.40625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 1.0, "step": 1226 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 354.1875, "epoch": 0.9816, "grad_norm": 0.6044362340878098, "kl": 0.12255859375, "learning_rate": 3.7914827658186104e-06, "loss": 0.0049, "reward": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1227 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.46875, "epoch": 0.9824, "grad_norm": 0.9164506072852291, "kl": 0.1123046875, "learning_rate": 3.7896890267638832e-06, "loss": 0.0045, "reward": 1.65625, "reward_std": 0.2177756428718567, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 1228 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 464.078125, "epoch": 0.9832, "grad_norm": 1.060333204050679, "kl": 0.8046875, "learning_rate": 3.787894382555752e-06, "loss": 0.0322, "reward": 1.671875, "reward_std": 0.17782479524612427, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.984375, "step": 1229 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 402.6875, "epoch": 0.984, "grad_norm": 0.8099401116907572, "kl": 0.31640625, "learning_rate": 3.7860988344537664e-06, "loss": 0.0126, "reward": 1.625, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.984375, "step": 1230 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 402.390625, "epoch": 0.9848, "grad_norm": 0.48940393479073535, "kl": 0.1220703125, "learning_rate": 3.7843023837181126e-06, "loss": 0.0049, "reward": 1.515625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 1.0, "step": 1231 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 376.8125, "epoch": 0.9856, "grad_norm": 0.5796471326337517, "kl": 0.68359375, "learning_rate": 3.782505031609607e-06, "loss": 0.0273, "reward": 1.421875, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.984375, "step": 1232 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 378.828125, "epoch": 0.9864, "grad_norm": 1.1469455017093382, "kl": 0.466796875, "learning_rate": 3.7807067793897006e-06, "loss": 0.0188, "reward": 1.765625, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.984375, "step": 1233 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 407.640625, "epoch": 0.9872, "grad_norm": 0.8698725511170176, "kl": 0.33984375, "learning_rate": 3.778907628320477e-06, "loss": 0.0136, "reward": 1.4375, "reward_std": 0.1911182403564453, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.984375, "step": 1234 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 370.078125, "epoch": 0.988, "grad_norm": 0.7152895003564234, "kl": 0.1396484375, "learning_rate": 3.77710757966465e-06, "loss": 0.0056, "reward": 1.796875, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1235 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 374.84375, "epoch": 0.9888, "grad_norm": 0.09151468916312608, "kl": 0.12060546875, "learning_rate": 3.775306634685562e-06, "loss": 0.0048, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1236 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 389.5625, "epoch": 0.9896, "grad_norm": 0.8930270892171244, "kl": 0.2119140625, "learning_rate": 3.773504794647187e-06, "loss": 0.0085, "reward": 1.546875, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 1237 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 390.0625, "epoch": 0.9904, "grad_norm": 0.4402466302262891, "kl": 0.12890625, "learning_rate": 3.771702060814123e-06, "loss": 0.0052, "reward": 1.546875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 1238 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 374.390625, "epoch": 0.9912, "grad_norm": 0.8015938392825657, "kl": 0.126953125, "learning_rate": 3.7698984344516e-06, "loss": 0.0051, "reward": 1.59375, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 1239 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 317.71875, "epoch": 0.992, "grad_norm": 2.6897329924487776, "kl": 0.1396484375, "learning_rate": 3.7680939168254733e-06, "loss": 0.0056, "reward": 1.609375, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 1240 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 381.171875, "epoch": 0.9928, "grad_norm": 0.8108734422973586, "kl": 0.1259765625, "learning_rate": 3.7662885092022206e-06, "loss": 0.005, "reward": 1.765625, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.984375, "step": 1241 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 423.1875, "epoch": 0.9936, "grad_norm": 0.7538924663679275, "kl": 1.5078125, "learning_rate": 3.7644822128489476e-06, "loss": 0.0603, "reward": 1.5625, "reward_std": 0.2619796097278595, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.953125, "step": 1242 }, { "all_correct": 0.375, "all_wrong": 0.5, "completion_length": 380.328125, "epoch": 0.9944, "grad_norm": 0.4142326854937394, "kl": 0.123046875, "learning_rate": 3.7626750290333824e-06, "loss": 0.0049, "reward": 1.40625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 1.0, "step": 1243 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 431.875, "epoch": 0.9952, "grad_norm": 1.9432831031618978, "kl": 0.44921875, "learning_rate": 3.7608669590238765e-06, "loss": 0.0179, "reward": 1.6875, "reward_std": 0.31078842282295227, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.96875, "step": 1244 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 398.875, "epoch": 0.996, "grad_norm": 1.7965152421438884, "kl": 0.78515625, "learning_rate": 3.7590580040894025e-06, "loss": 0.0315, "reward": 1.5625, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.96875, "step": 1245 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 355.953125, "epoch": 0.9968, "grad_norm": 0.8301550724750443, "kl": 0.6015625, "learning_rate": 3.7572481654995554e-06, "loss": 0.0241, "reward": 1.75, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.984375, "step": 1246 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 394.671875, "epoch": 0.9976, "grad_norm": 0.9261270003987451, "kl": 0.1435546875, "learning_rate": 3.755437444524548e-06, "loss": 0.0057, "reward": 1.75, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1247 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 415.3125, "epoch": 0.9984, "grad_norm": 0.678962993990223, "kl": 0.5, "learning_rate": 3.7536258424352164e-06, "loss": 0.0201, "reward": 1.390625, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.984375, "step": 1248 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 369.421875, "epoch": 0.9992, "grad_norm": 1.117264262366207, "kl": 0.8828125, "learning_rate": 3.75181336050301e-06, "loss": 0.0353, "reward": 1.828125, "reward_std": 0.22097086906433105, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.96875, "step": 1249 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 417.53125, "epoch": 1.0, "grad_norm": 0.3650120709042497, "kl": 0.1396484375, "learning_rate": 3.7500000000000005e-06, "loss": 0.0056, "reward": 1.53125, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 1250 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 410.21875, "epoch": 1.0008, "grad_norm": 0.5572048293663555, "kl": 0.1396484375, "learning_rate": 3.7481857621988734e-06, "loss": 0.0056, "reward": 1.71875, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1251 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 383.171875, "epoch": 1.0016, "grad_norm": 0.9908764499175217, "kl": 0.1884765625, "learning_rate": 3.74637064837293e-06, "loss": 0.0075, "reward": 1.4375, "reward_std": 0.1872510462999344, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 1252 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 388.265625, "epoch": 1.0024, "grad_norm": 1.239940799707091, "kl": 1.1640625, "learning_rate": 3.7445546597960882e-06, "loss": 0.0466, "reward": 1.78125, "reward_std": 0.3413130044937134, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.96875, "step": 1253 }, { "all_correct": 0.0, "all_wrong": 0.125, "completion_length": 392.25, "epoch": 1.0032, "grad_norm": 1.2157593225154961, "kl": 1.484375, "learning_rate": 3.742737797742878e-06, "loss": 0.0591, "reward": 1.40625, "reward_std": 0.5019016861915588, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.953125, "step": 1254 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 407.578125, "epoch": 1.004, "grad_norm": 0.9881956568631687, "kl": 1.1953125, "learning_rate": 3.7409200634884425e-06, "loss": 0.0482, "reward": 1.671875, "reward_std": 0.3107786178588867, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.96875, "step": 1255 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 437.140625, "epoch": 1.0048, "grad_norm": 2.8231668758499793, "kl": 4.0, "learning_rate": 3.7391014583085384e-06, "loss": 0.1607, "reward": 1.734375, "reward_std": 0.39445874094963074, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.921875, "step": 1256 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 459.890625, "epoch": 1.0056, "grad_norm": 4.6616832371486545, "kl": 3.921875, "learning_rate": 3.737281983479534e-06, "loss": 0.1573, "reward": 1.65625, "reward_std": 0.35262539982795715, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.921875, "step": 1257 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 427.953125, "epoch": 1.0064, "grad_norm": 0.2940974042507545, "kl": 0.2392578125, "learning_rate": 3.735461640278404e-06, "loss": 0.0096, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1258 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 462.625, "epoch": 1.0072, "grad_norm": 0.587074373271045, "kl": 0.1572265625, "learning_rate": 3.733640429982738e-06, "loss": 0.0063, "reward": 1.6875, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 1259 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 430.328125, "epoch": 1.008, "grad_norm": 0.7864857509468988, "kl": 1.640625, "learning_rate": 3.731818353870729e-06, "loss": 0.0656, "reward": 1.84375, "reward_std": 0.2709311544895172, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 0.953125, "step": 1260 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 463.234375, "epoch": 1.0088, "grad_norm": 0.9842190799681781, "kl": 0.123046875, "learning_rate": 3.729995413221183e-06, "loss": 0.0049, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1261 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 421.328125, "epoch": 1.0096, "grad_norm": 7.878004543869069, "kl": 0.271484375, "learning_rate": 3.7281716093135068e-06, "loss": 0.0109, "reward": 1.734375, "reward_std": 0.17358146607875824, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1262 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 416.84375, "epoch": 1.0104, "grad_norm": 4.849516653718158, "kl": 1.046875, "learning_rate": 3.726346943427719e-06, "loss": 0.0418, "reward": 1.703125, "reward_std": 0.26621314883232117, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.953125, "step": 1263 }, { "all_correct": 0.0, "all_wrong": 0.375, "completion_length": 559.78125, "epoch": 1.0112, "grad_norm": 33.10205014775728, "kl": 27.0, "learning_rate": 3.7245214168444388e-06, "loss": 1.081, "reward": 0.6875, "reward_std": 0.5918154716491699, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.125, "step": 1264 }, { "all_correct": 0.0, "all_wrong": 0.375, "completion_length": 499.515625, "epoch": 1.012, "grad_norm": 22.709782412189966, "kl": 17.875, "learning_rate": 3.722695030844891e-06, "loss": 0.7162, "reward": 1.046875, "reward_std": 0.6453423500061035, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.296875, "step": 1265 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 480.21875, "epoch": 1.0128, "grad_norm": 14.901755317161799, "kl": 10.875, "learning_rate": 3.7208677867109042e-06, "loss": 0.4345, "reward": 0.953125, "reward_std": 0.527733564376831, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.453125, "step": 1266 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 460.578125, "epoch": 1.0136, "grad_norm": 6.68621373718699, "kl": 3.0, "learning_rate": 3.7190396857249087e-06, "loss": 0.1202, "reward": 1.4375, "reward_std": 0.5098880529403687, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 0.484375, "step": 1267 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 430.765625, "epoch": 1.0144, "grad_norm": 1.1857603865634236, "kl": 0.16015625, "learning_rate": 3.7172107291699356e-06, "loss": 0.0064, "reward": 1.28125, "reward_std": 0.4535999596118927, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.5, "step": 1268 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 524.328125, "epoch": 1.0152, "grad_norm": 1.9815516032068903, "kl": 0.134765625, "learning_rate": 3.7153809183296174e-06, "loss": 0.0054, "reward": 1.609375, "reward_std": 0.434487521648407, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 0.625, "step": 1269 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 378.78125, "epoch": 1.016, "grad_norm": 1.1517316386147365, "kl": 0.1376953125, "learning_rate": 3.713550254488185e-06, "loss": 0.0055, "reward": 1.59375, "reward_std": 0.328794926404953, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.859375, "step": 1270 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 402.640625, "epoch": 1.0168, "grad_norm": 0.725989870887737, "kl": 0.126953125, "learning_rate": 3.7117187389304703e-06, "loss": 0.0051, "reward": 1.8125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.9375, "step": 1271 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 471.40625, "epoch": 1.0176, "grad_norm": 0.3711125652334141, "kl": 0.1162109375, "learning_rate": 3.7098863729418997e-06, "loss": 0.0046, "reward": 1.84375, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1272 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 436.203125, "epoch": 1.0184, "grad_norm": 0.6912563396133344, "kl": 0.1328125, "learning_rate": 3.7080531578085e-06, "loss": 0.0053, "reward": 1.859375, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1273 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 436.625, "epoch": 1.0192, "grad_norm": 0.4707630040511406, "kl": 0.1357421875, "learning_rate": 3.7062190948168906e-06, "loss": 0.0054, "reward": 1.84375, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.984375, "step": 1274 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 423.828125, "epoch": 1.02, "grad_norm": 0.630313746945571, "kl": 0.1357421875, "learning_rate": 3.7043841852542884e-06, "loss": 0.0054, "reward": 1.796875, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1275 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 433.421875, "epoch": 1.0208, "grad_norm": 0.5010734063741127, "kl": 0.1357421875, "learning_rate": 3.7025484304085035e-06, "loss": 0.0054, "reward": 1.71875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1276 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 433.359375, "epoch": 1.0216, "grad_norm": 1.054618400071382, "kl": 0.1337890625, "learning_rate": 3.7007118315679384e-06, "loss": 0.0054, "reward": 1.875, "reward_std": 0.1243029236793518, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.96875, "step": 1277 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 405.234375, "epoch": 1.0224, "grad_norm": 0.573816283179805, "kl": 0.14453125, "learning_rate": 3.6988743900215895e-06, "loss": 0.0058, "reward": 1.625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1278 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 390.015625, "epoch": 1.0232, "grad_norm": 16.83587370066281, "kl": 203.0, "learning_rate": 3.6970361070590443e-06, "loss": 8.1376, "reward": 1.640625, "reward_std": 0.19044628739356995, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 1279 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 470.953125, "epoch": 1.024, "grad_norm": 0.7086540648129875, "kl": 0.1181640625, "learning_rate": 3.695196983970481e-06, "loss": 0.0047, "reward": 1.59375, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 1280 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 473.640625, "epoch": 1.0248, "grad_norm": 0.5542911690599307, "kl": 0.1455078125, "learning_rate": 3.6933570220466654e-06, "loss": 0.0058, "reward": 1.71875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1281 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 362.046875, "epoch": 1.0256, "grad_norm": 0.389045624925544, "kl": 0.13671875, "learning_rate": 3.6915162225789546e-06, "loss": 0.0055, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1282 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 422.015625, "epoch": 1.0264, "grad_norm": 1.3551429041345482, "kl": 0.138671875, "learning_rate": 3.6896745868592924e-06, "loss": 0.0056, "reward": 1.703125, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 1283 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 435.8125, "epoch": 1.0272, "grad_norm": 0.18925265515795672, "kl": 0.11328125, "learning_rate": 3.6878321161802106e-06, "loss": 0.0045, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1284 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 428.203125, "epoch": 1.028, "grad_norm": 0.9443154827084691, "kl": 0.1259765625, "learning_rate": 3.685988811834823e-06, "loss": 0.0051, "reward": 1.828125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.984375, "step": 1285 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 450.109375, "epoch": 1.0288, "grad_norm": 0.6444994814202532, "kl": 0.1328125, "learning_rate": 3.684144675116836e-06, "loss": 0.0053, "reward": 1.796875, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1286 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 389.171875, "epoch": 1.0296, "grad_norm": 0.4733455270190008, "kl": 0.1259765625, "learning_rate": 3.682299707320532e-06, "loss": 0.005, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.984375, "step": 1287 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 374.421875, "epoch": 1.0304, "grad_norm": 0.9509859105214624, "kl": 0.1455078125, "learning_rate": 3.680453909740782e-06, "loss": 0.0058, "reward": 1.578125, "reward_std": 0.19044628739356995, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 1288 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 382.890625, "epoch": 1.0312, "grad_norm": 0.7783995301911126, "kl": 0.1376953125, "learning_rate": 3.6786072836730376e-06, "loss": 0.0055, "reward": 1.6875, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 1289 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 346.53125, "epoch": 1.032, "grad_norm": 0.6724494774107679, "kl": 0.1494140625, "learning_rate": 3.6767598304133325e-06, "loss": 0.006, "reward": 1.5625, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.984375, "step": 1290 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 406.1875, "epoch": 1.0328, "grad_norm": 0.11063345904636407, "kl": 0.1259765625, "learning_rate": 3.674911551258279e-06, "loss": 0.005, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1291 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 401.203125, "epoch": 1.0336, "grad_norm": 0.12315637894265835, "kl": 0.1240234375, "learning_rate": 3.673062447505072e-06, "loss": 0.005, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1292 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 358.796875, "epoch": 1.0344, "grad_norm": 1.0541213420282047, "kl": 0.12890625, "learning_rate": 3.6712125204514836e-06, "loss": 0.0051, "reward": 1.78125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1293 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.5625, "epoch": 1.0352, "grad_norm": 1.1660069524803196, "kl": 0.1318359375, "learning_rate": 3.6693617713958633e-06, "loss": 0.0053, "reward": 1.59375, "reward_std": 0.19727617502212524, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 1294 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 389.65625, "epoch": 1.036, "grad_norm": 0.5864727282579134, "kl": 0.1337890625, "learning_rate": 3.6675102016371387e-06, "loss": 0.0053, "reward": 1.828125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1295 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 379.40625, "epoch": 1.0368, "grad_norm": 0.43355442362640784, "kl": 0.12109375, "learning_rate": 3.665657812474812e-06, "loss": 0.0049, "reward": 1.796875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1296 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 386.125, "epoch": 1.0376, "grad_norm": 0.0828289450261741, "kl": 0.1201171875, "learning_rate": 3.6638046052089614e-06, "loss": 0.0048, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1297 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.5625, "epoch": 1.0384, "grad_norm": 0.08981516615883395, "kl": 0.1298828125, "learning_rate": 3.661950581140239e-06, "loss": 0.0052, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1298 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 376.078125, "epoch": 1.0392, "grad_norm": 1.1288553696182786, "kl": 0.130859375, "learning_rate": 3.660095741569871e-06, "loss": 0.0053, "reward": 1.703125, "reward_std": 0.16887323558330536, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 1299 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 372.71875, "epoch": 1.04, "grad_norm": 1.117937867759022, "kl": 0.1474609375, "learning_rate": 3.658240087799655e-06, "loss": 0.0059, "reward": 1.8125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1300 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 382.265625, "epoch": 1.0408, "grad_norm": 0.3479631908733468, "kl": 0.1318359375, "learning_rate": 3.6563836211319593e-06, "loss": 0.0053, "reward": 1.609375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 1301 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 381.65625, "epoch": 1.0416, "grad_norm": 0.4921254026277044, "kl": 0.12060546875, "learning_rate": 3.654526342869724e-06, "loss": 0.0048, "reward": 1.921875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 1302 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 382.046875, "epoch": 1.0424, "grad_norm": 0.08843345511752333, "kl": 0.12060546875, "learning_rate": 3.65266825431646e-06, "loss": 0.0048, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1303 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 412.859375, "epoch": 1.0432, "grad_norm": 0.41148349370278325, "kl": 0.1220703125, "learning_rate": 3.6508093567762425e-06, "loss": 0.0049, "reward": 1.640625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1304 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 433.359375, "epoch": 1.044, "grad_norm": 0.7053228415897961, "kl": 0.123046875, "learning_rate": 3.6489496515537204e-06, "loss": 0.0049, "reward": 1.796875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1305 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 369.796875, "epoch": 1.0448, "grad_norm": 2.9345924019689282, "kl": 0.12255859375, "learning_rate": 3.647089139954104e-06, "loss": 0.0049, "reward": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1306 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 406.328125, "epoch": 1.0456, "grad_norm": 0.46572528117117695, "kl": 0.12451171875, "learning_rate": 3.6452278232831734e-06, "loss": 0.005, "reward": 1.96875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 1307 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.1875, "epoch": 1.0464, "grad_norm": 0.5515119208835106, "kl": 0.12255859375, "learning_rate": 3.643365702847272e-06, "loss": 0.0049, "reward": 1.875, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1308 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 416.828125, "epoch": 1.0472, "grad_norm": 0.11326186521321348, "kl": 0.130859375, "learning_rate": 3.641502779953307e-06, "loss": 0.0053, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1309 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 374.5, "epoch": 1.048, "grad_norm": 0.4224649270374836, "kl": 0.1318359375, "learning_rate": 3.639639055908751e-06, "loss": 0.0053, "reward": 1.640625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1310 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 399.78125, "epoch": 1.0488, "grad_norm": 0.31213689183525883, "kl": 0.12158203125, "learning_rate": 3.6377745320216346e-06, "loss": 0.0049, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1311 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 346.671875, "epoch": 1.0496, "grad_norm": 0.8880636818688568, "kl": 0.1279296875, "learning_rate": 3.635909209600555e-06, "loss": 0.0051, "reward": 1.59375, "reward_std": 0.23356688022613525, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 1312 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 388.640625, "epoch": 1.0504, "grad_norm": 0.8555766336251143, "kl": 0.1396484375, "learning_rate": 3.6340430899546656e-06, "loss": 0.0056, "reward": 1.859375, "reward_std": 0.17358146607875824, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1313 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 393.46875, "epoch": 1.0512, "grad_norm": 0.6914240310267927, "kl": 0.11962890625, "learning_rate": 3.632176174393682e-06, "loss": 0.0048, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1314 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 408.9375, "epoch": 1.052, "grad_norm": 0.08793480129649127, "kl": 0.1328125, "learning_rate": 3.630308464227877e-06, "loss": 0.0053, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1315 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 354.28125, "epoch": 1.0528, "grad_norm": 0.11135048815683783, "kl": 0.13671875, "learning_rate": 3.628439960768082e-06, "loss": 0.0055, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1316 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 339.1875, "epoch": 1.0536, "grad_norm": 0.12706180835891803, "kl": 0.1357421875, "learning_rate": 3.6265706653256837e-06, "loss": 0.0054, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1317 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 434.4375, "epoch": 1.0544, "grad_norm": 0.4226628009029332, "kl": 0.12255859375, "learning_rate": 3.624700579212626e-06, "loss": 0.0049, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1318 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 378.875, "epoch": 1.0552, "grad_norm": 0.6815461611812511, "kl": 0.11328125, "learning_rate": 3.6228297037414077e-06, "loss": 0.0045, "reward": 1.640625, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1319 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 446.6875, "epoch": 1.056, "grad_norm": 0.5380691915171303, "kl": 0.11328125, "learning_rate": 3.6209580402250816e-06, "loss": 0.0045, "reward": 1.8125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1320 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 374.359375, "epoch": 1.0568, "grad_norm": 0.1148843685975855, "kl": 0.12255859375, "learning_rate": 3.619085589977251e-06, "loss": 0.0049, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1321 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 377.40625, "epoch": 1.0576, "grad_norm": 0.4208076770387228, "kl": 0.1259765625, "learning_rate": 3.617212354312076e-06, "loss": 0.005, "reward": 1.640625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1322 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 387.296875, "epoch": 1.0584, "grad_norm": 0.07878249078730427, "kl": 0.11962890625, "learning_rate": 3.615338334544265e-06, "loss": 0.0048, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1323 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 357.015625, "epoch": 1.0592, "grad_norm": 0.8057658584326876, "kl": 0.1318359375, "learning_rate": 3.6134635319890763e-06, "loss": 0.0053, "reward": 1.71875, "reward_std": 0.17570313811302185, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1324 }, { "all_correct": 0.375, "all_wrong": 0.5, "completion_length": 395.6875, "epoch": 1.06, "grad_norm": 0.4294003205075124, "kl": 0.1162109375, "learning_rate": 3.611587947962319e-06, "loss": 0.0047, "reward": 1.390625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 1.0, "step": 1325 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.578125, "epoch": 1.0608, "grad_norm": 0.7194137607156134, "kl": 0.1259765625, "learning_rate": 3.6097115837803504e-06, "loss": 0.005, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 1326 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 419.15625, "epoch": 1.0616, "grad_norm": 0.08999590606098172, "kl": 0.11962890625, "learning_rate": 3.6078344407600744e-06, "loss": 0.0048, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1327 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 390.28125, "epoch": 1.0624, "grad_norm": 0.08026476914716063, "kl": 0.1162109375, "learning_rate": 3.6059565202189433e-06, "loss": 0.0047, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1328 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 416.28125, "epoch": 1.0632, "grad_norm": 0.44695944586322334, "kl": 0.119140625, "learning_rate": 3.604077823474954e-06, "loss": 0.0048, "reward": 1.84375, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1329 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 393.71875, "epoch": 1.064, "grad_norm": 1.7134577087326561, "kl": 0.111328125, "learning_rate": 3.6021983518466468e-06, "loss": 0.0044, "reward": 1.53125, "reward_std": 0.23356688022613525, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 1330 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 376.140625, "epoch": 1.0648, "grad_norm": 0.7882508740354264, "kl": 0.125, "learning_rate": 3.600318106653108e-06, "loss": 0.005, "reward": 1.765625, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1331 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 399.390625, "epoch": 1.0656, "grad_norm": 0.4124598459493163, "kl": 0.11474609375, "learning_rate": 3.5984370892139663e-06, "loss": 0.0046, "reward": 1.671875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 1332 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 430.125, "epoch": 1.0664, "grad_norm": 0.8139864545899247, "kl": 0.12255859375, "learning_rate": 3.5965553008493924e-06, "loss": 0.0049, "reward": 1.859375, "reward_std": 0.17358146607875824, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1333 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 411.359375, "epoch": 1.0672, "grad_norm": 0.5002200010531586, "kl": 0.1162109375, "learning_rate": 3.594672742880097e-06, "loss": 0.0046, "reward": 1.65625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 1334 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 440.90625, "epoch": 1.068, "grad_norm": 0.07014226607248494, "kl": 0.1015625, "learning_rate": 3.5927894166273324e-06, "loss": 0.0041, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1335 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 425.640625, "epoch": 1.0688, "grad_norm": 0.07627824876575508, "kl": 0.109375, "learning_rate": 3.5909053234128893e-06, "loss": 0.0044, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1336 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 416.28125, "epoch": 1.0695999999999999, "grad_norm": 0.4377301448094491, "kl": 0.11865234375, "learning_rate": 3.5890204645590964e-06, "loss": 0.0047, "reward": 1.65625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 1337 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 381.34375, "epoch": 1.0704, "grad_norm": 0.07781407006259818, "kl": 0.1181640625, "learning_rate": 3.5871348413888207e-06, "loss": 0.0047, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1338 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 392.0, "epoch": 1.0712, "grad_norm": 0.47270631237798827, "kl": 0.11865234375, "learning_rate": 3.585248455225466e-06, "loss": 0.0047, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 1339 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 369.28125, "epoch": 1.072, "grad_norm": 0.0864109007904575, "kl": 0.1279296875, "learning_rate": 3.5833613073929684e-06, "loss": 0.0051, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1340 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 398.125, "epoch": 1.0728, "grad_norm": 0.9026686274543017, "kl": 0.11767578125, "learning_rate": 3.5814733992158025e-06, "loss": 0.0047, "reward": 1.828125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1341 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 427.515625, "epoch": 1.0735999999999999, "grad_norm": 0.0923061944502762, "kl": 0.109375, "learning_rate": 3.579584732018975e-06, "loss": 0.0044, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1342 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 411.5, "epoch": 1.0744, "grad_norm": 0.08281407405240734, "kl": 0.11279296875, "learning_rate": 3.577695307128024e-06, "loss": 0.0045, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1343 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 405.953125, "epoch": 1.0752, "grad_norm": 0.4274854944873935, "kl": 0.119140625, "learning_rate": 3.5758051258690223e-06, "loss": 0.0048, "reward": 1.53125, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 1344 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 405.296875, "epoch": 1.076, "grad_norm": 0.43568786694628714, "kl": 0.12109375, "learning_rate": 3.5739141895685708e-06, "loss": 0.0048, "reward": 1.734375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 1345 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 427.625, "epoch": 1.0768, "grad_norm": 0.5244828704127512, "kl": 0.12060546875, "learning_rate": 3.5720224995538023e-06, "loss": 0.0048, "reward": 1.828125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1346 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 393.859375, "epoch": 1.0776, "grad_norm": 1.0476679067026615, "kl": 0.1162109375, "learning_rate": 3.5701300571523757e-06, "loss": 0.0047, "reward": 1.46875, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 1347 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 428.71875, "epoch": 1.0784, "grad_norm": 0.39801606252683097, "kl": 0.1171875, "learning_rate": 3.5682368636924825e-06, "loss": 0.0047, "reward": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1348 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 405.28125, "epoch": 1.0792, "grad_norm": 0.9423569674835038, "kl": 0.12109375, "learning_rate": 3.566342920502837e-06, "loss": 0.0048, "reward": 1.640625, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 1349 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 410.421875, "epoch": 1.08, "grad_norm": 0.8476625272807433, "kl": 0.1064453125, "learning_rate": 3.564448228912682e-06, "loss": 0.0043, "reward": 1.640625, "reward_std": 0.2198973149061203, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1350 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 407.5, "epoch": 1.0808, "grad_norm": 1.0587495022677087, "kl": 0.126953125, "learning_rate": 3.562552790251785e-06, "loss": 0.0051, "reward": 1.875, "reward_std": 0.25513991713523865, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1351 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 392.328125, "epoch": 1.0816, "grad_norm": 0.9746242455558621, "kl": 0.11865234375, "learning_rate": 3.5606566058504377e-06, "loss": 0.0048, "reward": 1.75, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1352 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.40625, "epoch": 1.0824, "grad_norm": 0.8637433949095339, "kl": 0.12255859375, "learning_rate": 3.558759677039455e-06, "loss": 0.0049, "reward": 1.90625, "reward_std": 0.13719715178012848, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 0.984375, "step": 1353 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 460.40625, "epoch": 1.0832, "grad_norm": 0.06551415921125266, "kl": 0.10595703125, "learning_rate": 3.5568620051501755e-06, "loss": 0.0042, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1354 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 412.796875, "epoch": 1.084, "grad_norm": 0.7756525458796553, "kl": 0.1201171875, "learning_rate": 3.5549635915144578e-06, "loss": 0.0048, "reward": 1.890625, "reward_std": 0.16887325048446655, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 1355 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 458.6875, "epoch": 1.0848, "grad_norm": 0.08179437760795527, "kl": 0.11279296875, "learning_rate": 3.553064437464682e-06, "loss": 0.0045, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1356 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 434.265625, "epoch": 1.0856, "grad_norm": 2.280415542766174, "kl": 0.12158203125, "learning_rate": 3.551164544333745e-06, "loss": 0.0049, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 1357 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 417.25, "epoch": 1.0864, "grad_norm": 0.4581094884937273, "kl": 0.11083984375, "learning_rate": 3.549263913455069e-06, "loss": 0.0044, "reward": 1.640625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1358 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 468.765625, "epoch": 1.0872, "grad_norm": 0.08229665237670332, "kl": 0.11279296875, "learning_rate": 3.5473625461625884e-06, "loss": 0.0045, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1359 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 452.375, "epoch": 1.088, "grad_norm": 0.9424553271395688, "kl": 0.1171875, "learning_rate": 3.5454604437907535e-06, "loss": 0.0047, "reward": 1.609375, "reward_std": 0.2198973149061203, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 1360 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 441.703125, "epoch": 1.0888, "grad_norm": 0.7114250380195982, "kl": 0.125, "learning_rate": 3.543557607674537e-06, "loss": 0.005, "reward": 1.8125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1361 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 451.84375, "epoch": 1.0896, "grad_norm": 0.7450571641255354, "kl": 0.103515625, "learning_rate": 3.54165403914942e-06, "loss": 0.0041, "reward": 1.828125, "reward_std": 0.16887325048446655, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.984375, "step": 1362 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 450.84375, "epoch": 1.0904, "grad_norm": 0.4169709765198773, "kl": 0.1201171875, "learning_rate": 3.539749739551401e-06, "loss": 0.0048, "reward": 1.59375, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 1363 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.125, "epoch": 1.0912, "grad_norm": 1.069548025484488, "kl": 0.12255859375, "learning_rate": 3.53784471021699e-06, "loss": 0.0049, "reward": 1.84375, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1364 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 435.34375, "epoch": 1.092, "grad_norm": 0.10591100316080641, "kl": 0.10546875, "learning_rate": 3.535938952483211e-06, "loss": 0.0042, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1365 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 389.515625, "epoch": 1.0928, "grad_norm": 0.6580784303551637, "kl": 0.12255859375, "learning_rate": 3.534032467687597e-06, "loss": 0.0049, "reward": 1.8125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1366 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 469.171875, "epoch": 1.0936, "grad_norm": 0.45277713493051125, "kl": 0.107421875, "learning_rate": 3.532125257168193e-06, "loss": 0.0043, "reward": 1.6875, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 1367 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 377.40625, "epoch": 1.0944, "grad_norm": 0.26519651006330924, "kl": 0.1298828125, "learning_rate": 3.5302173222635526e-06, "loss": 0.0052, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1368 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 411.484375, "epoch": 1.0952, "grad_norm": 0.09805385952145897, "kl": 0.1123046875, "learning_rate": 3.5283086643127396e-06, "loss": 0.0045, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1369 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 439.40625, "epoch": 1.096, "grad_norm": 0.5890017494184737, "kl": 0.103515625, "learning_rate": 3.5263992846553203e-06, "loss": 0.0041, "reward": 1.796875, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1370 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 447.328125, "epoch": 1.0968, "grad_norm": 0.07065941566935242, "kl": 0.10791015625, "learning_rate": 3.5244891846313733e-06, "loss": 0.0043, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1371 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 425.46875, "epoch": 1.0976, "grad_norm": 0.9668118573558747, "kl": 0.1318359375, "learning_rate": 3.5225783655814798e-06, "loss": 0.0053, "reward": 1.71875, "reward_std": 0.2709311842918396, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 1372 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 388.96875, "epoch": 1.0984, "grad_norm": 0.07215970710955445, "kl": 0.11474609375, "learning_rate": 3.520666828846726e-06, "loss": 0.0046, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1373 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 495.265625, "epoch": 1.0992, "grad_norm": 0.3594606902019938, "kl": 0.1015625, "learning_rate": 3.518754575768702e-06, "loss": 0.0041, "reward": 1.953125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 1374 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 440.828125, "epoch": 1.1, "grad_norm": 0.8495536664673914, "kl": 0.11572265625, "learning_rate": 3.516841607689501e-06, "loss": 0.0046, "reward": 1.6875, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 1375 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 422.453125, "epoch": 1.1008, "grad_norm": 0.4668886660518382, "kl": 0.1142578125, "learning_rate": 3.5149279259517165e-06, "loss": 0.0046, "reward": 1.921875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 1376 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 418.109375, "epoch": 1.1016, "grad_norm": 0.43961505825250097, "kl": 0.11767578125, "learning_rate": 3.5130135318984454e-06, "loss": 0.0047, "reward": 1.640625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1377 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 481.90625, "epoch": 1.1024, "grad_norm": 0.43290932955061817, "kl": 0.1025390625, "learning_rate": 3.5110984268732827e-06, "loss": 0.0041, "reward": 1.953125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 1378 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 422.890625, "epoch": 1.1032, "grad_norm": 0.07043072502096129, "kl": 0.1123046875, "learning_rate": 3.509182612220322e-06, "loss": 0.0045, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1379 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 466.40625, "epoch": 1.104, "grad_norm": 0.3747882985557012, "kl": 0.10498046875, "learning_rate": 3.507266089284157e-06, "loss": 0.0042, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1380 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.9375, "epoch": 1.1048, "grad_norm": 1.8793652736629933, "kl": 0.10888671875, "learning_rate": 3.5053488594098763e-06, "loss": 0.0044, "reward": 1.890625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 1381 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 391.84375, "epoch": 1.1056, "grad_norm": 0.4966571203948814, "kl": 0.11279296875, "learning_rate": 3.5034309239430664e-06, "loss": 0.0045, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1382 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 396.25, "epoch": 1.1064, "grad_norm": 0.12919829421090323, "kl": 0.1220703125, "learning_rate": 3.501512284229807e-06, "loss": 0.0049, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1383 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 364.4375, "epoch": 1.1072, "grad_norm": 0.1075597809162098, "kl": 0.1171875, "learning_rate": 3.4995929416166756e-06, "loss": 0.0047, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1384 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 328.953125, "epoch": 1.108, "grad_norm": 1.0073464095529048, "kl": 0.1259765625, "learning_rate": 3.4976728974507387e-06, "loss": 0.005, "reward": 1.640625, "reward_std": 0.1804211586713791, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1385 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 431.09375, "epoch": 1.1088, "grad_norm": 0.4899062847323215, "kl": 0.1103515625, "learning_rate": 3.4957521530795576e-06, "loss": 0.0044, "reward": 1.6875, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 1386 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 428.1875, "epoch": 1.1096, "grad_norm": 0.8881361855371956, "kl": 0.10400390625, "learning_rate": 3.493830709851185e-06, "loss": 0.0042, "reward": 1.71875, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 1387 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 376.96875, "epoch": 1.1104, "grad_norm": 0.5312134517477242, "kl": 0.119140625, "learning_rate": 3.4919085691141636e-06, "loss": 0.0048, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.984375, "step": 1388 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 418.03125, "epoch": 1.1112, "grad_norm": 1.2335427817309588, "kl": 0.1337890625, "learning_rate": 3.4899857322175252e-06, "loss": 0.0053, "reward": 1.78125, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.984375, "step": 1389 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 447.53125, "epoch": 1.112, "grad_norm": 0.12412149321676529, "kl": 0.099609375, "learning_rate": 3.4880622005107916e-06, "loss": 0.004, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1390 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 456.34375, "epoch": 1.1128, "grad_norm": 0.11572666283984473, "kl": 0.10498046875, "learning_rate": 3.486137975343971e-06, "loss": 0.0042, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1391 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 409.578125, "epoch": 1.1136, "grad_norm": 0.10714842129617189, "kl": 0.09619140625, "learning_rate": 3.484213058067559e-06, "loss": 0.0038, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1392 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 406.3125, "epoch": 1.1144, "grad_norm": 0.5554838829199905, "kl": 0.10595703125, "learning_rate": 3.482287450032536e-06, "loss": 0.0042, "reward": 1.90625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 1393 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 426.3125, "epoch": 1.1152, "grad_norm": 0.0853444982378338, "kl": 0.1044921875, "learning_rate": 3.4803611525903687e-06, "loss": 0.0042, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1394 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 425.78125, "epoch": 1.116, "grad_norm": 0.5599055800840694, "kl": 0.0966796875, "learning_rate": 3.4784341670930067e-06, "loss": 0.0039, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 1395 }, { "all_correct": 0.375, "all_wrong": 0.625, "completion_length": 378.671875, "epoch": 1.1168, "grad_norm": 0.07257007906770074, "kl": 0.10400390625, "learning_rate": 3.4765064948928813e-06, "loss": 0.0042, "reward": 1.375, "reward_std": 0.0, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "step": 1396 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 413.953125, "epoch": 1.1176, "grad_norm": 0.48566788345707423, "kl": 0.1201171875, "learning_rate": 3.474578137342909e-06, "loss": 0.0048, "reward": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1397 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 453.265625, "epoch": 1.1184, "grad_norm": 1.0924516188881157, "kl": 0.1025390625, "learning_rate": 3.4726490957964836e-06, "loss": 0.0041, "reward": 1.71875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1398 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 448.96875, "epoch": 1.1192, "grad_norm": 0.777686275790424, "kl": 0.09423828125, "learning_rate": 3.4707193716074816e-06, "loss": 0.0038, "reward": 1.734375, "reward_std": 0.19939783215522766, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 1399 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 410.0625, "epoch": 1.12, "grad_norm": 0.10795816109103065, "kl": 0.1025390625, "learning_rate": 3.4687889661302577e-06, "loss": 0.0041, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1400 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 450.703125, "epoch": 1.1208, "grad_norm": 0.5445734373198614, "kl": 0.10888671875, "learning_rate": 3.466857880719645e-06, "loss": 0.0044, "reward": 1.65625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 1401 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 411.140625, "epoch": 1.1216, "grad_norm": 0.6848000310402026, "kl": 0.1044921875, "learning_rate": 3.464926116730953e-06, "loss": 0.0042, "reward": 1.703125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.984375, "step": 1402 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 408.875, "epoch": 1.1224, "grad_norm": 0.4923579855658618, "kl": 0.11328125, "learning_rate": 3.462993675519968e-06, "loss": 0.0045, "reward": 1.921875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 1403 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 450.0625, "epoch": 1.1232, "grad_norm": 0.6043566172446442, "kl": 0.10986328125, "learning_rate": 3.4610605584429526e-06, "loss": 0.0044, "reward": 1.640625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1404 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 464.875, "epoch": 1.124, "grad_norm": 0.06675601359060121, "kl": 0.09619140625, "learning_rate": 3.4591267668566412e-06, "loss": 0.0038, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1405 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 402.359375, "epoch": 1.1248, "grad_norm": 1.0616775591146432, "kl": 0.12255859375, "learning_rate": 3.457192302118244e-06, "loss": 0.0049, "reward": 1.734375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1406 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 453.921875, "epoch": 1.1256, "grad_norm": 1.1647138985440137, "kl": 0.08935546875, "learning_rate": 3.455257165585444e-06, "loss": 0.0036, "reward": 1.859375, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1407 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 449.5625, "epoch": 1.1264, "grad_norm": 0.7541875292982689, "kl": 0.1015625, "learning_rate": 3.453321358616393e-06, "loss": 0.0041, "reward": 1.75, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1408 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 470.8125, "epoch": 1.1272, "grad_norm": 0.08380687851384189, "kl": 0.10205078125, "learning_rate": 3.4513848825697145e-06, "loss": 0.0041, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1409 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 474.671875, "epoch": 1.1280000000000001, "grad_norm": 0.5145916486333977, "kl": 0.10107421875, "learning_rate": 3.4494477388045035e-06, "loss": 0.0041, "reward": 1.78125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1410 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 406.109375, "epoch": 1.1288, "grad_norm": 0.543150234735062, "kl": 0.1162109375, "learning_rate": 3.4475099286803204e-06, "loss": 0.0047, "reward": 1.8125, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1411 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 430.96875, "epoch": 1.1296, "grad_norm": 0.7635360148609007, "kl": 0.11669921875, "learning_rate": 3.445571453557196e-06, "loss": 0.0047, "reward": 1.859375, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1412 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 433.828125, "epoch": 1.1304, "grad_norm": 0.07728577054889625, "kl": 0.1044921875, "learning_rate": 3.443632314795627e-06, "loss": 0.0042, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1413 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 473.34375, "epoch": 1.1312, "grad_norm": 0.06768809458452375, "kl": 0.10400390625, "learning_rate": 3.4416925137565756e-06, "loss": 0.0042, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1414 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 439.59375, "epoch": 1.1320000000000001, "grad_norm": 0.5746438413962025, "kl": 0.0986328125, "learning_rate": 3.439752051801467e-06, "loss": 0.004, "reward": 1.734375, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1415 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 468.4375, "epoch": 1.1328, "grad_norm": 0.433038922604871, "kl": 0.10400390625, "learning_rate": 3.4378109302921946e-06, "loss": 0.0042, "reward": 1.84375, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1416 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 461.546875, "epoch": 1.1336, "grad_norm": 0.6659817414420092, "kl": 0.11181640625, "learning_rate": 3.4358691505911105e-06, "loss": 0.0045, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1417 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 472.59375, "epoch": 1.1344, "grad_norm": 0.6911444735566226, "kl": 0.09814453125, "learning_rate": 3.4339267140610317e-06, "loss": 0.0039, "reward": 1.671875, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 1418 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 476.625, "epoch": 1.1352, "grad_norm": 0.09875310122596062, "kl": 0.11474609375, "learning_rate": 3.4319836220652334e-06, "loss": 0.0046, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1419 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 402.859375, "epoch": 1.1360000000000001, "grad_norm": 0.8264825473284627, "kl": 0.12109375, "learning_rate": 3.430039875967454e-06, "loss": 0.0048, "reward": 1.75, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1420 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 415.609375, "epoch": 1.1368, "grad_norm": 0.6031482260506018, "kl": 0.12158203125, "learning_rate": 3.428095477131888e-06, "loss": 0.0049, "reward": 1.515625, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 1.0, "step": 1421 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 393.78125, "epoch": 1.1376, "grad_norm": 0.4291622496601647, "kl": 0.09912109375, "learning_rate": 3.4261504269231904e-06, "loss": 0.004, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 1422 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 416.359375, "epoch": 1.1384, "grad_norm": 0.6574559335779393, "kl": 0.125, "learning_rate": 3.4242047267064714e-06, "loss": 0.005, "reward": 1.828125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1423 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 459.078125, "epoch": 1.1392, "grad_norm": 0.7045531699568626, "kl": 0.1142578125, "learning_rate": 3.4222583778472997e-06, "loss": 0.0046, "reward": 1.8125, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.984375, "step": 1424 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 468.078125, "epoch": 1.1400000000000001, "grad_norm": 0.10368832579911602, "kl": 0.111328125, "learning_rate": 3.4203113817116955e-06, "loss": 0.0044, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1425 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 463.609375, "epoch": 1.1408, "grad_norm": 0.7714420431453896, "kl": 0.11669921875, "learning_rate": 3.4183637396661372e-06, "loss": 0.0047, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1426 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 459.265625, "epoch": 1.1416, "grad_norm": 21.463436447860914, "kl": 304.0, "learning_rate": 3.4164154530775552e-06, "loss": 12.1845, "reward": 1.4375, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.984375, "step": 1427 }, { "all_correct": 0.375, "all_wrong": 0.5, "completion_length": 430.328125, "epoch": 1.1424, "grad_norm": 0.463088686013149, "kl": 0.107421875, "learning_rate": 3.4144665233133318e-06, "loss": 0.0043, "reward": 1.4375, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 1428 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 426.59375, "epoch": 1.1432, "grad_norm": 0.5761050088039148, "kl": 0.109375, "learning_rate": 3.4125169517413005e-06, "loss": 0.0044, "reward": 1.84375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.984375, "step": 1429 }, { "all_correct": 0.375, "all_wrong": 0.625, "completion_length": 386.96875, "epoch": 1.144, "grad_norm": 0.08700823346346177, "kl": 0.119140625, "learning_rate": 3.410566739729746e-06, "loss": 0.0048, "reward": 1.375, "reward_std": 0.0, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "step": 1430 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 402.375, "epoch": 1.1448, "grad_norm": 0.644533417538252, "kl": 0.1103515625, "learning_rate": 3.408615888647402e-06, "loss": 0.0044, "reward": 1.734375, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1431 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 444.53125, "epoch": 1.1456, "grad_norm": 0.5729802767468924, "kl": 0.10693359375, "learning_rate": 3.4066643998634506e-06, "loss": 0.0043, "reward": 1.84375, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1432 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 464.359375, "epoch": 1.1464, "grad_norm": 0.06954289731233868, "kl": 0.1083984375, "learning_rate": 3.4047122747475227e-06, "loss": 0.0043, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1433 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 450.796875, "epoch": 1.1472, "grad_norm": 0.10797523245038422, "kl": 0.10888671875, "learning_rate": 3.402759514669694e-06, "loss": 0.0043, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1434 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 381.15625, "epoch": 1.148, "grad_norm": 0.4994777175919268, "kl": 0.10595703125, "learning_rate": 3.4008061210004872e-06, "loss": 0.0042, "reward": 1.96875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 1435 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 419.53125, "epoch": 1.1488, "grad_norm": 0.15145116729361308, "kl": 0.1162109375, "learning_rate": 3.3988520951108683e-06, "loss": 0.0047, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1436 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 450.1875, "epoch": 1.1496, "grad_norm": 0.453908518354673, "kl": 0.0966796875, "learning_rate": 3.3968974383722497e-06, "loss": 0.0039, "reward": 1.53125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 1437 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 420.296875, "epoch": 1.1504, "grad_norm": 0.08309227989880606, "kl": 0.10791015625, "learning_rate": 3.3949421521564825e-06, "loss": 0.0043, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1438 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 465.265625, "epoch": 1.1512, "grad_norm": 1.0436547874273405, "kl": 0.109375, "learning_rate": 3.392986237835863e-06, "loss": 0.0044, "reward": 1.703125, "reward_std": 0.16887323558330536, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 1439 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 461.875, "epoch": 1.152, "grad_norm": 0.6227114845892602, "kl": 0.0986328125, "learning_rate": 3.391029696783127e-06, "loss": 0.0039, "reward": 1.71875, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1440 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 417.78125, "epoch": 1.1528, "grad_norm": 0.06476405521823399, "kl": 0.09375, "learning_rate": 3.389072530371451e-06, "loss": 0.0037, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1441 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 441.453125, "epoch": 1.1536, "grad_norm": 0.9585479878506278, "kl": 0.1044921875, "learning_rate": 3.3871147399744482e-06, "loss": 0.0042, "reward": 1.703125, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 1442 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 424.28125, "epoch": 1.1544, "grad_norm": 0.06163943899833009, "kl": 0.1044921875, "learning_rate": 3.385156326966173e-06, "loss": 0.0042, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1443 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 433.78125, "epoch": 1.1552, "grad_norm": 1.1113670329603564, "kl": 0.11181640625, "learning_rate": 3.383197292721114e-06, "loss": 0.0045, "reward": 1.703125, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 1444 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 435.09375, "epoch": 1.156, "grad_norm": 0.10695649373521311, "kl": 0.10498046875, "learning_rate": 3.3812376386141966e-06, "loss": 0.0042, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1445 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 396.921875, "epoch": 1.1568, "grad_norm": 0.06801156277794704, "kl": 0.10009765625, "learning_rate": 3.379277366020782e-06, "loss": 0.004, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1446 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 418.734375, "epoch": 1.1576, "grad_norm": 0.3580823813076655, "kl": 0.10498046875, "learning_rate": 3.3773164763166653e-06, "loss": 0.0042, "reward": 1.9375, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 1447 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 464.828125, "epoch": 1.1584, "grad_norm": 0.663091035201307, "kl": 0.09716796875, "learning_rate": 3.3753549708780736e-06, "loss": 0.0039, "reward": 1.78125, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1448 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 373.609375, "epoch": 1.1592, "grad_norm": 0.6667749238816655, "kl": 0.1142578125, "learning_rate": 3.3733928510816677e-06, "loss": 0.0046, "reward": 1.796875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1449 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 410.5, "epoch": 1.16, "grad_norm": 0.6968065727793826, "kl": 0.1015625, "learning_rate": 3.3714301183045382e-06, "loss": 0.0041, "reward": 1.578125, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 1450 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 451.0625, "epoch": 1.1608, "grad_norm": 0.44453937218899353, "kl": 0.10107421875, "learning_rate": 3.369466773924207e-06, "loss": 0.004, "reward": 1.59375, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 1451 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 419.578125, "epoch": 1.1616, "grad_norm": 0.5322992184165442, "kl": 0.10546875, "learning_rate": 3.3675028193186243e-06, "loss": 0.0042, "reward": 1.796875, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1452 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 413.265625, "epoch": 1.1623999999999999, "grad_norm": 0.4982598509555196, "kl": 0.1083984375, "learning_rate": 3.365538255866169e-06, "loss": 0.0043, "reward": 1.90625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 1453 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 454.90625, "epoch": 1.1632, "grad_norm": 0.66166595330765, "kl": 0.109375, "learning_rate": 3.3635730849456484e-06, "loss": 0.0044, "reward": 1.828125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1454 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 440.59375, "epoch": 1.164, "grad_norm": 0.8889262086198133, "kl": 0.107421875, "learning_rate": 3.3616073079362925e-06, "loss": 0.0043, "reward": 1.765625, "reward_std": 0.17782479524612427, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1455 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 440.0, "epoch": 1.1648, "grad_norm": 2.5141442050434963, "kl": 0.099609375, "learning_rate": 3.3596409262177633e-06, "loss": 0.004, "reward": 1.609375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 1456 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 463.078125, "epoch": 1.1656, "grad_norm": 0.6616652595910705, "kl": 0.103515625, "learning_rate": 3.357673941170139e-06, "loss": 0.0041, "reward": 1.78125, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1457 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 406.59375, "epoch": 1.1663999999999999, "grad_norm": 0.6832318765926045, "kl": 0.11328125, "learning_rate": 3.3557063541739283e-06, "loss": 0.0045, "reward": 1.609375, "reward_std": 0.17782479524612427, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 1458 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 476.078125, "epoch": 1.1672, "grad_norm": 0.6721377170179303, "kl": 0.10498046875, "learning_rate": 3.353738166610058e-06, "loss": 0.0042, "reward": 1.65625, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 1459 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 380.140625, "epoch": 1.168, "grad_norm": 0.6020334048192396, "kl": 0.1142578125, "learning_rate": 3.35176937985988e-06, "loss": 0.0046, "reward": 1.703125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 1460 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 384.109375, "epoch": 1.1688, "grad_norm": 0.48014412954771984, "kl": 0.109375, "learning_rate": 3.349799995305162e-06, "loss": 0.0044, "reward": 1.78125, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1461 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 414.734375, "epoch": 1.1696, "grad_norm": 0.341451409321828, "kl": 0.09130859375, "learning_rate": 3.3478300143280946e-06, "loss": 0.0036, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 1462 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 453.0, "epoch": 1.1703999999999999, "grad_norm": 0.0639825437004834, "kl": 0.095703125, "learning_rate": 3.3458594383112868e-06, "loss": 0.0038, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1463 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 444.078125, "epoch": 1.1712, "grad_norm": 0.5032444666172071, "kl": 0.09326171875, "learning_rate": 3.343888268637765e-06, "loss": 0.0037, "reward": 1.65625, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 1464 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 418.28125, "epoch": 1.172, "grad_norm": 0.44208712896677865, "kl": 0.10400390625, "learning_rate": 3.341916506690971e-06, "loss": 0.0042, "reward": 1.890625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 1465 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 441.703125, "epoch": 1.1728, "grad_norm": 0.36290573465269566, "kl": 0.0986328125, "learning_rate": 3.3399441538547638e-06, "loss": 0.0039, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1466 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 429.25, "epoch": 1.1736, "grad_norm": 1.100507848866221, "kl": 0.10302734375, "learning_rate": 3.337971211513417e-06, "loss": 0.0041, "reward": 1.828125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1467 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 422.359375, "epoch": 1.1743999999999999, "grad_norm": 0.06864927055867691, "kl": 0.099609375, "learning_rate": 3.3359976810516164e-06, "loss": 0.004, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1468 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 432.28125, "epoch": 1.1752, "grad_norm": 0.7475973430320392, "kl": 0.1181640625, "learning_rate": 3.3340235638544633e-06, "loss": 0.0047, "reward": 1.6875, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 1469 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 428.21875, "epoch": 1.176, "grad_norm": 1.3264637736722542, "kl": 0.09716796875, "learning_rate": 3.332048861307467e-06, "loss": 0.0039, "reward": 1.78125, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1470 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 383.96875, "epoch": 1.1768, "grad_norm": 0.6959989823906907, "kl": 0.1064453125, "learning_rate": 3.330073574796551e-06, "loss": 0.0043, "reward": 1.5, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 1471 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 420.8125, "epoch": 1.1776, "grad_norm": 0.87072487103619, "kl": 0.107421875, "learning_rate": 3.328097705708047e-06, "loss": 0.0043, "reward": 1.953125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 1472 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 420.40625, "epoch": 1.1784, "grad_norm": 0.5768313122779916, "kl": 0.11181640625, "learning_rate": 3.3261212554286977e-06, "loss": 0.0045, "reward": 1.71875, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1473 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 439.1875, "epoch": 1.1792, "grad_norm": 0.07694570407810193, "kl": 0.11328125, "learning_rate": 3.324144225345649e-06, "loss": 0.0045, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1474 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 440.515625, "epoch": 1.18, "grad_norm": 0.09631195615765224, "kl": 0.109375, "learning_rate": 3.3221666168464584e-06, "loss": 0.0044, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1475 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 384.5, "epoch": 1.1808, "grad_norm": 0.4622244083058771, "kl": 0.1220703125, "learning_rate": 3.320188431319088e-06, "loss": 0.0049, "reward": 1.609375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 1476 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 438.4375, "epoch": 1.1816, "grad_norm": 0.6939857811376071, "kl": 0.09912109375, "learning_rate": 3.318209670151904e-06, "loss": 0.004, "reward": 1.75, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1477 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 377.78125, "epoch": 1.1824, "grad_norm": 0.5453108696690133, "kl": 0.1025390625, "learning_rate": 3.3162303347336765e-06, "loss": 0.0041, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 1478 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 405.703125, "epoch": 1.1832, "grad_norm": 0.7482532247766777, "kl": 0.10888671875, "learning_rate": 3.3142504264535808e-06, "loss": 0.0044, "reward": 1.609375, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 1479 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 441.859375, "epoch": 1.184, "grad_norm": 0.7719143253082762, "kl": 0.1015625, "learning_rate": 3.3122699467011913e-06, "loss": 0.0041, "reward": 1.671875, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 1480 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 425.578125, "epoch": 1.1848, "grad_norm": 0.07207211889396034, "kl": 0.1044921875, "learning_rate": 3.3102888968664857e-06, "loss": 0.0042, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1481 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 432.671875, "epoch": 1.1856, "grad_norm": 0.4440499987515191, "kl": 0.10009765625, "learning_rate": 3.308307278339842e-06, "loss": 0.004, "reward": 1.828125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1482 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 458.9375, "epoch": 1.1864, "grad_norm": 0.44469879596118, "kl": 0.0966796875, "learning_rate": 3.306325092512034e-06, "loss": 0.0039, "reward": 1.640625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1483 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 431.578125, "epoch": 1.1872, "grad_norm": 0.5874152839811555, "kl": 0.10205078125, "learning_rate": 3.3043423407742374e-06, "loss": 0.0041, "reward": 1.46875, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 1484 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 435.671875, "epoch": 1.188, "grad_norm": 0.4305572452110268, "kl": 0.0966796875, "learning_rate": 3.3023590245180237e-06, "loss": 0.0039, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 1485 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 413.5625, "epoch": 1.1888, "grad_norm": 0.6724993029168898, "kl": 0.1123046875, "learning_rate": 3.300375145135361e-06, "loss": 0.0045, "reward": 1.703125, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.984375, "step": 1486 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 455.5, "epoch": 1.1896, "grad_norm": 0.7374735676474072, "kl": 0.09375, "learning_rate": 3.2983907040186112e-06, "loss": 0.0037, "reward": 1.84375, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1487 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 466.265625, "epoch": 1.1904, "grad_norm": 0.39588011924560756, "kl": 0.0947265625, "learning_rate": 3.296405702560532e-06, "loss": 0.0038, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1488 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 361.375, "epoch": 1.1912, "grad_norm": 0.4409189442063453, "kl": 0.11474609375, "learning_rate": 3.294420142154274e-06, "loss": 0.0046, "reward": 1.828125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1489 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 436.71875, "epoch": 1.192, "grad_norm": 0.3752695753607167, "kl": 0.09326171875, "learning_rate": 3.29243402419338e-06, "loss": 0.0037, "reward": 1.828125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1490 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 387.78125, "epoch": 1.1928, "grad_norm": 0.59116556013629, "kl": 0.0927734375, "learning_rate": 3.2904473500717826e-06, "loss": 0.0037, "reward": 1.78125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1491 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 378.640625, "epoch": 1.1936, "grad_norm": 0.8800352480705476, "kl": 0.11767578125, "learning_rate": 3.2884601211838087e-06, "loss": 0.0047, "reward": 1.953125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 1492 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 426.53125, "epoch": 1.1944, "grad_norm": 0.06276444903383693, "kl": 0.0986328125, "learning_rate": 3.2864723389241697e-06, "loss": 0.0039, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1493 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 405.96875, "epoch": 1.1952, "grad_norm": 0.637470041365548, "kl": 0.12109375, "learning_rate": 3.284484004687969e-06, "loss": 0.0049, "reward": 1.703125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 1494 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 372.65625, "epoch": 1.196, "grad_norm": 0.08157808922861641, "kl": 0.11279296875, "learning_rate": 3.2824951198706958e-06, "loss": 0.0045, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1495 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 399.1875, "epoch": 1.1968, "grad_norm": 0.5575219576538066, "kl": 0.095703125, "learning_rate": 3.280505685868226e-06, "loss": 0.0038, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1496 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 455.21875, "epoch": 1.1976, "grad_norm": 0.35317690682621256, "kl": 0.0947265625, "learning_rate": 3.278515704076821e-06, "loss": 0.0038, "reward": 1.9375, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 1497 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 451.03125, "epoch": 1.1984, "grad_norm": 0.7645963540928052, "kl": 0.0947265625, "learning_rate": 3.276525175893126e-06, "loss": 0.0038, "reward": 1.8125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1498 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 389.84375, "epoch": 1.1992, "grad_norm": 0.1171126630216482, "kl": 0.1044921875, "learning_rate": 3.274534102714172e-06, "loss": 0.0042, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 1499 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 459.328125, "epoch": 1.2, "grad_norm": 0.5239497582573815, "kl": 0.10302734375, "learning_rate": 3.272542485937369e-06, "loss": 0.0041, "reward": 1.703125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 1500 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 412.65625, "epoch": 1.2008, "grad_norm": 0.7470058520218157, "kl": 0.103515625, "learning_rate": 3.270550326960511e-06, "loss": 0.0041, "reward": 1.71875, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1501 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 374.671875, "epoch": 1.2016, "grad_norm": 1.1478656805215206, "kl": 0.107421875, "learning_rate": 3.268557627181772e-06, "loss": 0.0043, "reward": 1.78125, "reward_std": 0.23827511072158813, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1502 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 479.53125, "epoch": 1.2024, "grad_norm": 1.2596014404408202, "kl": 0.111328125, "learning_rate": 3.2665643879997054e-06, "loss": 0.0044, "reward": 1.6875, "reward_std": 0.19506090879440308, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.984375, "step": 1503 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 403.859375, "epoch": 1.2032, "grad_norm": 0.5155330501641545, "kl": 0.10498046875, "learning_rate": 3.2645706108132426e-06, "loss": 0.0042, "reward": 1.78125, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1504 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 420.8125, "epoch": 1.204, "grad_norm": 0.08151666120758788, "kl": 0.09326171875, "learning_rate": 3.2625762970216944e-06, "loss": 0.0037, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1505 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 395.65625, "epoch": 1.2048, "grad_norm": 0.07530023716969772, "kl": 0.111328125, "learning_rate": 3.2605814480247454e-06, "loss": 0.0044, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1506 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 431.234375, "epoch": 1.2056, "grad_norm": 0.5458788856505151, "kl": 0.09765625, "learning_rate": 3.258586065222459e-06, "loss": 0.0039, "reward": 1.5, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 1507 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 409.5625, "epoch": 1.2064, "grad_norm": 0.6878005837117955, "kl": 0.10546875, "learning_rate": 3.2565901500152702e-06, "loss": 0.0042, "reward": 1.453125, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 1.0, "step": 1508 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 449.34375, "epoch": 1.2072, "grad_norm": 1.9930446515014937, "kl": 0.09375, "learning_rate": 3.2545937038039904e-06, "loss": 0.0037, "reward": 1.796875, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1509 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 415.359375, "epoch": 1.208, "grad_norm": 0.6289779042784038, "kl": 0.103515625, "learning_rate": 3.2525967279898017e-06, "loss": 0.0041, "reward": 1.875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1510 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 488.96875, "epoch": 1.2088, "grad_norm": 1.0644143284005663, "kl": 0.10400390625, "learning_rate": 3.2505992239742582e-06, "loss": 0.0042, "reward": 1.875, "reward_std": 0.1872510462999344, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1511 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 406.5625, "epoch": 1.2096, "grad_norm": 1.0202268224441047, "kl": 0.10498046875, "learning_rate": 3.2486011931592863e-06, "loss": 0.0042, "reward": 1.765625, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1512 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 488.796875, "epoch": 1.2104, "grad_norm": 0.8755874946771128, "kl": 0.09765625, "learning_rate": 3.2466026369471804e-06, "loss": 0.0039, "reward": 1.796875, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1513 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 408.140625, "epoch": 1.2112, "grad_norm": 0.49639688259820164, "kl": 0.1044921875, "learning_rate": 3.2446035567406033e-06, "loss": 0.0042, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1514 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 411.78125, "epoch": 1.212, "grad_norm": 0.491303707987397, "kl": 0.10009765625, "learning_rate": 3.2426039539425875e-06, "loss": 0.004, "reward": 1.90625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 1515 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 442.8125, "epoch": 1.2128, "grad_norm": 0.47695448585921424, "kl": 0.10009765625, "learning_rate": 3.240603829956531e-06, "loss": 0.004, "reward": 1.828125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1516 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 404.375, "epoch": 1.2136, "grad_norm": 0.6604175255061485, "kl": 0.11083984375, "learning_rate": 3.238603186186198e-06, "loss": 0.0044, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1517 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 462.0, "epoch": 1.2144, "grad_norm": 0.7893523216783938, "kl": 0.099609375, "learning_rate": 3.2366020240357166e-06, "loss": 0.004, "reward": 1.75, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1518 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 384.5, "epoch": 1.2152, "grad_norm": 0.10841424005624302, "kl": 0.1005859375, "learning_rate": 3.2346003449095803e-06, "loss": 0.004, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1519 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 421.65625, "epoch": 1.216, "grad_norm": 0.09099275239237044, "kl": 0.087890625, "learning_rate": 3.2325981502126434e-06, "loss": 0.0035, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1520 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 422.765625, "epoch": 1.2168, "grad_norm": 0.6016652936245342, "kl": 0.09814453125, "learning_rate": 3.2305954413501252e-06, "loss": 0.0039, "reward": 1.953125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 1521 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 422.90625, "epoch": 1.2176, "grad_norm": 0.746887381046538, "kl": 0.08984375, "learning_rate": 3.228592219727602e-06, "loss": 0.0036, "reward": 1.875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1522 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 467.671875, "epoch": 1.2184, "grad_norm": 0.4274887556891549, "kl": 0.0947265625, "learning_rate": 3.226588486751012e-06, "loss": 0.0038, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 1523 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 375.046875, "epoch": 1.2192, "grad_norm": 0.4559184408879622, "kl": 0.0966796875, "learning_rate": 3.2245842438266526e-06, "loss": 0.0039, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1524 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 452.140625, "epoch": 1.22, "grad_norm": 0.060753100467566594, "kl": 0.10400390625, "learning_rate": 3.222579492361179e-06, "loss": 0.0042, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1525 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 409.40625, "epoch": 1.2208, "grad_norm": 1.7216627459251168, "kl": 0.10986328125, "learning_rate": 3.220574233761603e-06, "loss": 0.0044, "reward": 1.796875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1526 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 454.296875, "epoch": 1.2216, "grad_norm": 0.4211796013305398, "kl": 0.10107421875, "learning_rate": 3.2185684694352913e-06, "loss": 0.004, "reward": 1.71875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1527 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 413.875, "epoch": 1.2224, "grad_norm": 0.4899344234500141, "kl": 0.087890625, "learning_rate": 3.216562200789968e-06, "loss": 0.0035, "reward": 1.8125, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1528 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 452.265625, "epoch": 1.2232, "grad_norm": 0.6674345375776687, "kl": 0.09912109375, "learning_rate": 3.214555429233707e-06, "loss": 0.004, "reward": 1.765625, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1529 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 475.46875, "epoch": 1.224, "grad_norm": 0.43503399027021084, "kl": 0.0927734375, "learning_rate": 3.2125481561749406e-06, "loss": 0.0037, "reward": 1.734375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1530 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 438.203125, "epoch": 1.2248, "grad_norm": 0.5721824849540433, "kl": 0.0966796875, "learning_rate": 3.210540383022449e-06, "loss": 0.0039, "reward": 1.625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1531 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 446.8125, "epoch": 1.2256, "grad_norm": 0.6566101912725456, "kl": 0.095703125, "learning_rate": 3.208532111185365e-06, "loss": 0.0038, "reward": 1.734375, "reward_std": 0.1315089464187622, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1532 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 422.125, "epoch": 1.2264, "grad_norm": 0.6024691885630784, "kl": 0.09423828125, "learning_rate": 3.2065233420731717e-06, "loss": 0.0038, "reward": 1.75, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1533 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 404.328125, "epoch": 1.2272, "grad_norm": 0.37536801298633604, "kl": 0.09765625, "learning_rate": 3.2045140770956987e-06, "loss": 0.0039, "reward": 1.890625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 1534 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 423.96875, "epoch": 1.228, "grad_norm": 0.06282124859248339, "kl": 0.10107421875, "learning_rate": 3.2025043176631283e-06, "loss": 0.004, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1535 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 374.921875, "epoch": 1.2288000000000001, "grad_norm": 0.07562155529442534, "kl": 0.10400390625, "learning_rate": 3.2004940651859844e-06, "loss": 0.0042, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1536 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 427.5625, "epoch": 1.2296, "grad_norm": 0.9258511371248414, "kl": 0.0869140625, "learning_rate": 3.198483321075141e-06, "loss": 0.0035, "reward": 1.859375, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1537 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 422.625, "epoch": 1.2304, "grad_norm": 0.07953989457996766, "kl": 0.09375, "learning_rate": 3.196472086741815e-06, "loss": 0.0037, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1538 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 434.453125, "epoch": 1.2312, "grad_norm": 0.3698225889729225, "kl": 0.08935546875, "learning_rate": 3.194460363597569e-06, "loss": 0.0036, "reward": 1.96875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 1539 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 370.359375, "epoch": 1.232, "grad_norm": 0.7287457368623393, "kl": 0.11181640625, "learning_rate": 3.192448153054306e-06, "loss": 0.0045, "reward": 1.71875, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1540 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 460.703125, "epoch": 1.2328000000000001, "grad_norm": 1.4674304521597479, "kl": 0.08447265625, "learning_rate": 3.190435456524275e-06, "loss": 0.0034, "reward": 1.78125, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1541 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 447.65625, "epoch": 1.2336, "grad_norm": 0.6147338626869835, "kl": 0.09814453125, "learning_rate": 3.188422275420063e-06, "loss": 0.0039, "reward": 1.734375, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1542 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 480.265625, "epoch": 1.2344, "grad_norm": 0.6045595821869633, "kl": 0.08837890625, "learning_rate": 3.186408611154597e-06, "loss": 0.0035, "reward": 1.84375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1543 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 405.015625, "epoch": 1.2352, "grad_norm": 0.519945223557907, "kl": 0.095703125, "learning_rate": 3.184394465141146e-06, "loss": 0.0038, "reward": 1.9375, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 1544 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 409.515625, "epoch": 1.236, "grad_norm": 0.9373246726421136, "kl": 0.09912109375, "learning_rate": 3.1823798387933134e-06, "loss": 0.004, "reward": 1.484375, "reward_std": 0.1530819833278656, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 1.0, "step": 1545 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 443.21875, "epoch": 1.2368000000000001, "grad_norm": 0.08531400250257483, "kl": 0.08349609375, "learning_rate": 3.180364733525043e-06, "loss": 0.0033, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1546 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 374.984375, "epoch": 1.2376, "grad_norm": 0.06331314257077808, "kl": 0.099609375, "learning_rate": 3.178349150750612e-06, "loss": 0.004, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1547 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 385.9375, "epoch": 1.2384, "grad_norm": 0.6285410098420786, "kl": 0.1025390625, "learning_rate": 3.1763330918846347e-06, "loss": 0.0041, "reward": 1.78125, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1548 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 477.09375, "epoch": 1.2392, "grad_norm": 0.4345200262968506, "kl": 0.09521484375, "learning_rate": 3.1743165583420586e-06, "loss": 0.0038, "reward": 1.5625, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 1549 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 439.703125, "epoch": 1.24, "grad_norm": 0.06940406756956047, "kl": 0.091796875, "learning_rate": 3.1722995515381644e-06, "loss": 0.0037, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1550 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 373.421875, "epoch": 1.2408, "grad_norm": 0.7915566012453998, "kl": 0.11376953125, "learning_rate": 3.1702820728885657e-06, "loss": 0.0045, "reward": 1.5625, "reward_std": 0.1872510462999344, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 1551 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 424.46875, "epoch": 1.2416, "grad_norm": 0.6619017754194848, "kl": 0.0927734375, "learning_rate": 3.1682641238092064e-06, "loss": 0.0037, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 1552 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 383.375, "epoch": 1.2424, "grad_norm": 0.4864696985125599, "kl": 0.0859375, "learning_rate": 3.1662457057163603e-06, "loss": 0.0034, "reward": 1.71875, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1553 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 417.90625, "epoch": 1.2432, "grad_norm": 1.0619547205515607, "kl": 0.09716796875, "learning_rate": 3.164226820026632e-06, "loss": 0.0039, "reward": 1.78125, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1554 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 413.0, "epoch": 1.244, "grad_norm": 0.9950719878007777, "kl": 0.09765625, "learning_rate": 3.162207468156952e-06, "loss": 0.0039, "reward": 1.765625, "reward_std": 0.16887325048446655, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1555 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 398.859375, "epoch": 1.2448, "grad_norm": 1.025372167284398, "kl": 0.10546875, "learning_rate": 3.16018765152458e-06, "loss": 0.0042, "reward": 1.640625, "reward_std": 0.19044628739356995, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 1556 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 400.75, "epoch": 1.2456, "grad_norm": 0.48661050828794217, "kl": 0.10009765625, "learning_rate": 3.1581673715471007e-06, "loss": 0.004, "reward": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1557 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 433.546875, "epoch": 1.2464, "grad_norm": 0.5652765630430348, "kl": 0.08642578125, "learning_rate": 3.1561466296424247e-06, "loss": 0.0035, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 1558 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 428.28125, "epoch": 1.2472, "grad_norm": 0.9513859294472352, "kl": 0.0947265625, "learning_rate": 3.154125427228786e-06, "loss": 0.0038, "reward": 1.75, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1559 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 360.296875, "epoch": 1.248, "grad_norm": 0.9009921272927971, "kl": 0.099609375, "learning_rate": 3.152103765724743e-06, "loss": 0.004, "reward": 1.671875, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 1560 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 378.09375, "epoch": 1.2488, "grad_norm": 0.8178067423041044, "kl": 0.10302734375, "learning_rate": 3.150081646549174e-06, "loss": 0.0041, "reward": 1.734375, "reward_std": 0.1893727034330368, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 1561 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 427.34375, "epoch": 1.2496, "grad_norm": 0.38434557649415374, "kl": 0.09814453125, "learning_rate": 3.1480590711212823e-06, "loss": 0.0039, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 1562 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 435.78125, "epoch": 1.2504, "grad_norm": 0.7944107285153403, "kl": 0.09130859375, "learning_rate": 3.1460360408605866e-06, "loss": 0.0037, "reward": 1.796875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1563 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 408.609375, "epoch": 1.2511999999999999, "grad_norm": 0.06691646785805336, "kl": 0.10546875, "learning_rate": 3.144012557186931e-06, "loss": 0.0042, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1564 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 424.09375, "epoch": 1.252, "grad_norm": 0.7682463850209427, "kl": 0.1064453125, "learning_rate": 3.14198862152047e-06, "loss": 0.0043, "reward": 1.734375, "reward_std": 0.1804211586713791, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1565 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 426.046875, "epoch": 1.2528000000000001, "grad_norm": 0.06225457269389659, "kl": 0.08935546875, "learning_rate": 3.1399642352816825e-06, "loss": 0.0036, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1566 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 390.9375, "epoch": 1.2536, "grad_norm": 0.9374596011892233, "kl": 0.103515625, "learning_rate": 3.1379393998913594e-06, "loss": 0.0041, "reward": 1.765625, "reward_std": 0.17358146607875824, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.984375, "step": 1567 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 412.390625, "epoch": 1.2544, "grad_norm": 7.514759589636227, "kl": 0.09228515625, "learning_rate": 3.135914116770609e-06, "loss": 0.0037, "reward": 1.625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1568 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.34375, "epoch": 1.2551999999999999, "grad_norm": 0.6425695406518036, "kl": 0.10205078125, "learning_rate": 3.1338883873408517e-06, "loss": 0.0041, "reward": 1.890625, "reward_std": 0.1315089464187622, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 1569 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 440.03125, "epoch": 1.256, "grad_norm": 0.4627676969714413, "kl": 0.087890625, "learning_rate": 3.1318622130238237e-06, "loss": 0.0035, "reward": 1.609375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 1570 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 433.25, "epoch": 1.2568, "grad_norm": 0.22107253422721868, "kl": 0.0966796875, "learning_rate": 3.1298355952415714e-06, "loss": 0.0039, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1571 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 428.0, "epoch": 1.2576, "grad_norm": 0.7907025613664066, "kl": 0.09716796875, "learning_rate": 3.127808535416454e-06, "loss": 0.0039, "reward": 1.890625, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.984375, "step": 1572 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 449.359375, "epoch": 1.2584, "grad_norm": 0.06448354850068903, "kl": 0.0849609375, "learning_rate": 3.1257810349711388e-06, "loss": 0.0034, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1573 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 477.40625, "epoch": 1.2591999999999999, "grad_norm": 0.7031076936859635, "kl": 0.0927734375, "learning_rate": 3.1237530953286046e-06, "loss": 0.0037, "reward": 1.859375, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1574 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 406.734375, "epoch": 1.26, "grad_norm": 0.5359139719280055, "kl": 0.0966796875, "learning_rate": 3.121724717912138e-06, "loss": 0.0039, "reward": 1.640625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1575 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 478.421875, "epoch": 1.2608, "grad_norm": 1.1600258143376623, "kl": 0.09375, "learning_rate": 3.11969590414533e-06, "loss": 0.0037, "reward": 1.734375, "reward_std": 0.1315089464187622, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1576 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 442.21875, "epoch": 1.2616, "grad_norm": 0.5506945261688823, "kl": 0.09033203125, "learning_rate": 3.1176666554520827e-06, "loss": 0.0036, "reward": 1.734375, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 1577 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 435.484375, "epoch": 1.2624, "grad_norm": 1.0118338172531742, "kl": 0.10205078125, "learning_rate": 3.1156369732566006e-06, "loss": 0.0041, "reward": 1.75, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1578 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 444.390625, "epoch": 1.2631999999999999, "grad_norm": 0.73187066201207, "kl": 0.1025390625, "learning_rate": 3.113606858983391e-06, "loss": 0.0041, "reward": 1.765625, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1579 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 465.265625, "epoch": 1.264, "grad_norm": 0.49609510574463794, "kl": 0.09765625, "learning_rate": 3.1115763140572686e-06, "loss": 0.0039, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1580 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 466.875, "epoch": 1.2648, "grad_norm": 0.4063841044076745, "kl": 0.08642578125, "learning_rate": 3.109545339903347e-06, "loss": 0.0035, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1581 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 510.0, "epoch": 1.2656, "grad_norm": 0.06132187821039446, "kl": 0.09912109375, "learning_rate": 3.107513937947041e-06, "loss": 0.004, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1582 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 439.328125, "epoch": 1.2664, "grad_norm": 0.3537366580312689, "kl": 0.0927734375, "learning_rate": 3.1054821096140675e-06, "loss": 0.0037, "reward": 1.8125, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1583 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 401.171875, "epoch": 1.2671999999999999, "grad_norm": 0.4688301710358441, "kl": 0.10107421875, "learning_rate": 3.1034498563304435e-06, "loss": 0.004, "reward": 1.703125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 1584 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 407.703125, "epoch": 1.268, "grad_norm": 0.41789923092783626, "kl": 0.111328125, "learning_rate": 3.1014171795224794e-06, "loss": 0.0045, "reward": 1.734375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1585 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 436.90625, "epoch": 1.2688, "grad_norm": 0.49923174987107194, "kl": 0.1025390625, "learning_rate": 3.0993840806167884e-06, "loss": 0.0041, "reward": 1.578125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 1586 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 453.90625, "epoch": 1.2696, "grad_norm": 0.5634745847349014, "kl": 0.08935546875, "learning_rate": 3.0973505610402767e-06, "loss": 0.0036, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 1587 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 446.0625, "epoch": 1.2704, "grad_norm": 11.033931290745768, "kl": 0.1025390625, "learning_rate": 3.0953166222201474e-06, "loss": 0.0041, "reward": 1.921875, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 1588 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 417.484375, "epoch": 1.2711999999999999, "grad_norm": 0.6320445656018617, "kl": 0.1005859375, "learning_rate": 3.093282265583895e-06, "loss": 0.004, "reward": 1.671875, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 1589 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 423.78125, "epoch": 1.272, "grad_norm": 0.5090770424043585, "kl": 0.0966796875, "learning_rate": 3.0912474925593124e-06, "loss": 0.0039, "reward": 1.671875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 1590 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 440.109375, "epoch": 1.2728, "grad_norm": 0.34673848255588985, "kl": 0.10107421875, "learning_rate": 3.0892123045744787e-06, "loss": 0.004, "reward": 1.8125, "reward_std": 0.09449111670255661, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.984375, "step": 1591 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 515.8125, "epoch": 1.2736, "grad_norm": 0.5295230759151055, "kl": 0.09033203125, "learning_rate": 3.0871767030577686e-06, "loss": 0.0036, "reward": 1.765625, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1592 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 432.375, "epoch": 1.2744, "grad_norm": 0.6899275836361773, "kl": 0.0966796875, "learning_rate": 3.085140689437846e-06, "loss": 0.0039, "reward": 1.625, "reward_std": 0.17570312321186066, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1593 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 428.171875, "epoch": 1.2752, "grad_norm": 0.37769899158558207, "kl": 0.09716796875, "learning_rate": 3.0831042651436634e-06, "loss": 0.0039, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1594 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 381.75, "epoch": 1.276, "grad_norm": 1.3234644407004388, "kl": 0.10302734375, "learning_rate": 3.0810674316044602e-06, "loss": 0.0041, "reward": 1.5, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 1595 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 380.609375, "epoch": 1.2768, "grad_norm": 1.2325874581687077, "kl": 0.11474609375, "learning_rate": 3.0790301902497664e-06, "loss": 0.0046, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 1596 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 409.734375, "epoch": 1.2776, "grad_norm": 0.49425500489254154, "kl": 0.10400390625, "learning_rate": 3.076992542509396e-06, "loss": 0.0042, "reward": 1.703125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 1597 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 401.515625, "epoch": 1.2784, "grad_norm": 0.44881265921084124, "kl": 0.0927734375, "learning_rate": 3.0749544898134487e-06, "loss": 0.0037, "reward": 1.8125, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1598 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 427.03125, "epoch": 1.2792, "grad_norm": 0.611287629019588, "kl": 0.0966796875, "learning_rate": 3.072916033592307e-06, "loss": 0.0039, "reward": 1.84375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1599 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.703125, "epoch": 1.28, "grad_norm": 0.5946965434991116, "kl": 0.09716796875, "learning_rate": 3.0708771752766397e-06, "loss": 0.0039, "reward": 1.8125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1600 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 456.34375, "epoch": 1.2808, "grad_norm": 0.4426993076609644, "kl": 0.095703125, "learning_rate": 3.068837916297396e-06, "loss": 0.0038, "reward": 1.9375, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 1601 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 422.859375, "epoch": 1.2816, "grad_norm": 1.1339435328216212, "kl": 0.099609375, "learning_rate": 3.0667982580858047e-06, "loss": 0.004, "reward": 1.53125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.984375, "step": 1602 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.890625, "epoch": 1.2824, "grad_norm": 0.9071102702262377, "kl": 0.095703125, "learning_rate": 3.0647582020733773e-06, "loss": 0.0038, "reward": 1.828125, "reward_std": 0.2109457403421402, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1603 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 419.9375, "epoch": 1.2832, "grad_norm": 0.0676820144422039, "kl": 0.09375, "learning_rate": 3.062717749691904e-06, "loss": 0.0038, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1604 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 459.234375, "epoch": 1.284, "grad_norm": 0.41953849970425167, "kl": 0.095703125, "learning_rate": 3.0606769023734535e-06, "loss": 0.0038, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1605 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 434.953125, "epoch": 1.2848, "grad_norm": 0.0648339639969181, "kl": 0.0849609375, "learning_rate": 3.0586356615503693e-06, "loss": 0.0034, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1606 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 399.515625, "epoch": 1.2856, "grad_norm": 0.48602338854803356, "kl": 0.10009765625, "learning_rate": 3.056594028655274e-06, "loss": 0.004, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 1607 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 448.34375, "epoch": 1.2864, "grad_norm": 0.8451809274469703, "kl": 0.09033203125, "learning_rate": 3.0545520051210637e-06, "loss": 0.0036, "reward": 1.84375, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1608 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 441.515625, "epoch": 1.2872, "grad_norm": 0.0589542855636287, "kl": 0.09326171875, "learning_rate": 3.052509592380909e-06, "loss": 0.0037, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1609 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 443.734375, "epoch": 1.288, "grad_norm": 0.4313472556924139, "kl": 0.09619140625, "learning_rate": 3.050466791868254e-06, "loss": 0.0038, "reward": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1610 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 423.234375, "epoch": 1.2888, "grad_norm": 0.6668857371532481, "kl": 0.08642578125, "learning_rate": 3.048423605016815e-06, "loss": 0.0034, "reward": 1.84375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1611 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 451.671875, "epoch": 1.2896, "grad_norm": 1.3406958819436345, "kl": 0.08935546875, "learning_rate": 3.0463800332605787e-06, "loss": 0.0036, "reward": 1.765625, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1612 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 393.078125, "epoch": 1.2904, "grad_norm": 0.08148749383324354, "kl": 0.09814453125, "learning_rate": 3.0443360780338034e-06, "loss": 0.0039, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1613 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 436.015625, "epoch": 1.2912, "grad_norm": 0.4167582259726254, "kl": 0.0966796875, "learning_rate": 3.042291740771014e-06, "loss": 0.0039, "reward": 1.84375, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1614 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 440.234375, "epoch": 1.292, "grad_norm": 0.7375712223049959, "kl": 0.0927734375, "learning_rate": 3.0402470229070057e-06, "loss": 0.0037, "reward": 1.796875, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1615 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 417.421875, "epoch": 1.2928, "grad_norm": 0.40930958366571135, "kl": 0.1015625, "learning_rate": 3.03820192587684e-06, "loss": 0.0041, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1616 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 451.46875, "epoch": 1.2936, "grad_norm": 0.06704584310068394, "kl": 0.09716796875, "learning_rate": 3.036156451115846e-06, "loss": 0.0039, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1617 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 461.078125, "epoch": 1.2944, "grad_norm": 0.8551137716897909, "kl": 0.09521484375, "learning_rate": 3.034110600059616e-06, "loss": 0.0038, "reward": 1.796875, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1618 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 360.828125, "epoch": 1.2952, "grad_norm": 0.4107784594557192, "kl": 0.1044921875, "learning_rate": 3.0320643741440052e-06, "loss": 0.0042, "reward": 1.921875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 1619 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 424.703125, "epoch": 1.296, "grad_norm": 0.7297262530078372, "kl": 0.09765625, "learning_rate": 3.0300177748051375e-06, "loss": 0.0039, "reward": 1.453125, "reward_std": 0.1530819833278656, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 1.0, "step": 1620 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 448.6875, "epoch": 1.2968, "grad_norm": 0.5609991290061184, "kl": 0.09228515625, "learning_rate": 3.0279708034793907e-06, "loss": 0.0037, "reward": 1.96875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 1621 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 433.09375, "epoch": 1.2976, "grad_norm": 2.127767079084424, "kl": 0.099609375, "learning_rate": 3.025923461603412e-06, "loss": 0.004, "reward": 1.765625, "reward_std": 0.2198973000049591, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1622 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 415.671875, "epoch": 1.2984, "grad_norm": 0.421056995953176, "kl": 0.0849609375, "learning_rate": 3.0238757506141013e-06, "loss": 0.0034, "reward": 1.6875, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 1623 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 405.9375, "epoch": 1.2992, "grad_norm": 0.4784112731263907, "kl": 0.0859375, "learning_rate": 3.0218276719486245e-06, "loss": 0.0034, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1624 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 397.9375, "epoch": 1.3, "grad_norm": 0.0846574884923956, "kl": 0.09912109375, "learning_rate": 3.019779227044398e-06, "loss": 0.004, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1625 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 466.484375, "epoch": 1.3008, "grad_norm": 0.41397898699895536, "kl": 0.09423828125, "learning_rate": 3.0177304173391038e-06, "loss": 0.0038, "reward": 1.640625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1626 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 456.078125, "epoch": 1.3016, "grad_norm": 0.091162832448217, "kl": 0.09228515625, "learning_rate": 3.015681244270672e-06, "loss": 0.0037, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1627 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 457.5625, "epoch": 1.3024, "grad_norm": 0.6367701683829089, "kl": 0.0908203125, "learning_rate": 3.0136317092772923e-06, "loss": 0.0036, "reward": 1.765625, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1628 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 511.109375, "epoch": 1.3032, "grad_norm": 0.3142596953350838, "kl": 0.0927734375, "learning_rate": 3.0115818137974066e-06, "loss": 0.0037, "reward": 1.8125, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1629 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 473.4375, "epoch": 1.304, "grad_norm": 0.2755177205130857, "kl": 0.08984375, "learning_rate": 3.0095315592697126e-06, "loss": 0.0036, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1630 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 414.0, "epoch": 1.3048, "grad_norm": 0.621827152213513, "kl": 0.10302734375, "learning_rate": 3.007480947133155e-06, "loss": 0.0041, "reward": 1.59375, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 1631 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 504.125, "epoch": 1.3056, "grad_norm": 4.9949937715132915, "kl": 0.0869140625, "learning_rate": 3.0054299788269343e-06, "loss": 0.0035, "reward": 1.65625, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.984375, "step": 1632 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 457.875, "epoch": 1.3064, "grad_norm": 0.4221012204468084, "kl": 0.0859375, "learning_rate": 3.0033786557904982e-06, "loss": 0.0034, "reward": 1.8125, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1633 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 459.609375, "epoch": 1.3072, "grad_norm": 0.3895949270319322, "kl": 0.09228515625, "learning_rate": 3.001326979463545e-06, "loss": 0.0037, "reward": 1.8125, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1634 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 455.984375, "epoch": 1.308, "grad_norm": 0.6182718078206811, "kl": 0.0908203125, "learning_rate": 2.9992749512860177e-06, "loss": 0.0036, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 1635 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 459.984375, "epoch": 1.3088, "grad_norm": 0.7769829844057634, "kl": 0.0888671875, "learning_rate": 2.9972225726981114e-06, "loss": 0.0036, "reward": 1.640625, "reward_std": 0.17358146607875824, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1636 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 458.59375, "epoch": 1.3096, "grad_norm": 1.2178216736288152, "kl": 0.1044921875, "learning_rate": 2.995169845140264e-06, "loss": 0.0042, "reward": 1.5625, "reward_std": 0.213067427277565, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 1637 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 435.921875, "epoch": 1.3104, "grad_norm": 0.7459185976110713, "kl": 0.099609375, "learning_rate": 2.9931167700531575e-06, "loss": 0.004, "reward": 1.75, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1638 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 431.890625, "epoch": 1.3112, "grad_norm": 0.5701855150813324, "kl": 0.1005859375, "learning_rate": 2.9910633488777198e-06, "loss": 0.004, "reward": 1.625, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1639 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 461.578125, "epoch": 1.312, "grad_norm": 0.5978223972000913, "kl": 0.0869140625, "learning_rate": 2.989009583055121e-06, "loss": 0.0035, "reward": 1.828125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1640 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 424.921875, "epoch": 1.3128, "grad_norm": 0.6614721976106119, "kl": 0.103515625, "learning_rate": 2.9869554740267726e-06, "loss": 0.0041, "reward": 1.640625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1641 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 478.390625, "epoch": 1.3136, "grad_norm": 0.5065939763157713, "kl": 0.09375, "learning_rate": 2.9849010232343274e-06, "loss": 0.0037, "reward": 1.90625, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 1642 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 455.921875, "epoch": 1.3144, "grad_norm": 0.6517591893012402, "kl": 0.0927734375, "learning_rate": 2.982846232119679e-06, "loss": 0.0037, "reward": 1.90625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 1643 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 477.328125, "epoch": 1.3152, "grad_norm": 0.7316421419175054, "kl": 0.08984375, "learning_rate": 2.9807911021249573e-06, "loss": 0.0036, "reward": 1.765625, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1644 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 454.625, "epoch": 1.316, "grad_norm": 0.3839403671631433, "kl": 0.0947265625, "learning_rate": 2.9787356346925327e-06, "loss": 0.0038, "reward": 1.515625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 1.0, "step": 1645 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 418.859375, "epoch": 1.3168, "grad_norm": 0.7656226808781703, "kl": 0.0947265625, "learning_rate": 2.9766798312650112e-06, "loss": 0.0038, "reward": 1.515625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 1.0, "step": 1646 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 409.84375, "epoch": 1.3176, "grad_norm": 0.4979322214211116, "kl": 0.09912109375, "learning_rate": 2.9746236932852355e-06, "loss": 0.004, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1647 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 376.890625, "epoch": 1.3184, "grad_norm": 1.6603169356215501, "kl": 0.1015625, "learning_rate": 2.9725672221962804e-06, "loss": 0.0041, "reward": 1.71875, "reward_std": 0.22461533546447754, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1648 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 433.265625, "epoch": 1.3192, "grad_norm": 0.6779815964348609, "kl": 0.0947265625, "learning_rate": 2.9705104194414587e-06, "loss": 0.0038, "reward": 1.75, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1649 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 447.3125, "epoch": 1.32, "grad_norm": 0.09704947200588641, "kl": 0.099609375, "learning_rate": 2.9684532864643123e-06, "loss": 0.004, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1650 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 386.890625, "epoch": 1.3208, "grad_norm": 0.91615554598393, "kl": 0.09912109375, "learning_rate": 2.9663958247086165e-06, "loss": 0.004, "reward": 1.84375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1651 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 473.109375, "epoch": 1.3216, "grad_norm": 0.43300156125730144, "kl": 0.09375, "learning_rate": 2.964338035618378e-06, "loss": 0.0038, "reward": 1.828125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1652 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 476.09375, "epoch": 1.3224, "grad_norm": 0.6125742503413522, "kl": 0.0888671875, "learning_rate": 2.9622799206378306e-06, "loss": 0.0036, "reward": 1.515625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 1.0, "step": 1653 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 422.140625, "epoch": 1.3232, "grad_norm": 0.6155846543805755, "kl": 0.0888671875, "learning_rate": 2.9602214812114414e-06, "loss": 0.0036, "reward": 1.890625, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 1654 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 436.09375, "epoch": 1.324, "grad_norm": 0.9448571772973201, "kl": 0.08984375, "learning_rate": 2.9581627187838997e-06, "loss": 0.0036, "reward": 1.6875, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 1655 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 441.0625, "epoch": 1.3248, "grad_norm": 0.8290792265291552, "kl": 0.09765625, "learning_rate": 2.956103634800126e-06, "loss": 0.0039, "reward": 1.8125, "reward_std": 0.22461533546447754, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1656 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 487.984375, "epoch": 1.3256000000000001, "grad_norm": 0.7710961065985185, "kl": 0.08935546875, "learning_rate": 2.9540442307052643e-06, "loss": 0.0036, "reward": 1.828125, "reward_std": 0.189372718334198, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1657 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 468.515625, "epoch": 1.3264, "grad_norm": 0.4367040453358929, "kl": 0.09228515625, "learning_rate": 2.9519845079446824e-06, "loss": 0.0037, "reward": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1658 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 441.0, "epoch": 1.3272, "grad_norm": 1.7849653041160358, "kl": 0.09716796875, "learning_rate": 2.949924467963975e-06, "loss": 0.0039, "reward": 1.53125, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 1659 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 456.859375, "epoch": 1.328, "grad_norm": 0.06616971197048722, "kl": 0.087890625, "learning_rate": 2.9478641122089563e-06, "loss": 0.0035, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1660 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 479.890625, "epoch": 1.3288, "grad_norm": 0.8634993072852989, "kl": 0.0888671875, "learning_rate": 2.945803442125663e-06, "loss": 0.0036, "reward": 1.78125, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1661 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 477.28125, "epoch": 1.3296000000000001, "grad_norm": 0.5858383428314752, "kl": 0.08740234375, "learning_rate": 2.943742459160354e-06, "loss": 0.0035, "reward": 1.59375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 1662 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 440.59375, "epoch": 1.3304, "grad_norm": 0.6528977422099489, "kl": 0.0927734375, "learning_rate": 2.9416811647595052e-06, "loss": 0.0037, "reward": 1.828125, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1663 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 435.96875, "epoch": 1.3312, "grad_norm": 0.07583206488855732, "kl": 0.10400390625, "learning_rate": 2.939619560369813e-06, "loss": 0.0042, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1664 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 453.296875, "epoch": 1.332, "grad_norm": 0.3460165537316144, "kl": 0.08447265625, "learning_rate": 2.9375576474381907e-06, "loss": 0.0034, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1665 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 422.140625, "epoch": 1.3328, "grad_norm": 0.38978421791431306, "kl": 0.10107421875, "learning_rate": 2.9354954274117683e-06, "loss": 0.004, "reward": 1.734375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1666 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 466.96875, "epoch": 1.3336000000000001, "grad_norm": 1.4009124949937999, "kl": 0.1015625, "learning_rate": 2.9334329017378898e-06, "loss": 0.0041, "reward": 1.84375, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1667 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 424.328125, "epoch": 1.3344, "grad_norm": 0.6815678642497196, "kl": 0.1005859375, "learning_rate": 2.9313700718641167e-06, "loss": 0.004, "reward": 1.78125, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1668 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 435.234375, "epoch": 1.3352, "grad_norm": 0.7279258818436678, "kl": 0.09716796875, "learning_rate": 2.9293069392382224e-06, "loss": 0.0039, "reward": 1.953125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 1669 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 436.203125, "epoch": 1.336, "grad_norm": 0.5515830996228361, "kl": 0.0947265625, "learning_rate": 2.927243505308192e-06, "loss": 0.0038, "reward": 1.53125, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 1670 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.09375, "epoch": 1.3368, "grad_norm": 1.3628656070496277, "kl": 0.11474609375, "learning_rate": 2.925179771522223e-06, "loss": 0.0046, "reward": 1.75, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1671 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 431.4375, "epoch": 1.3376000000000001, "grad_norm": 0.09915645836765682, "kl": 0.0859375, "learning_rate": 2.9231157393287234e-06, "loss": 0.0034, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1672 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 427.8125, "epoch": 1.3384, "grad_norm": 0.06367935957263718, "kl": 0.10009765625, "learning_rate": 2.9210514101763116e-06, "loss": 0.004, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1673 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 482.546875, "epoch": 1.3392, "grad_norm": 0.2979102609761941, "kl": 0.083984375, "learning_rate": 2.9189867855138103e-06, "loss": 0.0033, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1674 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 377.171875, "epoch": 1.34, "grad_norm": 0.7054617593421745, "kl": 0.11474609375, "learning_rate": 2.9169218667902562e-06, "loss": 0.0046, "reward": 1.6875, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.984375, "step": 1675 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 394.4375, "epoch": 1.3408, "grad_norm": 0.3841998484727046, "kl": 0.09375, "learning_rate": 2.9148566554548857e-06, "loss": 0.0037, "reward": 1.65625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 1676 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 389.53125, "epoch": 1.3416000000000001, "grad_norm": 0.4768179751975896, "kl": 0.08837890625, "learning_rate": 2.912791152957145e-06, "loss": 0.0035, "reward": 1.90625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 1677 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 396.78125, "epoch": 1.3424, "grad_norm": 0.6416350235556716, "kl": 0.1005859375, "learning_rate": 2.9107253607466833e-06, "loss": 0.004, "reward": 1.8125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1678 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 413.71875, "epoch": 1.3432, "grad_norm": 0.07621470210180957, "kl": 0.08740234375, "learning_rate": 2.908659280273354e-06, "loss": 0.0035, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 1679 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 408.625, "epoch": 1.3439999999999999, "grad_norm": 0.07061972741807182, "kl": 0.0927734375, "learning_rate": 2.9065929129872097e-06, "loss": 0.0037, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1680 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 385.984375, "epoch": 1.3448, "grad_norm": 0.5972451450752345, "kl": 0.09912109375, "learning_rate": 2.9045262603385073e-06, "loss": 0.004, "reward": 1.78125, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1681 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.640625, "epoch": 1.3456000000000001, "grad_norm": 1.232740151493746, "kl": 0.09326171875, "learning_rate": 2.902459323777704e-06, "loss": 0.0037, "reward": 1.953125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 1682 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 406.03125, "epoch": 1.3464, "grad_norm": 0.6605666547913168, "kl": 0.091796875, "learning_rate": 2.900392104755455e-06, "loss": 0.0037, "reward": 1.78125, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1683 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 383.5625, "epoch": 1.3472, "grad_norm": 0.5299227472944182, "kl": 0.09716796875, "learning_rate": 2.8983246047226137e-06, "loss": 0.0039, "reward": 1.734375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1684 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.203125, "epoch": 1.3479999999999999, "grad_norm": 0.06712588071041166, "kl": 0.087890625, "learning_rate": 2.8962568251302327e-06, "loss": 0.0035, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1685 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 440.078125, "epoch": 1.3488, "grad_norm": 0.6702177333942141, "kl": 0.09423828125, "learning_rate": 2.8941887674295573e-06, "loss": 0.0038, "reward": 1.921875, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 1686 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 459.375, "epoch": 1.3496000000000001, "grad_norm": 1.1857337277286952, "kl": 0.09375, "learning_rate": 2.892120433072031e-06, "loss": 0.0037, "reward": 1.484375, "reward_std": 0.189372718334198, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 1.0, "step": 1687 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 350.953125, "epoch": 1.3504, "grad_norm": 1.139359898078901, "kl": 0.11279296875, "learning_rate": 2.8900518235092908e-06, "loss": 0.0045, "reward": 1.765625, "reward_std": 0.16887325048446655, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1688 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 443.28125, "epoch": 1.3512, "grad_norm": 0.6587354222238803, "kl": 0.08642578125, "learning_rate": 2.887982940193165e-06, "loss": 0.0035, "reward": 1.875, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1689 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 376.0, "epoch": 1.3519999999999999, "grad_norm": 1.0438938529358286, "kl": 0.09814453125, "learning_rate": 2.8859137845756785e-06, "loss": 0.0039, "reward": 1.75, "reward_std": 0.30038219690322876, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1690 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 384.6875, "epoch": 1.3528, "grad_norm": 0.8948286229963006, "kl": 0.091796875, "learning_rate": 2.8838443581090415e-06, "loss": 0.0037, "reward": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1691 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 410.25, "epoch": 1.3536000000000001, "grad_norm": 0.567807663221168, "kl": 0.10009765625, "learning_rate": 2.8817746622456585e-06, "loss": 0.004, "reward": 1.453125, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 1.0, "step": 1692 }, { "all_correct": 0.375, "all_wrong": 0.5, "completion_length": 372.875, "epoch": 1.3544, "grad_norm": 0.519819832802404, "kl": 0.1044921875, "learning_rate": 2.879704698438121e-06, "loss": 0.0042, "reward": 1.484375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 1.0, "step": 1693 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 391.53125, "epoch": 1.3552, "grad_norm": 0.06979011480417248, "kl": 0.08349609375, "learning_rate": 2.8776344681392106e-06, "loss": 0.0033, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1694 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 417.828125, "epoch": 1.3559999999999999, "grad_norm": 0.8902488995754579, "kl": 0.09765625, "learning_rate": 2.875563972801893e-06, "loss": 0.0039, "reward": 1.578125, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 1695 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 517.671875, "epoch": 1.3568, "grad_norm": 0.43442008238301505, "kl": 0.08349609375, "learning_rate": 2.8734932138793226e-06, "loss": 0.0033, "reward": 1.5625, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 1696 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 383.5625, "epoch": 1.3576, "grad_norm": 0.7640673678868046, "kl": 0.09814453125, "learning_rate": 2.871422192824837e-06, "loss": 0.0039, "reward": 1.890625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 1697 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 419.125, "epoch": 1.3584, "grad_norm": 0.08779466583102942, "kl": 0.09619140625, "learning_rate": 2.8693509110919597e-06, "loss": 0.0038, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1698 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 453.390625, "epoch": 1.3592, "grad_norm": 0.06334432150922421, "kl": 0.0830078125, "learning_rate": 2.867279370134395e-06, "loss": 0.0033, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1699 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 441.421875, "epoch": 1.3599999999999999, "grad_norm": 0.4094643398060491, "kl": 0.09765625, "learning_rate": 2.8652075714060296e-06, "loss": 0.0039, "reward": 1.890625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 1700 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 410.171875, "epoch": 1.3608, "grad_norm": 1.1037768105451344, "kl": 0.09716796875, "learning_rate": 2.863135516360932e-06, "loss": 0.0039, "reward": 1.53125, "reward_std": 0.2709311842918396, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 1701 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 436.546875, "epoch": 1.3616, "grad_norm": 0.05567304665583726, "kl": 0.0869140625, "learning_rate": 2.8610632064533517e-06, "loss": 0.0035, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1702 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 505.3125, "epoch": 1.3624, "grad_norm": 0.3870815660672654, "kl": 0.08251953125, "learning_rate": 2.8589906431377133e-06, "loss": 0.0033, "reward": 1.734375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1703 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 464.5, "epoch": 1.3632, "grad_norm": 0.05381483134898848, "kl": 0.0830078125, "learning_rate": 2.8569178278686222e-06, "loss": 0.0033, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1704 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 440.765625, "epoch": 1.3639999999999999, "grad_norm": 0.8768340466062517, "kl": 0.091796875, "learning_rate": 2.8548447621008614e-06, "loss": 0.0037, "reward": 1.90625, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 1705 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 440.109375, "epoch": 1.3648, "grad_norm": 0.616807366638633, "kl": 0.09716796875, "learning_rate": 2.8527714472893866e-06, "loss": 0.0039, "reward": 1.859375, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1706 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 438.875, "epoch": 1.3656, "grad_norm": 0.6597623797833504, "kl": 0.0869140625, "learning_rate": 2.85069788488933e-06, "loss": 0.0035, "reward": 1.640625, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1707 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 444.953125, "epoch": 1.3664, "grad_norm": 0.07323190489046655, "kl": 0.08642578125, "learning_rate": 2.8486240763559984e-06, "loss": 0.0035, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1708 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 440.109375, "epoch": 1.3672, "grad_norm": 0.42004256302505294, "kl": 0.0830078125, "learning_rate": 2.8465500231448707e-06, "loss": 0.0033, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1709 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 433.03125, "epoch": 1.3679999999999999, "grad_norm": 1.6511001500878217, "kl": 0.0927734375, "learning_rate": 2.844475726711595e-06, "loss": 0.0037, "reward": 1.640625, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1710 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 381.53125, "epoch": 1.3688, "grad_norm": 0.6525848959359796, "kl": 0.09716796875, "learning_rate": 2.8424011885119956e-06, "loss": 0.0039, "reward": 1.875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1711 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 424.34375, "epoch": 1.3696, "grad_norm": 0.55412135648888, "kl": 0.08935546875, "learning_rate": 2.8403264100020613e-06, "loss": 0.0036, "reward": 1.5625, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 1712 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 428.84375, "epoch": 1.3704, "grad_norm": 0.6574047284766994, "kl": 0.09228515625, "learning_rate": 2.8382513926379508e-06, "loss": 0.0037, "reward": 1.546875, "reward_std": 0.16887323558330536, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 1713 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 387.578125, "epoch": 1.3712, "grad_norm": 0.4794407935749394, "kl": 0.09033203125, "learning_rate": 2.836176137875993e-06, "loss": 0.0036, "reward": 1.796875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1714 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 424.703125, "epoch": 1.3719999999999999, "grad_norm": 0.6043851197564949, "kl": 0.08984375, "learning_rate": 2.8341006471726817e-06, "loss": 0.0036, "reward": 1.84375, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1715 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 423.875, "epoch": 1.3728, "grad_norm": 0.05399729496075482, "kl": 0.087890625, "learning_rate": 2.832024921984674e-06, "loss": 0.0035, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1716 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 413.046875, "epoch": 1.3736, "grad_norm": 0.05820011528800716, "kl": 0.0830078125, "learning_rate": 2.8299489637687955e-06, "loss": 0.0033, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1717 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 395.140625, "epoch": 1.3744, "grad_norm": 1.930911031144314, "kl": 0.095703125, "learning_rate": 2.8278727739820334e-06, "loss": 0.0038, "reward": 1.5, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 1718 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 436.328125, "epoch": 1.3752, "grad_norm": 0.17158954044269398, "kl": 0.09326171875, "learning_rate": 2.825796354081537e-06, "loss": 0.0037, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1719 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 401.578125, "epoch": 1.376, "grad_norm": 0.6193317607808814, "kl": 0.09033203125, "learning_rate": 2.8237197055246175e-06, "loss": 0.0036, "reward": 1.765625, "reward_std": 0.1315089464187622, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1720 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 429.578125, "epoch": 1.3768, "grad_norm": 0.7098608229060337, "kl": 0.091796875, "learning_rate": 2.821642829768748e-06, "loss": 0.0037, "reward": 1.484375, "reward_std": 0.1804211586713791, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 1.0, "step": 1721 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.546875, "epoch": 1.3776, "grad_norm": 0.6271528558475724, "kl": 0.087890625, "learning_rate": 2.8195657282715595e-06, "loss": 0.0035, "reward": 1.875, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1722 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 419.96875, "epoch": 1.3784, "grad_norm": 0.07813253837379211, "kl": 0.09375, "learning_rate": 2.817488402490841e-06, "loss": 0.0037, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1723 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.90625, "epoch": 1.3792, "grad_norm": 0.7704286028303243, "kl": 0.09521484375, "learning_rate": 2.8154108538845405e-06, "loss": 0.0038, "reward": 1.8125, "reward_std": 0.2041158676147461, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1724 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 473.140625, "epoch": 1.38, "grad_norm": 0.49829524199966546, "kl": 0.0888671875, "learning_rate": 2.813333083910761e-06, "loss": 0.0035, "reward": 1.84375, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1725 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 416.328125, "epoch": 1.3808, "grad_norm": 0.6052960995753209, "kl": 0.0966796875, "learning_rate": 2.8112550940277615e-06, "loss": 0.0039, "reward": 1.78125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1726 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 375.4375, "epoch": 1.3816, "grad_norm": 0.0658129561427548, "kl": 0.08984375, "learning_rate": 2.809176885693956e-06, "loss": 0.0036, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1727 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 393.65625, "epoch": 1.3824, "grad_norm": 1.0729215465529967, "kl": 0.10498046875, "learning_rate": 2.807098460367911e-06, "loss": 0.0042, "reward": 1.484375, "reward_std": 0.2198973000049591, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 1.0, "step": 1728 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 378.78125, "epoch": 1.3832, "grad_norm": 0.961026547114402, "kl": 0.09228515625, "learning_rate": 2.8050198195083445e-06, "loss": 0.0037, "reward": 1.65625, "reward_std": 0.22461533546447754, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 1729 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 446.390625, "epoch": 1.384, "grad_norm": 0.579067243181472, "kl": 0.08740234375, "learning_rate": 2.802940964574127e-06, "loss": 0.0035, "reward": 1.671875, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 1730 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 441.609375, "epoch": 1.3848, "grad_norm": 0.35402072367858844, "kl": 0.0908203125, "learning_rate": 2.800861897024279e-06, "loss": 0.0036, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 1731 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 455.03125, "epoch": 1.3856, "grad_norm": 0.8443935379882248, "kl": 0.0947265625, "learning_rate": 2.798782618317971e-06, "loss": 0.0038, "reward": 1.71875, "reward_std": 0.1872510462999344, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1732 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 429.390625, "epoch": 1.3864, "grad_norm": 0.1488616126210085, "kl": 0.08203125, "learning_rate": 2.796703129914519e-06, "loss": 0.0033, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1733 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 475.734375, "epoch": 1.3872, "grad_norm": 0.5764467406833679, "kl": 0.087890625, "learning_rate": 2.79462343327339e-06, "loss": 0.0035, "reward": 1.796875, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1734 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 436.53125, "epoch": 1.388, "grad_norm": 0.9090572974719794, "kl": 0.091796875, "learning_rate": 2.7925435298541944e-06, "loss": 0.0037, "reward": 1.921875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 1735 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.34375, "epoch": 1.3888, "grad_norm": 0.7280829399817845, "kl": 0.09912109375, "learning_rate": 2.7904634211166877e-06, "loss": 0.004, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 1736 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 434.859375, "epoch": 1.3896, "grad_norm": 0.43457076699696445, "kl": 0.0908203125, "learning_rate": 2.7883831085207707e-06, "loss": 0.0036, "reward": 1.890625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 1737 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 497.5625, "epoch": 1.3904, "grad_norm": 0.8210253431600771, "kl": 0.0859375, "learning_rate": 2.7863025935264876e-06, "loss": 0.0034, "reward": 1.796875, "reward_std": 0.2472364604473114, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1738 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 434.515625, "epoch": 1.3912, "grad_norm": 0.6231118757133141, "kl": 0.0849609375, "learning_rate": 2.784221877594024e-06, "loss": 0.0034, "reward": 1.890625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 1739 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 428.71875, "epoch": 1.392, "grad_norm": 0.3842278559460937, "kl": 0.08349609375, "learning_rate": 2.7821409621837042e-06, "loss": 0.0033, "reward": 1.703125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 1740 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 444.609375, "epoch": 1.3928, "grad_norm": 0.42824205799656606, "kl": 0.083984375, "learning_rate": 2.7800598487559976e-06, "loss": 0.0034, "reward": 1.890625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 1741 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 391.953125, "epoch": 1.3936, "grad_norm": 0.7379060895730872, "kl": 0.10302734375, "learning_rate": 2.777978538771508e-06, "loss": 0.0041, "reward": 1.40625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 1.0, "step": 1742 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 483.5, "epoch": 1.3944, "grad_norm": 0.2762651121228131, "kl": 0.0859375, "learning_rate": 2.7758970336909795e-06, "loss": 0.0034, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 1743 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 461.84375, "epoch": 1.3952, "grad_norm": 0.060911127843547894, "kl": 0.08251953125, "learning_rate": 2.7738153349752923e-06, "loss": 0.0033, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1744 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 453.09375, "epoch": 1.396, "grad_norm": 0.8246021984882184, "kl": 0.0986328125, "learning_rate": 2.7717334440854634e-06, "loss": 0.0039, "reward": 1.625, "reward_std": 0.1825428307056427, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1745 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 403.59375, "epoch": 1.3968, "grad_norm": 0.5415659251566713, "kl": 0.0927734375, "learning_rate": 2.7696513624826422e-06, "loss": 0.0037, "reward": 1.703125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 1746 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 457.546875, "epoch": 1.3976, "grad_norm": 1.0157142443046738, "kl": 0.09326171875, "learning_rate": 2.7675690916281158e-06, "loss": 0.0037, "reward": 1.421875, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.984375, "step": 1747 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.890625, "epoch": 1.3984, "grad_norm": 0.7862791590866622, "kl": 0.0908203125, "learning_rate": 2.7654866329833e-06, "loss": 0.0036, "reward": 1.875, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1748 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 412.703125, "epoch": 1.3992, "grad_norm": 0.07807654781300286, "kl": 0.09765625, "learning_rate": 2.763403988009746e-06, "loss": 0.0039, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1749 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 456.25, "epoch": 1.4, "grad_norm": 0.054222595804571644, "kl": 0.08447265625, "learning_rate": 2.761321158169134e-06, "loss": 0.0034, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1750 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 497.96875, "epoch": 1.4008, "grad_norm": 1.0155958812389072, "kl": 0.08447265625, "learning_rate": 2.759238144923274e-06, "loss": 0.0034, "reward": 1.6875, "reward_std": 0.24608495831489563, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.984375, "step": 1751 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 480.3125, "epoch": 1.4016, "grad_norm": 0.6716594212516473, "kl": 0.08544921875, "learning_rate": 2.7571549497341044e-06, "loss": 0.0034, "reward": 1.890625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 1752 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 395.953125, "epoch": 1.4024, "grad_norm": 0.7699765748252924, "kl": 0.09423828125, "learning_rate": 2.755071574063692e-06, "loss": 0.0038, "reward": 1.640625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1753 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 457.703125, "epoch": 1.4032, "grad_norm": 0.05632543107067612, "kl": 0.087890625, "learning_rate": 2.7529880193742297e-06, "loss": 0.0035, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1754 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 436.046875, "epoch": 1.404, "grad_norm": 0.09855268858275923, "kl": 0.0830078125, "learning_rate": 2.7509042871280373e-06, "loss": 0.0033, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1755 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 439.703125, "epoch": 1.4048, "grad_norm": 0.6798272640898966, "kl": 0.0947265625, "learning_rate": 2.748820378787558e-06, "loss": 0.0038, "reward": 1.796875, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1756 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 453.609375, "epoch": 1.4056, "grad_norm": 0.48265125822759436, "kl": 0.07763671875, "learning_rate": 2.7467362958153585e-06, "loss": 0.0031, "reward": 1.71875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1757 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 441.78125, "epoch": 1.4064, "grad_norm": 0.6355267246841728, "kl": 0.0986328125, "learning_rate": 2.7446520396741293e-06, "loss": 0.004, "reward": 1.65625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 1758 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 446.359375, "epoch": 1.4072, "grad_norm": 0.0589431499290098, "kl": 0.0927734375, "learning_rate": 2.742567611826681e-06, "loss": 0.0037, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1759 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 455.921875, "epoch": 1.408, "grad_norm": 0.5294323876504748, "kl": 0.08935546875, "learning_rate": 2.7404830137359445e-06, "loss": 0.0036, "reward": 1.734375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1760 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 474.171875, "epoch": 1.4088, "grad_norm": 0.5531081080315496, "kl": 0.095703125, "learning_rate": 2.7383982468649715e-06, "loss": 0.0038, "reward": 1.640625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1761 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 444.5625, "epoch": 1.4096, "grad_norm": 0.43544886548557143, "kl": 0.08935546875, "learning_rate": 2.7363133126769326e-06, "loss": 0.0036, "reward": 1.828125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1762 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 456.203125, "epoch": 1.4104, "grad_norm": 0.8314586552583844, "kl": 0.0927734375, "learning_rate": 2.7342282126351145e-06, "loss": 0.0037, "reward": 1.453125, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 1.0, "step": 1763 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 419.53125, "epoch": 1.4112, "grad_norm": 0.39421918824872754, "kl": 0.1025390625, "learning_rate": 2.73214294820292e-06, "loss": 0.0041, "reward": 1.640625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1764 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 461.359375, "epoch": 1.412, "grad_norm": 0.9457511440850183, "kl": 0.0810546875, "learning_rate": 2.7300575208438684e-06, "loss": 0.0032, "reward": 1.96875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 1765 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 447.125, "epoch": 1.4128, "grad_norm": 0.06588191069668192, "kl": 0.0888671875, "learning_rate": 2.7279719320215924e-06, "loss": 0.0036, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1766 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 481.234375, "epoch": 1.4136, "grad_norm": 0.6801417772319411, "kl": 0.09228515625, "learning_rate": 2.725886183199839e-06, "loss": 0.0037, "reward": 1.734375, "reward_std": 0.16887323558330536, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1767 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 430.703125, "epoch": 1.4144, "grad_norm": 2.235008009368302, "kl": 0.09228515625, "learning_rate": 2.723800275842468e-06, "loss": 0.0037, "reward": 1.84375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.984375, "step": 1768 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 459.078125, "epoch": 1.4152, "grad_norm": 0.05632043931444275, "kl": 0.08447265625, "learning_rate": 2.7217142114134466e-06, "loss": 0.0034, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1769 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 380.6875, "epoch": 1.416, "grad_norm": 1.328281291993653, "kl": 0.10205078125, "learning_rate": 2.7196279913768587e-06, "loss": 0.0041, "reward": 1.765625, "reward_std": 0.17358146607875824, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1770 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 436.9375, "epoch": 1.4168, "grad_norm": 0.4187136145829767, "kl": 0.0869140625, "learning_rate": 2.717541617196891e-06, "loss": 0.0035, "reward": 1.78125, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1771 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 421.125, "epoch": 1.4176, "grad_norm": 0.3430143246769699, "kl": 0.08544921875, "learning_rate": 2.7154550903378425e-06, "loss": 0.0034, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 1772 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 442.640625, "epoch": 1.4184, "grad_norm": 0.75589745660042, "kl": 0.0947265625, "learning_rate": 2.713368412264118e-06, "loss": 0.0038, "reward": 1.65625, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 1773 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 481.84375, "epoch": 1.4192, "grad_norm": 0.5936135246264803, "kl": 0.07666015625, "learning_rate": 2.711281584440228e-06, "loss": 0.0031, "reward": 1.875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1774 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 401.3125, "epoch": 1.42, "grad_norm": 0.7427802776635967, "kl": 0.09423828125, "learning_rate": 2.70919460833079e-06, "loss": 0.0038, "reward": 1.65625, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 1775 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 447.25, "epoch": 1.4208, "grad_norm": 1.0457149215633064, "kl": 0.08642578125, "learning_rate": 2.7071074854005206e-06, "loss": 0.0035, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1776 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 429.53125, "epoch": 1.4216, "grad_norm": 0.11534240201715483, "kl": 0.08544921875, "learning_rate": 2.705020217114248e-06, "loss": 0.0034, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1777 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 404.046875, "epoch": 1.4224, "grad_norm": 2.0687053977515215, "kl": 0.087890625, "learning_rate": 2.7029328049368942e-06, "loss": 0.0035, "reward": 1.59375, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 1778 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 404.859375, "epoch": 1.4232, "grad_norm": 0.6773605944996723, "kl": 0.0908203125, "learning_rate": 2.700845250333486e-06, "loss": 0.0036, "reward": 1.796875, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1779 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 454.21875, "epoch": 1.424, "grad_norm": 0.37142243554998144, "kl": 0.0859375, "learning_rate": 2.69875755476915e-06, "loss": 0.0034, "reward": 1.71875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1780 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 421.9375, "epoch": 1.4248, "grad_norm": 0.842197329868124, "kl": 0.0849609375, "learning_rate": 2.696669719709111e-06, "loss": 0.0034, "reward": 1.796875, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1781 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 428.28125, "epoch": 1.4256, "grad_norm": 0.37781768013698297, "kl": 0.08056640625, "learning_rate": 2.694581746618691e-06, "loss": 0.0032, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1782 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 412.703125, "epoch": 1.4264000000000001, "grad_norm": 0.9742505049336619, "kl": 0.08740234375, "learning_rate": 2.6924936369633126e-06, "loss": 0.0035, "reward": 1.796875, "reward_std": 0.16887323558330536, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1783 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 409.765625, "epoch": 1.4272, "grad_norm": 0.36508109585285203, "kl": 0.0908203125, "learning_rate": 2.6904053922084893e-06, "loss": 0.0036, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 1784 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 399.53125, "epoch": 1.428, "grad_norm": 0.4482960129779903, "kl": 0.0908203125, "learning_rate": 2.688317013819832e-06, "loss": 0.0036, "reward": 1.65625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 1785 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 436.46875, "epoch": 1.4288, "grad_norm": 0.8436666351865185, "kl": 0.08251953125, "learning_rate": 2.686228503263045e-06, "loss": 0.0033, "reward": 1.78125, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1786 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 412.828125, "epoch": 1.4296, "grad_norm": 0.6532852990214628, "kl": 0.0888671875, "learning_rate": 2.684139862003927e-06, "loss": 0.0035, "reward": 1.578125, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 1787 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 412.21875, "epoch": 1.4304000000000001, "grad_norm": 0.7029649307478942, "kl": 0.09765625, "learning_rate": 2.682051091508365e-06, "loss": 0.0039, "reward": 1.734375, "reward_std": 0.1893727034330368, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 1788 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 477.46875, "epoch": 1.4312, "grad_norm": 0.5554355074355926, "kl": 0.08837890625, "learning_rate": 2.679962193242338e-06, "loss": 0.0035, "reward": 1.734375, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1789 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 476.203125, "epoch": 1.432, "grad_norm": 9.292448459279282, "kl": 0.08251953125, "learning_rate": 2.6778731686719177e-06, "loss": 0.0033, "reward": 1.796875, "reward_std": 0.1530819833278656, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1790 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 407.59375, "epoch": 1.4328, "grad_norm": 1.1170165060861008, "kl": 0.08837890625, "learning_rate": 2.67578401926326e-06, "loss": 0.0035, "reward": 1.625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.984375, "step": 1791 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 387.5625, "epoch": 1.4336, "grad_norm": 0.6628566904467228, "kl": 0.08740234375, "learning_rate": 2.6736947464826107e-06, "loss": 0.0035, "reward": 1.640625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1792 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 416.09375, "epoch": 1.4344000000000001, "grad_norm": 0.5003754997616967, "kl": 0.08447265625, "learning_rate": 2.671605351796302e-06, "loss": 0.0034, "reward": 1.828125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1793 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 397.109375, "epoch": 1.4352, "grad_norm": 0.6028123645605987, "kl": 0.0966796875, "learning_rate": 2.6695158366707526e-06, "loss": 0.0039, "reward": 1.59375, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 1794 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 445.53125, "epoch": 1.436, "grad_norm": 3.057199036053953, "kl": 0.1005859375, "learning_rate": 2.667426202572463e-06, "loss": 0.004, "reward": 1.734375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1795 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 408.40625, "epoch": 1.4368, "grad_norm": 0.5067201512233437, "kl": 0.07763671875, "learning_rate": 2.665336450968019e-06, "loss": 0.0031, "reward": 1.90625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 1796 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 429.234375, "epoch": 1.4376, "grad_norm": 0.06801069394458993, "kl": 0.08544921875, "learning_rate": 2.6632465833240895e-06, "loss": 0.0034, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1797 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 460.75, "epoch": 1.4384000000000001, "grad_norm": 0.054330115276332414, "kl": 0.08447265625, "learning_rate": 2.661156601107424e-06, "loss": 0.0034, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1798 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 442.328125, "epoch": 1.4392, "grad_norm": 0.7862043988917632, "kl": 0.08203125, "learning_rate": 2.659066505784852e-06, "loss": 0.0033, "reward": 1.859375, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1799 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 388.171875, "epoch": 1.44, "grad_norm": 0.5702632707864189, "kl": 0.09619140625, "learning_rate": 2.6569762988232838e-06, "loss": 0.0039, "reward": 1.640625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1800 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 465.203125, "epoch": 1.4408, "grad_norm": 0.8445473876693124, "kl": 0.0810546875, "learning_rate": 2.654885981689706e-06, "loss": 0.0032, "reward": 1.9375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 1801 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 423.828125, "epoch": 1.4416, "grad_norm": 0.7352932222659931, "kl": 0.08935546875, "learning_rate": 2.652795555851184e-06, "loss": 0.0036, "reward": 1.875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1802 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 418.375, "epoch": 1.4424000000000001, "grad_norm": 0.6185246891708823, "kl": 0.095703125, "learning_rate": 2.6507050227748595e-06, "loss": 0.0038, "reward": 1.90625, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 0.984375, "step": 1803 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 415.0625, "epoch": 1.4432, "grad_norm": 0.5966741557649632, "kl": 0.083984375, "learning_rate": 2.648614383927949e-06, "loss": 0.0034, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1804 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 435.609375, "epoch": 1.444, "grad_norm": 0.6485992869672412, "kl": 0.09912109375, "learning_rate": 2.646523640777741e-06, "loss": 0.004, "reward": 1.6875, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.984375, "step": 1805 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 456.40625, "epoch": 1.4447999999999999, "grad_norm": 0.055847950381667615, "kl": 0.08251953125, "learning_rate": 2.6444327947916037e-06, "loss": 0.0033, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1806 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.15625, "epoch": 1.4456, "grad_norm": 1.909258839064335, "kl": 0.08740234375, "learning_rate": 2.6423418474369707e-06, "loss": 0.0035, "reward": 1.875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1807 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 477.390625, "epoch": 1.4464000000000001, "grad_norm": 0.06290973328158704, "kl": 0.07421875, "learning_rate": 2.64025080018135e-06, "loss": 0.003, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1808 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 390.625, "epoch": 1.4472, "grad_norm": 0.4668977788776293, "kl": 0.10107421875, "learning_rate": 2.6381596544923184e-06, "loss": 0.004, "reward": 1.609375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.984375, "step": 1809 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 513.4375, "epoch": 1.448, "grad_norm": 0.4406609866412533, "kl": 0.080078125, "learning_rate": 2.636068411837523e-06, "loss": 0.0032, "reward": 1.90625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 1810 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 494.234375, "epoch": 1.4487999999999999, "grad_norm": 0.5725715470521852, "kl": 0.0859375, "learning_rate": 2.6339770736846794e-06, "loss": 0.0034, "reward": 1.671875, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 1811 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 448.875, "epoch": 1.4496, "grad_norm": 0.8695663469311959, "kl": 0.08447265625, "learning_rate": 2.6318856415015664e-06, "loss": 0.0034, "reward": 1.671875, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 1812 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 403.90625, "epoch": 1.4504000000000001, "grad_norm": 0.0594596234641751, "kl": 0.0986328125, "learning_rate": 2.629794116756035e-06, "loss": 0.004, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1813 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 493.359375, "epoch": 1.4512, "grad_norm": 0.8626395875074092, "kl": 0.07861328125, "learning_rate": 2.627702500915995e-06, "loss": 0.0031, "reward": 1.71875, "reward_std": 0.2709311544895172, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 1814 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 430.0, "epoch": 1.452, "grad_norm": 1.2437546406975093, "kl": 0.091796875, "learning_rate": 2.625610795449424e-06, "loss": 0.0037, "reward": 1.796875, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1815 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 442.984375, "epoch": 1.4527999999999999, "grad_norm": 0.4148455903939452, "kl": 0.08447265625, "learning_rate": 2.6235190018243623e-06, "loss": 0.0034, "reward": 1.96875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 1816 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 489.46875, "epoch": 1.4536, "grad_norm": 0.4913454489457642, "kl": 0.09033203125, "learning_rate": 2.6214271215089106e-06, "loss": 0.0036, "reward": 1.734375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 1817 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 451.984375, "epoch": 1.4544000000000001, "grad_norm": 0.3579503990202543, "kl": 0.0908203125, "learning_rate": 2.6193351559712294e-06, "loss": 0.0036, "reward": 1.96875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 1818 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 424.015625, "epoch": 1.4552, "grad_norm": 0.9040421571837298, "kl": 0.080078125, "learning_rate": 2.6172431066795428e-06, "loss": 0.0032, "reward": 1.90625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 1819 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 467.390625, "epoch": 1.456, "grad_norm": 0.5527848660887122, "kl": 0.08935546875, "learning_rate": 2.6151509751021307e-06, "loss": 0.0036, "reward": 1.921875, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 1820 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 452.609375, "epoch": 1.4567999999999999, "grad_norm": 0.7159570949496135, "kl": 0.08447265625, "learning_rate": 2.6130587627073315e-06, "loss": 0.0034, "reward": 1.796875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1821 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 503.59375, "epoch": 1.4576, "grad_norm": 0.7045527173346348, "kl": 0.0869140625, "learning_rate": 2.6109664709635413e-06, "loss": 0.0035, "reward": 1.71875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1822 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 458.953125, "epoch": 1.4584, "grad_norm": 1.3746716929196108, "kl": 0.07373046875, "learning_rate": 2.60887410133921e-06, "loss": 0.003, "reward": 1.84375, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1823 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 401.984375, "epoch": 1.4592, "grad_norm": 0.7099269175226316, "kl": 0.0966796875, "learning_rate": 2.606781655302843e-06, "loss": 0.0039, "reward": 1.734375, "reward_std": 0.16887325048446655, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1824 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 446.734375, "epoch": 1.46, "grad_norm": 0.29515016967963137, "kl": 0.09228515625, "learning_rate": 2.604689134322999e-06, "loss": 0.0037, "reward": 1.640625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1825 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 439.578125, "epoch": 1.4607999999999999, "grad_norm": 0.4957503499839096, "kl": 0.0849609375, "learning_rate": 2.602596539868292e-06, "loss": 0.0034, "reward": 1.71875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1826 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 471.640625, "epoch": 1.4616, "grad_norm": 0.05056978228204779, "kl": 0.08642578125, "learning_rate": 2.6005038734073833e-06, "loss": 0.0035, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1827 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 469.625, "epoch": 1.4624, "grad_norm": 0.05118351605817079, "kl": 0.080078125, "learning_rate": 2.5984111364089875e-06, "loss": 0.0032, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1828 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 439.546875, "epoch": 1.4632, "grad_norm": 0.5244917655001043, "kl": 0.08740234375, "learning_rate": 2.5963183303418682e-06, "loss": 0.0035, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1829 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 378.703125, "epoch": 1.464, "grad_norm": 0.06660306641892735, "kl": 0.09619140625, "learning_rate": 2.594225456674837e-06, "loss": 0.0038, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 1830 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 377.71875, "epoch": 1.4647999999999999, "grad_norm": 0.4164352473607868, "kl": 0.10595703125, "learning_rate": 2.592132516876753e-06, "loss": 0.0042, "reward": 1.78125, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1831 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 447.671875, "epoch": 1.4656, "grad_norm": 0.5268936714361685, "kl": 0.0859375, "learning_rate": 2.5900395124165216e-06, "loss": 0.0034, "reward": 1.796875, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1832 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 436.890625, "epoch": 1.4664, "grad_norm": 0.39118333712594194, "kl": 0.0869140625, "learning_rate": 2.5879464447630947e-06, "loss": 0.0035, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1833 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 460.828125, "epoch": 1.4672, "grad_norm": 0.055892104785312396, "kl": 0.087890625, "learning_rate": 2.5858533153854676e-06, "loss": 0.0035, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1834 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 468.828125, "epoch": 1.468, "grad_norm": 0.30366899064146025, "kl": 0.0849609375, "learning_rate": 2.583760125752679e-06, "loss": 0.0034, "reward": 1.734375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1835 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 490.140625, "epoch": 1.4687999999999999, "grad_norm": 0.3984764927244527, "kl": 0.0869140625, "learning_rate": 2.58166687733381e-06, "loss": 0.0035, "reward": 1.8125, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1836 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 434.34375, "epoch": 1.4696, "grad_norm": 0.09308225263921162, "kl": 0.080078125, "learning_rate": 2.5795735715979826e-06, "loss": 0.0032, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1837 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 398.3125, "epoch": 1.4704, "grad_norm": 0.5889483500307229, "kl": 0.0927734375, "learning_rate": 2.577480210014359e-06, "loss": 0.0037, "reward": 1.796875, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1838 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 397.015625, "epoch": 1.4712, "grad_norm": 0.9711711579355499, "kl": 0.103515625, "learning_rate": 2.575386794052142e-06, "loss": 0.0041, "reward": 1.609375, "reward_std": 0.19044628739356995, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.984375, "step": 1839 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 438.125, "epoch": 1.472, "grad_norm": 0.0572974281718638, "kl": 0.080078125, "learning_rate": 2.5732933251805716e-06, "loss": 0.0032, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1840 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 424.859375, "epoch": 1.4727999999999999, "grad_norm": 0.48171769956286503, "kl": 0.083984375, "learning_rate": 2.571199804868923e-06, "loss": 0.0034, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1841 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 408.515625, "epoch": 1.4736, "grad_norm": 0.39939450673132837, "kl": 0.08349609375, "learning_rate": 2.569106234586511e-06, "loss": 0.0033, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.984375, "step": 1842 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 423.640625, "epoch": 1.4744, "grad_norm": 0.3497836011832466, "kl": 0.0947265625, "learning_rate": 2.5670126158026843e-06, "loss": 0.0038, "reward": 1.609375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 1843 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 471.9375, "epoch": 1.4752, "grad_norm": 0.05418370592255706, "kl": 0.08837890625, "learning_rate": 2.5649189499868233e-06, "loss": 0.0035, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1844 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 436.859375, "epoch": 1.476, "grad_norm": 0.053063064671305116, "kl": 0.0888671875, "learning_rate": 2.5628252386083443e-06, "loss": 0.0036, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1845 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 462.796875, "epoch": 1.4768, "grad_norm": 0.30970356384425807, "kl": 0.08447265625, "learning_rate": 2.560731483136694e-06, "loss": 0.0034, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 1846 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 413.28125, "epoch": 1.4776, "grad_norm": 0.9125615897854776, "kl": 0.0830078125, "learning_rate": 2.558637685041352e-06, "loss": 0.0033, "reward": 1.78125, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1847 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 454.984375, "epoch": 1.4784, "grad_norm": 0.059595331973601735, "kl": 0.076171875, "learning_rate": 2.5565438457918247e-06, "loss": 0.003, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1848 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 391.359375, "epoch": 1.4792, "grad_norm": 0.06795832349931433, "kl": 0.0888671875, "learning_rate": 2.5544499668576508e-06, "loss": 0.0036, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1849 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.890625, "epoch": 1.48, "grad_norm": 0.5189598870665199, "kl": 0.0810546875, "learning_rate": 2.5523560497083927e-06, "loss": 0.0032, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 1850 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 430.5, "epoch": 1.4808, "grad_norm": 0.4665591820452326, "kl": 0.06689453125, "learning_rate": 2.5502620958136444e-06, "loss": 0.0027, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 1851 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 465.4375, "epoch": 1.4816, "grad_norm": 0.6207213717246918, "kl": 0.0810546875, "learning_rate": 2.548168106643022e-06, "loss": 0.0032, "reward": 1.8125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1852 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 452.5625, "epoch": 1.4824, "grad_norm": 0.5084056440420548, "kl": 0.0859375, "learning_rate": 2.546074083666169e-06, "loss": 0.0034, "reward": 1.765625, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1853 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 401.078125, "epoch": 1.4832, "grad_norm": 0.060662205606532446, "kl": 0.08837890625, "learning_rate": 2.5439800283527495e-06, "loss": 0.0035, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1854 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 463.203125, "epoch": 1.484, "grad_norm": 0.07666590310346533, "kl": 0.07373046875, "learning_rate": 2.541885942172454e-06, "loss": 0.0029, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1855 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 400.890625, "epoch": 1.4848, "grad_norm": 0.39402392081461957, "kl": 0.099609375, "learning_rate": 2.539791826594991e-06, "loss": 0.004, "reward": 1.734375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1856 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 455.21875, "epoch": 1.4856, "grad_norm": 0.3416794358100851, "kl": 0.09375, "learning_rate": 2.537697683090093e-06, "loss": 0.0037, "reward": 1.796875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1857 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.34375, "epoch": 1.4864, "grad_norm": 2.1323883472952603, "kl": 0.0927734375, "learning_rate": 2.5356035131275096e-06, "loss": 0.0037, "reward": 1.8125, "reward_std": 0.22461533546447754, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1858 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 447.953125, "epoch": 1.4872, "grad_norm": 0.505078824930963, "kl": 0.0869140625, "learning_rate": 2.5335093181770105e-06, "loss": 0.0035, "reward": 1.5625, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 1859 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 473.828125, "epoch": 1.488, "grad_norm": 0.44477228397087665, "kl": 0.08203125, "learning_rate": 2.531415099708382e-06, "loss": 0.0033, "reward": 1.84375, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1860 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 471.59375, "epoch": 1.4888, "grad_norm": 0.31768271281471744, "kl": 0.0869140625, "learning_rate": 2.5293208591914265e-06, "loss": 0.0035, "reward": 1.671875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 1861 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 396.0625, "epoch": 1.4896, "grad_norm": 0.37940950625472025, "kl": 0.08154296875, "learning_rate": 2.5272265980959644e-06, "loss": 0.0033, "reward": 1.8125, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1862 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 396.71875, "epoch": 1.4904, "grad_norm": 0.6872315143129755, "kl": 0.10009765625, "learning_rate": 2.525132317891827e-06, "loss": 0.004, "reward": 1.59375, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 1863 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 451.375, "epoch": 1.4912, "grad_norm": 0.05494493440457726, "kl": 0.08154296875, "learning_rate": 2.523038020048861e-06, "loss": 0.0033, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1864 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 405.609375, "epoch": 1.492, "grad_norm": 2.3759659888477676, "kl": 0.08740234375, "learning_rate": 2.5209437060369266e-06, "loss": 0.0035, "reward": 1.796875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1865 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 407.1875, "epoch": 1.4928, "grad_norm": 0.5496704832725965, "kl": 0.09130859375, "learning_rate": 2.518849377325893e-06, "loss": 0.0037, "reward": 1.796875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1866 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 460.796875, "epoch": 1.4936, "grad_norm": 0.04423375224248584, "kl": 0.07373046875, "learning_rate": 2.51675503538564e-06, "loss": 0.0029, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1867 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 422.875, "epoch": 1.4944, "grad_norm": 0.4092693934966897, "kl": 0.09326171875, "learning_rate": 2.5146606816860597e-06, "loss": 0.0037, "reward": 1.53125, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 1868 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 454.484375, "epoch": 1.4952, "grad_norm": 0.6088729253934295, "kl": 0.08251953125, "learning_rate": 2.5125663176970475e-06, "loss": 0.0033, "reward": 1.6875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 1869 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 413.09375, "epoch": 1.496, "grad_norm": 0.05835888209531134, "kl": 0.0869140625, "learning_rate": 2.5104719448885103e-06, "loss": 0.0035, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1870 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 410.03125, "epoch": 1.4968, "grad_norm": 0.6104164350909421, "kl": 0.09033203125, "learning_rate": 2.5083775647303583e-06, "loss": 0.0036, "reward": 1.640625, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1871 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 430.265625, "epoch": 1.4976, "grad_norm": 0.7829505465205233, "kl": 0.08740234375, "learning_rate": 2.5062831786925102e-06, "loss": 0.0035, "reward": 1.796875, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.984375, "step": 1872 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 484.140625, "epoch": 1.4984, "grad_norm": 0.7670852263386284, "kl": 0.0771484375, "learning_rate": 2.5041887882448845e-06, "loss": 0.0031, "reward": 1.71875, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1873 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 407.1875, "epoch": 1.4992, "grad_norm": 0.7791601483538768, "kl": 0.1005859375, "learning_rate": 2.5020943948574056e-06, "loss": 0.004, "reward": 1.90625, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 1874 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 414.4375, "epoch": 1.5, "grad_norm": 0.07225995331618189, "kl": 0.08544921875, "learning_rate": 2.5e-06, "loss": 0.0034, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1875 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 461.9375, "epoch": 1.5008, "grad_norm": 1.03905440902492, "kl": 0.08935546875, "learning_rate": 2.497905605142595e-06, "loss": 0.0036, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1876 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 432.75, "epoch": 1.5016, "grad_norm": 0.7955818423033687, "kl": 0.09228515625, "learning_rate": 2.4958112117551163e-06, "loss": 0.0037, "reward": 1.8125, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1877 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 403.46875, "epoch": 1.5024, "grad_norm": 0.4000701195771805, "kl": 0.08154296875, "learning_rate": 2.4937168213074906e-06, "loss": 0.0033, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.984375, "step": 1878 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 434.890625, "epoch": 1.5032, "grad_norm": 0.05769996956502378, "kl": 0.07861328125, "learning_rate": 2.491622435269642e-06, "loss": 0.0031, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1879 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 419.9375, "epoch": 1.504, "grad_norm": 0.658628747605266, "kl": 0.0859375, "learning_rate": 2.489528055111491e-06, "loss": 0.0034, "reward": 1.828125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1880 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 427.671875, "epoch": 1.5048, "grad_norm": 0.6500906281135059, "kl": 0.0888671875, "learning_rate": 2.487433682302953e-06, "loss": 0.0036, "reward": 1.828125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1881 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 458.0625, "epoch": 1.5056, "grad_norm": 0.047598480794597976, "kl": 0.068359375, "learning_rate": 2.485339318313941e-06, "loss": 0.0027, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1882 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 460.5625, "epoch": 1.5064, "grad_norm": 0.0754762844906546, "kl": 0.08349609375, "learning_rate": 2.4832449646143605e-06, "loss": 0.0033, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1883 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 469.203125, "epoch": 1.5072, "grad_norm": 0.4099922193713641, "kl": 0.08203125, "learning_rate": 2.4811506226741077e-06, "loss": 0.0033, "reward": 1.9375, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 1884 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 394.65625, "epoch": 1.508, "grad_norm": 0.8625269812715508, "kl": 0.0927734375, "learning_rate": 2.4790562939630738e-06, "loss": 0.0037, "reward": 1.90625, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 1885 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 472.6875, "epoch": 1.5088, "grad_norm": 0.05813491582761802, "kl": 0.0732421875, "learning_rate": 2.4769619799511392e-06, "loss": 0.0029, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1886 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 474.53125, "epoch": 1.5096, "grad_norm": 0.6129709930819037, "kl": 0.09521484375, "learning_rate": 2.474867682108174e-06, "loss": 0.0038, "reward": 1.796875, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1887 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 426.90625, "epoch": 1.5104, "grad_norm": 0.7629134340766548, "kl": 0.087890625, "learning_rate": 2.472773401904037e-06, "loss": 0.0035, "reward": 1.65625, "reward_std": 0.1735912710428238, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 1888 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 463.953125, "epoch": 1.5112, "grad_norm": 0.4520750873435136, "kl": 0.07861328125, "learning_rate": 2.470679140808574e-06, "loss": 0.0031, "reward": 1.65625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 1889 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 458.625, "epoch": 1.512, "grad_norm": 0.05248061071666733, "kl": 0.08056640625, "learning_rate": 2.4685849002916184e-06, "loss": 0.0032, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1890 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 439.375, "epoch": 1.5128, "grad_norm": 0.058220772450739776, "kl": 0.07763671875, "learning_rate": 2.4664906818229903e-06, "loss": 0.0031, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1891 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 466.09375, "epoch": 1.5135999999999998, "grad_norm": 0.05223533553545021, "kl": 0.08642578125, "learning_rate": 2.4643964868724916e-06, "loss": 0.0035, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1892 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 452.53125, "epoch": 1.5144, "grad_norm": 1.512917485006429, "kl": 0.08203125, "learning_rate": 2.4623023169099074e-06, "loss": 0.0033, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.984375, "step": 1893 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 465.609375, "epoch": 1.5152, "grad_norm": 0.5777589521384713, "kl": 0.07275390625, "learning_rate": 2.4602081734050093e-06, "loss": 0.0029, "reward": 1.859375, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1894 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 454.9375, "epoch": 1.516, "grad_norm": 0.04983621844004456, "kl": 0.0869140625, "learning_rate": 2.4581140578275473e-06, "loss": 0.0035, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1895 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 418.609375, "epoch": 1.5168, "grad_norm": 0.05418391240913127, "kl": 0.0859375, "learning_rate": 2.456019971647251e-06, "loss": 0.0034, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1896 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 419.578125, "epoch": 1.5175999999999998, "grad_norm": 0.4708469803329581, "kl": 0.09912109375, "learning_rate": 2.4539259163338317e-06, "loss": 0.004, "reward": 1.671875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 1897 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 404.96875, "epoch": 1.5184, "grad_norm": 0.20812983894555992, "kl": 0.0908203125, "learning_rate": 2.4518318933569786e-06, "loss": 0.0036, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1898 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 448.40625, "epoch": 1.5192, "grad_norm": 0.4885372425816075, "kl": 0.08544921875, "learning_rate": 2.449737904186357e-06, "loss": 0.0034, "reward": 1.734375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 1899 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 487.625, "epoch": 1.52, "grad_norm": 0.6225599802503589, "kl": 0.0751953125, "learning_rate": 2.447643950291608e-06, "loss": 0.003, "reward": 1.90625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 1900 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 450.5625, "epoch": 1.5208, "grad_norm": 0.44505940091203605, "kl": 0.08056640625, "learning_rate": 2.4455500331423505e-06, "loss": 0.0032, "reward": 1.6875, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 1901 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 426.828125, "epoch": 1.5215999999999998, "grad_norm": 1.5724566153791564, "kl": 0.08642578125, "learning_rate": 2.4434561542081765e-06, "loss": 0.0035, "reward": 1.53125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 1902 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 412.15625, "epoch": 1.5224, "grad_norm": 0.0688764837836802, "kl": 0.083984375, "learning_rate": 2.441362314958649e-06, "loss": 0.0034, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1903 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 406.8125, "epoch": 1.5232, "grad_norm": 0.4062787098036763, "kl": 0.0849609375, "learning_rate": 2.439268516863306e-06, "loss": 0.0034, "reward": 1.703125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 1904 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 405.1875, "epoch": 1.524, "grad_norm": 0.056109183998569095, "kl": 0.07958984375, "learning_rate": 2.4371747613916566e-06, "loss": 0.0032, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1905 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 398.421875, "epoch": 1.5248, "grad_norm": 0.5402923435551574, "kl": 0.09814453125, "learning_rate": 2.4350810500131776e-06, "loss": 0.0039, "reward": 1.515625, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.984375, "step": 1906 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 451.921875, "epoch": 1.5255999999999998, "grad_norm": 0.06078793551415715, "kl": 0.0830078125, "learning_rate": 2.4329873841973174e-06, "loss": 0.0033, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1907 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 414.578125, "epoch": 1.5264, "grad_norm": 0.43927563665387126, "kl": 0.08349609375, "learning_rate": 2.4308937654134893e-06, "loss": 0.0033, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1908 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 373.65625, "epoch": 1.5272000000000001, "grad_norm": 0.977754562420843, "kl": 0.099609375, "learning_rate": 2.428800195131078e-06, "loss": 0.004, "reward": 1.703125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 1909 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 446.90625, "epoch": 1.528, "grad_norm": 0.43253855303777056, "kl": 0.08642578125, "learning_rate": 2.4267066748194297e-06, "loss": 0.0035, "reward": 1.78125, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1910 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 419.453125, "epoch": 1.5288, "grad_norm": 0.05995847869070541, "kl": 0.08203125, "learning_rate": 2.4246132059478582e-06, "loss": 0.0033, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1911 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 481.9375, "epoch": 1.5295999999999998, "grad_norm": 0.05057924473336973, "kl": 0.087890625, "learning_rate": 2.4225197899856416e-06, "loss": 0.0035, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1912 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 431.359375, "epoch": 1.5304, "grad_norm": 0.7218737826161329, "kl": 0.08349609375, "learning_rate": 2.4204264284020182e-06, "loss": 0.0033, "reward": 1.546875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 1913 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 424.796875, "epoch": 1.5312000000000001, "grad_norm": 1.083573297307257, "kl": 0.10546875, "learning_rate": 2.4183331226661913e-06, "loss": 0.0042, "reward": 1.578125, "reward_std": 0.19044628739356995, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.984375, "step": 1914 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 461.015625, "epoch": 1.532, "grad_norm": 0.903437762615361, "kl": 0.08251953125, "learning_rate": 2.4162398742473216e-06, "loss": 0.0033, "reward": 1.671875, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 1915 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 447.6875, "epoch": 1.5328, "grad_norm": 0.05691963307194323, "kl": 0.09326171875, "learning_rate": 2.4141466846145332e-06, "loss": 0.0037, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1916 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 437.796875, "epoch": 1.5335999999999999, "grad_norm": 0.6243776782610874, "kl": 0.08935546875, "learning_rate": 2.4120535552369057e-06, "loss": 0.0036, "reward": 1.625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.984375, "step": 1917 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 455.8125, "epoch": 1.5344, "grad_norm": 0.05135058165921652, "kl": 0.0751953125, "learning_rate": 2.4099604875834796e-06, "loss": 0.003, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1918 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.390625, "epoch": 1.5352000000000001, "grad_norm": 0.5251489637543353, "kl": 0.09033203125, "learning_rate": 2.407867483123248e-06, "loss": 0.0036, "reward": 1.90625, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 1919 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 470.125, "epoch": 1.536, "grad_norm": 0.4654445293962623, "kl": 0.08056640625, "learning_rate": 2.4057745433251637e-06, "loss": 0.0032, "reward": 1.734375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1920 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 451.96875, "epoch": 1.5368, "grad_norm": 0.3780116267775583, "kl": 0.07666015625, "learning_rate": 2.4036816696581326e-06, "loss": 0.0031, "reward": 1.640625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1921 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 452.015625, "epoch": 1.5375999999999999, "grad_norm": 0.04968332132414957, "kl": 0.078125, "learning_rate": 2.401588863591013e-06, "loss": 0.0031, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1922 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 422.390625, "epoch": 1.5384, "grad_norm": 0.5597047714476183, "kl": 0.0751953125, "learning_rate": 2.3994961265926166e-06, "loss": 0.003, "reward": 1.796875, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.984375, "step": 1923 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 401.703125, "epoch": 1.5392000000000001, "grad_norm": 0.42614631105928746, "kl": 0.080078125, "learning_rate": 2.3974034601317085e-06, "loss": 0.0032, "reward": 1.734375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 1924 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 473.9375, "epoch": 1.54, "grad_norm": 0.7449681026166073, "kl": 0.08447265625, "learning_rate": 2.3953108656770018e-06, "loss": 0.0034, "reward": 1.75, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1925 }, { "all_correct": 0.375, "all_wrong": 0.5, "completion_length": 505.890625, "epoch": 1.5408, "grad_norm": 0.3399618763451221, "kl": 0.0771484375, "learning_rate": 2.3932183446971584e-06, "loss": 0.0031, "reward": 1.421875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 1.0, "step": 1926 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 405.40625, "epoch": 1.5415999999999999, "grad_norm": 0.048034442827705685, "kl": 0.076171875, "learning_rate": 2.3911258986607907e-06, "loss": 0.0031, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1927 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 412.78125, "epoch": 1.5424, "grad_norm": 0.5970261805180694, "kl": 0.091796875, "learning_rate": 2.3890335290364596e-06, "loss": 0.0037, "reward": 1.671875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 1928 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 396.421875, "epoch": 1.5432000000000001, "grad_norm": 1.2450282207798113, "kl": 0.08740234375, "learning_rate": 2.386941237292669e-06, "loss": 0.0035, "reward": 1.546875, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 1929 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 377.140625, "epoch": 1.544, "grad_norm": 0.4069990345139908, "kl": 0.09228515625, "learning_rate": 2.3848490248978693e-06, "loss": 0.0037, "reward": 1.890625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 1930 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 376.75, "epoch": 1.5448, "grad_norm": 0.41353856720159765, "kl": 0.08642578125, "learning_rate": 2.3827568933204576e-06, "loss": 0.0035, "reward": 1.84375, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1931 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 440.6875, "epoch": 1.5455999999999999, "grad_norm": 0.3722024727369536, "kl": 0.09423828125, "learning_rate": 2.3806648440287715e-06, "loss": 0.0038, "reward": 1.59375, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 1932 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 444.390625, "epoch": 1.5464, "grad_norm": 0.29316398916960057, "kl": 0.07568359375, "learning_rate": 2.378572878491091e-06, "loss": 0.003, "reward": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1933 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 411.875, "epoch": 1.5472000000000001, "grad_norm": 0.05588848785297944, "kl": 0.0947265625, "learning_rate": 2.376480998175638e-06, "loss": 0.0038, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 1934 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 408.796875, "epoch": 1.548, "grad_norm": 0.3869799472767756, "kl": 0.0732421875, "learning_rate": 2.3743892045505764e-06, "loss": 0.0029, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 1935 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 411.203125, "epoch": 1.5488, "grad_norm": 0.05040098743195126, "kl": 0.08154296875, "learning_rate": 2.372297499084006e-06, "loss": 0.0033, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1936 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 414.5, "epoch": 1.5495999999999999, "grad_norm": 0.09941455691374751, "kl": 0.080078125, "learning_rate": 2.3702058832439667e-06, "loss": 0.0032, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1937 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 410.390625, "epoch": 1.5504, "grad_norm": 0.049595646976077816, "kl": 0.0810546875, "learning_rate": 2.368114358498434e-06, "loss": 0.0032, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1938 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 489.515625, "epoch": 1.5512000000000001, "grad_norm": 0.058668874610572117, "kl": 0.07080078125, "learning_rate": 2.366022926315322e-06, "loss": 0.0028, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1939 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 423.4375, "epoch": 1.552, "grad_norm": 0.4576779353248496, "kl": 0.09521484375, "learning_rate": 2.3639315881624776e-06, "loss": 0.0038, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.984375, "step": 1940 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 385.453125, "epoch": 1.5528, "grad_norm": 0.41280531013291405, "kl": 0.1005859375, "learning_rate": 2.361840345507683e-06, "loss": 0.004, "reward": 1.5625, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 1941 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 456.75, "epoch": 1.5535999999999999, "grad_norm": 0.051528662637933556, "kl": 0.083984375, "learning_rate": 2.359749199818651e-06, "loss": 0.0034, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1942 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 402.359375, "epoch": 1.5544, "grad_norm": 0.44677184269484693, "kl": 0.08544921875, "learning_rate": 2.3576581525630297e-06, "loss": 0.0034, "reward": 1.90625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 1943 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 428.171875, "epoch": 1.5552000000000001, "grad_norm": 0.4934843447792334, "kl": 0.08154296875, "learning_rate": 2.355567205208397e-06, "loss": 0.0033, "reward": 1.6875, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 1944 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 445.40625, "epoch": 1.556, "grad_norm": 1.2941746037157797, "kl": 0.0849609375, "learning_rate": 2.353476359222259e-06, "loss": 0.0034, "reward": 1.8125, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1945 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.671875, "epoch": 1.5568, "grad_norm": 0.7087891414321927, "kl": 0.09521484375, "learning_rate": 2.351385616072052e-06, "loss": 0.0038, "reward": 1.796875, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1946 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 441.796875, "epoch": 1.5575999999999999, "grad_norm": 0.7033943265399606, "kl": 0.08935546875, "learning_rate": 2.3492949772251418e-06, "loss": 0.0036, "reward": 1.671875, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.984375, "step": 1947 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 435.203125, "epoch": 1.5584, "grad_norm": 0.517283215443088, "kl": 0.08349609375, "learning_rate": 2.3472044441488175e-06, "loss": 0.0033, "reward": 1.90625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 1948 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 422.203125, "epoch": 1.5592000000000001, "grad_norm": 0.32980988793749694, "kl": 0.0947265625, "learning_rate": 2.345114018310295e-06, "loss": 0.0038, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1949 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 520.0625, "epoch": 1.56, "grad_norm": 0.3253568610855632, "kl": 0.0771484375, "learning_rate": 2.3430237011767166e-06, "loss": 0.0031, "reward": 1.640625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1950 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 427.390625, "epoch": 1.5608, "grad_norm": 0.9188973196173563, "kl": 0.09716796875, "learning_rate": 2.3409334942151485e-06, "loss": 0.0039, "reward": 1.734375, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 1951 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 398.109375, "epoch": 1.5615999999999999, "grad_norm": 0.05237912104856825, "kl": 0.08251953125, "learning_rate": 2.3388433988925767e-06, "loss": 0.0033, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1952 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 434.0625, "epoch": 1.5624, "grad_norm": 0.8733536196484487, "kl": 0.09765625, "learning_rate": 2.3367534166759105e-06, "loss": 0.0039, "reward": 1.9375, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 1953 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 481.765625, "epoch": 1.5632000000000001, "grad_norm": 0.3414651117889524, "kl": 0.087890625, "learning_rate": 2.3346635490319815e-06, "loss": 0.0035, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1954 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 429.015625, "epoch": 1.564, "grad_norm": 0.8019378872289062, "kl": 0.09033203125, "learning_rate": 2.3325737974275382e-06, "loss": 0.0036, "reward": 1.8125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1955 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 450.71875, "epoch": 1.5648, "grad_norm": 0.37614842106850166, "kl": 0.09130859375, "learning_rate": 2.3304841633292487e-06, "loss": 0.0037, "reward": 1.53125, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 1956 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.90625, "epoch": 1.5655999999999999, "grad_norm": 0.920505096249008, "kl": 0.08740234375, "learning_rate": 2.328394648203698e-06, "loss": 0.0035, "reward": 1.828125, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1957 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 469.453125, "epoch": 1.5664, "grad_norm": 0.06176893859386287, "kl": 0.09375, "learning_rate": 2.32630525351739e-06, "loss": 0.0038, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1958 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 461.671875, "epoch": 1.5672000000000001, "grad_norm": 0.6721447259674301, "kl": 0.09765625, "learning_rate": 2.324215980736741e-06, "loss": 0.0039, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 1959 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 438.40625, "epoch": 1.568, "grad_norm": 0.6466685139029245, "kl": 0.0869140625, "learning_rate": 2.3221268313280836e-06, "loss": 0.0035, "reward": 1.84375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1960 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 444.53125, "epoch": 1.5688, "grad_norm": 0.6677164989481668, "kl": 0.0830078125, "learning_rate": 2.320037806757662e-06, "loss": 0.0033, "reward": 1.84375, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1961 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 410.640625, "epoch": 1.5695999999999999, "grad_norm": 0.05529813702949351, "kl": 0.08251953125, "learning_rate": 2.317948908491636e-06, "loss": 0.0033, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1962 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 415.234375, "epoch": 1.5704, "grad_norm": 0.3663790734074994, "kl": 0.09375, "learning_rate": 2.315860137996074e-06, "loss": 0.0038, "reward": 1.609375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 1963 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 518.40625, "epoch": 1.5712000000000002, "grad_norm": 0.3820730654813393, "kl": 0.08349609375, "learning_rate": 2.3137714967369544e-06, "loss": 0.0033, "reward": 1.734375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1964 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 400.359375, "epoch": 1.572, "grad_norm": 0.5072571106126055, "kl": 0.0888671875, "learning_rate": 2.3116829861801687e-06, "loss": 0.0036, "reward": 1.578125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 1965 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 453.640625, "epoch": 1.5728, "grad_norm": 0.7357031421006998, "kl": 0.099609375, "learning_rate": 2.3095946077915115e-06, "loss": 0.004, "reward": 1.75, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1966 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 392.828125, "epoch": 1.5735999999999999, "grad_norm": 0.5710762790424448, "kl": 0.10009765625, "learning_rate": 2.307506363036688e-06, "loss": 0.004, "reward": 1.65625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 1967 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 406.640625, "epoch": 1.5744, "grad_norm": 0.6056496600123321, "kl": 0.10546875, "learning_rate": 2.305418253381309e-06, "loss": 0.0042, "reward": 1.65625, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 1968 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 423.578125, "epoch": 1.5752000000000002, "grad_norm": 0.7841380166434793, "kl": 0.1171875, "learning_rate": 2.3033302802908895e-06, "loss": 0.0047, "reward": 1.75, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1969 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 427.953125, "epoch": 1.576, "grad_norm": 0.7099695625206764, "kl": 0.0908203125, "learning_rate": 2.301242445230851e-06, "loss": 0.0036, "reward": 1.890625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 1970 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 439.375, "epoch": 1.5768, "grad_norm": 0.48466629449866044, "kl": 0.0869140625, "learning_rate": 2.299154749666515e-06, "loss": 0.0035, "reward": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1971 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 442.78125, "epoch": 1.5776, "grad_norm": 0.8487171031141673, "kl": 0.09619140625, "learning_rate": 2.2970671950631066e-06, "loss": 0.0038, "reward": 1.78125, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1972 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.15625, "epoch": 1.5784, "grad_norm": 0.6884572205620748, "kl": 0.09423828125, "learning_rate": 2.2949797828857527e-06, "loss": 0.0038, "reward": 1.890625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 1973 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 403.109375, "epoch": 1.5792000000000002, "grad_norm": 0.4376278554728628, "kl": 0.08935546875, "learning_rate": 2.2928925145994798e-06, "loss": 0.0036, "reward": 1.828125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1974 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 473.484375, "epoch": 1.58, "grad_norm": 0.612301840169861, "kl": 0.08740234375, "learning_rate": 2.290805391669212e-06, "loss": 0.0035, "reward": 1.6875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 1975 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 461.765625, "epoch": 1.5808, "grad_norm": 1.1403517593852999, "kl": 0.1015625, "learning_rate": 2.2887184155597725e-06, "loss": 0.0041, "reward": 1.875, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1976 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 432.234375, "epoch": 1.5816, "grad_norm": 0.6715587617536881, "kl": 0.0908203125, "learning_rate": 2.286631587735883e-06, "loss": 0.0036, "reward": 1.859375, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.984375, "step": 1977 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 394.484375, "epoch": 1.5824, "grad_norm": 0.5746937057612103, "kl": 0.095703125, "learning_rate": 2.2845449096621583e-06, "loss": 0.0038, "reward": 1.734375, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1978 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 374.25, "epoch": 1.5832000000000002, "grad_norm": 0.6515198071616948, "kl": 0.10009765625, "learning_rate": 2.282458382803109e-06, "loss": 0.004, "reward": 1.84375, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1979 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 418.390625, "epoch": 1.584, "grad_norm": 0.06183753410933011, "kl": 0.08837890625, "learning_rate": 2.280372008623142e-06, "loss": 0.0035, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1980 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 419.859375, "epoch": 1.5848, "grad_norm": 0.7247649129352906, "kl": 0.10009765625, "learning_rate": 2.2782857885865538e-06, "loss": 0.004, "reward": 1.765625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1981 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 425.546875, "epoch": 1.5856, "grad_norm": 0.058504336371987986, "kl": 0.07861328125, "learning_rate": 2.2761997241575335e-06, "loss": 0.0031, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1982 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 464.09375, "epoch": 1.5864, "grad_norm": 0.059325817514382304, "kl": 0.0927734375, "learning_rate": 2.274113816800161e-06, "loss": 0.0037, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1983 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 427.15625, "epoch": 1.5872000000000002, "grad_norm": 0.44834264209595615, "kl": 0.08154296875, "learning_rate": 2.272028067978408e-06, "loss": 0.0033, "reward": 1.953125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 1984 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 380.671875, "epoch": 1.588, "grad_norm": 0.07685436523479029, "kl": 0.09326171875, "learning_rate": 2.2699424791561324e-06, "loss": 0.0037, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1985 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 447.171875, "epoch": 1.5888, "grad_norm": 0.060732157570829066, "kl": 0.0966796875, "learning_rate": 2.267857051797081e-06, "loss": 0.0039, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1986 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 419.40625, "epoch": 1.5896, "grad_norm": 0.680513554192407, "kl": 0.0947265625, "learning_rate": 2.265771787364886e-06, "loss": 0.0038, "reward": 1.671875, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 1987 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 419.5625, "epoch": 1.5904, "grad_norm": 0.6776229162099154, "kl": 0.10595703125, "learning_rate": 2.263686687323068e-06, "loss": 0.0042, "reward": 1.953125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 1988 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 428.96875, "epoch": 1.5912, "grad_norm": 0.05980076735866545, "kl": 0.0830078125, "learning_rate": 2.261601753135029e-06, "loss": 0.0033, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1989 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 427.640625, "epoch": 1.592, "grad_norm": 0.562923329652774, "kl": 0.10400390625, "learning_rate": 2.259516986264057e-06, "loss": 0.0042, "reward": 1.703125, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 1990 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 402.796875, "epoch": 1.5928, "grad_norm": 0.6307637783596173, "kl": 0.09033203125, "learning_rate": 2.2574323881733202e-06, "loss": 0.0036, "reward": 1.78125, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1991 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 423.421875, "epoch": 1.5936, "grad_norm": 0.6263483609701712, "kl": 0.07861328125, "learning_rate": 2.255347960325871e-06, "loss": 0.0031, "reward": 1.875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1992 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 438.640625, "epoch": 1.5944, "grad_norm": 0.5561684101608247, "kl": 0.0703125, "learning_rate": 2.2532637041846423e-06, "loss": 0.0028, "reward": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1993 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 434.765625, "epoch": 1.5952, "grad_norm": 2.007077470371618, "kl": 0.1044921875, "learning_rate": 2.2511796212124424e-06, "loss": 0.0042, "reward": 1.53125, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 1994 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 403.265625, "epoch": 1.596, "grad_norm": 0.052762762834036386, "kl": 0.07763671875, "learning_rate": 2.2490957128719627e-06, "loss": 0.0031, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1995 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 431.09375, "epoch": 1.5968, "grad_norm": 0.540183431697575, "kl": 0.0927734375, "learning_rate": 2.247011980625771e-06, "loss": 0.0037, "reward": 1.953125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 1996 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 399.546875, "epoch": 1.5976, "grad_norm": 0.709139819373853, "kl": 0.1083984375, "learning_rate": 2.2449284259363093e-06, "loss": 0.0043, "reward": 1.96875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 1997 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 398.703125, "epoch": 1.5984, "grad_norm": 0.6191389046837881, "kl": 0.0908203125, "learning_rate": 2.2428450502658964e-06, "loss": 0.0036, "reward": 1.625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1998 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 423.515625, "epoch": 1.5992, "grad_norm": 0.6661522188284288, "kl": 0.09375, "learning_rate": 2.240761855076727e-06, "loss": 0.0037, "reward": 1.90625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 1999 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 472.109375, "epoch": 1.6, "grad_norm": 0.6794216918715973, "kl": 0.09228515625, "learning_rate": 2.238678841830867e-06, "loss": 0.0037, "reward": 1.390625, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 1.0, "step": 2000 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 502.25, "epoch": 1.6008, "grad_norm": 0.4924265749469055, "kl": 0.0859375, "learning_rate": 2.2365960119902543e-06, "loss": 0.0034, "reward": 1.703125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 2001 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 441.125, "epoch": 1.6016, "grad_norm": 0.6632221972914333, "kl": 0.08984375, "learning_rate": 2.2345133670167e-06, "loss": 0.0036, "reward": 1.828125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 2002 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 403.984375, "epoch": 1.6024, "grad_norm": 0.5731803399766031, "kl": 0.09521484375, "learning_rate": 2.232430908371885e-06, "loss": 0.0038, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 2003 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 429.59375, "epoch": 1.6032, "grad_norm": 0.05592152651951858, "kl": 0.08984375, "learning_rate": 2.2303486375173586e-06, "loss": 0.0036, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2004 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 393.484375, "epoch": 1.604, "grad_norm": 0.5149817346204347, "kl": 0.0888671875, "learning_rate": 2.228266555914538e-06, "loss": 0.0035, "reward": 1.953125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 2005 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 420.921875, "epoch": 1.6048, "grad_norm": 0.5618040729575181, "kl": 0.1083984375, "learning_rate": 2.2261846650247077e-06, "loss": 0.0043, "reward": 1.703125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 2006 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 431.25, "epoch": 1.6056, "grad_norm": 0.5471707557096966, "kl": 0.08154296875, "learning_rate": 2.224102966309021e-06, "loss": 0.0033, "reward": 1.875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2007 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 364.625, "epoch": 1.6064, "grad_norm": 0.5750125022504474, "kl": 0.083984375, "learning_rate": 2.2220214612284925e-06, "loss": 0.0034, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 2008 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 408.53125, "epoch": 1.6072, "grad_norm": 0.5028381419417359, "kl": 0.0966796875, "learning_rate": 2.2199401512440037e-06, "loss": 0.0039, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 2009 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.796875, "epoch": 1.608, "grad_norm": 1.0662632177333176, "kl": 0.08984375, "learning_rate": 2.2178590378162957e-06, "loss": 0.0036, "reward": 1.875, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2010 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 442.4375, "epoch": 1.6088, "grad_norm": 0.809973613432914, "kl": 0.109375, "learning_rate": 2.215778122405977e-06, "loss": 0.0044, "reward": 1.625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 2011 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 402.453125, "epoch": 1.6096, "grad_norm": 0.40635075586272323, "kl": 0.10107421875, "learning_rate": 2.2136974064735132e-06, "loss": 0.0041, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 2012 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 454.96875, "epoch": 1.6104, "grad_norm": 0.3458725202502868, "kl": 0.08544921875, "learning_rate": 2.2116168914792293e-06, "loss": 0.0034, "reward": 1.890625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 2013 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 412.15625, "epoch": 1.6112, "grad_norm": 0.7825353499057853, "kl": 0.09716796875, "learning_rate": 2.209536578883313e-06, "loss": 0.0039, "reward": 1.90625, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 2014 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 450.75, "epoch": 1.612, "grad_norm": 0.40853927313163246, "kl": 0.099609375, "learning_rate": 2.207456470145807e-06, "loss": 0.004, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 2015 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 398.46875, "epoch": 1.6128, "grad_norm": 0.39259991374650216, "kl": 0.09375, "learning_rate": 2.205376566726611e-06, "loss": 0.0037, "reward": 1.671875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 2016 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 403.765625, "epoch": 1.6136, "grad_norm": 0.38809053469550847, "kl": 0.0703125, "learning_rate": 2.2032968700854813e-06, "loss": 0.0028, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 2017 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 430.359375, "epoch": 1.6143999999999998, "grad_norm": 1.6390571938325302, "kl": 0.11865234375, "learning_rate": 2.2012173816820297e-06, "loss": 0.0047, "reward": 1.796875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 2018 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 437.4375, "epoch": 1.6152, "grad_norm": 0.76433564696172, "kl": 0.11376953125, "learning_rate": 2.1991381029757216e-06, "loss": 0.0046, "reward": 1.515625, "reward_std": 0.1530819833278656, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.984375, "step": 2019 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 466.34375, "epoch": 1.616, "grad_norm": 0.05320376064530623, "kl": 0.07763671875, "learning_rate": 2.1970590354258745e-06, "loss": 0.0031, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2020 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 389.640625, "epoch": 1.6168, "grad_norm": 0.39452858608194213, "kl": 0.099609375, "learning_rate": 2.1949801804916563e-06, "loss": 0.004, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 2021 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 409.953125, "epoch": 1.6176, "grad_norm": 0.6108606028152092, "kl": 0.099609375, "learning_rate": 2.19290153963209e-06, "loss": 0.004, "reward": 1.75, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2022 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 387.34375, "epoch": 1.6183999999999998, "grad_norm": 0.07378419376928622, "kl": 0.08349609375, "learning_rate": 2.190823114306045e-06, "loss": 0.0033, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2023 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 434.09375, "epoch": 1.6192, "grad_norm": 0.06119824242919308, "kl": 0.0810546875, "learning_rate": 2.188744905972239e-06, "loss": 0.0032, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2024 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 479.453125, "epoch": 1.62, "grad_norm": 0.07068022480178383, "kl": 0.09375, "learning_rate": 2.186666916089239e-06, "loss": 0.0037, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2025 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 375.671875, "epoch": 1.6208, "grad_norm": 0.6221384797782726, "kl": 0.1123046875, "learning_rate": 2.1845891461154604e-06, "loss": 0.0045, "reward": 1.875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2026 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 473.515625, "epoch": 1.6216, "grad_norm": 0.3793672283807978, "kl": 0.11962890625, "learning_rate": 2.1825115975091594e-06, "loss": 0.0048, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.984375, "step": 2027 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 439.125, "epoch": 1.6223999999999998, "grad_norm": 0.6990150534191653, "kl": 0.09765625, "learning_rate": 2.1804342717284414e-06, "loss": 0.0039, "reward": 1.75, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2028 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 425.828125, "epoch": 1.6232, "grad_norm": 0.5126155154548316, "kl": 0.12255859375, "learning_rate": 2.1783571702312523e-06, "loss": 0.0049, "reward": 1.703125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 2029 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 430.96875, "epoch": 1.624, "grad_norm": 1.1625177114219285, "kl": 0.11181640625, "learning_rate": 2.176280294475383e-06, "loss": 0.0045, "reward": 1.703125, "reward_std": 0.17358146607875824, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 2030 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 451.265625, "epoch": 1.6248, "grad_norm": 0.0790836077361706, "kl": 0.09228515625, "learning_rate": 2.174203645918464e-06, "loss": 0.0037, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 2031 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 456.578125, "epoch": 1.6256, "grad_norm": 0.5180100414711042, "kl": 0.1376953125, "learning_rate": 2.172127226017967e-06, "loss": 0.0055, "reward": 1.890625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.984375, "step": 2032 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 491.734375, "epoch": 1.6263999999999998, "grad_norm": 0.5959103272952211, "kl": 0.10302734375, "learning_rate": 2.1700510362312053e-06, "loss": 0.0041, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 2033 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.890625, "epoch": 1.6272, "grad_norm": 1.0119783425654472, "kl": 0.1015625, "learning_rate": 2.1679750780153265e-06, "loss": 0.0041, "reward": 1.828125, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 2034 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 459.078125, "epoch": 1.6280000000000001, "grad_norm": 0.6102145825112454, "kl": 0.1337890625, "learning_rate": 2.1658993528273196e-06, "loss": 0.0053, "reward": 1.875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2035 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 458.703125, "epoch": 1.6288, "grad_norm": 0.670395486538238, "kl": 0.11376953125, "learning_rate": 2.163823862124007e-06, "loss": 0.0045, "reward": 1.671875, "reward_std": 0.1804211586713791, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 2036 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 426.609375, "epoch": 1.6296, "grad_norm": 0.09793812265645296, "kl": 0.1123046875, "learning_rate": 2.1617486073620496e-06, "loss": 0.0045, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2037 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 485.6875, "epoch": 1.6303999999999998, "grad_norm": 0.846967509312758, "kl": 0.10302734375, "learning_rate": 2.15967358999794e-06, "loss": 0.0041, "reward": 1.890625, "reward_std": 0.1804211586713791, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 2038 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 415.6875, "epoch": 1.6312, "grad_norm": 1.185056589277107, "kl": 0.1630859375, "learning_rate": 2.1575988114880057e-06, "loss": 0.0065, "reward": 1.5625, "reward_std": 0.2709311842918396, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.984375, "step": 2039 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 405.671875, "epoch": 1.6320000000000001, "grad_norm": 0.7997307398404204, "kl": 0.1572265625, "learning_rate": 2.155524273288405e-06, "loss": 0.0063, "reward": 1.515625, "reward_std": 0.17358146607875824, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 1.0, "step": 2040 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 490.234375, "epoch": 1.6328, "grad_norm": 0.758577387618897, "kl": 0.09375, "learning_rate": 2.15344997685513e-06, "loss": 0.0037, "reward": 1.625, "reward_std": 0.20337893068790436, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.984375, "step": 2041 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 408.890625, "epoch": 1.6336, "grad_norm": 0.7824015696554165, "kl": 0.1591796875, "learning_rate": 2.1513759236440024e-06, "loss": 0.0064, "reward": 1.421875, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 1.0, "step": 2042 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 438.03125, "epoch": 1.6343999999999999, "grad_norm": 1.1683127251260226, "kl": 0.1474609375, "learning_rate": 2.1493021151106704e-06, "loss": 0.0059, "reward": 1.65625, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.984375, "step": 2043 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 412.96875, "epoch": 1.6352, "grad_norm": 0.14790733519340812, "kl": 0.1484375, "learning_rate": 2.147228552710614e-06, "loss": 0.0059, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2044 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 414.109375, "epoch": 1.6360000000000001, "grad_norm": 0.8931486283271186, "kl": 0.1474609375, "learning_rate": 2.145155237899139e-06, "loss": 0.0059, "reward": 1.640625, "reward_std": 0.23144522309303284, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 2045 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 424.15625, "epoch": 1.6368, "grad_norm": 1.3417958553699005, "kl": 0.1201171875, "learning_rate": 2.143082172131378e-06, "loss": 0.0048, "reward": 1.859375, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 2046 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 435.046875, "epoch": 1.6376, "grad_norm": 1.0701853002218584, "kl": 0.1748046875, "learning_rate": 2.141009356862288e-06, "loss": 0.007, "reward": 1.9375, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 2047 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 446.375, "epoch": 1.6383999999999999, "grad_norm": 0.8067529629088794, "kl": 0.2001953125, "learning_rate": 2.138936793546649e-06, "loss": 0.008, "reward": 1.890625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 2048 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 423.09375, "epoch": 1.6392, "grad_norm": 0.9583270906668814, "kl": 0.1552734375, "learning_rate": 2.1368644836390684e-06, "loss": 0.0062, "reward": 1.859375, "reward_std": 0.19044628739356995, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.984375, "step": 2049 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 448.421875, "epoch": 1.6400000000000001, "grad_norm": 0.6525017486732866, "kl": 0.1923828125, "learning_rate": 2.134792428593971e-06, "loss": 0.0077, "reward": 1.5625, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 2050 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 406.828125, "epoch": 1.6408, "grad_norm": 1.138673694517526, "kl": 0.2177734375, "learning_rate": 2.1327206298656055e-06, "loss": 0.0087, "reward": 1.6875, "reward_std": 0.19727616012096405, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.96875, "step": 2051 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 395.40625, "epoch": 1.6416, "grad_norm": 0.8979409996731252, "kl": 0.2421875, "learning_rate": 2.130649088908041e-06, "loss": 0.0097, "reward": 1.71875, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.96875, "step": 2052 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 386.75, "epoch": 1.6423999999999999, "grad_norm": 1.351684688785436, "kl": 0.357421875, "learning_rate": 2.1285778071751638e-06, "loss": 0.0143, "reward": 1.734375, "reward_std": 0.2993341088294983, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.96875, "step": 2053 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 382.53125, "epoch": 1.6432, "grad_norm": 0.9290746829045567, "kl": 0.353515625, "learning_rate": 2.126506786120678e-06, "loss": 0.0141, "reward": 1.703125, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 2054 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 377.34375, "epoch": 1.6440000000000001, "grad_norm": 0.8947963003590713, "kl": 0.275390625, "learning_rate": 2.1244360271981073e-06, "loss": 0.0111, "reward": 1.921875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 2055 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 387.140625, "epoch": 1.6448, "grad_norm": 0.8756873414122475, "kl": 0.400390625, "learning_rate": 2.1223655318607907e-06, "loss": 0.016, "reward": 1.5, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.984375, "step": 2056 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 446.96875, "epoch": 1.6456, "grad_norm": 1.9104153923874023, "kl": 0.40625, "learning_rate": 2.1202953015618794e-06, "loss": 0.0162, "reward": 1.71875, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.96875, "step": 2057 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 406.5625, "epoch": 1.6463999999999999, "grad_norm": 2.7428791774243075, "kl": 1.28125, "learning_rate": 2.1182253377543428e-06, "loss": 0.0514, "reward": 1.734375, "reward_std": 0.4639715254306793, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.875, "step": 2058 }, { "all_correct": 0.0, "all_wrong": 0.125, "completion_length": 423.796875, "epoch": 1.6472, "grad_norm": 2.8132777340641075, "kl": 1.6640625, "learning_rate": 2.116155641890959e-06, "loss": 0.0665, "reward": 1.453125, "reward_std": 0.5810762643814087, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.84375, "step": 2059 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 440.75, "epoch": 1.6480000000000001, "grad_norm": 2.618284937298759, "kl": 0.609375, "learning_rate": 2.1140862154243223e-06, "loss": 0.0244, "reward": 1.734375, "reward_std": 0.2877861559391022, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.953125, "step": 2060 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 491.140625, "epoch": 1.6488, "grad_norm": 2.929192733231867, "kl": 2.5, "learning_rate": 2.1120170598068353e-06, "loss": 0.0998, "reward": 1.640625, "reward_std": 0.4592459797859192, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.859375, "step": 2061 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 453.375, "epoch": 1.6496, "grad_norm": 0.8376506704489841, "kl": 0.83984375, "learning_rate": 2.109948176490711e-06, "loss": 0.0335, "reward": 1.859375, "reward_std": 0.1804211586713791, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 0.96875, "step": 2062 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 471.546875, "epoch": 1.6503999999999999, "grad_norm": 3.354611778255606, "kl": 46.5, "learning_rate": 2.10787956692797e-06, "loss": 1.8653, "reward": 1.546875, "reward_std": 0.5070846080780029, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.828125, "step": 2063 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 433.234375, "epoch": 1.6512, "grad_norm": 2.632392548689691, "kl": 24.125, "learning_rate": 2.1058112325704436e-06, "loss": 0.9703, "reward": 1.578125, "reward_std": 0.5543705821037292, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.875, "step": 2064 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 495.40625, "epoch": 1.6520000000000001, "grad_norm": 6.459755037363299, "kl": 8.375, "learning_rate": 2.103743174869769e-06, "loss": 0.3359, "reward": 1.421875, "reward_std": 0.5879159569740295, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.78125, "step": 2065 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 389.734375, "epoch": 1.6528, "grad_norm": 0.37028672863703244, "kl": 0.0888671875, "learning_rate": 2.1016753952773867e-06, "loss": 0.0036, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 2066 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 367.15625, "epoch": 1.6536, "grad_norm": 1.4978545902795564, "kl": 1.0078125, "learning_rate": 2.0996078952445453e-06, "loss": 0.0403, "reward": 1.609375, "reward_std": 0.22340349853038788, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.953125, "step": 2067 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 351.84375, "epoch": 1.6543999999999999, "grad_norm": 2.3919512724033165, "kl": 1.2421875, "learning_rate": 2.0975406762222966e-06, "loss": 0.0496, "reward": 1.671875, "reward_std": 0.19044628739356995, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.953125, "step": 2068 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 377.90625, "epoch": 1.6552, "grad_norm": 0.13528200674066437, "kl": 0.0908203125, "learning_rate": 2.095473739661494e-06, "loss": 0.0036, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2069 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 322.953125, "epoch": 1.6560000000000001, "grad_norm": 1.3661852926913154, "kl": 0.423828125, "learning_rate": 2.093407087012791e-06, "loss": 0.017, "reward": 1.921875, "reward_std": 0.09300297498703003, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.984375, "step": 2070 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 355.8125, "epoch": 1.6568, "grad_norm": 1.2447334924326254, "kl": 0.2001953125, "learning_rate": 2.091340719726647e-06, "loss": 0.008, "reward": 1.671875, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 2071 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 367.859375, "epoch": 1.6576, "grad_norm": 2.8788088395701164, "kl": 0.431640625, "learning_rate": 2.089274639253317e-06, "loss": 0.0172, "reward": 1.75, "reward_std": 0.3230288028717041, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.90625, "step": 2072 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 344.859375, "epoch": 1.6583999999999999, "grad_norm": 0.4312088131038686, "kl": 0.103515625, "learning_rate": 2.0872088470428553e-06, "loss": 0.0041, "reward": 1.84375, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 2073 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 333.4375, "epoch": 1.6592, "grad_norm": 0.712412782267239, "kl": 0.09423828125, "learning_rate": 2.0851433445451142e-06, "loss": 0.0038, "reward": 1.828125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 2074 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 342.3125, "epoch": 1.6600000000000001, "grad_norm": 0.47028550784080814, "kl": 0.103515625, "learning_rate": 2.0830781332097446e-06, "loss": 0.0042, "reward": 1.609375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.984375, "step": 2075 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 364.90625, "epoch": 1.6608, "grad_norm": 0.11942239325951794, "kl": 0.0927734375, "learning_rate": 2.08101321448619e-06, "loss": 0.0037, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 2076 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 380.5, "epoch": 1.6616, "grad_norm": 0.08161399529058654, "kl": 0.07373046875, "learning_rate": 2.0789485898236897e-06, "loss": 0.0029, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2077 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 343.03125, "epoch": 1.6623999999999999, "grad_norm": 0.49450583451663443, "kl": 0.1669921875, "learning_rate": 2.076884260671276e-06, "loss": 0.0067, "reward": 1.6875, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 2078 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 322.0, "epoch": 1.6632, "grad_norm": 1.4657197200618237, "kl": 0.111328125, "learning_rate": 2.0748202284777775e-06, "loss": 0.0044, "reward": 1.75, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.984375, "step": 2079 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 361.5, "epoch": 1.6640000000000001, "grad_norm": 1.0973359588279956, "kl": 0.10009765625, "learning_rate": 2.072756494691809e-06, "loss": 0.004, "reward": 1.71875, "reward_std": 0.22461533546447754, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 2080 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 380.71875, "epoch": 1.6648, "grad_norm": 1.3076777684445047, "kl": 0.1689453125, "learning_rate": 2.070693060761779e-06, "loss": 0.0068, "reward": 1.640625, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 2081 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 383.640625, "epoch": 1.6656, "grad_norm": 2.6447020055058146, "kl": 0.53515625, "learning_rate": 2.0686299281358837e-06, "loss": 0.0214, "reward": 1.71875, "reward_std": 0.3503679633140564, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.953125, "step": 2082 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 407.1875, "epoch": 1.6663999999999999, "grad_norm": 0.8757899604388453, "kl": 0.390625, "learning_rate": 2.0665670982621107e-06, "loss": 0.0156, "reward": 1.625, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.984375, "step": 2083 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 403.03125, "epoch": 1.6672, "grad_norm": 1.1131451419047147, "kl": 0.453125, "learning_rate": 2.0645045725882334e-06, "loss": 0.0181, "reward": 1.90625, "reward_std": 0.19727616012096405, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 0.984375, "step": 2084 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 423.828125, "epoch": 1.6680000000000001, "grad_norm": 3.3255435641845077, "kl": 1.9921875, "learning_rate": 2.0624423525618097e-06, "loss": 0.0799, "reward": 1.65625, "reward_std": 0.4106212854385376, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.890625, "step": 2085 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 418.234375, "epoch": 1.6688, "grad_norm": 2.875306385663048, "kl": 1.8828125, "learning_rate": 2.0603804396301875e-06, "loss": 0.0752, "reward": 1.84375, "reward_std": 0.3198433816432953, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.90625, "step": 2086 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 488.234375, "epoch": 1.6696, "grad_norm": 5.811269178043532, "kl": 7.5, "learning_rate": 2.058318835240495e-06, "loss": 0.2997, "reward": 1.40625, "reward_std": 0.5636438131332397, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.71875, "step": 2087 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 578.171875, "epoch": 1.6703999999999999, "grad_norm": 3.063940567925349, "kl": 6.625, "learning_rate": 2.0562575408396475e-06, "loss": 0.266, "reward": 1.359375, "reward_std": 0.5273522138595581, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.765625, "step": 2088 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 488.8125, "epoch": 1.6712, "grad_norm": 2.1562511039521093, "kl": 4.6875, "learning_rate": 2.0541965578743373e-06, "loss": 0.1873, "reward": 1.421875, "reward_std": 0.5159524083137512, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.84375, "step": 2089 }, { "all_correct": 0.0, "all_wrong": 0.125, "completion_length": 506.21875, "epoch": 1.6720000000000002, "grad_norm": 5.100514446358345, "kl": 4.28125, "learning_rate": 2.0521358877910446e-06, "loss": 0.1713, "reward": 1.359375, "reward_std": 0.7028868794441223, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.796875, "step": 2090 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 466.578125, "epoch": 1.6728, "grad_norm": 2.844359638606817, "kl": 3.6875, "learning_rate": 2.0500755320360263e-06, "loss": 0.1472, "reward": 1.59375, "reward_std": 0.506583571434021, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.859375, "step": 2091 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 430.75, "epoch": 1.6736, "grad_norm": 4.278366066617067, "kl": 3.265625, "learning_rate": 2.048015492055319e-06, "loss": 0.1303, "reward": 1.546875, "reward_std": 0.5261501669883728, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.859375, "step": 2092 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 402.59375, "epoch": 1.6743999999999999, "grad_norm": 3.062156897960549, "kl": 3.53125, "learning_rate": 2.045955769294737e-06, "loss": 0.1412, "reward": 1.765625, "reward_std": 0.3825550079345703, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 0.84375, "step": 2093 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 477.671875, "epoch": 1.6752, "grad_norm": 519.8739486273251, "kl": 2.421875, "learning_rate": 2.0438963651998747e-06, "loss": 0.0967, "reward": 1.578125, "reward_std": 0.4517311453819275, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.84375, "step": 2094 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 459.109375, "epoch": 1.6760000000000002, "grad_norm": 34.63765130965278, "kl": 0.69140625, "learning_rate": 2.0418372812161015e-06, "loss": 0.0277, "reward": 1.875, "reward_std": 0.19727616012096405, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 0.984375, "step": 2095 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 491.09375, "epoch": 1.6768, "grad_norm": 1.6566859147044086, "kl": 1.4140625, "learning_rate": 2.03977851878856e-06, "loss": 0.0569, "reward": 1.90625, "reward_std": 0.22558549046516418, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 0.953125, "step": 2096 }, { "all_correct": 0.375, "all_wrong": 0.5, "completion_length": 460.578125, "epoch": 1.6776, "grad_norm": 1.091428748065709, "kl": 0.28515625, "learning_rate": 2.0377200793621694e-06, "loss": 0.0114, "reward": 1.4375, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.984375, "step": 2097 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 394.78125, "epoch": 1.6784, "grad_norm": 0.7501097936511554, "kl": 0.60546875, "learning_rate": 2.0356619643816234e-06, "loss": 0.0242, "reward": 1.59375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.984375, "step": 2098 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 432.40625, "epoch": 1.6792, "grad_norm": 0.9020856731720874, "kl": 0.337890625, "learning_rate": 2.0336041752913843e-06, "loss": 0.0135, "reward": 1.625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.984375, "step": 2099 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 483.65625, "epoch": 1.6800000000000002, "grad_norm": 1.533215888822757, "kl": 0.37109375, "learning_rate": 2.031546713535688e-06, "loss": 0.0149, "reward": 1.6875, "reward_std": 0.23319074511528015, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.96875, "step": 2100 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 397.328125, "epoch": 1.6808, "grad_norm": 0.7005043360325897, "kl": 0.130859375, "learning_rate": 2.029489580558542e-06, "loss": 0.0052, "reward": 1.796875, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 2101 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 458.390625, "epoch": 1.6816, "grad_norm": 1.4970864399606052, "kl": 0.2890625, "learning_rate": 2.0274327778037204e-06, "loss": 0.0115, "reward": 1.8125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.984375, "step": 2102 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 443.75, "epoch": 1.6824, "grad_norm": 0.3186074496487898, "kl": 0.111328125, "learning_rate": 2.0253763067147657e-06, "loss": 0.0045, "reward": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 2103 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 415.15625, "epoch": 1.6832, "grad_norm": 1.6010840415017298, "kl": 1.140625, "learning_rate": 2.0233201687349888e-06, "loss": 0.0458, "reward": 1.84375, "reward_std": 0.21556037664413452, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.9375, "step": 2104 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 420.390625, "epoch": 1.6840000000000002, "grad_norm": 0.35779095114683007, "kl": 0.0830078125, "learning_rate": 2.0212643653074677e-06, "loss": 0.0033, "reward": 1.921875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 2105 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 429.953125, "epoch": 1.6848, "grad_norm": 2.534925458175817, "kl": 0.8125, "learning_rate": 2.019208897875043e-06, "loss": 0.0325, "reward": 1.9375, "reward_std": 0.13719715178012848, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 0.953125, "step": 2106 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 391.9375, "epoch": 1.6856, "grad_norm": 3.2418327402054525, "kl": 1.5, "learning_rate": 2.0171537678803222e-06, "loss": 0.0597, "reward": 1.609375, "reward_std": 0.2993341088294983, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.9375, "step": 2107 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 341.703125, "epoch": 1.6864, "grad_norm": 2.2100911582973635, "kl": 0.51171875, "learning_rate": 2.015098976765673e-06, "loss": 0.0205, "reward": 1.390625, "reward_std": 0.22097086906433105, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.96875, "step": 2108 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 384.046875, "epoch": 1.6872, "grad_norm": 1.8927276557942287, "kl": 1.0625, "learning_rate": 2.0130445259732282e-06, "loss": 0.0425, "reward": 1.65625, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.953125, "step": 2109 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 433.734375, "epoch": 1.688, "grad_norm": 1.5667912067089171, "kl": 0.9140625, "learning_rate": 2.01099041694488e-06, "loss": 0.0365, "reward": 1.8125, "reward_std": 0.13719715178012848, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.953125, "step": 2110 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.578125, "epoch": 1.6888, "grad_norm": 1.7986987795251153, "kl": 2.359375, "learning_rate": 2.0089366511222815e-06, "loss": 0.0943, "reward": 1.78125, "reward_std": 0.3652152717113495, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.921875, "step": 2111 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 419.828125, "epoch": 1.6896, "grad_norm": 1.217112278948557, "kl": 0.58203125, "learning_rate": 2.006883229946843e-06, "loss": 0.0233, "reward": 1.703125, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.984375, "step": 2112 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 344.703125, "epoch": 1.6904, "grad_norm": 1.0434906078163486, "kl": 0.462890625, "learning_rate": 2.0048301548597365e-06, "loss": 0.0185, "reward": 1.515625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.984375, "step": 2113 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 427.375, "epoch": 1.6912, "grad_norm": 1.7048318956597763, "kl": 2.5625, "learning_rate": 2.0027774273018894e-06, "loss": 0.1026, "reward": 1.53125, "reward_std": 0.24969476461410522, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.921875, "step": 2114 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 382.5, "epoch": 1.692, "grad_norm": 1.120614382118344, "kl": 1.421875, "learning_rate": 2.0007250487139827e-06, "loss": 0.0566, "reward": 1.890625, "reward_std": 0.27883461117744446, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 0.96875, "step": 2115 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 459.375, "epoch": 1.6928, "grad_norm": 1.2304593184727952, "kl": 3.140625, "learning_rate": 1.998673020536456e-06, "loss": 0.1261, "reward": 1.71875, "reward_std": 0.4387563467025757, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.921875, "step": 2116 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 364.484375, "epoch": 1.6936, "grad_norm": 1.0479482288555169, "kl": 2.203125, "learning_rate": 1.996621344209503e-06, "loss": 0.088, "reward": 1.734375, "reward_std": 0.2880638837814331, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.9375, "step": 2117 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 425.171875, "epoch": 1.6944, "grad_norm": 0.7930626251555342, "kl": 1.3984375, "learning_rate": 1.994570021173067e-06, "loss": 0.056, "reward": 1.640625, "reward_std": 0.1530819833278656, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.96875, "step": 2118 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 395.546875, "epoch": 1.6952, "grad_norm": 0.41743046947985185, "kl": 0.56640625, "learning_rate": 1.9925190528668455e-06, "loss": 0.0227, "reward": 1.84375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.984375, "step": 2119 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 383.140625, "epoch": 1.696, "grad_norm": 0.3176723997061967, "kl": 0.77734375, "learning_rate": 1.990468440730288e-06, "loss": 0.0311, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.984375, "step": 2120 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 339.5, "epoch": 1.6968, "grad_norm": 0.9748433744860779, "kl": 0.4296875, "learning_rate": 1.9884181862025938e-06, "loss": 0.0172, "reward": 1.578125, "reward_std": 0.23531240224838257, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.984375, "step": 2121 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 377.84375, "epoch": 1.6976, "grad_norm": 1.5175450035007318, "kl": 2.15625, "learning_rate": 1.986368290722709e-06, "loss": 0.0863, "reward": 1.890625, "reward_std": 0.23925507068634033, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 0.96875, "step": 2122 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 380.90625, "epoch": 1.6984, "grad_norm": 0.391819515717612, "kl": 0.609375, "learning_rate": 1.9843187557293286e-06, "loss": 0.0244, "reward": 1.84375, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.984375, "step": 2123 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 397.234375, "epoch": 1.6992, "grad_norm": 1.6924632622452174, "kl": 2.609375, "learning_rate": 1.9822695826608975e-06, "loss": 0.1046, "reward": 1.75, "reward_std": 0.20751865208148956, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.9375, "step": 2124 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 374.078125, "epoch": 1.7, "grad_norm": 1.052184524765829, "kl": 1.109375, "learning_rate": 1.9802207729556023e-06, "loss": 0.0444, "reward": 1.78125, "reward_std": 0.13719715178012848, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.96875, "step": 2125 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 391.65625, "epoch": 1.7008, "grad_norm": 0.34578416675042317, "kl": 0.55859375, "learning_rate": 1.978172328051377e-06, "loss": 0.0222, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.984375, "step": 2126 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.484375, "epoch": 1.7016, "grad_norm": 0.49065077345498653, "kl": 0.7109375, "learning_rate": 1.9761242493858987e-06, "loss": 0.0284, "reward": 1.921875, "reward_std": 0.1530819833278656, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.984375, "step": 2127 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 382.90625, "epoch": 1.7024, "grad_norm": 0.6389077473900239, "kl": 0.8125, "learning_rate": 1.9740765383965894e-06, "loss": 0.0326, "reward": 1.6875, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.984375, "step": 2128 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 355.1875, "epoch": 1.7032, "grad_norm": 0.5299376628103857, "kl": 0.1005859375, "learning_rate": 1.9720291965206097e-06, "loss": 0.004, "reward": 1.65625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 2129 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 377.875, "epoch": 1.704, "grad_norm": 0.832740783607803, "kl": 0.53125, "learning_rate": 1.969982225194864e-06, "loss": 0.0213, "reward": 1.546875, "reward_std": 0.1804211586713791, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.984375, "step": 2130 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 394.90625, "epoch": 1.7048, "grad_norm": 0.4899814699346314, "kl": 0.0849609375, "learning_rate": 1.9679356258559943e-06, "loss": 0.0034, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2131 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 419.4375, "epoch": 1.7056, "grad_norm": 0.7187222946588002, "kl": 0.0849609375, "learning_rate": 1.9658893999403847e-06, "loss": 0.0034, "reward": 1.65625, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 2132 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 372.109375, "epoch": 1.7064, "grad_norm": 0.7439981408427864, "kl": 0.61328125, "learning_rate": 1.9638435488841543e-06, "loss": 0.0246, "reward": 1.734375, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 2133 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 402.40625, "epoch": 1.7072, "grad_norm": 0.09514080244625805, "kl": 0.091796875, "learning_rate": 1.96179807412316e-06, "loss": 0.0037, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2134 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 382.40625, "epoch": 1.708, "grad_norm": 1.3045182798093014, "kl": 0.09375, "learning_rate": 1.959752977092995e-06, "loss": 0.0037, "reward": 1.828125, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 2135 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 364.5, "epoch": 1.7088, "grad_norm": 5.048103835913532, "kl": 0.111328125, "learning_rate": 1.957708259228987e-06, "loss": 0.0045, "reward": 1.75, "reward_std": 0.19506090879440308, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.984375, "step": 2136 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 386.265625, "epoch": 1.7096, "grad_norm": 0.8767383880884204, "kl": 0.08935546875, "learning_rate": 1.9556639219661983e-06, "loss": 0.0036, "reward": 1.671875, "reward_std": 0.189372718334198, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 2137 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 350.6875, "epoch": 1.7104, "grad_norm": 0.6028009139225727, "kl": 0.10888671875, "learning_rate": 1.9536199667394217e-06, "loss": 0.0044, "reward": 1.859375, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 2138 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 398.984375, "epoch": 1.7112, "grad_norm": 0.41365095907955113, "kl": 0.0869140625, "learning_rate": 1.9515763949831852e-06, "loss": 0.0035, "reward": 1.796875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 2139 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 379.25, "epoch": 1.712, "grad_norm": 1.0507864788730774, "kl": 0.08837890625, "learning_rate": 1.9495332081317466e-06, "loss": 0.0035, "reward": 1.84375, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 2140 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 396.28125, "epoch": 1.7128, "grad_norm": 0.5168401126078777, "kl": 0.08056640625, "learning_rate": 1.947490407619092e-06, "loss": 0.0032, "reward": 1.921875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 2141 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 426.796875, "epoch": 1.7136, "grad_norm": 1.1847996150566735, "kl": 0.09033203125, "learning_rate": 1.945447994878937e-06, "loss": 0.0036, "reward": 1.890625, "reward_std": 0.1804211586713791, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 2142 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 391.34375, "epoch": 1.7144, "grad_norm": 0.5054062802849875, "kl": 0.314453125, "learning_rate": 1.9434059713447264e-06, "loss": 0.0126, "reward": 1.609375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.984375, "step": 2143 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 345.25, "epoch": 1.7151999999999998, "grad_norm": 0.42991081054431823, "kl": 0.0947265625, "learning_rate": 1.9413643384496315e-06, "loss": 0.0038, "reward": 1.828125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 2144 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 389.453125, "epoch": 1.716, "grad_norm": 2.6317180049845845, "kl": 0.83984375, "learning_rate": 1.9393230976265478e-06, "loss": 0.0336, "reward": 1.75, "reward_std": 0.342454731464386, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.96875, "step": 2145 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 451.390625, "epoch": 1.7168, "grad_norm": 0.6238490237819564, "kl": 0.09130859375, "learning_rate": 1.937282250308096e-06, "loss": 0.0036, "reward": 1.8125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 2146 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 395.265625, "epoch": 1.7176, "grad_norm": 0.6754300967166547, "kl": 0.0810546875, "learning_rate": 1.935241797926623e-06, "loss": 0.0032, "reward": 1.765625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 2147 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 407.265625, "epoch": 1.7184, "grad_norm": 0.37420052763490447, "kl": 0.07763671875, "learning_rate": 1.933201741914196e-06, "loss": 0.0031, "reward": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 2148 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 410.3125, "epoch": 1.7191999999999998, "grad_norm": 0.5465000812012062, "kl": 0.08154296875, "learning_rate": 1.931162083702606e-06, "loss": 0.0033, "reward": 1.59375, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 2149 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 420.71875, "epoch": 1.72, "grad_norm": 2.069261642593193, "kl": 0.875, "learning_rate": 1.9291228247233607e-06, "loss": 0.035, "reward": 1.75, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.984375, "step": 2150 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 373.15625, "epoch": 1.7208, "grad_norm": 0.7448041846892124, "kl": 0.462890625, "learning_rate": 1.9270839664076937e-06, "loss": 0.0186, "reward": 1.6875, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.984375, "step": 2151 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 367.046875, "epoch": 1.7216, "grad_norm": 0.6583434051201349, "kl": 0.09765625, "learning_rate": 1.9250455101865526e-06, "loss": 0.0039, "reward": 1.765625, "reward_std": 0.1315089464187622, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 2152 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 376.9375, "epoch": 1.7224, "grad_norm": 0.4778357308685756, "kl": 0.62890625, "learning_rate": 1.9230074574906043e-06, "loss": 0.025, "reward": 1.734375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 2153 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 388.734375, "epoch": 1.7231999999999998, "grad_norm": 0.6613729323890468, "kl": 0.1640625, "learning_rate": 1.920969809750234e-06, "loss": 0.0066, "reward": 1.90625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 2154 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 359.734375, "epoch": 1.724, "grad_norm": 0.8442695124394836, "kl": 0.8203125, "learning_rate": 1.91893256839554e-06, "loss": 0.033, "reward": 1.609375, "reward_std": 0.19044628739356995, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.984375, "step": 2155 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 361.890625, "epoch": 1.7248, "grad_norm": 0.5860484477873993, "kl": 1.5234375, "learning_rate": 1.916895734856338e-06, "loss": 0.0606, "reward": 1.6875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.96875, "step": 2156 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 426.65625, "epoch": 1.7256, "grad_norm": 0.527963775207647, "kl": 0.08056640625, "learning_rate": 1.9148593105621542e-06, "loss": 0.0032, "reward": 1.6875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 2157 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 375.28125, "epoch": 1.7264, "grad_norm": 0.6531439609473019, "kl": 1.0078125, "learning_rate": 1.9128232969422318e-06, "loss": 0.0402, "reward": 1.890625, "reward_std": 0.19939783215522766, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.984375, "step": 2158 }, { "all_correct": 0.375, "all_wrong": 0.625, "completion_length": 338.625, "epoch": 1.7271999999999998, "grad_norm": 0.07875094697090004, "kl": 0.0947265625, "learning_rate": 1.9107876954255217e-06, "loss": 0.0038, "reward": 1.375, "reward_std": 0.0, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "step": 2159 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 387.421875, "epoch": 1.728, "grad_norm": 0.9866165387525001, "kl": 0.68359375, "learning_rate": 1.908752507440689e-06, "loss": 0.0273, "reward": 1.890625, "reward_std": 0.15981829166412354, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.984375, "step": 2160 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 338.765625, "epoch": 1.7288000000000001, "grad_norm": 0.5917012212023924, "kl": 0.08984375, "learning_rate": 1.906717734416105e-06, "loss": 0.0036, "reward": 1.765625, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 2161 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 423.0625, "epoch": 1.7296, "grad_norm": 0.37594943811425857, "kl": 0.08154296875, "learning_rate": 1.9046833777798534e-06, "loss": 0.0033, "reward": 1.703125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 2162 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 346.453125, "epoch": 1.7304, "grad_norm": 0.3544332730968081, "kl": 0.0849609375, "learning_rate": 1.9026494389597239e-06, "loss": 0.0034, "reward": 1.796875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 2163 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 360.234375, "epoch": 1.7311999999999999, "grad_norm": 1.0241005608809086, "kl": 1.3515625, "learning_rate": 1.9006159193832124e-06, "loss": 0.0538, "reward": 1.8125, "reward_std": 0.19727616012096405, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.984375, "step": 2164 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 390.109375, "epoch": 1.732, "grad_norm": 0.10141318590330797, "kl": 0.0869140625, "learning_rate": 1.8985828204775206e-06, "loss": 0.0035, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 2165 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 400.109375, "epoch": 1.7328000000000001, "grad_norm": 0.5447422323142942, "kl": 0.07958984375, "learning_rate": 1.8965501436695578e-06, "loss": 0.0032, "reward": 1.96875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 2166 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 379.25, "epoch": 1.7336, "grad_norm": 1.3390343253268027, "kl": 1.6640625, "learning_rate": 1.894517890385933e-06, "loss": 0.0668, "reward": 1.71875, "reward_std": 0.19727617502212524, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.96875, "step": 2167 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 385.140625, "epoch": 1.7344, "grad_norm": 0.6592344074028343, "kl": 0.07763671875, "learning_rate": 1.8924860620529594e-06, "loss": 0.0031, "reward": 1.71875, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 2168 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 367.09375, "epoch": 1.7351999999999999, "grad_norm": 0.6831807214742347, "kl": 0.349609375, "learning_rate": 1.8904546600966539e-06, "loss": 0.0139, "reward": 1.671875, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 2169 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 363.9375, "epoch": 1.736, "grad_norm": 0.8033956249930624, "kl": 0.076171875, "learning_rate": 1.888423685942732e-06, "loss": 0.003, "reward": 1.828125, "reward_std": 0.1804211586713791, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 2170 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 378.5, "epoch": 1.7368000000000001, "grad_norm": 7.109417305470336, "kl": 66.5, "learning_rate": 1.886393141016609e-06, "loss": 2.6712, "reward": 1.53125, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.984375, "step": 2171 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 391.953125, "epoch": 1.7376, "grad_norm": 0.6615096403758822, "kl": 0.0849609375, "learning_rate": 1.8843630267434e-06, "loss": 0.0034, "reward": 1.609375, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 2172 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 395.203125, "epoch": 1.7384, "grad_norm": 0.6867397743025585, "kl": 0.08984375, "learning_rate": 1.8823333445479175e-06, "loss": 0.0036, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 2173 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 378.640625, "epoch": 1.7391999999999999, "grad_norm": 0.3295383515321191, "kl": 0.703125, "learning_rate": 1.8803040958546708e-06, "loss": 0.0282, "reward": 1.828125, "reward_std": 0.09300297498703003, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.984375, "step": 2174 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.25, "epoch": 1.74, "grad_norm": 0.5054531410755875, "kl": 0.0810546875, "learning_rate": 1.8782752820878636e-06, "loss": 0.0032, "reward": 1.921875, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 2175 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 370.4375, "epoch": 1.7408000000000001, "grad_norm": 0.07730153858880742, "kl": 0.080078125, "learning_rate": 1.8762469046713954e-06, "loss": 0.0032, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2176 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 358.6875, "epoch": 1.7416, "grad_norm": 0.06700469393968977, "kl": 0.0830078125, "learning_rate": 1.8742189650288617e-06, "loss": 0.0033, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2177 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 364.015625, "epoch": 1.7424, "grad_norm": 0.7741486266428659, "kl": 0.61328125, "learning_rate": 1.872191464583547e-06, "loss": 0.0248, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 2178 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 425.65625, "epoch": 1.7431999999999999, "grad_norm": 0.5533172483910037, "kl": 0.91796875, "learning_rate": 1.8701644047584294e-06, "loss": 0.0368, "reward": 1.640625, "reward_std": 0.2109457403421402, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 2179 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 384.484375, "epoch": 1.744, "grad_norm": 0.7222343989628767, "kl": 0.11376953125, "learning_rate": 1.868137786976177e-06, "loss": 0.0046, "reward": 1.671875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 2180 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 379.484375, "epoch": 1.7448000000000001, "grad_norm": 0.7613424119483693, "kl": 0.087890625, "learning_rate": 1.8661116126591492e-06, "loss": 0.0035, "reward": 1.828125, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 2181 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 360.078125, "epoch": 1.7456, "grad_norm": 0.364184606976644, "kl": 0.0771484375, "learning_rate": 1.8640858832293924e-06, "loss": 0.0031, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 2182 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 423.640625, "epoch": 1.7464, "grad_norm": 0.06684451176827239, "kl": 0.07666015625, "learning_rate": 1.8620606001086423e-06, "loss": 0.0031, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2183 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 415.921875, "epoch": 1.7471999999999999, "grad_norm": 1.9474665806526612, "kl": 0.078125, "learning_rate": 1.8600357647183188e-06, "loss": 0.0031, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 2184 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 363.21875, "epoch": 1.748, "grad_norm": 0.8366932360932623, "kl": 0.078125, "learning_rate": 1.8580113784795306e-06, "loss": 0.0031, "reward": 1.671875, "reward_std": 0.24039676785469055, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 2185 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 353.453125, "epoch": 1.7488000000000001, "grad_norm": 0.07387785973137999, "kl": 0.0810546875, "learning_rate": 1.8559874428130708e-06, "loss": 0.0032, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2186 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 389.6875, "epoch": 1.7496, "grad_norm": 0.3920833617687332, "kl": 0.0771484375, "learning_rate": 1.8539639591394131e-06, "loss": 0.0031, "reward": 1.734375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 2187 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 369.765625, "epoch": 1.7504, "grad_norm": 0.6329375099735762, "kl": 0.08740234375, "learning_rate": 1.8519409288787182e-06, "loss": 0.0035, "reward": 1.8125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 2188 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 319.765625, "epoch": 1.7511999999999999, "grad_norm": 0.7345984322249883, "kl": 0.08203125, "learning_rate": 1.8499183534508263e-06, "loss": 0.0033, "reward": 1.796875, "reward_std": 0.1530819833278656, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 2189 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 357.28125, "epoch": 1.752, "grad_norm": 0.33114526841709796, "kl": 0.09423828125, "learning_rate": 1.8478962342752584e-06, "loss": 0.0038, "reward": 1.5625, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 2190 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 442.625, "epoch": 1.7528000000000001, "grad_norm": 0.35950129781701856, "kl": 0.197265625, "learning_rate": 1.8458745727712142e-06, "loss": 0.0079, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 2191 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 385.09375, "epoch": 1.7536, "grad_norm": 0.638379994349957, "kl": 0.1337890625, "learning_rate": 1.8438533703575757e-06, "loss": 0.0053, "reward": 1.5, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 2192 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 381.984375, "epoch": 1.7544, "grad_norm": 0.656625662268667, "kl": 0.1787109375, "learning_rate": 1.8418326284528997e-06, "loss": 0.0071, "reward": 1.96875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 2193 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 398.640625, "epoch": 1.7551999999999999, "grad_norm": 0.77059664015117, "kl": 0.0859375, "learning_rate": 1.8398123484754204e-06, "loss": 0.0034, "reward": 1.8125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 2194 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 356.890625, "epoch": 1.756, "grad_norm": 0.6530003369178063, "kl": 0.087890625, "learning_rate": 1.8377925318430478e-06, "loss": 0.0035, "reward": 1.71875, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 2195 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.796875, "epoch": 1.7568000000000001, "grad_norm": 0.7639542174461229, "kl": 0.08349609375, "learning_rate": 1.8357731799733686e-06, "loss": 0.0033, "reward": 1.84375, "reward_std": 0.22461533546447754, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 2196 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 346.46875, "epoch": 1.7576, "grad_norm": 0.8352966618780917, "kl": 0.341796875, "learning_rate": 1.8337542942836406e-06, "loss": 0.0137, "reward": 1.640625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 2197 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 379.734375, "epoch": 1.7584, "grad_norm": 0.4202004345708123, "kl": 0.08837890625, "learning_rate": 1.8317358761907945e-06, "loss": 0.0035, "reward": 1.828125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 2198 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 352.59375, "epoch": 1.7591999999999999, "grad_norm": 1.5169317862327436, "kl": 0.11474609375, "learning_rate": 1.8297179271114345e-06, "loss": 0.0046, "reward": 1.515625, "reward_std": 0.1315089464187622, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 1.0, "step": 2199 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 372.078125, "epoch": 1.76, "grad_norm": 0.5583614560311482, "kl": 0.0810546875, "learning_rate": 1.827700448461836e-06, "loss": 0.0032, "reward": 1.875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2200 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 338.21875, "epoch": 1.7608000000000001, "grad_norm": 0.09188466516776025, "kl": 0.0888671875, "learning_rate": 1.8256834416579423e-06, "loss": 0.0036, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2201 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 415.96875, "epoch": 1.7616, "grad_norm": 1.3590539546953042, "kl": 0.0830078125, "learning_rate": 1.8236669081153657e-06, "loss": 0.0033, "reward": 1.640625, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 2202 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 450.640625, "epoch": 1.7624, "grad_norm": 0.25075375774385944, "kl": 0.07666015625, "learning_rate": 1.8216508492493887e-06, "loss": 0.0031, "reward": 1.609375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 2203 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 417.84375, "epoch": 1.7631999999999999, "grad_norm": 0.5709628925941539, "kl": 0.07177734375, "learning_rate": 1.8196352664749578e-06, "loss": 0.0029, "reward": 1.65625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 2204 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 412.390625, "epoch": 1.764, "grad_norm": 0.6348322494182044, "kl": 0.0849609375, "learning_rate": 1.8176201612066874e-06, "loss": 0.0034, "reward": 1.59375, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 2205 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 376.890625, "epoch": 1.7648000000000001, "grad_norm": 0.4552307632423137, "kl": 0.1455078125, "learning_rate": 1.8156055348588548e-06, "loss": 0.0058, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 2206 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 424.484375, "epoch": 1.7656, "grad_norm": 0.49988439873976975, "kl": 0.08447265625, "learning_rate": 1.8135913888454034e-06, "loss": 0.0034, "reward": 1.8125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 2207 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 425.640625, "epoch": 1.7664, "grad_norm": 0.726632658712714, "kl": 0.0771484375, "learning_rate": 1.8115777245799383e-06, "loss": 0.0031, "reward": 1.796875, "reward_std": 0.1530819833278656, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 2208 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 382.53125, "epoch": 1.7671999999999999, "grad_norm": 0.8967043761757607, "kl": 0.2353515625, "learning_rate": 1.8095645434757261e-06, "loss": 0.0094, "reward": 1.78125, "reward_std": 0.15769661962985992, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.984375, "step": 2209 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 431.984375, "epoch": 1.768, "grad_norm": 0.3909682790220696, "kl": 0.078125, "learning_rate": 1.8075518469456944e-06, "loss": 0.0031, "reward": 1.9375, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 2210 }, { "all_correct": 0.25, "all_wrong": 0.625, "completion_length": 427.5, "epoch": 1.7688000000000001, "grad_norm": 0.3092307066350754, "kl": 0.08837890625, "learning_rate": 1.8055396364024318e-06, "loss": 0.0035, "reward": 1.265625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 1.0, "step": 2211 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 403.359375, "epoch": 1.7696, "grad_norm": 0.06132860723139154, "kl": 0.0810546875, "learning_rate": 1.803527913258186e-06, "loss": 0.0033, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2212 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 413.421875, "epoch": 1.7704, "grad_norm": 1.3164007009289929, "kl": 0.09375, "learning_rate": 1.8015166789248606e-06, "loss": 0.0037, "reward": 1.6875, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 2213 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.28125, "epoch": 1.7711999999999999, "grad_norm": 0.6314313796956886, "kl": 0.083984375, "learning_rate": 1.7995059348140165e-06, "loss": 0.0034, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 2214 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 427.28125, "epoch": 1.772, "grad_norm": 0.7757636034561768, "kl": 0.2314453125, "learning_rate": 1.7974956823368728e-06, "loss": 0.0093, "reward": 1.609375, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 2215 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 425.75, "epoch": 1.7728000000000002, "grad_norm": 0.07203304999848316, "kl": 0.07421875, "learning_rate": 1.7954859229043017e-06, "loss": 0.003, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 2216 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 424.40625, "epoch": 1.7736, "grad_norm": 0.6206520422132313, "kl": 0.0908203125, "learning_rate": 1.7934766579268292e-06, "loss": 0.0036, "reward": 1.609375, "reward_std": 0.1315089464187622, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 2217 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 386.59375, "epoch": 1.7744, "grad_norm": 0.5964038193468935, "kl": 0.09375, "learning_rate": 1.7914678888146347e-06, "loss": 0.0038, "reward": 1.65625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 2218 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 348.359375, "epoch": 1.7752, "grad_norm": 0.8000847788397882, "kl": 0.08837890625, "learning_rate": 1.7894596169775514e-06, "loss": 0.0035, "reward": 1.671875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 2219 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 442.9375, "epoch": 1.776, "grad_norm": 0.05735166309966034, "kl": 0.0830078125, "learning_rate": 1.7874518438250598e-06, "loss": 0.0033, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2220 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 396.3125, "epoch": 1.7768000000000002, "grad_norm": 0.377393691190953, "kl": 0.09619140625, "learning_rate": 1.785444570766293e-06, "loss": 0.0038, "reward": 1.609375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 2221 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 472.71875, "epoch": 1.7776, "grad_norm": 0.5801925247004116, "kl": 0.08251953125, "learning_rate": 1.7834377992100332e-06, "loss": 0.0033, "reward": 1.328125, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 1.0, "step": 2222 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 399.140625, "epoch": 1.7784, "grad_norm": 0.9329010475938972, "kl": 0.33203125, "learning_rate": 1.7814315305647095e-06, "loss": 0.0133, "reward": 1.71875, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 2223 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 385.734375, "epoch": 1.7792, "grad_norm": 1.8708027844615398, "kl": 0.185546875, "learning_rate": 1.779425766238398e-06, "loss": 0.0074, "reward": 1.609375, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 2224 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 394.125, "epoch": 1.78, "grad_norm": 0.498139251920171, "kl": 0.28515625, "learning_rate": 1.7774205076388207e-06, "loss": 0.0114, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 2225 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.25, "epoch": 1.7808000000000002, "grad_norm": 0.5619444912439748, "kl": 0.08349609375, "learning_rate": 1.7754157561733476e-06, "loss": 0.0033, "reward": 1.84375, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 2226 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 404.3125, "epoch": 1.7816, "grad_norm": 0.4210320302566394, "kl": 0.083984375, "learning_rate": 1.7734115132489887e-06, "loss": 0.0034, "reward": 1.84375, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 2227 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 422.375, "epoch": 1.7824, "grad_norm": 0.7367942735127898, "kl": 0.07421875, "learning_rate": 1.7714077802723994e-06, "loss": 0.003, "reward": 1.671875, "reward_std": 0.23144522309303284, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 2228 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 452.296875, "epoch": 1.7832, "grad_norm": 0.9890232405149658, "kl": 0.0888671875, "learning_rate": 1.7694045586498754e-06, "loss": 0.0035, "reward": 1.65625, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 2229 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 377.46875, "epoch": 1.784, "grad_norm": 0.6324103271657705, "kl": 0.1689453125, "learning_rate": 1.7674018497873568e-06, "loss": 0.0067, "reward": 1.921875, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 2230 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 405.5625, "epoch": 1.7848000000000002, "grad_norm": 3.941341814515546, "kl": 0.08984375, "learning_rate": 1.7653996550904208e-06, "loss": 0.0036, "reward": 1.59375, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 2231 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 378.21875, "epoch": 1.7856, "grad_norm": 1.0721577397180355, "kl": 0.08544921875, "learning_rate": 1.7633979759642844e-06, "loss": 0.0034, "reward": 1.875, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2232 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 450.75, "epoch": 1.7864, "grad_norm": 0.4838969957345881, "kl": 0.0888671875, "learning_rate": 1.7613968138138027e-06, "loss": 0.0036, "reward": 1.640625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 2233 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 414.46875, "epoch": 1.7872, "grad_norm": 0.35336711014719263, "kl": 0.0751953125, "learning_rate": 1.7593961700434692e-06, "loss": 0.003, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 2234 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 461.8125, "epoch": 1.788, "grad_norm": 0.48168024944088667, "kl": 0.08349609375, "learning_rate": 1.7573960460574133e-06, "loss": 0.0033, "reward": 1.84375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 2235 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 408.453125, "epoch": 1.7888, "grad_norm": 0.611225545331788, "kl": 0.56640625, "learning_rate": 1.7553964432593976e-06, "loss": 0.0227, "reward": 1.640625, "reward_std": 0.1530819833278656, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 2236 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 414.359375, "epoch": 1.7896, "grad_norm": 0.9386424953791263, "kl": 0.302734375, "learning_rate": 1.75339736305282e-06, "loss": 0.0121, "reward": 1.625, "reward_std": 0.1243029236793518, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.96875, "step": 2237 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 460.40625, "epoch": 1.7904, "grad_norm": 1.1941673294724382, "kl": 0.349609375, "learning_rate": 1.7513988068407145e-06, "loss": 0.0141, "reward": 1.6875, "reward_std": 0.23827511072158813, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.984375, "step": 2238 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 406.109375, "epoch": 1.7912, "grad_norm": 0.8289088427251986, "kl": 0.458984375, "learning_rate": 1.7494007760257428e-06, "loss": 0.0183, "reward": 1.625, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 2239 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 411.828125, "epoch": 1.792, "grad_norm": 0.18174947692607274, "kl": 0.08251953125, "learning_rate": 1.7474032720101991e-06, "loss": 0.0033, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 2240 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 410.03125, "epoch": 1.7928, "grad_norm": 0.057623689715676836, "kl": 0.0830078125, "learning_rate": 1.7454062961960102e-06, "loss": 0.0033, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2241 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 415.84375, "epoch": 1.7936, "grad_norm": 0.47957047822833476, "kl": 0.08837890625, "learning_rate": 1.7434098499847308e-06, "loss": 0.0035, "reward": 1.703125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 2242 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 411.78125, "epoch": 1.7944, "grad_norm": 1.4510650561256406, "kl": 1.3359375, "learning_rate": 1.7414139347775423e-06, "loss": 0.0534, "reward": 1.71875, "reward_std": 0.2346404492855072, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.953125, "step": 2243 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.53125, "epoch": 1.7952, "grad_norm": 0.7926264380225096, "kl": 0.1015625, "learning_rate": 1.7394185519752546e-06, "loss": 0.0041, "reward": 1.78125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 2244 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 432.640625, "epoch": 1.796, "grad_norm": 0.45536677576866846, "kl": 0.19921875, "learning_rate": 1.7374237029783064e-06, "loss": 0.008, "reward": 1.90625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 2245 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 441.421875, "epoch": 1.7968, "grad_norm": 0.6059867585897916, "kl": 0.5859375, "learning_rate": 1.7354293891867582e-06, "loss": 0.0235, "reward": 1.75, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.984375, "step": 2246 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 389.59375, "epoch": 1.7976, "grad_norm": 0.41467690318077355, "kl": 0.0869140625, "learning_rate": 1.7334356120002956e-06, "loss": 0.0035, "reward": 1.65625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 2247 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 454.015625, "epoch": 1.7984, "grad_norm": 0.767967960568488, "kl": 0.07275390625, "learning_rate": 1.7314423728182283e-06, "loss": 0.0029, "reward": 1.796875, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 2248 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 452.609375, "epoch": 1.7992, "grad_norm": 0.37091586756919365, "kl": 0.68359375, "learning_rate": 1.7294496730394897e-06, "loss": 0.0274, "reward": 1.75, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.984375, "step": 2249 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 411.921875, "epoch": 1.8, "grad_norm": 1.0544059277895323, "kl": 0.08251953125, "learning_rate": 1.7274575140626318e-06, "loss": 0.0033, "reward": 1.46875, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 2250 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 415.109375, "epoch": 1.8008, "grad_norm": 0.05339237676383853, "kl": 0.06982421875, "learning_rate": 1.7254658972858293e-06, "loss": 0.0028, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 2251 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 466.4375, "epoch": 1.8016, "grad_norm": 0.4810915562817627, "kl": 0.71875, "learning_rate": 1.7234748241068742e-06, "loss": 0.0289, "reward": 1.921875, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.984375, "step": 2252 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 398.375, "epoch": 1.8024, "grad_norm": 0.08754555373606211, "kl": 0.0771484375, "learning_rate": 1.7214842959231796e-06, "loss": 0.0031, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2253 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 433.921875, "epoch": 1.8032, "grad_norm": 1.353657433311371, "kl": 1.0390625, "learning_rate": 1.719494314131775e-06, "loss": 0.0416, "reward": 1.796875, "reward_std": 0.19939783215522766, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.984375, "step": 2254 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 386.1875, "epoch": 1.804, "grad_norm": 1.7745382663360545, "kl": 0.09228515625, "learning_rate": 1.7175048801293042e-06, "loss": 0.0037, "reward": 1.84375, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 2255 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 400.8125, "epoch": 1.8048, "grad_norm": 0.8812652450707908, "kl": 0.09814453125, "learning_rate": 1.7155159953120315e-06, "loss": 0.0039, "reward": 1.65625, "reward_std": 0.2041158676147461, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.984375, "step": 2256 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 380.875, "epoch": 1.8056, "grad_norm": 0.8448951799668749, "kl": 0.0869140625, "learning_rate": 1.7135276610758309e-06, "loss": 0.0035, "reward": 1.546875, "reward_std": 0.2472364753484726, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 2257 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 382.640625, "epoch": 1.8064, "grad_norm": 0.7179314122332302, "kl": 0.078125, "learning_rate": 1.7115398788161923e-06, "loss": 0.0031, "reward": 1.640625, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 2258 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 364.296875, "epoch": 1.8072, "grad_norm": 0.7423929414923691, "kl": 0.2001953125, "learning_rate": 1.7095526499282172e-06, "loss": 0.008, "reward": 1.828125, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 2259 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 409.28125, "epoch": 1.808, "grad_norm": 0.07914383247635483, "kl": 0.07958984375, "learning_rate": 1.7075659758066207e-06, "loss": 0.0032, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 2260 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 381.15625, "epoch": 1.8088, "grad_norm": 2.0865411261662095, "kl": 0.08740234375, "learning_rate": 1.7055798578457267e-06, "loss": 0.0035, "reward": 1.53125, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 2261 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 430.578125, "epoch": 1.8096, "grad_norm": 0.5517740052570347, "kl": 0.06982421875, "learning_rate": 1.703594297439469e-06, "loss": 0.0028, "reward": 1.71875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 2262 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 423.578125, "epoch": 1.8104, "grad_norm": 1.258041362856456, "kl": 0.6796875, "learning_rate": 1.7016092959813892e-06, "loss": 0.0273, "reward": 1.875, "reward_std": 0.2177756279706955, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 0.984375, "step": 2263 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 396.53125, "epoch": 1.8112, "grad_norm": 0.5529226969659027, "kl": 0.09521484375, "learning_rate": 1.6996248548646393e-06, "loss": 0.0038, "reward": 1.796875, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 2264 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 397.453125, "epoch": 1.812, "grad_norm": 0.5097453340811813, "kl": 0.0830078125, "learning_rate": 1.6976409754819767e-06, "loss": 0.0033, "reward": 1.921875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 2265 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 416.9375, "epoch": 1.8128, "grad_norm": 0.06305387379928601, "kl": 0.0771484375, "learning_rate": 1.6956576592257635e-06, "loss": 0.0031, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2266 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 397.3125, "epoch": 1.8136, "grad_norm": 0.06838940090814126, "kl": 0.08251953125, "learning_rate": 1.6936749074879663e-06, "loss": 0.0033, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2267 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 419.75, "epoch": 1.8144, "grad_norm": 0.37466414719961527, "kl": 0.076171875, "learning_rate": 1.6916927216601593e-06, "loss": 0.0031, "reward": 1.578125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 2268 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 366.796875, "epoch": 1.8152, "grad_norm": 0.6063340028821798, "kl": 0.07861328125, "learning_rate": 1.6897111031335145e-06, "loss": 0.0031, "reward": 1.921875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 2269 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 428.421875, "epoch": 1.8159999999999998, "grad_norm": 0.6073598853765154, "kl": 0.515625, "learning_rate": 1.6877300532988095e-06, "loss": 0.0206, "reward": 1.625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.984375, "step": 2270 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 388.390625, "epoch": 1.8168, "grad_norm": 2.323754122284559, "kl": 0.0888671875, "learning_rate": 1.6857495735464196e-06, "loss": 0.0035, "reward": 1.8125, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 2271 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 404.109375, "epoch": 1.8176, "grad_norm": 0.6804114586495742, "kl": 0.1220703125, "learning_rate": 1.6837696652663244e-06, "loss": 0.0049, "reward": 1.578125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 2272 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 426.6875, "epoch": 1.8184, "grad_norm": 0.7756025874676311, "kl": 0.08984375, "learning_rate": 1.681790329848097e-06, "loss": 0.0036, "reward": 1.828125, "reward_std": 0.16887323558330536, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 2273 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 427.734375, "epoch": 1.8192, "grad_norm": 0.4015389990989993, "kl": 0.29296875, "learning_rate": 1.6798115686809125e-06, "loss": 0.0117, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2274 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 410.203125, "epoch": 1.8199999999999998, "grad_norm": 0.06056705674607063, "kl": 0.07861328125, "learning_rate": 1.677833383153542e-06, "loss": 0.0031, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2275 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 427.5, "epoch": 1.8208, "grad_norm": 2.205816337885792, "kl": 0.07861328125, "learning_rate": 1.6758557746543518e-06, "loss": 0.0031, "reward": 1.859375, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 2276 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 386.59375, "epoch": 1.8216, "grad_norm": 1.095409944294399, "kl": 0.263671875, "learning_rate": 1.673878744571304e-06, "loss": 0.0106, "reward": 1.90625, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.96875, "step": 2277 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 395.78125, "epoch": 1.8224, "grad_norm": 0.07041981281314577, "kl": 0.076171875, "learning_rate": 1.6719022942919527e-06, "loss": 0.003, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 2278 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 406.609375, "epoch": 1.8232, "grad_norm": 0.4342114951966203, "kl": 0.07958984375, "learning_rate": 1.6699264252034498e-06, "loss": 0.0032, "reward": 1.609375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 2279 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 417.25, "epoch": 1.8239999999999998, "grad_norm": 0.5092638014697883, "kl": 0.244140625, "learning_rate": 1.6679511386925337e-06, "loss": 0.0098, "reward": 1.609375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 2280 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 432.078125, "epoch": 1.8248, "grad_norm": 0.06874156878127212, "kl": 0.07275390625, "learning_rate": 1.6659764361455383e-06, "loss": 0.0029, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 2281 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.140625, "epoch": 1.8256000000000001, "grad_norm": 2.3332094846872873, "kl": 0.07470703125, "learning_rate": 1.6640023189483836e-06, "loss": 0.003, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 2282 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 419.3125, "epoch": 1.8264, "grad_norm": 0.3194931297089578, "kl": 0.09326171875, "learning_rate": 1.6620287884865831e-06, "loss": 0.0037, "reward": 1.8125, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 2283 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 434.96875, "epoch": 1.8272, "grad_norm": 0.058001088032383526, "kl": 0.0830078125, "learning_rate": 1.6600558461452368e-06, "loss": 0.0033, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2284 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 400.21875, "epoch": 1.8279999999999998, "grad_norm": 1.1346504301460851, "kl": 0.66015625, "learning_rate": 1.65808349330903e-06, "loss": 0.0263, "reward": 1.546875, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 2285 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 416.0, "epoch": 1.8288, "grad_norm": 0.7323502042973661, "kl": 0.09765625, "learning_rate": 1.656111731362236e-06, "loss": 0.0039, "reward": 1.84375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 2286 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.046875, "epoch": 1.8296000000000001, "grad_norm": 0.6425092179459982, "kl": 0.08203125, "learning_rate": 1.6541405616887138e-06, "loss": 0.0033, "reward": 1.890625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 2287 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 416.921875, "epoch": 1.8304, "grad_norm": 0.5851376635857483, "kl": 0.08984375, "learning_rate": 1.6521699856719065e-06, "loss": 0.0036, "reward": 1.65625, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 2288 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 387.78125, "epoch": 1.8312, "grad_norm": 0.39461520559472857, "kl": 0.07763671875, "learning_rate": 1.650200004694839e-06, "loss": 0.0031, "reward": 1.828125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 2289 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 456.046875, "epoch": 1.8319999999999999, "grad_norm": 0.5458663585800854, "kl": 0.0703125, "learning_rate": 1.6482306201401211e-06, "loss": 0.0028, "reward": 1.921875, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.984375, "step": 2290 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 457.375, "epoch": 1.8328, "grad_norm": 0.35289779031109975, "kl": 0.08056640625, "learning_rate": 1.6462618333899422e-06, "loss": 0.0032, "reward": 1.96875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 2291 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.78125, "epoch": 1.8336000000000001, "grad_norm": 0.8117770038248345, "kl": 0.25390625, "learning_rate": 1.6442936458260723e-06, "loss": 0.0101, "reward": 1.953125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 0.984375, "step": 2292 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 478.5625, "epoch": 1.8344, "grad_norm": 0.6311079921720235, "kl": 0.07958984375, "learning_rate": 1.6423260588298608e-06, "loss": 0.0032, "reward": 1.671875, "reward_std": 0.1530819833278656, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 2293 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 406.40625, "epoch": 1.8352, "grad_norm": 0.514535236545834, "kl": 0.080078125, "learning_rate": 1.6403590737822378e-06, "loss": 0.0032, "reward": 1.796875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 2294 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 454.40625, "epoch": 1.8359999999999999, "grad_norm": 0.22239642077532396, "kl": 0.1943359375, "learning_rate": 1.6383926920637077e-06, "loss": 0.0078, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2295 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 451.1875, "epoch": 1.8368, "grad_norm": 0.6811230340729435, "kl": 0.28125, "learning_rate": 1.6364269150543533e-06, "loss": 0.0113, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 0.984375, "step": 2296 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 392.1875, "epoch": 1.8376000000000001, "grad_norm": 1.2234684755317045, "kl": 0.37890625, "learning_rate": 1.6344617441338311e-06, "loss": 0.0152, "reward": 1.546875, "reward_std": 0.23144522309303284, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.984375, "step": 2297 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 436.875, "epoch": 1.8384, "grad_norm": 0.40356023234711014, "kl": 0.07861328125, "learning_rate": 1.6324971806813766e-06, "loss": 0.0031, "reward": 1.921875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 2298 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 393.1875, "epoch": 1.8392, "grad_norm": 1.0542946842458971, "kl": 0.083984375, "learning_rate": 1.6305332260757937e-06, "loss": 0.0034, "reward": 1.734375, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 2299 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 429.765625, "epoch": 1.8399999999999999, "grad_norm": 0.3947925761415726, "kl": 0.08203125, "learning_rate": 1.6285698816954626e-06, "loss": 0.0033, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 2300 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 446.265625, "epoch": 1.8408, "grad_norm": 0.7470857421143552, "kl": 0.10302734375, "learning_rate": 1.6266071489183327e-06, "loss": 0.0041, "reward": 1.640625, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 2301 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.390625, "epoch": 1.8416000000000001, "grad_norm": 0.6735528871930032, "kl": 0.083984375, "learning_rate": 1.6246450291219268e-06, "loss": 0.0034, "reward": 1.875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2302 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 444.84375, "epoch": 1.8424, "grad_norm": 0.5872276880370481, "kl": 0.09228515625, "learning_rate": 1.6226835236833356e-06, "loss": 0.0037, "reward": 1.90625, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 2303 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 433.6875, "epoch": 1.8432, "grad_norm": 1.2067725226290258, "kl": 0.10107421875, "learning_rate": 1.620722633979219e-06, "loss": 0.004, "reward": 1.921875, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.984375, "step": 2304 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 460.359375, "epoch": 1.8439999999999999, "grad_norm": 0.7093879480128268, "kl": 0.08544921875, "learning_rate": 1.6187623613858038e-06, "loss": 0.0034, "reward": 1.859375, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 2305 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 435.578125, "epoch": 1.8448, "grad_norm": 0.7916096166065976, "kl": 0.400390625, "learning_rate": 1.6168027072788868e-06, "loss": 0.016, "reward": 1.8125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 2306 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 492.09375, "epoch": 1.8456000000000001, "grad_norm": 0.6229346311295197, "kl": 0.07958984375, "learning_rate": 1.6148436730338279e-06, "loss": 0.0032, "reward": 1.9375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 2307 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 440.515625, "epoch": 1.8464, "grad_norm": 1.2172980612288264, "kl": 0.0830078125, "learning_rate": 1.6128852600255518e-06, "loss": 0.0033, "reward": 1.8125, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.984375, "step": 2308 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 489.796875, "epoch": 1.8472, "grad_norm": 0.8909217088840472, "kl": 1.3359375, "learning_rate": 1.6109274696285496e-06, "loss": 0.0535, "reward": 1.9375, "reward_std": 0.13719715178012848, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 0.96875, "step": 2309 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 463.203125, "epoch": 1.8479999999999999, "grad_norm": 0.5301575360099662, "kl": 0.09423828125, "learning_rate": 1.6089703032168736e-06, "loss": 0.0038, "reward": 1.875, "reward_std": 0.1243029236793518, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 0.984375, "step": 2310 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 479.765625, "epoch": 1.8488, "grad_norm": 0.44364619351434914, "kl": 0.81640625, "learning_rate": 1.6070137621641382e-06, "loss": 0.0327, "reward": 1.765625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.984375, "step": 2311 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 457.28125, "epoch": 1.8496000000000001, "grad_norm": 0.47166131642306697, "kl": 0.08837890625, "learning_rate": 1.6050578478435184e-06, "loss": 0.0035, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 2312 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 474.921875, "epoch": 1.8504, "grad_norm": 0.6161836323243681, "kl": 0.2138671875, "learning_rate": 1.6031025616277512e-06, "loss": 0.0086, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 2313 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 490.984375, "epoch": 1.8512, "grad_norm": 0.6143641531177049, "kl": 0.087890625, "learning_rate": 1.6011479048891323e-06, "loss": 0.0035, "reward": 1.765625, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.984375, "step": 2314 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 442.25, "epoch": 1.8519999999999999, "grad_norm": 1.3468600760031937, "kl": 0.56640625, "learning_rate": 1.5991938789995138e-06, "loss": 0.0227, "reward": 1.71875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.96875, "step": 2315 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 461.890625, "epoch": 1.8528, "grad_norm": 0.04483782424172691, "kl": 0.078125, "learning_rate": 1.5972404853303061e-06, "loss": 0.0031, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2316 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.359375, "epoch": 1.8536000000000001, "grad_norm": 0.8712801056418593, "kl": 0.1123046875, "learning_rate": 1.595287725252478e-06, "loss": 0.0045, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 0.984375, "step": 2317 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 420.953125, "epoch": 1.8544, "grad_norm": 1.4722712412340406, "kl": 1.5390625, "learning_rate": 1.5933356001365502e-06, "loss": 0.0616, "reward": 1.671875, "reward_std": 0.35816800594329834, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.9375, "step": 2318 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 503.453125, "epoch": 1.8552, "grad_norm": 1.788304634704619, "kl": 0.0869140625, "learning_rate": 1.591384111352599e-06, "loss": 0.0035, "reward": 1.578125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 2319 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 452.640625, "epoch": 1.8559999999999999, "grad_norm": 0.9205119308247197, "kl": 0.1474609375, "learning_rate": 1.5894332602702545e-06, "loss": 0.0059, "reward": 1.71875, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 2320 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 442.828125, "epoch": 1.8568, "grad_norm": 0.4724895151392997, "kl": 0.12255859375, "learning_rate": 1.5874830482587003e-06, "loss": 0.0049, "reward": 1.6875, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 2321 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 460.4375, "epoch": 1.8576000000000001, "grad_norm": 0.7016824771040632, "kl": 0.88671875, "learning_rate": 1.585533476686669e-06, "loss": 0.0355, "reward": 1.734375, "reward_std": 0.2109457403421402, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 2322 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 474.8125, "epoch": 1.8584, "grad_norm": 0.7669713951662993, "kl": 0.111328125, "learning_rate": 1.5835845469224447e-06, "loss": 0.0045, "reward": 1.875, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 0.984375, "step": 2323 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 455.90625, "epoch": 1.8592, "grad_norm": 0.5967170462706515, "kl": 0.0810546875, "learning_rate": 1.5816362603338632e-06, "loss": 0.0032, "reward": 1.734375, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 2324 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 502.0, "epoch": 1.8599999999999999, "grad_norm": 0.4899720910756078, "kl": 0.458984375, "learning_rate": 1.5796886182883053e-06, "loss": 0.0183, "reward": 1.578125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.984375, "step": 2325 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 501.140625, "epoch": 1.8608, "grad_norm": 0.8680875979230859, "kl": 1.796875, "learning_rate": 1.577741622152702e-06, "loss": 0.0718, "reward": 1.8125, "reward_std": 0.2651650309562683, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.953125, "step": 2326 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 473.359375, "epoch": 1.8616000000000001, "grad_norm": 0.6348149800883833, "kl": 1.703125, "learning_rate": 1.5757952732935288e-06, "loss": 0.0684, "reward": 1.6875, "reward_std": 0.2619796097278595, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.953125, "step": 2327 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 473.8125, "epoch": 1.8624, "grad_norm": 0.5647115361224099, "kl": 0.2578125, "learning_rate": 1.5738495730768104e-06, "loss": 0.0103, "reward": 1.484375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.984375, "step": 2328 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 437.671875, "epoch": 1.8632, "grad_norm": 1.6814611945657367, "kl": 0.181640625, "learning_rate": 1.5719045228681127e-06, "loss": 0.0072, "reward": 1.671875, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.984375, "step": 2329 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 405.640625, "epoch": 1.8639999999999999, "grad_norm": 0.623347105111847, "kl": 0.6953125, "learning_rate": 1.5699601240325474e-06, "loss": 0.0279, "reward": 1.828125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.984375, "step": 2330 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 442.453125, "epoch": 1.8648, "grad_norm": 1.3513951234674806, "kl": 1.6328125, "learning_rate": 1.5680163779347668e-06, "loss": 0.0655, "reward": 1.59375, "reward_std": 0.2744373679161072, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.90625, "step": 2331 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 410.015625, "epoch": 1.8656000000000001, "grad_norm": 0.6530723952699748, "kl": 0.21875, "learning_rate": 1.5660732859389687e-06, "loss": 0.0087, "reward": 1.828125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.96875, "step": 2332 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 523.140625, "epoch": 1.8664, "grad_norm": 1.0787077908420544, "kl": 0.2333984375, "learning_rate": 1.5641308494088903e-06, "loss": 0.0093, "reward": 1.828125, "reward_std": 0.23925507068634033, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.96875, "step": 2333 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 424.53125, "epoch": 1.8672, "grad_norm": 1.0625795033201053, "kl": 0.9140625, "learning_rate": 1.5621890697078069e-06, "loss": 0.0367, "reward": 1.796875, "reward_std": 0.2773849368095398, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.96875, "step": 2334 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 518.46875, "epoch": 1.8679999999999999, "grad_norm": 2.898753556284234, "kl": 0.341796875, "learning_rate": 1.5602479481985333e-06, "loss": 0.0137, "reward": 1.9375, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 0.953125, "step": 2335 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 375.609375, "epoch": 1.8688, "grad_norm": 1.1551349822481982, "kl": 1.1328125, "learning_rate": 1.5583074862434254e-06, "loss": 0.0455, "reward": 1.46875, "reward_std": 0.26409149169921875, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.984375, "step": 2336 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 452.53125, "epoch": 1.8696000000000002, "grad_norm": 1.524298777949189, "kl": 1.5859375, "learning_rate": 1.5563676852043738e-06, "loss": 0.0635, "reward": 1.5625, "reward_std": 0.3122667670249939, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.90625, "step": 2337 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 424.265625, "epoch": 1.8704, "grad_norm": 1.9314722579043813, "kl": 2.0, "learning_rate": 1.5544285464428044e-06, "loss": 0.0802, "reward": 1.703125, "reward_std": 0.45481547713279724, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.9375, "step": 2338 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 392.40625, "epoch": 1.8712, "grad_norm": 0.7790863024565302, "kl": 0.4375, "learning_rate": 1.55249007131968e-06, "loss": 0.0175, "reward": 1.8125, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 2339 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 459.578125, "epoch": 1.8719999999999999, "grad_norm": 0.302611999932875, "kl": 0.10498046875, "learning_rate": 1.5505522611954977e-06, "loss": 0.0042, "reward": 1.890625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 2340 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 472.75, "epoch": 1.8728, "grad_norm": 1.0436752059242749, "kl": 0.341796875, "learning_rate": 1.548615117430286e-06, "loss": 0.0136, "reward": 1.734375, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 2341 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 484.453125, "epoch": 1.8736000000000002, "grad_norm": 0.2955088923753632, "kl": 0.6796875, "learning_rate": 1.5466786413836077e-06, "loss": 0.0272, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.984375, "step": 2342 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 438.140625, "epoch": 1.8744, "grad_norm": 0.6671132830263261, "kl": 0.0751953125, "learning_rate": 1.5447428344145565e-06, "loss": 0.003, "reward": 1.84375, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 2343 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 418.65625, "epoch": 1.8752, "grad_norm": 0.12573231959996634, "kl": 0.103515625, "learning_rate": 1.5428076978817564e-06, "loss": 0.0041, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2344 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 403.0625, "epoch": 1.876, "grad_norm": 1.0971006880127447, "kl": 0.42578125, "learning_rate": 1.5408732331433596e-06, "loss": 0.0171, "reward": 1.5625, "reward_std": 0.2948937714099884, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.96875, "step": 2345 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 432.84375, "epoch": 1.8768, "grad_norm": 1.2217093029811685, "kl": 0.314453125, "learning_rate": 1.538939441557048e-06, "loss": 0.0126, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 0.96875, "step": 2346 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 403.015625, "epoch": 1.8776000000000002, "grad_norm": 0.6836710614113861, "kl": 1.0, "learning_rate": 1.5370063244800326e-06, "loss": 0.0401, "reward": 1.859375, "reward_std": 0.19044627249240875, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.984375, "step": 2347 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 466.46875, "epoch": 1.8784, "grad_norm": 0.2853069038669468, "kl": 0.1396484375, "learning_rate": 1.5350738832690479e-06, "loss": 0.0056, "reward": 1.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 2348 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 464.375, "epoch": 1.8792, "grad_norm": 1.0723235378029217, "kl": 0.9140625, "learning_rate": 1.5331421192803565e-06, "loss": 0.0367, "reward": 1.765625, "reward_std": 0.19044627249240875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.984375, "step": 2349 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 418.28125, "epoch": 1.88, "grad_norm": 1.3252120392631617, "kl": 0.7578125, "learning_rate": 1.5312110338697427e-06, "loss": 0.0304, "reward": 1.78125, "reward_std": 0.24608497321605682, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.984375, "step": 2350 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 441.625, "epoch": 1.8808, "grad_norm": 1.1791143111050741, "kl": 1.59375, "learning_rate": 1.5292806283925192e-06, "loss": 0.0637, "reward": 1.671875, "reward_std": 0.26076778769493103, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.9375, "step": 2351 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 438.8125, "epoch": 1.8816000000000002, "grad_norm": 1.3254748259748228, "kl": 1.8671875, "learning_rate": 1.5273509042035172e-06, "loss": 0.0746, "reward": 1.90625, "reward_std": 0.1828794628381729, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.96875, "step": 2352 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 444.234375, "epoch": 1.8824, "grad_norm": 0.5973355080036852, "kl": 0.60546875, "learning_rate": 1.5254218626570927e-06, "loss": 0.0243, "reward": 1.59375, "reward_std": 0.13719715178012848, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.96875, "step": 2353 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 392.953125, "epoch": 1.8832, "grad_norm": 1.1410039809378982, "kl": 0.83203125, "learning_rate": 1.5234935051071193e-06, "loss": 0.0333, "reward": 1.75, "reward_std": 0.19727617502212524, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.984375, "step": 2354 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 385.0, "epoch": 1.884, "grad_norm": 1.1471713788274727, "kl": 0.14453125, "learning_rate": 1.521565832906994e-06, "loss": 0.0058, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 2355 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 410.765625, "epoch": 1.8848, "grad_norm": 0.5609605962424036, "kl": 0.0771484375, "learning_rate": 1.519638847409632e-06, "loss": 0.0031, "reward": 1.8125, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 2356 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 407.34375, "epoch": 1.8856000000000002, "grad_norm": 0.980539815965368, "kl": 0.06982421875, "learning_rate": 1.5177125499674639e-06, "loss": 0.0028, "reward": 1.765625, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 2357 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 395.375, "epoch": 1.8864, "grad_norm": 1.154891923772219, "kl": 0.3828125, "learning_rate": 1.515786941932441e-06, "loss": 0.0153, "reward": 1.78125, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.984375, "step": 2358 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 409.03125, "epoch": 1.8872, "grad_norm": 0.9968607771230172, "kl": 0.52734375, "learning_rate": 1.5138620246560295e-06, "loss": 0.0211, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.984375, "step": 2359 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 401.796875, "epoch": 1.888, "grad_norm": 0.5690985149571485, "kl": 0.0859375, "learning_rate": 1.5119377994892095e-06, "loss": 0.0034, "reward": 1.71875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 2360 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 457.046875, "epoch": 1.8888, "grad_norm": 0.8228440234192015, "kl": 0.06884765625, "learning_rate": 1.5100142677824752e-06, "loss": 0.0027, "reward": 1.625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 2361 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 369.3125, "epoch": 1.8896, "grad_norm": 0.07799185674390813, "kl": 0.0771484375, "learning_rate": 1.5080914308858375e-06, "loss": 0.0031, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2362 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.203125, "epoch": 1.8904, "grad_norm": 1.940053735681463, "kl": 0.47265625, "learning_rate": 1.5061692901488161e-06, "loss": 0.019, "reward": 1.796875, "reward_std": 0.19939783215522766, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.96875, "step": 2363 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 383.296875, "epoch": 1.8912, "grad_norm": 0.6430789106343778, "kl": 0.208984375, "learning_rate": 1.5042478469204437e-06, "loss": 0.0084, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 2364 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 492.9375, "epoch": 1.892, "grad_norm": 1.8130245744964804, "kl": 0.69921875, "learning_rate": 1.502327102549262e-06, "loss": 0.0281, "reward": 1.609375, "reward_std": 0.19044628739356995, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.96875, "step": 2365 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 382.96875, "epoch": 1.8928, "grad_norm": 0.17595814693903425, "kl": 0.1328125, "learning_rate": 1.5004070583833252e-06, "loss": 0.0053, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2366 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 445.21875, "epoch": 1.8936, "grad_norm": 0.7845570900182349, "kl": 1.265625, "learning_rate": 1.4984877157701932e-06, "loss": 0.0506, "reward": 1.578125, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.953125, "step": 2367 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 477.453125, "epoch": 1.8944, "grad_norm": 0.4826090070028168, "kl": 0.072265625, "learning_rate": 1.4965690760569346e-06, "loss": 0.0029, "reward": 1.921875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 2368 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 365.5625, "epoch": 1.8952, "grad_norm": 1.8023395400804547, "kl": 0.58984375, "learning_rate": 1.4946511405901237e-06, "loss": 0.0236, "reward": 1.9375, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 0.984375, "step": 2369 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 430.734375, "epoch": 1.896, "grad_norm": 1.1268241412178124, "kl": 1.1875, "learning_rate": 1.4927339107158437e-06, "loss": 0.0471, "reward": 1.734375, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 2370 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 444.96875, "epoch": 1.8968, "grad_norm": 1.651657729387728, "kl": 3.359375, "learning_rate": 1.4908173877796784e-06, "loss": 0.1348, "reward": 1.625, "reward_std": 0.31973996758461, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.921875, "step": 2371 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 434.875, "epoch": 1.8976, "grad_norm": 2.6397654725926225, "kl": 4.28125, "learning_rate": 1.4889015731267186e-06, "loss": 0.1708, "reward": 1.84375, "reward_std": 0.36278265714645386, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 0.890625, "step": 2372 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 389.8125, "epoch": 1.8984, "grad_norm": 0.5491844825137755, "kl": 0.5859375, "learning_rate": 1.486986468101555e-06, "loss": 0.0236, "reward": 1.765625, "reward_std": 0.1530819833278656, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.984375, "step": 2373 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 451.4375, "epoch": 1.8992, "grad_norm": 1.7994233823728274, "kl": 1.625, "learning_rate": 1.4850720740482842e-06, "loss": 0.0652, "reward": 1.703125, "reward_std": 0.18139132857322693, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.953125, "step": 2374 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 486.59375, "epoch": 1.9, "grad_norm": 1.6773978859741498, "kl": 3.859375, "learning_rate": 1.4831583923105e-06, "loss": 0.1539, "reward": 1.671875, "reward_std": 0.47389551997184753, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.90625, "step": 2375 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 378.1875, "epoch": 1.9008, "grad_norm": 1.8736716490980116, "kl": 1.03125, "learning_rate": 1.481245424231298e-06, "loss": 0.0413, "reward": 1.828125, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.96875, "step": 2376 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 451.34375, "epoch": 1.9016, "grad_norm": 1.013615619433797, "kl": 0.578125, "learning_rate": 1.4793331711532743e-06, "loss": 0.0231, "reward": 1.90625, "reward_std": 0.1828794628381729, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 0.953125, "step": 2377 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 418.34375, "epoch": 1.9024, "grad_norm": 1.3228661327007896, "kl": 0.5625, "learning_rate": 1.4774216344185204e-06, "loss": 0.0225, "reward": 1.75, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.984375, "step": 2378 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 446.140625, "epoch": 1.9032, "grad_norm": 1.9025870153471482, "kl": 1.984375, "learning_rate": 1.4755108153686275e-06, "loss": 0.0792, "reward": 1.78125, "reward_std": 0.2651650309562683, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.9375, "step": 2379 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.5, "epoch": 1.904, "grad_norm": 1.7443363795882494, "kl": 0.96484375, "learning_rate": 1.4736007153446803e-06, "loss": 0.0384, "reward": 1.921875, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.984375, "step": 2380 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 386.4375, "epoch": 1.9048, "grad_norm": 1.185011585277608, "kl": 2.25, "learning_rate": 1.4716913356872614e-06, "loss": 0.0901, "reward": 1.828125, "reward_std": 0.2475731074810028, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 0.9375, "step": 2381 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 388.140625, "epoch": 1.9056, "grad_norm": 0.7768321191968288, "kl": 0.46484375, "learning_rate": 1.4697826777364478e-06, "loss": 0.0186, "reward": 1.578125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.984375, "step": 2382 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 461.53125, "epoch": 1.9064, "grad_norm": 0.697500500552628, "kl": 1.453125, "learning_rate": 1.467874742831808e-06, "loss": 0.0582, "reward": 1.546875, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.953125, "step": 2383 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 369.421875, "epoch": 1.9072, "grad_norm": 0.16652836720156833, "kl": 0.076171875, "learning_rate": 1.4659675323124037e-06, "loss": 0.003, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2384 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 391.78125, "epoch": 1.908, "grad_norm": 3.7073476646067087, "kl": 0.6953125, "learning_rate": 1.46406104751679e-06, "loss": 0.0278, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.96875, "step": 2385 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 389.5, "epoch": 1.9088, "grad_norm": 0.10786571220709351, "kl": 0.08642578125, "learning_rate": 1.462155289783011e-06, "loss": 0.0035, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2386 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 422.171875, "epoch": 1.9096, "grad_norm": 4.944269786732493, "kl": 1.03125, "learning_rate": 1.4602502604486e-06, "loss": 0.0415, "reward": 1.78125, "reward_std": 0.2041158676147461, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.953125, "step": 2387 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 473.328125, "epoch": 1.9104, "grad_norm": 1.7538214127820733, "kl": 2.953125, "learning_rate": 1.45834596085058e-06, "loss": 0.1182, "reward": 1.71875, "reward_std": 0.40139204263687134, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.9375, "step": 2388 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 418.25, "epoch": 1.9112, "grad_norm": 0.8850296233423672, "kl": 1.9140625, "learning_rate": 1.456442392325463e-06, "loss": 0.0766, "reward": 1.46875, "reward_std": 0.19506090879440308, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.953125, "step": 2389 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 402.75, "epoch": 1.912, "grad_norm": 1.7349139855129643, "kl": 2.015625, "learning_rate": 1.4545395562092467e-06, "loss": 0.0806, "reward": 1.5625, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.9375, "step": 2390 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 452.703125, "epoch": 1.9127999999999998, "grad_norm": 0.8028662692336452, "kl": 3.5625, "learning_rate": 1.4526374538374133e-06, "loss": 0.1424, "reward": 1.5, "reward_std": 0.28566449880599976, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.90625, "step": 2391 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 481.40625, "epoch": 1.9136, "grad_norm": 3.132424951458745, "kl": 4.40625, "learning_rate": 1.4507360865449318e-06, "loss": 0.1768, "reward": 1.4375, "reward_std": 0.4297013580799103, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.875, "step": 2392 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 563.9375, "epoch": 1.9144, "grad_norm": 8.129251157032787, "kl": 13.0, "learning_rate": 1.4488354556662553e-06, "loss": 0.5199, "reward": 1.484375, "reward_std": 0.6906865239143372, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.6875, "step": 2393 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 455.484375, "epoch": 1.9152, "grad_norm": 2.3740502257386185, "kl": 5.4375, "learning_rate": 1.4469355625353199e-06, "loss": 0.2183, "reward": 1.46875, "reward_std": 0.42286166548728943, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.859375, "step": 2394 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 440.90625, "epoch": 1.916, "grad_norm": 3.8721595956167616, "kl": 2.8125, "learning_rate": 1.4450364084855433e-06, "loss": 0.1127, "reward": 1.53125, "reward_std": 0.2895520329475403, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.890625, "step": 2395 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 414.296875, "epoch": 1.9167999999999998, "grad_norm": 2.1228233089204775, "kl": 0.7265625, "learning_rate": 1.4431379948498254e-06, "loss": 0.029, "reward": 1.71875, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.953125, "step": 2396 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 385.703125, "epoch": 1.9176, "grad_norm": 1.6526787411586574, "kl": 1.46875, "learning_rate": 1.4412403229605453e-06, "loss": 0.0587, "reward": 1.65625, "reward_std": 0.22558550536632538, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.9375, "step": 2397 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 473.8125, "epoch": 1.9184, "grad_norm": 2.800740644453696, "kl": 1.03125, "learning_rate": 1.4393433941495638e-06, "loss": 0.0411, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.96875, "step": 2398 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 396.84375, "epoch": 1.9192, "grad_norm": 0.7896679883569652, "kl": 0.333984375, "learning_rate": 1.4374472097482156e-06, "loss": 0.0134, "reward": 1.625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.984375, "step": 2399 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 442.875, "epoch": 1.92, "grad_norm": 1.0336282386280489, "kl": 0.9609375, "learning_rate": 1.4355517710873184e-06, "loss": 0.0385, "reward": 1.625, "reward_std": 0.2041158676147461, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.96875, "step": 2400 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 418.640625, "epoch": 1.9207999999999998, "grad_norm": 0.23397704646346038, "kl": 0.09375, "learning_rate": 1.4336570794971643e-06, "loss": 0.0038, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2401 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 424.09375, "epoch": 1.9216, "grad_norm": 1.1937637305859858, "kl": 0.6640625, "learning_rate": 1.4317631363075186e-06, "loss": 0.0267, "reward": 1.703125, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.96875, "step": 2402 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 429.71875, "epoch": 1.9224, "grad_norm": 0.5965034849870672, "kl": 0.458984375, "learning_rate": 1.4298699428476236e-06, "loss": 0.0184, "reward": 1.84375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.984375, "step": 2403 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 474.359375, "epoch": 1.9232, "grad_norm": 1.405933984259602, "kl": 0.78125, "learning_rate": 1.427977500446199e-06, "loss": 0.0313, "reward": 1.53125, "reward_std": 0.13719715178012848, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.96875, "step": 2404 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 358.390625, "epoch": 1.924, "grad_norm": 2.1879113892963846, "kl": 0.72265625, "learning_rate": 1.4260858104314299e-06, "loss": 0.0289, "reward": 1.8125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.953125, "step": 2405 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 376.21875, "epoch": 1.9247999999999998, "grad_norm": 0.13799559141652026, "kl": 0.0966796875, "learning_rate": 1.4241948741309783e-06, "loss": 0.0039, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2406 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 451.03125, "epoch": 1.9256, "grad_norm": 1.7073983939222042, "kl": 0.56640625, "learning_rate": 1.4223046928719764e-06, "loss": 0.0227, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 0.96875, "step": 2407 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 382.21875, "epoch": 1.9264000000000001, "grad_norm": 1.0241736559368684, "kl": 0.09814453125, "learning_rate": 1.420415267981026e-06, "loss": 0.0039, "reward": 1.5625, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 2408 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 374.171875, "epoch": 1.9272, "grad_norm": 0.3417625778417692, "kl": 0.095703125, "learning_rate": 1.418526600784198e-06, "loss": 0.0038, "reward": 1.84375, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 2409 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 345.171875, "epoch": 1.928, "grad_norm": 0.7692397213525569, "kl": 0.1572265625, "learning_rate": 1.4166386926070322e-06, "loss": 0.0063, "reward": 1.953125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 2410 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 405.09375, "epoch": 1.9287999999999998, "grad_norm": 0.3322491135522514, "kl": 0.1298828125, "learning_rate": 1.414751544774535e-06, "loss": 0.0052, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 2411 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 469.828125, "epoch": 1.9296, "grad_norm": 0.5821248791023559, "kl": 0.2255859375, "learning_rate": 1.412865158611179e-06, "loss": 0.009, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2412 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 405.546875, "epoch": 1.9304000000000001, "grad_norm": 0.7299583132737032, "kl": 1.6171875, "learning_rate": 1.4109795354409045e-06, "loss": 0.0648, "reward": 1.75, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.953125, "step": 2413 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.46875, "epoch": 1.9312, "grad_norm": 0.7082718237042975, "kl": 0.6015625, "learning_rate": 1.4090946765871105e-06, "loss": 0.0241, "reward": 1.921875, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 0.96875, "step": 2414 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 389.203125, "epoch": 1.932, "grad_norm": 1.367174375883604, "kl": 0.287109375, "learning_rate": 1.4072105833726685e-06, "loss": 0.0115, "reward": 1.671875, "reward_std": 0.1530819833278656, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.984375, "step": 2415 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 381.0625, "epoch": 1.9327999999999999, "grad_norm": 0.6774045362409198, "kl": 0.09619140625, "learning_rate": 1.4053272571199037e-06, "loss": 0.0038, "reward": 1.765625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 2416 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 400.890625, "epoch": 1.9336, "grad_norm": 0.6653094918917516, "kl": 1.2109375, "learning_rate": 1.4034446991506084e-06, "loss": 0.0482, "reward": 1.78125, "reward_std": 0.2041158676147461, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.953125, "step": 2417 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 402.6875, "epoch": 1.9344000000000001, "grad_norm": 0.13353366925534746, "kl": 0.08935546875, "learning_rate": 1.401562910786034e-06, "loss": 0.0036, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2418 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 398.890625, "epoch": 1.9352, "grad_norm": 0.7638499083157223, "kl": 1.140625, "learning_rate": 1.3996818933468926e-06, "loss": 0.0457, "reward": 1.796875, "reward_std": 0.18139132857322693, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.953125, "step": 2419 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 433.140625, "epoch": 1.936, "grad_norm": 0.33290967962973383, "kl": 0.353515625, "learning_rate": 1.397801648153354e-06, "loss": 0.0141, "reward": 1.859375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.984375, "step": 2420 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 418.21875, "epoch": 1.9367999999999999, "grad_norm": 0.8091019407578133, "kl": 0.94921875, "learning_rate": 1.395922176525047e-06, "loss": 0.0379, "reward": 1.890625, "reward_std": 0.15981829166412354, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 0.96875, "step": 2421 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 434.828125, "epoch": 1.9376, "grad_norm": 1.0066023202487273, "kl": 0.6484375, "learning_rate": 1.3940434797810567e-06, "loss": 0.026, "reward": 1.75, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.984375, "step": 2422 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 424.203125, "epoch": 1.9384000000000001, "grad_norm": 1.0731406534843149, "kl": 0.99609375, "learning_rate": 1.3921655592399256e-06, "loss": 0.0401, "reward": 1.796875, "reward_std": 0.22097086906433105, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.96875, "step": 2423 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 422.359375, "epoch": 1.9392, "grad_norm": 0.9808281341958132, "kl": 1.8828125, "learning_rate": 1.3902884162196509e-06, "loss": 0.0753, "reward": 1.859375, "reward_std": 0.31546199321746826, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 0.9375, "step": 2424 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 390.25, "epoch": 1.94, "grad_norm": 0.5700271885815287, "kl": 0.58984375, "learning_rate": 1.388412052037682e-06, "loss": 0.0236, "reward": 1.84375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.984375, "step": 2425 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 392.171875, "epoch": 1.9407999999999999, "grad_norm": 1.7155177622592277, "kl": 1.234375, "learning_rate": 1.3865364680109239e-06, "loss": 0.0498, "reward": 1.84375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.96875, "step": 2426 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 403.84375, "epoch": 1.9416, "grad_norm": 0.662619706734705, "kl": 1.1171875, "learning_rate": 1.384661665455736e-06, "loss": 0.0446, "reward": 1.796875, "reward_std": 0.22097086906433105, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.96875, "step": 2427 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 402.1875, "epoch": 1.9424000000000001, "grad_norm": 2.8082600803937443, "kl": 1.1328125, "learning_rate": 1.3827876456879247e-06, "loss": 0.0455, "reward": 1.4375, "reward_std": 0.19727616012096405, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.953125, "step": 2428 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 421.53125, "epoch": 1.9432, "grad_norm": 0.98101898384185, "kl": 0.6484375, "learning_rate": 1.3809144100227483e-06, "loss": 0.0259, "reward": 1.71875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 2429 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 421.71875, "epoch": 1.944, "grad_norm": 1.1167812076105414, "kl": 1.3046875, "learning_rate": 1.3790419597749198e-06, "loss": 0.0521, "reward": 1.828125, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.96875, "step": 2430 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 371.671875, "epoch": 1.9447999999999999, "grad_norm": 0.7169936463537103, "kl": 0.77734375, "learning_rate": 1.3771702962585928e-06, "loss": 0.0311, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.96875, "step": 2431 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 527.09375, "epoch": 1.9456, "grad_norm": 0.7200255970830155, "kl": 2.265625, "learning_rate": 1.3752994207873743e-06, "loss": 0.0902, "reward": 1.78125, "reward_std": 0.2651650309562683, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.9375, "step": 2432 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 434.90625, "epoch": 1.9464000000000001, "grad_norm": 2.528998755799731, "kl": 1.640625, "learning_rate": 1.373429334674317e-06, "loss": 0.0656, "reward": 1.75, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.9375, "step": 2433 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 425.203125, "epoch": 1.9472, "grad_norm": 0.9374989047821527, "kl": 1.28125, "learning_rate": 1.3715600392319186e-06, "loss": 0.0513, "reward": 1.78125, "reward_std": 0.24684599041938782, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.96875, "step": 2434 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 434.71875, "epoch": 1.948, "grad_norm": 0.3005621124258611, "kl": 0.66015625, "learning_rate": 1.369691535772123e-06, "loss": 0.0265, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 0.984375, "step": 2435 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 455.453125, "epoch": 1.9487999999999999, "grad_norm": 1.2796207746661543, "kl": 1.1796875, "learning_rate": 1.3678238256063193e-06, "loss": 0.0472, "reward": 1.6875, "reward_std": 0.213067427277565, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.984375, "step": 2436 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 440.140625, "epoch": 1.9496, "grad_norm": 0.9651874328660741, "kl": 2.21875, "learning_rate": 1.3659569100453346e-06, "loss": 0.0889, "reward": 1.734375, "reward_std": 0.28778618574142456, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.9375, "step": 2437 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 435.359375, "epoch": 1.9504000000000001, "grad_norm": 2.6329782048013586, "kl": 2.125, "learning_rate": 1.3640907903994455e-06, "loss": 0.085, "reward": 1.765625, "reward_std": 0.15499483048915863, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.9375, "step": 2438 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 411.515625, "epoch": 1.9512, "grad_norm": 1.4783404096789414, "kl": 0.130859375, "learning_rate": 1.3622254679783665e-06, "loss": 0.0052, "reward": 1.734375, "reward_std": 0.17782479524612427, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 2439 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 465.5625, "epoch": 1.952, "grad_norm": 1.6253159465554536, "kl": 3.90625, "learning_rate": 1.3603609440912508e-06, "loss": 0.1559, "reward": 1.453125, "reward_std": 0.5136494040489197, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.890625, "step": 2440 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 469.328125, "epoch": 1.9527999999999999, "grad_norm": 1.8519862602162838, "kl": 2.09375, "learning_rate": 1.3584972200466936e-06, "loss": 0.0835, "reward": 1.78125, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.9375, "step": 2441 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 448.578125, "epoch": 1.9536, "grad_norm": 1.386289677708159, "kl": 3.390625, "learning_rate": 1.356634297152729e-06, "loss": 0.1358, "reward": 1.390625, "reward_std": 0.3298586905002594, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.90625, "step": 2442 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 433.734375, "epoch": 1.9544000000000001, "grad_norm": 1.4488160932843597, "kl": 3.21875, "learning_rate": 1.3547721767168273e-06, "loss": 0.1293, "reward": 1.734375, "reward_std": 0.5867297649383545, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.890625, "step": 2443 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 540.9375, "epoch": 1.9552, "grad_norm": 0.5355961107458718, "kl": 1.984375, "learning_rate": 1.3529108600458967e-06, "loss": 0.0796, "reward": 1.796875, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.953125, "step": 2444 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 457.359375, "epoch": 1.956, "grad_norm": 1.4905781930338178, "kl": 1.4609375, "learning_rate": 1.3510503484462807e-06, "loss": 0.0587, "reward": 1.75, "reward_std": 0.2040124535560608, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.953125, "step": 2445 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 486.953125, "epoch": 1.9567999999999999, "grad_norm": 2.6096505744523775, "kl": 2.4375, "learning_rate": 1.349190643223758e-06, "loss": 0.0973, "reward": 1.796875, "reward_std": 0.42498332262039185, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.921875, "step": 2446 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 427.109375, "epoch": 1.9576, "grad_norm": 8.682296049745519, "kl": 1.8671875, "learning_rate": 1.347331745683542e-06, "loss": 0.0747, "reward": 1.828125, "reward_std": 0.23925508558750153, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.953125, "step": 2447 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 420.25, "epoch": 1.9584000000000001, "grad_norm": 1.0152230889390625, "kl": 1.2734375, "learning_rate": 1.3454736571302761e-06, "loss": 0.051, "reward": 1.46875, "reward_std": 0.2346404492855072, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.953125, "step": 2448 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 478.875, "epoch": 1.9592, "grad_norm": 0.9667083422075894, "kl": 1.4375, "learning_rate": 1.3436163788680411e-06, "loss": 0.0576, "reward": 1.546875, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.96875, "step": 2449 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 425.421875, "epoch": 1.96, "grad_norm": 1.3667488608028606, "kl": 1.8671875, "learning_rate": 1.3417599122003464e-06, "loss": 0.0745, "reward": 1.78125, "reward_std": 0.2924008071422577, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.921875, "step": 2450 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 393.796875, "epoch": 1.9607999999999999, "grad_norm": 0.71504983170553, "kl": 1.1796875, "learning_rate": 1.3399042584301298e-06, "loss": 0.0473, "reward": 1.703125, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.96875, "step": 2451 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 426.921875, "epoch": 1.9616, "grad_norm": 0.961385853315966, "kl": 1.0546875, "learning_rate": 1.3380494188597603e-06, "loss": 0.042, "reward": 1.828125, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.984375, "step": 2452 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 413.203125, "epoch": 1.9624000000000001, "grad_norm": 3.8970906370746277, "kl": 1.34375, "learning_rate": 1.3361953947910394e-06, "loss": 0.0538, "reward": 1.828125, "reward_std": 0.34564992785453796, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.953125, "step": 2453 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 398.75, "epoch": 1.9632, "grad_norm": 0.33225482099165854, "kl": 0.8828125, "learning_rate": 1.334342187525189e-06, "loss": 0.0352, "reward": 1.9375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 0.96875, "step": 2454 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 458.078125, "epoch": 1.964, "grad_norm": 9.28529152616632, "kl": 1.1171875, "learning_rate": 1.3324897983628621e-06, "loss": 0.0446, "reward": 1.703125, "reward_std": 0.21730589866638184, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.953125, "step": 2455 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 482.25, "epoch": 1.9647999999999999, "grad_norm": 1.1380537604259222, "kl": 0.5546875, "learning_rate": 1.330638228604137e-06, "loss": 0.0221, "reward": 1.734375, "reward_std": 0.20189079642295837, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 2456 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 440.40625, "epoch": 1.9656, "grad_norm": 0.7523422406555214, "kl": 0.98046875, "learning_rate": 1.3287874795485168e-06, "loss": 0.0394, "reward": 1.828125, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.96875, "step": 2457 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 412.40625, "epoch": 1.9664000000000001, "grad_norm": 0.6149431433268288, "kl": 0.9453125, "learning_rate": 1.3269375524949286e-06, "loss": 0.0378, "reward": 1.828125, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.96875, "step": 2458 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 430.15625, "epoch": 1.9672, "grad_norm": 0.4774950978450209, "kl": 0.328125, "learning_rate": 1.3250884487417227e-06, "loss": 0.0131, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2459 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 453.515625, "epoch": 1.968, "grad_norm": 0.8734251798841773, "kl": 2.015625, "learning_rate": 1.3232401695866686e-06, "loss": 0.0805, "reward": 1.65625, "reward_std": 0.2709311842918396, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.9375, "step": 2460 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 457.71875, "epoch": 1.9687999999999999, "grad_norm": 1.3631889780253275, "kl": 1.25, "learning_rate": 1.321392716326963e-06, "loss": 0.05, "reward": 1.6875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.96875, "step": 2461 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 434.234375, "epoch": 1.9696, "grad_norm": 0.6394979876062535, "kl": 0.828125, "learning_rate": 1.3195460902592193e-06, "loss": 0.0331, "reward": 1.609375, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.96875, "step": 2462 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 444.84375, "epoch": 1.9704000000000002, "grad_norm": 2.514048156633523, "kl": 3.875, "learning_rate": 1.3177002926794685e-06, "loss": 0.1551, "reward": 1.359375, "reward_std": 0.5200409889221191, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.859375, "step": 2463 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 474.328125, "epoch": 1.9712, "grad_norm": 1.1340715503269814, "kl": 2.609375, "learning_rate": 1.3158553248831658e-06, "loss": 0.1041, "reward": 1.59375, "reward_std": 0.3718376159667969, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.9375, "step": 2464 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 438.875, "epoch": 1.972, "grad_norm": 0.6956774003491915, "kl": 1.0390625, "learning_rate": 1.3140111881651773e-06, "loss": 0.0416, "reward": 1.84375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.96875, "step": 2465 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 411.171875, "epoch": 1.9727999999999999, "grad_norm": 0.5235973050143332, "kl": 0.83203125, "learning_rate": 1.312167883819791e-06, "loss": 0.0332, "reward": 1.734375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 2466 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 456.796875, "epoch": 1.9736, "grad_norm": 1.473294108100521, "kl": 1.1640625, "learning_rate": 1.3103254131407082e-06, "loss": 0.0464, "reward": 1.71875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.96875, "step": 2467 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 427.90625, "epoch": 1.9744000000000002, "grad_norm": 4.62995735255069, "kl": 3.125, "learning_rate": 1.308483777421046e-06, "loss": 0.1251, "reward": 1.59375, "reward_std": 0.48066505789756775, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.90625, "step": 2468 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 452.9375, "epoch": 1.9752, "grad_norm": 2.3200179982419966, "kl": 1.84375, "learning_rate": 1.3066429779533352e-06, "loss": 0.0735, "reward": 1.671875, "reward_std": 0.32764342427253723, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.9375, "step": 2469 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 490.140625, "epoch": 1.976, "grad_norm": 1.138649250709883, "kl": 2.28125, "learning_rate": 1.3048030160295196e-06, "loss": 0.0912, "reward": 1.671875, "reward_std": 0.27883461117744446, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.9375, "step": 2470 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 473.984375, "epoch": 1.9768, "grad_norm": 3.0332609973450797, "kl": 1.2578125, "learning_rate": 1.3029638929409555e-06, "loss": 0.0504, "reward": 1.6875, "reward_std": 0.13719715178012848, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.953125, "step": 2471 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 442.9375, "epoch": 1.9776, "grad_norm": 1.7715433838125119, "kl": 3.546875, "learning_rate": 1.3011256099784103e-06, "loss": 0.1423, "reward": 1.578125, "reward_std": 0.47417181730270386, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.890625, "step": 2472 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 442.265625, "epoch": 1.9784000000000002, "grad_norm": 2.235711684889043, "kl": 2.390625, "learning_rate": 1.2992881684320627e-06, "loss": 0.0958, "reward": 1.671875, "reward_std": 0.43047669529914856, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.921875, "step": 2473 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 478.890625, "epoch": 1.9792, "grad_norm": 1.767685645904705, "kl": 4.09375, "learning_rate": 1.297451569591498e-06, "loss": 0.1643, "reward": 1.796875, "reward_std": 0.4343159794807434, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 0.875, "step": 2474 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 503.921875, "epoch": 1.98, "grad_norm": 1.558552636436934, "kl": 5.6875, "learning_rate": 1.2956158147457116e-06, "loss": 0.227, "reward": 1.5625, "reward_std": 0.4267922341823578, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.8125, "step": 2475 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 580.984375, "epoch": 1.9808, "grad_norm": 7.419471445231683, "kl": 7.5, "learning_rate": 1.2937809051831102e-06, "loss": 0.3002, "reward": 1.421875, "reward_std": 0.4921797215938568, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.765625, "step": 2476 }, { "all_correct": 0.0, "all_wrong": 0.125, "completion_length": 588.640625, "epoch": 1.9816, "grad_norm": 7.921893460533328, "kl": 11.875, "learning_rate": 1.2919468421915008e-06, "loss": 0.4759, "reward": 1.375, "reward_std": 0.7253988981246948, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.671875, "step": 2477 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 444.390625, "epoch": 1.9824000000000002, "grad_norm": 3.1882832646838053, "kl": 4.71875, "learning_rate": 1.2901136270580994e-06, "loss": 0.1892, "reward": 1.5625, "reward_std": 0.29476165771484375, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.84375, "step": 2478 }, { "all_correct": 0.125, "all_wrong": 0.5, "completion_length": 437.578125, "epoch": 1.9832, "grad_norm": 1.2594971316236099, "kl": 3.328125, "learning_rate": 1.2882812610695305e-06, "loss": 0.1334, "reward": 1.21875, "reward_std": 0.3333997130393982, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.875, "step": 2479 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 466.5, "epoch": 1.984, "grad_norm": 1.2952932685895526, "kl": 3.0625, "learning_rate": 1.2864497455118152e-06, "loss": 0.1226, "reward": 1.515625, "reward_std": 0.4027767777442932, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.875, "step": 2480 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 392.984375, "epoch": 1.9848, "grad_norm": 1.7427594979289083, "kl": 0.453125, "learning_rate": 1.2846190816703836e-06, "loss": 0.0182, "reward": 1.5625, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 2481 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.3125, "epoch": 1.9856, "grad_norm": 1.829837754717998, "kl": 0.59765625, "learning_rate": 1.2827892708300648e-06, "loss": 0.0239, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 0.953125, "step": 2482 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 422.328125, "epoch": 1.9864000000000002, "grad_norm": 1.6968579942041917, "kl": 1.140625, "learning_rate": 1.280960314275092e-06, "loss": 0.0457, "reward": 1.75, "reward_std": 0.2834492623806, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.9375, "step": 2483 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 436.828125, "epoch": 1.9872, "grad_norm": 1.3706405759005194, "kl": 0.67578125, "learning_rate": 1.279132213289096e-06, "loss": 0.0271, "reward": 1.8125, "reward_std": 0.2346404492855072, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.96875, "step": 2484 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 383.765625, "epoch": 1.988, "grad_norm": 1.2479577260390846, "kl": 0.7265625, "learning_rate": 1.2773049691551103e-06, "loss": 0.0291, "reward": 1.828125, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.96875, "step": 2485 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 476.171875, "epoch": 1.9888, "grad_norm": 0.4718047526902604, "kl": 0.296875, "learning_rate": 1.2754785831555617e-06, "loss": 0.0119, "reward": 1.703125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 2486 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 426.515625, "epoch": 1.9896, "grad_norm": 3.873672543262227, "kl": 0.81640625, "learning_rate": 1.273653056572282e-06, "loss": 0.0326, "reward": 1.84375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 0.953125, "step": 2487 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 385.640625, "epoch": 1.9904, "grad_norm": 3.808522502892453, "kl": 0.80859375, "learning_rate": 1.2718283906864939e-06, "loss": 0.0324, "reward": 1.640625, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.96875, "step": 2488 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 355.453125, "epoch": 1.9912, "grad_norm": 0.8173967776247805, "kl": 0.4765625, "learning_rate": 1.2700045867788184e-06, "loss": 0.0191, "reward": 1.59375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.984375, "step": 2489 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 398.59375, "epoch": 1.992, "grad_norm": 0.6222982221544567, "kl": 0.890625, "learning_rate": 1.2681816461292715e-06, "loss": 0.0356, "reward": 1.609375, "reward_std": 0.15086673200130463, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.96875, "step": 2490 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 461.859375, "epoch": 1.9928, "grad_norm": 0.7749320110635745, "kl": 0.373046875, "learning_rate": 1.2663595700172631e-06, "loss": 0.0149, "reward": 1.953125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 0.96875, "step": 2491 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 390.984375, "epoch": 1.9936, "grad_norm": 0.6220782396058596, "kl": 0.4765625, "learning_rate": 1.2645383597215965e-06, "loss": 0.019, "reward": 1.90625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 2492 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 390.734375, "epoch": 1.9944, "grad_norm": 1.206073841190381, "kl": 1.1171875, "learning_rate": 1.2627180165204671e-06, "loss": 0.0449, "reward": 1.828125, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.96875, "step": 2493 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 417.6875, "epoch": 1.9952, "grad_norm": 1.295960181274372, "kl": 0.86328125, "learning_rate": 1.2608985416914616e-06, "loss": 0.0344, "reward": 1.734375, "reward_std": 0.2198973000049591, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 2494 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 465.171875, "epoch": 1.996, "grad_norm": 0.5028284488457815, "kl": 0.87109375, "learning_rate": 1.259079936511558e-06, "loss": 0.0347, "reward": 1.84375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.984375, "step": 2495 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 390.828125, "epoch": 1.9968, "grad_norm": 0.9870454487172972, "kl": 0.384765625, "learning_rate": 1.257262202257124e-06, "loss": 0.0154, "reward": 1.65625, "reward_std": 0.19727616012096405, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.984375, "step": 2496 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 451.09375, "epoch": 1.9976, "grad_norm": 0.7070811807566644, "kl": 0.271484375, "learning_rate": 1.2554453402039124e-06, "loss": 0.0108, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 2497 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 377.453125, "epoch": 1.9984, "grad_norm": 0.1712508918566282, "kl": 0.08544921875, "learning_rate": 1.2536293516270704e-06, "loss": 0.0034, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2498 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 393.109375, "epoch": 1.9992, "grad_norm": 0.08793411536601524, "kl": 0.09326171875, "learning_rate": 1.251814237801128e-06, "loss": 0.0037, "reward": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2499 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 351.6875, "epoch": 2.0, "grad_norm": 1.4112992865682874, "kl": 1.15625, "learning_rate": 1.2500000000000007e-06, "loss": 0.0462, "reward": 1.78125, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.96875, "step": 2500 } ], "logging_steps": 1.0, "max_steps": 3750, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }