| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.5714285714285714, |
| "eval_steps": 500, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2523.270866394043, |
| "epoch": 0.001142857142857143, |
| "grad_norm": 0.06654668599367142, |
| "kl": 0.0, |
| "lambda_div_used": 0.6164634302258492, |
| "learning_rate": 0.0, |
| "loss": -0.0258, |
| "reward": -0.14551701629534364, |
| "reward_after_mean": -0.14551701629534364, |
| "reward_after_std": 0.6225011153146625, |
| "reward_before_mean": 0.17862090840935707, |
| "reward_before_std": 0.5394803490489721, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3241379093378782, |
| "reward_change_min": -0.5330121107399464, |
| "reward_change_std": 0.19750467501580715, |
| "reward_std": 0.622501116245985, |
| "rewards/accuracy_reward": 0.2500000037252903, |
| "rewards/cosine_scaled_reward": -0.0713790925219655, |
| "step": 1 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2684.583366394043, |
| "epoch": 0.002285714285714286, |
| "grad_norm": 0.07638121396303177, |
| "kl": 0.0, |
| "lambda_div_used": 0.5910675376653671, |
| "learning_rate": 5e-08, |
| "loss": 0.001, |
| "reward": -0.07136534340679646, |
| "reward_after_mean": -0.07136534340679646, |
| "reward_after_std": 0.5026816055178642, |
| "reward_before_mean": 0.33918463438749313, |
| "reward_before_std": 0.41114553064107895, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4105500001460314, |
| "reward_change_min": -0.619081187993288, |
| "reward_change_std": 0.23483010660856962, |
| "reward_std": 0.5026816166937351, |
| "rewards/accuracy_reward": 0.29166667349636555, |
| "rewards/cosine_scaled_reward": 0.047517990693449974, |
| "step": 2 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2966.9166870117188, |
| "epoch": 0.0034285714285714284, |
| "grad_norm": 0.07826078683137894, |
| "kl": 5.805492401123047e-05, |
| "lambda_div_used": 0.5747007578611374, |
| "learning_rate": 1e-07, |
| "loss": 0.0118, |
| "reward": -0.3506452329456806, |
| "reward_after_mean": -0.3506452329456806, |
| "reward_after_std": 0.40795029513537884, |
| "reward_before_mean": -0.06986081041395664, |
| "reward_before_std": 0.3391748256981373, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.280784422531724, |
| "reward_change_min": -0.45085202157497406, |
| "reward_change_std": 0.16326917707920074, |
| "reward_std": 0.4079503044486046, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/cosine_scaled_reward": -0.1323608043603599, |
| "step": 3 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1377.000015258789, |
| "epoch": 0.004571428571428572, |
| "grad_norm": 0.10993114858865738, |
| "kl": 3.951042890548706e-05, |
| "lambda_div_used": 0.5788158848881721, |
| "learning_rate": 1.5e-07, |
| "loss": 0.0214, |
| "reward": -0.24558139964938164, |
| "reward_after_mean": -0.24558139964938164, |
| "reward_after_std": 0.3998244144022465, |
| "reward_before_mean": 0.06915237568318844, |
| "reward_before_std": 0.36195528600364923, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3147337753325701, |
| "reward_change_min": -0.4869973622262478, |
| "reward_change_std": 0.1927295122295618, |
| "reward_std": 0.3998244162648916, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.09751429967582226, |
| "step": 4 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3310.4583587646484, |
| "epoch": 0.005714285714285714, |
| "grad_norm": 0.06377790123224258, |
| "kl": 5.4255127906799316e-05, |
| "lambda_div_used": 0.61885916441679, |
| "learning_rate": 2e-07, |
| "loss": 0.0428, |
| "reward": -0.25358792673796415, |
| "reward_after_mean": -0.25358792673796415, |
| "reward_after_std": 0.5858059301972389, |
| "reward_before_mean": 0.0003254720941185951, |
| "reward_before_std": 0.5446656532585621, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.25391339510679245, |
| "reward_change_min": -0.45586229115724564, |
| "reward_change_std": 0.16313681472092867, |
| "reward_std": 0.5858059376478195, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/cosine_scaled_reward": -0.12467453628778458, |
| "step": 5 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2887.8541717529297, |
| "epoch": 0.006857142857142857, |
| "grad_norm": 0.06263996660709381, |
| "kl": 5.511939525604248e-05, |
| "lambda_div_used": 0.5606320649385452, |
| "learning_rate": 2.5e-07, |
| "loss": 0.0618, |
| "reward": -0.4456054698675871, |
| "reward_after_mean": -0.4456054698675871, |
| "reward_after_std": 0.3509902711957693, |
| "reward_before_mean": -0.1859958479180932, |
| "reward_before_std": 0.2702419590204954, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2596096061170101, |
| "reward_change_min": -0.38109203428030014, |
| "reward_change_std": 0.14125030301511288, |
| "reward_std": 0.3509902749210596, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.2276625158265233, |
| "step": 6 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2572.3542098999023, |
| "epoch": 0.008, |
| "grad_norm": 0.06695970892906189, |
| "kl": 4.0218234062194824e-05, |
| "lambda_div_used": 0.6162339821457863, |
| "learning_rate": 3e-07, |
| "loss": 0.0453, |
| "reward": -0.10290078446269035, |
| "reward_after_mean": -0.10290078446269035, |
| "reward_after_std": 0.566169198602438, |
| "reward_before_mean": 0.22045390354469419, |
| "reward_before_std": 0.5333282891660929, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.32335469499230385, |
| "reward_change_min": -0.5540991388261318, |
| "reward_change_std": 0.20453903079032898, |
| "reward_std": 0.566169211640954, |
| "rewards/accuracy_reward": 0.20833334140479565, |
| "rewards/cosine_scaled_reward": 0.012120556086301804, |
| "step": 7 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2173.7500228881836, |
| "epoch": 0.009142857142857144, |
| "grad_norm": 0.07779484242200851, |
| "kl": 3.185681998729706e-05, |
| "lambda_div_used": 0.63105358928442, |
| "learning_rate": 3.5e-07, |
| "loss": 0.0071, |
| "reward": 0.09723488846793771, |
| "reward_after_mean": 0.09723488846793771, |
| "reward_after_std": 0.6893859151750803, |
| "reward_before_mean": 0.5079077246482484, |
| "reward_before_std": 0.6077435212209821, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.410672839730978, |
| "reward_change_min": -0.638349361717701, |
| "reward_change_std": 0.2494220733642578, |
| "reward_std": 0.6893859468400478, |
| "rewards/accuracy_reward": 0.3750000074505806, |
| "rewards/cosine_scaled_reward": 0.1329077403061092, |
| "step": 8 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2828.229179382324, |
| "epoch": 0.010285714285714285, |
| "grad_norm": 0.09342571347951889, |
| "kl": 4.9620866775512695e-05, |
| "lambda_div_used": 0.6150719821453094, |
| "learning_rate": 4e-07, |
| "loss": -0.0132, |
| "reward": -0.18431122601032257, |
| "reward_after_mean": -0.18431122601032257, |
| "reward_after_std": 0.5804070886224508, |
| "reward_before_mean": 0.09753156686201692, |
| "reward_before_std": 0.5302032623440027, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2818427961319685, |
| "reward_change_min": -0.46669338643550873, |
| "reward_change_std": 0.18001185078173876, |
| "reward_std": 0.5804071053862572, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.0691350968554616, |
| "step": 9 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2469.7083587646484, |
| "epoch": 0.011428571428571429, |
| "grad_norm": 0.09024360775947571, |
| "kl": 3.8331374526023865e-05, |
| "lambda_div_used": 0.63355503231287, |
| "learning_rate": 4.5e-07, |
| "loss": -0.001, |
| "reward": -0.04430920258164406, |
| "reward_after_mean": -0.04430920258164406, |
| "reward_after_std": 0.6170141901820898, |
| "reward_before_mean": 0.2625937759876251, |
| "reward_before_std": 0.6259458847343922, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3069029748439789, |
| "reward_change_min": -0.5734602250158787, |
| "reward_change_std": 0.22167105227708817, |
| "reward_std": 0.6170141994953156, |
| "rewards/accuracy_reward": 0.2500000074505806, |
| "rewards/cosine_scaled_reward": 0.012593759223818779, |
| "step": 10 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3367.500015258789, |
| "epoch": 0.012571428571428572, |
| "grad_norm": 0.05533275753259659, |
| "kl": 5.245208740234375e-05, |
| "lambda_div_used": 0.5749180987477303, |
| "learning_rate": 5e-07, |
| "loss": -0.0467, |
| "reward": -0.35878527723252773, |
| "reward_after_mean": -0.35878527723252773, |
| "reward_after_std": 0.4030665699392557, |
| "reward_before_mean": -0.09556051343679428, |
| "reward_before_std": 0.3429161449894309, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2632247470319271, |
| "reward_change_min": -0.4096112735569477, |
| "reward_change_std": 0.1540710162371397, |
| "reward_std": 0.40306657925248146, |
| "rewards/accuracy_reward": 0.10416666977107525, |
| "rewards/cosine_scaled_reward": -0.19972719065845013, |
| "step": 11 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2016.958351135254, |
| "epoch": 0.013714285714285714, |
| "grad_norm": 0.08992662280797958, |
| "kl": 3.9637088775634766e-05, |
| "lambda_div_used": 0.6247389540076256, |
| "learning_rate": 5.5e-07, |
| "loss": 0.0429, |
| "reward": -0.10162727534770966, |
| "reward_after_mean": -0.10162727534770966, |
| "reward_after_std": 0.594907833263278, |
| "reward_before_mean": 0.19102710485458374, |
| "reward_before_std": 0.5805315412580967, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.292654387652874, |
| "reward_change_min": -0.5117304362356663, |
| "reward_change_std": 0.20085694547742605, |
| "reward_std": 0.5949078388512135, |
| "rewards/accuracy_reward": 0.22916666977107525, |
| "rewards/cosine_scaled_reward": -0.03813957702368498, |
| "step": 12 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2905.0833587646484, |
| "epoch": 0.014857142857142857, |
| "grad_norm": 0.0641385093331337, |
| "kl": 4.194676876068115e-05, |
| "lambda_div_used": 0.5766140297055244, |
| "learning_rate": 6e-07, |
| "loss": 0.0437, |
| "reward": -0.3046809285879135, |
| "reward_after_mean": -0.3046809285879135, |
| "reward_after_std": 0.399059085175395, |
| "reward_before_mean": -0.018187658861279488, |
| "reward_before_std": 0.35123275220394135, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28649328649044037, |
| "reward_change_min": -0.4807186797261238, |
| "reward_change_std": 0.17643271200358868, |
| "reward_std": 0.399059085175395, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/cosine_scaled_reward": -0.1431876514106989, |
| "step": 13 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2520.312530517578, |
| "epoch": 0.016, |
| "grad_norm": 0.07475250959396362, |
| "kl": 3.432855010032654e-05, |
| "lambda_div_used": 0.5991310104727745, |
| "learning_rate": 6.5e-07, |
| "loss": -0.0332, |
| "reward": -0.22936711832880974, |
| "reward_after_mean": -0.22936711832880974, |
| "reward_after_std": 0.5019008349627256, |
| "reward_before_mean": 0.04907496925443411, |
| "reward_before_std": 0.4585955021902919, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27844210527837276, |
| "reward_change_min": -0.4685846194624901, |
| "reward_change_std": 0.17394172679632902, |
| "reward_std": 0.5019008629024029, |
| "rewards/accuracy_reward": 0.14583333767950535, |
| "rewards/cosine_scaled_reward": -0.09675835724920034, |
| "step": 14 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2717.0416984558105, |
| "epoch": 0.017142857142857144, |
| "grad_norm": 0.08986662328243256, |
| "kl": 4.3585896492004395e-05, |
| "lambda_div_used": 0.5765259638428688, |
| "learning_rate": 7e-07, |
| "loss": 0.0355, |
| "reward": -0.16787780448794365, |
| "reward_after_mean": -0.16787780448794365, |
| "reward_after_std": 0.4601633083075285, |
| "reward_before_mean": 0.22437161579728127, |
| "reward_before_std": 0.3496173685416579, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3922494389116764, |
| "reward_change_min": -0.5602193549275398, |
| "reward_change_std": 0.2197399353608489, |
| "reward_std": 0.46016331762075424, |
| "rewards/accuracy_reward": 0.20833333395421505, |
| "rewards/cosine_scaled_reward": 0.016038289293646812, |
| "step": 15 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3395.1875, |
| "epoch": 0.018285714285714287, |
| "grad_norm": 0.05402011796832085, |
| "kl": 4.659593105316162e-05, |
| "lambda_div_used": 0.5638558194041252, |
| "learning_rate": 7.5e-07, |
| "loss": 0.032, |
| "reward": -0.432980858720839, |
| "reward_after_mean": -0.432980858720839, |
| "reward_after_std": 0.3415633924305439, |
| "reward_before_mean": -0.16444075386971235, |
| "reward_before_std": 0.2898183651268482, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.26854010485112667, |
| "reward_change_min": -0.45487307757139206, |
| "reward_change_std": 0.16323526203632355, |
| "reward_std": 0.34156340546905994, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.20610742270946503, |
| "step": 16 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2164.020854949951, |
| "epoch": 0.019428571428571427, |
| "grad_norm": 0.1185457855463028, |
| "kl": 4.2147934436798096e-05, |
| "lambda_div_used": 0.5763917565345764, |
| "learning_rate": 8e-07, |
| "loss": 0.0302, |
| "reward": -0.2335935328155756, |
| "reward_after_mean": -0.2335935328155756, |
| "reward_after_std": 0.4716028142720461, |
| "reward_before_mean": 0.13111361488699913, |
| "reward_before_std": 0.3423871146515012, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.36470715142786503, |
| "reward_change_min": -0.5119686089456081, |
| "reward_change_std": 0.19370271731168032, |
| "reward_std": 0.471602825447917, |
| "rewards/accuracy_reward": 0.18750000186264515, |
| "rewards/cosine_scaled_reward": -0.05638639978133142, |
| "step": 17 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3079.4583740234375, |
| "epoch": 0.02057142857142857, |
| "grad_norm": 0.05137661099433899, |
| "kl": 3.637373447418213e-05, |
| "lambda_div_used": 0.6073044687509537, |
| "learning_rate": 8.499999999999999e-07, |
| "loss": 0.0478, |
| "reward": -0.057250723242759705, |
| "reward_after_mean": -0.057250723242759705, |
| "reward_after_std": 0.5715843215584755, |
| "reward_before_mean": 0.3262214660644531, |
| "reward_before_std": 0.5018989769741893, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.38347217813134193, |
| "reward_change_min": -0.6260374560952187, |
| "reward_change_std": 0.24400013033300638, |
| "reward_std": 0.571584340184927, |
| "rewards/accuracy_reward": 0.2916666679084301, |
| "rewards/cosine_scaled_reward": 0.034554785932414234, |
| "step": 18 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2889.8542289733887, |
| "epoch": 0.021714285714285714, |
| "grad_norm": 0.07495337724685669, |
| "kl": 4.096329212188721e-05, |
| "lambda_div_used": 0.6219106838107109, |
| "learning_rate": 9e-07, |
| "loss": 0.036, |
| "reward": 0.1660416293889284, |
| "reward_after_mean": 0.1660416293889284, |
| "reward_after_std": 0.6783627849072218, |
| "reward_before_mean": 0.6544073540717363, |
| "reward_before_std": 0.5640754774212837, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.488365713506937, |
| "reward_change_min": -0.758969146758318, |
| "reward_change_std": 0.2900321548804641, |
| "reward_std": 0.6783628333359957, |
| "rewards/accuracy_reward": 0.45833334140479565, |
| "rewards/cosine_scaled_reward": 0.19607400865061209, |
| "step": 19 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2114.8333587646484, |
| "epoch": 0.022857142857142857, |
| "grad_norm": 0.0912039652466774, |
| "kl": 2.7514994144439697e-05, |
| "lambda_div_used": 0.6171177923679352, |
| "learning_rate": 9.499999999999999e-07, |
| "loss": -0.0338, |
| "reward": -0.12753646727651358, |
| "reward_after_mean": -0.12753646727651358, |
| "reward_after_std": 0.6479649767279625, |
| "reward_before_mean": 0.2073977841064334, |
| "reward_before_std": 0.5446468549780548, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.33493425138294697, |
| "reward_change_min": -0.5443090125918388, |
| "reward_change_std": 0.20240973494946957, |
| "reward_std": 0.6479649972170591, |
| "rewards/accuracy_reward": 0.2500000037252903, |
| "rewards/cosine_scaled_reward": -0.0426022089086473, |
| "step": 20 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2578.3125, |
| "epoch": 0.024, |
| "grad_norm": 0.1021808609366417, |
| "kl": 4.0903687477111816e-05, |
| "lambda_div_used": 0.5696183741092682, |
| "learning_rate": 1e-06, |
| "loss": 0.0243, |
| "reward": -0.16086186096072197, |
| "reward_after_mean": -0.16086186096072197, |
| "reward_after_std": 0.4245347697287798, |
| "reward_before_mean": 0.23232585442019626, |
| "reward_before_std": 0.3138282438740134, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3931876849383116, |
| "reward_change_min": -0.5524330623447895, |
| "reward_change_std": 0.2125744568184018, |
| "reward_std": 0.424534784629941, |
| "rewards/accuracy_reward": 0.25000000558793545, |
| "rewards/cosine_scaled_reward": -0.01767415925860405, |
| "step": 21 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1644.437572479248, |
| "epoch": 0.025142857142857144, |
| "grad_norm": 0.11418648809194565, |
| "kl": 2.1685846149921417e-05, |
| "lambda_div_used": 0.5976943448185921, |
| "learning_rate": 9.99931462820376e-07, |
| "loss": -0.0304, |
| "reward": -0.10828415304422379, |
| "reward_after_mean": -0.10828415304422379, |
| "reward_after_std": 0.5476059578359127, |
| "reward_before_mean": 0.2575272931717336, |
| "reward_before_std": 0.4531834872905165, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.36581144109368324, |
| "reward_change_min": -0.5498910807073116, |
| "reward_change_std": 0.21405170671641827, |
| "reward_std": 0.5476059839129448, |
| "rewards/accuracy_reward": 0.29166667349636555, |
| "rewards/cosine_scaled_reward": -0.03413938358426094, |
| "step": 22 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2188.687530517578, |
| "epoch": 0.026285714285714287, |
| "grad_norm": 0.0898994579911232, |
| "kl": 3.7088990211486816e-05, |
| "lambda_div_used": 0.6402537003159523, |
| "learning_rate": 9.997258721585931e-07, |
| "loss": -0.0037, |
| "reward": -0.13877022732049227, |
| "reward_after_mean": -0.13877022732049227, |
| "reward_after_std": 0.6969181355088949, |
| "reward_before_mean": 0.11387635703431442, |
| "reward_before_std": 0.6502415342256427, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2526465691626072, |
| "reward_change_min": -0.45175985619425774, |
| "reward_change_std": 0.1627071350812912, |
| "reward_std": 0.6969181522727013, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/cosine_scaled_reward": -0.05279031861573458, |
| "step": 23 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2394.3958740234375, |
| "epoch": 0.027428571428571427, |
| "grad_norm": 0.09361977875232697, |
| "kl": 2.0720064640045166e-05, |
| "lambda_div_used": 0.6309552267193794, |
| "learning_rate": 9.993832906395582e-07, |
| "loss": 0.0666, |
| "reward": 0.0798279655573424, |
| "reward_after_mean": 0.0798279655573424, |
| "reward_after_std": 0.6638341955840588, |
| "reward_before_mean": 0.4744788520038128, |
| "reward_before_std": 0.6099728401750326, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3946509025990963, |
| "reward_change_min": -0.6427228525280952, |
| "reward_change_std": 0.25018193013966084, |
| "reward_std": 0.6638342067599297, |
| "rewards/accuracy_reward": 0.35416667349636555, |
| "rewards/cosine_scaled_reward": 0.12031219294294715, |
| "step": 24 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2358.583366394043, |
| "epoch": 0.02857142857142857, |
| "grad_norm": 0.07725252956151962, |
| "kl": 4.092603921890259e-05, |
| "lambda_div_used": 0.6378427669405937, |
| "learning_rate": 9.989038226169207e-07, |
| "loss": -0.0557, |
| "reward": -0.15363326482474804, |
| "reward_after_mean": -0.15363326482474804, |
| "reward_after_std": 0.6477887704968452, |
| "reward_before_mean": 0.10259643197059631, |
| "reward_before_std": 0.6456613540649414, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.25622970052063465, |
| "reward_change_min": -0.5558900497853756, |
| "reward_change_std": 0.19845533184707165, |
| "reward_std": 0.6477888077497482, |
| "rewards/accuracy_reward": 0.1875000037252903, |
| "rewards/cosine_scaled_reward": -0.08490357082337141, |
| "step": 25 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2850.937530517578, |
| "epoch": 0.029714285714285714, |
| "grad_norm": 0.06806578487157822, |
| "kl": 3.5643577575683594e-05, |
| "lambda_div_used": 0.5781113430857658, |
| "learning_rate": 9.982876141412855e-07, |
| "loss": 0.005, |
| "reward": -0.25517464708536863, |
| "reward_after_mean": -0.25517464708536863, |
| "reward_after_std": 0.47686139307916164, |
| "reward_before_mean": 0.09097394905984402, |
| "reward_before_std": 0.3564971052110195, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3461485952138901, |
| "reward_change_min": -0.4805891402065754, |
| "reward_change_std": 0.18107541371136904, |
| "reward_std": 0.476861409842968, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.07569271977990866, |
| "step": 26 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2795.3750534057617, |
| "epoch": 0.030857142857142857, |
| "grad_norm": 0.0670829713344574, |
| "kl": 5.0067901611328125e-05, |
| "lambda_div_used": 0.5909813195466995, |
| "learning_rate": 9.975348529157229e-07, |
| "loss": 0.038, |
| "reward": -0.30352935567498207, |
| "reward_after_mean": -0.30352935567498207, |
| "reward_after_std": 0.4782011900097132, |
| "reward_before_mean": -0.04106062464416027, |
| "reward_before_std": 0.42168071679770947, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2624687273055315, |
| "reward_change_min": -0.441840048879385, |
| "reward_change_std": 0.16586182732135057, |
| "reward_std": 0.47820119373500347, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/cosine_scaled_reward": -0.16606062231585383, |
| "step": 27 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2662.562530517578, |
| "epoch": 0.032, |
| "grad_norm": 0.07541470974683762, |
| "kl": 4.6879053115844727e-05, |
| "lambda_div_used": 0.6455363035202026, |
| "learning_rate": 9.96645768238595e-07, |
| "loss": -0.0548, |
| "reward": -0.01765006221830845, |
| "reward_after_mean": -0.01765006221830845, |
| "reward_after_std": 0.7017391249537468, |
| "reward_before_mean": 0.26933106034994125, |
| "reward_before_std": 0.6768321208655834, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2869811188429594, |
| "reward_change_min": -0.5125514604151249, |
| "reward_change_std": 0.19719962775707245, |
| "reward_std": 0.7017391547560692, |
| "rewards/accuracy_reward": 0.29166667349636555, |
| "rewards/cosine_scaled_reward": -0.022335614077746868, |
| "step": 28 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2988.1041717529297, |
| "epoch": 0.03314285714285714, |
| "grad_norm": 0.07968341559171677, |
| "kl": 4.7832727432250977e-05, |
| "lambda_div_used": 0.5851128473877907, |
| "learning_rate": 9.956206309337066e-07, |
| "loss": -0.1054, |
| "reward": -0.38509707152843475, |
| "reward_after_mean": -0.38509707152843475, |
| "reward_after_std": 0.4501145612448454, |
| "reward_before_mean": -0.14269733056426048, |
| "reward_before_std": 0.3910446595400572, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.24239975214004517, |
| "reward_change_min": -0.4103321433067322, |
| "reward_change_std": 0.1474100910127163, |
| "reward_std": 0.4501145798712969, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/cosine_scaled_reward": -0.20519732707180083, |
| "step": 29 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2777.1042404174805, |
| "epoch": 0.03428571428571429, |
| "grad_norm": 0.0785035490989685, |
| "kl": 4.0784478187561035e-05, |
| "lambda_div_used": 0.616574227809906, |
| "learning_rate": 9.944597532678119e-07, |
| "loss": 0.0306, |
| "reward": 0.014547215774655342, |
| "reward_after_mean": 0.014547215774655342, |
| "reward_after_std": 0.6317574586719275, |
| "reward_before_mean": 0.40835407795384526, |
| "reward_before_std": 0.5376694360747933, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3938068598508835, |
| "reward_change_min": -0.5825920477509499, |
| "reward_change_std": 0.22909015510231256, |
| "reward_std": 0.631757466122508, |
| "rewards/accuracy_reward": 0.3125000037252903, |
| "rewards/cosine_scaled_reward": 0.09585406119003892, |
| "step": 30 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3100.9166870117188, |
| "epoch": 0.03542857142857143, |
| "grad_norm": 0.05876823514699936, |
| "kl": 5.622208118438721e-05, |
| "lambda_div_used": 0.5746422484517097, |
| "learning_rate": 9.931634888554935e-07, |
| "loss": -0.0301, |
| "reward": -0.373639321886003, |
| "reward_after_mean": -0.373639321886003, |
| "reward_after_std": 0.40832165256142616, |
| "reward_before_mean": -0.11681870371103287, |
| "reward_before_std": 0.3402246618643403, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2568206209689379, |
| "reward_change_min": -0.4000260457396507, |
| "reward_change_std": 0.149553001858294, |
| "reward_std": 0.4083216693252325, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/cosine_scaled_reward": -0.20015203207731247, |
| "step": 31 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2879.625030517578, |
| "epoch": 0.036571428571428574, |
| "grad_norm": 0.060988157987594604, |
| "kl": 4.863739013671875e-05, |
| "lambda_div_used": 0.583471342921257, |
| "learning_rate": 9.917322325514487e-07, |
| "loss": 0.0143, |
| "reward": -0.09911171346902847, |
| "reward_after_mean": -0.09911171346902847, |
| "reward_after_std": 0.46612042374908924, |
| "reward_before_mean": 0.2912287414073944, |
| "reward_before_std": 0.3850706424564123, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.39034045673906803, |
| "reward_change_min": -0.5949138030409813, |
| "reward_change_std": 0.23070038296282291, |
| "reward_std": 0.46612043119966984, |
| "rewards/accuracy_reward": 0.2916666716337204, |
| "rewards/cosine_scaled_reward": -0.0004379265010356903, |
| "step": 32 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3136.750030517578, |
| "epoch": 0.037714285714285714, |
| "grad_norm": 0.07551854848861694, |
| "kl": 4.787743091583252e-05, |
| "lambda_div_used": 0.6242911070585251, |
| "learning_rate": 9.901664203302124e-07, |
| "loss": -0.0446, |
| "reward": -0.06627794913947582, |
| "reward_after_mean": -0.06627794913947582, |
| "reward_after_std": 0.5915219262242317, |
| "reward_before_mean": 0.24627447500824928, |
| "reward_before_std": 0.5734489392489195, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3125524502247572, |
| "reward_change_min": -0.5326853170990944, |
| "reward_change_std": 0.2051102453842759, |
| "reward_std": 0.5915219560265541, |
| "rewards/accuracy_reward": 0.25000000931322575, |
| "rewards/cosine_scaled_reward": -0.0037255147472023964, |
| "step": 33 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2379.541702270508, |
| "epoch": 0.038857142857142854, |
| "grad_norm": 0.07498405873775482, |
| "kl": 4.60892915725708e-05, |
| "lambda_div_used": 0.6475758850574493, |
| "learning_rate": 9.88466529153356e-07, |
| "loss": 0.0814, |
| "reward": 0.054736172780394554, |
| "reward_after_mean": 0.054736172780394554, |
| "reward_after_std": 0.7529329154640436, |
| "reward_before_mean": 0.4133173357695341, |
| "reward_before_std": 0.6894109938293695, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3585811499506235, |
| "reward_change_min": -0.6234904117882252, |
| "reward_change_std": 0.23517589084804058, |
| "reward_std": 0.7529329396784306, |
| "rewards/accuracy_reward": 0.3333333358168602, |
| "rewards/cosine_scaled_reward": 0.07998399529606104, |
| "step": 34 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3113.1875610351562, |
| "epoch": 0.04, |
| "grad_norm": 0.1044503003358841, |
| "kl": 5.8710575103759766e-05, |
| "lambda_div_used": 0.5754920393228531, |
| "learning_rate": 9.866330768241983e-07, |
| "loss": -0.0058, |
| "reward": -0.36803475581109524, |
| "reward_after_mean": -0.36803475581109524, |
| "reward_after_std": 0.4120363052934408, |
| "reward_before_mean": -0.1010703444480896, |
| "reward_before_std": 0.3434353759512305, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2669644057750702, |
| "reward_change_min": -0.42853060737252235, |
| "reward_change_std": 0.1540099997073412, |
| "reward_std": 0.41203631460666656, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/cosine_scaled_reward": -0.16357035003602505, |
| "step": 35 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3350.1666870117188, |
| "epoch": 0.04114285714285714, |
| "grad_norm": 0.07969332486391068, |
| "kl": 6.110966205596924e-05, |
| "lambda_div_used": 0.5744208693504333, |
| "learning_rate": 9.846666218300807e-07, |
| "loss": -0.0514, |
| "reward": -0.4380789175629616, |
| "reward_after_mean": -0.4380789175629616, |
| "reward_after_std": 0.41310197673738003, |
| "reward_before_mean": -0.20339234871789813, |
| "reward_before_std": 0.337322598323226, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.23468656465411186, |
| "reward_change_min": -0.3448420464992523, |
| "reward_change_std": 0.12617942783981562, |
| "reward_std": 0.4131019860506058, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.24505900964140892, |
| "step": 36 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3394.3958435058594, |
| "epoch": 0.04228571428571429, |
| "grad_norm": 0.05401439592242241, |
| "kl": 4.9054622650146484e-05, |
| "lambda_div_used": 0.5599935948848724, |
| "learning_rate": 9.825677631722435e-07, |
| "loss": 0.0086, |
| "reward": -0.3875775597989559, |
| "reward_after_mean": -0.3875775597989559, |
| "reward_after_std": 0.32731432281434536, |
| "reward_before_mean": -0.10551332868635654, |
| "reward_before_std": 0.2694389373064041, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28206423483788967, |
| "reward_change_min": -0.4370834305882454, |
| "reward_change_std": 0.16284553799778223, |
| "reward_std": 0.3273143321275711, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.1888466626405716, |
| "step": 37 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3311.0833435058594, |
| "epoch": 0.04342857142857143, |
| "grad_norm": 0.05246276035904884, |
| "kl": 4.945695400238037e-05, |
| "lambda_div_used": 0.5770035535097122, |
| "learning_rate": 9.80337140183366e-07, |
| "loss": -0.0117, |
| "reward": -0.3118477761745453, |
| "reward_after_mean": -0.3118477761745453, |
| "reward_after_std": 0.38849758356809616, |
| "reward_before_mean": -0.021045896457508206, |
| "reward_before_std": 0.3480216721072793, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2908018734306097, |
| "reward_change_min": -0.44894198328256607, |
| "reward_change_std": 0.17141664400696754, |
| "reward_std": 0.3884976040571928, |
| "rewards/accuracy_reward": 0.12500000558793545, |
| "rewards/cosine_scaled_reward": -0.14604590460658073, |
| "step": 38 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2919.604179382324, |
| "epoch": 0.044571428571428574, |
| "grad_norm": 0.08766351640224457, |
| "kl": 3.2648444175720215e-05, |
| "lambda_div_used": 0.5717808604240417, |
| "learning_rate": 9.779754323328192e-07, |
| "loss": 0.032, |
| "reward": -0.14589639008045197, |
| "reward_after_mean": -0.14589639008045197, |
| "reward_after_std": 0.4250806160271168, |
| "reward_before_mean": 0.26390516571700573, |
| "reward_before_std": 0.3228347860276699, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.40980157628655434, |
| "reward_change_min": -0.5692479386925697, |
| "reward_change_std": 0.22337355464696884, |
| "reward_std": 0.42508063092827797, |
| "rewards/accuracy_reward": 0.25000000558793545, |
| "rewards/cosine_scaled_reward": 0.013905153144150972, |
| "step": 39 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2468.7916870117188, |
| "epoch": 0.045714285714285714, |
| "grad_norm": 0.07357986271381378, |
| "kl": 4.18052077293396e-05, |
| "lambda_div_used": 0.5706272348761559, |
| "learning_rate": 9.754833590196926e-07, |
| "loss": 0.0148, |
| "reward": -0.2578982161357999, |
| "reward_after_mean": -0.2578982161357999, |
| "reward_after_std": 0.4464099854230881, |
| "reward_before_mean": 0.1052470114082098, |
| "reward_before_std": 0.31700289947912097, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3631452303379774, |
| "reward_change_min": -0.5201860442757607, |
| "reward_change_std": 0.1936064399778843, |
| "reward_std": 0.4464099947363138, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.06141966814175248, |
| "step": 40 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3203.312515258789, |
| "epoch": 0.046857142857142854, |
| "grad_norm": 0.05290238931775093, |
| "kl": 5.0440430641174316e-05, |
| "lambda_div_used": 0.6480180770158768, |
| "learning_rate": 9.728616793536587e-07, |
| "loss": -0.0199, |
| "reward": -0.04493038635700941, |
| "reward_after_mean": -0.04493038635700941, |
| "reward_after_std": 0.7092454191297293, |
| "reward_before_mean": 0.23497815802693367, |
| "reward_before_std": 0.693053056485951, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2799085471779108, |
| "reward_change_min": -0.5064845345914364, |
| "reward_change_std": 0.19558908697217703, |
| "reward_std": 0.7092454340308905, |
| "rewards/accuracy_reward": 0.2291666716337204, |
| "rewards/cosine_scaled_reward": 0.005811510724015534, |
| "step": 41 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2890.9791927337646, |
| "epoch": 0.048, |
| "grad_norm": 0.09494666010141373, |
| "kl": 5.85019588470459e-05, |
| "lambda_div_used": 0.5329261645674706, |
| "learning_rate": 9.701111919237408e-07, |
| "loss": 0.0335, |
| "reward": -0.5650591850280762, |
| "reward_after_mean": -0.5650591850280762, |
| "reward_after_std": 0.225538931787014, |
| "reward_before_mean": -0.32472414150834084, |
| "reward_before_std": 0.14451794046908617, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.24033503979444504, |
| "reward_change_min": -0.358134388923645, |
| "reward_change_std": 0.12634030357003212, |
| "reward_std": 0.2255389392375946, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/cosine_scaled_reward": -0.32472414895892143, |
| "step": 42 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3027.9583587646484, |
| "epoch": 0.04914285714285714, |
| "grad_norm": 0.05899634212255478, |
| "kl": 4.252791404724121e-05, |
| "lambda_div_used": 0.595756284892559, |
| "learning_rate": 9.672327345550543e-07, |
| "loss": -0.017, |
| "reward": -0.31989429891109467, |
| "reward_after_mean": -0.31989429891109467, |
| "reward_after_std": 0.5033334400504827, |
| "reward_before_mean": -0.07388713955879211, |
| "reward_before_std": 0.43908379040658474, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.24600715190172195, |
| "reward_change_min": -0.4015818126499653, |
| "reward_change_std": 0.143897395581007, |
| "reward_std": 0.5033334512263536, |
| "rewards/accuracy_reward": 0.1041666679084301, |
| "rewards/cosine_scaled_reward": -0.1780538223683834, |
| "step": 43 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2372.5833587646484, |
| "epoch": 0.05028571428571429, |
| "grad_norm": 0.08952542394399643, |
| "kl": 4.1332095861434937e-05, |
| "lambda_div_used": 0.5768283307552338, |
| "learning_rate": 9.64227184053598e-07, |
| "loss": 0.0238, |
| "reward": -0.09475147165358067, |
| "reward_after_mean": -0.09475147165358067, |
| "reward_after_std": 0.4600023180246353, |
| "reward_before_mean": 0.33489263616502285, |
| "reward_before_std": 0.35124383692163974, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4296441040933132, |
| "reward_change_min": -0.6358921378850937, |
| "reward_change_std": 0.24556122440844774, |
| "reward_std": 0.4600023254752159, |
| "rewards/accuracy_reward": 0.2708333395421505, |
| "rewards/cosine_scaled_reward": 0.06405930174514651, |
| "step": 44 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3178.125, |
| "epoch": 0.05142857142857143, |
| "grad_norm": 0.05793697386980057, |
| "kl": 4.953145980834961e-05, |
| "lambda_div_used": 0.6092750132083893, |
| "learning_rate": 9.610954559391704e-07, |
| "loss": 0.0375, |
| "reward": -0.24964951165020466, |
| "reward_after_mean": -0.24964951165020466, |
| "reward_after_std": 0.575963044539094, |
| "reward_before_mean": 0.010031561367213726, |
| "reward_before_std": 0.5040898034349084, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.25968106649816036, |
| "reward_change_min": -0.41585399955511093, |
| "reward_change_std": 0.15605208091437817, |
| "reward_std": 0.5759630724787712, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/cosine_scaled_reward": -0.114968441426754, |
| "step": 45 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3160.895866394043, |
| "epoch": 0.052571428571428575, |
| "grad_norm": 0.06750793755054474, |
| "kl": 4.884600639343262e-05, |
| "lambda_div_used": 0.5543409436941147, |
| "learning_rate": 9.578385041664925e-07, |
| "loss": 0.0442, |
| "reward": -0.47163831628859043, |
| "reward_after_mean": -0.47163831628859043, |
| "reward_after_std": 0.32501144520938396, |
| "reward_before_mean": -0.21356616588309407, |
| "reward_before_std": 0.24316862598061562, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.25807216577231884, |
| "reward_change_min": -0.3883417770266533, |
| "reward_change_std": 0.13798328768461943, |
| "reward_std": 0.3250114619731903, |
| "rewards/accuracy_reward": 0.02083333395421505, |
| "rewards/cosine_scaled_reward": -0.23439949192106724, |
| "step": 46 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2624.270881652832, |
| "epoch": 0.053714285714285714, |
| "grad_norm": 0.10712098330259323, |
| "kl": 5.3569674491882324e-05, |
| "lambda_div_used": 0.6219898834824562, |
| "learning_rate": 9.54457320834625e-07, |
| "loss": 0.0102, |
| "reward": -0.10781855694949627, |
| "reward_after_mean": -0.10781855694949627, |
| "reward_after_std": 0.5886904541403055, |
| "reward_before_mean": 0.19626505579799414, |
| "reward_before_std": 0.5641138087958097, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3040836229920387, |
| "reward_change_min": -0.5314439944922924, |
| "reward_change_std": 0.2001748401671648, |
| "reward_std": 0.5886904690414667, |
| "rewards/accuracy_reward": 0.22916667349636555, |
| "rewards/cosine_scaled_reward": -0.03290162514895201, |
| "step": 47 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2589.458396911621, |
| "epoch": 0.054857142857142854, |
| "grad_norm": 0.07933427393436432, |
| "kl": 4.4988468289375305e-05, |
| "lambda_div_used": 0.6262567341327667, |
| "learning_rate": 9.509529358847654e-07, |
| "loss": -0.0358, |
| "reward": -0.13860276085324585, |
| "reward_after_mean": -0.13860276085324585, |
| "reward_after_std": 0.5960155855864286, |
| "reward_before_mean": 0.13786590658128262, |
| "reward_before_std": 0.5872014760971069, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2764686793088913, |
| "reward_change_min": -0.5099791921675205, |
| "reward_change_std": 0.19576375279575586, |
| "reward_std": 0.5960155855864286, |
| "rewards/accuracy_reward": 0.20833333767950535, |
| "rewards/cosine_scaled_reward": -0.07046743389219046, |
| "step": 48 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1985.2291946411133, |
| "epoch": 0.056, |
| "grad_norm": 0.12161526083946228, |
| "kl": 4.811584949493408e-05, |
| "lambda_div_used": 0.5859075263142586, |
| "learning_rate": 9.473264167865171e-07, |
| "loss": 0.0065, |
| "reward": -0.21219712868332863, |
| "reward_after_mean": -0.21219712868332863, |
| "reward_after_std": 0.439599821344018, |
| "reward_before_mean": 0.10536342021077871, |
| "reward_before_std": 0.39430073963012546, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3175605833530426, |
| "reward_change_min": -0.4657192714512348, |
| "reward_change_std": 0.19229275174438953, |
| "reward_std": 0.4395998399704695, |
| "rewards/accuracy_reward": 0.20833334140479565, |
| "rewards/cosine_scaled_reward": -0.10296990955248475, |
| "step": 49 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2880.250011444092, |
| "epoch": 0.05714285714285714, |
| "grad_norm": 0.07723147422075272, |
| "kl": 3.2588839530944824e-05, |
| "lambda_div_used": 0.5741998106241226, |
| "learning_rate": 9.43578868212728e-07, |
| "loss": 0.0182, |
| "reward": -0.0913238637149334, |
| "reward_after_mean": -0.0913238637149334, |
| "reward_after_std": 0.42474013939499855, |
| "reward_before_mean": 0.3248649761080742, |
| "reward_before_std": 0.3397149038501084, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.41618884168565273, |
| "reward_change_min": -0.6034708395600319, |
| "reward_change_std": 0.23950859624892473, |
| "reward_std": 0.4247401561588049, |
| "rewards/accuracy_reward": 0.3125000074505806, |
| "rewards/cosine_scaled_reward": 0.012364983558654785, |
| "step": 50 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2433.020854949951, |
| "epoch": 0.05828571428571429, |
| "grad_norm": 0.09581972658634186, |
| "kl": 5.4717063903808594e-05, |
| "lambda_div_used": 0.5894461125135422, |
| "learning_rate": 9.397114317029974e-07, |
| "loss": -0.0308, |
| "reward": -0.28266510320827365, |
| "reward_after_mean": -0.28266510320827365, |
| "reward_after_std": 0.44258156418800354, |
| "reward_before_mean": 0.012647990137338638, |
| "reward_before_std": 0.40818152111023664, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.29531308077275753, |
| "reward_change_min": -0.4761137217283249, |
| "reward_change_std": 0.1856073010712862, |
| "reward_std": 0.4425815735012293, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/cosine_scaled_reward": -0.15401867777109146, |
| "step": 51 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2707.250030517578, |
| "epoch": 0.05942857142857143, |
| "grad_norm": 0.14702580869197845, |
| "kl": 4.030205309391022e-05, |
| "lambda_div_used": 0.6197419166564941, |
| "learning_rate": 9.357252853159505e-07, |
| "loss": 0.0298, |
| "reward": 0.17821598052978516, |
| "reward_after_mean": 0.17821598052978516, |
| "reward_after_std": 0.6063419748097658, |
| "reward_before_mean": 0.629100788384676, |
| "reward_before_std": 0.5578461596742272, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4508847985416651, |
| "reward_change_min": -0.6935725994408131, |
| "reward_change_std": 0.2830996550619602, |
| "reward_std": 0.6063419822603464, |
| "rewards/accuracy_reward": 0.4583333432674408, |
| "rewards/cosine_scaled_reward": 0.17076744884252548, |
| "step": 52 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2799.8125610351562, |
| "epoch": 0.060571428571428575, |
| "grad_norm": 0.07566332817077637, |
| "kl": 5.0321221351623535e-05, |
| "lambda_div_used": 0.6389462202787399, |
| "learning_rate": 9.316216432703916e-07, |
| "loss": 0.0088, |
| "reward": -0.0043678306974470615, |
| "reward_after_mean": -0.0043678306974470615, |
| "reward_after_std": 0.6600480955094099, |
| "reward_before_mean": 0.31353550031781197, |
| "reward_before_std": 0.6461421558633447, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3179033286869526, |
| "reward_change_min": -0.5801984183490276, |
| "reward_change_std": 0.22101250104606152, |
| "reward_std": 0.6600481104105711, |
| "rewards/accuracy_reward": 0.29166667722165585, |
| "rewards/cosine_scaled_reward": 0.0218688128516078, |
| "step": 53 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2297.9583778381348, |
| "epoch": 0.061714285714285715, |
| "grad_norm": 0.09625467658042908, |
| "kl": 4.2378902435302734e-05, |
| "lambda_div_used": 0.6194662973284721, |
| "learning_rate": 9.274017555754407e-07, |
| "loss": 0.0224, |
| "reward": 0.0802488662302494, |
| "reward_after_mean": 0.0802488662302494, |
| "reward_after_std": 0.6323203574866056, |
| "reward_before_mean": 0.4949483387172222, |
| "reward_before_std": 0.5504263024777174, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.41469944082200527, |
| "reward_change_min": -0.6165112145245075, |
| "reward_change_std": 0.2416615542024374, |
| "reward_std": 0.632320374250412, |
| "rewards/accuracy_reward": 0.3541666753590107, |
| "rewards/cosine_scaled_reward": 0.1407816605642438, |
| "step": 54 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2896.416702270508, |
| "epoch": 0.06285714285714286, |
| "grad_norm": 0.05907125398516655, |
| "kl": 3.5256147384643555e-05, |
| "lambda_div_used": 0.602072462439537, |
| "learning_rate": 9.230669076497687e-07, |
| "loss": 0.0467, |
| "reward": -0.13429339788854122, |
| "reward_after_mean": -0.13429339788854122, |
| "reward_after_std": 0.57946902140975, |
| "reward_before_mean": 0.22700263070873916, |
| "reward_before_std": 0.4767341245897114, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.36129603534936905, |
| "reward_change_min": -0.5782174952328205, |
| "reward_change_std": 0.21661545429378748, |
| "reward_std": 0.5794690307229757, |
| "rewards/accuracy_reward": 0.25000000186264515, |
| "rewards/cosine_scaled_reward": -0.02299738209694624, |
| "step": 55 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3075.187545776367, |
| "epoch": 0.064, |
| "grad_norm": 0.06310205906629562, |
| "kl": 4.1544437408447266e-05, |
| "lambda_div_used": 0.5549901723861694, |
| "learning_rate": 9.186184199300463e-07, |
| "loss": -0.0108, |
| "reward": -0.4740895018912852, |
| "reward_after_mean": -0.4740895018912852, |
| "reward_after_std": 0.32212498411536217, |
| "reward_before_mean": -0.2245362438261509, |
| "reward_before_std": 0.24502124171704054, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.24955323711037636, |
| "reward_change_min": -0.3650141842663288, |
| "reward_change_std": 0.13405024446547031, |
| "reward_std": 0.3221249897032976, |
| "rewards/accuracy_reward": 0.02083333395421505, |
| "rewards/cosine_scaled_reward": -0.24536957405507565, |
| "step": 56 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3013.8541870117188, |
| "epoch": 0.06514285714285714, |
| "grad_norm": 0.052018001675605774, |
| "kl": 3.096461296081543e-05, |
| "lambda_div_used": 0.5674270242452621, |
| "learning_rate": 9.140576474687263e-07, |
| "loss": 0.0192, |
| "reward": -0.45193217881023884, |
| "reward_after_mean": -0.45193217881023884, |
| "reward_after_std": 0.37017360515892506, |
| "reward_before_mean": -0.21255622059106827, |
| "reward_before_std": 0.3090948835015297, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.23937593773007393, |
| "reward_change_min": -0.42212119325995445, |
| "reward_change_std": 0.14615233521908522, |
| "reward_std": 0.3701736144721508, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.2542228833772242, |
| "step": 57 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2257.8750534057617, |
| "epoch": 0.06628571428571428, |
| "grad_norm": 0.09038142114877701, |
| "kl": 3.3486634492874146e-05, |
| "lambda_div_used": 0.6153116375207901, |
| "learning_rate": 9.093859795212817e-07, |
| "loss": 0.049, |
| "reward": -0.07369695231318474, |
| "reward_after_mean": -0.07369695231318474, |
| "reward_after_std": 0.5549236796796322, |
| "reward_before_mean": 0.2532845102250576, |
| "reward_before_std": 0.5286985114216805, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3269814867526293, |
| "reward_change_min": -0.537376407533884, |
| "reward_change_std": 0.20782853197306395, |
| "reward_std": 0.5549236983060837, |
| "rewards/accuracy_reward": 0.27083334513008595, |
| "rewards/cosine_scaled_reward": -0.017548808827996254, |
| "step": 58 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2807.6250381469727, |
| "epoch": 0.06742857142857143, |
| "grad_norm": 0.06863339245319366, |
| "kl": 3.577768802642822e-05, |
| "lambda_div_used": 0.6041718497872353, |
| "learning_rate": 9.046048391230247e-07, |
| "loss": -0.005, |
| "reward": -0.25026911310851574, |
| "reward_after_mean": -0.25026911310851574, |
| "reward_after_std": 0.5205795764923096, |
| "reward_before_mean": 0.026796480640769005, |
| "reward_before_std": 0.4837551396340132, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27706561237573624, |
| "reward_change_min": -0.5239802338182926, |
| "reward_change_std": 0.18853904772549868, |
| "reward_std": 0.5205795876681805, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.11903684702701867, |
| "step": 59 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2797.5833740234375, |
| "epoch": 0.06857142857142857, |
| "grad_norm": 0.07598927617073059, |
| "kl": 3.738701343536377e-05, |
| "lambda_div_used": 0.6269034072756767, |
| "learning_rate": 8.997156826556369e-07, |
| "loss": 0.0313, |
| "reward": -0.2369612492620945, |
| "reward_after_mean": -0.2369612492620945, |
| "reward_after_std": 0.6222166065126657, |
| "reward_before_mean": 0.005125788040459156, |
| "reward_before_std": 0.5912308068946004, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.24208704754710197, |
| "reward_change_min": -0.48471353203058243, |
| "reward_change_std": 0.17278119549155235, |
| "reward_std": 0.6222166288644075, |
| "rewards/accuracy_reward": 0.14583333767950535, |
| "rewards/cosine_scaled_reward": -0.14070754405111074, |
| "step": 60 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2917.812515258789, |
| "epoch": 0.06971428571428571, |
| "grad_norm": 0.06223488971590996, |
| "kl": 3.0603259801864624e-05, |
| "lambda_div_used": 0.6177623867988586, |
| "learning_rate": 8.9471999940354e-07, |
| "loss": 0.0259, |
| "reward": -0.1357121616601944, |
| "reward_after_mean": -0.1357121616601944, |
| "reward_after_std": 0.5694319568574429, |
| "reward_before_mean": 0.16569078899919987, |
| "reward_before_std": 0.5441586868837476, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3014029450714588, |
| "reward_change_min": -0.5209132842719555, |
| "reward_change_std": 0.20053375512361526, |
| "reward_std": 0.5694319736212492, |
| "rewards/accuracy_reward": 0.20833333767950535, |
| "rewards/cosine_scaled_reward": -0.042642548214644194, |
| "step": 61 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2507.8958854675293, |
| "epoch": 0.07085714285714285, |
| "grad_norm": 0.0747612789273262, |
| "kl": 2.6777386665344238e-05, |
| "lambda_div_used": 0.61732517182827, |
| "learning_rate": 8.896193111002475e-07, |
| "loss": 0.0366, |
| "reward": 0.010478481650352478, |
| "reward_after_mean": 0.010478481650352478, |
| "reward_after_std": 0.6334415413439274, |
| "reward_before_mean": 0.4018659461289644, |
| "reward_before_std": 0.5490978918969631, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.39138743840157986, |
| "reward_change_min": -0.6315983533859253, |
| "reward_change_std": 0.24404066987335682, |
| "reward_std": 0.6334415581077337, |
| "rewards/accuracy_reward": 0.33333333767950535, |
| "rewards/cosine_scaled_reward": 0.06853258889168501, |
| "step": 62 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1902.5000228881836, |
| "epoch": 0.072, |
| "grad_norm": 0.07997458428144455, |
| "kl": 2.5197863578796387e-05, |
| "lambda_div_used": 0.6046535074710846, |
| "learning_rate": 8.844151714648274e-07, |
| "loss": 0.059, |
| "reward": 0.0217137411236763, |
| "reward_after_mean": 0.0217137411236763, |
| "reward_after_std": 0.5547807831317186, |
| "reward_before_mean": 0.42593196779489517, |
| "reward_before_std": 0.49076249497011304, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4042182229459286, |
| "reward_change_min": -0.6314646527171135, |
| "reward_change_std": 0.25159123074263334, |
| "reward_std": 0.5547807849943638, |
| "rewards/accuracy_reward": 0.3541666716337204, |
| "rewards/cosine_scaled_reward": 0.07176528126001358, |
| "step": 63 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2790.979217529297, |
| "epoch": 0.07314285714285715, |
| "grad_norm": 0.07697053253650665, |
| "kl": 4.419684410095215e-05, |
| "lambda_div_used": 0.6203151121735573, |
| "learning_rate": 8.791091657286267e-07, |
| "loss": 0.061, |
| "reward": -0.1100537832826376, |
| "reward_after_mean": -0.1100537832826376, |
| "reward_after_std": 0.5783387050032616, |
| "reward_before_mean": 0.19534806534647942, |
| "reward_before_std": 0.5548522733151913, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3054018337279558, |
| "reward_change_min": -0.5362299680709839, |
| "reward_change_std": 0.2037976048886776, |
| "reward_std": 0.5783387236297131, |
| "rewards/accuracy_reward": 0.20833333767950535, |
| "rewards/cosine_scaled_reward": -0.012985273322556168, |
| "step": 64 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2605.2708587646484, |
| "epoch": 0.07428571428571429, |
| "grad_norm": 0.0772913321852684, |
| "kl": 3.089010715484619e-05, |
| "lambda_div_used": 0.5563548430800438, |
| "learning_rate": 8.737029101523929e-07, |
| "loss": -0.0861, |
| "reward": -0.33125742711126804, |
| "reward_after_mean": -0.33125742711126804, |
| "reward_after_std": 0.3626741226762533, |
| "reward_before_mean": 0.005530592054128647, |
| "reward_before_std": 0.2566492212936282, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.33678802102804184, |
| "reward_change_min": -0.5241431556642056, |
| "reward_change_std": 0.18918895348906517, |
| "reward_std": 0.36267412453889847, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.161136067006737, |
| "step": 65 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2088.562511444092, |
| "epoch": 0.07542857142857143, |
| "grad_norm": 0.10951534658670425, |
| "kl": 2.690870314836502e-05, |
| "lambda_div_used": 0.5969364494085312, |
| "learning_rate": 8.681980515339463e-07, |
| "loss": 0.0287, |
| "reward": 0.012385480105876923, |
| "reward_after_mean": 0.012385480105876923, |
| "reward_after_std": 0.5128760654479265, |
| "reward_before_mean": 0.44090043008327484, |
| "reward_before_std": 0.4437296399846673, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4285149369388819, |
| "reward_change_min": -0.660201895982027, |
| "reward_change_std": 0.25701938942074776, |
| "reward_std": 0.512876084074378, |
| "rewards/accuracy_reward": 0.3750000111758709, |
| "rewards/cosine_scaled_reward": 0.0659004095941782, |
| "step": 66 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3402.8958740234375, |
| "epoch": 0.07657142857142857, |
| "grad_norm": 0.047973256558179855, |
| "kl": 3.771483898162842e-05, |
| "lambda_div_used": 0.5712955147027969, |
| "learning_rate": 8.625962667065487e-07, |
| "loss": -0.0027, |
| "reward": -0.4502652711234987, |
| "reward_after_mean": -0.4502652711234987, |
| "reward_after_std": 0.39950996078550816, |
| "reward_before_mean": -0.22067994717508554, |
| "reward_before_std": 0.3211147477850318, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.229585325345397, |
| "reward_change_min": -0.3470626436173916, |
| "reward_change_std": 0.12443333957344294, |
| "reward_std": 0.39950997941195965, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.26234661415219307, |
| "step": 67 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1642.9166984558105, |
| "epoch": 0.07771428571428571, |
| "grad_norm": 0.11412619799375534, |
| "kl": 2.551823854446411e-05, |
| "lambda_div_used": 0.6189040914177895, |
| "learning_rate": 8.568992620281243e-07, |
| "loss": -0.0912, |
| "reward": -0.22925877509987913, |
| "reward_after_mean": -0.22925877509987913, |
| "reward_after_std": 0.6055262424051762, |
| "reward_before_mean": 0.03262739907950163, |
| "reward_before_std": 0.5506744375452399, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2618861813098192, |
| "reward_change_min": -0.4676270857453346, |
| "reward_change_std": 0.16872322466224432, |
| "reward_std": 0.6055262610316277, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/cosine_scaled_reward": -0.09237260185182095, |
| "step": 68 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2018.6666946411133, |
| "epoch": 0.07885714285714286, |
| "grad_norm": 0.11094633489847183, |
| "kl": 4.1466206312179565e-05, |
| "lambda_div_used": 0.5734386518597603, |
| "learning_rate": 8.511087728614862e-07, |
| "loss": -0.0618, |
| "reward": -0.4316765144467354, |
| "reward_after_mean": -0.4316765144467354, |
| "reward_after_std": 0.4006939698010683, |
| "reward_before_mean": -0.1882182292174548, |
| "reward_before_std": 0.331131674349308, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.24345828033983707, |
| "reward_change_min": -0.3763142116367817, |
| "reward_change_std": 0.13501812983304262, |
| "reward_std": 0.40069398283958435, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.22988490015268326, |
| "step": 69 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2860.8958587646484, |
| "epoch": 0.08, |
| "grad_norm": 0.06797178089618683, |
| "kl": 4.0102750062942505e-05, |
| "lambda_div_used": 0.6089852601289749, |
| "learning_rate": 8.452265630457282e-07, |
| "loss": 0.0416, |
| "reward": -0.16451344639062881, |
| "reward_after_mean": -0.16451344639062881, |
| "reward_after_std": 0.5175492316484451, |
| "reward_before_mean": 0.1350960824638605, |
| "reward_before_std": 0.5058744940906763, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2996095381677151, |
| "reward_change_min": -0.5449608005583286, |
| "reward_change_std": 0.2059446070343256, |
| "reward_std": 0.5175492409616709, |
| "rewards/accuracy_reward": 0.1875000037252903, |
| "rewards/cosine_scaled_reward": -0.05240389332175255, |
| "step": 70 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2700.2708587646484, |
| "epoch": 0.08114285714285714, |
| "grad_norm": 0.07971946895122528, |
| "kl": 4.7519803047180176e-05, |
| "lambda_div_used": 0.6114047914743423, |
| "learning_rate": 8.392544243589427e-07, |
| "loss": 0.0248, |
| "reward": -0.17357509583234787, |
| "reward_after_mean": -0.17357509583234787, |
| "reward_after_std": 0.5260436423122883, |
| "reward_before_mean": 0.1154884397983551, |
| "reward_before_std": 0.5224206217098981, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2890635374933481, |
| "reward_change_min": -0.5293963067233562, |
| "reward_change_std": 0.20269155222922564, |
| "reward_std": 0.5260436479002237, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.051178233698010445, |
| "step": 71 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2651.1250610351562, |
| "epoch": 0.08228571428571428, |
| "grad_norm": 0.1173771321773529, |
| "kl": 5.626678466796875e-05, |
| "lambda_div_used": 0.5625430718064308, |
| "learning_rate": 8.331941759724268e-07, |
| "loss": -0.0294, |
| "reward": -0.4364648088812828, |
| "reward_after_mean": -0.4364648088812828, |
| "reward_after_std": 0.3423657324165106, |
| "reward_before_mean": -0.17360325902700424, |
| "reward_before_std": 0.28492841869592667, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2628615368157625, |
| "reward_change_min": -0.44537778943777084, |
| "reward_change_std": 0.15708798076957464, |
| "reward_std": 0.3423657491803169, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.2152699390426278, |
| "step": 72 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3339.9583740234375, |
| "epoch": 0.08342857142857144, |
| "grad_norm": 0.04790791496634483, |
| "kl": 4.526972770690918e-05, |
| "lambda_div_used": 0.6265770718455315, |
| "learning_rate": 8.270476638965461e-07, |
| "loss": 0.0626, |
| "reward": -0.12956268154084682, |
| "reward_after_mean": -0.12956268154084682, |
| "reward_after_std": 0.6143265012651682, |
| "reward_before_mean": 0.15906396182253957, |
| "reward_before_std": 0.5886987801641226, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.288626654073596, |
| "reward_change_min": -0.5101466961205006, |
| "reward_change_std": 0.1920691430568695, |
| "reward_std": 0.6143265273422003, |
| "rewards/accuracy_reward": 0.20833333767950535, |
| "rewards/cosine_scaled_reward": -0.049269367940723896, |
| "step": 73 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2385.7500381469727, |
| "epoch": 0.08457142857142858, |
| "grad_norm": 0.06600002944469452, |
| "kl": 3.542378544807434e-05, |
| "lambda_div_used": 0.6220900639891624, |
| "learning_rate": 8.208167604184217e-07, |
| "loss": -0.0312, |
| "reward": -0.0998319232603535, |
| "reward_after_mean": -0.0998319232603535, |
| "reward_after_std": 0.5906735248863697, |
| "reward_before_mean": 0.20307225361466408, |
| "reward_before_std": 0.566557977348566, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.30290416814386845, |
| "reward_change_min": -0.5291841961443424, |
| "reward_change_std": 0.20113296527415514, |
| "reward_std": 0.5906735453754663, |
| "rewards/accuracy_reward": 0.20833333767950535, |
| "rewards/cosine_scaled_reward": -0.005261080645141192, |
| "step": 74 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2732.458335876465, |
| "epoch": 0.08571428571428572, |
| "grad_norm": 0.059586018323898315, |
| "kl": 3.4804921597242355e-05, |
| "lambda_div_used": 0.5497717335820198, |
| "learning_rate": 8.145033635316128e-07, |
| "loss": -0.026, |
| "reward": -0.16143636964261532, |
| "reward_after_mean": -0.16143636964261532, |
| "reward_after_std": 0.41724423691630363, |
| "reward_before_mean": 0.31992355175316334, |
| "reward_before_std": 0.22394374571740627, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4813598971813917, |
| "reward_change_min": -0.6314556114375591, |
| "reward_change_std": 0.24685372970998287, |
| "reward_std": 0.4172442499548197, |
| "rewards/accuracy_reward": 0.27083333395421505, |
| "rewards/cosine_scaled_reward": 0.0490901879966259, |
| "step": 75 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2649.9791870117188, |
| "epoch": 0.08685714285714285, |
| "grad_norm": 0.07793173938989639, |
| "kl": 3.8176774978637695e-05, |
| "lambda_div_used": 0.5533522665500641, |
| "learning_rate": 8.081093963579707e-07, |
| "loss": 0.0329, |
| "reward": -0.4524771338328719, |
| "reward_after_mean": -0.4524771338328719, |
| "reward_after_std": 0.32186589390039444, |
| "reward_before_mean": -0.18054483737796545, |
| "reward_before_std": 0.23863591719418764, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2719323057681322, |
| "reward_change_min": -0.402019876986742, |
| "reward_change_std": 0.14489794615656137, |
| "reward_std": 0.32186589762568474, |
| "rewards/accuracy_reward": 0.02083333395421505, |
| "rewards/cosine_scaled_reward": -0.20137817226350307, |
| "step": 76 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2847.8333587646484, |
| "epoch": 0.088, |
| "grad_norm": 0.06544006615877151, |
| "kl": 4.89354133605957e-05, |
| "lambda_div_used": 0.5558431893587112, |
| "learning_rate": 8.01636806561836e-07, |
| "loss": -0.0184, |
| "reward": -0.46523131616413593, |
| "reward_after_mean": -0.46523131616413593, |
| "reward_after_std": 0.3304946720600128, |
| "reward_before_mean": -0.20099198445677757, |
| "reward_before_std": 0.2519041560590267, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.26423933170735836, |
| "reward_change_min": -0.37857379391789436, |
| "reward_change_std": 0.140173084102571, |
| "reward_std": 0.3304946757853031, |
| "rewards/accuracy_reward": 0.02083333395421505, |
| "rewards/cosine_scaled_reward": -0.22182531282305717, |
| "step": 77 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3269.8333740234375, |
| "epoch": 0.08914285714285715, |
| "grad_norm": 0.05014213174581528, |
| "kl": 3.695487976074219e-05, |
| "lambda_div_used": 0.6150171384215355, |
| "learning_rate": 7.950875657567621e-07, |
| "loss": -0.0153, |
| "reward": -0.23963370453566313, |
| "reward_after_mean": -0.23963370453566313, |
| "reward_after_std": 0.5818531475961208, |
| "reward_before_mean": 0.01525909942574799, |
| "reward_before_std": 0.5286716222763062, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2548927925527096, |
| "reward_change_min": -0.411530327051878, |
| "reward_change_std": 0.1550313262268901, |
| "reward_std": 0.5818531513214111, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/cosine_scaled_reward": -0.10974090336821973, |
| "step": 78 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2262.062511444092, |
| "epoch": 0.09028571428571429, |
| "grad_norm": 0.06913956999778748, |
| "kl": 3.7103891372680664e-05, |
| "lambda_div_used": 0.5935603529214859, |
| "learning_rate": 7.884636689049422e-07, |
| "loss": 0.0312, |
| "reward": -0.23656679573468864, |
| "reward_after_mean": -0.23656679573468864, |
| "reward_after_std": 0.5612265523523092, |
| "reward_before_mean": 0.08522074297070503, |
| "reward_before_std": 0.4243400124832988, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3217875510454178, |
| "reward_change_min": -0.4353032112121582, |
| "reward_change_std": 0.16354462038725615, |
| "reward_std": 0.5612265765666962, |
| "rewards/accuracy_reward": 0.18750000186264515, |
| "rewards/cosine_scaled_reward": -0.10227925609797239, |
| "step": 79 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3292.3333587646484, |
| "epoch": 0.09142857142857143, |
| "grad_norm": 0.0538838729262352, |
| "kl": 4.382804036140442e-05, |
| "lambda_div_used": 0.5688970535993576, |
| "learning_rate": 7.817671337095244e-07, |
| "loss": 0.0288, |
| "reward": -0.3089722655713558, |
| "reward_after_mean": -0.3089722655713558, |
| "reward_after_std": 0.3733787778764963, |
| "reward_before_mean": 0.0013715587556362152, |
| "reward_before_std": 0.30914535373449326, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.31034383550286293, |
| "reward_change_min": -0.46327832341194153, |
| "reward_change_std": 0.17456937301903963, |
| "reward_std": 0.3733787890523672, |
| "rewards/accuracy_reward": 0.12500000558793545, |
| "rewards/cosine_scaled_reward": -0.12362844927702099, |
| "step": 80 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3043.270866394043, |
| "epoch": 0.09257142857142857, |
| "grad_norm": 0.08238786458969116, |
| "kl": 5.1021575927734375e-05, |
| "lambda_div_used": 0.5923707559704781, |
| "learning_rate": 7.75e-07, |
| "loss": 0.0152, |
| "reward": -0.26676796935498714, |
| "reward_after_mean": -0.26676796935498714, |
| "reward_after_std": 0.4838532619178295, |
| "reward_before_mean": 0.01490369625389576, |
| "reward_before_std": 0.4254928780719638, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28167167864739895, |
| "reward_change_min": -0.4777062237262726, |
| "reward_change_std": 0.17371418420225382, |
| "reward_std": 0.4838532619178295, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/cosine_scaled_reward": -0.11009630188345909, |
| "step": 81 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2630.625030517578, |
| "epoch": 0.09371428571428571, |
| "grad_norm": 0.06656210124492645, |
| "kl": 4.544854164123535e-05, |
| "lambda_div_used": 0.6220841184258461, |
| "learning_rate": 7.681643291108517e-07, |
| "loss": 0.0658, |
| "reward": -0.2009956305846572, |
| "reward_after_mean": -0.2009956305846572, |
| "reward_after_std": 0.6119217481464148, |
| "reward_before_mean": 0.05983926076442003, |
| "reward_before_std": 0.5658847019076347, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2608348857611418, |
| "reward_change_min": -0.45365019887685776, |
| "reward_change_std": 0.16699018515646458, |
| "reward_std": 0.6119217481464148, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.08599409856833518, |
| "step": 82 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2535.208381652832, |
| "epoch": 0.09485714285714286, |
| "grad_norm": 0.09338229149580002, |
| "kl": 5.167722702026367e-05, |
| "lambda_div_used": 0.5914888307452202, |
| "learning_rate": 7.612622032536507e-07, |
| "loss": 0.0644, |
| "reward": -0.14509151689708233, |
| "reward_after_mean": -0.14509151689708233, |
| "reward_after_std": 0.47357647120952606, |
| "reward_before_mean": 0.1998078590258956, |
| "reward_before_std": 0.4206119291484356, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3448993805795908, |
| "reward_change_min": -0.5211166813969612, |
| "reward_change_std": 0.20530468598008156, |
| "reward_std": 0.4735764730721712, |
| "rewards/accuracy_reward": 0.18750000558793545, |
| "rewards/cosine_scaled_reward": 0.012307855300605297, |
| "step": 83 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2915.8750076293945, |
| "epoch": 0.096, |
| "grad_norm": 0.07025005668401718, |
| "kl": 4.2825937271118164e-05, |
| "lambda_div_used": 0.6353590413928032, |
| "learning_rate": 7.54295724882796e-07, |
| "loss": -0.0466, |
| "reward": -0.1503741154447198, |
| "reward_after_mean": -0.1503741154447198, |
| "reward_after_std": 0.6686306204646826, |
| "reward_before_mean": 0.10233869170770049, |
| "reward_before_std": 0.6240420090034604, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.25271278992295265, |
| "reward_change_min": -0.43359045311808586, |
| "reward_change_std": 0.1594225950539112, |
| "reward_std": 0.6686306446790695, |
| "rewards/accuracy_reward": 0.20833334140479565, |
| "rewards/cosine_scaled_reward": -0.10599466459825635, |
| "step": 84 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2968.9375610351562, |
| "epoch": 0.09714285714285714, |
| "grad_norm": 0.05030859634280205, |
| "kl": 2.9653310775756836e-05, |
| "lambda_div_used": 0.6120947226881981, |
| "learning_rate": 7.472670160550848e-07, |
| "loss": 0.02, |
| "reward": -0.31006562570109963, |
| "reward_after_mean": -0.31006562570109963, |
| "reward_after_std": 0.5445140562951565, |
| "reward_before_mean": -0.0690727960318327, |
| "reward_before_std": 0.5210688021034002, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2409928422421217, |
| "reward_change_min": -0.46906592324376106, |
| "reward_change_std": 0.1683354014530778, |
| "reward_std": 0.5445140581578016, |
| "rewards/accuracy_reward": 0.10416666977107525, |
| "rewards/cosine_scaled_reward": -0.17323945881798863, |
| "step": 85 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2726.9167098999023, |
| "epoch": 0.09828571428571428, |
| "grad_norm": 0.08248579502105713, |
| "kl": 5.6862831115722656e-05, |
| "lambda_div_used": 0.5853299722075462, |
| "learning_rate": 7.401782177833147e-07, |
| "loss": -0.015, |
| "reward": -0.32714659720659256, |
| "reward_after_mean": -0.32714659720659256, |
| "reward_after_std": 0.4505470525473356, |
| "reward_before_mean": -0.05685961013659835, |
| "reward_before_std": 0.39618763769976795, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2702869772911072, |
| "reward_change_min": -0.49061961844563484, |
| "reward_change_std": 0.17353465128690004, |
| "reward_std": 0.4505470544099808, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.14019294315949082, |
| "step": 86 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2504.604232788086, |
| "epoch": 0.09942857142857142, |
| "grad_norm": 0.07946456968784332, |
| "kl": 5.137920379638672e-05, |
| "lambda_div_used": 0.5860441103577614, |
| "learning_rate": 7.330314893841101e-07, |
| "loss": -0.0658, |
| "reward": -0.25398197025060654, |
| "reward_after_mean": -0.25398197025060654, |
| "reward_after_std": 0.4263457953929901, |
| "reward_before_mean": 0.039198137819767, |
| "reward_before_std": 0.4029952948912978, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2931801360100508, |
| "reward_change_min": -0.46910280734300613, |
| "reward_change_std": 0.18813357036560774, |
| "reward_std": 0.4263457991182804, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.1066351905465126, |
| "step": 87 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2021.3958702087402, |
| "epoch": 0.10057142857142858, |
| "grad_norm": 0.08914105594158173, |
| "kl": 3.445148468017578e-05, |
| "lambda_div_used": 0.6275195479393005, |
| "learning_rate": 7.258290078201731e-07, |
| "loss": 0.0661, |
| "reward": -0.03901347843930125, |
| "reward_after_mean": -0.03901347843930125, |
| "reward_after_std": 0.6304188724607229, |
| "reward_before_mean": 0.28059265296906233, |
| "reward_before_std": 0.5838784109801054, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3196061383932829, |
| "reward_change_min": -0.4850176088511944, |
| "reward_change_std": 0.19270309899002314, |
| "reward_std": 0.6304188761860132, |
| "rewards/accuracy_reward": 0.25000000931322575, |
| "rewards/cosine_scaled_reward": 0.030592639930546284, |
| "step": 88 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2855.666702270508, |
| "epoch": 0.10171428571428572, |
| "grad_norm": 0.052225928753614426, |
| "kl": 3.810226917266846e-05, |
| "lambda_div_used": 0.603381521999836, |
| "learning_rate": 7.185729670371604e-07, |
| "loss": -0.0748, |
| "reward": -0.2931769546121359, |
| "reward_after_mean": -0.2931769546121359, |
| "reward_after_std": 0.5186727736145258, |
| "reward_before_mean": -0.04963091528043151, |
| "reward_before_std": 0.4795092437416315, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2435460388660431, |
| "reward_change_min": -0.4475689232349396, |
| "reward_change_std": 0.1609211042523384, |
| "reward_std": 0.5186727829277515, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/cosine_scaled_reward": -0.17463092133402824, |
| "step": 89 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2377.9375228881836, |
| "epoch": 0.10285714285714286, |
| "grad_norm": 0.1115579754114151, |
| "kl": 6.511807441711426e-05, |
| "lambda_div_used": 0.5806571692228317, |
| "learning_rate": 7.11265577295385e-07, |
| "loss": 0.1049, |
| "reward": -0.39063363266177475, |
| "reward_after_mean": -0.39063363266177475, |
| "reward_after_std": 0.42313366010785103, |
| "reward_before_mean": -0.14117828011512756, |
| "reward_before_std": 0.3631622865796089, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.24945535697042942, |
| "reward_change_min": -0.38458868488669395, |
| "reward_change_std": 0.14130910206586123, |
| "reward_std": 0.4231336638331413, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.1828449461609125, |
| "step": 90 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2758.250015258789, |
| "epoch": 0.104, |
| "grad_norm": 0.0690622553229332, |
| "kl": 4.595518112182617e-05, |
| "lambda_div_used": 0.5834402665495872, |
| "learning_rate": 7.039090644965509e-07, |
| "loss": 0.0142, |
| "reward": -0.1535566644743085, |
| "reward_after_mean": -0.1535566644743085, |
| "reward_after_std": 0.4708296097815037, |
| "reward_before_mean": 0.22540673054754734, |
| "reward_before_std": 0.3833985608071089, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3789634220302105, |
| "reward_change_min": -0.6174517869949341, |
| "reward_change_std": 0.2261828240007162, |
| "reward_std": 0.4708296228200197, |
| "rewards/accuracy_reward": 0.2291666679084301, |
| "rewards/cosine_scaled_reward": -0.003759911749511957, |
| "step": 91 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2467.2083892822266, |
| "epoch": 0.10514285714285715, |
| "grad_norm": 0.07518215477466583, |
| "kl": 3.6522746086120605e-05, |
| "lambda_div_used": 0.5778974890708923, |
| "learning_rate": 6.965056695057204e-07, |
| "loss": 0.0134, |
| "reward": -0.324931837618351, |
| "reward_after_mean": -0.324931837618351, |
| "reward_after_std": 0.42975695990025997, |
| "reward_before_mean": -0.03748153988271952, |
| "reward_before_std": 0.35468481201678514, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2874503042548895, |
| "reward_change_min": -0.4382200054824352, |
| "reward_change_std": 0.1673103515058756, |
| "reward_std": 0.42975698225200176, |
| "rewards/accuracy_reward": 0.1041666679084301, |
| "rewards/cosine_scaled_reward": -0.14164821058511734, |
| "step": 92 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3584.0, |
| "epoch": 0.10628571428571429, |
| "grad_norm": 0.05293423309922218, |
| "kl": 5.128979682922363e-05, |
| "lambda_div_used": 0.5581478402018547, |
| "learning_rate": 6.890576474687263e-07, |
| "loss": -0.0, |
| "reward": -0.4109080731868744, |
| "reward_after_mean": -0.4109080731868744, |
| "reward_after_std": 0.33292335644364357, |
| "reward_before_mean": -0.12180843483656645, |
| "reward_before_std": 0.26414576172828674, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28909964114427567, |
| "reward_change_min": -0.45860791578888893, |
| "reward_change_std": 0.16918409056961536, |
| "reward_std": 0.332923362031579, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.16347510018385947, |
| "step": 93 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2337.0625076293945, |
| "epoch": 0.10742857142857143, |
| "grad_norm": 0.08120116591453552, |
| "kl": 4.356354475021362e-05, |
| "lambda_div_used": 0.5623572915792465, |
| "learning_rate": 6.815672671252315e-07, |
| "loss": 0.0672, |
| "reward": -0.2148991823196411, |
| "reward_after_mean": -0.2148991823196411, |
| "reward_after_std": 0.37764647975564003, |
| "reward_before_mean": 0.17423074319958687, |
| "reward_before_std": 0.2815079055726528, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3891299031674862, |
| "reward_change_min": -0.5794133953750134, |
| "reward_change_std": 0.21886237617582083, |
| "reward_std": 0.3776464983820915, |
| "rewards/accuracy_reward": 0.2083333358168602, |
| "rewards/cosine_scaled_reward": -0.034102603793144226, |
| "step": 94 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3268.7708587646484, |
| "epoch": 0.10857142857142857, |
| "grad_norm": 0.048412173986434937, |
| "kl": 3.3482909202575684e-05, |
| "lambda_div_used": 0.599761851131916, |
| "learning_rate": 6.740368101176495e-07, |
| "loss": 0.0138, |
| "reward": -0.28226344753056765, |
| "reward_after_mean": -0.28226344753056765, |
| "reward_after_std": 0.5015552807599306, |
| "reward_before_mean": -0.01817359635606408, |
| "reward_before_std": 0.4594444427639246, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2640898581594229, |
| "reward_change_min": -0.45775213465094566, |
| "reward_change_std": 0.170160255394876, |
| "reward_std": 0.5015552863478661, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/cosine_scaled_reward": -0.14317359682172537, |
| "step": 95 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2311.229202270508, |
| "epoch": 0.10971428571428571, |
| "grad_norm": 0.08286502212285995, |
| "kl": 3.9830803871154785e-05, |
| "lambda_div_used": 0.6152824014425278, |
| "learning_rate": 6.664685702961344e-07, |
| "loss": 0.0394, |
| "reward": -0.1798182651400566, |
| "reward_after_mean": -0.1798182651400566, |
| "reward_after_std": 0.5684184953570366, |
| "reward_before_mean": 0.09630595671478659, |
| "reward_before_std": 0.5319525888189673, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2761242166161537, |
| "reward_change_min": -0.4500977620482445, |
| "reward_change_std": 0.17272682767361403, |
| "reward_std": 0.5684185232967138, |
| "rewards/accuracy_reward": 0.2083333395421505, |
| "rewards/cosine_scaled_reward": -0.11202737595885992, |
| "step": 96 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3088.104202270508, |
| "epoch": 0.11085714285714286, |
| "grad_norm": 0.05773423984646797, |
| "kl": 4.525482654571533e-05, |
| "lambda_div_used": 0.5830484703183174, |
| "learning_rate": 6.588648530198504e-07, |
| "loss": 0.012, |
| "reward": -0.24631217122077942, |
| "reward_after_mean": -0.24631217122077942, |
| "reward_after_std": 0.4177880808711052, |
| "reward_before_mean": 0.052551381289958954, |
| "reward_before_std": 0.3777043428272009, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2988635450601578, |
| "reward_change_min": -0.4822593107819557, |
| "reward_change_std": 0.18344944156706333, |
| "reward_std": 0.4177880957722664, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.11411529779434204, |
| "step": 97 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3099.6666870117188, |
| "epoch": 0.112, |
| "grad_norm": 0.05776653811335564, |
| "kl": 4.242360591888428e-05, |
| "lambda_div_used": 0.5548205152153969, |
| "learning_rate": 6.512279744547392e-07, |
| "loss": 0.0853, |
| "reward": -0.30739316577091813, |
| "reward_after_mean": -0.30739316577091813, |
| "reward_after_std": 0.3838734310120344, |
| "reward_before_mean": 0.044590696692466736, |
| "reward_before_std": 0.2453754236921668, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3519838694483042, |
| "reward_change_min": -0.49566248431801796, |
| "reward_change_std": 0.18073826655745506, |
| "reward_std": 0.38387343287467957, |
| "rewards/accuracy_reward": 0.14583333395421505, |
| "rewards/cosine_scaled_reward": -0.10124263912439346, |
| "step": 98 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2814.0416679382324, |
| "epoch": 0.11314285714285714, |
| "grad_norm": 0.10099756717681885, |
| "kl": 3.9711594581604004e-05, |
| "lambda_div_used": 0.589689776301384, |
| "learning_rate": 6.435602608679916e-07, |
| "loss": -0.0116, |
| "reward": -0.24916072934865952, |
| "reward_after_mean": -0.24916072934865952, |
| "reward_after_std": 0.5363867282867432, |
| "reward_before_mean": 0.07555947656510398, |
| "reward_before_std": 0.4080575914122164, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.32472023367881775, |
| "reward_change_min": -0.46621018648147583, |
| "reward_change_std": 0.16967851482331753, |
| "reward_std": 0.5363867320120335, |
| "rewards/accuracy_reward": 0.18750000186264515, |
| "rewards/cosine_scaled_reward": -0.11194050963968039, |
| "step": 99 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2585.208366394043, |
| "epoch": 0.11428571428571428, |
| "grad_norm": 0.08013861626386642, |
| "kl": 3.557652235031128e-05, |
| "lambda_div_used": 0.6440299674868584, |
| "learning_rate": 6.358640479194451e-07, |
| "loss": 0.0233, |
| "reward": -0.017059004865586758, |
| "reward_after_mean": -0.017059004865586758, |
| "reward_after_std": 0.6788044832646847, |
| "reward_before_mean": 0.27609538938850164, |
| "reward_before_std": 0.672226045280695, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.29315441474318504, |
| "reward_change_min": -0.560445386916399, |
| "reward_change_std": 0.2121435971930623, |
| "reward_std": 0.6788045018911362, |
| "rewards/accuracy_reward": 0.25000000186264515, |
| "rewards/cosine_scaled_reward": 0.02609538659453392, |
| "step": 100 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2698.687515258789, |
| "epoch": 0.11542857142857142, |
| "grad_norm": 0.08196399360895157, |
| "kl": 4.9740076065063477e-05, |
| "lambda_div_used": 0.5603137612342834, |
| "learning_rate": 6.281416799501187e-07, |
| "loss": -0.0552, |
| "reward": -0.2113272361457348, |
| "reward_after_mean": -0.2113272361457348, |
| "reward_after_std": 0.3802106771618128, |
| "reward_before_mean": 0.176470085978508, |
| "reward_before_std": 0.27200379874557257, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3877973351627588, |
| "reward_change_min": -0.5599614717066288, |
| "reward_change_std": 0.21361587569117546, |
| "reward_std": 0.38021068647503853, |
| "rewards/accuracy_reward": 0.2083333358168602, |
| "rewards/cosine_scaled_reward": -0.03186323493719101, |
| "step": 101 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2008.1875457763672, |
| "epoch": 0.11657142857142858, |
| "grad_norm": 0.09427309036254883, |
| "kl": 3.542006015777588e-05, |
| "lambda_div_used": 0.5826256647706032, |
| "learning_rate": 6.203955092681039e-07, |
| "loss": -0.0045, |
| "reward": -0.2004177300259471, |
| "reward_after_mean": -0.2004177300259471, |
| "reward_after_std": 0.498240664601326, |
| "reward_before_mean": 0.16466995794326067, |
| "reward_before_std": 0.37351171765476465, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.36508770659565926, |
| "reward_change_min": -0.5195483043789864, |
| "reward_change_std": 0.19531819131225348, |
| "reward_std": 0.4982406720519066, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.001996707171201706, |
| "step": 102 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2895.250020980835, |
| "epoch": 0.11771428571428572, |
| "grad_norm": 0.06802447140216827, |
| "kl": 4.501640796661377e-05, |
| "lambda_div_used": 0.5506640374660492, |
| "learning_rate": 6.126278954320294e-07, |
| "loss": 0.0391, |
| "reward": -0.3214366286993027, |
| "reward_after_mean": -0.3214366286993027, |
| "reward_after_std": 0.3532369527965784, |
| "reward_before_mean": 0.050119780004024506, |
| "reward_before_std": 0.22584082814864814, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3715564049780369, |
| "reward_change_min": -0.520975548774004, |
| "reward_change_std": 0.19540261384099722, |
| "reward_std": 0.35323695838451385, |
| "rewards/accuracy_reward": 0.14583333395421505, |
| "rewards/cosine_scaled_reward": -0.09571357164531946, |
| "step": 103 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2521.3333435058594, |
| "epoch": 0.11885714285714286, |
| "grad_norm": 0.096355140209198, |
| "kl": 4.947185516357422e-05, |
| "lambda_div_used": 0.5742901861667633, |
| "learning_rate": 6.048412045323164e-07, |
| "loss": 0.0001, |
| "reward": -0.32155255414545536, |
| "reward_after_mean": -0.32155255414545536, |
| "reward_after_std": 0.39902236871421337, |
| "reward_before_mean": -0.03546402044594288, |
| "reward_before_std": 0.3418318200856447, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2860885336995125, |
| "reward_change_min": -0.4318212755024433, |
| "reward_change_std": 0.16667354479432106, |
| "reward_std": 0.39902237243950367, |
| "rewards/accuracy_reward": 0.10416666977107525, |
| "rewards/cosine_scaled_reward": -0.13963068462908268, |
| "step": 104 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2526.000030517578, |
| "epoch": 0.12, |
| "grad_norm": 0.07842111587524414, |
| "kl": 4.8801302909851074e-05, |
| "lambda_div_used": 0.6022165417671204, |
| "learning_rate": 5.97037808470444e-07, |
| "loss": 0.0569, |
| "reward": -0.12751667387783527, |
| "reward_after_mean": -0.12751667387783527, |
| "reward_after_std": 0.582199590280652, |
| "reward_before_mean": 0.24656505044549704, |
| "reward_before_std": 0.4756303254980594, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.37408171594142914, |
| "reward_change_min": -0.5777854695916176, |
| "reward_change_std": 0.22329542227089405, |
| "reward_std": 0.5821996051818132, |
| "rewards/accuracy_reward": 0.25000000186264515, |
| "rewards/cosine_scaled_reward": -0.0034349607303738594, |
| "step": 105 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2396.291702270508, |
| "epoch": 0.12114285714285715, |
| "grad_norm": 0.07387516647577286, |
| "kl": 2.6203226298093796e-05, |
| "lambda_div_used": 0.6128493994474411, |
| "learning_rate": 5.892200842364462e-07, |
| "loss": -0.0293, |
| "reward": 0.159407502040267, |
| "reward_after_mean": 0.159407502040267, |
| "reward_after_std": 0.5759131647646427, |
| "reward_before_mean": 0.6274341251701117, |
| "reward_before_std": 0.5208892030641437, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.46802657656371593, |
| "reward_change_min": -0.7112628631293774, |
| "reward_change_std": 0.2871347274631262, |
| "reward_std": 0.5759131908416748, |
| "rewards/accuracy_reward": 0.4583333469927311, |
| "rewards/cosine_scaled_reward": 0.16910075349733233, |
| "step": 106 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2806.875045776367, |
| "epoch": 0.12228571428571429, |
| "grad_norm": 0.08797896653413773, |
| "kl": 4.5686960220336914e-05, |
| "lambda_div_used": 0.5551519840955734, |
| "learning_rate": 5.813904131848564e-07, |
| "loss": -0.0206, |
| "reward": -0.20906101167201996, |
| "reward_after_mean": -0.20906101167201996, |
| "reward_after_std": 0.35940215550363064, |
| "reward_before_mean": 0.19015613943338394, |
| "reward_before_std": 0.24773720651865005, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.39921717159450054, |
| "reward_change_min": -0.5645803362131119, |
| "reward_change_std": 0.21728475205600262, |
| "reward_std": 0.35940215922892094, |
| "rewards/accuracy_reward": 0.2291666716337204, |
| "rewards/cosine_scaled_reward": -0.03901051543653011, |
| "step": 107 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2676.3125381469727, |
| "epoch": 0.12342857142857143, |
| "grad_norm": 0.08117339015007019, |
| "kl": 4.683062434196472e-05, |
| "lambda_div_used": 0.5782058760523796, |
| "learning_rate": 5.735511803093248e-07, |
| "loss": 0.0222, |
| "reward": -0.33698799335979857, |
| "reward_after_mean": -0.33698799335979857, |
| "reward_after_std": 0.4150846730917692, |
| "reward_before_mean": -0.06898907572031021, |
| "reward_before_std": 0.3580573983490467, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2679989282041788, |
| "reward_change_min": -0.39731478318572044, |
| "reward_change_std": 0.15669311955571175, |
| "reward_std": 0.41508468240499496, |
| "rewards/accuracy_reward": 0.10416666977107525, |
| "rewards/cosine_scaled_reward": -0.17315573990345, |
| "step": 108 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3091.4166870117188, |
| "epoch": 0.12457142857142857, |
| "grad_norm": 0.06771776080131531, |
| "kl": 4.081428050994873e-05, |
| "lambda_div_used": 0.5580320879817009, |
| "learning_rate": 5.657047735161255e-07, |
| "loss": -0.0162, |
| "reward": -0.30369802191853523, |
| "reward_after_mean": -0.30369802191853523, |
| "reward_after_std": 0.3783009462058544, |
| "reward_before_mean": 0.060508192516863346, |
| "reward_before_std": 0.2597539462149143, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.36420623771846294, |
| "reward_change_min": -0.5151935815811157, |
| "reward_change_std": 0.19158014561980963, |
| "reward_std": 0.37830095551908016, |
| "rewards/accuracy_reward": 0.14583333395421505, |
| "rewards/cosine_scaled_reward": -0.08532514423131943, |
| "step": 109 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2727.875045776367, |
| "epoch": 0.12571428571428572, |
| "grad_norm": 0.07848876714706421, |
| "kl": 3.8996338844299316e-05, |
| "lambda_div_used": 0.5898331105709076, |
| "learning_rate": 5.578535828967777e-07, |
| "loss": 0.0059, |
| "reward": -0.27789773186668754, |
| "reward_after_mean": -0.27789773186668754, |
| "reward_after_std": 0.48964778520166874, |
| "reward_before_mean": 0.006700331810861826, |
| "reward_before_std": 0.40999170672148466, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28459805622696877, |
| "reward_change_min": -0.4541938379406929, |
| "reward_change_std": 0.16897330060601234, |
| "reward_std": 0.48964780382812023, |
| "rewards/accuracy_reward": 0.16666667349636555, |
| "rewards/cosine_scaled_reward": -0.15996634401381016, |
| "step": 110 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3152.3958587646484, |
| "epoch": 0.12685714285714286, |
| "grad_norm": 0.06026454642415047, |
| "kl": 5.206465721130371e-05, |
| "lambda_div_used": 0.6035187616944313, |
| "learning_rate": 5.5e-07, |
| "loss": 0.0004, |
| "reward": -0.22968922927975655, |
| "reward_after_mean": -0.22968922927975655, |
| "reward_after_std": 0.5089491438120604, |
| "reward_before_mean": 0.050578076392412186, |
| "reward_before_std": 0.47721442952752113, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28026728704571724, |
| "reward_change_min": -0.4855511710047722, |
| "reward_change_std": 0.1824399819597602, |
| "reward_std": 0.5089491568505764, |
| "rewards/accuracy_reward": 0.14583333395421505, |
| "rewards/cosine_scaled_reward": -0.09525526221841574, |
| "step": 111 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3253.937530517578, |
| "epoch": 0.128, |
| "grad_norm": 0.054045893251895905, |
| "kl": 4.832446575164795e-05, |
| "lambda_div_used": 0.6144762486219406, |
| "learning_rate": 5.421464171032224e-07, |
| "loss": 0.0014, |
| "reward": -0.07707784557715058, |
| "reward_after_mean": -0.07707784557715058, |
| "reward_after_std": 0.5871799997985363, |
| "reward_before_mean": 0.2822006791830063, |
| "reward_before_std": 0.5333117246627808, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.35927851870656013, |
| "reward_change_min": -0.6481843888759613, |
| "reward_change_std": 0.23641589283943176, |
| "reward_std": 0.5871800072491169, |
| "rewards/accuracy_reward": 0.2708333358168602, |
| "rewards/cosine_scaled_reward": 0.01136732567101717, |
| "step": 112 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2541.812545776367, |
| "epoch": 0.12914285714285714, |
| "grad_norm": 0.09499017894268036, |
| "kl": 4.720315337181091e-05, |
| "lambda_div_used": 0.5564066097140312, |
| "learning_rate": 5.342952264838747e-07, |
| "loss": 0.0471, |
| "reward": -0.3626333475112915, |
| "reward_after_mean": -0.3626333475112915, |
| "reward_after_std": 0.3075137473642826, |
| "reward_before_mean": -0.05370184499770403, |
| "reward_before_std": 0.25310691073536873, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.30893150344491005, |
| "reward_change_min": -0.4691920429468155, |
| "reward_change_std": 0.1770205283537507, |
| "reward_std": 0.3075137585401535, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.13703518453985453, |
| "step": 113 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2547.5416984558105, |
| "epoch": 0.13028571428571428, |
| "grad_norm": 0.06982935965061188, |
| "kl": 2.650544047355652e-05, |
| "lambda_div_used": 0.5948992818593979, |
| "learning_rate": 5.264488196906752e-07, |
| "loss": 0.0506, |
| "reward": -0.33033538423478603, |
| "reward_after_mean": -0.33033538423478603, |
| "reward_after_std": 0.49076898768544197, |
| "reward_before_mean": -0.08067071554251015, |
| "reward_before_std": 0.4387869122438133, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2496646661311388, |
| "reward_change_min": -0.38466524705290794, |
| "reward_change_std": 0.15147447120398283, |
| "reward_std": 0.4907689895480871, |
| "rewards/accuracy_reward": 0.1041666679084301, |
| "rewards/cosine_scaled_reward": -0.18483738787472248, |
| "step": 114 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2926.3125228881836, |
| "epoch": 0.13142857142857142, |
| "grad_norm": 0.10055308789014816, |
| "kl": 4.2185187339782715e-05, |
| "lambda_div_used": 0.5866656303405762, |
| "learning_rate": 5.186095868151436e-07, |
| "loss": 0.0219, |
| "reward": -0.16912975907325745, |
| "reward_after_mean": -0.16912975907325745, |
| "reward_after_std": 0.4218728318810463, |
| "reward_before_mean": 0.17230269685387611, |
| "reward_before_std": 0.39482294395565987, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.34143244847655296, |
| "reward_change_min": -0.5322228632867336, |
| "reward_change_std": 0.21214590221643448, |
| "reward_std": 0.4218728430569172, |
| "rewards/accuracy_reward": 0.2708333432674408, |
| "rewards/cosine_scaled_reward": -0.09853065386414528, |
| "step": 115 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3185.895835876465, |
| "epoch": 0.13257142857142856, |
| "grad_norm": 0.06827546656131744, |
| "kl": 4.4345855712890625e-05, |
| "lambda_div_used": 0.579861506819725, |
| "learning_rate": 5.107799157635538e-07, |
| "loss": -0.017, |
| "reward": -0.36347829084843397, |
| "reward_after_mean": -0.36347829084843397, |
| "reward_after_std": 0.425207169726491, |
| "reward_before_mean": -0.10740169882774353, |
| "reward_before_std": 0.36591998394578695, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.25607660599052906, |
| "reward_change_min": -0.4011174105107784, |
| "reward_change_std": 0.15214112866669893, |
| "reward_std": 0.4252071734517813, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/cosine_scaled_reward": -0.19073502300307155, |
| "step": 116 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3265.666702270508, |
| "epoch": 0.1337142857142857, |
| "grad_norm": 0.058839015662670135, |
| "kl": 5.224347114562988e-05, |
| "lambda_div_used": 0.5589891448616982, |
| "learning_rate": 5.02962191529556e-07, |
| "loss": 0.0144, |
| "reward": -0.47565145045518875, |
| "reward_after_mean": -0.47565145045518875, |
| "reward_after_std": 0.34672038443386555, |
| "reward_before_mean": -0.2341720014810562, |
| "reward_before_std": 0.2644944768399, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2414794433861971, |
| "reward_change_min": -0.3730311393737793, |
| "reward_change_std": 0.13014927878975868, |
| "reward_std": 0.3467204011976719, |
| "rewards/accuracy_reward": 0.02083333395421505, |
| "rewards/cosine_scaled_reward": -0.2550053410232067, |
| "step": 117 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3087.9375610351562, |
| "epoch": 0.13485714285714287, |
| "grad_norm": 0.05542779713869095, |
| "kl": 3.71783971786499e-05, |
| "lambda_div_used": 0.6410617902874947, |
| "learning_rate": 4.951587954676837e-07, |
| "loss": 0.0602, |
| "reward": 0.02640039217658341, |
| "reward_after_mean": 0.02640039217658341, |
| "reward_after_std": 0.6670121420174837, |
| "reward_before_mean": 0.34903212962672114, |
| "reward_before_std": 0.6545964349061251, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.322631748393178, |
| "reward_change_min": -0.5322613082826138, |
| "reward_change_std": 0.21465477347373962, |
| "reward_std": 0.66701215878129, |
| "rewards/accuracy_reward": 0.2708333395421505, |
| "rewards/cosine_scaled_reward": 0.07819879939779639, |
| "step": 118 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2036.8542251586914, |
| "epoch": 0.136, |
| "grad_norm": 0.09512177854776382, |
| "kl": 4.920363426208496e-05, |
| "lambda_div_used": 0.5765073597431183, |
| "learning_rate": 4.873721045679706e-07, |
| "loss": 0.0968, |
| "reward": -0.1327025555074215, |
| "reward_after_mean": -0.1327025555074215, |
| "reward_after_std": 0.4383635278791189, |
| "reward_before_mean": 0.2736157886683941, |
| "reward_before_std": 0.34712607227265835, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4063183292746544, |
| "reward_change_min": -0.5770336836576462, |
| "reward_change_std": 0.22668993193656206, |
| "reward_std": 0.43836353346705437, |
| "rewards/accuracy_reward": 0.22916666977107525, |
| "rewards/cosine_scaled_reward": 0.0444490984082222, |
| "step": 119 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2693.6458854675293, |
| "epoch": 0.13714285714285715, |
| "grad_norm": 0.0801864042878151, |
| "kl": 5.1856040954589844e-05, |
| "lambda_div_used": 0.5954036563634872, |
| "learning_rate": 4.79604490731896e-07, |
| "loss": -0.0416, |
| "reward": -0.10674675926566124, |
| "reward_after_mean": -0.10674675926566124, |
| "reward_after_std": 0.5345237273722887, |
| "reward_before_mean": 0.26713848020881414, |
| "reward_before_std": 0.44027570402249694, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.37388524040579796, |
| "reward_change_min": -0.5998699106276035, |
| "reward_change_std": 0.22230207175016403, |
| "reward_std": 0.5345237515866756, |
| "rewards/accuracy_reward": 0.25000000186264515, |
| "rewards/cosine_scaled_reward": 0.017138468101620674, |
| "step": 120 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1960.8958587646484, |
| "epoch": 0.1382857142857143, |
| "grad_norm": 0.09297072887420654, |
| "kl": 3.6925775930285454e-05, |
| "lambda_div_used": 0.5758452340960503, |
| "learning_rate": 4.7185832004988133e-07, |
| "loss": -0.0275, |
| "reward": -0.15540813654661179, |
| "reward_after_mean": -0.15540813654661179, |
| "reward_after_std": 0.3896722886711359, |
| "reward_before_mean": 0.20818629674613476, |
| "reward_before_std": 0.34600438084453344, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3635944165289402, |
| "reward_change_min": -0.5350156500935555, |
| "reward_change_std": 0.21344942972064018, |
| "reward_std": 0.38967230543494225, |
| "rewards/accuracy_reward": 0.1875000074505806, |
| "rewards/cosine_scaled_reward": 0.020686281844973564, |
| "step": 121 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2996.7917289733887, |
| "epoch": 0.13942857142857143, |
| "grad_norm": 0.07053450495004654, |
| "kl": 5.537271499633789e-05, |
| "lambda_div_used": 0.604009747505188, |
| "learning_rate": 4.641359520805548e-07, |
| "loss": 0.0244, |
| "reward": -0.18336665583774447, |
| "reward_after_mean": -0.18336665583774447, |
| "reward_after_std": 0.5550395585596561, |
| "reward_before_mean": 0.11408489104360342, |
| "reward_before_std": 0.4755072835832834, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.29745154455304146, |
| "reward_change_min": -0.4127044528722763, |
| "reward_change_std": 0.16168679296970367, |
| "reward_std": 0.5550395771861076, |
| "rewards/accuracy_reward": 0.16666667349636555, |
| "rewards/cosine_scaled_reward": -0.05258178571239114, |
| "step": 122 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2965.687530517578, |
| "epoch": 0.14057142857142857, |
| "grad_norm": 0.061138082295656204, |
| "kl": 4.2922794818878174e-05, |
| "lambda_div_used": 0.5864768177270889, |
| "learning_rate": 4.5643973913200837e-07, |
| "loss": 0.0629, |
| "reward": -0.24433407932519913, |
| "reward_after_mean": -0.24433407932519913, |
| "reward_after_std": 0.43522679433226585, |
| "reward_before_mean": 0.05285754054784775, |
| "reward_before_std": 0.4003843404352665, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2971916198730469, |
| "reward_change_min": -0.4781832844018936, |
| "reward_change_std": 0.18817945942282677, |
| "reward_std": 0.435226796194911, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.09297579154372215, |
| "step": 123 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2273.895881652832, |
| "epoch": 0.1417142857142857, |
| "grad_norm": 0.06792795658111572, |
| "kl": 2.843700349330902e-05, |
| "lambda_div_used": 0.5992666333913803, |
| "learning_rate": 4.4877202554526084e-07, |
| "loss": 0.0724, |
| "reward": -0.030117375776171684, |
| "reward_after_mean": -0.030117375776171684, |
| "reward_after_std": 0.6018249355256557, |
| "reward_before_mean": 0.40752510842867196, |
| "reward_before_std": 0.4566116305068135, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4376424718648195, |
| "reward_change_min": -0.6283221691846848, |
| "reward_change_std": 0.2420077919960022, |
| "reward_std": 0.601824939250946, |
| "rewards/accuracy_reward": 0.3541666679084301, |
| "rewards/cosine_scaled_reward": 0.05335840582847595, |
| "step": 124 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2841.687515258789, |
| "epoch": 0.14285714285714285, |
| "grad_norm": 0.057179540395736694, |
| "kl": 3.587547689676285e-05, |
| "lambda_div_used": 0.5674227699637413, |
| "learning_rate": 4.4113514698014953e-07, |
| "loss": 0.0092, |
| "reward": -0.09743379801511765, |
| "reward_after_mean": -0.09743379801511765, |
| "reward_after_std": 0.4501750059425831, |
| "reward_before_mean": 0.3561172317713499, |
| "reward_before_std": 0.3082335013896227, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.453551035374403, |
| "reward_change_min": -0.6685109585523605, |
| "reward_change_std": 0.25046134926378727, |
| "reward_std": 0.4501750282943249, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/cosine_scaled_reward": 0.043617233633995056, |
| "step": 125 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2814.3333892822266, |
| "epoch": 0.144, |
| "grad_norm": 0.06912728399038315, |
| "kl": 3.851950168609619e-05, |
| "lambda_div_used": 0.6011649072170258, |
| "learning_rate": 4.3353142970386557e-07, |
| "loss": 0.0079, |
| "reward": -0.2701383363455534, |
| "reward_after_mean": -0.2701383363455534, |
| "reward_after_std": 0.517305538058281, |
| "reward_before_mean": -0.00174633227288723, |
| "reward_before_std": 0.46759936958551407, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2683920059353113, |
| "reward_change_min": -0.42111679539084435, |
| "reward_change_std": 0.16303524654358625, |
| "reward_std": 0.5173055771738291, |
| "rewards/accuracy_reward": 0.1041666679084301, |
| "rewards/cosine_scaled_reward": -0.10591300774831325, |
| "step": 126 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3520.375, |
| "epoch": 0.14514285714285713, |
| "grad_norm": 0.05083320662379265, |
| "kl": 3.781914710998535e-05, |
| "lambda_div_used": 0.5308314934372902, |
| "learning_rate": 4.2596318988235037e-07, |
| "loss": 0.0094, |
| "reward": -0.5137354172766209, |
| "reward_after_mean": -0.5137354172766209, |
| "reward_after_std": 0.23104721494019032, |
| "reward_before_mean": -0.23012915067374706, |
| "reward_before_std": 0.13533379370346665, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28360624983906746, |
| "reward_change_min": -0.4074147306382656, |
| "reward_change_std": 0.14828919060528278, |
| "reward_std": 0.2310472223907709, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/cosine_scaled_reward": -0.2301291599869728, |
| "step": 127 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2622.6041946411133, |
| "epoch": 0.1462857142857143, |
| "grad_norm": 0.07047850638628006, |
| "kl": 3.9443373680114746e-05, |
| "lambda_div_used": 0.5977307558059692, |
| "learning_rate": 4.1843273287476854e-07, |
| "loss": 0.0245, |
| "reward": 0.14617427624762058, |
| "reward_after_mean": 0.14617427624762058, |
| "reward_after_std": 0.5826170947402716, |
| "reward_before_mean": 0.6620007424353389, |
| "reward_before_std": 0.45144926803186536, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5158264562487602, |
| "reward_change_min": -0.7390152402222157, |
| "reward_change_std": 0.29064416885375977, |
| "reward_std": 0.5826170966029167, |
| "rewards/accuracy_reward": 0.41666666977107525, |
| "rewards/cosine_scaled_reward": 0.2453340534120798, |
| "step": 128 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3211.166702270508, |
| "epoch": 0.14742857142857144, |
| "grad_norm": 0.06908967345952988, |
| "kl": 4.245340824127197e-05, |
| "lambda_div_used": 0.579024501144886, |
| "learning_rate": 4.1094235253127374e-07, |
| "loss": -0.0068, |
| "reward": -0.41220802813768387, |
| "reward_after_mean": -0.41220802813768387, |
| "reward_after_std": 0.4235076569020748, |
| "reward_before_mean": -0.1792316001956351, |
| "reward_before_std": 0.3573149349540472, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2329764310270548, |
| "reward_change_min": -0.38812074810266495, |
| "reward_change_std": 0.13700235076248646, |
| "reward_std": 0.423507671803236, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/cosine_scaled_reward": -0.24173159897327423, |
| "step": 129 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3416.375030517578, |
| "epoch": 0.14857142857142858, |
| "grad_norm": 0.05305078998208046, |
| "kl": 3.8370490074157715e-05, |
| "lambda_div_used": 0.5585792362689972, |
| "learning_rate": 4.034943304942796e-07, |
| "loss": -0.0353, |
| "reward": -0.324653722345829, |
| "reward_after_mean": -0.324653722345829, |
| "reward_after_std": 0.30889566242694855, |
| "reward_before_mean": 0.0034640291705727577, |
| "reward_before_std": 0.2620142959058285, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.32811774499714375, |
| "reward_change_min": -0.4815611355006695, |
| "reward_change_std": 0.18737321346998215, |
| "reward_std": 0.30889566615223885, |
| "rewards/accuracy_reward": 0.1041666716337204, |
| "rewards/cosine_scaled_reward": -0.10070264618843794, |
| "step": 130 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2672.8541984558105, |
| "epoch": 0.14971428571428572, |
| "grad_norm": 0.08398960530757904, |
| "kl": 2.8876587748527527e-05, |
| "lambda_div_used": 0.6004800871014595, |
| "learning_rate": 3.9609093550344907e-07, |
| "loss": 0.0142, |
| "reward": 0.04825104773044586, |
| "reward_after_mean": 0.04825104773044586, |
| "reward_after_std": 0.537496130913496, |
| "reward_before_mean": 0.48470486514270306, |
| "reward_before_std": 0.4661959493532777, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.43645381927490234, |
| "reward_change_min": -0.650093249976635, |
| "reward_change_std": 0.2640222804620862, |
| "reward_std": 0.5374961327761412, |
| "rewards/accuracy_reward": 0.3750000074505806, |
| "rewards/cosine_scaled_reward": 0.10970486886799335, |
| "step": 131 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2745.8541870117188, |
| "epoch": 0.15085714285714286, |
| "grad_norm": 0.10217073559761047, |
| "kl": 3.91155481338501e-05, |
| "lambda_div_used": 0.5872602090239525, |
| "learning_rate": 3.8873442270461485e-07, |
| "loss": -0.0282, |
| "reward": -0.18162552546709776, |
| "reward_after_mean": -0.18162552546709776, |
| "reward_after_std": 0.4464551955461502, |
| "reward_before_mean": 0.14102918095886707, |
| "reward_before_std": 0.3960893382318318, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.32265469804406166, |
| "reward_change_min": -0.477376826107502, |
| "reward_change_std": 0.18945662677288055, |
| "reward_std": 0.4464551992714405, |
| "rewards/accuracy_reward": 0.22916667722165585, |
| "rewards/cosine_scaled_reward": -0.08813749923137948, |
| "step": 132 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3396.625, |
| "epoch": 0.152, |
| "grad_norm": 0.05338770151138306, |
| "kl": 4.1738152503967285e-05, |
| "lambda_div_used": 0.5715877488255501, |
| "learning_rate": 3.8142703296283953e-07, |
| "loss": -0.0083, |
| "reward": -0.3019852191209793, |
| "reward_after_mean": -0.3019852191209793, |
| "reward_after_std": 0.394625848159194, |
| "reward_before_mean": 0.015070955269038677, |
| "reward_before_std": 0.3251611590385437, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3170561697334051, |
| "reward_change_min": -0.5240310952067375, |
| "reward_change_std": 0.18560713436454535, |
| "reward_std": 0.39462585374712944, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.15159572381526232, |
| "step": 133 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2655.7708435058594, |
| "epoch": 0.15314285714285714, |
| "grad_norm": 0.08679015934467316, |
| "kl": 8.308887481689453e-05, |
| "lambda_div_used": 0.5542216002941132, |
| "learning_rate": 3.7417099217982686e-07, |
| "loss": 0.0208, |
| "reward": -0.32178156822919846, |
| "reward_after_mean": -0.32178156822919846, |
| "reward_after_std": 0.2957470379769802, |
| "reward_before_mean": 0.012201536446809769, |
| "reward_before_std": 0.24192149471491575, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3339830953627825, |
| "reward_change_min": -0.4839101508259773, |
| "reward_change_std": 0.18742438219487667, |
| "reward_std": 0.2957470417022705, |
| "rewards/accuracy_reward": 0.1041666716337204, |
| "rewards/cosine_scaled_reward": -0.09196514077484608, |
| "step": 134 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2014.9583587646484, |
| "epoch": 0.15428571428571428, |
| "grad_norm": 0.07728467881679535, |
| "kl": 2.7412548661231995e-05, |
| "lambda_div_used": 0.6218282878398895, |
| "learning_rate": 3.6696851061588994e-07, |
| "loss": 0.0739, |
| "reward": 0.15841423906385899, |
| "reward_after_mean": 0.15841423906385899, |
| "reward_after_std": 0.6740392297506332, |
| "reward_before_mean": 0.6407645158469677, |
| "reward_before_std": 0.5697397403419018, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.48235028609633446, |
| "reward_change_min": -0.7837426699697971, |
| "reward_change_std": 0.3034245353192091, |
| "reward_std": 0.6740392409265041, |
| "rewards/accuracy_reward": 0.4791666716337204, |
| "rewards/cosine_scaled_reward": 0.161597837228328, |
| "step": 135 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2602.416717529297, |
| "epoch": 0.15542857142857142, |
| "grad_norm": 0.06872163712978363, |
| "kl": 2.8759241104125977e-05, |
| "lambda_div_used": 0.6318690255284309, |
| "learning_rate": 3.5982178221668533e-07, |
| "loss": 0.0344, |
| "reward": 0.06710329907946289, |
| "reward_after_mean": 0.06710329907946289, |
| "reward_after_std": 0.6661366745829582, |
| "reward_before_mean": 0.454304663464427, |
| "reward_before_std": 0.6177845690399408, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.38720136508345604, |
| "reward_change_min": -0.6763367056846619, |
| "reward_change_std": 0.258380358107388, |
| "reward_std": 0.6661366857588291, |
| "rewards/accuracy_reward": 0.3541666716337204, |
| "rewards/cosine_scaled_reward": 0.10013799648731947, |
| "step": 136 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3239.666679382324, |
| "epoch": 0.15657142857142858, |
| "grad_norm": 0.0593891404569149, |
| "kl": 3.842264413833618e-05, |
| "lambda_div_used": 0.5779730677604675, |
| "learning_rate": 3.5273298394491515e-07, |
| "loss": 0.0124, |
| "reward": -0.3537064976990223, |
| "reward_after_mean": -0.3537064976990223, |
| "reward_after_std": 0.41078851372003555, |
| "reward_before_mean": -0.08445308171212673, |
| "reward_before_std": 0.35645375214517117, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2692534364759922, |
| "reward_change_min": -0.4356107972562313, |
| "reward_change_std": 0.15999021660536528, |
| "reward_std": 0.4107885267585516, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/cosine_scaled_reward": -0.16778641194105148, |
| "step": 137 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2760.687526702881, |
| "epoch": 0.15771428571428572, |
| "grad_norm": 0.0790812149643898, |
| "kl": 2.8595328330993652e-05, |
| "lambda_div_used": 0.5799111127853394, |
| "learning_rate": 3.45704275117204e-07, |
| "loss": -0.0362, |
| "reward": -0.27829512720927596, |
| "reward_after_mean": -0.27829512720927596, |
| "reward_after_std": 0.41231589019298553, |
| "reward_before_mean": 0.02187724970281124, |
| "reward_before_std": 0.362152349203825, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3001723885536194, |
| "reward_change_min": -0.45827802270650864, |
| "reward_change_std": 0.17282870691269636, |
| "reward_std": 0.412315895780921, |
| "rewards/accuracy_reward": 0.12500000558793545, |
| "rewards/cosine_scaled_reward": -0.10312275495380163, |
| "step": 138 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2984.4583587646484, |
| "epoch": 0.15885714285714286, |
| "grad_norm": 0.06229131296277046, |
| "kl": 3.9517879486083984e-05, |
| "lambda_div_used": 0.5838895812630653, |
| "learning_rate": 3.387377967463493e-07, |
| "loss": -0.0589, |
| "reward": -0.3416835393290967, |
| "reward_after_mean": -0.3416835393290967, |
| "reward_after_std": 0.4420221708714962, |
| "reward_before_mean": -0.08248884417116642, |
| "reward_before_std": 0.384817186743021, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.259194690734148, |
| "reward_change_min": -0.42579447478055954, |
| "reward_change_std": 0.15513746719807386, |
| "reward_std": 0.44202217273414135, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/cosine_scaled_reward": -0.16582217533141375, |
| "step": 139 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2881.1250228881836, |
| "epoch": 0.16, |
| "grad_norm": 0.071082703769207, |
| "kl": 4.310905933380127e-05, |
| "lambda_div_used": 0.5740059092640877, |
| "learning_rate": 3.3183567088914833e-07, |
| "loss": 0.0019, |
| "reward": -0.25879082828760147, |
| "reward_after_mean": -0.25879082828760147, |
| "reward_after_std": 0.4811950568109751, |
| "reward_before_mean": 0.09872006997466087, |
| "reward_before_std": 0.3372529884800315, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.35751091688871384, |
| "reward_change_min": -0.4904663935303688, |
| "reward_change_std": 0.18400432635098696, |
| "reward_std": 0.481195081025362, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.06794659808656434, |
| "step": 140 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3180.187515258789, |
| "epoch": 0.16114285714285714, |
| "grad_norm": 0.05953420698642731, |
| "kl": 3.790855407714844e-05, |
| "lambda_div_used": 0.5660471692681313, |
| "learning_rate": 3.250000000000001e-07, |
| "loss": -0.0557, |
| "reward": -0.3696493972092867, |
| "reward_after_mean": -0.3696493972092867, |
| "reward_after_std": 0.37020369805395603, |
| "reward_before_mean": -0.1031316639855504, |
| "reward_before_std": 0.300534725189209, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2665177509188652, |
| "reward_change_min": -0.3948013670742512, |
| "reward_change_std": 0.151769301854074, |
| "reward_std": 0.3702037110924721, |
| "rewards/accuracy_reward": 0.10416666977107525, |
| "rewards/cosine_scaled_reward": -0.20729831932112575, |
| "step": 141 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2633.291732788086, |
| "epoch": 0.16228571428571428, |
| "grad_norm": 0.06993869692087173, |
| "kl": 4.523620009422302e-05, |
| "lambda_div_used": 0.643665611743927, |
| "learning_rate": 3.182328662904756e-07, |
| "loss": 0.0607, |
| "reward": -0.11501466785557568, |
| "reward_after_mean": -0.11501466785557568, |
| "reward_after_std": 0.7061873953789473, |
| "reward_before_mean": 0.1335596082135453, |
| "reward_before_std": 0.6679177191108465, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2485742662101984, |
| "reward_change_min": -0.438314501196146, |
| "reward_change_std": 0.16557594947516918, |
| "reward_std": 0.7061873972415924, |
| "rewards/accuracy_reward": 0.18750000186264515, |
| "rewards/cosine_scaled_reward": -0.05394040001556277, |
| "step": 142 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2543.854202270508, |
| "epoch": 0.16342857142857142, |
| "grad_norm": 0.09050207585096359, |
| "kl": 4.5418739318847656e-05, |
| "lambda_div_used": 0.5568482205271721, |
| "learning_rate": 3.115363310950578e-07, |
| "loss": 0.0559, |
| "reward": -0.44561293721199036, |
| "reward_after_mean": -0.44561293721199036, |
| "reward_after_std": 0.34061737172305584, |
| "reward_before_mean": -0.17733613029122353, |
| "reward_before_std": 0.2554386807605624, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.26827680319547653, |
| "reward_change_min": -0.39999108389019966, |
| "reward_change_std": 0.14368562400341034, |
| "reward_std": 0.3406173773109913, |
| "rewards/accuracy_reward": 0.02083333395421505, |
| "rewards/cosine_scaled_reward": -0.19816946546779945, |
| "step": 143 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3062.2291984558105, |
| "epoch": 0.16457142857142856, |
| "grad_norm": 0.08773668855428696, |
| "kl": 4.3526291847229004e-05, |
| "lambda_div_used": 0.5946916490793228, |
| "learning_rate": 3.0491243424323783e-07, |
| "loss": 0.0214, |
| "reward": -0.1852840557694435, |
| "reward_after_mean": -0.1852840557694435, |
| "reward_after_std": 0.4624195992946625, |
| "reward_before_mean": 0.13713636994361877, |
| "reward_before_std": 0.4314339701086283, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3224204182624817, |
| "reward_change_min": -0.502110667526722, |
| "reward_change_std": 0.19803205784410238, |
| "reward_std": 0.4624196030199528, |
| "rewards/accuracy_reward": 0.18750000558793545, |
| "rewards/cosine_scaled_reward": -0.05036364169791341, |
| "step": 144 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2343.708381652832, |
| "epoch": 0.1657142857142857, |
| "grad_norm": 0.09618457406759262, |
| "kl": 3.914535045623779e-05, |
| "lambda_div_used": 0.6065893918275833, |
| "learning_rate": 2.9836319343816397e-07, |
| "loss": 0.0239, |
| "reward": -0.08643046766519547, |
| "reward_after_mean": -0.08643046766519547, |
| "reward_after_std": 0.565411014482379, |
| "reward_before_mean": 0.2700228439643979, |
| "reward_before_std": 0.4942969996482134, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.35645330883562565, |
| "reward_change_min": -0.5621693357825279, |
| "reward_change_std": 0.21702316030859947, |
| "reward_std": 0.565411014482379, |
| "rewards/accuracy_reward": 0.29166666977107525, |
| "rewards/cosine_scaled_reward": -0.021643826737999916, |
| "step": 145 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2643.750030517578, |
| "epoch": 0.16685714285714287, |
| "grad_norm": 0.06749647855758667, |
| "kl": 3.0444934964179993e-05, |
| "lambda_div_used": 0.5759064182639122, |
| "learning_rate": 2.918906036420294e-07, |
| "loss": -0.0525, |
| "reward": -0.3986330684274435, |
| "reward_after_mean": -0.3986330684274435, |
| "reward_after_std": 0.4206914007663727, |
| "reward_before_mean": -0.1481835450977087, |
| "reward_before_std": 0.3465117560699582, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2504495307803154, |
| "reward_change_min": -0.3944549411535263, |
| "reward_change_std": 0.14348686579614878, |
| "reward_std": 0.42069140635430813, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/cosine_scaled_reward": -0.21068353950977325, |
| "step": 146 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3530.8958435058594, |
| "epoch": 0.168, |
| "grad_norm": 0.05082716792821884, |
| "kl": 4.493445158004761e-05, |
| "lambda_div_used": 0.6231074929237366, |
| "learning_rate": 2.854966364683872e-07, |
| "loss": 0.0064, |
| "reward": -0.17444764077663422, |
| "reward_after_mean": -0.17444764077663422, |
| "reward_after_std": 0.5966980569064617, |
| "reward_before_mean": 0.08711316343396902, |
| "reward_before_std": 0.5713155549019575, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.26156080327928066, |
| "reward_change_min": -0.42396802455186844, |
| "reward_change_std": 0.16937424894422293, |
| "reward_std": 0.5966980736702681, |
| "rewards/accuracy_reward": 0.2083333395421505, |
| "rewards/cosine_scaled_reward": -0.12122016958892345, |
| "step": 147 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2726.0625228881836, |
| "epoch": 0.16914285714285715, |
| "grad_norm": 0.05811236426234245, |
| "kl": 2.9481947422027588e-05, |
| "lambda_div_used": 0.5576904863119125, |
| "learning_rate": 2.791832395815782e-07, |
| "loss": -0.0067, |
| "reward": -0.3092116080224514, |
| "reward_after_mean": -0.3092116080224514, |
| "reward_after_std": 0.3737166114151478, |
| "reward_before_mean": 0.04292176803573966, |
| "reward_before_std": 0.2566564744338393, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.35213335789740086, |
| "reward_change_min": -0.5105132721364498, |
| "reward_change_std": 0.18572800233960152, |
| "reward_std": 0.3737166281789541, |
| "rewards/accuracy_reward": 0.14583333395421505, |
| "rewards/cosine_scaled_reward": -0.10291159152984619, |
| "step": 148 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3018.4375228881836, |
| "epoch": 0.1702857142857143, |
| "grad_norm": 0.07346916198730469, |
| "kl": 3.395974636077881e-05, |
| "lambda_div_used": 0.5993617027997971, |
| "learning_rate": 2.729523361034538e-07, |
| "loss": 0.0316, |
| "reward": -0.23622393235564232, |
| "reward_after_mean": -0.23622393235564232, |
| "reward_after_std": 0.513044873252511, |
| "reward_before_mean": 0.05197374615818262, |
| "reward_before_std": 0.4586914679966867, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2881976682692766, |
| "reward_change_min": -0.4784680940210819, |
| "reward_change_std": 0.1794019928202033, |
| "reward_std": 0.5130448862910271, |
| "rewards/accuracy_reward": 0.14583333767950535, |
| "rewards/cosine_scaled_reward": -0.0938595961779356, |
| "step": 149 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2817.000030517578, |
| "epoch": 0.17142857142857143, |
| "grad_norm": 0.07110631465911865, |
| "kl": 4.3623149394989014e-05, |
| "lambda_div_used": 0.5749505385756493, |
| "learning_rate": 2.6680582402757324e-07, |
| "loss": 0.0143, |
| "reward": -0.4273622464388609, |
| "reward_after_mean": -0.4273622464388609, |
| "reward_after_std": 0.41951365023851395, |
| "reward_before_mean": -0.1882035918533802, |
| "reward_before_std": 0.34118228033185005, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.23915864899754524, |
| "reward_change_min": -0.3481520377099514, |
| "reward_change_std": 0.1273820260539651, |
| "reward_std": 0.4195136558264494, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.22987026162445545, |
| "step": 150 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2473.5208892822266, |
| "epoch": 0.17257142857142857, |
| "grad_norm": 0.06608390063047409, |
| "kl": 3.757700324058533e-05, |
| "lambda_div_used": 0.6444682851433754, |
| "learning_rate": 2.6074557564105724e-07, |
| "loss": 0.0674, |
| "reward": 0.013197226449847221, |
| "reward_after_mean": 0.013197226449847221, |
| "reward_after_std": 0.6638381816446781, |
| "reward_before_mean": 0.3273471943102777, |
| "reward_before_std": 0.6709178425371647, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.31414996832609177, |
| "reward_change_min": -0.5566080771386623, |
| "reward_change_std": 0.22394540812820196, |
| "reward_std": 0.6638382077217102, |
| "rewards/accuracy_reward": 0.31250000558793545, |
| "rewards/cosine_scaled_reward": 0.014847185462713242, |
| "step": 151 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2980.312530517578, |
| "epoch": 0.1737142857142857, |
| "grad_norm": 0.09729959070682526, |
| "kl": 3.904849290847778e-05, |
| "lambda_div_used": 0.5731147676706314, |
| "learning_rate": 2.547734369542718e-07, |
| "loss": 0.036, |
| "reward": -0.42251406982541084, |
| "reward_after_mean": -0.42251406982541084, |
| "reward_after_std": 0.4107065536081791, |
| "reward_before_mean": -0.17358114942908287, |
| "reward_before_std": 0.3313267915509641, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.24893292412161827, |
| "reward_change_min": -0.37377795577049255, |
| "reward_change_std": 0.13549237046390772, |
| "reward_std": 0.4107065834105015, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.21524782478809357, |
| "step": 152 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2980.4167098999023, |
| "epoch": 0.17485714285714285, |
| "grad_norm": 0.07872413098812103, |
| "kl": 5.060434341430664e-05, |
| "lambda_div_used": 0.5996033921837807, |
| "learning_rate": 2.488912271385139e-07, |
| "loss": 0.0468, |
| "reward": -0.25326492823660374, |
| "reward_after_mean": -0.25326492823660374, |
| "reward_after_std": 0.5015600807964802, |
| "reward_before_mean": 0.03005093801766634, |
| "reward_before_std": 0.45544715132564306, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2833158541470766, |
| "reward_change_min": -0.490575835108757, |
| "reward_change_std": 0.17807927820831537, |
| "reward_std": 0.5015600919723511, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/cosine_scaled_reward": -0.09494908014312387, |
| "step": 153 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3400.6250610351562, |
| "epoch": 0.176, |
| "grad_norm": 0.044398024678230286, |
| "kl": 3.533065319061279e-05, |
| "lambda_div_used": 0.6407897770404816, |
| "learning_rate": 2.4310073797187573e-07, |
| "loss": 0.001, |
| "reward": -0.08846104983240366, |
| "reward_after_mean": -0.08846104983240366, |
| "reward_after_std": 0.6875216029584408, |
| "reward_before_mean": 0.1883750823326409, |
| "reward_before_std": 0.6521897967904806, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27683611772954464, |
| "reward_change_min": -0.45937369018793106, |
| "reward_change_std": 0.17437504325062037, |
| "reward_std": 0.6875216346234083, |
| "rewards/accuracy_reward": 0.20833334140479565, |
| "rewards/cosine_scaled_reward": -0.019958254415541887, |
| "step": 154 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2453.5416717529297, |
| "epoch": 0.17714285714285713, |
| "grad_norm": 0.08257688581943512, |
| "kl": 4.6115368604660034e-05, |
| "lambda_div_used": 0.5891367048025131, |
| "learning_rate": 2.374037332934512e-07, |
| "loss": 0.0267, |
| "reward": -0.14017992746084929, |
| "reward_after_mean": -0.14017992746084929, |
| "reward_after_std": 0.5328723080456257, |
| "reward_before_mean": 0.24201755598187447, |
| "reward_before_std": 0.40873255487531424, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3821975253522396, |
| "reward_change_min": -0.5966118611395359, |
| "reward_change_std": 0.22055233176797628, |
| "reward_std": 0.532872324809432, |
| "rewards/accuracy_reward": 0.22916666977107525, |
| "rewards/cosine_scaled_reward": 0.012850900180637836, |
| "step": 155 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3058.145835876465, |
| "epoch": 0.1782857142857143, |
| "grad_norm": 0.07657067477703094, |
| "kl": 3.8407742977142334e-05, |
| "lambda_div_used": 0.5792115926742554, |
| "learning_rate": 2.3180194846605364e-07, |
| "loss": 0.0192, |
| "reward": -0.35964071936905384, |
| "reward_after_mean": -0.35964071936905384, |
| "reward_after_std": 0.41963592916727066, |
| "reward_before_mean": -0.10375695489346981, |
| "reward_before_std": 0.36636115331202745, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2558837812393904, |
| "reward_change_min": -0.40037716925144196, |
| "reward_change_std": 0.15252912789583206, |
| "reward_std": 0.41963593289256096, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/cosine_scaled_reward": -0.18709028977900743, |
| "step": 156 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3067.0833435058594, |
| "epoch": 0.17942857142857144, |
| "grad_norm": 0.0602630190551281, |
| "kl": 4.166364669799805e-05, |
| "lambda_div_used": 0.578017845749855, |
| "learning_rate": 2.2629708984760706e-07, |
| "loss": -0.0024, |
| "reward": -0.20843622065149248, |
| "reward_after_mean": -0.20843622065149248, |
| "reward_after_std": 0.46523306891322136, |
| "reward_before_mean": 0.15985593758523464, |
| "reward_before_std": 0.35562971234321594, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3682921752333641, |
| "reward_change_min": -0.5581395737826824, |
| "reward_change_std": 0.20453235507011414, |
| "reward_std": 0.46523308381438255, |
| "rewards/accuracy_reward": 0.18750000186264515, |
| "rewards/cosine_scaled_reward": -0.027644065208733082, |
| "step": 157 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2439.8125381469727, |
| "epoch": 0.18057142857142858, |
| "grad_norm": 0.07962260395288467, |
| "kl": 3.724917769432068e-05, |
| "lambda_div_used": 0.5700756460428238, |
| "learning_rate": 2.2089083427137329e-07, |
| "loss": 0.0427, |
| "reward": -0.09846613928675652, |
| "reward_after_mean": -0.09846613928675652, |
| "reward_after_std": 0.50458899512887, |
| "reward_before_mean": 0.36336963158100843, |
| "reward_before_std": 0.31559942476451397, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.46183576062321663, |
| "reward_change_min": -0.6303520426154137, |
| "reward_change_std": 0.23663399182260036, |
| "reward_std": 0.5045890025794506, |
| "rewards/accuracy_reward": 0.2916666679084301, |
| "rewards/cosine_scaled_reward": 0.07170296460390091, |
| "step": 158 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3283.7916717529297, |
| "epoch": 0.18171428571428572, |
| "grad_norm": 0.05873558670282364, |
| "kl": 4.16487455368042e-05, |
| "lambda_div_used": 0.565159484744072, |
| "learning_rate": 2.1558482853517253e-07, |
| "loss": -0.0183, |
| "reward": -0.43611275404691696, |
| "reward_after_mean": -0.43611275404691696, |
| "reward_after_std": 0.34730882942676544, |
| "reward_before_mean": -0.18830033391714096, |
| "reward_before_std": 0.2990688029676676, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.247812420129776, |
| "reward_change_min": -0.4116964153945446, |
| "reward_change_std": 0.15130725782364607, |
| "reward_std": 0.34730884805321693, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/cosine_scaled_reward": -0.2508003171533346, |
| "step": 159 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3132.4167098999023, |
| "epoch": 0.18285714285714286, |
| "grad_norm": 0.06507622450590134, |
| "kl": 5.078315734863281e-05, |
| "lambda_div_used": 0.5871393531560898, |
| "learning_rate": 2.1038068889975259e-07, |
| "loss": 0.0371, |
| "reward": -0.25536114536225796, |
| "reward_after_mean": -0.25536114536225796, |
| "reward_after_std": 0.4553165938705206, |
| "reward_before_mean": 0.0424603084102273, |
| "reward_before_std": 0.3940334524959326, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2978214379400015, |
| "reward_change_min": -0.43723243474960327, |
| "reward_change_std": 0.16810003202408552, |
| "reward_std": 0.45531659945845604, |
| "rewards/accuracy_reward": 0.1458333395421505, |
| "rewards/cosine_scaled_reward": -0.10337302926927805, |
| "step": 160 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1814.145866394043, |
| "epoch": 0.184, |
| "grad_norm": 0.09199656546115875, |
| "kl": 3.68654727935791e-05, |
| "lambda_div_used": 0.6221684664487839, |
| "learning_rate": 2.0528000059645995e-07, |
| "loss": -0.1313, |
| "reward": -0.14752347487956285, |
| "reward_after_mean": -0.14752347487956285, |
| "reward_after_std": 0.6091006584465504, |
| "reward_before_mean": 0.1306287944316864, |
| "reward_before_std": 0.5693164113909006, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2781522646546364, |
| "reward_change_min": -0.4651326909661293, |
| "reward_change_std": 0.17978444695472717, |
| "reward_std": 0.6091006807982922, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.03603787627071142, |
| "step": 161 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3445.2291870117188, |
| "epoch": 0.18514285714285714, |
| "grad_norm": 0.054613932967185974, |
| "kl": 4.4696033000946045e-05, |
| "lambda_div_used": 0.6093570217490196, |
| "learning_rate": 2.0028431734436308e-07, |
| "loss": 0.0111, |
| "reward": -0.21708323806524277, |
| "reward_after_mean": -0.21708323806524277, |
| "reward_after_std": 0.5340255293995142, |
| "reward_before_mean": 0.05781930312514305, |
| "reward_before_std": 0.5134020620025694, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27490255795419216, |
| "reward_change_min": -0.5003471188247204, |
| "reward_change_std": 0.1919182576239109, |
| "reward_std": 0.5340255443006754, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.10884736478328705, |
| "step": 162 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2758.5833435058594, |
| "epoch": 0.18628571428571428, |
| "grad_norm": 0.06869502365589142, |
| "kl": 3.579258918762207e-05, |
| "lambda_div_used": 0.5997234806418419, |
| "learning_rate": 1.9539516087697517e-07, |
| "loss": 0.025, |
| "reward": -0.0028839372098445892, |
| "reward_after_mean": -0.0028839372098445892, |
| "reward_after_std": 0.5558424014598131, |
| "reward_before_mean": 0.42010904336348176, |
| "reward_before_std": 0.46184782730415463, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.42299298755824566, |
| "reward_change_min": -0.614623662084341, |
| "reward_change_std": 0.2444069180637598, |
| "reward_std": 0.5558424014598131, |
| "rewards/accuracy_reward": 0.33333334140479565, |
| "rewards/cosine_scaled_reward": 0.08677570521831512, |
| "step": 163 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2280.4583435058594, |
| "epoch": 0.18742857142857142, |
| "grad_norm": 0.08430740982294083, |
| "kl": 3.807246685028076e-05, |
| "lambda_div_used": 0.583802655339241, |
| "learning_rate": 1.9061402047871833e-07, |
| "loss": -0.0005, |
| "reward": -0.14323966577649117, |
| "reward_after_mean": -0.14323966577649117, |
| "reward_after_std": 0.48269340209662914, |
| "reward_before_mean": 0.24511565826833248, |
| "reward_before_std": 0.38433590345084667, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.38835531286895275, |
| "reward_change_min": -0.6137315705418587, |
| "reward_change_std": 0.2298044739291072, |
| "reward_std": 0.48269341699779034, |
| "rewards/accuracy_reward": 0.2500000037252903, |
| "rewards/cosine_scaled_reward": -0.0048843612894415855, |
| "step": 164 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3350.2083435058594, |
| "epoch": 0.18857142857142858, |
| "grad_norm": 0.06055794283747673, |
| "kl": 4.6700239181518555e-05, |
| "lambda_div_used": 0.5597026646137238, |
| "learning_rate": 1.8594235253127372e-07, |
| "loss": -0.0239, |
| "reward": -0.383526224642992, |
| "reward_after_mean": -0.383526224642992, |
| "reward_after_std": 0.3388876337558031, |
| "reward_before_mean": -0.09019916784018278, |
| "reward_before_std": 0.27148230001330376, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2933270577341318, |
| "reward_change_min": -0.44717802107334137, |
| "reward_change_std": 0.16905023716390133, |
| "reward_std": 0.3388876374810934, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/cosine_scaled_reward": -0.15269917901605368, |
| "step": 165 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2993.7709045410156, |
| "epoch": 0.18971428571428572, |
| "grad_norm": 0.07070616632699966, |
| "kl": 3.575533628463745e-05, |
| "lambda_div_used": 0.6159055009484291, |
| "learning_rate": 1.8138158006995363e-07, |
| "loss": -0.1171, |
| "reward": -0.20591574627906084, |
| "reward_after_mean": -0.20591574627906084, |
| "reward_after_std": 0.568760309368372, |
| "reward_before_mean": 0.07225125166587532, |
| "reward_before_std": 0.5325462874025106, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2781669981777668, |
| "reward_change_min": -0.4841184914112091, |
| "reward_change_std": 0.17965693771839142, |
| "reward_std": 0.5687603335827589, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.0735820853151381, |
| "step": 166 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2243.4583740234375, |
| "epoch": 0.19085714285714286, |
| "grad_norm": 0.08954203128814697, |
| "kl": 3.8199592381715775e-05, |
| "lambda_div_used": 0.5807118713855743, |
| "learning_rate": 1.7693309235023127e-07, |
| "loss": -0.0227, |
| "reward": -0.12234261445701122, |
| "reward_after_mean": -0.12234261445701122, |
| "reward_after_std": 0.46423870138823986, |
| "reward_before_mean": 0.28887630719691515, |
| "reward_before_std": 0.3718760753981769, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.41121890768408775, |
| "reward_change_min": -0.6246605962514877, |
| "reward_change_std": 0.24211041443049908, |
| "reward_std": 0.4642387069761753, |
| "rewards/accuracy_reward": 0.2291666679084301, |
| "rewards/cosine_scaled_reward": 0.0597096448764205, |
| "step": 167 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3159.604217529297, |
| "epoch": 0.192, |
| "grad_norm": 0.05736541375517845, |
| "kl": 4.6193599700927734e-05, |
| "lambda_div_used": 0.6025644987821579, |
| "learning_rate": 1.7259824442455923e-07, |
| "loss": 0.0616, |
| "reward": -0.1081604603677988, |
| "reward_after_mean": -0.1081604603677988, |
| "reward_after_std": 0.5043755322694778, |
| "reward_before_mean": 0.23938040900975466, |
| "reward_before_std": 0.4762058244086802, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.347540894523263, |
| "reward_change_min": -0.5797867476940155, |
| "reward_change_std": 0.22277529910206795, |
| "reward_std": 0.504375534132123, |
| "rewards/accuracy_reward": 0.2083333395421505, |
| "rewards/cosine_scaled_reward": 0.031047106254845858, |
| "step": 168 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2691.166717529297, |
| "epoch": 0.19314285714285714, |
| "grad_norm": 0.05840221792459488, |
| "kl": 3.322819247841835e-05, |
| "lambda_div_used": 0.6123679727315903, |
| "learning_rate": 1.6837835672960831e-07, |
| "loss": -0.0221, |
| "reward": 0.15223310515284538, |
| "reward_after_mean": 0.15223310515284538, |
| "reward_after_std": 0.589836286380887, |
| "reward_before_mean": 0.622445510700345, |
| "reward_before_std": 0.5133083704859018, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4702123887836933, |
| "reward_change_min": -0.686709500849247, |
| "reward_change_std": 0.27231256756931543, |
| "reward_std": 0.5898362882435322, |
| "rewards/accuracy_reward": 0.45833334140479565, |
| "rewards/cosine_scaled_reward": 0.1641121432185173, |
| "step": 169 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2737.8542137145996, |
| "epoch": 0.19428571428571428, |
| "grad_norm": 0.08408840745687485, |
| "kl": 2.9318034648895264e-05, |
| "lambda_div_used": 0.5516072362661362, |
| "learning_rate": 1.6427471468404952e-07, |
| "loss": -0.0223, |
| "reward": -0.22534361481666565, |
| "reward_after_mean": -0.22534361481666565, |
| "reward_after_std": 0.34615582413971424, |
| "reward_before_mean": 0.17829649709165096, |
| "reward_before_std": 0.231270051561296, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4036401268094778, |
| "reward_change_min": -0.5518503561615944, |
| "reward_change_std": 0.21789801493287086, |
| "reward_std": 0.34615583159029484, |
| "rewards/accuracy_reward": 0.2291666716337204, |
| "rewards/cosine_scaled_reward": -0.05087016709148884, |
| "step": 170 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2810.208366394043, |
| "epoch": 0.19542857142857142, |
| "grad_norm": 0.07381287217140198, |
| "kl": 3.468245267868042e-05, |
| "lambda_div_used": 0.6072571501135826, |
| "learning_rate": 1.6028856829700258e-07, |
| "loss": 0.0763, |
| "reward": -0.0880913995206356, |
| "reward_after_mean": -0.0880913995206356, |
| "reward_after_std": 0.4982736185193062, |
| "reward_before_mean": 0.2392272837460041, |
| "reward_before_std": 0.4966685324907303, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3273186720907688, |
| "reward_change_min": -0.5419761650264263, |
| "reward_change_std": 0.21660141553729773, |
| "reward_std": 0.4982736259698868, |
| "rewards/accuracy_reward": 0.2291666716337204, |
| "rewards/cosine_scaled_reward": 0.010060586035251617, |
| "step": 171 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2782.833381652832, |
| "epoch": 0.19657142857142856, |
| "grad_norm": 0.08758535981178284, |
| "kl": 4.4949352741241455e-05, |
| "lambda_div_used": 0.5883520841598511, |
| "learning_rate": 1.5642113178727193e-07, |
| "loss": 0.0173, |
| "reward": -0.0421207002364099, |
| "reward_after_mean": -0.0421207002364099, |
| "reward_after_std": 0.5838102325797081, |
| "reward_before_mean": 0.41242535319179296, |
| "reward_before_std": 0.4052962730638683, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.45454608276486397, |
| "reward_change_min": -0.6413811855018139, |
| "reward_change_std": 0.2437124690040946, |
| "reward_std": 0.5838102586567402, |
| "rewards/accuracy_reward": 0.3333333358168602, |
| "rewards/cosine_scaled_reward": 0.07909201784059405, |
| "step": 172 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2038.3125114440918, |
| "epoch": 0.1977142857142857, |
| "grad_norm": 0.10115820169448853, |
| "kl": 2.4806708097457886e-05, |
| "lambda_div_used": 0.5800783261656761, |
| "learning_rate": 1.5267358321348285e-07, |
| "loss": -0.0379, |
| "reward": -0.3465901352465153, |
| "reward_after_mean": -0.3465901352465153, |
| "reward_after_std": 0.41013904474675655, |
| "reward_before_mean": -0.07510924944654107, |
| "reward_before_std": 0.3657920900732279, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27148088812828064, |
| "reward_change_min": -0.41459682397544384, |
| "reward_change_std": 0.16112061124294996, |
| "reward_std": 0.4101390540599823, |
| "rewards/accuracy_reward": 0.10416666977107525, |
| "rewards/cosine_scaled_reward": -0.17927592061460018, |
| "step": 173 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2429.7083740234375, |
| "epoch": 0.19885714285714284, |
| "grad_norm": 0.08579805493354797, |
| "kl": 4.820525646209717e-05, |
| "lambda_div_used": 0.5971189364790916, |
| "learning_rate": 1.4904706411523448e-07, |
| "loss": -0.0681, |
| "reward": -0.16348301246762276, |
| "reward_after_mean": -0.16348301246762276, |
| "reward_after_std": 0.5588766317814589, |
| "reward_before_mean": 0.18685074103996158, |
| "reward_before_std": 0.44327013567090034, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3503337763249874, |
| "reward_change_min": -0.5374101847410202, |
| "reward_change_std": 0.19548403285443783, |
| "reward_std": 0.5588766410946846, |
| "rewards/accuracy_reward": 0.2083333358168602, |
| "rewards/cosine_scaled_reward": -0.02148258499801159, |
| "step": 174 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2815.1458587646484, |
| "epoch": 0.2, |
| "grad_norm": 0.06386198848485947, |
| "kl": 3.6529265344142914e-05, |
| "lambda_div_used": 0.5798816308379173, |
| "learning_rate": 1.4554267916537495e-07, |
| "loss": 0.0362, |
| "reward": -0.08284463733434677, |
| "reward_after_mean": -0.08284463733434677, |
| "reward_after_std": 0.46138195879757404, |
| "reward_before_mean": 0.33735317550599575, |
| "reward_before_std": 0.3691369164735079, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4201977960765362, |
| "reward_change_min": -0.615710511803627, |
| "reward_change_std": 0.2454231232404709, |
| "reward_std": 0.46138197369873524, |
| "rewards/accuracy_reward": 0.2708333358168602, |
| "rewards/cosine_scaled_reward": 0.06651980988681316, |
| "step": 175 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2699.916748046875, |
| "epoch": 0.20114285714285715, |
| "grad_norm": 0.1062018871307373, |
| "kl": 3.183633089065552e-05, |
| "lambda_div_used": 0.6652076914906502, |
| "learning_rate": 1.4216149583350755e-07, |
| "loss": -0.0096, |
| "reward": 0.01935443957336247, |
| "reward_after_mean": 0.01935443957336247, |
| "reward_after_std": 0.7740565538406372, |
| "reward_before_mean": 0.30571601539850235, |
| "reward_before_std": 0.7710783276706934, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2863615807145834, |
| "reward_change_min": -0.5516976863145828, |
| "reward_change_std": 0.20931501779705286, |
| "reward_std": 0.7740565687417984, |
| "rewards/accuracy_reward": 0.2708333395421505, |
| "rewards/cosine_scaled_reward": 0.03488267329521477, |
| "step": 176 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2837.062530517578, |
| "epoch": 0.2022857142857143, |
| "grad_norm": 0.10419050604104996, |
| "kl": 5.3942203521728516e-05, |
| "lambda_div_used": 0.5891516581177711, |
| "learning_rate": 1.3890454406082956e-07, |
| "loss": -0.094, |
| "reward": -0.275299109518528, |
| "reward_after_mean": -0.275299109518528, |
| "reward_after_std": 0.4370743464678526, |
| "reward_before_mean": 0.019260672852396965, |
| "reward_before_std": 0.4135833829641342, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.294559795409441, |
| "reward_change_min": -0.49321193993091583, |
| "reward_change_std": 0.19294326566159725, |
| "reward_std": 0.43707435205578804, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/cosine_scaled_reward": -0.10573932714760303, |
| "step": 177 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2239.645866394043, |
| "epoch": 0.20342857142857143, |
| "grad_norm": 0.08482369780540466, |
| "kl": 3.471970558166504e-05, |
| "lambda_div_used": 0.5743999034166336, |
| "learning_rate": 1.3577281594640182e-07, |
| "loss": -0.012, |
| "reward": -0.2907655192539096, |
| "reward_after_mean": -0.2907655192539096, |
| "reward_after_std": 0.3906351812183857, |
| "reward_before_mean": 0.020154590718448162, |
| "reward_before_std": 0.33835357427597046, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.31092010997235775, |
| "reward_change_min": -0.4689374789595604, |
| "reward_change_std": 0.18138453178107738, |
| "reward_std": 0.3906352035701275, |
| "rewards/accuracy_reward": 0.10416666977107525, |
| "rewards/cosine_scaled_reward": -0.08401207532733679, |
| "step": 178 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2847.187530517578, |
| "epoch": 0.20457142857142857, |
| "grad_norm": 0.06261342018842697, |
| "kl": 3.7359073758125305e-05, |
| "lambda_div_used": 0.5993921086192131, |
| "learning_rate": 1.3276726544494571e-07, |
| "loss": 0.0003, |
| "reward": -0.3251352347433567, |
| "reward_after_mean": -0.3251352347433567, |
| "reward_after_std": 0.507962841540575, |
| "reward_before_mean": -0.08432525303214788, |
| "reward_before_std": 0.45724861416965723, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2408099938184023, |
| "reward_change_min": -0.39610618352890015, |
| "reward_change_std": 0.14655023906379938, |
| "reward_std": 0.5079628489911556, |
| "rewards/accuracy_reward": 0.1041666679084301, |
| "rewards/cosine_scaled_reward": -0.18849190324544907, |
| "step": 179 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2329.562515258789, |
| "epoch": 0.2057142857142857, |
| "grad_norm": 0.10947566479444504, |
| "kl": 3.916025161743164e-05, |
| "lambda_div_used": 0.6209289953112602, |
| "learning_rate": 1.2988880807625927e-07, |
| "loss": -0.0732, |
| "reward": 0.0392537759616971, |
| "reward_after_mean": 0.0392537759616971, |
| "reward_after_std": 0.6374544408172369, |
| "reward_before_mean": 0.4333432329003699, |
| "reward_before_std": 0.5579905398190022, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3940894529223442, |
| "reward_change_min": -0.5780177563428879, |
| "reward_change_std": 0.229821746237576, |
| "reward_std": 0.6374544575810432, |
| "rewards/accuracy_reward": 0.3541666753590107, |
| "rewards/cosine_scaled_reward": 0.07917657308280468, |
| "step": 180 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3174.9166870117188, |
| "epoch": 0.20685714285714285, |
| "grad_norm": 0.0638444796204567, |
| "kl": 4.488229751586914e-05, |
| "lambda_div_used": 0.5625592544674873, |
| "learning_rate": 1.2713832064634125e-07, |
| "loss": -0.0055, |
| "reward": -0.3669877387583256, |
| "reward_after_mean": -0.3669877387583256, |
| "reward_after_std": 0.3254594895988703, |
| "reward_before_mean": -0.07308395206928253, |
| "reward_before_std": 0.2811311110854149, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.29390377551317215, |
| "reward_change_min": -0.47421957924962044, |
| "reward_change_std": 0.1733461432158947, |
| "reward_std": 0.3254595026373863, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.15641729161143303, |
| "step": 181 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2200.5000228881836, |
| "epoch": 0.208, |
| "grad_norm": 0.07948590070009232, |
| "kl": 3.180652856826782e-05, |
| "lambda_div_used": 0.5802313759922981, |
| "learning_rate": 1.2451664098030743e-07, |
| "loss": -0.0177, |
| "reward": -0.33155214530415833, |
| "reward_after_mean": -0.33155214530415833, |
| "reward_after_std": 0.4093964695930481, |
| "reward_before_mean": -0.04757622070610523, |
| "reward_before_std": 0.362399042584002, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2839759271591902, |
| "reward_change_min": -0.4600135274231434, |
| "reward_change_std": 0.1762974951416254, |
| "reward_std": 0.40939648635685444, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.19340955652296543, |
| "step": 182 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1919.0000381469727, |
| "epoch": 0.20914285714285713, |
| "grad_norm": 0.09855654090642929, |
| "kl": 3.127008676528931e-05, |
| "lambda_div_used": 0.5876503959298134, |
| "learning_rate": 1.220245676671809e-07, |
| "loss": 0.0354, |
| "reward": -0.29830155335366726, |
| "reward_after_mean": -0.29830155335366726, |
| "reward_after_std": 0.461369052529335, |
| "reward_before_mean": -0.01361087104305625, |
| "reward_before_std": 0.4003021940588951, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28469069860875607, |
| "reward_change_min": -0.4400797598063946, |
| "reward_change_std": 0.1702390005812049, |
| "reward_std": 0.4613690562546253, |
| "rewards/accuracy_reward": 0.1041666679084301, |
| "rewards/cosine_scaled_reward": -0.11777753569185734, |
| "step": 183 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2771.0416870117188, |
| "epoch": 0.2102857142857143, |
| "grad_norm": 0.09264298528432846, |
| "kl": 4.6037137508392334e-05, |
| "lambda_div_used": 0.5391388088464737, |
| "learning_rate": 1.1966285981663407e-07, |
| "loss": -0.001, |
| "reward": -0.3703397810459137, |
| "reward_after_mean": -0.3703397810459137, |
| "reward_after_std": 0.2937673106789589, |
| "reward_before_mean": -0.011352727189660072, |
| "reward_before_std": 0.17188004031777382, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.35898703522980213, |
| "reward_change_min": -0.5206632278859615, |
| "reward_change_std": 0.18913730140775442, |
| "reward_std": 0.29376731254160404, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/cosine_scaled_reward": -0.13635273277759552, |
| "step": 184 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2770.9792098999023, |
| "epoch": 0.21142857142857144, |
| "grad_norm": 0.09669750928878784, |
| "kl": 4.3720006942749023e-05, |
| "lambda_div_used": 0.5636111497879028, |
| "learning_rate": 1.1743223682775649e-07, |
| "loss": 0.0651, |
| "reward": -0.3112166179344058, |
| "reward_after_mean": -0.3112166179344058, |
| "reward_after_std": 0.3695005215704441, |
| "reward_before_mean": -0.0006957156583666801, |
| "reward_before_std": 0.2874382403679192, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3105209097266197, |
| "reward_change_min": -0.43808290362358093, |
| "reward_change_std": 0.17091987561434507, |
| "reward_std": 0.369500532746315, |
| "rewards/accuracy_reward": 0.10416666977107525, |
| "rewards/cosine_scaled_reward": -0.10486237239092588, |
| "step": 185 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2965.375015258789, |
| "epoch": 0.21257142857142858, |
| "grad_norm": 0.05594424530863762, |
| "kl": 3.8955360651016235e-05, |
| "lambda_div_used": 0.608224630355835, |
| "learning_rate": 1.1533337816991931e-07, |
| "loss": -0.0681, |
| "reward": -0.20297073479741812, |
| "reward_after_mean": -0.20297073479741812, |
| "reward_after_std": 0.5205403883010149, |
| "reward_before_mean": 0.07951527182012796, |
| "reward_before_std": 0.502502404153347, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.282486030831933, |
| "reward_change_min": -0.47177543863654137, |
| "reward_change_std": 0.18455488048493862, |
| "reward_std": 0.5205404032021761, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/cosine_scaled_reward": -0.08715140074491501, |
| "step": 186 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2639.291702270508, |
| "epoch": 0.21371428571428572, |
| "grad_norm": 0.08843009173870087, |
| "kl": 4.921853542327881e-05, |
| "lambda_div_used": 0.5908937901258469, |
| "learning_rate": 1.1336692317580158e-07, |
| "loss": -0.0048, |
| "reward": -0.25490788742899895, |
| "reward_after_mean": -0.25490788742899895, |
| "reward_after_std": 0.4807432759553194, |
| "reward_before_mean": 0.04253344633616507, |
| "reward_before_std": 0.4119319263845682, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.29744134843349457, |
| "reward_change_min": -0.4517475329339504, |
| "reward_change_std": 0.16703799460083246, |
| "reward_std": 0.48074328526854515, |
| "rewards/accuracy_reward": 0.1458333395421505, |
| "rewards/cosine_scaled_reward": -0.10329987667500973, |
| "step": 187 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3541.9791870117188, |
| "epoch": 0.21485714285714286, |
| "grad_norm": 0.05120408907532692, |
| "kl": 4.652142524719238e-05, |
| "lambda_div_used": 0.5589732080698013, |
| "learning_rate": 1.1153347084664419e-07, |
| "loss": -0.0029, |
| "reward": -0.4253443591296673, |
| "reward_after_mean": -0.4253443591296673, |
| "reward_after_std": 0.3237063102424145, |
| "reward_before_mean": -0.15165027976036072, |
| "reward_before_std": 0.26862882915884256, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27369407564401627, |
| "reward_change_min": -0.437398936599493, |
| "reward_change_std": 0.16332428343594074, |
| "reward_std": 0.3237063158303499, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/cosine_scaled_reward": -0.2141502844169736, |
| "step": 188 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2377.2708892822266, |
| "epoch": 0.216, |
| "grad_norm": 0.07240846008062363, |
| "kl": 5.1606446504592896e-05, |
| "lambda_div_used": 0.5700332000851631, |
| "learning_rate": 1.0983357966978745e-07, |
| "loss": 0.0342, |
| "reward": -0.3627132559195161, |
| "reward_after_mean": -0.3627132559195161, |
| "reward_after_std": 0.4038649797439575, |
| "reward_before_mean": -0.08412502333521843, |
| "reward_before_std": 0.3190039964392781, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2785882242023945, |
| "reward_change_min": -0.4183661602437496, |
| "reward_change_std": 0.15456592850387096, |
| "reward_std": 0.4038649834692478, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/cosine_scaled_reward": -0.1466250205412507, |
| "step": 189 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3024.041717529297, |
| "epoch": 0.21714285714285714, |
| "grad_norm": 0.0596420094370842, |
| "kl": 2.85319983959198e-05, |
| "lambda_div_used": 0.5832700356841087, |
| "learning_rate": 1.0826776744855121e-07, |
| "loss": -0.0126, |
| "reward": -0.17394864186644554, |
| "reward_after_mean": -0.17394864186644554, |
| "reward_after_std": 0.471806388348341, |
| "reward_before_mean": 0.198049274738878, |
| "reward_before_std": 0.38090503215789795, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.37199792452156544, |
| "reward_change_min": -0.5356397405266762, |
| "reward_change_std": 0.2117150044068694, |
| "reward_std": 0.4718063995242119, |
| "rewards/accuracy_reward": 0.22916666977107525, |
| "rewards/cosine_scaled_reward": -0.031117402017116547, |
| "step": 190 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2440.3541870117188, |
| "epoch": 0.21828571428571428, |
| "grad_norm": 0.0756516084074974, |
| "kl": 3.930646926164627e-05, |
| "lambda_div_used": 0.5715140625834465, |
| "learning_rate": 1.068365111445064e-07, |
| "loss": 0.0003, |
| "reward": -0.284542384557426, |
| "reward_after_mean": -0.284542384557426, |
| "reward_after_std": 0.447479585185647, |
| "reward_before_mean": 0.05547440081136301, |
| "reward_before_std": 0.3240885529667139, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.34001679718494415, |
| "reward_change_min": -0.4750482700765133, |
| "reward_change_std": 0.1778492433950305, |
| "reward_std": 0.4474795889109373, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.11119226738810539, |
| "step": 191 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3487.6041870117188, |
| "epoch": 0.21942857142857142, |
| "grad_norm": 0.048071179538965225, |
| "kl": 3.250688314437866e-05, |
| "lambda_div_used": 0.5586921945214272, |
| "learning_rate": 1.0554024673218806e-07, |
| "loss": 0.0206, |
| "reward": -0.3871428966522217, |
| "reward_after_mean": -0.3871428966522217, |
| "reward_after_std": 0.33264124393463135, |
| "reward_before_mean": -0.11114806681871414, |
| "reward_before_std": 0.2715215114876628, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27599484845995903, |
| "reward_change_min": -0.40478406473994255, |
| "reward_change_std": 0.15969175938516855, |
| "reward_std": 0.33264124765992165, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.19448141008615494, |
| "step": 192 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2929.937545776367, |
| "epoch": 0.22057142857142858, |
| "grad_norm": 0.06499005109071732, |
| "kl": 3.759562969207764e-05, |
| "lambda_div_used": 0.6119573265314102, |
| "learning_rate": 1.0437936906629334e-07, |
| "loss": 0.0198, |
| "reward": -0.24959510192275047, |
| "reward_after_mean": -0.24959510192275047, |
| "reward_after_std": 0.5340033005923033, |
| "reward_before_mean": 0.009953722357749939, |
| "reward_before_std": 0.520604582503438, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.25954881869256496, |
| "reward_change_min": -0.5182105302810669, |
| "reward_change_std": 0.186925214715302, |
| "reward_std": 0.5340033229440451, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.1358796115964651, |
| "step": 193 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3132.875030517578, |
| "epoch": 0.22171428571428572, |
| "grad_norm": 0.06673065572977066, |
| "kl": 4.443526268005371e-05, |
| "lambda_div_used": 0.6207702159881592, |
| "learning_rate": 1.0335423176140511e-07, |
| "loss": 0.0719, |
| "reward": 0.2215312235057354, |
| "reward_after_mean": 0.2215312235057354, |
| "reward_after_std": 0.6115883849561214, |
| "reward_before_mean": 0.7030764240771532, |
| "reward_before_std": 0.5582148376852274, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.48154521360993385, |
| "reward_change_min": -0.7418999075889587, |
| "reward_change_std": 0.29459451511502266, |
| "reward_std": 0.611588392406702, |
| "rewards/accuracy_reward": 0.4583333469927311, |
| "rewards/cosine_scaled_reward": 0.2447430812753737, |
| "step": 194 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2998.687530517578, |
| "epoch": 0.22285714285714286, |
| "grad_norm": 0.05692190304398537, |
| "kl": 4.340708255767822e-05, |
| "lambda_div_used": 0.5588866546750069, |
| "learning_rate": 1.0246514708427701e-07, |
| "loss": -0.0321, |
| "reward": -0.4250563494861126, |
| "reward_after_mean": -0.4250563494861126, |
| "reward_after_std": 0.3346366826444864, |
| "reward_before_mean": -0.16300050355494022, |
| "reward_before_std": 0.26834714552387595, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2620558477938175, |
| "reward_change_min": -0.4044179916381836, |
| "reward_change_std": 0.15184260439127684, |
| "reward_std": 0.3346366863697767, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/cosine_scaled_reward": -0.22550049051642418, |
| "step": 195 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3551.6041870117188, |
| "epoch": 0.224, |
| "grad_norm": 0.051314327865839005, |
| "kl": 4.8547983169555664e-05, |
| "lambda_div_used": 0.5673639252781868, |
| "learning_rate": 1.017123858587145e-07, |
| "loss": 0.0144, |
| "reward": -0.43661339208483696, |
| "reward_after_mean": -0.43661339208483696, |
| "reward_after_std": 0.3381017968058586, |
| "reward_before_mean": -0.18068170547485352, |
| "reward_before_std": 0.3060085875913501, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.25593167915940285, |
| "reward_change_min": -0.436182364821434, |
| "reward_change_std": 0.16187659837305546, |
| "reward_std": 0.3381018117070198, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/cosine_scaled_reward": -0.24318172316998243, |
| "step": 196 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2789.125068664551, |
| "epoch": 0.22514285714285714, |
| "grad_norm": 0.08223231136798859, |
| "kl": 3.402773290872574e-05, |
| "lambda_div_used": 0.6149426028132439, |
| "learning_rate": 1.0109617738307911e-07, |
| "loss": 0.0526, |
| "reward": 0.08269345387816429, |
| "reward_after_mean": 0.08269345387816429, |
| "reward_after_std": 0.5922380294650793, |
| "reward_before_mean": 0.4997589197009802, |
| "reward_before_std": 0.5276397680863738, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4170654658228159, |
| "reward_change_min": -0.6376415118575096, |
| "reward_change_std": 0.24999654106795788, |
| "reward_std": 0.592238049954176, |
| "rewards/accuracy_reward": 0.39583334140479565, |
| "rewards/cosine_scaled_reward": 0.10392560251057148, |
| "step": 197 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2770.2708435058594, |
| "epoch": 0.22628571428571428, |
| "grad_norm": 0.07601747661828995, |
| "kl": 3.542006015777588e-05, |
| "lambda_div_used": 0.5800714045763016, |
| "learning_rate": 1.0061670936044178e-07, |
| "loss": -0.0178, |
| "reward": -0.24220025539398193, |
| "reward_after_mean": -0.24220025539398193, |
| "reward_after_std": 0.4058863651007414, |
| "reward_before_mean": 0.06675046496093273, |
| "reward_before_std": 0.36683204025030136, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.30895073898136616, |
| "reward_change_min": -0.4967211000621319, |
| "reward_change_std": 0.18911676667630672, |
| "reward_std": 0.40588637441396713, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.09991620294749737, |
| "step": 198 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3554.0416870117188, |
| "epoch": 0.22742857142857142, |
| "grad_norm": 0.04930044710636139, |
| "kl": 4.279613494873047e-05, |
| "lambda_div_used": 0.5560082867741585, |
| "learning_rate": 1.002741278414069e-07, |
| "loss": 0.0027, |
| "reward": -0.4229874052107334, |
| "reward_after_mean": -0.4229874052107334, |
| "reward_after_std": 0.31992789916694164, |
| "reward_before_mean": -0.14317837683483958, |
| "reward_before_std": 0.25433824164792895, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27980900928378105, |
| "reward_change_min": -0.4250169135630131, |
| "reward_change_std": 0.16277197189629078, |
| "reward_std": 0.31992790661752224, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/cosine_scaled_reward": -0.20567839220166206, |
| "step": 199 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2299.5209045410156, |
| "epoch": 0.22857142857142856, |
| "grad_norm": 0.0840829536318779, |
| "kl": 4.097074270248413e-05, |
| "lambda_div_used": 0.6226711273193359, |
| "learning_rate": 1.0006853717962393e-07, |
| "loss": 0.0203, |
| "reward": 0.062016794458031654, |
| "reward_after_mean": 0.062016794458031654, |
| "reward_after_std": 0.631916331127286, |
| "reward_before_mean": 0.4668920338153839, |
| "reward_before_std": 0.5674512181431055, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.40487524308264256, |
| "reward_change_min": -0.6527138948440552, |
| "reward_change_std": 0.250981617718935, |
| "reward_std": 0.6319163534790277, |
| "rewards/accuracy_reward": 0.35416667349636555, |
| "rewards/cosine_scaled_reward": 0.11272535985335708, |
| "step": 200 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2760.7084197998047, |
| "epoch": 0.2297142857142857, |
| "grad_norm": 0.06477575749158859, |
| "kl": 1.7711427062749863e-05, |
| "lambda_div_used": 0.6429438292980194, |
| "learning_rate": 1e-07, |
| "loss": -0.0022, |
| "reward": 0.15019571036100388, |
| "reward_after_mean": 0.15019571036100388, |
| "reward_after_std": 0.7328120246529579, |
| "reward_before_mean": 0.5535202682949603, |
| "reward_before_std": 0.6601617820560932, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4033245462924242, |
| "reward_change_min": -0.6303750872612, |
| "reward_change_std": 0.24585676938295364, |
| "reward_std": 0.7328120358288288, |
| "rewards/accuracy_reward": 0.41666667349636555, |
| "rewards/cosine_scaled_reward": 0.13685358315706253, |
| "step": 201 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2511.937511444092, |
| "epoch": 0.23085714285714284, |
| "grad_norm": 0.08345558494329453, |
| "kl": 4.0203332901000977e-05, |
| "lambda_div_used": 0.5811712816357613, |
| "learning_rate": 7.72273839962904e-07, |
| "loss": 0.0587, |
| "reward": 0.03902309015393257, |
| "reward_after_mean": 0.03902309015393257, |
| "reward_after_std": 0.5094954669475555, |
| "reward_before_mean": 0.5408022310584784, |
| "reward_before_std": 0.37269798293709755, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5017791502177715, |
| "reward_change_min": -0.7236880213022232, |
| "reward_change_std": 0.28125342447310686, |
| "reward_std": 0.5094954781234264, |
| "rewards/accuracy_reward": 0.3958333395421505, |
| "rewards/cosine_scaled_reward": 0.14496888127177954, |
| "step": 202 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3065.3333435058594, |
| "epoch": 0.232, |
| "grad_norm": 0.05931547284126282, |
| "kl": 5.182623863220215e-05, |
| "lambda_div_used": 0.5537669658660889, |
| "learning_rate": 7.695368466124296e-07, |
| "loss": 0.0447, |
| "reward": -0.3224771413952112, |
| "reward_after_mean": -0.3224771413952112, |
| "reward_after_std": 0.36307925172150135, |
| "reward_before_mean": 0.029698201455175877, |
| "reward_before_std": 0.23980092909187078, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3521753493696451, |
| "reward_change_min": -0.49568963050842285, |
| "reward_change_std": 0.18385440576821566, |
| "reward_std": 0.36307926289737225, |
| "rewards/accuracy_reward": 0.14583333395421505, |
| "rewards/cosine_scaled_reward": -0.1161351166665554, |
| "step": 203 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2511.833366394043, |
| "epoch": 0.23314285714285715, |
| "grad_norm": 0.07267706841230392, |
| "kl": 5.4508447647094727e-05, |
| "lambda_div_used": 0.5887176766991615, |
| "learning_rate": 7.667891533457718e-07, |
| "loss": 0.0023, |
| "reward": -0.23160922899842262, |
| "reward_after_mean": -0.23160922899842262, |
| "reward_after_std": 0.430733734741807, |
| "reward_before_mean": 0.06992286071181297, |
| "reward_before_std": 0.4116028640419245, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.301532082259655, |
| "reward_change_min": -0.49793890863657, |
| "reward_change_std": 0.19511268101632595, |
| "reward_std": 0.43073374405503273, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.07591048628091812, |
| "step": 204 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2967.1875915527344, |
| "epoch": 0.2342857142857143, |
| "grad_norm": 0.06776981800794601, |
| "kl": 4.439055919647217e-05, |
| "lambda_div_used": 0.653583250939846, |
| "learning_rate": 7.640308940816239e-07, |
| "loss": 0.1014, |
| "reward": -0.08090712130069733, |
| "reward_after_mean": -0.08090712130069733, |
| "reward_after_std": 0.7262772191315889, |
| "reward_before_mean": 0.17835846357047558, |
| "reward_before_std": 0.7181071005761623, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2592655848711729, |
| "reward_change_min": -0.4743462074548006, |
| "reward_change_std": 0.18475584778934717, |
| "reward_std": 0.7262772284448147, |
| "rewards/accuracy_reward": 0.2291666716337204, |
| "rewards/cosine_scaled_reward": -0.05080821271985769, |
| "step": 205 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3085.5208892822266, |
| "epoch": 0.23542857142857143, |
| "grad_norm": 0.06689873337745667, |
| "kl": 3.569573163986206e-05, |
| "lambda_div_used": 0.5862618833780289, |
| "learning_rate": 7.612622032536507e-07, |
| "loss": 0.0353, |
| "reward": -0.28365214727818966, |
| "reward_after_mean": -0.28365214727818966, |
| "reward_after_std": 0.44012872874736786, |
| "reward_before_mean": 0.004427256062626839, |
| "reward_before_std": 0.3988625044003129, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2880793921649456, |
| "reward_change_min": -0.49235254526138306, |
| "reward_change_std": 0.18395678792148829, |
| "reward_std": 0.44012873619794846, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/cosine_scaled_reward": -0.12057275045663118, |
| "step": 206 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3146.6458740234375, |
| "epoch": 0.23657142857142857, |
| "grad_norm": 0.06122450903058052, |
| "kl": 3.725104033946991e-05, |
| "lambda_div_used": 0.579835832118988, |
| "learning_rate": 7.584832158039378e-07, |
| "loss": 0.0182, |
| "reward": -0.28688727831467986, |
| "reward_after_mean": -0.28688727831467986, |
| "reward_after_std": 0.4150819983333349, |
| "reward_before_mean": 0.009919969365000725, |
| "reward_before_std": 0.3629662115126848, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.29680725932121277, |
| "reward_change_min": -0.45919613167643547, |
| "reward_change_std": 0.17279880121350288, |
| "reward_std": 0.41508200392127037, |
| "rewards/accuracy_reward": 0.12500000558793545, |
| "rewards/cosine_scaled_reward": -0.11508002015762031, |
| "step": 207 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2780.291717529297, |
| "epoch": 0.2377142857142857, |
| "grad_norm": 0.06370716542005539, |
| "kl": 3.692507743835449e-05, |
| "lambda_div_used": 0.5807016789913177, |
| "learning_rate": 7.556940671764124e-07, |
| "loss": 0.0403, |
| "reward": -0.17094913870096207, |
| "reward_after_mean": -0.17094913870096207, |
| "reward_after_std": 0.46442117914557457, |
| "reward_before_mean": 0.20349296741187572, |
| "reward_before_std": 0.3671952560544014, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3744421415030956, |
| "reward_change_min": -0.5505415536463261, |
| "reward_change_std": 0.21210224367678165, |
| "reward_std": 0.46442119032144547, |
| "rewards/accuracy_reward": 0.20833333395421505, |
| "rewards/cosine_scaled_reward": -0.004840357229113579, |
| "step": 208 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2574.6042079925537, |
| "epoch": 0.23885714285714285, |
| "grad_norm": 0.08756011724472046, |
| "kl": 1.8547754734754562e-05, |
| "lambda_div_used": 0.6246868968009949, |
| "learning_rate": 7.528948933102438e-07, |
| "loss": 0.0837, |
| "reward": 0.028150439262390137, |
| "reward_after_mean": 0.028150439262390137, |
| "reward_after_std": 0.5636487938463688, |
| "reward_before_mean": 0.3838757500052452, |
| "reward_before_std": 0.576031070202589, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3557253200560808, |
| "reward_change_min": -0.5899170190095901, |
| "reward_change_std": 0.23946599010378122, |
| "reward_std": 0.5636488180607557, |
| "rewards/accuracy_reward": 0.3333333469927311, |
| "rewards/cosine_scaled_reward": 0.05054241791367531, |
| "step": 209 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2779.3333435058594, |
| "epoch": 0.24, |
| "grad_norm": 0.06111512333154678, |
| "kl": 3.0368566513061523e-05, |
| "lambda_div_used": 0.5897415727376938, |
| "learning_rate": 7.500858306332172e-07, |
| "loss": 0.0277, |
| "reward": -0.25500940857455134, |
| "reward_after_mean": -0.25500940857455134, |
| "reward_after_std": 0.47190882451832294, |
| "reward_before_mean": 0.03478116978658363, |
| "reward_before_std": 0.4098346810787916, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2897905595600605, |
| "reward_change_min": -0.42270801588892937, |
| "reward_change_std": 0.1620404813438654, |
| "reward_std": 0.4719088301062584, |
| "rewards/accuracy_reward": 0.1458333395421505, |
| "rewards/cosine_scaled_reward": -0.11105216934811324, |
| "step": 210 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2558.8541946411133, |
| "epoch": 0.24114285714285713, |
| "grad_norm": 0.06923027336597443, |
| "kl": 3.6522746086120605e-05, |
| "lambda_div_used": 0.5664671063423157, |
| "learning_rate": 7.472670160550848e-07, |
| "loss": 0.0729, |
| "reward": -0.06898702308535576, |
| "reward_after_mean": -0.06898702308535576, |
| "reward_after_std": 0.4354093614965677, |
| "reward_before_mean": 0.4075740482658148, |
| "reward_before_std": 0.30287738889455795, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4765610620379448, |
| "reward_change_min": -0.6883851811289787, |
| "reward_change_std": 0.262312775477767, |
| "reward_std": 0.4354093801230192, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/cosine_scaled_reward": 0.09507404454052448, |
| "step": 211 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2327.750015258789, |
| "epoch": 0.2422857142857143, |
| "grad_norm": 0.08028864860534668, |
| "kl": 2.8003007173538208e-05, |
| "lambda_div_used": 0.5712975114583969, |
| "learning_rate": 7.444385869608921e-07, |
| "loss": -0.0113, |
| "reward": -0.09517045877873898, |
| "reward_after_mean": -0.09517045877873898, |
| "reward_after_std": 0.49211281538009644, |
| "reward_before_mean": 0.3566615767776966, |
| "reward_before_std": 0.32028379291296005, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4518320318311453, |
| "reward_change_min": -0.6186705827713013, |
| "reward_change_std": 0.23167487233877182, |
| "reward_std": 0.4921128321439028, |
| "rewards/accuracy_reward": 0.2916666679084301, |
| "rewards/cosine_scaled_reward": 0.06499490264104679, |
| "step": 212 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2145.6041946411133, |
| "epoch": 0.24342857142857144, |
| "grad_norm": 0.09073800593614578, |
| "kl": 4.177819937467575e-05, |
| "lambda_div_used": 0.6168783828616142, |
| "learning_rate": 7.416006812042827e-07, |
| "loss": 0.027, |
| "reward": -0.08367926510982215, |
| "reward_after_mean": -0.08367926510982215, |
| "reward_after_std": 0.5790203902870417, |
| "reward_before_mean": 0.24448610469698906, |
| "reward_before_std": 0.5364782512187958, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.32816537097096443, |
| "reward_change_min": -0.5068525895476341, |
| "reward_change_std": 0.19713077135384083, |
| "reward_std": 0.579020407050848, |
| "rewards/accuracy_reward": 0.2083333395421505, |
| "rewards/cosine_scaled_reward": 0.036152773071080446, |
| "step": 213 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2982.6458587646484, |
| "epoch": 0.24457142857142858, |
| "grad_norm": 0.06497277319431305, |
| "kl": 3.772228956222534e-05, |
| "lambda_div_used": 0.6168569102883339, |
| "learning_rate": 7.387534371007797e-07, |
| "loss": 0.0247, |
| "reward": -0.10639690980315208, |
| "reward_after_mean": -0.10639690980315208, |
| "reward_after_std": 0.563305439427495, |
| "reward_before_mean": 0.21648684330284595, |
| "reward_before_std": 0.5375220291316509, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3228837437927723, |
| "reward_change_min": -0.5641361065208912, |
| "reward_change_std": 0.20987980626523495, |
| "reward_std": 0.5633054543286562, |
| "rewards/accuracy_reward": 0.20833334140479565, |
| "rewards/cosine_scaled_reward": 0.00815348862670362, |
| "step": 214 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2382.2500610351562, |
| "epoch": 0.24571428571428572, |
| "grad_norm": 0.07877103984355927, |
| "kl": 4.485622048377991e-05, |
| "lambda_div_used": 0.5695274397730827, |
| "learning_rate": 7.358969934210438e-07, |
| "loss": 0.0351, |
| "reward": -0.39196348818950355, |
| "reward_after_mean": -0.39196348818950355, |
| "reward_after_std": 0.39862883277237415, |
| "reward_before_mean": -0.128106027841568, |
| "reward_before_std": 0.31178954988718033, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.263857439160347, |
| "reward_change_min": -0.38021961972117424, |
| "reward_change_std": 0.13994430005550385, |
| "reward_std": 0.39862884022295475, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.1697727027349174, |
| "step": 215 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2156.3958740234375, |
| "epoch": 0.24685714285714286, |
| "grad_norm": 0.11318857222795486, |
| "kl": 4.297494888305664e-05, |
| "lambda_div_used": 0.5965342745184898, |
| "learning_rate": 7.330314893841101e-07, |
| "loss": 0.0186, |
| "reward": -0.22045887634158134, |
| "reward_after_mean": -0.22045887634158134, |
| "reward_after_std": 0.4805217906832695, |
| "reward_before_mean": 0.08258800266776234, |
| "reward_before_std": 0.4415160808712244, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3030468635261059, |
| "reward_change_min": -0.4999500457197428, |
| "reward_change_std": 0.185057258233428, |
| "reward_std": 0.4805217981338501, |
| "rewards/accuracy_reward": 0.16666667349636555, |
| "rewards/cosine_scaled_reward": -0.08407866954803467, |
| "step": 216 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2763.520866394043, |
| "epoch": 0.248, |
| "grad_norm": 0.067295141518116, |
| "kl": 3.0644237995147705e-05, |
| "lambda_div_used": 0.6201670467853546, |
| "learning_rate": 7.301570646506027e-07, |
| "loss": 0.0139, |
| "reward": -0.19884846359491348, |
| "reward_after_mean": -0.19884846359491348, |
| "reward_after_std": 0.5856006741523743, |
| "reward_before_mean": 0.06158541014883667, |
| "reward_before_std": 0.5545480605214834, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2604338899254799, |
| "reward_change_min": -0.503634799271822, |
| "reward_change_std": 0.1784855630248785, |
| "reward_std": 0.5856006946414709, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/cosine_scaled_reward": -0.10508125275373459, |
| "step": 217 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2880.7083587646484, |
| "epoch": 0.24914285714285714, |
| "grad_norm": 0.07706139236688614, |
| "kl": 3.7044286727905273e-05, |
| "lambda_div_used": 0.6074161231517792, |
| "learning_rate": 7.27273859315928e-07, |
| "loss": -0.0223, |
| "reward": -0.19500153325498104, |
| "reward_after_mean": -0.19500153325498104, |
| "reward_after_std": 0.5332945492118597, |
| "reward_before_mean": 0.08328226953744888, |
| "reward_before_std": 0.5006468072533607, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27828381210565567, |
| "reward_change_min": -0.5113113783299923, |
| "reward_change_std": 0.18696410488337278, |
| "reward_std": 0.5332945715636015, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.08338439278304577, |
| "step": 218 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2352.645854949951, |
| "epoch": 0.2502857142857143, |
| "grad_norm": 0.10948827862739563, |
| "kl": 3.1441450119018555e-05, |
| "lambda_div_used": 0.5561031624674797, |
| "learning_rate": 7.243820139034464e-07, |
| "loss": -0.0096, |
| "reward": -0.029067307710647583, |
| "reward_after_mean": -0.029067307710647583, |
| "reward_after_std": 0.41141366213560104, |
| "reward_before_mean": 0.4911606255918741, |
| "reward_before_std": 0.2519808644428849, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5202279426157475, |
| "reward_change_min": -0.7093416638672352, |
| "reward_change_std": 0.27748389169573784, |
| "reward_std": 0.41141367703676224, |
| "rewards/accuracy_reward": 0.3541666716337204, |
| "rewards/cosine_scaled_reward": 0.13699393905699253, |
| "step": 219 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2635.5833435058594, |
| "epoch": 0.25142857142857145, |
| "grad_norm": 0.08914919197559357, |
| "kl": 3.5672448575496674e-05, |
| "lambda_div_used": 0.5308618620038033, |
| "learning_rate": 7.214816693576234e-07, |
| "loss": -0.0362, |
| "reward": -0.5146691724658012, |
| "reward_after_mean": -0.5146691724658012, |
| "reward_after_std": 0.23347610421478748, |
| "reward_before_mean": -0.2369950506836176, |
| "reward_before_std": 0.13545648753643036, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27767411433160305, |
| "reward_change_min": -0.3908136747777462, |
| "reward_change_std": 0.14405533485114574, |
| "reward_std": 0.23347610607743263, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/cosine_scaled_reward": -0.2369950469583273, |
| "step": 220 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2279.562530517578, |
| "epoch": 0.25257142857142856, |
| "grad_norm": 0.0954662561416626, |
| "kl": 2.4788081645965576e-05, |
| "lambda_div_used": 0.5764833092689514, |
| "learning_rate": 7.185729670371604e-07, |
| "loss": -0.0168, |
| "reward": 0.023251693695783615, |
| "reward_after_mean": 0.023251693695783615, |
| "reward_after_std": 0.4863298423588276, |
| "reward_before_mean": 0.5212067291140556, |
| "reward_before_std": 0.34889572812244296, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.49795500561594963, |
| "reward_change_min": -0.7016883753240108, |
| "reward_change_std": 0.27911104913800955, |
| "reward_std": 0.48632985167205334, |
| "rewards/accuracy_reward": 0.4166666716337204, |
| "rewards/cosine_scaled_reward": 0.10454002395272255, |
| "step": 221 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2104.9583740234375, |
| "epoch": 0.2537142857142857, |
| "grad_norm": 0.0776790902018547, |
| "kl": 2.4726614356040955e-05, |
| "lambda_div_used": 0.5997953563928604, |
| "learning_rate": 7.156560487081051e-07, |
| "loss": -0.0148, |
| "reward": -0.04329463094472885, |
| "reward_after_mean": -0.04329463094472885, |
| "reward_after_std": 0.48106030002236366, |
| "reward_before_mean": 0.33003126084804535, |
| "reward_before_std": 0.46051184553653, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3733258917927742, |
| "reward_change_min": -0.5728014186024666, |
| "reward_change_std": 0.23287776950746775, |
| "reward_std": 0.48106031119823456, |
| "rewards/accuracy_reward": 0.2708333432674408, |
| "rewards/cosine_scaled_reward": 0.0591979268938303, |
| "step": 222 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2521.3333587646484, |
| "epoch": 0.25485714285714284, |
| "grad_norm": 0.07036174088716507, |
| "kl": 2.9304996132850647e-05, |
| "lambda_div_used": 0.5750112235546112, |
| "learning_rate": 7.127310565369415e-07, |
| "loss": 0.0216, |
| "reward": -0.1473498847335577, |
| "reward_after_mean": -0.1473498847335577, |
| "reward_after_std": 0.4359878208488226, |
| "reward_before_mean": 0.24288302287459373, |
| "reward_before_std": 0.3438769578933716, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.39023288898169994, |
| "reward_change_min": -0.588653527200222, |
| "reward_change_std": 0.22452317085117102, |
| "reward_std": 0.43598783388733864, |
| "rewards/accuracy_reward": 0.2708333395421505, |
| "rewards/cosine_scaled_reward": -0.027950339019298553, |
| "step": 223 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3317.729217529297, |
| "epoch": 0.256, |
| "grad_norm": 0.05309538170695305, |
| "kl": 2.8050970286130905e-05, |
| "lambda_div_used": 0.5725216493010521, |
| "learning_rate": 7.097981330836616e-07, |
| "loss": 0.069, |
| "reward": -0.30242439545691013, |
| "reward_after_mean": -0.30242439545691013, |
| "reward_after_std": 0.38869454339146614, |
| "reward_before_mean": -0.000308917835354805, |
| "reward_before_std": 0.3251040354371071, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.30211549811065197, |
| "reward_change_min": -0.4553787522017956, |
| "reward_change_std": 0.171081081032753, |
| "reward_std": 0.3886945564299822, |
| "rewards/accuracy_reward": 0.12500000558793545, |
| "rewards/cosine_scaled_reward": -0.12530891119968146, |
| "step": 224 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3037.208335876465, |
| "epoch": 0.2571428571428571, |
| "grad_norm": 0.0768936425447464, |
| "kl": 4.080682992935181e-05, |
| "lambda_div_used": 0.5907088667154312, |
| "learning_rate": 7.068574212948169e-07, |
| "loss": 0.0169, |
| "reward": -0.28160014655441046, |
| "reward_after_mean": -0.28160014655441046, |
| "reward_after_std": 0.48472702503204346, |
| "reward_before_mean": -0.008234186680056155, |
| "reward_before_std": 0.41540220472961664, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27336596697568893, |
| "reward_change_min": -0.4061685614287853, |
| "reward_change_std": 0.15673903841525316, |
| "reward_std": 0.48472702503204346, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/cosine_scaled_reward": -0.13323417864739895, |
| "step": 225 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2861.416702270508, |
| "epoch": 0.2582857142857143, |
| "grad_norm": 0.07499676197767258, |
| "kl": 3.580749034881592e-05, |
| "lambda_div_used": 0.6208535805344582, |
| "learning_rate": 7.039090644965509e-07, |
| "loss": 0.0702, |
| "reward": -0.08291551470756531, |
| "reward_after_mean": -0.08291551470756531, |
| "reward_after_std": 0.5764818880707026, |
| "reward_before_mean": 0.23111886344850063, |
| "reward_before_std": 0.5593543313443661, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3140343725681305, |
| "reward_change_min": -0.5283008627593517, |
| "reward_change_std": 0.20683539099991322, |
| "reward_std": 0.5764818955212831, |
| "rewards/accuracy_reward": 0.27083334140479565, |
| "rewards/cosine_scaled_reward": -0.03971448230731767, |
| "step": 226 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1926.6875305175781, |
| "epoch": 0.25942857142857145, |
| "grad_norm": 0.10923109948635101, |
| "kl": 4.176795482635498e-05, |
| "lambda_div_used": 0.5834182798862457, |
| "learning_rate": 7.009532063876148e-07, |
| "loss": -0.0447, |
| "reward": -0.2778073139488697, |
| "reward_after_mean": -0.2778073139488697, |
| "reward_after_std": 0.4395454227924347, |
| "reward_before_mean": 0.009000460617244244, |
| "reward_before_std": 0.3901812704280019, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28680778108537197, |
| "reward_change_min": -0.45677450299263, |
| "reward_change_std": 0.18111994117498398, |
| "reward_std": 0.43954543210566044, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/cosine_scaled_reward": -0.1159995449706912, |
| "step": 227 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2502.562545776367, |
| "epoch": 0.26057142857142856, |
| "grad_norm": 0.08182154595851898, |
| "kl": 1.8984079360961914e-05, |
| "lambda_div_used": 0.6025907471776009, |
| "learning_rate": 6.979899910323624e-07, |
| "loss": 0.074, |
| "reward": 0.03976001590490341, |
| "reward_after_mean": 0.03976001590490341, |
| "reward_after_std": 0.5421929359436035, |
| "reward_before_mean": 0.4711545445024967, |
| "reward_before_std": 0.4750876808539033, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4313945174217224, |
| "reward_change_min": -0.6774051003158092, |
| "reward_change_std": 0.2637105621397495, |
| "reward_std": 0.5421929433941841, |
| "rewards/accuracy_reward": 0.3541666753590107, |
| "rewards/cosine_scaled_reward": 0.11698784306645393, |
| "step": 228 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3196.3125228881836, |
| "epoch": 0.26171428571428573, |
| "grad_norm": 0.06398586928844452, |
| "kl": 4.0434300899505615e-05, |
| "lambda_div_used": 0.5669709742069244, |
| "learning_rate": 6.950195628537299e-07, |
| "loss": 0.0135, |
| "reward": -0.15348458290100098, |
| "reward_after_mean": -0.15348458290100098, |
| "reward_after_std": 0.42672324273735285, |
| "reward_before_mean": 0.26130594592541456, |
| "reward_before_std": 0.3047938751988113, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.41479056514799595, |
| "reward_change_min": -0.5674791261553764, |
| "reward_change_std": 0.22689451277256012, |
| "reward_std": 0.4267232706770301, |
| "rewards/accuracy_reward": 0.22916666977107525, |
| "rewards/cosine_scaled_reward": 0.03213928057812154, |
| "step": 229 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3100.5000228881836, |
| "epoch": 0.26285714285714284, |
| "grad_norm": 0.05996527522802353, |
| "kl": 2.22623348236084e-05, |
| "lambda_div_used": 0.5811162814497948, |
| "learning_rate": 6.920420666261961e-07, |
| "loss": 0.0391, |
| "reward": -0.36068666726350784, |
| "reward_after_mean": -0.36068666726350784, |
| "reward_after_std": 0.43734684213995934, |
| "reward_before_mean": -0.08804995659738779, |
| "reward_before_std": 0.37305002473294735, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2726367134600878, |
| "reward_change_min": -0.4353605732321739, |
| "reward_change_std": 0.16089125256985426, |
| "reward_std": 0.43734684586524963, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/cosine_scaled_reward": -0.15054996183607727, |
| "step": 230 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2678.7083587646484, |
| "epoch": 0.264, |
| "grad_norm": 0.06770680844783783, |
| "kl": 2.5890767574310303e-05, |
| "lambda_div_used": 0.5817164853215218, |
| "learning_rate": 6.890576474687263e-07, |
| "loss": 0.0722, |
| "reward": -0.13625335786491632, |
| "reward_after_mean": -0.13625335786491632, |
| "reward_after_std": 0.46202925965189934, |
| "reward_before_mean": 0.25235490314662457, |
| "reward_before_std": 0.37311020120978355, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3886082824319601, |
| "reward_change_min": -0.6004670634865761, |
| "reward_change_std": 0.2275555245578289, |
| "reward_std": 0.4620292726904154, |
| "rewards/accuracy_reward": 0.2708333395421505, |
| "rewards/cosine_scaled_reward": -0.018478410318493843, |
| "step": 231 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3258.625030517578, |
| "epoch": 0.2651428571428571, |
| "grad_norm": 0.05468503385782242, |
| "kl": 3.515183925628662e-05, |
| "lambda_div_used": 0.5800251811742783, |
| "learning_rate": 6.860664508377001e-07, |
| "loss": 0.0159, |
| "reward": -0.3428461756557226, |
| "reward_after_mean": -0.3428461756557226, |
| "reward_after_std": 0.4266065489500761, |
| "reward_before_mean": -0.07336848601698875, |
| "reward_before_std": 0.36755594704300165, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.26947769708931446, |
| "reward_change_min": -0.43084757775068283, |
| "reward_change_std": 0.1597052849829197, |
| "reward_std": 0.4266065787523985, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/cosine_scaled_reward": -0.1567018236964941, |
| "step": 232 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2661.2292251586914, |
| "epoch": 0.2662857142857143, |
| "grad_norm": 0.08775883167982101, |
| "kl": 3.3989548683166504e-05, |
| "lambda_div_used": 0.6155472174286842, |
| "learning_rate": 6.83068622519821e-07, |
| "loss": -0.016, |
| "reward": -0.22876367531716824, |
| "reward_after_mean": -0.22876367531716824, |
| "reward_after_std": 0.5766322333365679, |
| "reward_before_mean": 0.027781556826084852, |
| "reward_before_std": 0.5290827043354511, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2565452288836241, |
| "reward_change_min": -0.4581022933125496, |
| "reward_change_std": 0.16627201065421104, |
| "reward_std": 0.5766322445124388, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.11805178504437208, |
| "step": 233 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2694.3125228881836, |
| "epoch": 0.2674285714285714, |
| "grad_norm": 0.11633959412574768, |
| "kl": 3.643333911895752e-05, |
| "lambda_div_used": 0.5539941042661667, |
| "learning_rate": 6.800643086250121e-07, |
| "loss": -0.0056, |
| "reward": -0.22800572216510773, |
| "reward_after_mean": -0.22800572216510773, |
| "reward_after_std": 0.3484720904380083, |
| "reward_before_mean": 0.16009100899100304, |
| "reward_before_std": 0.24206165876239538, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3880967255681753, |
| "reward_change_min": -0.5548957660794258, |
| "reward_change_std": 0.21049270872026682, |
| "reward_std": 0.34847209975123405, |
| "rewards/accuracy_reward": 0.2291666716337204, |
| "rewards/cosine_scaled_reward": -0.06907567009329796, |
| "step": 234 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2490.4375, |
| "epoch": 0.26857142857142857, |
| "grad_norm": 0.10378725826740265, |
| "kl": 2.8094742447137833e-05, |
| "lambda_div_used": 0.5782083421945572, |
| "learning_rate": 6.770536555792944e-07, |
| "loss": -0.0425, |
| "reward": 0.01590564101934433, |
| "reward_after_mean": 0.01590564101934433, |
| "reward_after_std": 0.5053408965468407, |
| "reward_before_mean": 0.49857149738818407, |
| "reward_before_std": 0.3586234971880913, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.48266585171222687, |
| "reward_change_min": -0.7199156694114208, |
| "reward_change_std": 0.2697788691148162, |
| "reward_std": 0.505340900272131, |
| "rewards/accuracy_reward": 0.3750000037252903, |
| "rewards/cosine_scaled_reward": 0.12357146013528109, |
| "step": 235 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2824.7917098999023, |
| "epoch": 0.26971428571428574, |
| "grad_norm": 0.06474865972995758, |
| "kl": 2.1502375602722168e-05, |
| "lambda_div_used": 0.5763456672430038, |
| "learning_rate": 6.740368101176495e-07, |
| "loss": 0.0011, |
| "reward": -0.3977004513144493, |
| "reward_after_mean": -0.3977004513144493, |
| "reward_after_std": 0.41489073634147644, |
| "reward_before_mean": -0.1460555698722601, |
| "reward_before_std": 0.347917802631855, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2516448702663183, |
| "reward_change_min": -0.42327918112277985, |
| "reward_change_std": 0.14836463797837496, |
| "reward_std": 0.4148907568305731, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/cosine_scaled_reward": -0.20855557406321168, |
| "step": 236 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2764.9583740234375, |
| "epoch": 0.27085714285714285, |
| "grad_norm": 0.06224860996007919, |
| "kl": 2.5756657123565674e-05, |
| "lambda_div_used": 0.5886275842785835, |
| "learning_rate": 6.710139192768694e-07, |
| "loss": -0.0228, |
| "reward": -0.1175742200575769, |
| "reward_after_mean": -0.1175742200575769, |
| "reward_after_std": 0.4582329224795103, |
| "reward_before_mean": 0.24255024455487728, |
| "reward_before_std": 0.40142686292529106, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.36012447625398636, |
| "reward_change_min": -0.5257687419652939, |
| "reward_change_std": 0.20567627251148224, |
| "reward_std": 0.4582329299300909, |
| "rewards/accuracy_reward": 0.22916667722165585, |
| "rewards/cosine_scaled_reward": 0.013383567042183131, |
| "step": 237 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3222.479217529297, |
| "epoch": 0.272, |
| "grad_norm": 0.05841728672385216, |
| "kl": 3.546103835105896e-05, |
| "lambda_div_used": 0.6281886473298073, |
| "learning_rate": 6.679851303883891e-07, |
| "loss": 0.0655, |
| "reward": 0.044486068189144135, |
| "reward_after_mean": 0.044486068189144135, |
| "reward_after_std": 0.6025157757103443, |
| "reward_before_mean": 0.4036002438515425, |
| "reward_before_std": 0.6015807576477528, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.359114158898592, |
| "reward_change_min": -0.609935961663723, |
| "reward_change_std": 0.24452881701290607, |
| "reward_std": 0.6025157924741507, |
| "rewards/accuracy_reward": 0.3125000074505806, |
| "rewards/cosine_scaled_reward": 0.09110023453831673, |
| "step": 238 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1740.1666793823242, |
| "epoch": 0.27314285714285713, |
| "grad_norm": 0.09357193857431412, |
| "kl": 1.8790364265441895e-05, |
| "lambda_div_used": 0.5962883979082108, |
| "learning_rate": 6.649505910711058e-07, |
| "loss": 0.0055, |
| "reward": 0.07376761082559824, |
| "reward_after_mean": 0.07376761082559824, |
| "reward_after_std": 0.5891099888831377, |
| "reward_before_mean": 0.5742647312581539, |
| "reward_before_std": 0.44304153323173523, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5004971195012331, |
| "reward_change_min": -0.7436067499220371, |
| "reward_change_std": 0.2840570341795683, |
| "reward_std": 0.5891100075095892, |
| "rewards/accuracy_reward": 0.37500000186264515, |
| "rewards/cosine_scaled_reward": 0.19926471984945238, |
| "step": 239 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3087.416702270508, |
| "epoch": 0.2742857142857143, |
| "grad_norm": 0.06324295699596405, |
| "kl": 3.217160701751709e-05, |
| "lambda_div_used": 0.5339473709464073, |
| "learning_rate": 6.619104492241847e-07, |
| "loss": 0.0166, |
| "reward": -0.5358126908540726, |
| "reward_after_mean": -0.5358126908540726, |
| "reward_after_std": 0.23053276538848877, |
| "reward_before_mean": -0.2753155492246151, |
| "reward_before_std": 0.1490377252921462, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.26049717888236046, |
| "reward_change_min": -0.3783254958689213, |
| "reward_change_std": 0.1383643699809909, |
| "reward_std": 0.23053277097642422, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/cosine_scaled_reward": -0.2753155454993248, |
| "step": 240 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3419.625, |
| "epoch": 0.2754285714285714, |
| "grad_norm": 0.047396283596754074, |
| "kl": 3.555417060852051e-05, |
| "lambda_div_used": 0.5585425272583961, |
| "learning_rate": 6.588648530198504e-07, |
| "loss": 0.0027, |
| "reward": -0.4805446630343795, |
| "reward_after_mean": -0.4805446630343795, |
| "reward_after_std": 0.3355217073112726, |
| "reward_before_mean": -0.23604051489382982, |
| "reward_before_std": 0.2620125887915492, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2445041425526142, |
| "reward_change_min": -0.37724288925528526, |
| "reward_change_std": 0.13484715577214956, |
| "reward_std": 0.3355217222124338, |
| "rewards/accuracy_reward": 0.02083333395421505, |
| "rewards/cosine_scaled_reward": -0.2568738413974643, |
| "step": 241 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2400.8750228881836, |
| "epoch": 0.2765714285714286, |
| "grad_norm": 0.08974741399288177, |
| "kl": 4.5746564865112305e-05, |
| "lambda_div_used": 0.5887154564261436, |
| "learning_rate": 6.558139508961654e-07, |
| "loss": -0.0452, |
| "reward": -0.2555653927847743, |
| "reward_after_mean": -0.2555653927847743, |
| "reward_after_std": 0.47001883387565613, |
| "reward_before_mean": 0.03157716616988182, |
| "reward_before_std": 0.4061046461574733, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28714255429804325, |
| "reward_change_min": -0.41172971203923225, |
| "reward_change_std": 0.16218022629618645, |
| "reward_std": 0.47001886926591396, |
| "rewards/accuracy_reward": 0.1458333395421505, |
| "rewards/cosine_scaled_reward": -0.1142561559099704, |
| "step": 242 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2828.541679382324, |
| "epoch": 0.2777142857142857, |
| "grad_norm": 0.07105688005685806, |
| "kl": 1.8571503460407257e-05, |
| "lambda_div_used": 0.5722818151116371, |
| "learning_rate": 6.527578915497951e-07, |
| "loss": 0.0039, |
| "reward": -0.1431608721613884, |
| "reward_after_mean": -0.1431608721613884, |
| "reward_after_std": 0.42974008433520794, |
| "reward_before_mean": 0.2529455106705427, |
| "reward_before_std": 0.3276430475525558, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.39610639959573746, |
| "reward_change_min": -0.5491434335708618, |
| "reward_change_std": 0.21527612209320068, |
| "reward_std": 0.42974009923636913, |
| "rewards/accuracy_reward": 0.25000000558793545, |
| "rewards/cosine_scaled_reward": 0.0029455198673531413, |
| "step": 243 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2882.291679382324, |
| "epoch": 0.27885714285714286, |
| "grad_norm": 0.0700126439332962, |
| "kl": 3.098323941230774e-05, |
| "lambda_div_used": 0.6161807402968407, |
| "learning_rate": 6.496968239287603e-07, |
| "loss": 0.0, |
| "reward": -0.06458889320492744, |
| "reward_after_mean": -0.06458889320492744, |
| "reward_after_std": 0.5576508566737175, |
| "reward_before_mean": 0.2605547234416008, |
| "reward_before_std": 0.5402053641155362, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.32514358684420586, |
| "reward_change_min": -0.5207333639264107, |
| "reward_change_std": 0.21059911139309406, |
| "reward_std": 0.5576508603990078, |
| "rewards/accuracy_reward": 0.27083334140479565, |
| "rewards/cosine_scaled_reward": -0.01027863984927535, |
| "step": 244 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2907.4583587646484, |
| "epoch": 0.28, |
| "grad_norm": 0.05816182866692543, |
| "kl": 3.5434961318969727e-05, |
| "lambda_div_used": 0.5615689232945442, |
| "learning_rate": 6.466308972251785e-07, |
| "loss": -0.0133, |
| "reward": -0.42641448229551315, |
| "reward_after_mean": -0.42641448229551315, |
| "reward_after_std": 0.3361923936754465, |
| "reward_before_mean": -0.15409247018396854, |
| "reward_before_std": 0.2813150165602565, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2723220158368349, |
| "reward_change_min": -0.445806298404932, |
| "reward_change_std": 0.1635848032310605, |
| "reward_std": 0.3361923974007368, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.19575914042070508, |
| "step": 245 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2680.7083587646484, |
| "epoch": 0.28114285714285714, |
| "grad_norm": 0.07198330760002136, |
| "kl": 1.764507032930851e-05, |
| "lambda_div_used": 0.5865297466516495, |
| "learning_rate": 6.435602608679916e-07, |
| "loss": 0.0347, |
| "reward": -0.19568787328898907, |
| "reward_after_mean": -0.19568787328898907, |
| "reward_after_std": 0.4485197216272354, |
| "reward_before_mean": 0.129365224391222, |
| "reward_before_std": 0.393932550214231, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.32505310885608196, |
| "reward_change_min": -0.4816659241914749, |
| "reward_change_std": 0.1897038472816348, |
| "reward_std": 0.44851974956691265, |
| "rewards/accuracy_reward": 0.20833334140479565, |
| "rewards/cosine_scaled_reward": -0.0789681114256382, |
| "step": 246 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3076.0208435058594, |
| "epoch": 0.2822857142857143, |
| "grad_norm": 0.08006418496370316, |
| "kl": 4.15705144405365e-05, |
| "lambda_div_used": 0.5735789015889168, |
| "learning_rate": 6.404850645156841e-07, |
| "loss": 0.0199, |
| "reward": -0.3997113136574626, |
| "reward_after_mean": -0.3997113136574626, |
| "reward_after_std": 0.41638931445777416, |
| "reward_before_mean": -0.1426885835826397, |
| "reward_before_std": 0.3330965582281351, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2570227347314358, |
| "reward_change_min": -0.38052143156528473, |
| "reward_change_std": 0.13911327440291643, |
| "reward_std": 0.4163893237709999, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.18435525218956172, |
| "step": 247 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2145.0000228881836, |
| "epoch": 0.2834285714285714, |
| "grad_norm": 0.10052043944597244, |
| "kl": 3.3289194107055664e-05, |
| "lambda_div_used": 0.6171368733048439, |
| "learning_rate": 6.374054580489873e-07, |
| "loss": 0.0514, |
| "reward": 0.28496517799794674, |
| "reward_after_mean": 0.28496517799794674, |
| "reward_after_std": 0.6654367055743933, |
| "reward_before_mean": 0.83577667362988, |
| "reward_before_std": 0.5420532608404756, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5508115068078041, |
| "reward_change_min": -0.7994196638464928, |
| "reward_change_std": 0.3166588256135583, |
| "reward_std": 0.6654367428272963, |
| "rewards/accuracy_reward": 0.5000000093132257, |
| "rewards/cosine_scaled_reward": 0.33577666338533163, |
| "step": 248 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2076.937515258789, |
| "epoch": 0.2845714285714286, |
| "grad_norm": 0.0908605083823204, |
| "kl": 2.4942681193351746e-05, |
| "lambda_div_used": 0.5948201194405556, |
| "learning_rate": 6.343215915635761e-07, |
| "loss": 0.0219, |
| "reward": -0.003894178196787834, |
| "reward_after_mean": -0.003894178196787834, |
| "reward_after_std": 0.5242901761084795, |
| "reward_before_mean": 0.4153097262606025, |
| "reward_before_std": 0.43534869560971856, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.41920389235019684, |
| "reward_change_min": -0.5903440341353416, |
| "reward_change_std": 0.2394579891115427, |
| "reward_std": 0.524290194734931, |
| "rewards/accuracy_reward": 0.33333334140479565, |
| "rewards/cosine_scaled_reward": 0.08197638019919395, |
| "step": 249 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2605.0208587646484, |
| "epoch": 0.2857142857142857, |
| "grad_norm": 0.09027790278196335, |
| "kl": 3.804638981819153e-05, |
| "lambda_div_used": 0.5779460370540619, |
| "learning_rate": 6.31233615362752e-07, |
| "loss": -0.0589, |
| "reward": -0.2859988175332546, |
| "reward_after_mean": -0.2859988175332546, |
| "reward_after_std": 0.4080943390727043, |
| "reward_before_mean": 0.008792944252490997, |
| "reward_before_std": 0.35337654035538435, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2947917561978102, |
| "reward_change_min": -0.4365835040807724, |
| "reward_change_std": 0.16840537451207638, |
| "reward_std": 0.40809434466063976, |
| "rewards/accuracy_reward": 0.12500000558793545, |
| "rewards/cosine_scaled_reward": -0.11620705761015415, |
| "step": 250 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2368.9167137145996, |
| "epoch": 0.28685714285714287, |
| "grad_norm": 0.09640161693096161, |
| "kl": 2.7060508728027344e-05, |
| "lambda_div_used": 0.5987462773919106, |
| "learning_rate": 6.281416799501187e-07, |
| "loss": -0.0435, |
| "reward": -0.028017258271574974, |
| "reward_after_mean": -0.028017258271574974, |
| "reward_after_std": 0.5376799181103706, |
| "reward_before_mean": 0.38438196340575814, |
| "reward_before_std": 0.4528377316892147, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.41239920631051064, |
| "reward_change_min": -0.6119380593299866, |
| "reward_change_std": 0.23853347077965736, |
| "reward_std": 0.5376799292862415, |
| "rewards/accuracy_reward": 0.33333334140479565, |
| "rewards/cosine_scaled_reward": 0.05104860384017229, |
| "step": 251 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2798.5625228881836, |
| "epoch": 0.288, |
| "grad_norm": 0.06784425675868988, |
| "kl": 3.535952419042587e-05, |
| "lambda_div_used": 0.5580189973115921, |
| "learning_rate": 6.25045936022246e-07, |
| "loss": 0.0335, |
| "reward": -0.2931139934808016, |
| "reward_after_mean": -0.2931139934808016, |
| "reward_after_std": 0.38731373474001884, |
| "reward_before_mean": 0.06481979880481958, |
| "reward_before_std": 0.2658566879108548, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3579337988048792, |
| "reward_change_min": -0.5358714908361435, |
| "reward_change_std": 0.19932966493070126, |
| "reward_std": 0.3873137477785349, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.10184688493609428, |
| "step": 252 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2946.416679382324, |
| "epoch": 0.28914285714285715, |
| "grad_norm": 0.09991417825222015, |
| "kl": 3.4242868423461914e-05, |
| "lambda_div_used": 0.6016674339771271, |
| "learning_rate": 6.219465344613258e-07, |
| "loss": -0.0465, |
| "reward": -0.0356330550275743, |
| "reward_after_mean": -0.0356330550275743, |
| "reward_after_std": 0.5561915785074234, |
| "reward_before_mean": 0.353804474696517, |
| "reward_before_std": 0.4667428769171238, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.389437522739172, |
| "reward_change_min": -0.592089481651783, |
| "reward_change_std": 0.22914788126945496, |
| "reward_std": 0.5561915840953588, |
| "rewards/accuracy_reward": 0.31250000558793545, |
| "rewards/cosine_scaled_reward": 0.041304459096863866, |
| "step": 253 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2754.3541946411133, |
| "epoch": 0.29028571428571426, |
| "grad_norm": 0.08061102032661438, |
| "kl": 3.886967897415161e-05, |
| "lambda_div_used": 0.6251527816057205, |
| "learning_rate": 6.188436263278172e-07, |
| "loss": 0.0857, |
| "reward": -0.1664750911295414, |
| "reward_after_mean": -0.1664750911295414, |
| "reward_after_std": 0.599596256390214, |
| "reward_before_mean": 0.1026190984994173, |
| "reward_before_std": 0.5836509419605136, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2690941859036684, |
| "reward_change_min": -0.49960801005363464, |
| "reward_change_std": 0.19118957500904799, |
| "reward_std": 0.5995962955057621, |
| "rewards/accuracy_reward": 0.18750000186264515, |
| "rewards/cosine_scaled_reward": -0.08488089527236298, |
| "step": 254 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3190.083335876465, |
| "epoch": 0.2914285714285714, |
| "grad_norm": 0.08434556424617767, |
| "kl": 2.3322179913520813e-05, |
| "lambda_div_used": 0.5955284982919693, |
| "learning_rate": 6.157373628530852e-07, |
| "loss": -0.0125, |
| "reward": -0.3585695568472147, |
| "reward_after_mean": -0.3585695568472147, |
| "reward_after_std": 0.4996061436831951, |
| "reward_before_mean": -0.12086338270455599, |
| "reward_before_std": 0.4362535886466503, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2377061638981104, |
| "reward_change_min": -0.39885040931403637, |
| "reward_change_std": 0.1406589960679412, |
| "reward_std": 0.4996061585843563, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.20419671526178718, |
| "step": 255 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3160.8125610351562, |
| "epoch": 0.2925714285714286, |
| "grad_norm": 0.0576457642018795, |
| "kl": 3.143772482872009e-05, |
| "lambda_div_used": 0.639025017619133, |
| "learning_rate": 6.126278954320294e-07, |
| "loss": 0.0243, |
| "reward": -0.03833652473986149, |
| "reward_after_mean": -0.03833652473986149, |
| "reward_after_std": 0.6670235451310873, |
| "reward_before_mean": 0.2675688254312263, |
| "reward_before_std": 0.6473761759698391, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.30590534023940563, |
| "reward_change_min": -0.511355496942997, |
| "reward_change_std": 0.2077637044712901, |
| "reward_std": 0.6670235693454742, |
| "rewards/accuracy_reward": 0.2500000037252903, |
| "rewards/cosine_scaled_reward": 0.01756881456822157, |
| "step": 256 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3138.5000228881836, |
| "epoch": 0.2937142857142857, |
| "grad_norm": 0.05595054104924202, |
| "kl": 3.0346214771270752e-05, |
| "lambda_div_used": 0.6474356725811958, |
| "learning_rate": 6.095153756157051e-07, |
| "loss": 0.025, |
| "reward": -0.0769930558744818, |
| "reward_after_mean": -0.0769930558744818, |
| "reward_after_std": 0.7138958293944597, |
| "reward_before_mean": 0.18525638710707426, |
| "reward_before_std": 0.685269920155406, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.26224944926798344, |
| "reward_change_min": -0.4810149297118187, |
| "reward_change_std": 0.18088674824684858, |
| "reward_std": 0.7138958312571049, |
| "rewards/accuracy_reward": 0.2291666716337204, |
| "rewards/cosine_scaled_reward": -0.04391028080135584, |
| "step": 257 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3473.187530517578, |
| "epoch": 0.2948571428571429, |
| "grad_norm": 0.05202889442443848, |
| "kl": 3.561750054359436e-05, |
| "lambda_div_used": 0.6007983982563019, |
| "learning_rate": 6.06399955103937e-07, |
| "loss": 0.0058, |
| "reward": -0.23316247668117285, |
| "reward_after_mean": -0.23316247668117285, |
| "reward_after_std": 0.5119184870272875, |
| "reward_before_mean": 0.04610642418265343, |
| "reward_before_std": 0.46757086645811796, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2792689222842455, |
| "reward_change_min": -0.4602624364197254, |
| "reward_change_std": 0.17828952055424452, |
| "reward_std": 0.5119185000658035, |
| "rewards/accuracy_reward": 0.14583333395421505, |
| "rewards/cosine_scaled_reward": -0.09972689533606172, |
| "step": 258 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2996.7500534057617, |
| "epoch": 0.296, |
| "grad_norm": 0.11353152990341187, |
| "kl": 4.0959566831588745e-05, |
| "lambda_div_used": 0.605031318962574, |
| "learning_rate": 6.032817857379256e-07, |
| "loss": -0.0572, |
| "reward": -0.24131755530834198, |
| "reward_after_mean": -0.24131755530834198, |
| "reward_after_std": 0.5208989772945642, |
| "reward_before_mean": 0.03226012596860528, |
| "reward_before_std": 0.4831458665430546, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27357766777276993, |
| "reward_change_min": -0.4753991588950157, |
| "reward_change_std": 0.1725512887351215, |
| "reward_std": 0.5208989884704351, |
| "rewards/accuracy_reward": 0.14583333767950535, |
| "rewards/cosine_scaled_reward": -0.1135732214897871, |
| "step": 259 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2184.8542098999023, |
| "epoch": 0.29714285714285715, |
| "grad_norm": 0.1352783739566803, |
| "kl": 2.3871660232543945e-05, |
| "lambda_div_used": 0.6604775562882423, |
| "learning_rate": 6.001610194928464e-07, |
| "loss": 0.0496, |
| "reward": 0.1305767484009266, |
| "reward_after_mean": 0.1305767484009266, |
| "reward_after_std": 0.7162795849144459, |
| "reward_before_mean": 0.48802967881783843, |
| "reward_before_std": 0.7462537074461579, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3574528992176056, |
| "reward_change_min": -0.6257632970809937, |
| "reward_change_std": 0.25755990110337734, |
| "reward_std": 0.7162796072661877, |
| "rewards/accuracy_reward": 0.39583334513008595, |
| "rewards/cosine_scaled_reward": 0.0921963145956397, |
| "step": 260 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3070.3125228881836, |
| "epoch": 0.29828571428571427, |
| "grad_norm": 0.06917808949947357, |
| "kl": 3.3020973205566406e-05, |
| "lambda_div_used": 0.5845921337604523, |
| "learning_rate": 5.97037808470444e-07, |
| "loss": 0.0106, |
| "reward": -0.3141371374949813, |
| "reward_after_mean": -0.3141371374949813, |
| "reward_after_std": 0.43580205366015434, |
| "reward_before_mean": -0.03358305338770151, |
| "reward_before_std": 0.38949463702738285, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28055410645902157, |
| "reward_change_min": -0.44674795493483543, |
| "reward_change_std": 0.16879158467054367, |
| "reward_std": 0.4358020592480898, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/cosine_scaled_reward": -0.11691637896001339, |
| "step": 261 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3258.3125, |
| "epoch": 0.29942857142857143, |
| "grad_norm": 0.06239396706223488, |
| "kl": 4.535168409347534e-05, |
| "lambda_div_used": 0.5396690741181374, |
| "learning_rate": 5.939123048916173e-07, |
| "loss": 0.0153, |
| "reward": -0.49852147325873375, |
| "reward_after_mean": -0.49852147325873375, |
| "reward_after_std": 0.23993146419525146, |
| "reward_before_mean": -0.21930878423154354, |
| "reward_before_std": 0.17422470543533564, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2792126890271902, |
| "reward_change_min": -0.4082174263894558, |
| "reward_change_std": 0.15261581167578697, |
| "reward_std": 0.2399314697831869, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/cosine_scaled_reward": -0.21930878423154354, |
| "step": 262 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2837.6666717529297, |
| "epoch": 0.30057142857142854, |
| "grad_norm": 0.07131204754114151, |
| "kl": 2.771243453025818e-05, |
| "lambda_div_used": 0.5753477811813354, |
| "learning_rate": 5.907846610890011e-07, |
| "loss": 0.0159, |
| "reward": -0.32574891671538353, |
| "reward_after_mean": -0.32574891671538353, |
| "reward_after_std": 0.40547293052077293, |
| "reward_before_mean": -0.04349888768047094, |
| "reward_before_std": 0.3437284992542118, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28225001133978367, |
| "reward_change_min": -0.423517182469368, |
| "reward_change_std": 0.16225541010499, |
| "reward_std": 0.4054729398339987, |
| "rewards/accuracy_reward": 0.10416666977107525, |
| "rewards/cosine_scaled_reward": -0.14766556210815907, |
| "step": 263 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2913.812515258789, |
| "epoch": 0.3017142857142857, |
| "grad_norm": 0.07276218384504318, |
| "kl": 3.5703182220458984e-05, |
| "lambda_div_used": 0.5857644975185394, |
| "learning_rate": 5.87655029499542e-07, |
| "loss": 0.039, |
| "reward": -0.19073306024074554, |
| "reward_after_mean": -0.19073306024074554, |
| "reward_after_std": 0.4183583725243807, |
| "reward_before_mean": 0.13579276762902737, |
| "reward_before_std": 0.39237749949097633, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.32652581483125687, |
| "reward_change_min": -0.512863963842392, |
| "reward_change_std": 0.20149299688637257, |
| "reward_std": 0.4183583725243807, |
| "rewards/accuracy_reward": 0.1875000074505806, |
| "rewards/cosine_scaled_reward": -0.051707250997424126, |
| "step": 264 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2092.333351135254, |
| "epoch": 0.3028571428571429, |
| "grad_norm": 0.09631343185901642, |
| "kl": 2.018176019191742e-05, |
| "lambda_div_used": 0.5927653685212135, |
| "learning_rate": 5.845235626570683e-07, |
| "loss": 0.0409, |
| "reward": -0.05978839658200741, |
| "reward_after_mean": -0.05978839658200741, |
| "reward_after_std": 0.5236263573169708, |
| "reward_before_mean": 0.3295632619410753, |
| "reward_before_std": 0.4253385625779629, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.38935166224837303, |
| "reward_change_min": -0.5855174511671066, |
| "reward_change_std": 0.22452097665518522, |
| "reward_std": 0.5236263833940029, |
| "rewards/accuracy_reward": 0.29166666977107525, |
| "rewards/cosine_scaled_reward": 0.037896597757935524, |
| "step": 265 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3210.8541717529297, |
| "epoch": 0.304, |
| "grad_norm": 0.05451129376888275, |
| "kl": 3.138929605484009e-05, |
| "lambda_div_used": 0.557657316327095, |
| "learning_rate": 5.813904131848564e-07, |
| "loss": -0.0024, |
| "reward": -0.3950183019042015, |
| "reward_after_mean": -0.3950183019042015, |
| "reward_after_std": 0.32154187746345997, |
| "reward_before_mean": -0.12019951082766056, |
| "reward_before_std": 0.2612606221809983, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2748187892138958, |
| "reward_change_min": -0.42080841958522797, |
| "reward_change_std": 0.158494733273983, |
| "reward_std": 0.321541890501976, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.20353284664452076, |
| "step": 266 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3039.916679382324, |
| "epoch": 0.30514285714285716, |
| "grad_norm": 0.06252676248550415, |
| "kl": 3.5075165214948356e-05, |
| "lambda_div_used": 0.5528770685195923, |
| "learning_rate": 5.78255733788191e-07, |
| "loss": -0.037, |
| "reward": -0.4460434205830097, |
| "reward_after_mean": -0.4460434205830097, |
| "reward_after_std": 0.3107005339115858, |
| "reward_before_mean": -0.1744655454531312, |
| "reward_before_std": 0.2377403611317277, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27157786674797535, |
| "reward_change_min": -0.42647456377744675, |
| "reward_change_std": 0.1541104121133685, |
| "reward_std": 0.3107005413621664, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.21613221243023872, |
| "step": 267 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2704.8333587646484, |
| "epoch": 0.3062857142857143, |
| "grad_norm": 0.08417963981628418, |
| "kl": 4.7713518142700195e-05, |
| "lambda_div_used": 0.6233388632535934, |
| "learning_rate": 5.751196772469237e-07, |
| "loss": 0.0822, |
| "reward": -0.21941478177905083, |
| "reward_after_mean": -0.21941478177905083, |
| "reward_after_std": 0.6098091676831245, |
| "reward_before_mean": 0.03345827816519886, |
| "reward_before_std": 0.5743174999952316, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.25287305377423763, |
| "reward_change_min": -0.45724478363990784, |
| "reward_change_std": 0.1690685572102666, |
| "reward_std": 0.6098091900348663, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.11237506754696369, |
| "step": 268 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3165.9791717529297, |
| "epoch": 0.30742857142857144, |
| "grad_norm": 0.06371315568685532, |
| "kl": 2.181902527809143e-05, |
| "lambda_div_used": 0.5798910111188889, |
| "learning_rate": 5.71982396408026e-07, |
| "loss": 0.0477, |
| "reward": -0.38025568798184395, |
| "reward_after_mean": -0.38025568798184395, |
| "reward_after_std": 0.4126162938773632, |
| "reward_before_mean": -0.13023450784385204, |
| "reward_before_std": 0.3629760518670082, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2500211838632822, |
| "reward_change_min": -0.40024495497345924, |
| "reward_change_std": 0.15232266392558813, |
| "reward_std": 0.4126163087785244, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/cosine_scaled_reward": -0.2135678417980671, |
| "step": 269 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3056.875045776367, |
| "epoch": 0.30857142857142855, |
| "grad_norm": 0.05920446664094925, |
| "kl": 2.387911081314087e-05, |
| "lambda_div_used": 0.6503699943423271, |
| "learning_rate": 5.688440441781398e-07, |
| "loss": 0.0, |
| "reward": -0.004786740057170391, |
| "reward_after_mean": -0.004786740057170391, |
| "reward_after_std": 0.6943319551646709, |
| "reward_before_mean": 0.292231020051986, |
| "reward_before_std": 0.7036092299968004, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2970177587121725, |
| "reward_change_min": -0.5634518079459667, |
| "reward_change_std": 0.21872046310454607, |
| "reward_std": 0.6943319924175739, |
| "rewards/accuracy_reward": 0.27083333767950535, |
| "rewards/cosine_scaled_reward": 0.021397670439910144, |
| "step": 270 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2221.3958702087402, |
| "epoch": 0.3097142857142857, |
| "grad_norm": 0.09025074541568756, |
| "kl": 6.723217666149139e-06, |
| "lambda_div_used": 0.640851192176342, |
| "learning_rate": 5.657047735161255e-07, |
| "loss": 0.0064, |
| "reward": 0.11755780689418316, |
| "reward_after_mean": 0.11755780689418316, |
| "reward_after_std": 0.6356876939535141, |
| "reward_before_mean": 0.4748641401529312, |
| "reward_before_std": 0.654548792168498, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.35730634443461895, |
| "reward_change_min": -0.5644437614828348, |
| "reward_change_std": 0.23989163804799318, |
| "reward_std": 0.6356877088546753, |
| "rewards/accuracy_reward": 0.39583334885537624, |
| "rewards/cosine_scaled_reward": 0.07903079688549042, |
| "step": 271 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2814.5208740234375, |
| "epoch": 0.31085714285714283, |
| "grad_norm": 0.07214022427797318, |
| "kl": 2.553686499595642e-05, |
| "lambda_div_used": 0.5532237812876701, |
| "learning_rate": 5.625647374256061e-07, |
| "loss": -0.023, |
| "reward": -0.33643968403339386, |
| "reward_after_mean": -0.33643968403339386, |
| "reward_after_std": 0.35168860107660294, |
| "reward_before_mean": 0.01186647079885006, |
| "reward_before_std": 0.2353415172547102, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.34830615669488907, |
| "reward_change_min": -0.5079392194747925, |
| "reward_change_std": 0.1846030419692397, |
| "reward_std": 0.351688614115119, |
| "rewards/accuracy_reward": 0.14583333395421505, |
| "rewards/cosine_scaled_reward": -0.13396686874330044, |
| "step": 272 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2673.937515258789, |
| "epoch": 0.312, |
| "grad_norm": 0.067450150847435, |
| "kl": 2.347538247704506e-05, |
| "lambda_div_used": 0.5976931154727936, |
| "learning_rate": 5.594240889475106e-07, |
| "loss": 0.0439, |
| "reward": 0.060379184782505035, |
| "reward_after_mean": 0.060379184782505035, |
| "reward_after_std": 0.5129444599151611, |
| "reward_before_mean": 0.4946505483239889, |
| "reward_before_std": 0.44454328902065754, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4342713989317417, |
| "reward_change_min": -0.6668536812067032, |
| "reward_change_std": 0.25412870943546295, |
| "reward_std": 0.5129444822669029, |
| "rewards/accuracy_reward": 0.3750000111758709, |
| "rewards/cosine_scaled_reward": 0.1196505706757307, |
| "step": 273 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1866.4375076293945, |
| "epoch": 0.31314285714285717, |
| "grad_norm": 0.11823767423629761, |
| "kl": 2.6108697056770325e-05, |
| "lambda_div_used": 0.6173145174980164, |
| "learning_rate": 5.562829811526154e-07, |
| "loss": -0.0666, |
| "reward": 0.20245935022830963, |
| "reward_after_mean": 0.20245935022830963, |
| "reward_after_std": 0.6078098546713591, |
| "reward_before_mean": 0.6919816123554483, |
| "reward_before_std": 0.5468513960950077, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4895222559571266, |
| "reward_change_min": -0.7379350513219833, |
| "reward_change_std": 0.3024911228567362, |
| "reward_std": 0.6078098695725203, |
| "rewards/accuracy_reward": 0.4583333432674408, |
| "rewards/cosine_scaled_reward": 0.2336482839891687, |
| "step": 274 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2334.0416870117188, |
| "epoch": 0.3142857142857143, |
| "grad_norm": 0.07390157133340836, |
| "kl": 2.975761890411377e-05, |
| "lambda_div_used": 0.5953021869063377, |
| "learning_rate": 5.531415671340826e-07, |
| "loss": 0.0067, |
| "reward": -0.02193348854780197, |
| "reward_after_mean": -0.02193348854780197, |
| "reward_after_std": 0.5273136273026466, |
| "reward_before_mean": 0.3926870714276447, |
| "reward_before_std": 0.4364564120769501, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4146205447614193, |
| "reward_change_min": -0.6028474234044552, |
| "reward_change_std": 0.23770872876048088, |
| "reward_std": 0.5273136328905821, |
| "rewards/accuracy_reward": 0.31250000558793545, |
| "rewards/cosine_scaled_reward": 0.08018706925213337, |
| "step": 275 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2735.6458740234375, |
| "epoch": 0.31542857142857145, |
| "grad_norm": 0.07891591638326645, |
| "kl": 3.522634506225586e-05, |
| "lambda_div_used": 0.6019129753112793, |
| "learning_rate": 5.5e-07, |
| "loss": -0.0433, |
| "reward": 0.016597013920545578, |
| "reward_after_mean": 0.016597013920545578, |
| "reward_after_std": 0.5496951006352901, |
| "reward_before_mean": 0.43198024667799473, |
| "reward_before_std": 0.4716393407434225, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4153832569718361, |
| "reward_change_min": -0.6486029289662838, |
| "reward_change_std": 0.25036295782774687, |
| "reward_std": 0.5496951248496771, |
| "rewards/accuracy_reward": 0.3333333395421505, |
| "rewards/cosine_scaled_reward": 0.09864690899848938, |
| "step": 276 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2435.437530517578, |
| "epoch": 0.31657142857142856, |
| "grad_norm": 0.08397295325994492, |
| "kl": 2.7611851692199707e-05, |
| "lambda_div_used": 0.6150463595986366, |
| "learning_rate": 5.468584328659172e-07, |
| "loss": 0.0536, |
| "reward": -0.049143560230731964, |
| "reward_after_mean": -0.049143560230731964, |
| "reward_after_std": 0.5724204778671265, |
| "reward_before_mean": 0.29686027206480503, |
| "reward_before_std": 0.5315626971423626, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.34600381925702095, |
| "reward_change_min": -0.5762605480849743, |
| "reward_change_std": 0.21639508474618196, |
| "reward_std": 0.5724204815924168, |
| "rewards/accuracy_reward": 0.27083333767950535, |
| "rewards/cosine_scaled_reward": 0.02602693811058998, |
| "step": 277 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2029.708381652832, |
| "epoch": 0.3177142857142857, |
| "grad_norm": 0.11247767508029938, |
| "kl": 3.094971179962158e-05, |
| "lambda_div_used": 0.6170216798782349, |
| "learning_rate": 5.437170188473847e-07, |
| "loss": 0.002, |
| "reward": 0.056462954729795456, |
| "reward_after_mean": 0.056462954729795456, |
| "reward_after_std": 0.6216260213404894, |
| "reward_before_mean": 0.48691817931830883, |
| "reward_before_std": 0.5461275167763233, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4304552264511585, |
| "reward_change_min": -0.7186719551682472, |
| "reward_change_std": 0.27375681325793266, |
| "reward_std": 0.6216260306537151, |
| "rewards/accuracy_reward": 0.3541666716337204, |
| "rewards/cosine_scaled_reward": 0.13275149278342724, |
| "step": 278 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3158.6875228881836, |
| "epoch": 0.31885714285714284, |
| "grad_norm": 0.06945524364709854, |
| "kl": 3.522634506225586e-05, |
| "lambda_div_used": 0.5564287528395653, |
| "learning_rate": 5.405759110524894e-07, |
| "loss": -0.0272, |
| "reward": -0.43442236818373203, |
| "reward_after_mean": -0.43442236818373203, |
| "reward_after_std": 0.32187592424452305, |
| "reward_before_mean": -0.1544840056449175, |
| "reward_before_std": 0.25075172632932663, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27993838116526604, |
| "reward_change_min": -0.41151439025998116, |
| "reward_change_std": 0.15283891931176186, |
| "reward_std": 0.32187593914568424, |
| "rewards/accuracy_reward": 0.02083333395421505, |
| "rewards/cosine_scaled_reward": -0.17531732900533825, |
| "step": 279 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2157.833335876465, |
| "epoch": 0.32, |
| "grad_norm": 0.14424921572208405, |
| "kl": 4.202499985694885e-05, |
| "lambda_div_used": 0.630903884768486, |
| "learning_rate": 5.37435262574394e-07, |
| "loss": -0.0339, |
| "reward": -0.14438428170979023, |
| "reward_after_mean": -0.14438428170979023, |
| "reward_after_std": 0.6414528246968985, |
| "reward_before_mean": 0.1313652544049546, |
| "reward_before_std": 0.6011563409119844, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.275749534368515, |
| "reward_change_min": -0.4570390097796917, |
| "reward_change_std": 0.1709576854482293, |
| "reward_std": 0.6414528302848339, |
| "rewards/accuracy_reward": 0.20833334140479565, |
| "rewards/cosine_scaled_reward": -0.076968085631961, |
| "step": 280 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3559.000030517578, |
| "epoch": 0.3211428571428571, |
| "grad_norm": 0.0494488850235939, |
| "kl": 3.0197203159332275e-05, |
| "lambda_div_used": 0.5301951244473457, |
| "learning_rate": 5.342952264838747e-07, |
| "loss": -0.0035, |
| "reward": -0.5142169296741486, |
| "reward_after_mean": -0.5142169296741486, |
| "reward_after_std": 0.22926145792007446, |
| "reward_before_mean": -0.2300790660083294, |
| "reward_before_std": 0.1324998252093792, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2841378580778837, |
| "reward_change_min": -0.4100039228796959, |
| "reward_change_std": 0.14742697216570377, |
| "reward_std": 0.22926146537065506, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/cosine_scaled_reward": -0.2300790660083294, |
| "step": 281 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2757.7708587646484, |
| "epoch": 0.3222857142857143, |
| "grad_norm": 0.07136296480894089, |
| "kl": 1.5236437320709229e-05, |
| "lambda_div_used": 0.6041858941316605, |
| "learning_rate": 5.311559558218603e-07, |
| "loss": 0.0304, |
| "reward": 0.016035709530115128, |
| "reward_after_mean": 0.016035709530115128, |
| "reward_after_std": 0.5573885068297386, |
| "reward_before_mean": 0.4244079850614071, |
| "reward_before_std": 0.48719789227470756, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.40837226063013077, |
| "reward_change_min": -0.630510251969099, |
| "reward_change_std": 0.253003865480423, |
| "reward_std": 0.5573885291814804, |
| "rewards/accuracy_reward": 0.3541666716337204, |
| "rewards/cosine_scaled_reward": 0.07024132460355759, |
| "step": 282 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2882.5416946411133, |
| "epoch": 0.32342857142857145, |
| "grad_norm": 0.05982055515050888, |
| "kl": 1.1865049600601196e-05, |
| "lambda_div_used": 0.6191478446125984, |
| "learning_rate": 5.28017603591974e-07, |
| "loss": -0.015, |
| "reward": 0.10000502690672874, |
| "reward_after_mean": 0.10000502690672874, |
| "reward_after_std": 0.6287181153893471, |
| "reward_before_mean": 0.5279214177280664, |
| "reward_before_std": 0.5531216450035572, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4279164057224989, |
| "reward_change_min": -0.6583010666072369, |
| "reward_change_std": 0.2602456407621503, |
| "reward_std": 0.6287181228399277, |
| "rewards/accuracy_reward": 0.37500000931322575, |
| "rewards/cosine_scaled_reward": 0.1529214084148407, |
| "step": 283 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2187.083381652832, |
| "epoch": 0.32457142857142857, |
| "grad_norm": 0.20831985771656036, |
| "kl": 5.378853529691696e-05, |
| "lambda_div_used": 0.617803268134594, |
| "learning_rate": 5.248803227530763e-07, |
| "loss": 0.0301, |
| "reward": -0.09672576747834682, |
| "reward_after_mean": -0.09672576747834682, |
| "reward_after_std": 0.5820088647305965, |
| "reward_before_mean": 0.21344758570194244, |
| "reward_before_std": 0.544846129603684, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3101733736693859, |
| "reward_change_min": -0.5270103476941586, |
| "reward_change_std": 0.20029443874955177, |
| "reward_std": 0.5820088759064674, |
| "rewards/accuracy_reward": 0.22916667349636555, |
| "rewards/cosine_scaled_reward": -0.0157190952450037, |
| "step": 284 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2571.4583435058594, |
| "epoch": 0.32571428571428573, |
| "grad_norm": 0.05750874802470207, |
| "kl": 2.5823712348937988e-05, |
| "lambda_div_used": 0.5412982106208801, |
| "learning_rate": 5.21744266211809e-07, |
| "loss": 0.0429, |
| "reward": -0.3610886335372925, |
| "reward_after_mean": -0.3610886335372925, |
| "reward_after_std": 0.2963402010500431, |
| "reward_before_mean": 0.004524916410446167, |
| "reward_before_std": 0.18141429405659437, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.36561354249715805, |
| "reward_change_min": -0.5212092585861683, |
| "reward_change_std": 0.19485185854136944, |
| "reward_std": 0.29634021408855915, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/cosine_scaled_reward": -0.12047509290277958, |
| "step": 285 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2607.625030517578, |
| "epoch": 0.32685714285714285, |
| "grad_norm": 0.06422320753335953, |
| "kl": 2.036895602941513e-05, |
| "lambda_div_used": 0.593349277973175, |
| "learning_rate": 5.186095868151436e-07, |
| "loss": 0.0069, |
| "reward": -0.1434864792972803, |
| "reward_after_mean": -0.1434864792972803, |
| "reward_after_std": 0.4716298431158066, |
| "reward_before_mean": 0.18680993653833866, |
| "reward_before_std": 0.4280705275014043, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3302964009344578, |
| "reward_change_min": -0.5444469898939133, |
| "reward_change_std": 0.20489494875073433, |
| "reward_std": 0.4716298636049032, |
| "rewards/accuracy_reward": 0.2291666753590107, |
| "rewards/cosine_scaled_reward": -0.04235674999654293, |
| "step": 286 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2068.4375343322754, |
| "epoch": 0.328, |
| "grad_norm": 0.11217836290597916, |
| "kl": 3.282725811004639e-05, |
| "lambda_div_used": 0.576636016368866, |
| "learning_rate": 5.154764373429315e-07, |
| "loss": -0.0127, |
| "reward": -0.015125550329685211, |
| "reward_after_mean": -0.015125550329685211, |
| "reward_after_std": 0.44022079929709435, |
| "reward_before_mean": 0.4338516741991043, |
| "reward_before_std": 0.349846001714468, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4489772208034992, |
| "reward_change_min": -0.6400703266263008, |
| "reward_change_std": 0.25500839948654175, |
| "reward_std": 0.44022080302238464, |
| "rewards/accuracy_reward": 0.3125000074505806, |
| "rewards/cosine_scaled_reward": 0.12135165929794312, |
| "step": 287 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3021.1666870117188, |
| "epoch": 0.3291428571428571, |
| "grad_norm": 0.05326732248067856, |
| "kl": 2.1582163753919303e-05, |
| "lambda_div_used": 0.5814583152532578, |
| "learning_rate": 5.123449705004581e-07, |
| "loss": 0.0215, |
| "reward": -0.23482287488877773, |
| "reward_after_mean": -0.23482287488877773, |
| "reward_after_std": 0.4797380156815052, |
| "reward_before_mean": 0.10618079453706741, |
| "reward_before_std": 0.37177785113453865, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.34100368432700634, |
| "reward_change_min": -0.5246531553566456, |
| "reward_change_std": 0.190623770467937, |
| "reward_std": 0.47973802499473095, |
| "rewards/accuracy_reward": 0.18750000186264515, |
| "rewards/cosine_scaled_reward": -0.08131920825690031, |
| "step": 288 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2210.0208435058594, |
| "epoch": 0.3302857142857143, |
| "grad_norm": 0.0918356254696846, |
| "kl": 3.3445656299591064e-05, |
| "lambda_div_used": 0.5579611882567406, |
| "learning_rate": 5.09215338910999e-07, |
| "loss": 0.0308, |
| "reward": -0.1890019178390503, |
| "reward_after_mean": -0.1890019178390503, |
| "reward_after_std": 0.38851393200457096, |
| "reward_before_mean": 0.2269407268613577, |
| "reward_before_std": 0.2580757178366184, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4159426633268595, |
| "reward_change_min": -0.5701811872422695, |
| "reward_change_std": 0.21641669981181622, |
| "reward_std": 0.38851393945515156, |
| "rewards/accuracy_reward": 0.27083333395421505, |
| "rewards/cosine_scaled_reward": -0.04389259871095419, |
| "step": 289 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1820.6666870117188, |
| "epoch": 0.3314285714285714, |
| "grad_norm": 0.10334110260009766, |
| "kl": 1.7639249563217163e-05, |
| "lambda_div_used": 0.649879202246666, |
| "learning_rate": 5.060876951083828e-07, |
| "loss": -0.0389, |
| "reward": 0.04819735325872898, |
| "reward_after_mean": 0.04819735325872898, |
| "reward_after_std": 0.752808591350913, |
| "reward_before_mean": 0.39133079699240625, |
| "reward_before_std": 0.7000208692625165, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3431334514170885, |
| "reward_change_min": -0.5988858677446842, |
| "reward_change_std": 0.2255254928022623, |
| "reward_std": 0.7528086155653, |
| "rewards/accuracy_reward": 0.3333333358168602, |
| "rewards/cosine_scaled_reward": 0.05799746699631214, |
| "step": 290 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2717.166732788086, |
| "epoch": 0.3325714285714286, |
| "grad_norm": 0.07149659842252731, |
| "kl": 9.842216968536377e-06, |
| "lambda_div_used": 0.6230586618185043, |
| "learning_rate": 5.02962191529556e-07, |
| "loss": 0.0634, |
| "reward": 0.04544975981116295, |
| "reward_after_mean": 0.04544975981116295, |
| "reward_after_std": 0.6337935384362936, |
| "reward_before_mean": 0.4299341347068548, |
| "reward_before_std": 0.5707205794751644, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.38448438234627247, |
| "reward_change_min": -0.6367709413170815, |
| "reward_change_std": 0.2398422248661518, |
| "reward_std": 0.6337935607880354, |
| "rewards/accuracy_reward": 0.33333333767950535, |
| "rewards/cosine_scaled_reward": 0.09660080214962363, |
| "step": 291 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3176.979202270508, |
| "epoch": 0.33371428571428574, |
| "grad_norm": 0.05150657147169113, |
| "kl": 2.0240433514118195e-05, |
| "lambda_div_used": 0.5560602247714996, |
| "learning_rate": 4.998389805071536e-07, |
| "loss": 0.062, |
| "reward": -0.4294139966368675, |
| "reward_after_mean": -0.4294139966368675, |
| "reward_after_std": 0.3452510107308626, |
| "reward_before_mean": -0.15922172274440527, |
| "reward_before_std": 0.2578093442134559, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27019229158759117, |
| "reward_change_min": -0.42562897875905037, |
| "reward_change_std": 0.15438843425363302, |
| "reward_std": 0.3452510181814432, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.20088838692754507, |
| "step": 292 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2630.9791946411133, |
| "epoch": 0.33485714285714285, |
| "grad_norm": 0.05954331159591675, |
| "kl": 1.2833625078201294e-05, |
| "lambda_div_used": 0.5555919855833054, |
| "learning_rate": 4.967182142620745e-07, |
| "loss": 0.0009, |
| "reward": -0.19852645695209503, |
| "reward_after_mean": -0.19852645695209503, |
| "reward_after_std": 0.3515181578695774, |
| "reward_before_mean": 0.20652466267347336, |
| "reward_before_std": 0.24862384609878063, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.40505112148821354, |
| "reward_change_min": -0.5747470110654831, |
| "reward_change_std": 0.22049889154732227, |
| "reward_std": 0.35151816345751286, |
| "rewards/accuracy_reward": 0.2291666716337204, |
| "rewards/cosine_scaled_reward": -0.022642023861408234, |
| "step": 293 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3005.5, |
| "epoch": 0.336, |
| "grad_norm": 0.06993379443883896, |
| "kl": 2.8274953365325928e-05, |
| "lambda_div_used": 0.5608418732881546, |
| "learning_rate": 4.93600044896063e-07, |
| "loss": -0.0248, |
| "reward": -0.3667301833629608, |
| "reward_after_mean": -0.3667301833629608, |
| "reward_after_std": 0.32350156269967556, |
| "reward_before_mean": -0.07280110754072666, |
| "reward_before_std": 0.2737845163792372, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.29392906464636326, |
| "reward_change_min": -0.4593726359307766, |
| "reward_change_std": 0.17168805841356516, |
| "reward_std": 0.3235015720129013, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.15613443590700626, |
| "step": 294 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3221.2291717529297, |
| "epoch": 0.33714285714285713, |
| "grad_norm": 0.06364311277866364, |
| "kl": 1.817569136619568e-05, |
| "lambda_div_used": 0.5979067236185074, |
| "learning_rate": 4.904846243842949e-07, |
| "loss": 0.0107, |
| "reward": -0.259306401014328, |
| "reward_after_mean": -0.259306401014328, |
| "reward_after_std": 0.5024331342428923, |
| "reward_before_mean": 0.018260781886056066, |
| "reward_before_std": 0.45087322127074003, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.277567183598876, |
| "reward_change_min": -0.41658853366971016, |
| "reward_change_std": 0.1643626783043146, |
| "reward_std": 0.5024331398308277, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/cosine_scaled_reward": -0.10673921927809715, |
| "step": 295 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3282.8750610351562, |
| "epoch": 0.3382857142857143, |
| "grad_norm": 0.0698896273970604, |
| "kl": 2.3877248167991638e-05, |
| "lambda_div_used": 0.5625706240534782, |
| "learning_rate": 4.873721045679706e-07, |
| "loss": 0.0079, |
| "reward": -0.39721857756376266, |
| "reward_after_mean": -0.39721857756376266, |
| "reward_after_std": 0.34284412302076817, |
| "reward_before_mean": -0.1091517936438322, |
| "reward_before_std": 0.28544116113334894, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28806679882109165, |
| "reward_change_min": -0.4639837518334389, |
| "reward_change_std": 0.1706175785511732, |
| "reward_std": 0.3428441286087036, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.15081845596432686, |
| "step": 296 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3569.8541870117188, |
| "epoch": 0.3394285714285714, |
| "grad_norm": 0.04541197791695595, |
| "kl": 2.245139330625534e-05, |
| "lambda_div_used": 0.5649159774184227, |
| "learning_rate": 4.842626371469149e-07, |
| "loss": 0.0071, |
| "reward": -0.43140796944499016, |
| "reward_after_mean": -0.43140796944499016, |
| "reward_after_std": 0.3433863054960966, |
| "reward_before_mean": -0.17089181207120419, |
| "reward_before_std": 0.29400468710809946, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.26051616482436657, |
| "reward_change_min": -0.4413977116346359, |
| "reward_change_std": 0.15714262332767248, |
| "reward_std": 0.3433863129466772, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.2125584715977311, |
| "step": 297 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2834.479202270508, |
| "epoch": 0.3405714285714286, |
| "grad_norm": 0.07559877634048462, |
| "kl": 1.9371509552001953e-05, |
| "lambda_div_used": 0.5950812250375748, |
| "learning_rate": 4.811563736721829e-07, |
| "loss": -0.0217, |
| "reward": -0.13096300419420004, |
| "reward_after_mean": -0.13096300419420004, |
| "reward_after_std": 0.505608232691884, |
| "reward_before_mean": 0.22305661533027887, |
| "reward_before_std": 0.43375879526138306, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.354019645601511, |
| "reward_change_min": -0.5659407489001751, |
| "reward_change_std": 0.20670694950968027, |
| "reward_std": 0.5056082457304001, |
| "rewards/accuracy_reward": 0.29166667349636555, |
| "rewards/cosine_scaled_reward": -0.0686100497841835, |
| "step": 298 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3288.5833587646484, |
| "epoch": 0.3417142857142857, |
| "grad_norm": 0.05182049795985222, |
| "kl": 3.013480454683304e-05, |
| "lambda_div_used": 0.5971293970942497, |
| "learning_rate": 4.780534655386743e-07, |
| "loss": -0.0035, |
| "reward": -0.1649590004235506, |
| "reward_after_mean": -0.1649590004235506, |
| "reward_after_std": 0.476910138502717, |
| "reward_before_mean": 0.15552489459514618, |
| "reward_before_std": 0.44229450821876526, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3204838838428259, |
| "reward_change_min": -0.5031619034707546, |
| "reward_change_std": 0.19439208041876554, |
| "reward_std": 0.47691015154123306, |
| "rewards/accuracy_reward": 0.20833334140479565, |
| "rewards/cosine_scaled_reward": -0.052808452397584915, |
| "step": 299 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3455.3958435058594, |
| "epoch": 0.34285714285714286, |
| "grad_norm": 0.059302717447280884, |
| "kl": 3.039836883544922e-05, |
| "lambda_div_used": 0.5659952610731125, |
| "learning_rate": 4.749540639777539e-07, |
| "loss": 0.0631, |
| "reward": -0.4138263203203678, |
| "reward_after_mean": -0.4138263203203678, |
| "reward_after_std": 0.35950295627117157, |
| "reward_before_mean": -0.14833886176347733, |
| "reward_before_std": 0.30170741491019726, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.26548744924366474, |
| "reward_change_min": -0.45029690116643906, |
| "reward_change_std": 0.16043098457157612, |
| "reward_std": 0.35950295627117157, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.19000552780926228, |
| "step": 300 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2581.2917404174805, |
| "epoch": 0.344, |
| "grad_norm": 0.0829557478427887, |
| "kl": 1.7702579498291016e-05, |
| "lambda_div_used": 0.6020706444978714, |
| "learning_rate": 4.7185832004988133e-07, |
| "loss": 0.0033, |
| "reward": -0.2905968725681305, |
| "reward_after_mean": -0.2905968725681305, |
| "reward_after_std": 0.522302333265543, |
| "reward_before_mean": -0.03232934419065714, |
| "reward_before_std": 0.47291796933859587, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2582675274461508, |
| "reward_change_min": -0.4681916609406471, |
| "reward_change_std": 0.16694430727511644, |
| "reward_std": 0.5223023407161236, |
| "rewards/accuracy_reward": 0.10416666977107525, |
| "rewards/cosine_scaled_reward": -0.13649601582437754, |
| "step": 301 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2299.250026702881, |
| "epoch": 0.34514285714285714, |
| "grad_norm": 0.09259702265262604, |
| "kl": 2.473965287208557e-05, |
| "lambda_div_used": 0.6064160838723183, |
| "learning_rate": 4.68766384637248e-07, |
| "loss": 0.0049, |
| "reward": 0.1023632986471057, |
| "reward_after_mean": 0.1023632986471057, |
| "reward_after_std": 0.5881610047072172, |
| "reward_before_mean": 0.5643375236541033, |
| "reward_before_std": 0.48729276517406106, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.46197423338890076, |
| "reward_change_min": -0.6793596595525742, |
| "reward_change_std": 0.2643775464966893, |
| "reward_std": 0.588161014020443, |
| "rewards/accuracy_reward": 0.39583334513008595, |
| "rewards/cosine_scaled_reward": 0.16850417526438832, |
| "step": 302 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2496.604202270508, |
| "epoch": 0.3462857142857143, |
| "grad_norm": 0.10342813283205032, |
| "kl": 2.822279930114746e-05, |
| "lambda_div_used": 0.6173663139343262, |
| "learning_rate": 4.656784084364238e-07, |
| "loss": -0.0508, |
| "reward": -0.21569269057363272, |
| "reward_after_mean": -0.21569269057363272, |
| "reward_after_std": 0.5887450613081455, |
| "reward_before_mean": 0.048040480352938175, |
| "reward_before_std": 0.5433634500950575, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2637331634759903, |
| "reward_change_min": -0.4554433934390545, |
| "reward_change_std": 0.17205023765563965, |
| "reward_std": 0.5887450724840164, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.09779285965487361, |
| "step": 303 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2828.375030517578, |
| "epoch": 0.3474285714285714, |
| "grad_norm": 0.06273287534713745, |
| "kl": 3.515370190143585e-05, |
| "lambda_div_used": 0.5850896239280701, |
| "learning_rate": 4.6259454195101267e-07, |
| "loss": 0.0395, |
| "reward": -0.351226020604372, |
| "reward_after_mean": -0.351226020604372, |
| "reward_after_std": 0.4407376032322645, |
| "reward_before_mean": -0.091004628688097, |
| "reward_before_std": 0.39247662480920553, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2602214030921459, |
| "reward_change_min": -0.4692201763391495, |
| "reward_change_std": 0.16711712814867496, |
| "reward_std": 0.4407376106828451, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.1743379645049572, |
| "step": 304 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2963.270835876465, |
| "epoch": 0.3485714285714286, |
| "grad_norm": 0.07068406045436859, |
| "kl": 3.521144390106201e-05, |
| "lambda_div_used": 0.5555498078465462, |
| "learning_rate": 4.59514935484316e-07, |
| "loss": 0.0077, |
| "reward": -0.44899775832891464, |
| "reward_after_mean": -0.44899775832891464, |
| "reward_after_std": 0.32930242642760277, |
| "reward_before_mean": -0.17746215965598822, |
| "reward_before_std": 0.2488851365633309, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27153559774160385, |
| "reward_change_min": -0.41173194721341133, |
| "reward_change_std": 0.14773181919008493, |
| "reward_std": 0.32930243387818336, |
| "rewards/accuracy_reward": 0.02083333395421505, |
| "rewards/cosine_scaled_reward": -0.19829548941925168, |
| "step": 305 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2542.8750228881836, |
| "epoch": 0.3497142857142857, |
| "grad_norm": 0.09160702675580978, |
| "kl": 2.777576446533203e-05, |
| "lambda_div_used": 0.5710241869091988, |
| "learning_rate": 4.5643973913200837e-07, |
| "loss": 0.001, |
| "reward": -0.28438286297023296, |
| "reward_after_mean": -0.28438286297023296, |
| "reward_after_std": 0.45469350554049015, |
| "reward_before_mean": 0.05048683221684769, |
| "reward_before_std": 0.3197702756151557, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.33486970886588097, |
| "reward_change_min": -0.46306542679667473, |
| "reward_change_std": 0.1733616916462779, |
| "reward_std": 0.45469350926578045, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.11617984622716904, |
| "step": 306 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2358.9167098999023, |
| "epoch": 0.35085714285714287, |
| "grad_norm": 0.08308543264865875, |
| "kl": 2.5270506739616394e-05, |
| "lambda_div_used": 0.6283661872148514, |
| "learning_rate": 4.5336910277482155e-07, |
| "loss": 0.0233, |
| "reward": -0.023870151489973068, |
| "reward_after_mean": -0.023870151489973068, |
| "reward_after_std": 0.5968872811645269, |
| "reward_before_mean": 0.2986760139465332, |
| "reward_before_std": 0.602552474476397, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3225461672991514, |
| "reward_change_min": -0.5788566246628761, |
| "reward_change_std": 0.22817020770162344, |
| "reward_std": 0.5968873165547848, |
| "rewards/accuracy_reward": 0.2916666716337204, |
| "rewards/cosine_scaled_reward": 0.007009351626038551, |
| "step": 307 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3364.187530517578, |
| "epoch": 0.352, |
| "grad_norm": 0.04729427769780159, |
| "kl": 1.2226402759552002e-05, |
| "lambda_div_used": 0.5575956255197525, |
| "learning_rate": 4.503031760712397e-07, |
| "loss": -0.0147, |
| "reward": -0.364587739109993, |
| "reward_after_mean": -0.364587739109993, |
| "reward_after_std": 0.3172164801508188, |
| "reward_before_mean": -0.06630371138453484, |
| "reward_before_std": 0.25998193118721247, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.29828402772545815, |
| "reward_change_min": -0.4606154337525368, |
| "reward_change_std": 0.17100436985492706, |
| "reward_std": 0.31721648946404457, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.14963705092668533, |
| "step": 308 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3246.750030517578, |
| "epoch": 0.35314285714285715, |
| "grad_norm": 0.054204076528549194, |
| "kl": 1.4858320355415344e-05, |
| "lambda_div_used": 0.6364770829677582, |
| "learning_rate": 4.4724210845020494e-07, |
| "loss": 0.0936, |
| "reward": -0.09930397570133209, |
| "reward_after_mean": -0.09930397570133209, |
| "reward_after_std": 0.6563027147203684, |
| "reward_before_mean": 0.18065341375768185, |
| "reward_before_std": 0.6305609010159969, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27995740808546543, |
| "reward_change_min": -0.5127242244780064, |
| "reward_change_std": 0.19019456766545773, |
| "reward_std": 0.6563027296215296, |
| "rewards/accuracy_reward": 0.2083333358168602, |
| "rewards/cosine_scaled_reward": -0.027679930441081524, |
| "step": 309 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2454.750015258789, |
| "epoch": 0.35428571428571426, |
| "grad_norm": 0.10188445448875427, |
| "kl": 3.9263395592570305e-05, |
| "lambda_div_used": 0.5863905549049377, |
| "learning_rate": 4.441860491038345e-07, |
| "loss": -0.0091, |
| "reward": -0.358197920024395, |
| "reward_after_mean": -0.358197920024395, |
| "reward_after_std": 0.4752412661910057, |
| "reward_before_mean": -0.10783447185531259, |
| "reward_before_std": 0.39106855262070894, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.25036344304680824, |
| "reward_change_min": -0.36833222955465317, |
| "reward_change_std": 0.13517011515796185, |
| "reward_std": 0.47524126805365086, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/cosine_scaled_reward": -0.17033447965513915, |
| "step": 310 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2549.5000228881836, |
| "epoch": 0.3554285714285714, |
| "grad_norm": 0.07070475816726685, |
| "kl": 1.1476688086986542e-05, |
| "lambda_div_used": 0.6028245091438293, |
| "learning_rate": 4.4113514698014953e-07, |
| "loss": -0.0234, |
| "reward": -0.07257996033877134, |
| "reward_after_mean": -0.07257996033877134, |
| "reward_after_std": 0.5427698846906424, |
| "reward_before_mean": 0.32260693423449993, |
| "reward_before_std": 0.4735551681369543, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3951868824660778, |
| "reward_change_min": -0.6274620294570923, |
| "reward_change_std": 0.2447280865162611, |
| "reward_std": 0.5427699014544487, |
| "rewards/accuracy_reward": 0.3125000037252903, |
| "rewards/cosine_scaled_reward": 0.010106915608048439, |
| "step": 311 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2062.020851135254, |
| "epoch": 0.3565714285714286, |
| "grad_norm": 0.0864594504237175, |
| "kl": 2.2163614630699158e-05, |
| "lambda_div_used": 0.5457132831215858, |
| "learning_rate": 4.3808955077581546e-07, |
| "loss": 0.0122, |
| "reward": -0.03406492620706558, |
| "reward_after_mean": -0.03406492620706558, |
| "reward_after_std": 0.3851391561329365, |
| "reward_before_mean": 0.5170513242483139, |
| "reward_before_std": 0.20092127658426762, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5511162541806698, |
| "reward_change_min": -0.7572819888591766, |
| "reward_change_std": 0.2855915669351816, |
| "reward_std": 0.38513917103409767, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/cosine_scaled_reward": 0.1420513167977333, |
| "step": 312 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2939.958351135254, |
| "epoch": 0.3577142857142857, |
| "grad_norm": 0.06731264293193817, |
| "kl": 2.4283304810523987e-05, |
| "lambda_div_used": 0.5877447426319122, |
| "learning_rate": 4.350494089288943e-07, |
| "loss": -0.0439, |
| "reward": -0.15467195864766836, |
| "reward_after_mean": -0.15467195864766836, |
| "reward_after_std": 0.522945849224925, |
| "reward_before_mean": 0.20540180057287216, |
| "reward_before_std": 0.40051606576889753, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3600737862288952, |
| "reward_change_min": -0.5124829597771168, |
| "reward_change_std": 0.19857864920049906, |
| "reward_std": 0.5229458846151829, |
| "rewards/accuracy_reward": 0.2291666679084301, |
| "rewards/cosine_scaled_reward": -0.02376485476270318, |
| "step": 313 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2541.18754196167, |
| "epoch": 0.3588571428571429, |
| "grad_norm": 0.08012691885232925, |
| "kl": 3.474205732345581e-05, |
| "lambda_div_used": 0.607881672680378, |
| "learning_rate": 4.3201486961161093e-07, |
| "loss": 0.0168, |
| "reward": -0.20861644856631756, |
| "reward_after_mean": -0.20861644856631756, |
| "reward_after_std": 0.5089061111211777, |
| "reward_before_mean": 0.08002243563532829, |
| "reward_before_std": 0.49900877848267555, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28863888792693615, |
| "reward_change_min": -0.548176895827055, |
| "reward_change_std": 0.2009973768144846, |
| "reward_std": 0.5089061167091131, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.08664422971196473, |
| "step": 314 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3124.3333740234375, |
| "epoch": 0.36, |
| "grad_norm": 0.0558871328830719, |
| "kl": 1.9735191017389297e-05, |
| "lambda_div_used": 0.5734824016690254, |
| "learning_rate": 4.2898608072313045e-07, |
| "loss": 0.0494, |
| "reward": -0.1729298224672675, |
| "reward_after_mean": -0.1729298224672675, |
| "reward_after_std": 0.45045475475490093, |
| "reward_before_mean": 0.20758753083646297, |
| "reward_before_std": 0.33522335812449455, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3805173486471176, |
| "reward_change_min": -0.5440769977867603, |
| "reward_change_std": 0.20876472163945436, |
| "reward_std": 0.4504547640681267, |
| "rewards/accuracy_reward": 0.22916666977107525, |
| "rewards/cosine_scaled_reward": -0.02157914894632995, |
| "step": 315 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3509.5416870117188, |
| "epoch": 0.36114285714285715, |
| "grad_norm": 0.05331319198012352, |
| "kl": 2.981536090373993e-05, |
| "lambda_div_used": 0.5893311724066734, |
| "learning_rate": 4.2596318988235037e-07, |
| "loss": 0.0135, |
| "reward": -0.22396450489759445, |
| "reward_after_mean": -0.22396450489759445, |
| "reward_after_std": 0.43868801929056644, |
| "reward_before_mean": 0.07456053979694843, |
| "reward_before_std": 0.414919788017869, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.29852502048015594, |
| "reward_change_min": -0.4823254942893982, |
| "reward_change_std": 0.1912739286199212, |
| "reward_std": 0.43868803791701794, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.09210613742470741, |
| "step": 316 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3099.5833435058594, |
| "epoch": 0.36228571428571427, |
| "grad_norm": 0.06200157478451729, |
| "kl": 3.091990947723389e-05, |
| "lambda_div_used": 0.5825675800442696, |
| "learning_rate": 4.2294634442070553e-07, |
| "loss": 0.0437, |
| "reward": -0.35854507237672806, |
| "reward_after_mean": -0.35854507237672806, |
| "reward_after_std": 0.43180895783007145, |
| "reward_before_mean": -0.102005859836936, |
| "reward_before_std": 0.3769839182496071, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.25653921999037266, |
| "reward_change_min": -0.4170425795018673, |
| "reward_change_std": 0.15367608424276114, |
| "reward_std": 0.4318089634180069, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/cosine_scaled_reward": -0.18533920124173164, |
| "step": 317 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2281.000068664551, |
| "epoch": 0.36342857142857143, |
| "grad_norm": 0.0932127833366394, |
| "kl": 3.602728247642517e-05, |
| "lambda_div_used": 0.6223095878958702, |
| "learning_rate": 4.1993569137498776e-07, |
| "loss": 0.0631, |
| "reward": -0.07363937655463815, |
| "reward_after_mean": -0.07363937655463815, |
| "reward_after_std": 0.5691333152353764, |
| "reward_before_mean": 0.24711408838629723, |
| "reward_before_std": 0.5656038168817759, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3207534924149513, |
| "reward_change_min": -0.5814453661441803, |
| "reward_change_std": 0.2219030074775219, |
| "reward_std": 0.569133322685957, |
| "rewards/accuracy_reward": 0.2500000074505806, |
| "rewards/cosine_scaled_reward": -0.0028859074227511883, |
| "step": 318 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2854.5625076293945, |
| "epoch": 0.36457142857142855, |
| "grad_norm": 0.08494079858064651, |
| "kl": 2.7507543563842773e-05, |
| "lambda_div_used": 0.5380512997508049, |
| "learning_rate": 4.1693137748017915e-07, |
| "loss": -0.0389, |
| "reward": -0.4965253435075283, |
| "reward_after_mean": -0.4965253435075283, |
| "reward_after_std": 0.2333353590220213, |
| "reward_before_mean": -0.2120634987950325, |
| "reward_before_std": 0.16711975168436766, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28446184657514095, |
| "reward_change_min": -0.4124374948441982, |
| "reward_change_std": 0.15511877462267876, |
| "reward_std": 0.2333353627473116, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/cosine_scaled_reward": -0.2120634950697422, |
| "step": 319 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1964.8958740234375, |
| "epoch": 0.3657142857142857, |
| "grad_norm": 0.09068436175584793, |
| "kl": 3.33394855260849e-05, |
| "lambda_div_used": 0.6160280704498291, |
| "learning_rate": 4.1393354916230005e-07, |
| "loss": 0.0117, |
| "reward": -0.19740011962130666, |
| "reward_after_mean": -0.19740011962130666, |
| "reward_after_std": 0.5963947810232639, |
| "reward_before_mean": 0.0693174353800714, |
| "reward_before_std": 0.5336382519453764, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.26671755872666836, |
| "reward_change_min": -0.4143567681312561, |
| "reward_change_std": 0.15817437414079905, |
| "reward_std": 0.5963947977870703, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/cosine_scaled_reward": -0.05568256159313023, |
| "step": 320 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1974.8958778381348, |
| "epoch": 0.3668571428571429, |
| "grad_norm": 0.0693729817867279, |
| "kl": 1.574307680130005e-05, |
| "lambda_div_used": 0.6290735602378845, |
| "learning_rate": 4.1094235253127374e-07, |
| "loss": 0.0387, |
| "reward": 0.11023577488958836, |
| "reward_after_mean": 0.11023577488958836, |
| "reward_after_std": 0.6442763805389404, |
| "reward_before_mean": 0.5218545235693455, |
| "reward_before_std": 0.6013733670115471, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4116187300533056, |
| "reward_change_min": -0.6836260408163071, |
| "reward_change_std": 0.2705372450873256, |
| "reward_std": 0.6442763898521662, |
| "rewards/accuracy_reward": 0.3958333395421505, |
| "rewards/cosine_scaled_reward": 0.12602117005735636, |
| "step": 321 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2837.833381652832, |
| "epoch": 0.368, |
| "grad_norm": 0.10218925029039383, |
| "kl": 4.9501657485961914e-05, |
| "lambda_div_used": 0.6059064492583275, |
| "learning_rate": 4.079579333738039e-07, |
| "loss": -0.0178, |
| "reward": -0.2954628551378846, |
| "reward_after_mean": -0.2954628551378846, |
| "reward_after_std": 0.5665331184864044, |
| "reward_before_mean": -0.04816420469433069, |
| "reward_before_std": 0.48093545995652676, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.24729863554239273, |
| "reward_change_min": -0.3516420107334852, |
| "reward_change_std": 0.13184416200965643, |
| "reward_std": 0.5665331222116947, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.13149754563346505, |
| "step": 322 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3015.666717529297, |
| "epoch": 0.36914285714285716, |
| "grad_norm": 0.0596209317445755, |
| "kl": 1.7877668142318726e-05, |
| "lambda_div_used": 0.5565166473388672, |
| "learning_rate": 4.0498043714627006e-07, |
| "loss": 0.0465, |
| "reward": -0.16177266091108322, |
| "reward_after_mean": -0.16177266091108322, |
| "reward_after_std": 0.42436218820512295, |
| "reward_before_mean": 0.2816486116498709, |
| "reward_before_std": 0.2578566027805209, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4434212874621153, |
| "reward_change_min": -0.6335049495100975, |
| "reward_change_std": 0.23841418512165546, |
| "reward_std": 0.4243622049689293, |
| "rewards/accuracy_reward": 0.2916666679084301, |
| "rewards/cosine_scaled_reward": -0.010018057189881802, |
| "step": 323 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2751.6458435058594, |
| "epoch": 0.3702857142857143, |
| "grad_norm": 0.10376841574907303, |
| "kl": 2.1306797862052917e-05, |
| "lambda_div_used": 0.5670785158872604, |
| "learning_rate": 4.020100089676376e-07, |
| "loss": -0.0038, |
| "reward": -0.10797288408502936, |
| "reward_after_mean": -0.10797288408502936, |
| "reward_after_std": 0.4300071895122528, |
| "reward_before_mean": 0.3295288155786693, |
| "reward_before_std": 0.3011532872915268, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.43750171549618244, |
| "reward_change_min": -0.6185614429414272, |
| "reward_change_std": 0.2333526872098446, |
| "reward_std": 0.43000719882547855, |
| "rewards/accuracy_reward": 0.25000000558793545, |
| "rewards/cosine_scaled_reward": 0.07952881557866931, |
| "step": 324 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2943.8958587646484, |
| "epoch": 0.37142857142857144, |
| "grad_norm": 0.07239633053541183, |
| "kl": 9.275972843170166e-06, |
| "lambda_div_used": 0.6356078162789345, |
| "learning_rate": 3.9904679361238526e-07, |
| "loss": 0.0667, |
| "reward": -0.050798285752534866, |
| "reward_after_mean": -0.050798285752534866, |
| "reward_after_std": 0.6440786644816399, |
| "reward_before_mean": 0.24087253957986832, |
| "reward_before_std": 0.6331603992730379, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2916708290576935, |
| "reward_change_min": -0.4932599440217018, |
| "reward_change_std": 0.20057004038244486, |
| "reward_std": 0.6440786886960268, |
| "rewards/accuracy_reward": 0.2708333395421505, |
| "rewards/cosine_scaled_reward": -0.02996080555021763, |
| "step": 325 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2375.645851135254, |
| "epoch": 0.37257142857142855, |
| "grad_norm": 0.09299691766500473, |
| "kl": 2.9705464839935303e-05, |
| "lambda_div_used": 0.5330315083265305, |
| "learning_rate": 3.9609093550344907e-07, |
| "loss": -0.0979, |
| "reward": -0.20862240344285965, |
| "reward_after_mean": -0.20862240344285965, |
| "reward_after_std": 0.3296848703175783, |
| "reward_before_mean": 0.2517512815538794, |
| "reward_before_std": 0.1450797226279974, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.460373692214489, |
| "reward_change_min": -0.6264396719634533, |
| "reward_change_std": 0.23280893173068762, |
| "reward_std": 0.3296848740428686, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/cosine_scaled_reward": 0.001751287141814828, |
| "step": 326 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2810.187511444092, |
| "epoch": 0.3737142857142857, |
| "grad_norm": 0.06210591271519661, |
| "kl": 8.400529623031616e-06, |
| "lambda_div_used": 0.5783005133271217, |
| "learning_rate": 3.931425787051832e-07, |
| "loss": -0.0483, |
| "reward": -0.11793380603194237, |
| "reward_after_mean": -0.11793380603194237, |
| "reward_after_std": 0.45353105291724205, |
| "reward_before_mean": 0.2935123089700937, |
| "reward_before_std": 0.3566841436550021, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.41144610941410065, |
| "reward_change_min": -0.6089312434196472, |
| "reward_change_std": 0.23675687983632088, |
| "reward_std": 0.4535310585051775, |
| "rewards/accuracy_reward": 0.2708333395421505, |
| "rewards/cosine_scaled_reward": 0.022678975947201252, |
| "step": 327 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3518.8958435058594, |
| "epoch": 0.37485714285714283, |
| "grad_norm": 0.05192619562149048, |
| "kl": 2.485513687133789e-05, |
| "lambda_div_used": 0.5715365409851074, |
| "learning_rate": 3.902018669163384e-07, |
| "loss": 0.0155, |
| "reward": -0.4497411046177149, |
| "reward_after_mean": -0.4497411046177149, |
| "reward_after_std": 0.40177739411592484, |
| "reward_before_mean": -0.21567542850971222, |
| "reward_before_std": 0.32284149527549744, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.23406569100916386, |
| "reward_change_min": -0.33814629539847374, |
| "reward_change_std": 0.12489900179207325, |
| "reward_std": 0.40177739597857, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.2573420889675617, |
| "step": 328 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2243.0416946411133, |
| "epoch": 0.376, |
| "grad_norm": 0.10950610786676407, |
| "kl": 2.469681203365326e-05, |
| "lambda_div_used": 0.6176783889532089, |
| "learning_rate": 3.872689434630585e-07, |
| "loss": -0.0562, |
| "reward": -0.013319691643118858, |
| "reward_after_mean": -0.013319691643118858, |
| "reward_after_std": 0.5674807205796242, |
| "reward_before_mean": 0.3346351385116577, |
| "reward_before_std": 0.5435493532568216, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.34795483760535717, |
| "reward_change_min": -0.5437430925667286, |
| "reward_change_std": 0.22110824659466743, |
| "reward_std": 0.5674807410687208, |
| "rewards/accuracy_reward": 0.27083334140479565, |
| "rewards/cosine_scaled_reward": 0.06380179291591048, |
| "step": 329 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2141.479190826416, |
| "epoch": 0.37714285714285717, |
| "grad_norm": 0.10051169991493225, |
| "kl": 4.89354133605957e-05, |
| "lambda_div_used": 0.5594945177435875, |
| "learning_rate": 3.843439512918949e-07, |
| "loss": -0.0064, |
| "reward": -0.43674849811941385, |
| "reward_after_mean": -0.43674849811941385, |
| "reward_after_std": 0.32430145144462585, |
| "reward_before_mean": -0.17029727809131145, |
| "reward_before_std": 0.2681279256939888, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.266451220959425, |
| "reward_change_min": -0.44791119545698166, |
| "reward_change_std": 0.1575082140043378, |
| "reward_std": 0.32430145516991615, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.2119639493757859, |
| "step": 330 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2564.604202270508, |
| "epoch": 0.3782857142857143, |
| "grad_norm": 0.12382425367832184, |
| "kl": 4.225596785545349e-05, |
| "lambda_div_used": 0.5574543103575706, |
| "learning_rate": 3.8142703296283953e-07, |
| "loss": 0.0413, |
| "reward": -0.41342686861753464, |
| "reward_after_mean": -0.41342686861753464, |
| "reward_after_std": 0.33145094849169254, |
| "reward_before_mean": -0.13134576752781868, |
| "reward_before_std": 0.25930391903966665, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2820810880511999, |
| "reward_change_min": -0.4496995583176613, |
| "reward_change_std": 0.16215670108795166, |
| "reward_std": 0.33145095966756344, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.17301243357360363, |
| "step": 331 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2420.6666870117188, |
| "epoch": 0.37942857142857145, |
| "grad_norm": 0.07938601076602936, |
| "kl": 2.1241605281829834e-05, |
| "lambda_div_used": 0.5416555106639862, |
| "learning_rate": 3.785183306423767e-07, |
| "loss": 0.0287, |
| "reward": -0.21761326864361763, |
| "reward_after_mean": -0.21761326864361763, |
| "reward_after_std": 0.34953613951802254, |
| "reward_before_mean": 0.2352729644626379, |
| "reward_before_std": 0.18298226408660412, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.45288625732064247, |
| "reward_change_min": -0.6215734221041203, |
| "reward_change_std": 0.2345103258267045, |
| "reward_std": 0.34953614324331284, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/cosine_scaled_reward": -0.014727018773555756, |
| "step": 332 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2350.6875381469727, |
| "epoch": 0.38057142857142856, |
| "grad_norm": 0.08657004684209824, |
| "kl": 2.1725893020629883e-05, |
| "lambda_div_used": 0.6729468256235123, |
| "learning_rate": 3.7561798609655373e-07, |
| "loss": 0.0369, |
| "reward": -0.01155824027955532, |
| "reward_after_mean": -0.01155824027955532, |
| "reward_after_std": 0.8104779217392206, |
| "reward_before_mean": 0.23989354074001312, |
| "reward_before_std": 0.8117707390338182, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2514517717063427, |
| "reward_change_min": -0.5040576457977295, |
| "reward_change_std": 0.19348772894591093, |
| "reward_std": 0.810477938503027, |
| "rewards/accuracy_reward": 0.2708333395421505, |
| "rewards/cosine_scaled_reward": -0.030939804390072823, |
| "step": 333 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3491.6041870117188, |
| "epoch": 0.38171428571428573, |
| "grad_norm": 0.05208956450223923, |
| "kl": 1.1973665095865726e-05, |
| "lambda_div_used": 0.5869207382202148, |
| "learning_rate": 3.72726140684072e-07, |
| "loss": -0.009, |
| "reward": -0.2944636158645153, |
| "reward_after_mean": -0.2944636158645153, |
| "reward_after_std": 0.43844909220933914, |
| "reward_before_mean": -0.013160821050405502, |
| "reward_before_std": 0.40495526185259223, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2813027612864971, |
| "reward_change_min": -0.4717176593840122, |
| "reward_change_std": 0.1835445323958993, |
| "reward_std": 0.4384490940719843, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/cosine_scaled_reward": -0.13816083781421185, |
| "step": 334 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2589.0833892822266, |
| "epoch": 0.38285714285714284, |
| "grad_norm": 0.0683097094297409, |
| "kl": 1.9595026969909668e-05, |
| "lambda_div_used": 0.6027273386716843, |
| "learning_rate": 3.6984293534939737e-07, |
| "loss": -0.0488, |
| "reward": 0.005996011197566986, |
| "reward_after_mean": 0.005996011197566986, |
| "reward_after_std": 0.5422599408775568, |
| "reward_before_mean": 0.41999348998069763, |
| "reward_before_std": 0.4701922629028559, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.41399746760725975, |
| "reward_change_min": -0.623339332640171, |
| "reward_change_std": 0.24341293703764677, |
| "reward_std": 0.5422599650919437, |
| "rewards/accuracy_reward": 0.33333334140479565, |
| "rewards/cosine_scaled_reward": 0.08666014112532139, |
| "step": 335 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3162.1458740234375, |
| "epoch": 0.384, |
| "grad_norm": 0.067026786506176, |
| "kl": 2.0024715922772884e-05, |
| "lambda_div_used": 0.5950900241732597, |
| "learning_rate": 3.6696851061588994e-07, |
| "loss": 0.0138, |
| "reward": -0.16571878641843796, |
| "reward_after_mean": -0.16571878641843796, |
| "reward_after_std": 0.4802239239215851, |
| "reward_before_mean": 0.14471458829939365, |
| "reward_before_std": 0.4368621027097106, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.31043340265750885, |
| "reward_change_min": -0.45523249730467796, |
| "reward_change_std": 0.18632046319544315, |
| "reward_std": 0.48022393323481083, |
| "rewards/accuracy_reward": 0.20833334140479565, |
| "rewards/cosine_scaled_reward": -0.06361875729635358, |
| "step": 336 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3228.916717529297, |
| "epoch": 0.3851428571428571, |
| "grad_norm": 0.05528295040130615, |
| "kl": 1.9825994968414307e-05, |
| "lambda_div_used": 0.6006625667214394, |
| "learning_rate": 3.641030065789562e-07, |
| "loss": 0.0552, |
| "reward": -0.2433483125641942, |
| "reward_after_mean": -0.2433483125641942, |
| "reward_after_std": 0.5013067033141851, |
| "reward_before_mean": 0.03825543075799942, |
| "reward_before_std": 0.4624515902251005, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28160375356674194, |
| "reward_change_min": -0.4847262054681778, |
| "reward_change_std": 0.17836903873831034, |
| "reward_std": 0.501306714490056, |
| "rewards/accuracy_reward": 0.14583333767950535, |
| "rewards/cosine_scaled_reward": -0.10757789574563503, |
| "step": 337 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2393.645866394043, |
| "epoch": 0.3862857142857143, |
| "grad_norm": 0.10292567312717438, |
| "kl": 3.5159289836883545e-05, |
| "lambda_div_used": 0.6184682846069336, |
| "learning_rate": 3.612465628992203e-07, |
| "loss": 0.1082, |
| "reward": -0.010080082342028618, |
| "reward_after_mean": -0.010080082342028618, |
| "reward_after_std": 0.6298389285802841, |
| "reward_before_mean": 0.3685674872249365, |
| "reward_before_std": 0.546540604904294, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3786476030945778, |
| "reward_change_min": -0.5825354494154453, |
| "reward_change_std": 0.2281006295233965, |
| "reward_std": 0.6298389509320259, |
| "rewards/accuracy_reward": 0.3125000037252903, |
| "rewards/cosine_scaled_reward": 0.05606749979779124, |
| "step": 338 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3026.9583892822266, |
| "epoch": 0.38742857142857146, |
| "grad_norm": 0.0688738226890564, |
| "kl": 2.4460256099700928e-05, |
| "lambda_div_used": 0.5621443763375282, |
| "learning_rate": 3.5839931879571725e-07, |
| "loss": -0.0178, |
| "reward": -0.32898029685020447, |
| "reward_after_mean": -0.32898029685020447, |
| "reward_after_std": 0.3227359801530838, |
| "reward_before_mean": -0.01589403674006462, |
| "reward_before_std": 0.2781273443251848, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.31308628246188164, |
| "reward_change_min": -0.48185204714536667, |
| "reward_change_std": 0.1813461184501648, |
| "reward_std": 0.3227359913289547, |
| "rewards/accuracy_reward": 0.1041666716337204, |
| "rewards/cosine_scaled_reward": -0.12006070092320442, |
| "step": 339 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2447.937515258789, |
| "epoch": 0.38857142857142857, |
| "grad_norm": 0.10290948301553726, |
| "kl": 2.363821113249287e-05, |
| "lambda_div_used": 0.5592405423521996, |
| "learning_rate": 3.555614130391079e-07, |
| "loss": -0.0046, |
| "reward": -0.42997122183442116, |
| "reward_after_mean": -0.42997122183442116, |
| "reward_after_std": 0.33643546886742115, |
| "reward_before_mean": -0.1512418081983924, |
| "reward_before_std": 0.2663228642195463, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2787294201552868, |
| "reward_change_min": -0.4129678010940552, |
| "reward_change_std": 0.1523351836949587, |
| "reward_std": 0.3364354781806469, |
| "rewards/accuracy_reward": 0.02083333395421505, |
| "rewards/cosine_scaled_reward": -0.17207514215260744, |
| "step": 340 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2488.9583587646484, |
| "epoch": 0.38971428571428574, |
| "grad_norm": 0.061042170971632004, |
| "kl": 1.909211277961731e-05, |
| "lambda_div_used": 0.5834595933556557, |
| "learning_rate": 3.5273298394491515e-07, |
| "loss": -0.0023, |
| "reward": 0.009830114664509892, |
| "reward_after_mean": 0.009830114664509892, |
| "reward_after_std": 0.5199873205274343, |
| "reward_before_mean": 0.4715769328176975, |
| "reward_before_std": 0.38039534725248814, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.46174679696559906, |
| "reward_change_min": -0.6357490979135036, |
| "reward_change_std": 0.2529716519638896, |
| "reward_std": 0.5199873335659504, |
| "rewards/accuracy_reward": 0.33333334140479565, |
| "rewards/cosine_scaled_reward": 0.13824361143633723, |
| "step": 341 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3020.520854949951, |
| "epoch": 0.39085714285714285, |
| "grad_norm": 0.07325860857963562, |
| "kl": 2.4762004613876343e-05, |
| "lambda_div_used": 0.5985761880874634, |
| "learning_rate": 3.4991416936678276e-07, |
| "loss": 0.0007, |
| "reward": -0.06264204811304808, |
| "reward_after_mean": -0.06264204811304808, |
| "reward_after_std": 0.5387034490704536, |
| "reward_before_mean": 0.33080895524472, |
| "reward_before_std": 0.45346040464937687, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3934510052204132, |
| "reward_change_min": -0.5952773988246918, |
| "reward_change_std": 0.23232300952076912, |
| "reward_std": 0.5387034583836794, |
| "rewards/accuracy_reward": 0.29166666977107525, |
| "rewards/cosine_scaled_reward": 0.03914228640496731, |
| "step": 342 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3405.1875610351562, |
| "epoch": 0.392, |
| "grad_norm": 0.04665439575910568, |
| "kl": 2.7433037757873535e-05, |
| "lambda_div_used": 0.6150833070278168, |
| "learning_rate": 3.471051066897562e-07, |
| "loss": 0.0509, |
| "reward": -0.1582264108583331, |
| "reward_after_mean": -0.1582264108583331, |
| "reward_after_std": 0.5790915079414845, |
| "reward_before_mean": 0.13680532574653625, |
| "reward_before_std": 0.5264813583344221, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.29503174126148224, |
| "reward_change_min": -0.4708465002477169, |
| "reward_change_std": 0.1767220702022314, |
| "reward_std": 0.5790915191173553, |
| "rewards/accuracy_reward": 0.1875000074505806, |
| "rewards/cosine_scaled_reward": -0.050694676116108894, |
| "step": 343 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2360.875045776367, |
| "epoch": 0.3931428571428571, |
| "grad_norm": 0.07271434366703033, |
| "kl": 1.041218638420105e-05, |
| "lambda_div_used": 0.5770246461033821, |
| "learning_rate": 3.4430593282358777e-07, |
| "loss": -0.0008, |
| "reward": 0.04454847797751427, |
| "reward_after_mean": 0.04454847797751427, |
| "reward_after_std": 0.5018230397254229, |
| "reward_before_mean": 0.5492894258350134, |
| "reward_before_std": 0.35145203582942486, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5047409404069185, |
| "reward_change_min": -0.7455123476684093, |
| "reward_change_std": 0.2790954224765301, |
| "reward_std": 0.5018230620771646, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/cosine_scaled_reward": 0.1117894072085619, |
| "step": 344 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3030.0833740234375, |
| "epoch": 0.3942857142857143, |
| "grad_norm": 0.0703606829047203, |
| "kl": 3.1925737857818604e-05, |
| "lambda_div_used": 0.5804503262042999, |
| "learning_rate": 3.4151678419606233e-07, |
| "loss": 0.0646, |
| "reward": -0.22463034093379974, |
| "reward_after_mean": -0.22463034093379974, |
| "reward_after_std": 0.41369511373341084, |
| "reward_before_mean": 0.09700941108167171, |
| "reward_before_std": 0.37086532823741436, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.32163975574076176, |
| "reward_change_min": -0.5076877251267433, |
| "reward_change_std": 0.1956562791019678, |
| "reward_std": 0.4136951379477978, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.06965726800262928, |
| "step": 345 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3186.6458740234375, |
| "epoch": 0.3954285714285714, |
| "grad_norm": 0.05292130261659622, |
| "kl": 1.4215707778930664e-05, |
| "lambda_div_used": 0.5958857163786888, |
| "learning_rate": 3.387377967463493e-07, |
| "loss": 0.0331, |
| "reward": -0.24454469978809357, |
| "reward_after_mean": -0.24454469978809357, |
| "reward_after_std": 0.48920249938964844, |
| "reward_before_mean": 0.04633236164227128, |
| "reward_before_std": 0.44041235372424126, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.29087705351412296, |
| "reward_change_min": -0.4697858430445194, |
| "reward_change_std": 0.179568306542933, |
| "reward_std": 0.4892025087028742, |
| "rewards/accuracy_reward": 0.16666667349636555, |
| "rewards/cosine_scaled_reward": -0.12033431546296924, |
| "step": 346 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3055.4583587646484, |
| "epoch": 0.3965714285714286, |
| "grad_norm": 0.07354738563299179, |
| "kl": 6.0675665736198425e-06, |
| "lambda_div_used": 0.5582710355520248, |
| "learning_rate": 3.359691059183761e-07, |
| "loss": 0.0132, |
| "reward": -0.42621116526424885, |
| "reward_after_mean": -0.42621116526424885, |
| "reward_after_std": 0.3322943150997162, |
| "reward_before_mean": -0.15516536496579647, |
| "reward_before_std": 0.2633455842733383, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2710457965731621, |
| "reward_change_min": -0.4307188205420971, |
| "reward_change_std": 0.15532648842781782, |
| "reward_std": 0.33229432441294193, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.19683203659951687, |
| "step": 347 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2852.4583435058594, |
| "epoch": 0.3977142857142857, |
| "grad_norm": 0.09605983644723892, |
| "kl": 3.167241811752319e-05, |
| "lambda_div_used": 0.5788434967398643, |
| "learning_rate": 3.3321084665422803e-07, |
| "loss": -0.0017, |
| "reward": -0.2087371125817299, |
| "reward_after_mean": -0.2087371125817299, |
| "reward_after_std": 0.40125221759080887, |
| "reward_before_mean": 0.12491290923207998, |
| "reward_before_std": 0.36025606002658606, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3336500097066164, |
| "reward_change_min": -0.5146553069353104, |
| "reward_change_std": 0.20004253275692463, |
| "reward_std": 0.40125224366784096, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.04175376985222101, |
| "step": 348 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2964.875015258789, |
| "epoch": 0.39885714285714285, |
| "grad_norm": 0.06869405508041382, |
| "kl": 1.731887459754944e-05, |
| "lambda_div_used": 0.5606642514467239, |
| "learning_rate": 3.3046315338757026e-07, |
| "loss": 0.0031, |
| "reward": -0.2807905152440071, |
| "reward_after_mean": -0.2807905152440071, |
| "reward_after_std": 0.39816635474562645, |
| "reward_before_mean": 0.08202904835343361, |
| "reward_before_std": 0.2763519547879696, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.36281956918537617, |
| "reward_change_min": -0.54488330706954, |
| "reward_change_std": 0.20150058157742023, |
| "reward_std": 0.39816636219620705, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.0846376121044159, |
| "step": 349 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2521.9791870117188, |
| "epoch": 0.4, |
| "grad_norm": 0.10303740203380585, |
| "kl": 2.6671215891838074e-05, |
| "lambda_div_used": 0.6121688261628151, |
| "learning_rate": 3.2772616003709616e-07, |
| "loss": -0.0711, |
| "reward": -0.04820730444043875, |
| "reward_after_mean": -0.04820730444043875, |
| "reward_after_std": 0.5604431573301554, |
| "reward_before_mean": 0.287683189380914, |
| "reward_before_std": 0.5214535812847316, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3358904607594013, |
| "reward_change_min": -0.5252225995063782, |
| "reward_change_std": 0.21241459622979164, |
| "reward_std": 0.5604431666433811, |
| "rewards/accuracy_reward": 0.27083334140479565, |
| "rewards/cosine_scaled_reward": 0.016849845182150602, |
| "step": 350 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3268.250030517578, |
| "epoch": 0.40114285714285713, |
| "grad_norm": 0.04833042621612549, |
| "kl": 2.4568289518356323e-05, |
| "lambda_div_used": 0.5601532310247421, |
| "learning_rate": 3.250000000000001e-07, |
| "loss": 0.0519, |
| "reward": -0.250777292996645, |
| "reward_after_mean": -0.250777292996645, |
| "reward_after_std": 0.38171103224158287, |
| "reward_before_mean": 0.1175131555646658, |
| "reward_before_std": 0.2718004435300827, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.36829047463834286, |
| "reward_change_min": -0.5422449931502342, |
| "reward_change_std": 0.20415859669446945, |
| "reward_std": 0.3817110415548086, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/cosine_scaled_reward": -0.06998682580888271, |
| "step": 351 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2589.333366394043, |
| "epoch": 0.4022857142857143, |
| "grad_norm": 0.06552080065011978, |
| "kl": 1.905485987663269e-05, |
| "lambda_div_used": 0.6104472130537033, |
| "learning_rate": 3.222848061454764e-07, |
| "loss": 0.0025, |
| "reward": -0.09347447147592902, |
| "reward_after_mean": -0.09347447147592902, |
| "reward_after_std": 0.6156985405832529, |
| "reward_before_mean": 0.26158976616716245, |
| "reward_before_std": 0.5061299707740545, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3550642393529415, |
| "reward_change_min": -0.5232127867639065, |
| "reward_change_std": 0.1957317991182208, |
| "reward_std": 0.6156985703855753, |
| "rewards/accuracy_reward": 0.25000000186264515, |
| "rewards/cosine_scaled_reward": 0.01158975763246417, |
| "step": 352 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2533.7916946411133, |
| "epoch": 0.4034285714285714, |
| "grad_norm": 0.07949981093406677, |
| "kl": 4.393863491714001e-05, |
| "lambda_div_used": 0.6204324588179588, |
| "learning_rate": 3.195807108082429e-07, |
| "loss": 0.0456, |
| "reward": -0.12379613053053617, |
| "reward_after_mean": -0.12379613053053617, |
| "reward_after_std": 0.6454745382070541, |
| "reward_before_mean": 0.203694608528167, |
| "reward_before_std": 0.5591385969892144, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.327490733936429, |
| "reward_change_min": -0.5519858598709106, |
| "reward_change_std": 0.1984806014224887, |
| "reward_std": 0.6454745400696993, |
| "rewards/accuracy_reward": 0.2500000037252903, |
| "rewards/cosine_scaled_reward": -0.0463053984567523, |
| "step": 353 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1914.562515258789, |
| "epoch": 0.4045714285714286, |
| "grad_norm": 0.07853464037179947, |
| "kl": 2.3346394300460815e-05, |
| "lambda_div_used": 0.5937526449561119, |
| "learning_rate": 3.168878457820915e-07, |
| "loss": -0.0345, |
| "reward": 0.036306386813521385, |
| "reward_after_mean": 0.036306386813521385, |
| "reward_after_std": 0.5187021996825933, |
| "reward_before_mean": 0.4854283039458096, |
| "reward_before_std": 0.42800275422632694, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4491219110786915, |
| "reward_change_min": -0.637521255761385, |
| "reward_change_std": 0.252533134073019, |
| "reward_std": 0.5187022183090448, |
| "rewards/accuracy_reward": 0.33333334140479565, |
| "rewards/cosine_scaled_reward": 0.15209495089948177, |
| "step": 354 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2335.8958740234375, |
| "epoch": 0.4057142857142857, |
| "grad_norm": 0.09039249271154404, |
| "kl": 2.7535483241081238e-05, |
| "lambda_div_used": 0.6392721608281136, |
| "learning_rate": 3.142063423134644e-07, |
| "loss": 0.0158, |
| "reward": 0.017912205308675766, |
| "reward_after_mean": 0.017912205308675766, |
| "reward_after_std": 0.6705970745533705, |
| "reward_before_mean": 0.355070261284709, |
| "reward_before_std": 0.6396346259862185, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.33715808019042015, |
| "reward_change_min": -0.5581717267632484, |
| "reward_change_std": 0.21758130192756653, |
| "reward_std": 0.6705970987677574, |
| "rewards/accuracy_reward": 0.354166679084301, |
| "rewards/cosine_scaled_reward": 0.0009035973343998194, |
| "step": 355 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2585.4791870117188, |
| "epoch": 0.40685714285714286, |
| "grad_norm": 0.06332956254482269, |
| "kl": 1.6089528799057007e-05, |
| "lambda_div_used": 0.6472217217087746, |
| "learning_rate": 3.115363310950578e-07, |
| "loss": -0.0329, |
| "reward": -0.00402236171066761, |
| "reward_after_mean": -0.00402236171066761, |
| "reward_after_std": 0.6937260664999485, |
| "reward_before_mean": 0.2978215580806136, |
| "reward_before_std": 0.6849680617451668, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3018439058214426, |
| "reward_change_min": -0.4991762898862362, |
| "reward_change_std": 0.2024542335420847, |
| "reward_std": 0.6937260907143354, |
| "rewards/accuracy_reward": 0.2916666753590107, |
| "rewards/cosine_scaled_reward": 0.00615486316382885, |
| "step": 356 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3264.8125610351562, |
| "epoch": 0.408, |
| "grad_norm": 0.057182345539331436, |
| "kl": 2.91336327791214e-05, |
| "lambda_div_used": 0.5862653851509094, |
| "learning_rate": 3.0887794225945143e-07, |
| "loss": 0.0104, |
| "reward": -0.31888771802186966, |
| "reward_after_mean": -0.31888771802186966, |
| "reward_after_std": 0.43322005309164524, |
| "reward_before_mean": -0.049007685855031013, |
| "reward_before_std": 0.39840223640203476, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2698800265789032, |
| "reward_change_min": -0.48054200410842896, |
| "reward_change_std": 0.1758509548380971, |
| "reward_std": 0.43322005309164524, |
| "rewards/accuracy_reward": 0.1041666679084301, |
| "rewards/cosine_scaled_reward": -0.1531743509694934, |
| "step": 357 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3050.2500534057617, |
| "epoch": 0.40914285714285714, |
| "grad_norm": 0.05330098420381546, |
| "kl": 2.4370267055928707e-05, |
| "lambda_div_used": 0.6431510671973228, |
| "learning_rate": 3.062313053727671e-07, |
| "loss": 0.003, |
| "reward": -0.07201657444238663, |
| "reward_after_mean": -0.07201657444238663, |
| "reward_after_std": 0.688179362565279, |
| "reward_before_mean": 0.2027557131368667, |
| "reward_before_std": 0.6673171781003475, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27477228827774525, |
| "reward_change_min": -0.4944618083536625, |
| "reward_change_std": 0.18934499006718397, |
| "reward_std": 0.6881793700158596, |
| "rewards/accuracy_reward": 0.2291666716337204, |
| "rewards/cosine_scaled_reward": -0.026410957798361778, |
| "step": 358 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2671.937530517578, |
| "epoch": 0.4102857142857143, |
| "grad_norm": 0.07592163234949112, |
| "kl": 3.884732723236084e-05, |
| "lambda_div_used": 0.5795001536607742, |
| "learning_rate": 3.0359654942835247e-07, |
| "loss": 0.0464, |
| "reward": -0.2641681991517544, |
| "reward_after_mean": -0.2641681991517544, |
| "reward_after_std": 0.4112328961491585, |
| "reward_before_mean": 0.03885669261217117, |
| "reward_before_std": 0.36149344593286514, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.30302487686276436, |
| "reward_change_min": -0.4962599165737629, |
| "reward_change_std": 0.18252001702785492, |
| "reward_std": 0.4112329035997391, |
| "rewards/accuracy_reward": 0.1458333395421505, |
| "rewards/cosine_scaled_reward": -0.10697665251791477, |
| "step": 359 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2798.937530517578, |
| "epoch": 0.4114285714285714, |
| "grad_norm": 0.06665439158678055, |
| "kl": 3.1637027859687805e-05, |
| "lambda_div_used": 0.6062901616096497, |
| "learning_rate": 3.0097380284049523e-07, |
| "loss": -0.0029, |
| "reward": -0.0033570118248462677, |
| "reward_after_mean": -0.0033570118248462677, |
| "reward_after_std": 0.5593161657452583, |
| "reward_before_mean": 0.4031880460679531, |
| "reward_before_std": 0.4958424214273691, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.40654509887099266, |
| "reward_change_min": -0.6473257802426815, |
| "reward_change_std": 0.25588439870625734, |
| "reward_std": 0.5593161787837744, |
| "rewards/accuracy_reward": 0.3333333358168602, |
| "rewards/cosine_scaled_reward": 0.0698547288775444, |
| "step": 360 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3150.625030517578, |
| "epoch": 0.4125714285714286, |
| "grad_norm": 0.06971946358680725, |
| "kl": 2.9014074243605137e-05, |
| "lambda_div_used": 0.6219506114721298, |
| "learning_rate": 2.9836319343816397e-07, |
| "loss": 0.0048, |
| "reward": -0.11981392558664083, |
| "reward_after_mean": -0.11981392558664083, |
| "reward_after_std": 0.5868565142154694, |
| "reward_before_mean": 0.17182117886841297, |
| "reward_before_std": 0.564973471686244, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.29163510724902153, |
| "reward_change_min": -0.5129627995193005, |
| "reward_change_std": 0.19524423126131296, |
| "reward_std": 0.5868565402925014, |
| "rewards/accuracy_reward": 0.22916666977107525, |
| "rewards/cosine_scaled_reward": -0.057345494627952576, |
| "step": 361 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1471.916706085205, |
| "epoch": 0.4137142857142857, |
| "grad_norm": 0.12450961768627167, |
| "kl": 2.4568289518356323e-05, |
| "lambda_div_used": 0.571638435125351, |
| "learning_rate": 2.9576484845877793e-07, |
| "loss": -0.0303, |
| "reward": 0.013313630130141973, |
| "reward_after_mean": 0.013313630130141973, |
| "reward_after_std": 0.4897063076496124, |
| "reward_before_mean": 0.505017813295126, |
| "reward_before_std": 0.32288316125050187, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4917041715234518, |
| "reward_change_min": -0.6615025624632835, |
| "reward_change_std": 0.25815817900002, |
| "reward_std": 0.4897063188254833, |
| "rewards/accuracy_reward": 0.37500000558793545, |
| "rewards/cosine_scaled_reward": 0.1300177900120616, |
| "step": 362 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2174.270881652832, |
| "epoch": 0.41485714285714287, |
| "grad_norm": 0.08419760316610336, |
| "kl": 4.357472062110901e-05, |
| "lambda_div_used": 0.6037088930606842, |
| "learning_rate": 2.931788945420058e-07, |
| "loss": 0.0176, |
| "reward": -0.09517315030097961, |
| "reward_after_mean": -0.09517315030097961, |
| "reward_after_std": 0.57970448769629, |
| "reward_before_mean": 0.282845395617187, |
| "reward_before_std": 0.4830307289958, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.37801856361329556, |
| "reward_change_min": -0.5875305943191051, |
| "reward_change_std": 0.22446841653436422, |
| "reward_std": 0.57970448769629, |
| "rewards/accuracy_reward": 0.25000000186264515, |
| "rewards/cosine_scaled_reward": 0.03284538397565484, |
| "step": 363 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2614.9167098999023, |
| "epoch": 0.416, |
| "grad_norm": 0.07822614908218384, |
| "kl": 2.289540134370327e-05, |
| "lambda_div_used": 0.5602610111236572, |
| "learning_rate": 2.9060545772359305e-07, |
| "loss": 0.0079, |
| "reward": -0.4077296704053879, |
| "reward_after_mean": -0.4077296704053879, |
| "reward_after_std": 0.3345623780041933, |
| "reward_before_mean": -0.1330121699720621, |
| "reward_before_std": 0.27498441375792027, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2747174873948097, |
| "reward_change_min": -0.4260003827512264, |
| "reward_change_std": 0.16245513781905174, |
| "reward_std": 0.3345623817294836, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/cosine_scaled_reward": -0.195512181147933, |
| "step": 364 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3003.604202270508, |
| "epoch": 0.41714285714285715, |
| "grad_norm": 0.04828161001205444, |
| "kl": 6.87967985868454e-06, |
| "lambda_div_used": 0.6036554351449013, |
| "learning_rate": 2.8804466342921987e-07, |
| "loss": 0.0555, |
| "reward": -0.31679424038156867, |
| "reward_after_mean": -0.31679424038156867, |
| "reward_after_std": 0.5247047282755375, |
| "reward_before_mean": -0.07733823172748089, |
| "reward_before_std": 0.4768796032294631, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.23945600911974907, |
| "reward_change_min": -0.42952996119856834, |
| "reward_change_std": 0.15284609980881214, |
| "reward_std": 0.5247047450393438, |
| "rewards/accuracy_reward": 0.10416666977107525, |
| "rewards/cosine_scaled_reward": -0.18150490219704807, |
| "step": 365 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1893.00004196167, |
| "epoch": 0.41828571428571426, |
| "grad_norm": 0.08456390351057053, |
| "kl": 1.6473233699798584e-05, |
| "lambda_div_used": 0.6091032102704048, |
| "learning_rate": 2.854966364683872e-07, |
| "loss": -0.0195, |
| "reward": 0.005720527842640877, |
| "reward_after_mean": 0.005720527842640877, |
| "reward_after_std": 0.6514943428337574, |
| "reward_before_mean": 0.43899719044566154, |
| "reward_before_std": 0.49832448456436396, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4332766607403755, |
| "reward_change_min": -0.6058122254908085, |
| "reward_change_std": 0.23482245951890945, |
| "reward_std": 0.6514943763613701, |
| "rewards/accuracy_reward": 0.39583333767950535, |
| "rewards/cosine_scaled_reward": 0.043163834139704704, |
| "step": 366 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2780.937515258789, |
| "epoch": 0.41942857142857143, |
| "grad_norm": 0.06994574517011642, |
| "kl": 2.946704626083374e-05, |
| "lambda_div_used": 0.6079575195908546, |
| "learning_rate": 2.829615010283344e-07, |
| "loss": 0.0087, |
| "reward": -0.16347008850425482, |
| "reward_after_mean": -0.16347008850425482, |
| "reward_after_std": 0.5466162711381912, |
| "reward_before_mean": 0.1403359491378069, |
| "reward_before_std": 0.49341694079339504, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.30380604416131973, |
| "reward_change_min": -0.4883100688457489, |
| "reward_change_std": 0.18378268275409937, |
| "reward_std": 0.5466162823140621, |
| "rewards/accuracy_reward": 0.1875000074505806, |
| "rewards/cosine_scaled_reward": -0.0471640374744311, |
| "step": 367 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2967.041679382324, |
| "epoch": 0.4205714285714286, |
| "grad_norm": 0.07414139062166214, |
| "kl": 4.155188798904419e-05, |
| "lambda_div_used": 0.5803077146410942, |
| "learning_rate": 2.8043938066798645e-07, |
| "loss": 0.0504, |
| "reward": -0.27402403950691223, |
| "reward_after_mean": -0.27402403950691223, |
| "reward_after_std": 0.4059589561074972, |
| "reward_before_mean": 0.008191026747226715, |
| "reward_before_std": 0.371637674048543, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28221505135297775, |
| "reward_change_min": -0.44202784821391106, |
| "reward_change_std": 0.17603088915348053, |
| "reward_std": 0.40595896542072296, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.15847563743591309, |
| "step": 368 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2863.1667404174805, |
| "epoch": 0.4217142857142857, |
| "grad_norm": 0.07171155512332916, |
| "kl": 2.0891427993774414e-05, |
| "lambda_div_used": 0.6472674459218979, |
| "learning_rate": 2.7793039831193133e-07, |
| "loss": -0.0044, |
| "reward": -0.04304695222526789, |
| "reward_after_mean": -0.04304695222526789, |
| "reward_after_std": 0.687617726624012, |
| "reward_before_mean": 0.25146727729588747, |
| "reward_before_std": 0.6888436311855912, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.29451421834528446, |
| "reward_change_min": -0.6169496960937977, |
| "reward_change_std": 0.22693553566932678, |
| "reward_std": 0.6876177359372377, |
| "rewards/accuracy_reward": 0.2500000037252903, |
| "rewards/cosine_scaled_reward": 0.0014672745019197464, |
| "step": 369 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3261.2291870117188, |
| "epoch": 0.4228571428571429, |
| "grad_norm": 0.058094967156648636, |
| "kl": 2.1554529666900635e-05, |
| "lambda_div_used": 0.5657742545008659, |
| "learning_rate": 2.7543467624442956e-07, |
| "loss": 0.0031, |
| "reward": -0.2483972143381834, |
| "reward_after_mean": -0.2483972143381834, |
| "reward_after_std": 0.4499538466334343, |
| "reward_before_mean": 0.13386818021535873, |
| "reward_before_std": 0.29791492875665426, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.38226539455354214, |
| "reward_change_min": -0.51585279032588, |
| "reward_change_std": 0.19617348536849022, |
| "reward_std": 0.4499538540840149, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.03279848676174879, |
| "step": 370 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1885.645866394043, |
| "epoch": 0.424, |
| "grad_norm": 0.12381042540073395, |
| "kl": 1.5240773791447282e-05, |
| "lambda_div_used": 0.5325883999466896, |
| "learning_rate": 2.729523361034538e-07, |
| "loss": -0.0304, |
| "reward": -0.14332129061222076, |
| "reward_after_mean": -0.14332129061222076, |
| "reward_after_std": 0.3713335506618023, |
| "reward_before_mean": 0.3775896169245243, |
| "reward_before_std": 0.14313361048698425, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5209108907729387, |
| "reward_change_min": -0.6799871250987053, |
| "reward_change_std": 0.26304140500724316, |
| "reward_std": 0.3713335543870926, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/cosine_scaled_reward": 0.0025896020233631134, |
| "step": 371 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3061.6250228881836, |
| "epoch": 0.42514285714285716, |
| "grad_norm": 0.0565616711974144, |
| "kl": 1.3771001249551773e-05, |
| "lambda_div_used": 0.6153379678726196, |
| "learning_rate": 2.7048349887476037e-07, |
| "loss": 0.0019, |
| "reward": 0.08842738252133131, |
| "reward_after_mean": 0.08842738252133131, |
| "reward_after_std": 0.6899000462144613, |
| "reward_before_mean": 0.5513297475408763, |
| "reward_before_std": 0.5352799613028765, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.46290238574147224, |
| "reward_change_min": -0.6905807815492153, |
| "reward_change_std": 0.26435263454914093, |
| "reward_std": 0.6899000480771065, |
| "rewards/accuracy_reward": 0.3958333358168602, |
| "rewards/cosine_scaled_reward": 0.15549640776589513, |
| "step": 372 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1999.791690826416, |
| "epoch": 0.42628571428571427, |
| "grad_norm": 0.11289634555578232, |
| "kl": 2.6244670152664185e-05, |
| "lambda_div_used": 0.5901408270001411, |
| "learning_rate": 2.6802828488599294e-07, |
| "loss": -0.0028, |
| "reward": -0.25084885116666555, |
| "reward_after_mean": -0.25084885116666555, |
| "reward_after_std": 0.47954990342259407, |
| "reward_before_mean": 0.03629123326390982, |
| "reward_before_std": 0.41134614683687687, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28714008443057537, |
| "reward_change_min": -0.4200817756354809, |
| "reward_change_std": 0.1626526527106762, |
| "reward_std": 0.47954992204904556, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/cosine_scaled_reward": -0.08870877418667078, |
| "step": 373 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2700.791717529297, |
| "epoch": 0.42742857142857144, |
| "grad_norm": 0.0688440129160881, |
| "kl": 2.674013376235962e-05, |
| "lambda_div_used": 0.6036395579576492, |
| "learning_rate": 2.655868138008171e-07, |
| "loss": -0.0525, |
| "reward": -0.17511339485645294, |
| "reward_after_mean": -0.17511339485645294, |
| "reward_after_std": 0.5073369853198528, |
| "reward_before_mean": 0.13348117470741272, |
| "reward_before_std": 0.48259105905890465, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.30859458073973656, |
| "reward_change_min": -0.5309739634394646, |
| "reward_change_std": 0.20630291104316711, |
| "reward_std": 0.5073370076715946, |
| "rewards/accuracy_reward": 0.2083333395421505, |
| "rewards/cosine_scaled_reward": -0.07485215552151203, |
| "step": 374 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3033.791702270508, |
| "epoch": 0.42857142857142855, |
| "grad_norm": 0.05866050720214844, |
| "kl": 1.8252991139888763e-05, |
| "lambda_div_used": 0.6017400398850441, |
| "learning_rate": 2.631592046130896e-07, |
| "loss": 0.0477, |
| "reward": -0.18571221362799406, |
| "reward_after_mean": -0.18571221362799406, |
| "reward_after_std": 0.5023255608975887, |
| "reward_before_mean": 0.11062880232930183, |
| "reward_before_std": 0.46861019916832447, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2963410019874573, |
| "reward_change_min": -0.45132842287421227, |
| "reward_change_std": 0.1817708509042859, |
| "reward_std": 0.5023255832493305, |
| "rewards/accuracy_reward": 0.20833334140479565, |
| "rewards/cosine_scaled_reward": -0.097704553976655, |
| "step": 375 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2246.770851135254, |
| "epoch": 0.4297142857142857, |
| "grad_norm": 0.08337263762950897, |
| "kl": 8.605420589447021e-06, |
| "lambda_div_used": 0.5853807479143143, |
| "learning_rate": 2.6074557564105724e-07, |
| "loss": 0.0128, |
| "reward": -0.3334618601948023, |
| "reward_after_mean": -0.3334618601948023, |
| "reward_after_std": 0.4486316703259945, |
| "reward_before_mean": -0.05933803477091715, |
| "reward_before_std": 0.3892780668102205, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2741238009184599, |
| "reward_change_min": -0.46821001917123795, |
| "reward_change_std": 0.1663337228819728, |
| "reward_std": 0.44863167591392994, |
| "rewards/accuracy_reward": 0.10416666977107525, |
| "rewards/cosine_scaled_reward": -0.163504708558321, |
| "step": 376 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3488.3541870117188, |
| "epoch": 0.4308571428571429, |
| "grad_norm": 0.055966660380363464, |
| "kl": 2.5756657123565674e-05, |
| "lambda_div_used": 0.5742340236902237, |
| "learning_rate": 2.583460445215911e-07, |
| "loss": -0.0032, |
| "reward": -0.3738445993512869, |
| "reward_after_mean": -0.3738445993512869, |
| "reward_after_std": 0.39967909548431635, |
| "reward_before_mean": -0.10715527582215145, |
| "reward_before_std": 0.33878533728420734, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.26668932288885117, |
| "reward_change_min": -0.4109174869954586, |
| "reward_change_std": 0.15686567965894938, |
| "reward_std": 0.39967911317944527, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/cosine_scaled_reward": -0.19048861227929592, |
| "step": 377 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2134.3333740234375, |
| "epoch": 0.432, |
| "grad_norm": 0.13181807100772858, |
| "kl": 2.8654932975769043e-05, |
| "lambda_div_used": 0.6004306748509407, |
| "learning_rate": 2.5596072820445254e-07, |
| "loss": -0.0331, |
| "reward": -0.06989812850952148, |
| "reward_after_mean": -0.06989812850952148, |
| "reward_after_std": 0.5459046512842178, |
| "reward_before_mean": 0.31881395215168595, |
| "reward_before_std": 0.46263560838997364, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3887120857834816, |
| "reward_change_min": -0.5917952992022038, |
| "reward_change_std": 0.23005317710340023, |
| "reward_std": 0.5459046605974436, |
| "rewards/accuracy_reward": 0.29166666977107525, |
| "rewards/cosine_scaled_reward": 0.027147281914949417, |
| "step": 378 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3121.0416870117188, |
| "epoch": 0.43314285714285716, |
| "grad_norm": 0.07427296042442322, |
| "kl": 1.5733763575553894e-05, |
| "lambda_div_used": 0.5833418369293213, |
| "learning_rate": 2.5358974294659373e-07, |
| "loss": -0.0048, |
| "reward": -0.29101302847266197, |
| "reward_after_mean": -0.29101302847266197, |
| "reward_after_std": 0.4290795102715492, |
| "reward_before_mean": -0.009382706135511398, |
| "reward_before_std": 0.38597164303064346, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2816303465515375, |
| "reward_change_min": -0.47839244455099106, |
| "reward_change_std": 0.17911072820425034, |
| "reward_std": 0.4290795363485813, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/cosine_scaled_reward": -0.13438269309699535, |
| "step": 379 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2823.416702270508, |
| "epoch": 0.4342857142857143, |
| "grad_norm": 0.06611248105764389, |
| "kl": 2.2601336240768433e-05, |
| "lambda_div_used": 0.6066281795501709, |
| "learning_rate": 2.512332043064913e-07, |
| "loss": 0.0533, |
| "reward": -0.17613293696194887, |
| "reward_after_mean": -0.17613293696194887, |
| "reward_after_std": 0.495351817458868, |
| "reward_before_mean": 0.1346770692616701, |
| "reward_before_std": 0.48867712169885635, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.31081000342965126, |
| "reward_change_min": -0.5689719989895821, |
| "reward_change_std": 0.21053248085081577, |
| "reward_std": 0.4953518286347389, |
| "rewards/accuracy_reward": 0.1875000074505806, |
| "rewards/cosine_scaled_reward": -0.05282294252538122, |
| "step": 380 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2983.7916984558105, |
| "epoch": 0.43542857142857144, |
| "grad_norm": 0.08519254624843597, |
| "kl": 3.2504089176654816e-05, |
| "lambda_div_used": 0.6109358817338943, |
| "learning_rate": 2.488912271385139e-07, |
| "loss": 0.0165, |
| "reward": -0.2218069350346923, |
| "reward_after_mean": -0.2218069350346923, |
| "reward_after_std": 0.5642109382897615, |
| "reward_before_mean": 0.040106164291501045, |
| "reward_before_std": 0.5099438140168786, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.261913089081645, |
| "reward_change_min": -0.45296039059758186, |
| "reward_change_std": 0.1642473293468356, |
| "reward_std": 0.5642109606415033, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.10572716826573014, |
| "step": 381 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2500.437545776367, |
| "epoch": 0.43657142857142855, |
| "grad_norm": 0.07325682789087296, |
| "kl": 3.65767627954483e-05, |
| "lambda_div_used": 0.5600408837199211, |
| "learning_rate": 2.465639255873246e-07, |
| "loss": -0.0096, |
| "reward": -0.4078812226653099, |
| "reward_after_mean": -0.4078812226653099, |
| "reward_after_std": 0.3340430334210396, |
| "reward_before_mean": -0.13890772312879562, |
| "reward_before_std": 0.27443209011107683, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.26897350139915943, |
| "reward_change_min": -0.42554087564349174, |
| "reward_change_std": 0.15754834469407797, |
| "reward_std": 0.3340430427342653, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/cosine_scaled_reward": -0.20140772499144077, |
| "step": 382 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2824.291717529297, |
| "epoch": 0.4377142857142857, |
| "grad_norm": 0.08796168118715286, |
| "kl": 4.8138201236724854e-05, |
| "lambda_div_used": 0.5781937688589096, |
| "learning_rate": 2.4425141308231765e-07, |
| "loss": -0.0086, |
| "reward": -0.16291768848896027, |
| "reward_after_mean": -0.16291768848896027, |
| "reward_after_std": 0.387674568220973, |
| "reward_before_mean": 0.1924248207360506, |
| "reward_before_std": 0.35597400926053524, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3553424943238497, |
| "reward_change_min": -0.5255323797464371, |
| "reward_change_std": 0.21135967783629894, |
| "reward_std": 0.3876745719462633, |
| "rewards/accuracy_reward": 0.2083333432674408, |
| "rewards/cosine_scaled_reward": -0.01590852066874504, |
| "step": 383 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2347.6042404174805, |
| "epoch": 0.43885714285714283, |
| "grad_norm": 0.09212023764848709, |
| "kl": 2.508983016014099e-05, |
| "lambda_div_used": 0.6615950018167496, |
| "learning_rate": 2.4195380233209006e-07, |
| "loss": 0.0408, |
| "reward": 0.27819300815463066, |
| "reward_after_mean": 0.27819300815463066, |
| "reward_after_std": 0.7268820777535439, |
| "reward_before_mean": 0.6785084716975689, |
| "reward_before_std": 0.7540020374581218, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4003154691308737, |
| "reward_change_min": -0.6902611702680588, |
| "reward_change_std": 0.283959056250751, |
| "reward_std": 0.7268821019679308, |
| "rewards/accuracy_reward": 0.4583333432674408, |
| "rewards/cosine_scaled_reward": 0.22017512656748295, |
| "step": 384 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2711.958396911621, |
| "epoch": 0.44, |
| "grad_norm": 0.07038652151823044, |
| "kl": 1.942366361618042e-05, |
| "lambda_div_used": 0.6165113672614098, |
| "learning_rate": 2.3967120531894857e-07, |
| "loss": 0.012, |
| "reward": -0.25626325886696577, |
| "reward_after_mean": -0.25626325886696577, |
| "reward_after_std": 0.5892766248434782, |
| "reward_before_mean": -0.0074533987790346146, |
| "reward_before_std": 0.5360177559778094, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.24880987033247948, |
| "reward_change_min": -0.44854912906885147, |
| "reward_change_std": 0.1595793990418315, |
| "reward_std": 0.5892766322940588, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/cosine_scaled_reward": -0.13245339877903461, |
| "step": 385 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2889.1042404174805, |
| "epoch": 0.44114285714285717, |
| "grad_norm": 0.060970280319452286, |
| "kl": 1.8231570720672607e-05, |
| "lambda_div_used": 0.5940364003181458, |
| "learning_rate": 2.374037332934512e-07, |
| "loss": -0.0287, |
| "reward": -0.24576714914292097, |
| "reward_after_mean": -0.24576714914292097, |
| "reward_after_std": 0.4883785657584667, |
| "reward_before_mean": 0.04634229093790054, |
| "reward_before_std": 0.4279829766601324, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.29210945032536983, |
| "reward_change_min": -0.4432514049112797, |
| "reward_change_std": 0.17009661067277193, |
| "reward_std": 0.48837856762111187, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/cosine_scaled_reward": -0.07865770626813173, |
| "step": 386 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2997.3125, |
| "epoch": 0.4422857142857143, |
| "grad_norm": 0.07781542837619781, |
| "kl": 1.3821758329868317e-05, |
| "lambda_div_used": 0.5551474094390869, |
| "learning_rate": 2.3515149676898552e-07, |
| "loss": -0.0529, |
| "reward": -0.38486793637275696, |
| "reward_after_mean": -0.38486793637275696, |
| "reward_after_std": 0.31045267172157764, |
| "reward_before_mean": -0.10442159324884415, |
| "reward_before_std": 0.24840335873886943, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28044634498655796, |
| "reward_change_min": -0.4241391494870186, |
| "reward_change_std": 0.15805031638592482, |
| "reward_std": 0.31045267917215824, |
| "rewards/accuracy_reward": 0.1041666716337204, |
| "rewards/cosine_scaled_reward": -0.20858826488256454, |
| "step": 387 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2572.2708587646484, |
| "epoch": 0.44342857142857145, |
| "grad_norm": 0.06563723832368851, |
| "kl": 2.9848888516426086e-06, |
| "lambda_div_used": 0.5865650475025177, |
| "learning_rate": 2.3291460551638237e-07, |
| "loss": 0.0407, |
| "reward": -0.15434425324201584, |
| "reward_after_mean": -0.15434425324201584, |
| "reward_after_std": 0.47601727209985256, |
| "reward_before_mean": 0.20021704956889153, |
| "reward_before_std": 0.39916726760566235, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.354561323300004, |
| "reward_change_min": -0.5467559210956097, |
| "reward_change_std": 0.21171134896576405, |
| "reward_std": 0.47601727209985256, |
| "rewards/accuracy_reward": 0.2708333358168602, |
| "rewards/cosine_scaled_reward": -0.07061627879738808, |
| "step": 388 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2650.937545776367, |
| "epoch": 0.44457142857142856, |
| "grad_norm": 0.07840488851070404, |
| "kl": 1.73286534845829e-05, |
| "lambda_div_used": 0.6045578718185425, |
| "learning_rate": 2.306931685585657e-07, |
| "loss": -0.0394, |
| "reward": -0.26122746989130974, |
| "reward_after_mean": -0.26122746989130974, |
| "reward_after_std": 0.5210577324032784, |
| "reward_before_mean": 0.002998221665620804, |
| "reward_before_std": 0.486898148432374, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.26422569528222084, |
| "reward_change_min": -0.4951511509716511, |
| "reward_change_std": 0.18348107766360044, |
| "reward_std": 0.5210577566176653, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.1428351059439592, |
| "step": 389 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3007.666679382324, |
| "epoch": 0.44571428571428573, |
| "grad_norm": 0.07837636768817902, |
| "kl": 1.6082078218460083e-05, |
| "lambda_div_used": 0.61600511521101, |
| "learning_rate": 2.2848729416523859e-07, |
| "loss": 0.0713, |
| "reward": -0.18647840432822704, |
| "reward_after_mean": -0.18647840432822704, |
| "reward_after_std": 0.5758322961628437, |
| "reward_before_mean": 0.08954079262912273, |
| "reward_before_std": 0.5347468825057149, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27601918019354343, |
| "reward_change_min": -0.47189946845173836, |
| "reward_change_std": 0.17452176753431559, |
| "reward_std": 0.5758323054760695, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.07712590089067817, |
| "step": 390 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2716.708351135254, |
| "epoch": 0.44685714285714284, |
| "grad_norm": 0.09803462773561478, |
| "kl": 2.5559216737747192e-05, |
| "lambda_div_used": 0.6362191960215569, |
| "learning_rate": 2.2629708984760706e-07, |
| "loss": 0.0166, |
| "reward": -0.013867436617147177, |
| "reward_after_mean": -0.013867436617147177, |
| "reward_after_std": 0.6719868443906307, |
| "reward_before_mean": 0.30256712157279253, |
| "reward_before_std": 0.6324740117415786, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3164345696568489, |
| "reward_change_min": -0.5261501334607601, |
| "reward_change_std": 0.20623697619885206, |
| "reward_std": 0.6719868592917919, |
| "rewards/accuracy_reward": 0.2708333432674408, |
| "rewards/cosine_scaled_reward": 0.03173377411440015, |
| "step": 391 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2130.9791984558105, |
| "epoch": 0.448, |
| "grad_norm": 0.11540649086236954, |
| "kl": 1.8610619008541107e-05, |
| "lambda_div_used": 0.6027960926294327, |
| "learning_rate": 2.2412266235313973e-07, |
| "loss": -0.0919, |
| "reward": -0.16835473664104939, |
| "reward_after_mean": -0.16835473664104939, |
| "reward_after_std": 0.5122340489178896, |
| "reward_before_mean": 0.1393747702240944, |
| "reward_before_std": 0.4794600326567888, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3077295087277889, |
| "reward_change_min": -0.4898943528532982, |
| "reward_change_std": 0.1954572731629014, |
| "reward_std": 0.5122340768575668, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/cosine_scaled_reward": -0.027291906299069524, |
| "step": 392 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2649.562530517578, |
| "epoch": 0.4491428571428571, |
| "grad_norm": 0.07226411253213882, |
| "kl": 2.648681402206421e-05, |
| "lambda_div_used": 0.5953627824783325, |
| "learning_rate": 2.2196411766036487e-07, |
| "loss": -0.067, |
| "reward": -0.20140517689287663, |
| "reward_after_mean": -0.20140517689287663, |
| "reward_after_std": 0.4766168761998415, |
| "reward_before_mean": 0.09705937840044498, |
| "reward_before_std": 0.4375506564974785, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2984645199030638, |
| "reward_change_min": -0.47758448868989944, |
| "reward_change_std": 0.18242334388196468, |
| "reward_std": 0.47661688551306725, |
| "rewards/accuracy_reward": 0.18750000558793545, |
| "rewards/cosine_scaled_reward": -0.09044064581394196, |
| "step": 393 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3104.1458740234375, |
| "epoch": 0.4502857142857143, |
| "grad_norm": 0.07446404546499252, |
| "kl": 2.555176615715027e-05, |
| "lambda_div_used": 0.5502220839262009, |
| "learning_rate": 2.1982156097370557e-07, |
| "loss": -0.1089, |
| "reward": -0.46874738670885563, |
| "reward_after_mean": -0.46874738670885563, |
| "reward_after_std": 0.30342659167945385, |
| "reward_before_mean": -0.20392105542123318, |
| "reward_before_std": 0.22309745661914349, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2648263294249773, |
| "reward_change_min": -0.3818593733012676, |
| "reward_change_std": 0.14172559697180986, |
| "reward_std": 0.30342659167945385, |
| "rewards/accuracy_reward": 0.02083333395421505, |
| "rewards/cosine_scaled_reward": -0.22475438751280308, |
| "step": 394 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2284.6458435058594, |
| "epoch": 0.4514285714285714, |
| "grad_norm": 0.12902304530143738, |
| "kl": 4.9501657485961914e-05, |
| "lambda_div_used": 0.5602920204401016, |
| "learning_rate": 2.1769509671835223e-07, |
| "loss": -0.1675, |
| "reward": -0.3165151756256819, |
| "reward_after_mean": -0.3165151756256819, |
| "reward_after_std": 0.3793674483895302, |
| "reward_before_mean": 0.0216768067330122, |
| "reward_before_std": 0.27139727398753166, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3381919823586941, |
| "reward_change_min": -0.48955149203538895, |
| "reward_change_std": 0.17990652937442064, |
| "reward_std": 0.3793674521148205, |
| "rewards/accuracy_reward": 0.14583333395421505, |
| "rewards/cosine_scaled_reward": -0.12415653187781572, |
| "step": 395 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3048.1458587646484, |
| "epoch": 0.45257142857142857, |
| "grad_norm": 0.054839495569467545, |
| "kl": 7.04331323504448e-05, |
| "lambda_div_used": 0.6417915895581245, |
| "learning_rate": 2.1558482853517253e-07, |
| "loss": 0.0192, |
| "reward": -0.09411877207458019, |
| "reward_after_mean": -0.09411877207458019, |
| "reward_after_std": 0.6742421016097069, |
| "reward_before_mean": 0.1871709941624431, |
| "reward_before_std": 0.659186695702374, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2812897562980652, |
| "reward_change_min": -0.5664796940982342, |
| "reward_change_std": 0.20405743923038244, |
| "reward_std": 0.6742421071976423, |
| "rewards/accuracy_reward": 0.20833333767950535, |
| "rewards/cosine_scaled_reward": -0.021162351593375206, |
| "step": 396 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2982.750030517578, |
| "epoch": 0.45371428571428574, |
| "grad_norm": 0.05798732861876488, |
| "kl": 2.0850449800491333e-05, |
| "lambda_div_used": 0.5765915215015411, |
| "learning_rate": 2.134908592756607e-07, |
| "loss": 0.0391, |
| "reward": -0.36006474308669567, |
| "reward_after_mean": -0.36006474308669567, |
| "reward_after_std": 0.40458301082253456, |
| "reward_before_mean": -0.08499279711395502, |
| "reward_before_std": 0.3492864612489939, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2750719413161278, |
| "reward_change_min": -0.4323217496275902, |
| "reward_change_std": 0.16262990981340408, |
| "reward_std": 0.40458302199840546, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/cosine_scaled_reward": -0.16832613572478294, |
| "step": 397 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2822.5209045410156, |
| "epoch": 0.45485714285714285, |
| "grad_norm": 0.06360434740781784, |
| "kl": 2.2810418158769608e-05, |
| "lambda_div_used": 0.6480221897363663, |
| "learning_rate": 2.1141329099692406e-07, |
| "loss": 0.0465, |
| "reward": -0.09229415841400623, |
| "reward_after_mean": -0.09229415841400623, |
| "reward_after_std": 0.7079674638807774, |
| "reward_before_mean": 0.1752314588520676, |
| "reward_before_std": 0.6917341919615865, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.26752561889588833, |
| "reward_change_min": -0.5181693434715271, |
| "reward_change_std": 0.19102454278618097, |
| "reward_std": 0.7079674787819386, |
| "rewards/accuracy_reward": 0.2083333358168602, |
| "rewards/cosine_scaled_reward": -0.03310186788439751, |
| "step": 398 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2191.583351135254, |
| "epoch": 0.456, |
| "grad_norm": 0.07585066556930542, |
| "kl": 3.449246287345886e-05, |
| "lambda_div_used": 0.6071069464087486, |
| "learning_rate": 2.0935222495670968e-07, |
| "loss": -0.0428, |
| "reward": -0.06968086212873459, |
| "reward_after_mean": -0.06968086212873459, |
| "reward_after_std": 0.5623702071607113, |
| "reward_before_mean": 0.29000685061328113, |
| "reward_before_std": 0.4939764440059662, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.35968772508203983, |
| "reward_change_min": -0.5781176537275314, |
| "reward_change_std": 0.2176226656883955, |
| "reward_std": 0.5623702295124531, |
| "rewards/accuracy_reward": 0.27083333767950535, |
| "rewards/cosine_scaled_reward": 0.01917351223528385, |
| "step": 399 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1814.4375305175781, |
| "epoch": 0.45714285714285713, |
| "grad_norm": 0.09906096011400223, |
| "kl": 3.718771040439606e-05, |
| "lambda_div_used": 0.615766242146492, |
| "learning_rate": 2.0730776160846853e-07, |
| "loss": -0.0203, |
| "reward": 0.023047026246786118, |
| "reward_after_mean": 0.023047026246786118, |
| "reward_after_std": 0.6236102003604174, |
| "reward_before_mean": 0.43256790889427066, |
| "reward_before_std": 0.5322064086794853, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.40952087566256523, |
| "reward_change_min": -0.6220400035381317, |
| "reward_change_std": 0.2354581467807293, |
| "reward_std": 0.62361023388803, |
| "rewards/accuracy_reward": 0.3125000074505806, |
| "rewards/cosine_scaled_reward": 0.12006791215389967, |
| "step": 400 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3091.7916870117188, |
| "epoch": 0.4582857142857143, |
| "grad_norm": 0.05475914105772972, |
| "kl": 3.5446137189865112e-06, |
| "lambda_div_used": 0.562153548002243, |
| "learning_rate": 2.0528000059645995e-07, |
| "loss": 0.0508, |
| "reward": -0.29909564182162285, |
| "reward_after_mean": -0.29909564182162285, |
| "reward_after_std": 0.3837758805602789, |
| "reward_before_mean": 0.04575100168585777, |
| "reward_before_std": 0.28164876997470856, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3448466807603836, |
| "reward_change_min": -0.5300594680011272, |
| "reward_change_std": 0.19381076097488403, |
| "reward_std": 0.38377588987350464, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.12091565364971757, |
| "step": 401 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2440.1458740234375, |
| "epoch": 0.4594285714285714, |
| "grad_norm": 0.11770489811897278, |
| "kl": 1.8846243619918823e-05, |
| "lambda_div_used": 0.5605455562472343, |
| "learning_rate": 2.032690407508949e-07, |
| "loss": -0.0261, |
| "reward": -0.2383374497294426, |
| "reward_after_mean": -0.2383374497294426, |
| "reward_after_std": 0.3794025480747223, |
| "reward_before_mean": 0.14294641837477684, |
| "reward_before_std": 0.27562023140490055, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.38128384202718735, |
| "reward_change_min": -0.5627436973154545, |
| "reward_change_std": 0.2136234436184168, |
| "reward_std": 0.3794025518000126, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/cosine_scaled_reward": -0.044553620740771294, |
| "step": 402 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1985.6875343322754, |
| "epoch": 0.4605714285714286, |
| "grad_norm": 0.10363903641700745, |
| "kl": 2.3838132619857788e-05, |
| "lambda_div_used": 0.5793934315443039, |
| "learning_rate": 2.0127498008311922e-07, |
| "loss": 0.003, |
| "reward": -0.17537187691777945, |
| "reward_after_mean": -0.17537187691777945, |
| "reward_after_std": 0.46652381494641304, |
| "reward_before_mean": 0.20323466695845127, |
| "reward_before_std": 0.3647587588056922, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.37860656157135963, |
| "reward_change_min": -0.5506669841706753, |
| "reward_change_std": 0.21334033645689487, |
| "reward_std": 0.4665238317102194, |
| "rewards/accuracy_reward": 0.22916666977107525, |
| "rewards/cosine_scaled_reward": -0.025931986048817635, |
| "step": 403 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2137.375, |
| "epoch": 0.4617142857142857, |
| "grad_norm": 0.10717064887285233, |
| "kl": 3.7202611565589905e-05, |
| "lambda_div_used": 0.5993036478757858, |
| "learning_rate": 1.9929791578083655e-07, |
| "loss": -0.0043, |
| "reward": -0.053446926176548004, |
| "reward_after_mean": -0.053446926176548004, |
| "reward_after_std": 0.4612566214054823, |
| "reward_before_mean": 0.3198554217815399, |
| "reward_before_std": 0.4542539082467556, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3733023554086685, |
| "reward_change_min": -0.5774630047380924, |
| "reward_change_std": 0.2339334823191166, |
| "reward_std": 0.4612566400319338, |
| "rewards/accuracy_reward": 0.2708333432674408, |
| "rewards/cosine_scaled_reward": 0.04902207478880882, |
| "step": 404 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2319.5000228881836, |
| "epoch": 0.46285714285714286, |
| "grad_norm": 0.13891565799713135, |
| "kl": 2.9403716325759888e-05, |
| "lambda_div_used": 0.6237343773245811, |
| "learning_rate": 1.9733794420337213e-07, |
| "loss": -0.0602, |
| "reward": 0.1285000964999199, |
| "reward_after_mean": 0.1285000964999199, |
| "reward_after_std": 0.7067493386566639, |
| "reward_before_mean": 0.5776062086224556, |
| "reward_before_std": 0.5746421907097101, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.449106115847826, |
| "reward_change_min": -0.6593505516648293, |
| "reward_change_std": 0.263070298358798, |
| "reward_std": 0.7067493461072445, |
| "rewards/accuracy_reward": 0.4166666679084301, |
| "rewards/cosine_scaled_reward": 0.1609395444393158, |
| "step": 405 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2442.1458892822266, |
| "epoch": 0.464, |
| "grad_norm": 0.07689624279737473, |
| "kl": 8.527189493179321e-06, |
| "lambda_div_used": 0.6336105018854141, |
| "learning_rate": 1.9539516087697517e-07, |
| "loss": -0.0, |
| "reward": -0.11170937749557197, |
| "reward_after_mean": -0.11170937749557197, |
| "reward_after_std": 0.6453428398817778, |
| "reward_before_mean": 0.17541324836201966, |
| "reward_before_std": 0.6142212487757206, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2871226370334625, |
| "reward_change_min": -0.5107162520289421, |
| "reward_change_std": 0.19029081240296364, |
| "reward_std": 0.645342854782939, |
| "rewards/accuracy_reward": 0.2083333395421505, |
| "rewards/cosine_scaled_reward": -0.032920089550316334, |
| "step": 406 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2516.6041946411133, |
| "epoch": 0.46514285714285714, |
| "grad_norm": 0.09361904859542847, |
| "kl": 2.995133399963379e-05, |
| "lambda_div_used": 0.5926980003714561, |
| "learning_rate": 1.934696604901642e-07, |
| "loss": 0.011, |
| "reward": -0.16482560662552714, |
| "reward_after_mean": -0.16482560662552714, |
| "reward_after_std": 0.5236028637737036, |
| "reward_before_mean": 0.18611273169517517, |
| "reward_before_std": 0.4258856289088726, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.35093834809958935, |
| "reward_change_min": -0.5517234578728676, |
| "reward_change_std": 0.20658957865089178, |
| "reward_std": 0.5236028656363487, |
| "rewards/accuracy_reward": 0.25000000186264515, |
| "rewards/cosine_scaled_reward": -0.06388726737350225, |
| "step": 407 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2709.4791984558105, |
| "epoch": 0.4662857142857143, |
| "grad_norm": 0.09306048601865768, |
| "kl": 3.203703090548515e-05, |
| "lambda_div_used": 0.5583930537104607, |
| "learning_rate": 1.915615368891117e-07, |
| "loss": -0.111, |
| "reward": -0.17684245621785522, |
| "reward_after_mean": -0.17684245621785522, |
| "reward_after_std": 0.42921141162514687, |
| "reward_before_mean": 0.2615569829940796, |
| "reward_before_std": 0.26226984336972237, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.43839943781495094, |
| "reward_change_min": -0.6072936318814754, |
| "reward_change_std": 0.22694199439138174, |
| "reward_std": 0.42921141907572746, |
| "rewards/accuracy_reward": 0.27083333395421505, |
| "rewards/cosine_scaled_reward": -0.009276359807699919, |
| "step": 408 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3529.3541870117188, |
| "epoch": 0.4674285714285714, |
| "grad_norm": 0.048929836601018906, |
| "kl": 7.6089054346084595e-06, |
| "lambda_div_used": 0.6116252392530441, |
| "learning_rate": 1.8967088307307e-07, |
| "loss": 0.001, |
| "reward": -0.17062923312187195, |
| "reward_after_mean": -0.17062923312187195, |
| "reward_after_std": 0.5682655684649944, |
| "reward_before_mean": 0.12847105879336596, |
| "reward_before_std": 0.5097848381847143, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2991002984344959, |
| "reward_change_min": -0.43997345492243767, |
| "reward_change_std": 0.17078326642513275, |
| "reward_std": 0.5682655889540911, |
| "rewards/accuracy_reward": 0.16666667349636555, |
| "rewards/cosine_scaled_reward": -0.038195611676201224, |
| "step": 409 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2435.625045776367, |
| "epoch": 0.4685714285714286, |
| "grad_norm": 0.08304905891418457, |
| "kl": 1.5234574675559998e-05, |
| "lambda_div_used": 0.6268336623907089, |
| "learning_rate": 1.8779779118983867e-07, |
| "loss": 0.0499, |
| "reward": -0.03433734131976962, |
| "reward_after_mean": -0.03433734131976962, |
| "reward_after_std": 0.6416803412139416, |
| "reward_before_mean": 0.2960522407665849, |
| "reward_before_std": 0.5826444877311587, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3303895927965641, |
| "reward_change_min": -0.4872638136148453, |
| "reward_change_std": 0.19508948456496, |
| "reward_std": 0.6416803747415543, |
| "rewards/accuracy_reward": 0.25000000931322575, |
| "rewards/cosine_scaled_reward": 0.04605222793179564, |
| "step": 410 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3163.1041717529297, |
| "epoch": 0.4697142857142857, |
| "grad_norm": 0.06846357136964798, |
| "kl": 1.529604196548462e-05, |
| "lambda_div_used": 0.5874563306570053, |
| "learning_rate": 1.8594235253127372e-07, |
| "loss": 0.0214, |
| "reward": -0.22584301605820656, |
| "reward_after_mean": -0.22584301605820656, |
| "reward_after_std": 0.42311797849833965, |
| "reward_before_mean": 0.0774743240326643, |
| "reward_before_std": 0.4020446836948395, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3033173345029354, |
| "reward_change_min": -0.48705917969346046, |
| "reward_change_std": 0.1920422399416566, |
| "reward_std": 0.42311798594892025, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.08919236063957214, |
| "step": 411 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2888.125015258789, |
| "epoch": 0.47085714285714286, |
| "grad_norm": 0.06694348156452179, |
| "kl": 2.148747444152832e-05, |
| "lambda_div_used": 0.6005539819598198, |
| "learning_rate": 1.8410465752883758e-07, |
| "loss": 0.049, |
| "reward": -0.21726901549845934, |
| "reward_after_mean": -0.21726901549845934, |
| "reward_after_std": 0.48868509009480476, |
| "reward_before_mean": 0.08159955404698849, |
| "reward_before_std": 0.463420107960701, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2988685816526413, |
| "reward_change_min": -0.554147582501173, |
| "reward_change_std": 0.19932363275438547, |
| "reward_std": 0.48868510872125626, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.06423377990722656, |
| "step": 412 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2530.2708702087402, |
| "epoch": 0.472, |
| "grad_norm": 0.08418448269367218, |
| "kl": 2.208724617958069e-05, |
| "lambda_div_used": 0.6184473037719727, |
| "learning_rate": 1.822847957491922e-07, |
| "loss": 0.0462, |
| "reward": 0.05549921467900276, |
| "reward_after_mean": 0.05549921467900276, |
| "reward_after_std": 0.6010984163731337, |
| "reward_before_mean": 0.4671566132456064, |
| "reward_before_std": 0.546179112046957, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4116574004292488, |
| "reward_change_min": -0.6464647352695465, |
| "reward_change_std": 0.253952544182539, |
| "reward_std": 0.6010984499007463, |
| "rewards/accuracy_reward": 0.37500000931322575, |
| "rewards/cosine_scaled_reward": 0.09215660532936454, |
| "step": 413 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2948.562530517578, |
| "epoch": 0.47314285714285714, |
| "grad_norm": 0.06618716567754745, |
| "kl": 1.0458752512931824e-05, |
| "lambda_div_used": 0.6192215830087662, |
| "learning_rate": 1.804828558898332e-07, |
| "loss": -0.0483, |
| "reward": -0.21263186633586884, |
| "reward_after_mean": -0.21263186633586884, |
| "reward_after_std": 0.5998369809240103, |
| "reward_before_mean": 0.04798411298543215, |
| "reward_before_std": 0.555210480466485, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2606159746646881, |
| "reward_change_min": -0.4564817361533642, |
| "reward_change_std": 0.16899613942950964, |
| "reward_std": 0.5998369976878166, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.0978492172434926, |
| "step": 414 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3234.8125, |
| "epoch": 0.4742857142857143, |
| "grad_norm": 0.06085793301463127, |
| "kl": 2.8954818844795227e-05, |
| "lambda_div_used": 0.5570018589496613, |
| "learning_rate": 1.7869892577476722e-07, |
| "loss": 0.0073, |
| "reward": -0.46066740388050675, |
| "reward_after_mean": -0.46066740388050675, |
| "reward_after_std": 0.3419057931751013, |
| "reward_before_mean": -0.2106443401426077, |
| "reward_before_std": 0.2546057654544711, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2500230632722378, |
| "reward_change_min": -0.36453111097216606, |
| "reward_change_std": 0.1324605904519558, |
| "reward_std": 0.3419058118015528, |
| "rewards/accuracy_reward": 0.02083333395421505, |
| "rewards/cosine_scaled_reward": -0.23147768154740334, |
| "step": 415 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1748.9583892822266, |
| "epoch": 0.4754285714285714, |
| "grad_norm": 0.09120064228773117, |
| "kl": 8.981674909591675e-06, |
| "lambda_div_used": 0.6067373231053352, |
| "learning_rate": 1.7693309235023127e-07, |
| "loss": -0.0771, |
| "reward": -0.13937147613614798, |
| "reward_after_mean": -0.13937147613614798, |
| "reward_after_std": 0.6240962240844965, |
| "reward_before_mean": 0.20647221896797419, |
| "reward_before_std": 0.48884215706493706, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.34584368020296097, |
| "reward_change_min": -0.5141387544572353, |
| "reward_change_std": 0.18731264490634203, |
| "reward_std": 0.6240962333977222, |
| "rewards/accuracy_reward": 0.2500000037252903, |
| "rewards/cosine_scaled_reward": -0.04352780181216076, |
| "step": 416 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3307.250015258789, |
| "epoch": 0.4765714285714286, |
| "grad_norm": 0.05766940861940384, |
| "kl": 3.1495699658989906e-05, |
| "lambda_div_used": 0.5544050261378288, |
| "learning_rate": 1.7518544168045524e-07, |
| "loss": -0.0019, |
| "reward": -0.4980150870978832, |
| "reward_after_mean": -0.4980150870978832, |
| "reward_after_std": 0.3123048096895218, |
| "reward_before_mean": -0.2511206082999706, |
| "reward_before_std": 0.24326619878411293, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.24689447320997715, |
| "reward_change_min": -0.3710709176957607, |
| "reward_change_std": 0.13346257898956537, |
| "reward_std": 0.3123048171401024, |
| "rewards/accuracy_reward": 0.02083333395421505, |
| "rewards/cosine_scaled_reward": -0.2719539441168308, |
| "step": 417 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2171.625015258789, |
| "epoch": 0.4777142857142857, |
| "grad_norm": 0.0854194164276123, |
| "kl": 2.8558075428009033e-05, |
| "lambda_div_used": 0.6033797562122345, |
| "learning_rate": 1.7345605894346726e-07, |
| "loss": 0.0073, |
| "reward": -0.0949536501429975, |
| "reward_after_mean": -0.0949536501429975, |
| "reward_after_std": 0.540143633261323, |
| "reward_before_mean": 0.25413690507411957, |
| "reward_before_std": 0.4725718079134822, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.34909053333103657, |
| "reward_change_min": -0.5411153584718704, |
| "reward_change_std": 0.2063078135251999, |
| "reward_std": 0.5401436407119036, |
| "rewards/accuracy_reward": 0.29166667349636555, |
| "rewards/cosine_scaled_reward": -0.03752976981922984, |
| "step": 418 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2464.3750228881836, |
| "epoch": 0.47885714285714287, |
| "grad_norm": 0.08315848559141159, |
| "kl": 3.0182301998138428e-05, |
| "lambda_div_used": 0.5937136337161064, |
| "learning_rate": 1.7174502842694212e-07, |
| "loss": 0.03, |
| "reward": -0.27206650376319885, |
| "reward_after_mean": -0.27206650376319885, |
| "reward_after_std": 0.49108788557350636, |
| "reward_before_mean": 0.0038498505018651485, |
| "reward_before_std": 0.4298001816496253, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27591635659337044, |
| "reward_change_min": -0.4215012192726135, |
| "reward_change_std": 0.16396540496498346, |
| "reward_std": 0.49108790047466755, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/cosine_scaled_reward": -0.12115013878792524, |
| "step": 419 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1972.7083625793457, |
| "epoch": 0.48, |
| "grad_norm": 0.11145550012588501, |
| "kl": 3.493949770927429e-05, |
| "lambda_div_used": 0.5797194987535477, |
| "learning_rate": 1.7005243352409333e-07, |
| "loss": -0.0607, |
| "reward": -0.1652970388531685, |
| "reward_after_mean": -0.1652970388531685, |
| "reward_after_std": 0.4477591011673212, |
| "reward_before_mean": 0.18980162939988077, |
| "reward_before_std": 0.36517443507909775, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.35509867407381535, |
| "reward_change_min": -0.5067496970295906, |
| "reward_change_std": 0.19984195847064257, |
| "reward_std": 0.4477591197937727, |
| "rewards/accuracy_reward": 0.22916666977107525, |
| "rewards/cosine_scaled_reward": -0.03936504106968641, |
| "step": 420 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3191.937515258789, |
| "epoch": 0.48114285714285715, |
| "grad_norm": 0.07170204073190689, |
| "kl": 3.11434268951416e-05, |
| "lambda_div_used": 0.5753046199679375, |
| "learning_rate": 1.6837835672960831e-07, |
| "loss": 0.0088, |
| "reward": -0.38796948781237006, |
| "reward_after_mean": -0.38796948781237006, |
| "reward_after_std": 0.4225973077118397, |
| "reward_before_mean": -0.12759371474385262, |
| "reward_before_std": 0.3405588921159506, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2603757604956627, |
| "reward_change_min": -0.37436668202281, |
| "reward_change_std": 0.13996374886482954, |
| "reward_std": 0.4225973132997751, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.16926039289683104, |
| "step": 421 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2776.437545776367, |
| "epoch": 0.48228571428571426, |
| "grad_norm": 0.06681375205516815, |
| "kl": 1.9788742065429688e-05, |
| "lambda_div_used": 0.5904401987791061, |
| "learning_rate": 1.6672287963562852e-07, |
| "loss": 0.0293, |
| "reward": -0.25396816432476044, |
| "reward_after_mean": -0.25396816432476044, |
| "reward_after_std": 0.4787884410470724, |
| "reward_before_mean": 0.03486193250864744, |
| "reward_before_std": 0.4093271289020777, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2888300847262144, |
| "reward_change_min": -0.43693453073501587, |
| "reward_change_std": 0.16317449882626534, |
| "reward_std": 0.4787884559482336, |
| "rewards/accuracy_reward": 0.1458333395421505, |
| "rewards/cosine_scaled_reward": -0.11097141215577722, |
| "step": 422 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2945.9583740234375, |
| "epoch": 0.48342857142857143, |
| "grad_norm": 0.06767871230840683, |
| "kl": 2.205371856689453e-05, |
| "lambda_div_used": 0.611551009118557, |
| "learning_rate": 1.6508608292777203e-07, |
| "loss": 0.0153, |
| "reward": -0.16482537053525448, |
| "reward_after_mean": -0.16482537053525448, |
| "reward_after_std": 0.5363675616681576, |
| "reward_before_mean": 0.12604539189487696, |
| "reward_before_std": 0.518328445032239, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2908707782626152, |
| "reward_change_min": -0.4775175042450428, |
| "reward_change_std": 0.18889948446303606, |
| "reward_std": 0.5363675802946091, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/cosine_scaled_reward": -0.040621266700327396, |
| "step": 423 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3215.0208740234375, |
| "epoch": 0.4845714285714286, |
| "grad_norm": 0.061716482043266296, |
| "kl": 1.1460855603218079e-05, |
| "lambda_div_used": 0.5937408357858658, |
| "learning_rate": 1.6346804638120098e-07, |
| "loss": 0.0327, |
| "reward": -0.3415182586759329, |
| "reward_after_mean": -0.3415182586759329, |
| "reward_after_std": 0.48476750776171684, |
| "reward_before_mean": -0.08947536488994956, |
| "reward_before_std": 0.4272408355027437, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2520429063588381, |
| "reward_change_min": -0.42084335163235664, |
| "reward_change_std": 0.15120856929570436, |
| "reward_std": 0.4847675133496523, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.17280870315153152, |
| "step": 424 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2154.9375610351562, |
| "epoch": 0.4857142857142857, |
| "grad_norm": 0.09371069073677063, |
| "kl": 3.830809146165848e-05, |
| "lambda_div_used": 0.6315138638019562, |
| "learning_rate": 1.6186884885673413e-07, |
| "loss": 0.0058, |
| "reward": 0.34828917868435383, |
| "reward_after_mean": 0.34828917868435383, |
| "reward_after_std": 0.7491090279072523, |
| "reward_before_mean": 0.91605463065207, |
| "reward_before_std": 0.6105400957167149, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5677654705941677, |
| "reward_change_min": -0.9185826852917671, |
| "reward_change_std": 0.34253316558897495, |
| "reward_std": 0.7491090428084135, |
| "rewards/accuracy_reward": 0.5833333358168602, |
| "rewards/cosine_scaled_reward": 0.3327212668955326, |
| "step": 425 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2205.3125190734863, |
| "epoch": 0.4868571428571429, |
| "grad_norm": 0.09298980236053467, |
| "kl": 2.3175030946731567e-05, |
| "lambda_div_used": 0.6215192526578903, |
| "learning_rate": 1.6028856829700258e-07, |
| "loss": -0.002, |
| "reward": -0.029936233535408974, |
| "reward_after_mean": -0.029936233535408974, |
| "reward_after_std": 0.5604026541113853, |
| "reward_before_mean": 0.3139693345874548, |
| "reward_before_std": 0.5635973755270243, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.34390556812286377, |
| "reward_change_min": -0.5907084755599499, |
| "reward_change_std": 0.23498705588281155, |
| "reward_std": 0.5604026671499014, |
| "rewards/accuracy_reward": 0.3125000074505806, |
| "rewards/cosine_scaled_reward": 0.0014693308621644974, |
| "step": 426 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3295.7708740234375, |
| "epoch": 0.488, |
| "grad_norm": 0.0548052154481411, |
| "kl": 2.4527311325073242e-05, |
| "lambda_div_used": 0.5990054532885551, |
| "learning_rate": 1.5872728172265146e-07, |
| "loss": 0.0088, |
| "reward": -0.2071160115301609, |
| "reward_after_mean": -0.2071160115301609, |
| "reward_after_std": 0.49672279693186283, |
| "reward_before_mean": 0.08893106225878, |
| "reward_before_std": 0.454012256115675, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2960470784455538, |
| "reward_change_min": -0.4723533205688, |
| "reward_change_std": 0.18442231137305498, |
| "reward_std": 0.4967228155583143, |
| "rewards/accuracy_reward": 0.18750000558793545, |
| "rewards/cosine_scaled_reward": -0.09856893587857485, |
| "step": 427 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2302.520881652832, |
| "epoch": 0.48914285714285716, |
| "grad_norm": 0.11483462899923325, |
| "kl": 0.000337366946041584, |
| "lambda_div_used": 0.5547136962413788, |
| "learning_rate": 1.5718506522858572e-07, |
| "loss": 0.0311, |
| "reward": -0.42663199454545975, |
| "reward_after_mean": -0.42663199454545975, |
| "reward_after_std": 0.29241302236914635, |
| "reward_before_mean": -0.1547260768711567, |
| "reward_before_std": 0.24498768709599972, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27190591767430305, |
| "reward_change_min": -0.42438486963510513, |
| "reward_change_std": 0.1579811777919531, |
| "reward_std": 0.2924130354076624, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.2380594089627266, |
| "step": 428 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2431.520881652832, |
| "epoch": 0.49028571428571427, |
| "grad_norm": 0.0878632664680481, |
| "kl": 2.4685636162757874e-05, |
| "lambda_div_used": 0.6025609895586967, |
| "learning_rate": 1.5566199398026147e-07, |
| "loss": 0.0378, |
| "reward": -0.057155030546709895, |
| "reward_after_mean": -0.057155030546709895, |
| "reward_after_std": 0.5567535478621721, |
| "reward_before_mean": 0.3208682704716921, |
| "reward_before_std": 0.47341752983629704, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3780232574790716, |
| "reward_change_min": -0.5763550102710724, |
| "reward_change_std": 0.22364642471075058, |
| "reward_std": 0.5567535553127527, |
| "rewards/accuracy_reward": 0.29166666977107525, |
| "rewards/cosine_scaled_reward": 0.029201554832980037, |
| "step": 429 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2577.3125610351562, |
| "epoch": 0.49142857142857144, |
| "grad_norm": 0.08103640377521515, |
| "kl": 1.4763325452804565e-05, |
| "lambda_div_used": 0.6012379750609398, |
| "learning_rate": 1.5415814221002265e-07, |
| "loss": -0.0049, |
| "reward": 0.031602535396814346, |
| "reward_after_mean": 0.031602535396814346, |
| "reward_after_std": 0.5440461356192827, |
| "reward_before_mean": 0.4564003311097622, |
| "reward_before_std": 0.46839726250618696, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.42479780688881874, |
| "reward_change_min": -0.6648109555244446, |
| "reward_change_std": 0.2579617351293564, |
| "reward_std": 0.5440461542457342, |
| "rewards/accuracy_reward": 0.3541666753590107, |
| "rewards/cosine_scaled_reward": 0.1022336557507515, |
| "step": 430 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2690.6458740234375, |
| "epoch": 0.49257142857142855, |
| "grad_norm": 0.06538030505180359, |
| "kl": 2.8170645236968994e-05, |
| "lambda_div_used": 0.5918548330664635, |
| "learning_rate": 1.5267358321348285e-07, |
| "loss": -0.0328, |
| "reward": -0.24143310775980353, |
| "reward_after_mean": -0.24143310775980353, |
| "reward_after_std": 0.481786971911788, |
| "reward_before_mean": 0.05766227189451456, |
| "reward_before_std": 0.41715206764638424, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.29909538850188255, |
| "reward_change_min": -0.4450513869524002, |
| "reward_change_std": 0.1692206682637334, |
| "reward_std": 0.4817869979888201, |
| "rewards/accuracy_reward": 0.1458333395421505, |
| "rewards/cosine_scaled_reward": -0.08817105047637597, |
| "step": 431 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2843.750045776367, |
| "epoch": 0.4937142857142857, |
| "grad_norm": 0.05780716612935066, |
| "kl": 3.104284405708313e-05, |
| "lambda_div_used": 0.576793298125267, |
| "learning_rate": 1.5120838934595337e-07, |
| "loss": 0.0053, |
| "reward": -0.42155745439231396, |
| "reward_after_mean": -0.42155745439231396, |
| "reward_after_std": 0.4230448566377163, |
| "reward_before_mean": -0.19034346495755017, |
| "reward_before_std": 0.35024640895426273, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.23121398873627186, |
| "reward_change_min": -0.3715337961912155, |
| "reward_change_std": 0.1338915005326271, |
| "reward_std": 0.4230448678135872, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/cosine_scaled_reward": -0.2528434665873647, |
| "step": 432 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2992.062530517578, |
| "epoch": 0.4948571428571429, |
| "grad_norm": 0.06562553346157074, |
| "kl": 3.0972063541412354e-05, |
| "lambda_div_used": 0.6079665347933769, |
| "learning_rate": 1.4976263201891613e-07, |
| "loss": 0.1201, |
| "reward": -0.26266857516020536, |
| "reward_after_mean": -0.26266857516020536, |
| "reward_after_std": 0.5306479204446077, |
| "reward_before_mean": -0.0102414321154356, |
| "reward_before_std": 0.5030816271901131, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.25242713652551174, |
| "reward_change_min": -0.4955142140388489, |
| "reward_change_std": 0.1781205264851451, |
| "reward_std": 0.5306479260325432, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.15607476886361837, |
| "step": 433 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2915.625030517578, |
| "epoch": 0.496, |
| "grad_norm": 0.06934478878974915, |
| "kl": 3.8422178477048874e-05, |
| "lambda_div_used": 0.5604980885982513, |
| "learning_rate": 1.483363816965435e-07, |
| "loss": -0.004, |
| "reward": -0.4280230412259698, |
| "reward_after_mean": -0.4280230412259698, |
| "reward_after_std": 0.32627478428184986, |
| "reward_before_mean": -0.15262745507061481, |
| "reward_before_std": 0.2731653768569231, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27539557591080666, |
| "reward_change_min": -0.44822341948747635, |
| "reward_change_std": 0.16339552495628595, |
| "reward_std": 0.32627478800714016, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.1942941201850772, |
| "step": 434 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2320.750026702881, |
| "epoch": 0.49714285714285716, |
| "grad_norm": 0.13675029575824738, |
| "kl": 4.8140063881874084e-05, |
| "lambda_div_used": 0.5811168253421783, |
| "learning_rate": 1.469297078922642e-07, |
| "loss": -0.0374, |
| "reward": -0.17623315937817097, |
| "reward_after_mean": -0.17623315937817097, |
| "reward_after_std": 0.483237961307168, |
| "reward_before_mean": 0.19585114251822233, |
| "reward_before_std": 0.3759047882631421, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.37208431400358677, |
| "reward_change_min": -0.5468379594385624, |
| "reward_change_std": 0.20962903555482626, |
| "reward_std": 0.4832379762083292, |
| "rewards/accuracy_reward": 0.20833333395421505, |
| "rewards/cosine_scaled_reward": -0.01248218398541212, |
| "step": 435 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2316.3125381469727, |
| "epoch": 0.4982857142857143, |
| "grad_norm": 0.09783437103033066, |
| "kl": 2.839416265487671e-05, |
| "lambda_div_used": 0.6289880573749542, |
| "learning_rate": 1.4554267916537495e-07, |
| "loss": 0.0826, |
| "reward": 0.12041758373379707, |
| "reward_after_mean": 0.12041758373379707, |
| "reward_after_std": 0.6426001656800508, |
| "reward_before_mean": 0.5348608233034611, |
| "reward_before_std": 0.6021685730665922, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.41444322280585766, |
| "reward_change_min": -0.6993351243436337, |
| "reward_change_std": 0.271469890139997, |
| "reward_std": 0.6426001694053411, |
| "rewards/accuracy_reward": 0.3958333395421505, |
| "rewards/cosine_scaled_reward": 0.13902746886014938, |
| "step": 436 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2954.0416717529297, |
| "epoch": 0.49942857142857144, |
| "grad_norm": 0.05496148392558098, |
| "kl": 0.00023943744599819183, |
| "lambda_div_used": 0.5626013651490211, |
| "learning_rate": 1.4417536311769885e-07, |
| "loss": 0.05, |
| "reward": -0.42826351523399353, |
| "reward_after_mean": -0.42826351523399353, |
| "reward_after_std": 0.33650987036526203, |
| "reward_before_mean": -0.16868688352406025, |
| "reward_before_std": 0.2832178361713886, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.25957662612199783, |
| "reward_change_min": -0.42033713683485985, |
| "reward_change_std": 0.1560918828472495, |
| "reward_std": 0.3365098759531975, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/cosine_scaled_reward": -0.2311868779361248, |
| "step": 437 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3007.500015258789, |
| "epoch": 0.5005714285714286, |
| "grad_norm": 0.05682806298136711, |
| "kl": 4.358415026217699e-06, |
| "lambda_div_used": 0.6039799600839615, |
| "learning_rate": 1.4282782639029128e-07, |
| "loss": 0.0289, |
| "reward": -0.29300207551568747, |
| "reward_after_mean": -0.29300207551568747, |
| "reward_after_std": 0.5204536523669958, |
| "reward_before_mean": -0.03094739466905594, |
| "reward_before_std": 0.4829921592026949, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2620546855032444, |
| "reward_change_min": -0.5145607963204384, |
| "reward_change_std": 0.18371508549898863, |
| "reward_std": 0.5204536579549313, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/cosine_scaled_reward": -0.15594739385414869, |
| "step": 438 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2249.5000076293945, |
| "epoch": 0.5017142857142857, |
| "grad_norm": 0.0828532725572586, |
| "kl": 3.217160701751709e-05, |
| "lambda_div_used": 0.5708463862538338, |
| "learning_rate": 1.4150013466019114e-07, |
| "loss": 0.021, |
| "reward": -0.40206424333155155, |
| "reward_after_mean": -0.40206424333155155, |
| "reward_after_std": 0.40689974650740623, |
| "reward_before_mean": -0.14063972979784012, |
| "reward_before_std": 0.32079045102000237, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.261424507945776, |
| "reward_change_min": -0.38026969507336617, |
| "reward_change_std": 0.14067976083606482, |
| "reward_std": 0.406899768859148, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.1823064020718448, |
| "step": 439 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2964.000011444092, |
| "epoch": 0.5028571428571429, |
| "grad_norm": 0.07350306212902069, |
| "kl": 2.7701258659362793e-05, |
| "lambda_div_used": 0.5365751013159752, |
| "learning_rate": 1.4019235263722034e-07, |
| "loss": 0.0201, |
| "reward": -0.5367953963577747, |
| "reward_after_mean": -0.5367953963577747, |
| "reward_after_std": 0.231884878128767, |
| "reward_before_mean": -0.28081973269581795, |
| "reward_before_std": 0.16059848852455616, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.25597566179931164, |
| "reward_change_min": -0.38940757140517235, |
| "reward_change_std": 0.13862022012472153, |
| "reward_std": 0.2318848893046379, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/cosine_scaled_reward": -0.28081973642110825, |
| "step": 440 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3007.770896911621, |
| "epoch": 0.504, |
| "grad_norm": 0.07498018443584442, |
| "kl": 2.944841980934143e-05, |
| "lambda_div_used": 0.6011143997311592, |
| "learning_rate": 1.3890454406082956e-07, |
| "loss": -0.0116, |
| "reward": -0.13889071717858315, |
| "reward_after_mean": -0.13889071717858315, |
| "reward_after_std": 0.4864419028162956, |
| "reward_before_mean": 0.17084867507219315, |
| "reward_before_std": 0.4689741334877908, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3097394183278084, |
| "reward_change_min": -0.514279019087553, |
| "reward_change_std": 0.20097953081130981, |
| "reward_std": 0.48644191212952137, |
| "rewards/accuracy_reward": 0.2291666753590107, |
| "rewards/cosine_scaled_reward": -0.05831799004226923, |
| "step": 441 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2837.4583435058594, |
| "epoch": 0.5051428571428571, |
| "grad_norm": 0.0673719272017479, |
| "kl": 3.160163760185242e-05, |
| "lambda_div_used": 0.5689455196261406, |
| "learning_rate": 1.3763677169699217e-07, |
| "loss": 0.0084, |
| "reward": -0.31079866108484566, |
| "reward_after_mean": -0.31079866108484566, |
| "reward_after_std": 0.3982803635299206, |
| "reward_before_mean": -0.010625829687342048, |
| "reward_before_std": 0.31661996035836637, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.30017283000051975, |
| "reward_change_min": -0.4261031448841095, |
| "reward_change_std": 0.16800945159047842, |
| "reward_std": 0.3982803765684366, |
| "rewards/accuracy_reward": 0.10416666977107525, |
| "rewards/cosine_scaled_reward": -0.1147925069089979, |
| "step": 442 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3194.270854949951, |
| "epoch": 0.5062857142857143, |
| "grad_norm": 0.05984271690249443, |
| "kl": 1.745857298374176e-05, |
| "lambda_div_used": 0.5607094466686249, |
| "learning_rate": 1.3638909733514452e-07, |
| "loss": 0.0053, |
| "reward": -0.38279488682746887, |
| "reward_after_mean": -0.38279488682746887, |
| "reward_after_std": 0.33906700275838375, |
| "reward_before_mean": -0.09817820321768522, |
| "reward_before_std": 0.2764374865218997, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2846166864037514, |
| "reward_change_min": -0.4438001811504364, |
| "reward_change_std": 0.16594033408910036, |
| "reward_std": 0.3390670083463192, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/cosine_scaled_reward": -0.16067820694297552, |
| "step": 443 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2902.2291717529297, |
| "epoch": 0.5074285714285715, |
| "grad_norm": 0.0638304203748703, |
| "kl": 1.5033408999443054e-05, |
| "lambda_div_used": 0.5774018093943596, |
| "learning_rate": 1.351615817851748e-07, |
| "loss": -0.0355, |
| "reward": -0.3352237604558468, |
| "reward_after_mean": -0.3352237604558468, |
| "reward_after_std": 0.4039857666939497, |
| "reward_before_mean": -0.05285666696727276, |
| "reward_before_std": 0.35096561443060637, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2823671065270901, |
| "reward_change_min": -0.42594215646386147, |
| "reward_change_std": 0.16656992863863707, |
| "reward_std": 0.40398577041924, |
| "rewards/accuracy_reward": 0.10416666977107525, |
| "rewards/cosine_scaled_reward": -0.15702332742512226, |
| "step": 444 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3006.916717529297, |
| "epoch": 0.5085714285714286, |
| "grad_norm": 0.06397882103919983, |
| "kl": 2.842303365468979e-05, |
| "lambda_div_used": 0.6194909662008286, |
| "learning_rate": 1.3395428487445914e-07, |
| "loss": 0.0715, |
| "reward": -0.05263347551226616, |
| "reward_after_mean": -0.05263347551226616, |
| "reward_after_std": 0.5679135732352734, |
| "reward_before_mean": 0.2764017879962921, |
| "reward_before_std": 0.5510673765093088, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3290352523326874, |
| "reward_change_min": -0.5197134874761105, |
| "reward_change_std": 0.21186745446175337, |
| "reward_std": 0.5679135844111443, |
| "rewards/accuracy_reward": 0.27083334140479565, |
| "rewards/cosine_scaled_reward": 0.0055684298276901245, |
| "step": 445 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3028.8333892822266, |
| "epoch": 0.5097142857142857, |
| "grad_norm": 0.06140498071908951, |
| "kl": 3.2648444175720215e-05, |
| "lambda_div_used": 0.606703408062458, |
| "learning_rate": 1.3276726544494571e-07, |
| "loss": -0.0241, |
| "reward": -0.2073356769979, |
| "reward_after_mean": -0.2073356769979, |
| "reward_after_std": 0.5240648984909058, |
| "reward_before_mean": 0.0723939798772335, |
| "reward_before_std": 0.4952498711645603, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27972964756190777, |
| "reward_change_min": -0.5379137881100178, |
| "reward_change_std": 0.1909602265805006, |
| "reward_std": 0.5240649078041315, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.07343936339020729, |
| "step": 446 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2571.166721343994, |
| "epoch": 0.5108571428571429, |
| "grad_norm": 0.08778514713048935, |
| "kl": 9.072478860616684e-06, |
| "lambda_div_used": 0.5970501974225044, |
| "learning_rate": 1.316005813502869e-07, |
| "loss": 0.0018, |
| "reward": -0.19514422863721848, |
| "reward_after_mean": -0.19514422863721848, |
| "reward_after_std": 0.4898754768073559, |
| "reward_before_mean": 0.10967571474611759, |
| "reward_before_std": 0.4461768325418234, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3048199340701103, |
| "reward_change_min": -0.48957687243819237, |
| "reward_change_std": 0.18721044715493917, |
| "reward_std": 0.4898754861205816, |
| "rewards/accuracy_reward": 0.18750000558793545, |
| "rewards/cosine_scaled_reward": -0.07782429456710815, |
| "step": 447 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2039.6042022705078, |
| "epoch": 0.512, |
| "grad_norm": 0.12122303992509842, |
| "kl": 3.530830144882202e-05, |
| "lambda_div_used": 0.5591797679662704, |
| "learning_rate": 1.3045428945301953e-07, |
| "loss": 0.0057, |
| "reward": -0.3564775697886944, |
| "reward_after_mean": -0.3564775697886944, |
| "reward_after_std": 0.33722602762281895, |
| "reward_before_mean": -0.06660962291061878, |
| "reward_before_std": 0.27086107339709997, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28986795619130135, |
| "reward_change_min": -0.4345347508788109, |
| "reward_change_std": 0.1656003799289465, |
| "reward_std": 0.33722603134810925, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.14994295500218868, |
| "step": 448 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2680.0208435058594, |
| "epoch": 0.5131428571428571, |
| "grad_norm": 0.07745695859193802, |
| "kl": 3.194063901901245e-05, |
| "lambda_div_used": 0.5827131420373917, |
| "learning_rate": 1.2932844562179352e-07, |
| "loss": -0.0247, |
| "reward": -0.278532937169075, |
| "reward_after_mean": -0.278532937169075, |
| "reward_after_std": 0.44566163793206215, |
| "reward_before_mean": 0.01611769199371338, |
| "reward_before_std": 0.3775772461667657, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.29465061984956264, |
| "reward_change_min": -0.4545559212565422, |
| "reward_change_std": 0.17489958554506302, |
| "reward_std": 0.44566163793206215, |
| "rewards/accuracy_reward": 0.18750000558793545, |
| "rewards/cosine_scaled_reward": -0.17138232477009296, |
| "step": 449 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2405.0833702087402, |
| "epoch": 0.5142857142857142, |
| "grad_norm": 0.07359456270933151, |
| "kl": 4.271417856216431e-05, |
| "lambda_div_used": 0.5313882529735565, |
| "learning_rate": 1.2822310472864885e-07, |
| "loss": 0.0047, |
| "reward": -0.40455422177910805, |
| "reward_after_mean": -0.40455422177910805, |
| "reward_after_std": 0.27055008336901665, |
| "reward_before_mean": -0.05470774322748184, |
| "reward_before_std": 0.13779542688280344, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.34984651021659374, |
| "reward_change_min": -0.482446551322937, |
| "reward_change_std": 0.18013106007128954, |
| "reward_std": 0.2705500964075327, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/cosine_scaled_reward": -0.17970772832632065, |
| "step": 450 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2726.7916870117188, |
| "epoch": 0.5154285714285715, |
| "grad_norm": 0.10456772893667221, |
| "kl": 5.123019218444824e-05, |
| "lambda_div_used": 0.5659012496471405, |
| "learning_rate": 1.2713832064634125e-07, |
| "loss": 0.0437, |
| "reward": -0.233762688934803, |
| "reward_after_mean": -0.233762688934803, |
| "reward_after_std": 0.3937326893210411, |
| "reward_before_mean": 0.14449233608320355, |
| "reward_before_std": 0.30359079129993916, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3782550562173128, |
| "reward_change_min": -0.5748258531093597, |
| "reward_change_std": 0.21745321340858936, |
| "reward_std": 0.3937326930463314, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/cosine_scaled_reward": -0.0430076620541513, |
| "step": 451 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3173.812530517578, |
| "epoch": 0.5165714285714286, |
| "grad_norm": 0.05382152646780014, |
| "kl": 1.621060073375702e-05, |
| "lambda_div_used": 0.628274716436863, |
| "learning_rate": 1.260741462457165e-07, |
| "loss": 0.0152, |
| "reward": 0.08829959481954575, |
| "reward_after_mean": 0.08829959481954575, |
| "reward_after_std": 0.6491915434598923, |
| "reward_before_mean": 0.49067158019170165, |
| "reward_before_std": 0.5948440972715616, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4023719858378172, |
| "reward_change_min": -0.6429549157619476, |
| "reward_change_std": 0.2526876190677285, |
| "reward_std": 0.6491915658116341, |
| "rewards/accuracy_reward": 0.37500000558793545, |
| "rewards/cosine_scaled_reward": 0.11567158252000809, |
| "step": 452 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2601.1666870117188, |
| "epoch": 0.5177142857142857, |
| "grad_norm": 0.10710335522890091, |
| "kl": 4.57763671875e-05, |
| "lambda_div_used": 0.5769856572151184, |
| "learning_rate": 1.2503063339313356e-07, |
| "loss": -0.0233, |
| "reward": -0.4120060931891203, |
| "reward_after_mean": -0.4120060931891203, |
| "reward_after_std": 0.4247955661267042, |
| "reward_before_mean": -0.17395742796361446, |
| "reward_before_std": 0.3489131908863783, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.23804865032434464, |
| "reward_change_min": -0.3744778670370579, |
| "reward_change_std": 0.12989939749240875, |
| "reward_std": 0.4247955847531557, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.21562409959733486, |
| "step": 453 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2794.8958740234375, |
| "epoch": 0.5188571428571429, |
| "grad_norm": 0.059399981051683426, |
| "kl": 1.829490065574646e-05, |
| "lambda_div_used": 0.604412779211998, |
| "learning_rate": 1.2400783294793668e-07, |
| "loss": 0.0084, |
| "reward": -0.23108407109975815, |
| "reward_after_mean": -0.23108407109975815, |
| "reward_after_std": 0.5207228269428015, |
| "reward_before_mean": 0.05918463226407766, |
| "reward_before_std": 0.4853199487552047, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.29026869870722294, |
| "reward_change_min": -0.5276229903101921, |
| "reward_change_std": 0.1960765514522791, |
| "reward_std": 0.5207228306680918, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.08664871653309092, |
| "step": 454 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3120.0416717529297, |
| "epoch": 0.52, |
| "grad_norm": 0.07823975384235382, |
| "kl": 1.1070631444454193e-05, |
| "lambda_div_used": 0.5381916239857674, |
| "learning_rate": 1.2300579475997657e-07, |
| "loss": 0.0035, |
| "reward": -0.5376988351345062, |
| "reward_after_mean": -0.5376988351345062, |
| "reward_after_std": 0.2399882897734642, |
| "reward_before_mean": -0.28268107399344444, |
| "reward_before_std": 0.16770172398537397, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.25501775927841663, |
| "reward_change_min": -0.38403724879026413, |
| "reward_change_std": 0.1382020702585578, |
| "reward_std": 0.23998829536139965, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/cosine_scaled_reward": -0.28268107026815414, |
| "step": 455 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3183.125, |
| "epoch": 0.5211428571428571, |
| "grad_norm": 0.07441714406013489, |
| "kl": 1.7248094081878662e-05, |
| "lambda_div_used": 0.535825714468956, |
| "learning_rate": 1.220245676671809e-07, |
| "loss": -0.0061, |
| "reward": -0.5242688432335854, |
| "reward_after_mean": -0.5242688432335854, |
| "reward_after_std": 0.2355129700154066, |
| "reward_before_mean": -0.2603513076901436, |
| "reward_before_std": 0.15733365854248405, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2639175299555063, |
| "reward_change_min": -0.3936190530657768, |
| "reward_change_std": 0.14126356784254313, |
| "reward_std": 0.23551297932863235, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/cosine_scaled_reward": -0.2603513114154339, |
| "step": 456 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3090.9166717529297, |
| "epoch": 0.5222857142857142, |
| "grad_norm": 0.06740739941596985, |
| "kl": 1.6771256923675537e-05, |
| "lambda_div_used": 0.5661213397979736, |
| "learning_rate": 1.2106419949317388e-07, |
| "loss": 0.0183, |
| "reward": -0.2666686773300171, |
| "reward_after_mean": -0.2666686773300171, |
| "reward_after_std": 0.3630063198506832, |
| "reward_before_mean": 0.06939902156591415, |
| "reward_before_std": 0.3003841144964099, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3360677044838667, |
| "reward_change_min": -0.5109980814158916, |
| "reward_change_std": 0.19570716377347708, |
| "reward_std": 0.36300632916390896, |
| "rewards/accuracy_reward": 0.1458333395421505, |
| "rewards/cosine_scaled_reward": -0.0764343123883009, |
| "step": 457 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2337.0416717529297, |
| "epoch": 0.5234285714285715, |
| "grad_norm": 0.10038017481565475, |
| "kl": 1.5580561012029648e-05, |
| "lambda_div_used": 0.587464913725853, |
| "learning_rate": 1.2012473704494537e-07, |
| "loss": 0.0337, |
| "reward": -0.2921891398727894, |
| "reward_after_mean": -0.2921891398727894, |
| "reward_after_std": 0.46863692067563534, |
| "reward_before_mean": -0.017078701872378588, |
| "reward_before_std": 0.3978575337678194, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27511043660342693, |
| "reward_change_min": -0.4165792725980282, |
| "reward_change_std": 0.15801922790706158, |
| "reward_std": 0.4686369299888611, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/cosine_scaled_reward": -0.14207870024256408, |
| "step": 458 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1897.9791870117188, |
| "epoch": 0.5245714285714286, |
| "grad_norm": 0.10566994547843933, |
| "kl": 3.1791627407073975e-05, |
| "lambda_div_used": 0.6483638733625412, |
| "learning_rate": 1.1920622611056974e-07, |
| "loss": -0.0327, |
| "reward": 0.04763873480260372, |
| "reward_after_mean": 0.04763873480260372, |
| "reward_after_std": 0.6893943976610899, |
| "reward_before_mean": 0.37104589492082596, |
| "reward_before_std": 0.6905025038868189, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3234071359038353, |
| "reward_change_min": -0.573150984942913, |
| "reward_change_std": 0.22597133833914995, |
| "reward_std": 0.6893944274634123, |
| "rewards/accuracy_reward": 0.29166667349636555, |
| "rewards/cosine_scaled_reward": 0.07937921397387981, |
| "step": 459 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3152.9583587646484, |
| "epoch": 0.5257142857142857, |
| "grad_norm": 0.0553775392472744, |
| "kl": 1.6976147890090942e-05, |
| "lambda_div_used": 0.6212490126490593, |
| "learning_rate": 1.1830871145697412e-07, |
| "loss": -0.0403, |
| "reward": -0.2310712798498571, |
| "reward_after_mean": -0.2310712798498571, |
| "reward_after_std": 0.5967046339064837, |
| "reward_before_mean": 0.018506756518036127, |
| "reward_before_std": 0.5584379080682993, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.24957803264260292, |
| "reward_change_min": -0.45073715783655643, |
| "reward_change_std": 0.16405375488102436, |
| "reward_std": 0.5967046469449997, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.1273265864001587, |
| "step": 460 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2965.625015258789, |
| "epoch": 0.5268571428571428, |
| "grad_norm": 0.0683179497718811, |
| "kl": 3.898143768310547e-05, |
| "lambda_div_used": 0.5838751494884491, |
| "learning_rate": 1.1743223682775649e-07, |
| "loss": 0.0201, |
| "reward": -0.36440329253673553, |
| "reward_after_mean": -0.36440329253673553, |
| "reward_after_std": 0.44049660488963127, |
| "reward_before_mean": -0.10854836623184383, |
| "reward_before_std": 0.3863721750676632, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.25585492700338364, |
| "reward_change_min": -0.41783962957561016, |
| "reward_change_std": 0.15460436698049307, |
| "reward_std": 0.44049660861492157, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/cosine_scaled_reward": -0.19188169576227665, |
| "step": 461 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2874.3125076293945, |
| "epoch": 0.528, |
| "grad_norm": 0.07832667976617813, |
| "kl": 2.7470290660858154e-05, |
| "lambda_div_used": 0.5419389680027962, |
| "learning_rate": 1.1657684494105386e-07, |
| "loss": -0.0381, |
| "reward": -0.5123195722699165, |
| "reward_after_mean": -0.5123195722699165, |
| "reward_after_std": 0.2568210382014513, |
| "reward_before_mean": -0.2502432279288769, |
| "reward_before_std": 0.1842196974903345, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2620763499289751, |
| "reward_change_min": -0.4005816727876663, |
| "reward_change_std": 0.14368234388530254, |
| "reward_std": 0.2568210456520319, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/cosine_scaled_reward": -0.2502432242035866, |
| "step": 462 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2986.8333740234375, |
| "epoch": 0.5291428571428571, |
| "grad_norm": 0.053950026631355286, |
| "kl": 8.771196007728577e-06, |
| "lambda_div_used": 0.622698076069355, |
| "learning_rate": 1.1574257748745986e-07, |
| "loss": 0.0106, |
| "reward": -0.2041953857988119, |
| "reward_after_mean": -0.2041953857988119, |
| "reward_after_std": 0.6069091446697712, |
| "reward_before_mean": 0.05050618201494217, |
| "reward_before_std": 0.5684067364782095, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.25470156222581863, |
| "reward_change_min": -0.4568561315536499, |
| "reward_change_std": 0.16765617858618498, |
| "reward_std": 0.6069091446697712, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.09532715613022447, |
| "step": 463 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1749.0416717529297, |
| "epoch": 0.5302857142857142, |
| "grad_norm": 0.11516103893518448, |
| "kl": 3.466010093688965e-05, |
| "lambda_div_used": 0.5341480076313019, |
| "learning_rate": 1.1492947512799328e-07, |
| "loss": -0.0088, |
| "reward": -0.040389321744441986, |
| "reward_after_mean": -0.040389321744441986, |
| "reward_after_std": 0.37696985714137554, |
| "reward_before_mean": 0.5386080192402005, |
| "reward_before_std": 0.14989514648914337, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5789973307400942, |
| "reward_change_min": -0.7674467526376247, |
| "reward_change_std": 0.2917799688875675, |
| "reward_std": 0.37696986459195614, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/cosine_scaled_reward": 0.16360801365226507, |
| "step": 464 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3036.854179382324, |
| "epoch": 0.5314285714285715, |
| "grad_norm": 0.08283663541078568, |
| "kl": 2.4955719709396362e-05, |
| "lambda_div_used": 0.572467640042305, |
| "learning_rate": 1.1413757749211602e-07, |
| "loss": 0.0373, |
| "reward": -0.40219551732297987, |
| "reward_after_mean": -0.40219551732297987, |
| "reward_after_std": 0.40797613374888897, |
| "reward_before_mean": -0.1455877646803856, |
| "reward_before_std": 0.32708599977195263, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.256607748568058, |
| "reward_change_min": -0.37336331233382225, |
| "reward_change_std": 0.1377662243321538, |
| "reward_std": 0.40797614119946957, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.18725443072617054, |
| "step": 465 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3122.9583435058594, |
| "epoch": 0.5325714285714286, |
| "grad_norm": 0.07085831463336945, |
| "kl": 2.0582228899002075e-05, |
| "lambda_div_used": 0.5319224968552589, |
| "learning_rate": 1.1336692317580158e-07, |
| "loss": -0.0563, |
| "reward": -0.36368466913700104, |
| "reward_after_mean": -0.36368466913700104, |
| "reward_after_std": 0.27606455981731415, |
| "reward_before_mean": 0.005439521744847298, |
| "reward_before_std": 0.14014938473701477, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.36912417598068714, |
| "reward_change_min": -0.5061142668128014, |
| "reward_change_std": 0.1899967910721898, |
| "reward_std": 0.2760645691305399, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/cosine_scaled_reward": -0.11956048291176558, |
| "step": 466 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3113.2708854675293, |
| "epoch": 0.5337142857142857, |
| "grad_norm": 0.07966674119234085, |
| "kl": 3.802310675382614e-05, |
| "lambda_div_used": 0.5612561926245689, |
| "learning_rate": 1.1261754973965422e-07, |
| "loss": -0.0067, |
| "reward": -0.3137255348265171, |
| "reward_after_mean": -0.3137255348265171, |
| "reward_after_std": 0.39207486622035503, |
| "reward_before_mean": 0.03320633992552757, |
| "reward_before_std": 0.2764817178249359, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3469318952411413, |
| "reward_change_min": -0.4828169047832489, |
| "reward_change_std": 0.18068420328199863, |
| "reward_std": 0.3920748811215162, |
| "rewards/accuracy_reward": 0.14583333395421505, |
| "rewards/cosine_scaled_reward": -0.11262698657810688, |
| "step": 467 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2898.770854949951, |
| "epoch": 0.5348571428571428, |
| "grad_norm": 0.07577298581600189, |
| "kl": 4.957616329193115e-05, |
| "lambda_div_used": 0.5363663136959076, |
| "learning_rate": 1.1188949370707787e-07, |
| "loss": 0.0338, |
| "reward": -0.5352834053337574, |
| "reward_after_mean": -0.5352834053337574, |
| "reward_after_std": 0.22454985231161118, |
| "reward_before_mean": -0.2775777019560337, |
| "reward_before_std": 0.15978037798777223, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2577057033777237, |
| "reward_change_min": -0.3789172098040581, |
| "reward_change_std": 0.14186344109475613, |
| "reward_std": 0.22454985417425632, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/cosine_scaled_reward": -0.27757770381867886, |
| "step": 468 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2854.8750228881836, |
| "epoch": 0.536, |
| "grad_norm": 0.09182782471179962, |
| "kl": 2.0341947674751282e-05, |
| "lambda_div_used": 0.5550094842910767, |
| "learning_rate": 1.1118279056249653e-07, |
| "loss": 0.0301, |
| "reward": -0.35969678312540054, |
| "reward_after_mean": -0.35969678312540054, |
| "reward_after_std": 0.30523936823010445, |
| "reward_before_mean": -0.04356633126735687, |
| "reward_before_std": 0.2476841462776065, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.31613045185804367, |
| "reward_change_min": -0.45744797959923744, |
| "reward_change_std": 0.17962745483964682, |
| "reward_std": 0.3052393738180399, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.12689966335892677, |
| "step": 469 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3333.562530517578, |
| "epoch": 0.5371428571428571, |
| "grad_norm": 0.05610484257340431, |
| "kl": 2.9811635613441467e-05, |
| "lambda_div_used": 0.5911690816283226, |
| "learning_rate": 1.1049747474962444e-07, |
| "loss": 0.0307, |
| "reward": -0.37843912467360497, |
| "reward_after_mean": -0.37843912467360497, |
| "reward_after_std": 0.49228920973837376, |
| "reward_before_mean": -0.14763083557772916, |
| "reward_before_std": 0.4128831513226032, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.230808287858963, |
| "reward_change_min": -0.3457215949892998, |
| "reward_change_std": 0.12627399526536465, |
| "reward_std": 0.49228921718895435, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/cosine_scaled_reward": -0.21013083728030324, |
| "step": 470 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3032.312515258789, |
| "epoch": 0.5382857142857143, |
| "grad_norm": 0.053547170013189316, |
| "kl": 2.0138919353485107e-05, |
| "lambda_div_used": 0.5520770102739334, |
| "learning_rate": 1.0983357966978745e-07, |
| "loss": -0.0036, |
| "reward": -0.32707202807068825, |
| "reward_after_mean": -0.32707202807068825, |
| "reward_after_std": 0.30774626694619656, |
| "reward_before_mean": 0.006061417981982231, |
| "reward_before_std": 0.23228682670742273, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3331334535032511, |
| "reward_change_min": -0.47613633051514626, |
| "reward_change_std": 0.181988756172359, |
| "reward_std": 0.3077462762594223, |
| "rewards/accuracy_reward": 0.1041666716337204, |
| "rewards/cosine_scaled_reward": -0.09810524806380272, |
| "step": 471 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3115.8333435058594, |
| "epoch": 0.5394285714285715, |
| "grad_norm": 0.05183921754360199, |
| "kl": 1.757591962814331e-05, |
| "lambda_div_used": 0.6081600487232208, |
| "learning_rate": 1.0919113768029517e-07, |
| "loss": 0.0248, |
| "reward": -0.22974545462056994, |
| "reward_after_mean": -0.22974545462056994, |
| "reward_after_std": 0.5618578754365444, |
| "reward_before_mean": 0.03380160592496395, |
| "reward_before_std": 0.4960998175665736, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2635470647364855, |
| "reward_change_min": -0.4026007801294327, |
| "reward_change_std": 0.1525269951671362, |
| "reward_std": 0.5618579015135765, |
| "rewards/accuracy_reward": 0.14583333767950535, |
| "rewards/cosine_scaled_reward": -0.1120317269815132, |
| "step": 472 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3389.7291870117188, |
| "epoch": 0.5405714285714286, |
| "grad_norm": 0.05399390682578087, |
| "kl": 2.3730099201202393e-05, |
| "lambda_div_used": 0.6023471653461456, |
| "learning_rate": 1.0857018009286381e-07, |
| "loss": -0.0081, |
| "reward": -0.16527403378859162, |
| "reward_after_mean": -0.16527403378859162, |
| "reward_after_std": 0.4859522972255945, |
| "reward_before_mean": 0.15660450607538223, |
| "reward_before_std": 0.469901567324996, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3218785412609577, |
| "reward_change_min": -0.5562809407711029, |
| "reward_change_std": 0.21282578073441982, |
| "reward_std": 0.48595230281352997, |
| "rewards/accuracy_reward": 0.2083333395421505, |
| "rewards/cosine_scaled_reward": -0.051728841848671436, |
| "step": 473 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2609.3541946411133, |
| "epoch": 0.5417142857142857, |
| "grad_norm": 0.07730484753847122, |
| "kl": 1.3811513781547546e-05, |
| "lambda_div_used": 0.5951687097549438, |
| "learning_rate": 1.0797073717209013e-07, |
| "loss": 0.085, |
| "reward": 0.0976928174495697, |
| "reward_after_mean": 0.0976928174495697, |
| "reward_after_std": 0.5153173375874758, |
| "reward_before_mean": 0.5706379320472479, |
| "reward_before_std": 0.4373670984059572, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.47294510155916214, |
| "reward_change_min": -0.6950589865446091, |
| "reward_change_std": 0.27640796452760696, |
| "reward_std": 0.515317365527153, |
| "rewards/accuracy_reward": 0.3958333432674408, |
| "rewards/cosine_scaled_reward": 0.1748045664280653, |
| "step": 474 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2361.020851135254, |
| "epoch": 0.5428571428571428, |
| "grad_norm": 0.08214546740055084, |
| "kl": 9.525567293167114e-06, |
| "lambda_div_used": 0.6207183972001076, |
| "learning_rate": 1.0739283813397639e-07, |
| "loss": 0.0204, |
| "reward": 0.0007177861407399178, |
| "reward_after_mean": 0.0007177861407399178, |
| "reward_after_std": 0.5817896965891123, |
| "reward_before_mean": 0.34289272502064705, |
| "reward_before_std": 0.5596933793276548, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3421749472618103, |
| "reward_change_min": -0.5431834943592548, |
| "reward_change_std": 0.21874003671109676, |
| "reward_std": 0.581789730116725, |
| "rewards/accuracy_reward": 0.29166667722165585, |
| "rewards/cosine_scaled_reward": 0.05122605012729764, |
| "step": 475 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2583.8333740234375, |
| "epoch": 0.544, |
| "grad_norm": 0.08052952587604523, |
| "kl": 3.0007213354110718e-05, |
| "lambda_div_used": 0.6784488782286644, |
| "learning_rate": 1.068365111445064e-07, |
| "loss": 0.0154, |
| "reward": -0.031517775263637304, |
| "reward_after_mean": -0.031517775263637304, |
| "reward_after_std": 0.8641770519316196, |
| "reward_before_mean": 0.2038349723443389, |
| "reward_before_std": 0.8286011293530464, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.23535274900496006, |
| "reward_change_min": -0.41804002970457077, |
| "reward_change_std": 0.16118939872831106, |
| "reward_std": 0.8641770891845226, |
| "rewards/accuracy_reward": 0.22916666977107525, |
| "rewards/cosine_scaled_reward": -0.025331700686365366, |
| "step": 476 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2308.291732788086, |
| "epoch": 0.5451428571428572, |
| "grad_norm": 0.06839341670274734, |
| "kl": 8.73953104019165e-06, |
| "lambda_div_used": 0.6096622571349144, |
| "learning_rate": 1.063017833182728e-07, |
| "loss": 0.1111, |
| "reward": 0.19456686079502106, |
| "reward_after_mean": 0.19456686079502106, |
| "reward_after_std": 0.5847878716886044, |
| "reward_before_mean": 0.678333050571382, |
| "reward_before_std": 0.5006472393870354, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.48376619443297386, |
| "reward_change_min": -0.6835142783820629, |
| "reward_change_std": 0.2774948738515377, |
| "reward_std": 0.5847878959029913, |
| "rewards/accuracy_reward": 0.45833334885537624, |
| "rewards/cosine_scaled_reward": 0.21999971382319927, |
| "step": 477 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3169.3125610351562, |
| "epoch": 0.5462857142857143, |
| "grad_norm": 0.052864328026771545, |
| "kl": 6.990041583776474e-06, |
| "lambda_div_used": 0.5758631080389023, |
| "learning_rate": 1.0578868071715544e-07, |
| "loss": 0.0349, |
| "reward": -0.23655124567449093, |
| "reward_after_mean": -0.23655124567449093, |
| "reward_after_std": 0.4532997701317072, |
| "reward_before_mean": 0.11144567281007767, |
| "reward_before_std": 0.3459607223048806, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3479969333857298, |
| "reward_change_min": -0.5187661461532116, |
| "reward_change_std": 0.19546091184020042, |
| "reward_std": 0.4532997887581587, |
| "rewards/accuracy_reward": 0.20833333395421505, |
| "rewards/cosine_scaled_reward": -0.09688764810562134, |
| "step": 478 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2825.916702270508, |
| "epoch": 0.5474285714285714, |
| "grad_norm": 0.08355361968278885, |
| "kl": 3.739655949175358e-05, |
| "lambda_div_used": 0.5902442187070847, |
| "learning_rate": 1.0529722834905125e-07, |
| "loss": 0.033, |
| "reward": -0.37585191056132317, |
| "reward_after_mean": -0.37585191056132317, |
| "reward_after_std": 0.45466959848999977, |
| "reward_before_mean": -0.12638765759766102, |
| "reward_before_std": 0.4168459586799145, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2494642548263073, |
| "reward_change_min": -0.4229847304522991, |
| "reward_change_std": 0.15689124166965485, |
| "reward_std": 0.4546696189790964, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/cosine_scaled_reward": -0.18888765573501587, |
| "step": 479 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2415.895866394043, |
| "epoch": 0.5485714285714286, |
| "grad_norm": 0.08123134076595306, |
| "kl": 1.4044344425201416e-05, |
| "lambda_div_used": 0.5890126600861549, |
| "learning_rate": 1.0482745016665526e-07, |
| "loss": 0.0739, |
| "reward": -0.36289478465914726, |
| "reward_after_mean": -0.36289478465914726, |
| "reward_after_std": 0.4815117195248604, |
| "reward_before_mean": -0.11086839716881514, |
| "reward_before_std": 0.4026770405471325, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.25202639400959015, |
| "reward_change_min": -0.37619682028889656, |
| "reward_change_std": 0.13623447716236115, |
| "reward_std": 0.4815117232501507, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/cosine_scaled_reward": -0.17336839414201677, |
| "step": 480 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3057.625030517578, |
| "epoch": 0.5497142857142857, |
| "grad_norm": 0.05632660165429115, |
| "kl": 2.8233975172042847e-05, |
| "lambda_div_used": 0.5841230228543282, |
| "learning_rate": 1.0437936906629334e-07, |
| "loss": 0.0198, |
| "reward": -0.386552257463336, |
| "reward_after_mean": -0.386552257463336, |
| "reward_after_std": 0.4468122199177742, |
| "reward_before_mean": -0.14256569184362888, |
| "reward_before_std": 0.38717135414481163, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.24398657120764256, |
| "reward_change_min": -0.42186837270855904, |
| "reward_change_std": 0.14736179821193218, |
| "reward_std": 0.4468122236430645, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/cosine_scaled_reward": -0.20506569370627403, |
| "step": 481 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2824.5833587646484, |
| "epoch": 0.5508571428571428, |
| "grad_norm": 0.0865793228149414, |
| "kl": 2.4417764507234097e-05, |
| "lambda_div_used": 0.622517004609108, |
| "learning_rate": 1.0395300688680625e-07, |
| "loss": -0.0313, |
| "reward": -0.05876433290541172, |
| "reward_after_mean": -0.05876433290541172, |
| "reward_after_std": 0.5989817604422569, |
| "reward_before_mean": 0.259432727470994, |
| "reward_before_std": 0.5737235806882381, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.31819707341492176, |
| "reward_change_min": -0.5269695781171322, |
| "reward_change_std": 0.21138717606663704, |
| "reward_std": 0.5989817790687084, |
| "rewards/accuracy_reward": 0.22916667349636555, |
| "rewards/cosine_scaled_reward": 0.030266055837273598, |
| "step": 482 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2869.250015258789, |
| "epoch": 0.552, |
| "grad_norm": 0.08511485904455185, |
| "kl": 3.474205732345581e-05, |
| "lambda_div_used": 0.6013401597738266, |
| "learning_rate": 1.0354838440848501e-07, |
| "loss": -0.0837, |
| "reward": -0.2791392467916012, |
| "reward_after_mean": -0.2791392467916012, |
| "reward_after_std": 0.5272991992533207, |
| "reward_before_mean": -0.02371177263557911, |
| "reward_before_std": 0.4689871799200773, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2554274704307318, |
| "reward_change_min": -0.40494704991579056, |
| "reward_change_std": 0.15485912468284369, |
| "reward_std": 0.5272992141544819, |
| "rewards/accuracy_reward": 0.1041666679084301, |
| "rewards/cosine_scaled_reward": -0.1278784405440092, |
| "step": 483 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2591.7500228881836, |
| "epoch": 0.5531428571428572, |
| "grad_norm": 0.08745326101779938, |
| "kl": 3.6368146538734436e-07, |
| "lambda_div_used": 0.6466837078332901, |
| "learning_rate": 1.0316552135205837e-07, |
| "loss": 0.0082, |
| "reward": -0.04187892563641071, |
| "reward_after_mean": -0.04187892563641071, |
| "reward_after_std": 0.6843612100929022, |
| "reward_before_mean": 0.242530676885508, |
| "reward_before_std": 0.6844684220850468, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.284409599378705, |
| "reward_change_min": -0.49204646795988083, |
| "reward_change_std": 0.1993796620517969, |
| "reward_std": 0.6843612212687731, |
| "rewards/accuracy_reward": 0.2708333395421505, |
| "rewards/cosine_scaled_reward": -0.0283026653341949, |
| "step": 484 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2046.2500381469727, |
| "epoch": 0.5542857142857143, |
| "grad_norm": 0.09076947718858719, |
| "kl": 2.752244472503662e-05, |
| "lambda_div_used": 0.5719796344637871, |
| "learning_rate": 1.0280443637773163e-07, |
| "loss": 0.0035, |
| "reward": -0.40644849208183587, |
| "reward_after_mean": -0.40644849208183587, |
| "reward_after_std": 0.398155614733696, |
| "reward_before_mean": -0.14403727410535794, |
| "reward_before_std": 0.3282261691056192, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2624112106859684, |
| "reward_change_min": -0.43003450334072113, |
| "reward_change_std": 0.15388297475874424, |
| "reward_std": 0.39815562404692173, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/cosine_scaled_reward": -0.20653727487660944, |
| "step": 485 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1967.7916946411133, |
| "epoch": 0.5554285714285714, |
| "grad_norm": 0.09958811849355698, |
| "kl": 3.91155481338501e-05, |
| "lambda_div_used": 0.5703976079821587, |
| "learning_rate": 1.0246514708427701e-07, |
| "loss": 0.0966, |
| "reward": -0.26609380822628736, |
| "reward_after_mean": -0.26609380822628736, |
| "reward_after_std": 0.4386676363646984, |
| "reward_before_mean": 0.07946969009935856, |
| "reward_before_std": 0.31633214373141527, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.34556350111961365, |
| "reward_change_min": -0.48576217144727707, |
| "reward_change_std": 0.1828850321471691, |
| "reward_std": 0.4386676475405693, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.08719698037020862, |
| "step": 486 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1967.0625228881836, |
| "epoch": 0.5565714285714286, |
| "grad_norm": 0.10718576610088348, |
| "kl": 3.7085264921188354e-06, |
| "lambda_div_used": 0.6087248548865318, |
| "learning_rate": 1.0214767000817596e-07, |
| "loss": 0.0461, |
| "reward": 0.08780635055154562, |
| "reward_after_mean": 0.08780635055154562, |
| "reward_after_std": 0.6550032701343298, |
| "reward_before_mean": 0.5625860100844875, |
| "reward_before_std": 0.5011934200301766, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.47477963753044605, |
| "reward_change_min": -0.7034505233168602, |
| "reward_change_std": 0.26853484753519297, |
| "reward_std": 0.6550032943487167, |
| "rewards/accuracy_reward": 0.3958333358168602, |
| "rewards/cosine_scaled_reward": 0.16675265738740563, |
| "step": 487 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2356.8125228881836, |
| "epoch": 0.5577142857142857, |
| "grad_norm": 0.08583448082208633, |
| "kl": 2.1670013666152954e-05, |
| "lambda_div_used": 0.5598724335432053, |
| "learning_rate": 1.0185202062281336e-07, |
| "loss": -0.0446, |
| "reward": -0.39712974801659584, |
| "reward_after_mean": -0.39712974801659584, |
| "reward_after_std": 0.3282298669219017, |
| "reward_before_mean": -0.11880321707576513, |
| "reward_before_std": 0.2731999019160867, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27832652255892754, |
| "reward_change_min": -0.42709672823548317, |
| "reward_change_std": 0.16301770228892565, |
| "reward_std": 0.3282298743724823, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.20213654916733503, |
| "step": 488 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3039.2916870117188, |
| "epoch": 0.5588571428571428, |
| "grad_norm": 0.08493036031723022, |
| "kl": 4.955753684043884e-05, |
| "lambda_div_used": 0.5549413114786148, |
| "learning_rate": 1.0157821333772304e-07, |
| "loss": -0.0127, |
| "reward": -0.4395306259393692, |
| "reward_after_mean": -0.4395306259393692, |
| "reward_after_std": 0.30519232526421547, |
| "reward_before_mean": -0.16155868768692017, |
| "reward_before_std": 0.24803727120161057, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27797193080186844, |
| "reward_change_min": -0.44194458797574043, |
| "reward_change_std": 0.1623495165258646, |
| "reward_std": 0.3051923308521509, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/cosine_scaled_reward": -0.22405867651104927, |
| "step": 489 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2838.0833892822266, |
| "epoch": 0.56, |
| "grad_norm": 0.06069381162524223, |
| "kl": 1.2032687664031982e-05, |
| "lambda_div_used": 0.583185076713562, |
| "learning_rate": 1.013262614978859e-07, |
| "loss": 0.0139, |
| "reward": -0.1338807214051485, |
| "reward_after_mean": -0.1338807214051485, |
| "reward_after_std": 0.4649778436869383, |
| "reward_before_mean": 0.25663699954748154, |
| "reward_before_std": 0.3789667785167694, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.39051773957908154, |
| "reward_change_min": -0.6341840960085392, |
| "reward_change_std": 0.2300328817218542, |
| "reward_std": 0.46497784554958344, |
| "rewards/accuracy_reward": 0.2708333395421505, |
| "rewards/cosine_scaled_reward": -0.014196328818798065, |
| "step": 490 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2593.7291870117188, |
| "epoch": 0.5611428571428572, |
| "grad_norm": 0.07340901345014572, |
| "kl": 3.531062975525856e-05, |
| "lambda_div_used": 0.5965912714600563, |
| "learning_rate": 1.0109617738307911e-07, |
| "loss": 0.0268, |
| "reward": 0.01746355928480625, |
| "reward_after_mean": 0.01746355928480625, |
| "reward_after_std": 0.5940692499279976, |
| "reward_before_mean": 0.47233812790364027, |
| "reward_before_std": 0.4421461224555969, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4548745583742857, |
| "reward_change_min": -0.6400704979896545, |
| "reward_change_std": 0.24487750884145498, |
| "reward_std": 0.5940692741423845, |
| "rewards/accuracy_reward": 0.3541666679084301, |
| "rewards/cosine_scaled_reward": 0.11817142926156521, |
| "step": 491 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2583.5416679382324, |
| "epoch": 0.5622857142857143, |
| "grad_norm": 0.08282249420881271, |
| "kl": 2.4866312742233276e-05, |
| "lambda_div_used": 0.5367531701922417, |
| "learning_rate": 1.0088797220727779e-07, |
| "loss": 0.0376, |
| "reward": -0.3908469006419182, |
| "reward_after_mean": -0.3908469006419182, |
| "reward_after_std": 0.27869052439928055, |
| "reward_before_mean": -0.03699151985347271, |
| "reward_before_std": 0.16138391755521297, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3538553789258003, |
| "reward_change_min": -0.5079905577003956, |
| "reward_change_std": 0.18725439626723528, |
| "reward_std": 0.27869053184986115, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/cosine_scaled_reward": -0.1619915273040533, |
| "step": 492 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2176.8333740234375, |
| "epoch": 0.5634285714285714, |
| "grad_norm": 0.09936831891536713, |
| "kl": 6.021931767463684e-06, |
| "lambda_div_used": 0.596942737698555, |
| "learning_rate": 1.0070165611810855e-07, |
| "loss": 0.0246, |
| "reward": -0.037120603024959564, |
| "reward_after_mean": -0.037120603024959564, |
| "reward_after_std": 0.46812310442328453, |
| "reward_before_mean": 0.34137603268027306, |
| "reward_before_std": 0.44823691714555025, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3784966245293617, |
| "reward_change_min": -0.5661123469471931, |
| "reward_change_std": 0.234741548076272, |
| "reward_std": 0.4681231305003166, |
| "rewards/accuracy_reward": 0.2708333432674408, |
| "rewards/cosine_scaled_reward": 0.07054269965738058, |
| "step": 493 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2352.583427429199, |
| "epoch": 0.5645714285714286, |
| "grad_norm": 0.07574566453695297, |
| "kl": 1.6648322343826294e-05, |
| "lambda_div_used": 0.6796721071004868, |
| "learning_rate": 1.005372381963547e-07, |
| "loss": 0.092, |
| "reward": 0.10963849350810051, |
| "reward_after_mean": 0.10963849350810051, |
| "reward_after_std": 0.8239130303263664, |
| "reward_before_mean": 0.4074633736163378, |
| "reward_before_std": 0.848267612978816, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2978248745203018, |
| "reward_change_min": -0.6511921100318432, |
| "reward_change_std": 0.2445714958012104, |
| "reward_std": 0.8239130582660437, |
| "rewards/accuracy_reward": 0.3125000037252903, |
| "rewards/cosine_scaled_reward": 0.09496336756274104, |
| "step": 494 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3203.8959045410156, |
| "epoch": 0.5657142857142857, |
| "grad_norm": 0.057527996599674225, |
| "kl": 2.3846514523029327e-05, |
| "lambda_div_used": 0.5994853675365448, |
| "learning_rate": 1.0039472645551372e-07, |
| "loss": 0.0637, |
| "reward": -0.21300244145095348, |
| "reward_after_mean": -0.21300244145095348, |
| "reward_after_std": 0.48377062380313873, |
| "reward_before_mean": 0.08397414721548557, |
| "reward_before_std": 0.4531490486115217, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.29697658494114876, |
| "reward_change_min": -0.49157723411917686, |
| "reward_change_std": 0.1875515878200531, |
| "reward_std": 0.4837706368416548, |
| "rewards/accuracy_reward": 0.16666667349636555, |
| "rewards/cosine_scaled_reward": -0.08269252139143646, |
| "step": 495 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1895.9375457763672, |
| "epoch": 0.5668571428571428, |
| "grad_norm": 0.10791665315628052, |
| "kl": 1.890142448246479e-05, |
| "lambda_div_used": 0.6198414713144302, |
| "learning_rate": 1.002741278414069e-07, |
| "loss": 0.0183, |
| "reward": 0.09977892541792244, |
| "reward_after_mean": 0.09977892541792244, |
| "reward_after_std": 0.6343465056270361, |
| "reward_before_mean": 0.5255357641726732, |
| "reward_before_std": 0.555769513361156, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4257568307220936, |
| "reward_change_min": -0.6588771641254425, |
| "reward_change_std": 0.25742017664015293, |
| "reward_std": 0.634346516802907, |
| "rewards/accuracy_reward": 0.35416667349636555, |
| "rewards/cosine_scaled_reward": 0.17136909160763025, |
| "step": 496 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2577.645851135254, |
| "epoch": 0.568, |
| "grad_norm": 0.06887742131948471, |
| "kl": 2.493336796760559e-05, |
| "lambda_div_used": 0.6232517957687378, |
| "learning_rate": 1.0017544823184055e-07, |
| "loss": 0.0143, |
| "reward": 0.039356768131256104, |
| "reward_after_mean": 0.039356768131256104, |
| "reward_after_std": 0.5658366903662682, |
| "reward_before_mean": 0.401943476870656, |
| "reward_before_std": 0.5692849718034267, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.362586697563529, |
| "reward_change_min": -0.6212925836443901, |
| "reward_change_std": 0.24119753576815128, |
| "reward_std": 0.5658366959542036, |
| "rewards/accuracy_reward": 0.3333333469927311, |
| "rewards/cosine_scaled_reward": 0.06861014291644096, |
| "step": 497 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2857.2916946411133, |
| "epoch": 0.5691428571428572, |
| "grad_norm": 0.06479962170124054, |
| "kl": 9.963754564523697e-06, |
| "lambda_div_used": 0.6236685812473297, |
| "learning_rate": 1.0009869243631952e-07, |
| "loss": 0.0008, |
| "reward": -0.18094617873430252, |
| "reward_after_mean": -0.18094617873430252, |
| "reward_after_std": 0.6155845355242491, |
| "reward_before_mean": 0.07468715589493513, |
| "reward_before_std": 0.5768492119386792, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2556333262473345, |
| "reward_change_min": -0.4431779384613037, |
| "reward_change_std": 0.16846577636897564, |
| "reward_std": 0.6155845616012812, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.0919795110821724, |
| "step": 498 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2707.791702270508, |
| "epoch": 0.5702857142857143, |
| "grad_norm": 0.06317302584648132, |
| "kl": 1.5532365068793297e-05, |
| "lambda_div_used": 0.6250721588730812, |
| "learning_rate": 1.000438641958131e-07, |
| "loss": -0.0298, |
| "reward": -0.12677906453609467, |
| "reward_after_mean": -0.12677906453609467, |
| "reward_after_std": 0.6044176463037729, |
| "reward_before_mean": 0.1698420336470008, |
| "reward_before_std": 0.5829241154715419, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2966211009770632, |
| "reward_change_min": -0.5905468240380287, |
| "reward_change_std": 0.2108100038021803, |
| "reward_std": 0.6044176481664181, |
| "rewards/accuracy_reward": 0.2083333358168602, |
| "rewards/cosine_scaled_reward": -0.03849130589514971, |
| "step": 499 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2980.9167098999023, |
| "epoch": 0.5714285714285714, |
| "grad_norm": 0.0621151477098465, |
| "kl": 1.033581793308258e-05, |
| "lambda_div_used": 0.6023986041545868, |
| "learning_rate": 1.0001096618257236e-07, |
| "loss": 0.0314, |
| "reward": -0.26747864981007297, |
| "reward_after_mean": -0.26747864981007297, |
| "reward_after_std": 0.5251844674348831, |
| "reward_before_mean": -0.0025623496621847153, |
| "reward_before_std": 0.4696931503713131, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.26491627655923367, |
| "reward_change_min": -0.43205036222934723, |
| "reward_change_std": 0.15884541906416416, |
| "reward_std": 0.5251844730228186, |
| "rewards/accuracy_reward": 0.1041666679084301, |
| "rewards/cosine_scaled_reward": -0.10672901570796967, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.5714285714285714, |
| "step": 500, |
| "total_flos": 0.0, |
| "train_loss": 0.005314271912367986, |
| "train_runtime": 106074.3369, |
| "train_samples_per_second": 0.226, |
| "train_steps_per_second": 0.005 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 500, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 10, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 6, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|