| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0833333333333335, |
| "eval_steps": 500, |
| "global_step": 1000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 232.375, |
| "epoch": 0.0020833333333333333, |
| "grad_norm": 12.936853408813477, |
| "kl": 0.001018524169921875, |
| "learning_rate": 9.993055555555556e-07, |
| "loss": 0.0, |
| "reward": 0.21875, |
| "reward_std": 0.3930980935692787, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.15625, |
| "step": 1 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 301.8125, |
| "epoch": 0.004166666666666667, |
| "grad_norm": 15.255448341369629, |
| "kl": 0.004360198974609375, |
| "learning_rate": 9.98611111111111e-07, |
| "loss": 0.0, |
| "reward": 0.34375, |
| "reward_std": 0.43536408245563507, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.3125, |
| "step": 2 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 377.75, |
| "epoch": 0.00625, |
| "grad_norm": 16.691909790039062, |
| "kl": 0.0025482177734375, |
| "learning_rate": 9.979166666666667e-07, |
| "loss": 0.0, |
| "reward": 0.6875, |
| "reward_std": 0.5512787848711014, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.625, |
| "step": 3 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 324.28125, |
| "epoch": 0.008333333333333333, |
| "grad_norm": 21.816268920898438, |
| "kl": 0.08805465698242188, |
| "learning_rate": 9.972222222222222e-07, |
| "loss": 0.0001, |
| "reward": 0.53125, |
| "reward_std": 0.564938560128212, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.5, |
| "step": 4 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 414.09375, |
| "epoch": 0.010416666666666666, |
| "grad_norm": 10.746119499206543, |
| "kl": 0.003200531005859375, |
| "learning_rate": 9.965277777777778e-07, |
| "loss": 0.0, |
| "reward": 0.5625, |
| "reward_std": 0.49022960662841797, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.5625, |
| "step": 5 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 443.3125, |
| "epoch": 0.0125, |
| "grad_norm": 3.8227508068084717, |
| "kl": 0.0056610107421875, |
| "learning_rate": 9.958333333333333e-07, |
| "loss": 0.0, |
| "reward": 0.875, |
| "reward_std": 0.49796397238969803, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.8125, |
| "step": 6 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 457.0625, |
| "epoch": 0.014583333333333334, |
| "grad_norm": 3.433580160140991, |
| "kl": 0.00739288330078125, |
| "learning_rate": 9.95138888888889e-07, |
| "loss": 0.0, |
| "reward": 0.6875, |
| "reward_std": 0.49721167981624603, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.625, |
| "step": 7 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 445.5625, |
| "epoch": 0.016666666666666666, |
| "grad_norm": 3.7514586448669434, |
| "kl": 0.00689697265625, |
| "learning_rate": 9.944444444444444e-07, |
| "loss": 0.0, |
| "reward": 0.71875, |
| "reward_std": 0.4397946000099182, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.6875, |
| "step": 8 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 446.125, |
| "epoch": 0.01875, |
| "grad_norm": 1.7038488388061523, |
| "kl": 0.00711822509765625, |
| "learning_rate": 9.9375e-07, |
| "loss": 0.0, |
| "reward": 0.78125, |
| "reward_std": 0.3061639815568924, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.75, |
| "step": 9 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 436.21875, |
| "epoch": 0.020833333333333332, |
| "grad_norm": 2.8775792121887207, |
| "kl": 0.00970458984375, |
| "learning_rate": 9.930555555555555e-07, |
| "loss": 0.0, |
| "reward": 0.875, |
| "reward_std": 0.4671337679028511, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.78125, |
| "step": 10 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 445.90625, |
| "epoch": 0.022916666666666665, |
| "grad_norm": 2.9945576190948486, |
| "kl": 0.00738525390625, |
| "learning_rate": 9.923611111111111e-07, |
| "loss": 0.0, |
| "reward": 0.9375, |
| "reward_std": 0.5281829461455345, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.8125, |
| "step": 11 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 451.375, |
| "epoch": 0.025, |
| "grad_norm": 2.374281883239746, |
| "kl": 0.0085296630859375, |
| "learning_rate": 9.916666666666666e-07, |
| "loss": 0.0, |
| "reward": 1.09375, |
| "reward_std": 0.2630179077386856, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.9375, |
| "step": 12 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 402.34375, |
| "epoch": 0.027083333333333334, |
| "grad_norm": 2.477187395095825, |
| "kl": 0.0110626220703125, |
| "learning_rate": 9.909722222222222e-07, |
| "loss": 0.0, |
| "reward": 1.03125, |
| "reward_std": 0.48461921513080597, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.8125, |
| "step": 13 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 417.0625, |
| "epoch": 0.029166666666666667, |
| "grad_norm": 8.733186721801758, |
| "kl": 0.0125274658203125, |
| "learning_rate": 9.902777777777779e-07, |
| "loss": 0.0, |
| "reward": 0.875, |
| "reward_std": 0.408231720328331, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.78125, |
| "step": 14 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 439.78125, |
| "epoch": 0.03125, |
| "grad_norm": 3.690876007080078, |
| "kl": 0.0147705078125, |
| "learning_rate": 9.895833333333333e-07, |
| "loss": 0.0, |
| "reward": 1.03125, |
| "reward_std": 0.5986681878566742, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/format_reward": 0.71875, |
| "step": 15 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 478.28125, |
| "epoch": 0.03333333333333333, |
| "grad_norm": 3.3308470249176025, |
| "kl": 0.01483154296875, |
| "learning_rate": 9.88888888888889e-07, |
| "loss": 0.0, |
| "reward": 0.75, |
| "reward_std": 0.3535533845424652, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.71875, |
| "step": 16 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 411.1875, |
| "epoch": 0.035416666666666666, |
| "grad_norm": 4.208990573883057, |
| "kl": 0.016632080078125, |
| "learning_rate": 9.881944444444444e-07, |
| "loss": 0.0, |
| "reward": 0.46875, |
| "reward_std": 0.3377464786171913, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.40625, |
| "step": 17 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 488.0625, |
| "epoch": 0.0375, |
| "grad_norm": 3.331338405609131, |
| "kl": 0.01483154296875, |
| "learning_rate": 9.875e-07, |
| "loss": 0.0, |
| "reward": 1.21875, |
| "reward_std": 0.5347195863723755, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.8125, |
| "step": 18 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 405.0, |
| "epoch": 0.03958333333333333, |
| "grad_norm": 1.6825522184371948, |
| "kl": 0.017303466796875, |
| "learning_rate": 9.868055555555555e-07, |
| "loss": 0.0, |
| "reward": 0.625, |
| "reward_std": 0.2314550280570984, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.5625, |
| "step": 19 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 398.375, |
| "epoch": 0.041666666666666664, |
| "grad_norm": 4.040245532989502, |
| "kl": 0.0188140869140625, |
| "learning_rate": 9.861111111111112e-07, |
| "loss": 0.0, |
| "reward": 1.0, |
| "reward_std": 0.44403792917728424, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.84375, |
| "step": 20 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 442.28125, |
| "epoch": 0.04375, |
| "grad_norm": 3.547840118408203, |
| "kl": 0.0157470703125, |
| "learning_rate": 9.854166666666666e-07, |
| "loss": 0.0, |
| "reward": 1.40625, |
| "reward_std": 0.4218914955854416, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 1.0, |
| "step": 21 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 444.90625, |
| "epoch": 0.04583333333333333, |
| "grad_norm": 3.7362163066864014, |
| "kl": 0.0159454345703125, |
| "learning_rate": 9.847222222222223e-07, |
| "loss": 0.0, |
| "reward": 1.15625, |
| "reward_std": 0.3061639815568924, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 1.0, |
| "step": 22 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 410.5, |
| "epoch": 0.04791666666666667, |
| "grad_norm": 3.4020462036132812, |
| "kl": 0.01849365234375, |
| "learning_rate": 9.840277777777777e-07, |
| "loss": 0.0, |
| "reward": 0.6875, |
| "reward_std": 0.408231720328331, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.59375, |
| "step": 23 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 475.625, |
| "epoch": 0.05, |
| "grad_norm": 4.43328332901001, |
| "kl": 0.0170135498046875, |
| "learning_rate": 9.833333333333332e-07, |
| "loss": 0.0, |
| "reward": 1.125, |
| "reward_std": 0.3514062538743019, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.96875, |
| "step": 24 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 436.0, |
| "epoch": 0.052083333333333336, |
| "grad_norm": 2.972919225692749, |
| "kl": 0.018707275390625, |
| "learning_rate": 9.826388888888888e-07, |
| "loss": 0.0, |
| "reward": 1.3125, |
| "reward_std": 0.4671337679028511, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.96875, |
| "step": 25 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 410.0625, |
| "epoch": 0.05416666666666667, |
| "grad_norm": 6.743931293487549, |
| "kl": 0.0195770263671875, |
| "learning_rate": 9.819444444444443e-07, |
| "loss": 0.0, |
| "reward": 1.15625, |
| "reward_std": 0.4807935431599617, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/format_reward": 0.78125, |
| "step": 26 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 445.90625, |
| "epoch": 0.05625, |
| "grad_norm": 9.631500244140625, |
| "kl": 0.018890380859375, |
| "learning_rate": 9.8125e-07, |
| "loss": 0.0, |
| "reward": 0.875, |
| "reward_std": 0.3514062538743019, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.75, |
| "step": 27 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 411.375, |
| "epoch": 0.058333333333333334, |
| "grad_norm": 4.5614447593688965, |
| "kl": 0.019439697265625, |
| "learning_rate": 9.805555555555554e-07, |
| "loss": 0.0, |
| "reward": 1.28125, |
| "reward_std": 0.3471629247069359, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.9375, |
| "step": 28 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 336.09375, |
| "epoch": 0.06041666666666667, |
| "grad_norm": 16.96288299560547, |
| "kl": 0.034393310546875, |
| "learning_rate": 9.79861111111111e-07, |
| "loss": 0.0, |
| "reward": 0.3125, |
| "reward_std": 0.408231720328331, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.28125, |
| "step": 29 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 346.71875, |
| "epoch": 0.0625, |
| "grad_norm": 3.2910525798797607, |
| "kl": 0.025390625, |
| "learning_rate": 9.791666666666667e-07, |
| "loss": 0.0, |
| "reward": 1.375, |
| "reward_std": 0.5597654432058334, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.9375, |
| "step": 30 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 401.65625, |
| "epoch": 0.06458333333333334, |
| "grad_norm": 5.341634750366211, |
| "kl": 0.02484130859375, |
| "learning_rate": 9.784722222222221e-07, |
| "loss": 0.0, |
| "reward": 0.96875, |
| "reward_std": 0.4218914955854416, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.78125, |
| "step": 31 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 390.09375, |
| "epoch": 0.06666666666666667, |
| "grad_norm": 3.29443621635437, |
| "kl": 0.0223388671875, |
| "learning_rate": 9.777777777777778e-07, |
| "loss": 0.0, |
| "reward": 0.71875, |
| "reward_std": 0.2630179077386856, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.5, |
| "step": 32 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 395.46875, |
| "epoch": 0.06875, |
| "grad_norm": 5.408890247344971, |
| "kl": 0.02655029296875, |
| "learning_rate": 9.770833333333332e-07, |
| "loss": 0.0, |
| "reward": 0.625, |
| "reward_std": 0.3514062538743019, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.53125, |
| "step": 33 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 396.90625, |
| "epoch": 0.07083333333333333, |
| "grad_norm": 5.162864685058594, |
| "kl": 0.024810791015625, |
| "learning_rate": 9.763888888888889e-07, |
| "loss": 0.0, |
| "reward": 0.8125, |
| "reward_std": 0.5468482673168182, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.59375, |
| "step": 34 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 378.0625, |
| "epoch": 0.07291666666666667, |
| "grad_norm": 2.051487684249878, |
| "kl": 0.0206298828125, |
| "learning_rate": 9.756944444444443e-07, |
| "loss": 0.0, |
| "reward": 0.9375, |
| "reward_std": 0.2896047830581665, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.71875, |
| "step": 35 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 352.53125, |
| "epoch": 0.075, |
| "grad_norm": 2.547386407852173, |
| "kl": 0.025421142578125, |
| "learning_rate": 9.75e-07, |
| "loss": 0.0, |
| "reward": 0.375, |
| "reward_std": 0.27439429610967636, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.25, |
| "step": 36 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 343.65625, |
| "epoch": 0.07708333333333334, |
| "grad_norm": 2.0971977710723877, |
| "kl": 0.02703857421875, |
| "learning_rate": 9.743055555555554e-07, |
| "loss": 0.0, |
| "reward": 0.28125, |
| "reward_std": 0.22201896458864212, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.25, |
| "step": 37 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 330.09375, |
| "epoch": 0.07916666666666666, |
| "grad_norm": 3.524477005004883, |
| "kl": 0.023406982421875, |
| "learning_rate": 9.73611111111111e-07, |
| "loss": 0.0, |
| "reward": 0.9375, |
| "reward_std": 0.4355512708425522, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.8125, |
| "step": 38 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 369.3125, |
| "epoch": 0.08125, |
| "grad_norm": 1.2645416259765625, |
| "kl": 0.024261474609375, |
| "learning_rate": 9.729166666666665e-07, |
| "loss": 0.0, |
| "reward": 1.0625, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 1.0, |
| "step": 39 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 347.3125, |
| "epoch": 0.08333333333333333, |
| "grad_norm": 5.557051658630371, |
| "kl": 0.029052734375, |
| "learning_rate": 9.722222222222222e-07, |
| "loss": 0.0, |
| "reward": 1.375, |
| "reward_std": 0.4765502139925957, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/format_reward": 1.0, |
| "step": 40 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 390.03125, |
| "epoch": 0.08541666666666667, |
| "grad_norm": 3.182037830352783, |
| "kl": 0.023590087890625, |
| "learning_rate": 9.715277777777776e-07, |
| "loss": 0.0, |
| "reward": 0.96875, |
| "reward_std": 0.4095756262540817, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.71875, |
| "step": 41 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 339.8125, |
| "epoch": 0.0875, |
| "grad_norm": 9.412776947021484, |
| "kl": 0.02490234375, |
| "learning_rate": 9.708333333333333e-07, |
| "loss": 0.0, |
| "reward": 0.96875, |
| "reward_std": 0.3377464786171913, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.8125, |
| "step": 42 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 364.1875, |
| "epoch": 0.08958333333333333, |
| "grad_norm": 19.119165420532227, |
| "kl": 0.0267333984375, |
| "learning_rate": 9.70138888888889e-07, |
| "loss": 0.0, |
| "reward": 1.46875, |
| "reward_std": 0.3377464786171913, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 1.0, |
| "step": 43 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 339.375, |
| "epoch": 0.09166666666666666, |
| "grad_norm": 5.377166748046875, |
| "kl": 0.027923583984375, |
| "learning_rate": 9.694444444444444e-07, |
| "loss": 0.0, |
| "reward": 1.09375, |
| "reward_std": 0.494472935795784, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/format_reward": 0.78125, |
| "step": 44 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 413.1875, |
| "epoch": 0.09375, |
| "grad_norm": 11.015312194824219, |
| "kl": 0.02325439453125, |
| "learning_rate": 9.6875e-07, |
| "loss": 0.0, |
| "reward": 1.09375, |
| "reward_std": 0.3377464786171913, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.75, |
| "step": 45 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 352.1875, |
| "epoch": 0.09583333333333334, |
| "grad_norm": 9.002971649169922, |
| "kl": 0.032989501953125, |
| "learning_rate": 9.680555555555555e-07, |
| "loss": 0.0, |
| "reward": 1.21875, |
| "reward_std": 0.4218914955854416, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 1.0, |
| "step": 46 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 353.6875, |
| "epoch": 0.09791666666666667, |
| "grad_norm": 9.63524055480957, |
| "kl": 0.03021240234375, |
| "learning_rate": 9.673611111111111e-07, |
| "loss": 0.0, |
| "reward": 0.875, |
| "reward_std": 0.4080249145627022, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.6875, |
| "step": 47 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 330.6875, |
| "epoch": 0.1, |
| "grad_norm": 6.078580379486084, |
| "kl": 0.03594970703125, |
| "learning_rate": 9.666666666666666e-07, |
| "loss": 0.0, |
| "reward": 0.625, |
| "reward_std": 0.3535533845424652, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.5625, |
| "step": 48 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 331.53125, |
| "epoch": 0.10208333333333333, |
| "grad_norm": 2.8953256607055664, |
| "kl": 0.034576416015625, |
| "learning_rate": 9.659722222222222e-07, |
| "loss": 0.0, |
| "reward": 0.90625, |
| "reward_std": 0.3061639815568924, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.75, |
| "step": 49 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 345.125, |
| "epoch": 0.10416666666666667, |
| "grad_norm": 1.4677155017852783, |
| "kl": 0.031280517578125, |
| "learning_rate": 9.652777777777777e-07, |
| "loss": 0.0, |
| "reward": 0.78125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.5, |
| "step": 50 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 321.03125, |
| "epoch": 0.10625, |
| "grad_norm": 4.762105464935303, |
| "kl": 0.03411865234375, |
| "learning_rate": 9.645833333333333e-07, |
| "loss": 0.0, |
| "reward": 0.71875, |
| "reward_std": 0.3987956568598747, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.5625, |
| "step": 51 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 357.09375, |
| "epoch": 0.10833333333333334, |
| "grad_norm": 5.787018775939941, |
| "kl": 0.03045654296875, |
| "learning_rate": 9.638888888888888e-07, |
| "loss": 0.0, |
| "reward": 1.59375, |
| "reward_std": 0.4534739926457405, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 1.0, |
| "step": 52 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 364.125, |
| "epoch": 0.11041666666666666, |
| "grad_norm": 7.240172386169434, |
| "kl": 0.03271484375, |
| "learning_rate": 9.631944444444444e-07, |
| "loss": 0.0, |
| "reward": 0.875, |
| "reward_std": 0.3745020925998688, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/format_reward": 0.5625, |
| "step": 53 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 373.3125, |
| "epoch": 0.1125, |
| "grad_norm": 5.157228946685791, |
| "kl": 0.030731201171875, |
| "learning_rate": 9.624999999999999e-07, |
| "loss": 0.0, |
| "reward": 1.34375, |
| "reward_std": 0.5038893818855286, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 1.0, |
| "step": 54 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 415.8125, |
| "epoch": 0.11458333333333333, |
| "grad_norm": 2.8105556964874268, |
| "kl": 0.028778076171875, |
| "learning_rate": 9.618055555555555e-07, |
| "loss": 0.0, |
| "reward": 1.0625, |
| "reward_std": 0.3745020925998688, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.8125, |
| "step": 55 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 344.0625, |
| "epoch": 0.11666666666666667, |
| "grad_norm": 4.457938194274902, |
| "kl": 0.0325927734375, |
| "learning_rate": 9.61111111111111e-07, |
| "loss": 0.0, |
| "reward": 1.28125, |
| "reward_std": 0.2651650384068489, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 1.0, |
| "step": 56 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 349.90625, |
| "epoch": 0.11875, |
| "grad_norm": 29.321313858032227, |
| "kl": 0.03338623046875, |
| "learning_rate": 9.604166666666666e-07, |
| "loss": 0.0, |
| "reward": 0.78125, |
| "reward_std": 0.3471629247069359, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.625, |
| "step": 57 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 336.03125, |
| "epoch": 0.12083333333333333, |
| "grad_norm": 3.946969747543335, |
| "kl": 0.0340576171875, |
| "learning_rate": 9.597222222222223e-07, |
| "loss": 0.0, |
| "reward": 1.53125, |
| "reward_std": 0.494472935795784, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 1.0, |
| "step": 58 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 401.0625, |
| "epoch": 0.12291666666666666, |
| "grad_norm": 5.483251094818115, |
| "kl": 0.03076171875, |
| "learning_rate": 9.590277777777777e-07, |
| "loss": 0.0, |
| "reward": 1.125, |
| "reward_std": 0.408231720328331, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/format_reward": 0.8125, |
| "step": 59 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 390.5, |
| "epoch": 0.125, |
| "grad_norm": 3.5011932849884033, |
| "kl": 0.029449462890625, |
| "learning_rate": 9.583333333333334e-07, |
| "loss": 0.0, |
| "reward": 1.09375, |
| "reward_std": 0.5038893818855286, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.8125, |
| "step": 60 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 414.59375, |
| "epoch": 0.12708333333333333, |
| "grad_norm": 9.172750473022461, |
| "kl": 0.02972412109375, |
| "learning_rate": 9.576388888888888e-07, |
| "loss": 0.0, |
| "reward": 1.59375, |
| "reward_std": 0.38816186785697937, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 1.0, |
| "step": 61 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 354.21875, |
| "epoch": 0.12916666666666668, |
| "grad_norm": 3.7546005249023438, |
| "kl": 0.03594970703125, |
| "learning_rate": 9.569444444444445e-07, |
| "loss": 0.0, |
| "reward": 1.03125, |
| "reward_std": 0.35564958304166794, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.75, |
| "step": 62 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 361.03125, |
| "epoch": 0.13125, |
| "grad_norm": 3.4299263954162598, |
| "kl": 0.03411865234375, |
| "learning_rate": 9.5625e-07, |
| "loss": 0.0, |
| "reward": 0.8125, |
| "reward_std": 0.4671337679028511, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.59375, |
| "step": 63 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 402.6875, |
| "epoch": 0.13333333333333333, |
| "grad_norm": 2.6421077251434326, |
| "kl": 0.034515380859375, |
| "learning_rate": 9.555555555555556e-07, |
| "loss": 0.0, |
| "reward": 0.96875, |
| "reward_std": 0.3377464786171913, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.78125, |
| "step": 64 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 333.3125, |
| "epoch": 0.13541666666666666, |
| "grad_norm": 3.543858051300049, |
| "kl": 0.0399169921875, |
| "learning_rate": 9.54861111111111e-07, |
| "loss": 0.0, |
| "reward": 1.40625, |
| "reward_std": 0.4628904387354851, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 1.0, |
| "step": 65 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 345.40625, |
| "epoch": 0.1375, |
| "grad_norm": 4.455636501312256, |
| "kl": 0.04608154296875, |
| "learning_rate": 9.541666666666667e-07, |
| "loss": 0.0, |
| "reward": 1.28125, |
| "reward_std": 0.5116237476468086, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.9375, |
| "step": 66 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 364.9375, |
| "epoch": 0.13958333333333334, |
| "grad_norm": 6.6961259841918945, |
| "kl": 0.04595947265625, |
| "learning_rate": 9.534722222222223e-07, |
| "loss": 0.0, |
| "reward": 0.8125, |
| "reward_std": 0.4671337679028511, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.71875, |
| "step": 67 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 376.59375, |
| "epoch": 0.14166666666666666, |
| "grad_norm": 1.9689209461212158, |
| "kl": 0.033203125, |
| "learning_rate": 9.527777777777777e-07, |
| "loss": 0.0, |
| "reward": 0.71875, |
| "reward_std": 0.2630179077386856, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.5, |
| "step": 68 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 354.6875, |
| "epoch": 0.14375, |
| "grad_norm": 4.801360130310059, |
| "kl": 0.0440673828125, |
| "learning_rate": 9.520833333333333e-07, |
| "loss": 0.0, |
| "reward": 1.0, |
| "reward_std": 0.3514062538743019, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.78125, |
| "step": 69 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 359.375, |
| "epoch": 0.14583333333333334, |
| "grad_norm": 4.846868515014648, |
| "kl": 0.036529541015625, |
| "learning_rate": 9.513888888888888e-07, |
| "loss": 0.0, |
| "reward": 1.4375, |
| "reward_std": 0.49022960662841797, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 1.0, |
| "step": 70 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 342.5, |
| "epoch": 0.14791666666666667, |
| "grad_norm": 4.30186128616333, |
| "kl": 0.06610107421875, |
| "learning_rate": 9.506944444444444e-07, |
| "loss": 0.0001, |
| "reward": 1.03125, |
| "reward_std": 0.3808925524353981, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.78125, |
| "step": 71 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 335.40625, |
| "epoch": 0.15, |
| "grad_norm": 3.9767167568206787, |
| "kl": 0.05157470703125, |
| "learning_rate": 9.499999999999999e-07, |
| "loss": 0.0001, |
| "reward": 0.8125, |
| "reward_std": 0.4355512708425522, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5625, |
| "step": 72 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 376.65625, |
| "epoch": 0.15208333333333332, |
| "grad_norm": 2.6517255306243896, |
| "kl": 0.03857421875, |
| "learning_rate": 9.493055555555555e-07, |
| "loss": 0.0, |
| "reward": 0.84375, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.75, |
| "step": 73 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 382.0, |
| "epoch": 0.15416666666666667, |
| "grad_norm": 6.51157283782959, |
| "kl": 0.04010009765625, |
| "learning_rate": 9.48611111111111e-07, |
| "loss": 0.0, |
| "reward": 1.53125, |
| "reward_std": 0.5038893818855286, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 1.0, |
| "step": 74 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 390.6875, |
| "epoch": 0.15625, |
| "grad_norm": 4.548913478851318, |
| "kl": 0.0443115234375, |
| "learning_rate": 9.479166666666666e-07, |
| "loss": 0.0, |
| "reward": 0.53125, |
| "reward_std": 0.3198433741927147, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.34375, |
| "step": 75 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 373.34375, |
| "epoch": 0.15833333333333333, |
| "grad_norm": 3.1202585697174072, |
| "kl": 0.04510498046875, |
| "learning_rate": 9.472222222222221e-07, |
| "loss": 0.0, |
| "reward": 0.9375, |
| "reward_std": 0.3104073107242584, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.78125, |
| "step": 76 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 354.25, |
| "epoch": 0.16041666666666668, |
| "grad_norm": 2.928286552429199, |
| "kl": 0.04559326171875, |
| "learning_rate": 9.465277777777777e-07, |
| "loss": 0.0, |
| "reward": 1.15625, |
| "reward_std": 0.3608423173427582, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.75, |
| "step": 77 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 381.3125, |
| "epoch": 0.1625, |
| "grad_norm": 5.202105522155762, |
| "kl": 0.04644775390625, |
| "learning_rate": 9.458333333333333e-07, |
| "loss": 0.0, |
| "reward": 1.5, |
| "reward_std": 0.44403792917728424, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 1.0, |
| "step": 78 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 374.21875, |
| "epoch": 0.16458333333333333, |
| "grad_norm": 6.891729831695557, |
| "kl": 0.04815673828125, |
| "learning_rate": 9.451388888888889e-07, |
| "loss": 0.0, |
| "reward": 0.40625, |
| "reward_std": 0.3808925524353981, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.375, |
| "step": 79 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 362.5625, |
| "epoch": 0.16666666666666666, |
| "grad_norm": 5.241474151611328, |
| "kl": 0.04620361328125, |
| "learning_rate": 9.444444444444444e-07, |
| "loss": 0.0, |
| "reward": 1.0625, |
| "reward_std": 0.4671337679028511, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.78125, |
| "step": 80 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 376.96875, |
| "epoch": 0.16875, |
| "grad_norm": 4.389130592346191, |
| "kl": 0.048095703125, |
| "learning_rate": 9.4375e-07, |
| "loss": 0.0, |
| "reward": 1.34375, |
| "reward_std": 0.4628904387354851, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 1.0, |
| "step": 81 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 398.84375, |
| "epoch": 0.17083333333333334, |
| "grad_norm": 8.09018611907959, |
| "kl": 0.043701171875, |
| "learning_rate": 9.430555555555555e-07, |
| "loss": 0.0, |
| "reward": 0.90625, |
| "reward_std": 0.3198433741927147, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.78125, |
| "step": 82 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 386.84375, |
| "epoch": 0.17291666666666666, |
| "grad_norm": 2.641420602798462, |
| "kl": 0.04937744140625, |
| "learning_rate": 9.423611111111111e-07, |
| "loss": 0.0, |
| "reward": 1.40625, |
| "reward_std": 0.3787454217672348, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 1.0, |
| "step": 83 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 355.78125, |
| "epoch": 0.175, |
| "grad_norm": 7.009854793548584, |
| "kl": 0.04791259765625, |
| "learning_rate": 9.416666666666666e-07, |
| "loss": 0.0, |
| "reward": 1.0625, |
| "reward_std": 0.4671337679028511, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.78125, |
| "step": 84 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 394.3125, |
| "epoch": 0.17708333333333334, |
| "grad_norm": 2.642425537109375, |
| "kl": 0.0491943359375, |
| "learning_rate": 9.409722222222222e-07, |
| "loss": 0.0, |
| "reward": 1.15625, |
| "reward_std": 0.3198433741927147, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.75, |
| "step": 85 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 424.03125, |
| "epoch": 0.17916666666666667, |
| "grad_norm": 2.3896079063415527, |
| "kl": 0.04443359375, |
| "learning_rate": 9.402777777777777e-07, |
| "loss": 0.0, |
| "reward": 0.9375, |
| "reward_std": 0.3335031494498253, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.71875, |
| "step": 86 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 404.34375, |
| "epoch": 0.18125, |
| "grad_norm": 4.422949314117432, |
| "kl": 0.04510498046875, |
| "learning_rate": 9.395833333333333e-07, |
| "loss": 0.0, |
| "reward": 1.0, |
| "reward_std": 0.3745020925998688, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.75, |
| "step": 87 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 438.65625, |
| "epoch": 0.18333333333333332, |
| "grad_norm": 3.236891746520996, |
| "kl": 0.04571533203125, |
| "learning_rate": 9.388888888888888e-07, |
| "loss": 0.0, |
| "reward": 0.9375, |
| "reward_std": 0.3104073107242584, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.75, |
| "step": 88 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 434.9375, |
| "epoch": 0.18541666666666667, |
| "grad_norm": 8.649247169494629, |
| "kl": 0.04290771484375, |
| "learning_rate": 9.381944444444444e-07, |
| "loss": 0.0, |
| "reward": 0.78125, |
| "reward_std": 0.24511480331420898, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.5, |
| "step": 89 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 429.125, |
| "epoch": 0.1875, |
| "grad_norm": 3.4409844875335693, |
| "kl": 0.04608154296875, |
| "learning_rate": 9.374999999999999e-07, |
| "loss": 0.0, |
| "reward": 1.125, |
| "reward_std": 0.4492306634783745, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.78125, |
| "step": 90 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 446.4375, |
| "epoch": 0.18958333333333333, |
| "grad_norm": 7.360363483428955, |
| "kl": 0.05267333984375, |
| "learning_rate": 9.368055555555555e-07, |
| "loss": 0.0001, |
| "reward": 1.125, |
| "reward_std": 0.3335031494498253, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.78125, |
| "step": 91 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 470.03125, |
| "epoch": 0.19166666666666668, |
| "grad_norm": 1.7753536701202393, |
| "kl": 0.046875, |
| "learning_rate": 9.361111111111111e-07, |
| "loss": 0.0, |
| "reward": 1.15625, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.75, |
| "step": 92 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 442.65625, |
| "epoch": 0.19375, |
| "grad_norm": 16.041807174682617, |
| "kl": 0.05078125, |
| "learning_rate": 9.354166666666667e-07, |
| "loss": 0.0001, |
| "reward": 0.96875, |
| "reward_std": 0.494472935795784, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.78125, |
| "step": 93 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 425.90625, |
| "epoch": 0.19583333333333333, |
| "grad_norm": 5.043431758880615, |
| "kl": 0.05303955078125, |
| "learning_rate": 9.347222222222222e-07, |
| "loss": 0.0001, |
| "reward": 1.28125, |
| "reward_std": 0.35564958304166794, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 1.0, |
| "step": 94 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 457.1875, |
| "epoch": 0.19791666666666666, |
| "grad_norm": 3.5956928730010986, |
| "kl": 0.0496826171875, |
| "learning_rate": 9.340277777777778e-07, |
| "loss": 0.0, |
| "reward": 0.96875, |
| "reward_std": 0.2630179077386856, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.75, |
| "step": 95 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 456.5, |
| "epoch": 0.2, |
| "grad_norm": 2.4647815227508545, |
| "kl": 0.0474853515625, |
| "learning_rate": 9.333333333333333e-07, |
| "loss": 0.0, |
| "reward": 1.03125, |
| "reward_std": 0.35564958304166794, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.75, |
| "step": 96 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 423.09375, |
| "epoch": 0.20208333333333334, |
| "grad_norm": 4.0429534912109375, |
| "kl": 0.05426025390625, |
| "learning_rate": 9.326388888888889e-07, |
| "loss": 0.0001, |
| "reward": 1.6875, |
| "reward_std": 0.49022960662841797, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 1.0, |
| "step": 97 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 446.46875, |
| "epoch": 0.20416666666666666, |
| "grad_norm": 4.266984939575195, |
| "kl": 0.07110595703125, |
| "learning_rate": 9.319444444444444e-07, |
| "loss": 0.0001, |
| "reward": 1.4375, |
| "reward_std": 0.38298875093460083, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 1.0, |
| "step": 98 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 463.28125, |
| "epoch": 0.20625, |
| "grad_norm": 2.255748748779297, |
| "kl": 0.04803466796875, |
| "learning_rate": 9.3125e-07, |
| "loss": 0.0, |
| "reward": 1.0625, |
| "reward_std": 0.3514062538743019, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/format_reward": 0.75, |
| "step": 99 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 431.90625, |
| "epoch": 0.20833333333333334, |
| "grad_norm": 2.3120131492614746, |
| "kl": 0.05810546875, |
| "learning_rate": 9.305555555555555e-07, |
| "loss": 0.0001, |
| "reward": 1.09375, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.75, |
| "step": 100 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 416.53125, |
| "epoch": 0.21041666666666667, |
| "grad_norm": 3.011608839035034, |
| "kl": 0.048583984375, |
| "learning_rate": 9.298611111111111e-07, |
| "loss": 0.0, |
| "reward": 1.5625, |
| "reward_std": 0.49022960662841797, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 1.0, |
| "step": 101 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 425.5, |
| "epoch": 0.2125, |
| "grad_norm": 3.771697521209717, |
| "kl": 0.05657958984375, |
| "learning_rate": 9.291666666666666e-07, |
| "loss": 0.0001, |
| "reward": 1.125, |
| "reward_std": 0.3650856465101242, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/format_reward": 0.75, |
| "step": 102 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 397.59375, |
| "epoch": 0.21458333333333332, |
| "grad_norm": 3.9330313205718994, |
| "kl": 0.07916259765625, |
| "learning_rate": 9.284722222222222e-07, |
| "loss": 0.0001, |
| "reward": 1.5, |
| "reward_std": 0.5081327110528946, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 1.0, |
| "step": 103 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 461.5, |
| "epoch": 0.21666666666666667, |
| "grad_norm": 3.8951761722564697, |
| "kl": 0.04876708984375, |
| "learning_rate": 9.277777777777777e-07, |
| "loss": 0.0, |
| "reward": 1.5625, |
| "reward_std": 0.49022960662841797, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 1.0, |
| "step": 104 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 402.46875, |
| "epoch": 0.21875, |
| "grad_norm": 5.237806797027588, |
| "kl": 0.05755615234375, |
| "learning_rate": 9.270833333333333e-07, |
| "loss": 0.0001, |
| "reward": 0.8125, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/format_reward": 0.5, |
| "step": 105 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 407.625, |
| "epoch": 0.22083333333333333, |
| "grad_norm": 2.3644521236419678, |
| "kl": 0.08203125, |
| "learning_rate": 9.263888888888889e-07, |
| "loss": 0.0001, |
| "reward": 0.78125, |
| "reward_std": 0.2651650384068489, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/format_reward": 0.46875, |
| "step": 106 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 390.84375, |
| "epoch": 0.22291666666666668, |
| "grad_norm": 2.5936055183410645, |
| "kl": 0.06103515625, |
| "learning_rate": 9.256944444444445e-07, |
| "loss": 0.0001, |
| "reward": 0.90625, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.5, |
| "step": 107 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 432.15625, |
| "epoch": 0.225, |
| "grad_norm": 8.296189308166504, |
| "kl": 0.05120849609375, |
| "learning_rate": 9.25e-07, |
| "loss": 0.0001, |
| "reward": 1.34375, |
| "reward_std": 0.4628904387354851, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 1.0, |
| "step": 108 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 378.125, |
| "epoch": 0.22708333333333333, |
| "grad_norm": 5.851869106292725, |
| "kl": 0.06024169921875, |
| "learning_rate": 9.243055555555556e-07, |
| "loss": 0.0001, |
| "reward": 1.0625, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/format_reward": 0.75, |
| "step": 109 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 383.75, |
| "epoch": 0.22916666666666666, |
| "grad_norm": 2.6394460201263428, |
| "kl": 0.05841064453125, |
| "learning_rate": 9.236111111111111e-07, |
| "loss": 0.0001, |
| "reward": 1.125, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/format_reward": 0.75, |
| "step": 110 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 403.0, |
| "epoch": 0.23125, |
| "grad_norm": 2.53338885307312, |
| "kl": 0.06317138671875, |
| "learning_rate": 9.229166666666667e-07, |
| "loss": 0.0001, |
| "reward": 1.0625, |
| "reward_std": 0.3514062538743019, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/format_reward": 0.75, |
| "step": 111 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 397.1875, |
| "epoch": 0.23333333333333334, |
| "grad_norm": 1.3787952661514282, |
| "kl": 0.06109619140625, |
| "learning_rate": 9.222222222222222e-07, |
| "loss": 0.0001, |
| "reward": 0.90625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.75, |
| "step": 112 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 343.53125, |
| "epoch": 0.23541666666666666, |
| "grad_norm": 3.1469290256500244, |
| "kl": 0.06982421875, |
| "learning_rate": 9.215277777777777e-07, |
| "loss": 0.0001, |
| "reward": 1.09375, |
| "reward_std": 0.3608423173427582, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.75, |
| "step": 113 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 393.375, |
| "epoch": 0.2375, |
| "grad_norm": 2.1029345989227295, |
| "kl": 0.064208984375, |
| "learning_rate": 9.208333333333332e-07, |
| "loss": 0.0001, |
| "reward": 0.65625, |
| "reward_std": 0.22201896458864212, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.5, |
| "step": 114 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 365.65625, |
| "epoch": 0.23958333333333334, |
| "grad_norm": 7.250767230987549, |
| "kl": 0.06658935546875, |
| "learning_rate": 9.201388888888888e-07, |
| "loss": 0.0001, |
| "reward": 0.96875, |
| "reward_std": 0.24511480331420898, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.75, |
| "step": 115 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 358.8125, |
| "epoch": 0.24166666666666667, |
| "grad_norm": 2.3046934604644775, |
| "kl": 0.064208984375, |
| "learning_rate": 9.194444444444443e-07, |
| "loss": 0.0001, |
| "reward": 1.0, |
| "reward_std": 0.26726123690605164, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.75, |
| "step": 116 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 347.0, |
| "epoch": 0.24375, |
| "grad_norm": 1.9355486631393433, |
| "kl": 0.0821533203125, |
| "learning_rate": 9.187499999999999e-07, |
| "loss": 0.0001, |
| "reward": 0.71875, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.5, |
| "step": 117 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 332.25, |
| "epoch": 0.24583333333333332, |
| "grad_norm": 2.3174281120300293, |
| "kl": 0.0784912109375, |
| "learning_rate": 9.180555555555554e-07, |
| "loss": 0.0001, |
| "reward": 0.65625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.5, |
| "step": 118 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 358.75, |
| "epoch": 0.24791666666666667, |
| "grad_norm": 3.247333288192749, |
| "kl": 0.0699462890625, |
| "learning_rate": 9.17361111111111e-07, |
| "loss": 0.0001, |
| "reward": 1.09375, |
| "reward_std": 0.3061639815568924, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.75, |
| "step": 119 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 350.6875, |
| "epoch": 0.25, |
| "grad_norm": 2.650913953781128, |
| "kl": 0.07958984375, |
| "learning_rate": 9.166666666666665e-07, |
| "loss": 0.0001, |
| "reward": 1.65625, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 1.0, |
| "step": 120 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 324.34375, |
| "epoch": 0.2520833333333333, |
| "grad_norm": 4.5364603996276855, |
| "kl": 0.0885009765625, |
| "learning_rate": 9.159722222222222e-07, |
| "loss": 0.0001, |
| "reward": 1.34375, |
| "reward_std": 0.3061639815568924, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.75, |
| "step": 121 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 315.4375, |
| "epoch": 0.25416666666666665, |
| "grad_norm": 2.341081142425537, |
| "kl": 0.134765625, |
| "learning_rate": 9.152777777777777e-07, |
| "loss": 0.0001, |
| "reward": 0.59375, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.5, |
| "step": 122 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 310.4375, |
| "epoch": 0.25625, |
| "grad_norm": 1.698325276374817, |
| "kl": 0.1190185546875, |
| "learning_rate": 9.145833333333333e-07, |
| "loss": 0.0001, |
| "reward": 0.3125, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.25, |
| "step": 123 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 327.5, |
| "epoch": 0.25833333333333336, |
| "grad_norm": 2.0879573822021484, |
| "kl": 0.1610107421875, |
| "learning_rate": 9.138888888888888e-07, |
| "loss": 0.0002, |
| "reward": 1.09375, |
| "reward_std": 0.22201896458864212, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.75, |
| "step": 124 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 326.40625, |
| "epoch": 0.2604166666666667, |
| "grad_norm": 8.24306869506836, |
| "kl": 0.0919189453125, |
| "learning_rate": 9.131944444444444e-07, |
| "loss": 0.0001, |
| "reward": 1.75, |
| "reward_std": 0.3335031494498253, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 1.0, |
| "step": 125 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 258.65625, |
| "epoch": 0.2625, |
| "grad_norm": 4.286442279815674, |
| "kl": 0.1165771484375, |
| "learning_rate": 9.124999999999999e-07, |
| "loss": 0.0001, |
| "reward": 1.1875, |
| "reward_std": 0.38298875093460083, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.75, |
| "step": 126 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 279.8125, |
| "epoch": 0.26458333333333334, |
| "grad_norm": 36.095619201660156, |
| "kl": 0.1119384765625, |
| "learning_rate": 9.118055555555555e-07, |
| "loss": 0.0001, |
| "reward": 1.5625, |
| "reward_std": 0.48503687232732773, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 1.0, |
| "step": 127 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 290.15625, |
| "epoch": 0.26666666666666666, |
| "grad_norm": 2.5451955795288086, |
| "kl": 0.1114501953125, |
| "learning_rate": 9.11111111111111e-07, |
| "loss": 0.0001, |
| "reward": 1.25, |
| "reward_std": 0.3335031494498253, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 128 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 266.4375, |
| "epoch": 0.26875, |
| "grad_norm": 2.765319347381592, |
| "kl": 0.1148681640625, |
| "learning_rate": 9.104166666666666e-07, |
| "loss": 0.0001, |
| "reward": 0.84375, |
| "reward_std": 0.22201896458864212, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.5, |
| "step": 129 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 280.3125, |
| "epoch": 0.2708333333333333, |
| "grad_norm": 4.0128583908081055, |
| "kl": 0.1129150390625, |
| "learning_rate": 9.097222222222221e-07, |
| "loss": 0.0001, |
| "reward": 0.28125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.25, |
| "step": 130 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 244.5625, |
| "epoch": 0.27291666666666664, |
| "grad_norm": 4.9140801429748535, |
| "kl": 0.133544921875, |
| "learning_rate": 9.090277777777777e-07, |
| "loss": 0.0001, |
| "reward": 1.6875, |
| "reward_std": 0.4671337679028511, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 1.0, |
| "step": 131 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 245.96875, |
| "epoch": 0.275, |
| "grad_norm": 3.4184932708740234, |
| "kl": 0.1329345703125, |
| "learning_rate": 9.083333333333332e-07, |
| "loss": 0.0001, |
| "reward": 0.65625, |
| "reward_std": 0.24511480331420898, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.5, |
| "step": 132 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 264.90625, |
| "epoch": 0.27708333333333335, |
| "grad_norm": 4.854220390319824, |
| "kl": 0.1396484375, |
| "learning_rate": 9.076388888888888e-07, |
| "loss": 0.0001, |
| "reward": 1.46875, |
| "reward_std": 0.3808925524353981, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 1.0, |
| "step": 133 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 262.5, |
| "epoch": 0.2791666666666667, |
| "grad_norm": 3.3047986030578613, |
| "kl": 0.146240234375, |
| "learning_rate": 9.069444444444443e-07, |
| "loss": 0.0001, |
| "reward": 1.25, |
| "reward_std": 0.3650856465101242, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 134 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 264.84375, |
| "epoch": 0.28125, |
| "grad_norm": 6.767662048339844, |
| "kl": 0.1334228515625, |
| "learning_rate": 9.0625e-07, |
| "loss": 0.0001, |
| "reward": 1.0625, |
| "reward_std": 0.3335031494498253, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/format_reward": 0.75, |
| "step": 135 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 246.625, |
| "epoch": 0.2833333333333333, |
| "grad_norm": 2.0335617065429688, |
| "kl": 0.142578125, |
| "learning_rate": 9.055555555555556e-07, |
| "loss": 0.0001, |
| "reward": 0.4375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.25, |
| "step": 136 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 278.875, |
| "epoch": 0.28541666666666665, |
| "grad_norm": 12.565716743469238, |
| "kl": 0.1414794921875, |
| "learning_rate": 9.048611111111111e-07, |
| "loss": 0.0001, |
| "reward": 1.375, |
| "reward_std": 0.4492306634783745, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/format_reward": 1.0, |
| "step": 137 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 271.65625, |
| "epoch": 0.2875, |
| "grad_norm": 9.577066421508789, |
| "kl": 0.1416015625, |
| "learning_rate": 9.041666666666667e-07, |
| "loss": 0.0001, |
| "reward": 1.34375, |
| "reward_std": 0.22201896458864212, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.75, |
| "step": 138 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 269.375, |
| "epoch": 0.28958333333333336, |
| "grad_norm": 4.135301113128662, |
| "kl": 0.1522216796875, |
| "learning_rate": 9.034722222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.125, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/format_reward": 0.75, |
| "step": 139 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 278.59375, |
| "epoch": 0.2916666666666667, |
| "grad_norm": 2.6078929901123047, |
| "kl": 0.1363525390625, |
| "learning_rate": 9.027777777777778e-07, |
| "loss": 0.0001, |
| "reward": 0.4375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.25, |
| "step": 140 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 264.71875, |
| "epoch": 0.29375, |
| "grad_norm": 23.88492774963379, |
| "kl": 0.1279296875, |
| "learning_rate": 9.020833333333333e-07, |
| "loss": 0.0001, |
| "reward": 1.21875, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 141 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 268.03125, |
| "epoch": 0.29583333333333334, |
| "grad_norm": 2.0252206325531006, |
| "kl": 0.139404296875, |
| "learning_rate": 9.013888888888889e-07, |
| "loss": 0.0001, |
| "reward": 1.625, |
| "reward_std": 0.13363061845302582, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 1.0, |
| "step": 142 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 293.4375, |
| "epoch": 0.29791666666666666, |
| "grad_norm": 3.3563263416290283, |
| "kl": 0.12841796875, |
| "learning_rate": 9.006944444444444e-07, |
| "loss": 0.0001, |
| "reward": 1.03125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.75, |
| "step": 143 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 273.71875, |
| "epoch": 0.3, |
| "grad_norm": 2.928553819656372, |
| "kl": 0.156982421875, |
| "learning_rate": 9e-07, |
| "loss": 0.0002, |
| "reward": 1.65625, |
| "reward_std": 0.3377464786171913, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 1.0, |
| "step": 144 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 274.5625, |
| "epoch": 0.3020833333333333, |
| "grad_norm": 2.215178966522217, |
| "kl": 0.145263671875, |
| "learning_rate": 8.993055555555555e-07, |
| "loss": 0.0001, |
| "reward": 0.875, |
| "reward_std": 0.13363061845302582, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/format_reward": 0.5, |
| "step": 145 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 280.3125, |
| "epoch": 0.30416666666666664, |
| "grad_norm": 2.916330337524414, |
| "kl": 0.142822265625, |
| "learning_rate": 8.986111111111111e-07, |
| "loss": 0.0001, |
| "reward": 1.78125, |
| "reward_std": 0.3061639815568924, |
| "rewards/accuracy_reward": 0.78125, |
| "rewards/format_reward": 1.0, |
| "step": 146 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 290.5625, |
| "epoch": 0.30625, |
| "grad_norm": 2.6211419105529785, |
| "kl": 0.1351318359375, |
| "learning_rate": 8.979166666666666e-07, |
| "loss": 0.0001, |
| "reward": 1.21875, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 147 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 301.46875, |
| "epoch": 0.30833333333333335, |
| "grad_norm": 9.624622344970703, |
| "kl": 0.154296875, |
| "learning_rate": 8.972222222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.5625, |
| "reward_std": 0.5468482673168182, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.96875, |
| "step": 148 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 275.5, |
| "epoch": 0.3104166666666667, |
| "grad_norm": 1.7619229555130005, |
| "kl": 0.137939453125, |
| "learning_rate": 8.965277777777778e-07, |
| "loss": 0.0001, |
| "reward": 0.78125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.5, |
| "step": 149 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 320.78125, |
| "epoch": 0.3125, |
| "grad_norm": 3.684971570968628, |
| "kl": 0.12841796875, |
| "learning_rate": 8.958333333333334e-07, |
| "loss": 0.0001, |
| "reward": 1.75, |
| "reward_std": 0.4261348247528076, |
| "rewards/accuracy_reward": 0.78125, |
| "rewards/format_reward": 0.96875, |
| "step": 150 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 304.40625, |
| "epoch": 0.3145833333333333, |
| "grad_norm": 3.517716407775879, |
| "kl": 0.132568359375, |
| "learning_rate": 8.951388888888889e-07, |
| "loss": 0.0001, |
| "reward": 0.8125, |
| "reward_std": 0.2587745785713196, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/format_reward": 0.5, |
| "step": 151 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 303.125, |
| "epoch": 0.31666666666666665, |
| "grad_norm": 2.7190983295440674, |
| "kl": 0.131103515625, |
| "learning_rate": 8.944444444444445e-07, |
| "loss": 0.0001, |
| "reward": 0.71875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.5, |
| "step": 152 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 307.875, |
| "epoch": 0.31875, |
| "grad_norm": 3.1140782833099365, |
| "kl": 0.145751953125, |
| "learning_rate": 8.9375e-07, |
| "loss": 0.0001, |
| "reward": 1.3125, |
| "reward_std": 0.3335031494498253, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.71875, |
| "step": 153 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 305.9375, |
| "epoch": 0.32083333333333336, |
| "grad_norm": 5.122025012969971, |
| "kl": 0.132080078125, |
| "learning_rate": 8.930555555555556e-07, |
| "loss": 0.0001, |
| "reward": 1.28125, |
| "reward_std": 0.2630179077386856, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.71875, |
| "step": 154 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 307.9375, |
| "epoch": 0.3229166666666667, |
| "grad_norm": 2.554211378097534, |
| "kl": 0.149169921875, |
| "learning_rate": 8.923611111111111e-07, |
| "loss": 0.0001, |
| "reward": 1.125, |
| "reward_std": 0.27439429610967636, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.71875, |
| "step": 155 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 311.375, |
| "epoch": 0.325, |
| "grad_norm": 1.5763827562332153, |
| "kl": 0.1688232421875, |
| "learning_rate": 8.916666666666667e-07, |
| "loss": 0.0002, |
| "reward": 0.90625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.5, |
| "step": 156 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 309.78125, |
| "epoch": 0.32708333333333334, |
| "grad_norm": 3.9709863662719727, |
| "kl": 0.120361328125, |
| "learning_rate": 8.909722222222222e-07, |
| "loss": 0.0001, |
| "reward": 1.625, |
| "reward_std": 0.5081327110528946, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.9375, |
| "step": 157 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 316.0, |
| "epoch": 0.32916666666666666, |
| "grad_norm": 1.7929834127426147, |
| "kl": 0.1209716796875, |
| "learning_rate": 8.902777777777777e-07, |
| "loss": 0.0001, |
| "reward": 0.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.5, |
| "step": 158 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 282.28125, |
| "epoch": 0.33125, |
| "grad_norm": 5.112552165985107, |
| "kl": 0.143798828125, |
| "learning_rate": 8.895833333333332e-07, |
| "loss": 0.0001, |
| "reward": 1.40625, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.75, |
| "step": 159 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 328.4375, |
| "epoch": 0.3333333333333333, |
| "grad_norm": 2.3229613304138184, |
| "kl": 0.12841796875, |
| "learning_rate": 8.888888888888888e-07, |
| "loss": 0.0001, |
| "reward": 1.25, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 160 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 303.21875, |
| "epoch": 0.33541666666666664, |
| "grad_norm": 0.010016150772571564, |
| "kl": 0.124267578125, |
| "learning_rate": 8.881944444444443e-07, |
| "loss": 0.0001, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 161 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 312.65625, |
| "epoch": 0.3375, |
| "grad_norm": 2.9473917484283447, |
| "kl": 0.1375732421875, |
| "learning_rate": 8.874999999999999e-07, |
| "loss": 0.0001, |
| "reward": 1.375, |
| "reward_std": 0.2925042062997818, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.75, |
| "step": 162 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 308.3125, |
| "epoch": 0.33958333333333335, |
| "grad_norm": 19.425966262817383, |
| "kl": 0.19970703125, |
| "learning_rate": 8.868055555555555e-07, |
| "loss": 0.0002, |
| "reward": 0.9375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.5, |
| "step": 163 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 316.28125, |
| "epoch": 0.3416666666666667, |
| "grad_norm": 1.3836629390716553, |
| "kl": 0.135986328125, |
| "learning_rate": 8.861111111111111e-07, |
| "loss": 0.0001, |
| "reward": 1.15625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.75, |
| "step": 164 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 334.1875, |
| "epoch": 0.34375, |
| "grad_norm": 6.287445068359375, |
| "kl": 0.136474609375, |
| "learning_rate": 8.854166666666666e-07, |
| "loss": 0.0001, |
| "reward": 1.78125, |
| "reward_std": 0.3377464786171913, |
| "rewards/accuracy_reward": 0.78125, |
| "rewards/format_reward": 1.0, |
| "step": 165 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 319.15625, |
| "epoch": 0.3458333333333333, |
| "grad_norm": 4.487782001495361, |
| "kl": 0.1461181640625, |
| "learning_rate": 8.847222222222222e-07, |
| "loss": 0.0001, |
| "reward": 1.625, |
| "reward_std": 0.4671337679028511, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.96875, |
| "step": 166 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 300.625, |
| "epoch": 0.34791666666666665, |
| "grad_norm": 2.907824754714966, |
| "kl": 0.141845703125, |
| "learning_rate": 8.840277777777777e-07, |
| "loss": 0.0001, |
| "reward": 1.0625, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/format_reward": 0.75, |
| "step": 167 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 291.34375, |
| "epoch": 0.35, |
| "grad_norm": 2.528587818145752, |
| "kl": 0.146728515625, |
| "learning_rate": 8.833333333333333e-07, |
| "loss": 0.0001, |
| "reward": 1.34375, |
| "reward_std": 0.24511480331420898, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.75, |
| "step": 168 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 290.1875, |
| "epoch": 0.35208333333333336, |
| "grad_norm": 0.007634375710040331, |
| "kl": 0.14111328125, |
| "learning_rate": 8.826388888888888e-07, |
| "loss": 0.0001, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 169 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 301.75, |
| "epoch": 0.3541666666666667, |
| "grad_norm": 0.009850732050836086, |
| "kl": 0.145263671875, |
| "learning_rate": 8.819444444444444e-07, |
| "loss": 0.0001, |
| "reward": 0.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.25, |
| "step": 170 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 286.375, |
| "epoch": 0.35625, |
| "grad_norm": 4.6734089851379395, |
| "kl": 0.1787109375, |
| "learning_rate": 8.812499999999999e-07, |
| "loss": 0.0002, |
| "reward": 1.125, |
| "reward_std": 0.40089185535907745, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/format_reward": 0.75, |
| "step": 171 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 317.0, |
| "epoch": 0.35833333333333334, |
| "grad_norm": 1.5366886854171753, |
| "kl": 0.13427734375, |
| "learning_rate": 8.805555555555555e-07, |
| "loss": 0.0001, |
| "reward": 0.71875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.5, |
| "step": 172 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 286.5625, |
| "epoch": 0.36041666666666666, |
| "grad_norm": 0.022151868790388107, |
| "kl": 0.1435546875, |
| "learning_rate": 8.79861111111111e-07, |
| "loss": 0.0001, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 173 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 274.3125, |
| "epoch": 0.3625, |
| "grad_norm": 2.0597925186157227, |
| "kl": 0.1456298828125, |
| "learning_rate": 8.791666666666666e-07, |
| "loss": 0.0001, |
| "reward": 1.4375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.75, |
| "step": 174 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 276.03125, |
| "epoch": 0.3645833333333333, |
| "grad_norm": 0.007576541043817997, |
| "kl": 0.160888671875, |
| "learning_rate": 8.784722222222221e-07, |
| "loss": 0.0002, |
| "reward": 0.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.5, |
| "step": 175 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 267.4375, |
| "epoch": 0.36666666666666664, |
| "grad_norm": 4.791009902954102, |
| "kl": 0.166748046875, |
| "learning_rate": 8.777777777777777e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.3514062538743019, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 1.0, |
| "step": 176 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 278.3125, |
| "epoch": 0.36875, |
| "grad_norm": 1.4421019554138184, |
| "kl": 0.330810546875, |
| "learning_rate": 8.770833333333333e-07, |
| "loss": 0.0003, |
| "reward": 1.34375, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.75, |
| "step": 177 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 274.6875, |
| "epoch": 0.37083333333333335, |
| "grad_norm": 13.741174697875977, |
| "kl": 0.174560546875, |
| "learning_rate": 8.763888888888889e-07, |
| "loss": 0.0002, |
| "reward": 1.34375, |
| "reward_std": 0.22201896458864212, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.75, |
| "step": 178 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 293.71875, |
| "epoch": 0.3729166666666667, |
| "grad_norm": 2.762673854827881, |
| "kl": 0.158447265625, |
| "learning_rate": 8.756944444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.3650856465101242, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 179 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 265.71875, |
| "epoch": 0.375, |
| "grad_norm": 3.762700319290161, |
| "kl": 0.16552734375, |
| "learning_rate": 8.75e-07, |
| "loss": 0.0002, |
| "reward": 1.6875, |
| "reward_std": 0.3514062538743019, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 1.0, |
| "step": 180 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 312.59375, |
| "epoch": 0.3770833333333333, |
| "grad_norm": 2.4614808559417725, |
| "kl": 0.14794921875, |
| "learning_rate": 8.743055555555555e-07, |
| "loss": 0.0001, |
| "reward": 0.375, |
| "reward_std": 0.13363061845302582, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.25, |
| "step": 181 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 267.5, |
| "epoch": 0.37916666666666665, |
| "grad_norm": 2.320138692855835, |
| "kl": 0.169921875, |
| "learning_rate": 8.736111111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.8125, |
| "reward_std": 0.249358132481575, |
| "rewards/accuracy_reward": 0.8125, |
| "rewards/format_reward": 1.0, |
| "step": 182 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 267.46875, |
| "epoch": 0.38125, |
| "grad_norm": 1.7790272235870361, |
| "kl": 0.1593017578125, |
| "learning_rate": 8.729166666666666e-07, |
| "loss": 0.0002, |
| "reward": 1.6875, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 1.0, |
| "step": 183 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 280.5625, |
| "epoch": 0.38333333333333336, |
| "grad_norm": 2.4105911254882812, |
| "kl": 0.16015625, |
| "learning_rate": 8.722222222222222e-07, |
| "loss": 0.0002, |
| "reward": 0.84375, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.5, |
| "step": 184 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 265.84375, |
| "epoch": 0.3854166666666667, |
| "grad_norm": 2.337040424346924, |
| "kl": 0.190185546875, |
| "learning_rate": 8.715277777777777e-07, |
| "loss": 0.0002, |
| "reward": 1.375, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.75, |
| "step": 185 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 276.59375, |
| "epoch": 0.3875, |
| "grad_norm": 5.802671909332275, |
| "kl": 0.20263671875, |
| "learning_rate": 8.708333333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.65625, |
| "reward_std": 0.4218914955854416, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 1.0, |
| "step": 186 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 256.8125, |
| "epoch": 0.38958333333333334, |
| "grad_norm": 2.9159724712371826, |
| "kl": 0.18212890625, |
| "learning_rate": 8.701388888888888e-07, |
| "loss": 0.0002, |
| "reward": 0.9375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.5, |
| "step": 187 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 265.125, |
| "epoch": 0.39166666666666666, |
| "grad_norm": 21.03277015686035, |
| "kl": 0.178466796875, |
| "learning_rate": 8.694444444444444e-07, |
| "loss": 0.0002, |
| "reward": 0.90625, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.5, |
| "step": 188 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 276.0625, |
| "epoch": 0.39375, |
| "grad_norm": 1.9231356382369995, |
| "kl": 0.187255859375, |
| "learning_rate": 8.687499999999999e-07, |
| "loss": 0.0002, |
| "reward": 1.4375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.75, |
| "step": 189 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 274.90625, |
| "epoch": 0.3958333333333333, |
| "grad_norm": 1.5890367031097412, |
| "kl": 0.1796875, |
| "learning_rate": 8.680555555555555e-07, |
| "loss": 0.0002, |
| "reward": 0.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.5, |
| "step": 190 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 282.5, |
| "epoch": 0.39791666666666664, |
| "grad_norm": 1.5505869388580322, |
| "kl": 0.163330078125, |
| "learning_rate": 8.673611111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 191 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 264.53125, |
| "epoch": 0.4, |
| "grad_norm": 0.04606321454048157, |
| "kl": 0.180419921875, |
| "learning_rate": 8.666666666666667e-07, |
| "loss": 0.0002, |
| "reward": 0.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.25, |
| "step": 192 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 263.28125, |
| "epoch": 0.40208333333333335, |
| "grad_norm": 4.318572044372559, |
| "kl": 0.201171875, |
| "learning_rate": 8.659722222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.35564958304166794, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 1.0, |
| "step": 193 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 263.34375, |
| "epoch": 0.4041666666666667, |
| "grad_norm": 2.215465784072876, |
| "kl": 0.185546875, |
| "learning_rate": 8.652777777777778e-07, |
| "loss": 0.0002, |
| "reward": 0.9375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.5, |
| "step": 194 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 256.5625, |
| "epoch": 0.40625, |
| "grad_norm": 1.9885610342025757, |
| "kl": 0.22119140625, |
| "learning_rate": 8.645833333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.8125, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.8125, |
| "rewards/format_reward": 1.0, |
| "step": 195 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 286.5, |
| "epoch": 0.4083333333333333, |
| "grad_norm": 2.8268754482269287, |
| "kl": 0.1611328125, |
| "learning_rate": 8.638888888888889e-07, |
| "loss": 0.0002, |
| "reward": 1.84375, |
| "reward_std": 0.3061639815568924, |
| "rewards/accuracy_reward": 0.84375, |
| "rewards/format_reward": 1.0, |
| "step": 196 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 285.09375, |
| "epoch": 0.41041666666666665, |
| "grad_norm": 3.276139497756958, |
| "kl": 0.173828125, |
| "learning_rate": 8.631944444444445e-07, |
| "loss": 0.0002, |
| "reward": 1.9375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.9375, |
| "rewards/format_reward": 1.0, |
| "step": 197 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 281.65625, |
| "epoch": 0.4125, |
| "grad_norm": 3.957500696182251, |
| "kl": 0.1904296875, |
| "learning_rate": 8.625e-07, |
| "loss": 0.0002, |
| "reward": 1.34375, |
| "reward_std": 0.24511480331420898, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.75, |
| "step": 198 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 282.21875, |
| "epoch": 0.41458333333333336, |
| "grad_norm": 1.9258300065994263, |
| "kl": 0.17919921875, |
| "learning_rate": 8.618055555555556e-07, |
| "loss": 0.0002, |
| "reward": 1.09375, |
| "reward_std": 0.24511480331420898, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.75, |
| "step": 199 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 303.71875, |
| "epoch": 0.4166666666666667, |
| "grad_norm": 1.656267523765564, |
| "kl": 0.1787109375, |
| "learning_rate": 8.611111111111111e-07, |
| "loss": 0.0002, |
| "reward": 0.9375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.5, |
| "step": 200 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 281.96875, |
| "epoch": 0.41875, |
| "grad_norm": 3.227402687072754, |
| "kl": 0.18408203125, |
| "learning_rate": 8.604166666666667e-07, |
| "loss": 0.0002, |
| "reward": 1.4375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 1.0, |
| "step": 201 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 294.28125, |
| "epoch": 0.42083333333333334, |
| "grad_norm": 1.7516613006591797, |
| "kl": 0.1708984375, |
| "learning_rate": 8.597222222222222e-07, |
| "loss": 0.0002, |
| "reward": 0.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.5, |
| "step": 202 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 319.21875, |
| "epoch": 0.42291666666666666, |
| "grad_norm": 2.2734436988830566, |
| "kl": 0.17041015625, |
| "learning_rate": 8.590277777777776e-07, |
| "loss": 0.0002, |
| "reward": 1.875, |
| "reward_std": 0.2314550280570984, |
| "rewards/accuracy_reward": 0.875, |
| "rewards/format_reward": 1.0, |
| "step": 203 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 298.25, |
| "epoch": 0.425, |
| "grad_norm": 2.0509085655212402, |
| "kl": 0.18212890625, |
| "learning_rate": 8.583333333333332e-07, |
| "loss": 0.0002, |
| "reward": 0.78125, |
| "reward_std": 0.24511480331420898, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.5, |
| "step": 204 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 300.25, |
| "epoch": 0.4270833333333333, |
| "grad_norm": 1.707233190536499, |
| "kl": 0.186279296875, |
| "learning_rate": 8.576388888888887e-07, |
| "loss": 0.0002, |
| "reward": 0.40625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.25, |
| "step": 205 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 303.3125, |
| "epoch": 0.42916666666666664, |
| "grad_norm": 0.017311234027147293, |
| "kl": 0.20947265625, |
| "learning_rate": 8.569444444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 206 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 307.0625, |
| "epoch": 0.43125, |
| "grad_norm": 5.802515506744385, |
| "kl": 0.167236328125, |
| "learning_rate": 8.5625e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.4218914955854416, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 1.0, |
| "step": 207 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 290.84375, |
| "epoch": 0.43333333333333335, |
| "grad_norm": 2.5334019660949707, |
| "kl": 0.179931640625, |
| "learning_rate": 8.555555555555555e-07, |
| "loss": 0.0002, |
| "reward": 0.71875, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.5, |
| "step": 208 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 305.15625, |
| "epoch": 0.4354166666666667, |
| "grad_norm": 2.551426649093628, |
| "kl": 0.1845703125, |
| "learning_rate": 8.548611111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.53125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 1.0, |
| "step": 209 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 325.5, |
| "epoch": 0.4375, |
| "grad_norm": 6.069310665130615, |
| "kl": 0.181640625, |
| "learning_rate": 8.541666666666666e-07, |
| "loss": 0.0002, |
| "reward": 1.6875, |
| "reward_std": 0.3745020925998688, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 1.0, |
| "step": 210 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 357.09375, |
| "epoch": 0.4395833333333333, |
| "grad_norm": 3.099595308303833, |
| "kl": 0.1796875, |
| "learning_rate": 8.534722222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.15625, |
| "reward_std": 0.3061639815568924, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.71875, |
| "step": 211 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 323.59375, |
| "epoch": 0.44166666666666665, |
| "grad_norm": 2.682811737060547, |
| "kl": 0.1591796875, |
| "learning_rate": 8.527777777777777e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.2314550280570984, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 212 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 359.59375, |
| "epoch": 0.44375, |
| "grad_norm": 2.0939760208129883, |
| "kl": 0.18212890625, |
| "learning_rate": 8.520833333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.3125, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.75, |
| "step": 213 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 354.375, |
| "epoch": 0.44583333333333336, |
| "grad_norm": 2.2327020168304443, |
| "kl": 0.18212890625, |
| "learning_rate": 8.513888888888888e-07, |
| "loss": 0.0002, |
| "reward": 1.59375, |
| "reward_std": 0.24511480331420898, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 1.0, |
| "step": 214 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 383.71875, |
| "epoch": 0.4479166666666667, |
| "grad_norm": 2.284752607345581, |
| "kl": 0.157958984375, |
| "learning_rate": 8.506944444444444e-07, |
| "loss": 0.0002, |
| "reward": 0.84375, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.5, |
| "step": 215 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 393.8125, |
| "epoch": 0.45, |
| "grad_norm": 4.905982971191406, |
| "kl": 0.15869140625, |
| "learning_rate": 8.499999999999999e-07, |
| "loss": 0.0002, |
| "reward": 1.53125, |
| "reward_std": 0.3787454217672348, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 1.0, |
| "step": 216 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 380.21875, |
| "epoch": 0.45208333333333334, |
| "grad_norm": 2.2356278896331787, |
| "kl": 0.15185546875, |
| "learning_rate": 8.493055555555555e-07, |
| "loss": 0.0002, |
| "reward": 1.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.96875, |
| "rewards/format_reward": 1.0, |
| "step": 217 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 410.625, |
| "epoch": 0.45416666666666666, |
| "grad_norm": 0.00922483392059803, |
| "kl": 0.154541015625, |
| "learning_rate": 8.48611111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 218 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 433.46875, |
| "epoch": 0.45625, |
| "grad_norm": 7.279197692871094, |
| "kl": 0.149658203125, |
| "learning_rate": 8.479166666666667e-07, |
| "loss": 0.0001, |
| "reward": 1.1875, |
| "reward_std": 0.2587745785713196, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.75, |
| "step": 219 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 363.03125, |
| "epoch": 0.4583333333333333, |
| "grad_norm": 15.603084564208984, |
| "kl": 0.15869140625, |
| "learning_rate": 8.472222222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 220 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 404.78125, |
| "epoch": 0.46041666666666664, |
| "grad_norm": 1.14226233959198, |
| "kl": 0.1630859375, |
| "learning_rate": 8.465277777777778e-07, |
| "loss": 0.0002, |
| "reward": 1.34375, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.75, |
| "step": 221 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 394.78125, |
| "epoch": 0.4625, |
| "grad_norm": 3.340059280395508, |
| "kl": 0.1669921875, |
| "learning_rate": 8.458333333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.09375, |
| "reward_std": 0.22201896458864212, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.75, |
| "step": 222 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 369.375, |
| "epoch": 0.46458333333333335, |
| "grad_norm": 1.2038440704345703, |
| "kl": 0.175537109375, |
| "learning_rate": 8.451388888888889e-07, |
| "loss": 0.0002, |
| "reward": 0.90625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.5, |
| "step": 223 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 413.53125, |
| "epoch": 0.4666666666666667, |
| "grad_norm": 3.947021722793579, |
| "kl": 0.165283203125, |
| "learning_rate": 8.444444444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.1875, |
| "reward_std": 0.2925042062997818, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.75, |
| "step": 224 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 392.5, |
| "epoch": 0.46875, |
| "grad_norm": 4.695667266845703, |
| "kl": 0.17724609375, |
| "learning_rate": 8.4375e-07, |
| "loss": 0.0002, |
| "reward": 1.90625, |
| "reward_std": 0.2651650384068489, |
| "rewards/accuracy_reward": 0.9375, |
| "rewards/format_reward": 0.96875, |
| "step": 225 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 438.9375, |
| "epoch": 0.4708333333333333, |
| "grad_norm": 3.941527843475342, |
| "kl": 0.160888671875, |
| "learning_rate": 8.430555555555555e-07, |
| "loss": 0.0002, |
| "reward": 1.1875, |
| "reward_std": 0.3745020925998688, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.75, |
| "step": 226 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 404.96875, |
| "epoch": 0.47291666666666665, |
| "grad_norm": 1.2121331691741943, |
| "kl": 0.177734375, |
| "learning_rate": 8.423611111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.40625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.75, |
| "step": 227 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 438.75, |
| "epoch": 0.475, |
| "grad_norm": 1.2622644901275635, |
| "kl": 0.17529296875, |
| "learning_rate": 8.416666666666666e-07, |
| "loss": 0.0002, |
| "reward": 0.8125, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/format_reward": 0.5, |
| "step": 228 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 429.15625, |
| "epoch": 0.47708333333333336, |
| "grad_norm": 2.040025472640991, |
| "kl": 0.167236328125, |
| "learning_rate": 8.409722222222222e-07, |
| "loss": 0.0002, |
| "reward": 0.9375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.5, |
| "step": 229 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 448.0625, |
| "epoch": 0.4791666666666667, |
| "grad_norm": 2.3206212520599365, |
| "kl": 0.16357421875, |
| "learning_rate": 8.402777777777777e-07, |
| "loss": 0.0002, |
| "reward": 1.71875, |
| "reward_std": 0.3787454217672348, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 1.0, |
| "step": 230 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 467.65625, |
| "epoch": 0.48125, |
| "grad_norm": 2.9016504287719727, |
| "kl": 0.17041015625, |
| "learning_rate": 8.395833333333333e-07, |
| "loss": 0.0002, |
| "reward": 0.71875, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.5, |
| "step": 231 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 477.4375, |
| "epoch": 0.48333333333333334, |
| "grad_norm": 0.07506023347377777, |
| "kl": 0.177001953125, |
| "learning_rate": 8.388888888888888e-07, |
| "loss": 0.0002, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.0, |
| "rewards/format_reward": 1.0, |
| "step": 232 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 480.90625, |
| "epoch": 0.48541666666666666, |
| "grad_norm": 1.9703696966171265, |
| "kl": 0.1591796875, |
| "learning_rate": 8.381944444444445e-07, |
| "loss": 0.0002, |
| "reward": 1.4375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 1.0, |
| "step": 233 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 456.03125, |
| "epoch": 0.4875, |
| "grad_norm": 1.3863449096679688, |
| "kl": 0.162109375, |
| "learning_rate": 8.375e-07, |
| "loss": 0.0002, |
| "reward": 1.3125, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.75, |
| "step": 234 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 461.75, |
| "epoch": 0.4895833333333333, |
| "grad_norm": 3.7418622970581055, |
| "kl": 0.15869140625, |
| "learning_rate": 8.368055555555556e-07, |
| "loss": 0.0002, |
| "reward": 1.8125, |
| "reward_std": 0.408231720328331, |
| "rewards/accuracy_reward": 0.8125, |
| "rewards/format_reward": 1.0, |
| "step": 235 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 478.15625, |
| "epoch": 0.49166666666666664, |
| "grad_norm": 2.131373882293701, |
| "kl": 0.1455078125, |
| "learning_rate": 8.361111111111111e-07, |
| "loss": 0.0001, |
| "reward": 1.75, |
| "reward_std": 0.2587745785713196, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 1.0, |
| "step": 236 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 455.0, |
| "epoch": 0.49375, |
| "grad_norm": 1.503204584121704, |
| "kl": 0.147705078125, |
| "learning_rate": 8.354166666666667e-07, |
| "loss": 0.0001, |
| "reward": 1.9375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.9375, |
| "rewards/format_reward": 1.0, |
| "step": 237 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 473.5625, |
| "epoch": 0.49583333333333335, |
| "grad_norm": 1.8451569080352783, |
| "kl": 0.14990234375, |
| "learning_rate": 8.347222222222222e-07, |
| "loss": 0.0001, |
| "reward": 1.25, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 238 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 514.875, |
| "epoch": 0.4979166666666667, |
| "grad_norm": 0.005814376752823591, |
| "kl": 0.1365966796875, |
| "learning_rate": 8.340277777777778e-07, |
| "loss": 0.0001, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 239 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 505.5625, |
| "epoch": 0.5, |
| "grad_norm": 2.293483257293701, |
| "kl": 0.149169921875, |
| "learning_rate": 8.333333333333333e-07, |
| "loss": 0.0001, |
| "reward": 1.875, |
| "reward_std": 0.2925042062997818, |
| "rewards/accuracy_reward": 0.875, |
| "rewards/format_reward": 1.0, |
| "step": 240 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 552.78125, |
| "epoch": 0.5020833333333333, |
| "grad_norm": 1.321162462234497, |
| "kl": 0.13720703125, |
| "learning_rate": 8.326388888888889e-07, |
| "loss": 0.0001, |
| "reward": 1.9375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.9375, |
| "rewards/format_reward": 1.0, |
| "step": 241 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 505.28125, |
| "epoch": 0.5041666666666667, |
| "grad_norm": 2.4420886039733887, |
| "kl": 0.17236328125, |
| "learning_rate": 8.319444444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.28125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.75, |
| "step": 242 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 532.8125, |
| "epoch": 0.50625, |
| "grad_norm": 2.3468263149261475, |
| "kl": 0.162109375, |
| "learning_rate": 8.3125e-07, |
| "loss": 0.0002, |
| "reward": 0.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.5, |
| "step": 243 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 508.0, |
| "epoch": 0.5083333333333333, |
| "grad_norm": 2.658416509628296, |
| "kl": 0.148681640625, |
| "learning_rate": 8.305555555555555e-07, |
| "loss": 0.0001, |
| "reward": 1.375, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.75, |
| "step": 244 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 548.59375, |
| "epoch": 0.5104166666666666, |
| "grad_norm": 2.731208324432373, |
| "kl": 0.137451171875, |
| "learning_rate": 8.298611111111111e-07, |
| "loss": 0.0001, |
| "reward": 1.4375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.75, |
| "step": 245 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 537.75, |
| "epoch": 0.5125, |
| "grad_norm": 1.5126349925994873, |
| "kl": 0.13818359375, |
| "learning_rate": 8.291666666666666e-07, |
| "loss": 0.0001, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 246 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 570.125, |
| "epoch": 0.5145833333333333, |
| "grad_norm": 0.9014095664024353, |
| "kl": 0.147216796875, |
| "learning_rate": 8.284722222222223e-07, |
| "loss": 0.0001, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 247 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 569.03125, |
| "epoch": 0.5166666666666667, |
| "grad_norm": 4.276453018188477, |
| "kl": 0.1531982421875, |
| "learning_rate": 8.277777777777777e-07, |
| "loss": 0.0002, |
| "reward": 1.65625, |
| "reward_std": 0.3787454217672348, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 1.0, |
| "step": 248 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 532.28125, |
| "epoch": 0.51875, |
| "grad_norm": 1.9292553663253784, |
| "kl": 0.1513671875, |
| "learning_rate": 8.270833333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.0625, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/format_reward": 0.75, |
| "step": 249 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 567.3125, |
| "epoch": 0.5208333333333334, |
| "grad_norm": 0.9851927161216736, |
| "kl": 0.151123046875, |
| "learning_rate": 8.263888888888888e-07, |
| "loss": 0.0002, |
| "reward": 1.1875, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.75, |
| "step": 250 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 552.25, |
| "epoch": 0.5229166666666667, |
| "grad_norm": 0.006976987235248089, |
| "kl": 0.158447265625, |
| "learning_rate": 8.256944444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 251 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 511.84375, |
| "epoch": 0.525, |
| "grad_norm": 1.9051716327667236, |
| "kl": 0.150146484375, |
| "learning_rate": 8.249999999999999e-07, |
| "loss": 0.0002, |
| "reward": 1.84375, |
| "reward_std": 0.24511480331420898, |
| "rewards/accuracy_reward": 0.84375, |
| "rewards/format_reward": 1.0, |
| "step": 252 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 629.25, |
| "epoch": 0.5270833333333333, |
| "grad_norm": 1.9278944730758667, |
| "kl": 0.143798828125, |
| "learning_rate": 8.243055555555555e-07, |
| "loss": 0.0001, |
| "reward": 0.78125, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.5, |
| "step": 253 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 527.25, |
| "epoch": 0.5291666666666667, |
| "grad_norm": 1.7085317373275757, |
| "kl": 0.160888671875, |
| "learning_rate": 8.23611111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.40625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.75, |
| "step": 254 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 549.59375, |
| "epoch": 0.53125, |
| "grad_norm": 0.015542632900178432, |
| "kl": 0.159423828125, |
| "learning_rate": 8.229166666666666e-07, |
| "loss": 0.0002, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 255 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 546.15625, |
| "epoch": 0.5333333333333333, |
| "grad_norm": 2.3451013565063477, |
| "kl": 0.15869140625, |
| "learning_rate": 8.222222222222221e-07, |
| "loss": 0.0002, |
| "reward": 1.71875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 1.0, |
| "step": 256 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 531.0, |
| "epoch": 0.5354166666666667, |
| "grad_norm": 2.0487141609191895, |
| "kl": 0.140869140625, |
| "learning_rate": 8.215277777777777e-07, |
| "loss": 0.0001, |
| "reward": 1.40625, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.75, |
| "step": 257 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 562.46875, |
| "epoch": 0.5375, |
| "grad_norm": 1.092297911643982, |
| "kl": 0.16064453125, |
| "learning_rate": 8.208333333333332e-07, |
| "loss": 0.0002, |
| "reward": 0.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.25, |
| "step": 258 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 537.3125, |
| "epoch": 0.5395833333333333, |
| "grad_norm": 1.0972661972045898, |
| "kl": 0.15966796875, |
| "learning_rate": 8.201388888888888e-07, |
| "loss": 0.0002, |
| "reward": 0.9375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.5, |
| "step": 259 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 578.96875, |
| "epoch": 0.5416666666666666, |
| "grad_norm": 0.0056478967890143394, |
| "kl": 0.149658203125, |
| "learning_rate": 8.194444444444443e-07, |
| "loss": 0.0001, |
| "reward": 0.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.25, |
| "step": 260 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 592.65625, |
| "epoch": 0.54375, |
| "grad_norm": 1.2728099822998047, |
| "kl": 0.14697265625, |
| "learning_rate": 8.187499999999999e-07, |
| "loss": 0.0001, |
| "reward": 0.9375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.5, |
| "step": 261 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 536.5625, |
| "epoch": 0.5458333333333333, |
| "grad_norm": 1.4108928442001343, |
| "kl": 0.162353515625, |
| "learning_rate": 8.180555555555555e-07, |
| "loss": 0.0002, |
| "reward": 1.375, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.75, |
| "step": 262 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 580.46875, |
| "epoch": 0.5479166666666667, |
| "grad_norm": 0.014726191759109497, |
| "kl": 0.161865234375, |
| "learning_rate": 8.173611111111111e-07, |
| "loss": 0.0002, |
| "reward": 0.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.25, |
| "step": 263 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 562.375, |
| "epoch": 0.55, |
| "grad_norm": 1.6673963069915771, |
| "kl": 0.157470703125, |
| "learning_rate": 8.166666666666666e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.75, |
| "step": 264 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 582.0, |
| "epoch": 0.5520833333333334, |
| "grad_norm": 3.3869149684906006, |
| "kl": 0.143798828125, |
| "learning_rate": 8.159722222222222e-07, |
| "loss": 0.0001, |
| "reward": 1.5, |
| "reward_std": 0.3514062538743019, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 1.0, |
| "step": 265 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 551.21875, |
| "epoch": 0.5541666666666667, |
| "grad_norm": 1.9510457515716553, |
| "kl": 0.192626953125, |
| "learning_rate": 8.152777777777777e-07, |
| "loss": 0.0002, |
| "reward": 1.1875, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.75, |
| "step": 266 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 526.03125, |
| "epoch": 0.55625, |
| "grad_norm": 1.3092948198318481, |
| "kl": 0.175537109375, |
| "learning_rate": 8.145833333333333e-07, |
| "loss": 0.0002, |
| "reward": 0.375, |
| "reward_std": 0.13363061845302582, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.25, |
| "step": 267 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 506.53125, |
| "epoch": 0.5583333333333333, |
| "grad_norm": 0.09412389248609543, |
| "kl": 0.2783203125, |
| "learning_rate": 8.138888888888888e-07, |
| "loss": 0.0003, |
| "reward": 0.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.25, |
| "step": 268 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 522.46875, |
| "epoch": 0.5604166666666667, |
| "grad_norm": 1.6767691373825073, |
| "kl": 0.16552734375, |
| "learning_rate": 8.131944444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.78125, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.78125, |
| "rewards/format_reward": 1.0, |
| "step": 269 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 514.8125, |
| "epoch": 0.5625, |
| "grad_norm": 1.0303244590759277, |
| "kl": 0.160888671875, |
| "learning_rate": 8.125e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 270 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 503.5625, |
| "epoch": 0.5645833333333333, |
| "grad_norm": 3.553706169128418, |
| "kl": 0.169189453125, |
| "learning_rate": 8.118055555555555e-07, |
| "loss": 0.0002, |
| "reward": 1.65625, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.96875, |
| "step": 271 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 533.65625, |
| "epoch": 0.5666666666666667, |
| "grad_norm": 1.0599257946014404, |
| "kl": 0.183837890625, |
| "learning_rate": 8.11111111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.0625, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/format_reward": 0.75, |
| "step": 272 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 518.90625, |
| "epoch": 0.56875, |
| "grad_norm": 2.1599457263946533, |
| "kl": 0.187744140625, |
| "learning_rate": 8.104166666666666e-07, |
| "loss": 0.0002, |
| "reward": 0.875, |
| "reward_std": 0.2314550280570984, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/format_reward": 0.5, |
| "step": 273 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 502.65625, |
| "epoch": 0.5708333333333333, |
| "grad_norm": 4.606043338775635, |
| "kl": 0.165771484375, |
| "learning_rate": 8.097222222222222e-07, |
| "loss": 0.0002, |
| "reward": 0.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.5, |
| "step": 274 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 454.125, |
| "epoch": 0.5729166666666666, |
| "grad_norm": 1.3466901779174805, |
| "kl": 0.181396484375, |
| "learning_rate": 8.090277777777777e-07, |
| "loss": 0.0002, |
| "reward": 1.1875, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.75, |
| "step": 275 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 494.65625, |
| "epoch": 0.575, |
| "grad_norm": 0.010240813717246056, |
| "kl": 0.19287109375, |
| "learning_rate": 8.083333333333334e-07, |
| "loss": 0.0002, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 276 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 504.8125, |
| "epoch": 0.5770833333333333, |
| "grad_norm": 12.152362823486328, |
| "kl": 0.1728515625, |
| "learning_rate": 8.076388888888889e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.3650856465101242, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 1.0, |
| "step": 277 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 483.96875, |
| "epoch": 0.5791666666666667, |
| "grad_norm": 11.584708213806152, |
| "kl": 0.1640625, |
| "learning_rate": 8.069444444444445e-07, |
| "loss": 0.0002, |
| "reward": 1.21875, |
| "reward_std": 0.2651650384068489, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 278 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 524.0, |
| "epoch": 0.58125, |
| "grad_norm": 1.5547815561294556, |
| "kl": 0.181640625, |
| "learning_rate": 8.0625e-07, |
| "loss": 0.0002, |
| "reward": 1.21875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 279 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 482.96875, |
| "epoch": 0.5833333333333334, |
| "grad_norm": 1.335976004600525, |
| "kl": 0.177490234375, |
| "learning_rate": 8.055555555555556e-07, |
| "loss": 0.0002, |
| "reward": 0.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.25, |
| "step": 280 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 492.96875, |
| "epoch": 0.5854166666666667, |
| "grad_norm": 2.280996322631836, |
| "kl": 0.189208984375, |
| "learning_rate": 8.048611111111111e-07, |
| "loss": 0.0002, |
| "reward": 0.6875, |
| "reward_std": 0.2587745785713196, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.5, |
| "step": 281 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 507.03125, |
| "epoch": 0.5875, |
| "grad_norm": 1.1769981384277344, |
| "kl": 0.16650390625, |
| "learning_rate": 8.041666666666667e-07, |
| "loss": 0.0002, |
| "reward": 0.9375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.75, |
| "step": 282 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 537.75, |
| "epoch": 0.5895833333333333, |
| "grad_norm": 1.1851226091384888, |
| "kl": 0.1767578125, |
| "learning_rate": 8.034722222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.84375, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.84375, |
| "rewards/format_reward": 1.0, |
| "step": 283 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 514.65625, |
| "epoch": 0.5916666666666667, |
| "grad_norm": 1.5428513288497925, |
| "kl": 0.178955078125, |
| "learning_rate": 8.027777777777778e-07, |
| "loss": 0.0002, |
| "reward": 1.1875, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.75, |
| "step": 284 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 527.53125, |
| "epoch": 0.59375, |
| "grad_norm": 0.058219242841005325, |
| "kl": 0.23583984375, |
| "learning_rate": 8.020833333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 1.0, |
| "step": 285 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 487.15625, |
| "epoch": 0.5958333333333333, |
| "grad_norm": 1.0132204294204712, |
| "kl": 0.181884765625, |
| "learning_rate": 8.013888888888889e-07, |
| "loss": 0.0002, |
| "reward": 1.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.96875, |
| "rewards/format_reward": 1.0, |
| "step": 286 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 514.4375, |
| "epoch": 0.5979166666666667, |
| "grad_norm": 0.008692107163369656, |
| "kl": 0.1728515625, |
| "learning_rate": 8.006944444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 1.0, |
| "step": 287 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 480.15625, |
| "epoch": 0.6, |
| "grad_norm": 0.03367244824767113, |
| "kl": 0.18310546875, |
| "learning_rate": 8e-07, |
| "loss": 0.0002, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 288 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 509.3125, |
| "epoch": 0.6020833333333333, |
| "grad_norm": 0.008252977393567562, |
| "kl": 0.1708984375, |
| "learning_rate": 7.993055555555555e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 289 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 488.375, |
| "epoch": 0.6041666666666666, |
| "grad_norm": 2.225816249847412, |
| "kl": 0.173095703125, |
| "learning_rate": 7.986111111111112e-07, |
| "loss": 0.0002, |
| "reward": 1.53125, |
| "reward_std": 0.24511480331420898, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 1.0, |
| "step": 290 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 531.0625, |
| "epoch": 0.60625, |
| "grad_norm": 0.021306902170181274, |
| "kl": 0.166259765625, |
| "learning_rate": 7.979166666666667e-07, |
| "loss": 0.0002, |
| "reward": 0.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.5, |
| "step": 291 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 535.875, |
| "epoch": 0.6083333333333333, |
| "grad_norm": 1.042975664138794, |
| "kl": 0.16015625, |
| "learning_rate": 7.972222222222223e-07, |
| "loss": 0.0002, |
| "reward": 1.65625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 1.0, |
| "step": 292 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 499.875, |
| "epoch": 0.6104166666666667, |
| "grad_norm": 1.670255184173584, |
| "kl": 0.168212890625, |
| "learning_rate": 7.965277777777777e-07, |
| "loss": 0.0002, |
| "reward": 0.875, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/format_reward": 0.5, |
| "step": 293 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 484.09375, |
| "epoch": 0.6125, |
| "grad_norm": 1.954439640045166, |
| "kl": 0.171142578125, |
| "learning_rate": 7.958333333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.375, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.75, |
| "step": 294 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 519.59375, |
| "epoch": 0.6145833333333334, |
| "grad_norm": 1.7601760625839233, |
| "kl": 0.1767578125, |
| "learning_rate": 7.951388888888888e-07, |
| "loss": 0.0002, |
| "reward": 1.90625, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.90625, |
| "rewards/format_reward": 1.0, |
| "step": 295 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 491.34375, |
| "epoch": 0.6166666666666667, |
| "grad_norm": 10.422201156616211, |
| "kl": 0.1806640625, |
| "learning_rate": 7.944444444444444e-07, |
| "loss": 0.0002, |
| "reward": 0.90625, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.5, |
| "step": 296 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 509.375, |
| "epoch": 0.61875, |
| "grad_norm": 0.028056718409061432, |
| "kl": 0.180419921875, |
| "learning_rate": 7.937499999999999e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 1.0, |
| "step": 297 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 482.65625, |
| "epoch": 0.6208333333333333, |
| "grad_norm": 1.2360841035842896, |
| "kl": 0.1806640625, |
| "learning_rate": 7.930555555555555e-07, |
| "loss": 0.0002, |
| "reward": 0.90625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.5, |
| "step": 298 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 421.71875, |
| "epoch": 0.6229166666666667, |
| "grad_norm": 2.3312301635742188, |
| "kl": 0.180419921875, |
| "learning_rate": 7.92361111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.75, |
| "reward_std": 0.3335031494498253, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 1.0, |
| "step": 299 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 493.3125, |
| "epoch": 0.625, |
| "grad_norm": 0.00832283217459917, |
| "kl": 0.1640625, |
| "learning_rate": 7.916666666666666e-07, |
| "loss": 0.0002, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.0, |
| "step": 300 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 462.59375, |
| "epoch": 0.6270833333333333, |
| "grad_norm": 1.8436075448989868, |
| "kl": 0.178466796875, |
| "learning_rate": 7.909722222222221e-07, |
| "loss": 0.0002, |
| "reward": 1.40625, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.71875, |
| "step": 301 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 450.21875, |
| "epoch": 0.6291666666666667, |
| "grad_norm": 0.011904319748282433, |
| "kl": 0.176025390625, |
| "learning_rate": 7.902777777777777e-07, |
| "loss": 0.0002, |
| "reward": 0.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.25, |
| "step": 302 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 485.90625, |
| "epoch": 0.63125, |
| "grad_norm": 1.144376516342163, |
| "kl": 0.183837890625, |
| "learning_rate": 7.895833333333332e-07, |
| "loss": 0.0002, |
| "reward": 1.34375, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.75, |
| "step": 303 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 467.84375, |
| "epoch": 0.6333333333333333, |
| "grad_norm": 3.746612310409546, |
| "kl": 0.182373046875, |
| "learning_rate": 7.888888888888889e-07, |
| "loss": 0.0002, |
| "reward": 1.375, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.75, |
| "step": 304 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 480.9375, |
| "epoch": 0.6354166666666666, |
| "grad_norm": 2.1090877056121826, |
| "kl": 0.1728515625, |
| "learning_rate": 7.881944444444444e-07, |
| "loss": 0.0002, |
| "reward": 0.75, |
| "reward_std": 0.13363061845302582, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.46875, |
| "step": 305 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 449.40625, |
| "epoch": 0.6375, |
| "grad_norm": 0.009726927615702152, |
| "kl": 0.17724609375, |
| "learning_rate": 7.875e-07, |
| "loss": 0.0002, |
| "reward": 0.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.25, |
| "step": 306 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 452.75, |
| "epoch": 0.6395833333333333, |
| "grad_norm": 3.932494640350342, |
| "kl": 0.184326171875, |
| "learning_rate": 7.868055555555555e-07, |
| "loss": 0.0002, |
| "reward": 1.9375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.9375, |
| "rewards/format_reward": 1.0, |
| "step": 307 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 428.875, |
| "epoch": 0.6416666666666667, |
| "grad_norm": 2.0402674674987793, |
| "kl": 0.180908203125, |
| "learning_rate": 7.861111111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.875, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.875, |
| "rewards/format_reward": 1.0, |
| "step": 308 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 451.625, |
| "epoch": 0.64375, |
| "grad_norm": 2.2270169258117676, |
| "kl": 0.180908203125, |
| "learning_rate": 7.854166666666666e-07, |
| "loss": 0.0002, |
| "reward": 0.3125, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.25, |
| "step": 309 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 417.34375, |
| "epoch": 0.6458333333333334, |
| "grad_norm": 2.4076709747314453, |
| "kl": 0.183837890625, |
| "learning_rate": 7.847222222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.8125, |
| "reward_std": 0.3335031494498253, |
| "rewards/accuracy_reward": 0.8125, |
| "rewards/format_reward": 1.0, |
| "step": 310 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 440.4375, |
| "epoch": 0.6479166666666667, |
| "grad_norm": 1.1454743146896362, |
| "kl": 0.185791015625, |
| "learning_rate": 7.840277777777777e-07, |
| "loss": 0.0002, |
| "reward": 0.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.5, |
| "step": 311 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 413.1875, |
| "epoch": 0.65, |
| "grad_norm": 2.555177927017212, |
| "kl": 0.1884765625, |
| "learning_rate": 7.833333333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.375, |
| "reward_std": 0.2925042062997818, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.75, |
| "step": 312 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 437.09375, |
| "epoch": 0.6520833333333333, |
| "grad_norm": 1.292601227760315, |
| "kl": 0.180419921875, |
| "learning_rate": 7.826388888888888e-07, |
| "loss": 0.0002, |
| "reward": 1.1875, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.75, |
| "step": 313 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 426.5625, |
| "epoch": 0.6541666666666667, |
| "grad_norm": 6.739890098571777, |
| "kl": 0.18212890625, |
| "learning_rate": 7.819444444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.625, |
| "reward_std": 0.3104073107242584, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 1.0, |
| "step": 314 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 384.65625, |
| "epoch": 0.65625, |
| "grad_norm": 1.3566937446594238, |
| "kl": 0.197998046875, |
| "learning_rate": 7.812499999999999e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 315 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 426.09375, |
| "epoch": 0.6583333333333333, |
| "grad_norm": 1.7616151571273804, |
| "kl": 0.17529296875, |
| "learning_rate": 7.805555555555555e-07, |
| "loss": 0.0002, |
| "reward": 1.34375, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.75, |
| "step": 316 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 419.5, |
| "epoch": 0.6604166666666667, |
| "grad_norm": 1.9657090902328491, |
| "kl": 0.206298828125, |
| "learning_rate": 7.79861111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.28125, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.75, |
| "step": 317 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 403.21875, |
| "epoch": 0.6625, |
| "grad_norm": 3.3363990783691406, |
| "kl": 0.18115234375, |
| "learning_rate": 7.791666666666667e-07, |
| "loss": 0.0002, |
| "reward": 1.15625, |
| "reward_std": 0.22201896458864212, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.75, |
| "step": 318 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 403.9375, |
| "epoch": 0.6645833333333333, |
| "grad_norm": 1.5157190561294556, |
| "kl": 0.207763671875, |
| "learning_rate": 7.784722222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.4375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.75, |
| "step": 319 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 428.46875, |
| "epoch": 0.6666666666666666, |
| "grad_norm": 0.01322422455996275, |
| "kl": 0.200439453125, |
| "learning_rate": 7.777777777777778e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 320 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 401.5, |
| "epoch": 0.66875, |
| "grad_norm": 1.8207961320877075, |
| "kl": 0.1865234375, |
| "learning_rate": 7.770833333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.21875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 321 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 431.15625, |
| "epoch": 0.6708333333333333, |
| "grad_norm": 2.6609857082366943, |
| "kl": 0.181884765625, |
| "learning_rate": 7.763888888888889e-07, |
| "loss": 0.0002, |
| "reward": 1.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.96875, |
| "rewards/format_reward": 1.0, |
| "step": 322 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 413.71875, |
| "epoch": 0.6729166666666667, |
| "grad_norm": 0.01364789716899395, |
| "kl": 0.18896484375, |
| "learning_rate": 7.756944444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 323 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 438.25, |
| "epoch": 0.675, |
| "grad_norm": 1.3705228567123413, |
| "kl": 0.185546875, |
| "learning_rate": 7.75e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 324 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 437.90625, |
| "epoch": 0.6770833333333334, |
| "grad_norm": 1.159696340560913, |
| "kl": 0.17724609375, |
| "learning_rate": 7.743055555555555e-07, |
| "loss": 0.0002, |
| "reward": 1.15625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.75, |
| "step": 325 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 441.28125, |
| "epoch": 0.6791666666666667, |
| "grad_norm": 0.011270968243479729, |
| "kl": 0.187744140625, |
| "learning_rate": 7.736111111111111e-07, |
| "loss": 0.0002, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 326 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 431.40625, |
| "epoch": 0.68125, |
| "grad_norm": 2.9195308685302734, |
| "kl": 0.18310546875, |
| "learning_rate": 7.729166666666666e-07, |
| "loss": 0.0002, |
| "reward": 1.40625, |
| "reward_std": 0.2651650384068489, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.71875, |
| "step": 327 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 445.6875, |
| "epoch": 0.6833333333333333, |
| "grad_norm": 6.094662666320801, |
| "kl": 0.196044921875, |
| "learning_rate": 7.722222222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 328 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 450.625, |
| "epoch": 0.6854166666666667, |
| "grad_norm": 1.3171031475067139, |
| "kl": 0.18212890625, |
| "learning_rate": 7.715277777777777e-07, |
| "loss": 0.0002, |
| "reward": 0.78125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.5, |
| "step": 329 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 462.59375, |
| "epoch": 0.6875, |
| "grad_norm": 2.229274034500122, |
| "kl": 0.191162109375, |
| "learning_rate": 7.708333333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.90625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.90625, |
| "rewards/format_reward": 1.0, |
| "step": 330 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 423.5625, |
| "epoch": 0.6895833333333333, |
| "grad_norm": 2.1081833839416504, |
| "kl": 0.203857421875, |
| "learning_rate": 7.701388888888888e-07, |
| "loss": 0.0002, |
| "reward": 1.34375, |
| "reward_std": 0.2773705795407295, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.71875, |
| "step": 331 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 446.5625, |
| "epoch": 0.6916666666666667, |
| "grad_norm": 0.007810765411704779, |
| "kl": 0.1826171875, |
| "learning_rate": 7.694444444444445e-07, |
| "loss": 0.0002, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 332 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 421.4375, |
| "epoch": 0.69375, |
| "grad_norm": 2.289745569229126, |
| "kl": 0.190185546875, |
| "learning_rate": 7.6875e-07, |
| "loss": 0.0002, |
| "reward": 1.375, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/format_reward": 1.0, |
| "step": 333 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 443.3125, |
| "epoch": 0.6958333333333333, |
| "grad_norm": 1.1293355226516724, |
| "kl": 0.192626953125, |
| "learning_rate": 7.680555555555556e-07, |
| "loss": 0.0002, |
| "reward": 1.21875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 334 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 410.21875, |
| "epoch": 0.6979166666666666, |
| "grad_norm": 1.968024492263794, |
| "kl": 0.200927734375, |
| "learning_rate": 7.673611111111112e-07, |
| "loss": 0.0002, |
| "reward": 0.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.5, |
| "step": 335 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 456.46875, |
| "epoch": 0.7, |
| "grad_norm": 2.0755598545074463, |
| "kl": 0.206298828125, |
| "learning_rate": 7.666666666666667e-07, |
| "loss": 0.0002, |
| "reward": 1.875, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.875, |
| "rewards/format_reward": 1.0, |
| "step": 336 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 449.0, |
| "epoch": 0.7020833333333333, |
| "grad_norm": 1.1718143224716187, |
| "kl": 0.2001953125, |
| "learning_rate": 7.659722222222223e-07, |
| "loss": 0.0002, |
| "reward": 0.375, |
| "reward_std": 0.18898223340511322, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.21875, |
| "step": 337 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 436.0, |
| "epoch": 0.7041666666666667, |
| "grad_norm": 1.2479380369186401, |
| "kl": 0.199462890625, |
| "learning_rate": 7.652777777777777e-07, |
| "loss": 0.0002, |
| "reward": 1.625, |
| "reward_std": 0.13363061845302582, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 1.0, |
| "step": 338 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 400.40625, |
| "epoch": 0.70625, |
| "grad_norm": 1.1230976581573486, |
| "kl": 0.204833984375, |
| "learning_rate": 7.645833333333332e-07, |
| "loss": 0.0002, |
| "reward": 0.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.5, |
| "step": 339 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 422.46875, |
| "epoch": 0.7083333333333334, |
| "grad_norm": 1.2302477359771729, |
| "kl": 0.209228515625, |
| "learning_rate": 7.638888888888888e-07, |
| "loss": 0.0002, |
| "reward": 1.53125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 1.0, |
| "step": 340 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 392.40625, |
| "epoch": 0.7104166666666667, |
| "grad_norm": 0.010557924397289753, |
| "kl": 0.21630859375, |
| "learning_rate": 7.631944444444443e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 341 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 427.0, |
| "epoch": 0.7125, |
| "grad_norm": 2.4895095825195312, |
| "kl": 0.206298828125, |
| "learning_rate": 7.624999999999999e-07, |
| "loss": 0.0002, |
| "reward": 1.5625, |
| "reward_std": 0.4355512708425522, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 1.0, |
| "step": 342 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 434.96875, |
| "epoch": 0.7145833333333333, |
| "grad_norm": 1.8133002519607544, |
| "kl": 0.19384765625, |
| "learning_rate": 7.618055555555554e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 343 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 430.1875, |
| "epoch": 0.7166666666666667, |
| "grad_norm": 0.015499824658036232, |
| "kl": 0.216552734375, |
| "learning_rate": 7.61111111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 1.0, |
| "step": 344 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 428.46875, |
| "epoch": 0.71875, |
| "grad_norm": 0.012258811853826046, |
| "kl": 0.20947265625, |
| "learning_rate": 7.604166666666666e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 345 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 423.125, |
| "epoch": 0.7208333333333333, |
| "grad_norm": 1.241074562072754, |
| "kl": 0.1982421875, |
| "learning_rate": 7.597222222222221e-07, |
| "loss": 0.0002, |
| "reward": 0.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.25, |
| "step": 346 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 426.28125, |
| "epoch": 0.7229166666666667, |
| "grad_norm": 2.9186177253723145, |
| "kl": 0.204345703125, |
| "learning_rate": 7.590277777777778e-07, |
| "loss": 0.0002, |
| "reward": 0.6875, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.5, |
| "step": 347 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 429.46875, |
| "epoch": 0.725, |
| "grad_norm": 2.4491827487945557, |
| "kl": 0.199951171875, |
| "learning_rate": 7.583333333333333e-07, |
| "loss": 0.0002, |
| "reward": 0.90625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.5, |
| "step": 348 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 432.15625, |
| "epoch": 0.7270833333333333, |
| "grad_norm": 1.914108157157898, |
| "kl": 0.21044921875, |
| "learning_rate": 7.576388888888889e-07, |
| "loss": 0.0002, |
| "reward": 0.9375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.5, |
| "step": 349 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 419.53125, |
| "epoch": 0.7291666666666666, |
| "grad_norm": 2.4412832260131836, |
| "kl": 0.236572265625, |
| "learning_rate": 7.569444444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.375, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.75, |
| "step": 350 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 452.34375, |
| "epoch": 0.73125, |
| "grad_norm": 0.007453648839145899, |
| "kl": 0.197509765625, |
| "learning_rate": 7.5625e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 351 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 448.0625, |
| "epoch": 0.7333333333333333, |
| "grad_norm": 2.017571449279785, |
| "kl": 0.290283203125, |
| "learning_rate": 7.555555555555555e-07, |
| "loss": 0.0003, |
| "reward": 1.875, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.875, |
| "rewards/format_reward": 1.0, |
| "step": 352 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 449.875, |
| "epoch": 0.7354166666666667, |
| "grad_norm": 1.6187738180160522, |
| "kl": 0.20166015625, |
| "learning_rate": 7.548611111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.90625, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.90625, |
| "rewards/format_reward": 1.0, |
| "step": 353 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 480.71875, |
| "epoch": 0.7375, |
| "grad_norm": 1.283926248550415, |
| "kl": 0.2021484375, |
| "learning_rate": 7.541666666666666e-07, |
| "loss": 0.0002, |
| "reward": 0.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.5, |
| "step": 354 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 453.125, |
| "epoch": 0.7395833333333334, |
| "grad_norm": 1.8160797357559204, |
| "kl": 0.226806640625, |
| "learning_rate": 7.534722222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.375, |
| "reward_std": 0.2314550280570984, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.75, |
| "step": 355 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 456.15625, |
| "epoch": 0.7416666666666667, |
| "grad_norm": 2.673011541366577, |
| "kl": 0.205078125, |
| "learning_rate": 7.527777777777777e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.3514062538743019, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 356 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 459.96875, |
| "epoch": 0.74375, |
| "grad_norm": 1.9269709587097168, |
| "kl": 0.19482421875, |
| "learning_rate": 7.520833333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.4375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.75, |
| "step": 357 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 456.09375, |
| "epoch": 0.7458333333333333, |
| "grad_norm": 12.258075714111328, |
| "kl": 0.20458984375, |
| "learning_rate": 7.513888888888888e-07, |
| "loss": 0.0002, |
| "reward": 1.9375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.9375, |
| "rewards/format_reward": 1.0, |
| "step": 358 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 419.71875, |
| "epoch": 0.7479166666666667, |
| "grad_norm": 2.0176916122436523, |
| "kl": 0.210205078125, |
| "learning_rate": 7.506944444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.21875, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 359 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 425.75, |
| "epoch": 0.75, |
| "grad_norm": 1.29171621799469, |
| "kl": 0.2138671875, |
| "learning_rate": 7.5e-07, |
| "loss": 0.0002, |
| "reward": 1.71875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 1.0, |
| "step": 360 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 421.0625, |
| "epoch": 0.7520833333333333, |
| "grad_norm": 1.6412715911865234, |
| "kl": 0.21044921875, |
| "learning_rate": 7.493055555555556e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 361 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 426.15625, |
| "epoch": 0.7541666666666667, |
| "grad_norm": 2.2163078784942627, |
| "kl": 0.227783203125, |
| "learning_rate": 7.486111111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.40625, |
| "reward_std": 0.2651650384068489, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.75, |
| "step": 362 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 432.28125, |
| "epoch": 0.75625, |
| "grad_norm": 0.5271323323249817, |
| "kl": 0.562744140625, |
| "learning_rate": 7.479166666666667e-07, |
| "loss": 0.0006, |
| "reward": 0.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.5, |
| "step": 363 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 422.5, |
| "epoch": 0.7583333333333333, |
| "grad_norm": 3.3099429607391357, |
| "kl": 0.22119140625, |
| "learning_rate": 7.472222222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.84375, |
| "reward_std": 0.3061639815568924, |
| "rewards/accuracy_reward": 0.84375, |
| "rewards/format_reward": 1.0, |
| "step": 364 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 434.96875, |
| "epoch": 0.7604166666666666, |
| "grad_norm": 0.02184051275253296, |
| "kl": 0.22314453125, |
| "learning_rate": 7.465277777777778e-07, |
| "loss": 0.0002, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.0, |
| "rewards/format_reward": 1.0, |
| "step": 365 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 397.84375, |
| "epoch": 0.7625, |
| "grad_norm": 2.9574718475341797, |
| "kl": 0.2255859375, |
| "learning_rate": 7.458333333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.1875, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.75, |
| "step": 366 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 427.90625, |
| "epoch": 0.7645833333333333, |
| "grad_norm": 1.6166619062423706, |
| "kl": 0.227294921875, |
| "learning_rate": 7.451388888888889e-07, |
| "loss": 0.0002, |
| "reward": 0.65625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.5, |
| "step": 367 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 400.875, |
| "epoch": 0.7666666666666667, |
| "grad_norm": 1.139664888381958, |
| "kl": 0.222412109375, |
| "learning_rate": 7.444444444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 368 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 408.625, |
| "epoch": 0.76875, |
| "grad_norm": 1.73232901096344, |
| "kl": 0.248779296875, |
| "learning_rate": 7.4375e-07, |
| "loss": 0.0002, |
| "reward": 1.90625, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.90625, |
| "rewards/format_reward": 1.0, |
| "step": 369 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 412.625, |
| "epoch": 0.7708333333333334, |
| "grad_norm": 0.019330745562911034, |
| "kl": 0.22314453125, |
| "learning_rate": 7.430555555555555e-07, |
| "loss": 0.0002, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.75, |
| "step": 370 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 397.0, |
| "epoch": 0.7729166666666667, |
| "grad_norm": 2.8990607261657715, |
| "kl": 0.215087890625, |
| "learning_rate": 7.423611111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.71875, |
| "reward_std": 0.24511480331420898, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 1.0, |
| "step": 371 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 416.6875, |
| "epoch": 0.775, |
| "grad_norm": 1.9207866191864014, |
| "kl": 0.217529296875, |
| "learning_rate": 7.416666666666666e-07, |
| "loss": 0.0002, |
| "reward": 1.4375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.75, |
| "step": 372 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 362.5625, |
| "epoch": 0.7770833333333333, |
| "grad_norm": 0.010317516513168812, |
| "kl": 0.248779296875, |
| "learning_rate": 7.409722222222222e-07, |
| "loss": 0.0002, |
| "reward": 0.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.25, |
| "step": 373 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 374.5625, |
| "epoch": 0.7791666666666667, |
| "grad_norm": 1.9645934104919434, |
| "kl": 0.240966796875, |
| "learning_rate": 7.402777777777778e-07, |
| "loss": 0.0002, |
| "reward": 1.9375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.9375, |
| "rewards/format_reward": 1.0, |
| "step": 374 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 380.875, |
| "epoch": 0.78125, |
| "grad_norm": 3.6975531578063965, |
| "kl": 0.228759765625, |
| "learning_rate": 7.395833333333334e-07, |
| "loss": 0.0002, |
| "reward": 1.84375, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.84375, |
| "rewards/format_reward": 1.0, |
| "step": 375 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 384.5, |
| "epoch": 0.7833333333333333, |
| "grad_norm": 0.015749173238873482, |
| "kl": 0.24609375, |
| "learning_rate": 7.388888888888889e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 376 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 355.125, |
| "epoch": 0.7854166666666667, |
| "grad_norm": 1.320116400718689, |
| "kl": 0.27734375, |
| "learning_rate": 7.381944444444445e-07, |
| "loss": 0.0003, |
| "reward": 0.34375, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.25, |
| "step": 377 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 386.40625, |
| "epoch": 0.7875, |
| "grad_norm": 1.221765398979187, |
| "kl": 0.247802734375, |
| "learning_rate": 7.375e-07, |
| "loss": 0.0002, |
| "reward": 0.4375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.25, |
| "step": 378 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 345.875, |
| "epoch": 0.7895833333333333, |
| "grad_norm": 2.8052141666412354, |
| "kl": 0.269287109375, |
| "learning_rate": 7.368055555555556e-07, |
| "loss": 0.0003, |
| "reward": 1.46875, |
| "reward_std": 0.2651650384068489, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 1.0, |
| "step": 379 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 354.96875, |
| "epoch": 0.7916666666666666, |
| "grad_norm": 1.387290596961975, |
| "kl": 0.241455078125, |
| "learning_rate": 7.361111111111111e-07, |
| "loss": 0.0002, |
| "reward": 0.90625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.5, |
| "step": 380 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 368.03125, |
| "epoch": 0.79375, |
| "grad_norm": 1.335808515548706, |
| "kl": 0.23779296875, |
| "learning_rate": 7.354166666666667e-07, |
| "loss": 0.0002, |
| "reward": 1.9375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.9375, |
| "rewards/format_reward": 1.0, |
| "step": 381 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 359.78125, |
| "epoch": 0.7958333333333333, |
| "grad_norm": 0.013379854150116444, |
| "kl": 0.244873046875, |
| "learning_rate": 7.347222222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 382 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 357.90625, |
| "epoch": 0.7979166666666667, |
| "grad_norm": 80.3907699584961, |
| "kl": 0.2431640625, |
| "learning_rate": 7.340277777777777e-07, |
| "loss": 0.0002, |
| "reward": 1.375, |
| "reward_std": 0.2925042062997818, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.75, |
| "step": 383 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 364.09375, |
| "epoch": 0.8, |
| "grad_norm": 0.011957396753132343, |
| "kl": 0.24072265625, |
| "learning_rate": 7.333333333333332e-07, |
| "loss": 0.0002, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.0, |
| "rewards/format_reward": 1.0, |
| "step": 384 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 320.0625, |
| "epoch": 0.8020833333333334, |
| "grad_norm": 2.5682637691497803, |
| "kl": 0.2353515625, |
| "learning_rate": 7.326388888888888e-07, |
| "loss": 0.0002, |
| "reward": 1.59375, |
| "reward_std": 0.22201896458864212, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 1.0, |
| "step": 385 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 349.0, |
| "epoch": 0.8041666666666667, |
| "grad_norm": 3.7549641132354736, |
| "kl": 0.228515625, |
| "learning_rate": 7.319444444444443e-07, |
| "loss": 0.0002, |
| "reward": 1.78125, |
| "reward_std": 0.24511480331420898, |
| "rewards/accuracy_reward": 0.78125, |
| "rewards/format_reward": 1.0, |
| "step": 386 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 358.53125, |
| "epoch": 0.80625, |
| "grad_norm": 0.009337302297353745, |
| "kl": 0.224609375, |
| "learning_rate": 7.312499999999999e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 387 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 360.34375, |
| "epoch": 0.8083333333333333, |
| "grad_norm": 1.9596688747406006, |
| "kl": 0.2333984375, |
| "learning_rate": 7.305555555555554e-07, |
| "loss": 0.0002, |
| "reward": 1.90625, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.90625, |
| "rewards/format_reward": 1.0, |
| "step": 388 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 330.0, |
| "epoch": 0.8104166666666667, |
| "grad_norm": 0.01328630093485117, |
| "kl": 0.242431640625, |
| "learning_rate": 7.298611111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 389 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 347.5, |
| "epoch": 0.8125, |
| "grad_norm": 3.5032737255096436, |
| "kl": 0.228271484375, |
| "learning_rate": 7.291666666666666e-07, |
| "loss": 0.0002, |
| "reward": 0.65625, |
| "reward_std": 0.22201896458864212, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.5, |
| "step": 390 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 340.875, |
| "epoch": 0.8145833333333333, |
| "grad_norm": 1.401888370513916, |
| "kl": 0.28076171875, |
| "learning_rate": 7.284722222222222e-07, |
| "loss": 0.0003, |
| "reward": 1.9375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.9375, |
| "rewards/format_reward": 1.0, |
| "step": 391 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 340.25, |
| "epoch": 0.8166666666666667, |
| "grad_norm": 1.3528683185577393, |
| "kl": 0.249755859375, |
| "learning_rate": 7.277777777777777e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 392 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 365.28125, |
| "epoch": 0.81875, |
| "grad_norm": 1.6087630987167358, |
| "kl": 0.22900390625, |
| "learning_rate": 7.270833333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.40625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 1.0, |
| "step": 393 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 385.9375, |
| "epoch": 0.8208333333333333, |
| "grad_norm": 2.008995771408081, |
| "kl": 0.228515625, |
| "learning_rate": 7.263888888888888e-07, |
| "loss": 0.0002, |
| "reward": 1.65625, |
| "reward_std": 0.22201896458864212, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 1.0, |
| "step": 394 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 366.625, |
| "epoch": 0.8229166666666666, |
| "grad_norm": 0.011729438789188862, |
| "kl": 0.25, |
| "learning_rate": 7.256944444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 395 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 353.78125, |
| "epoch": 0.825, |
| "grad_norm": 1.4965615272521973, |
| "kl": 0.25634765625, |
| "learning_rate": 7.249999999999999e-07, |
| "loss": 0.0003, |
| "reward": 0.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.5, |
| "step": 396 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 372.625, |
| "epoch": 0.8270833333333333, |
| "grad_norm": 1.816392183303833, |
| "kl": 0.222412109375, |
| "learning_rate": 7.243055555555555e-07, |
| "loss": 0.0002, |
| "reward": 0.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.5, |
| "step": 397 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 410.65625, |
| "epoch": 0.8291666666666667, |
| "grad_norm": 0.011323979124426842, |
| "kl": 0.22607421875, |
| "learning_rate": 7.23611111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 398 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 410.3125, |
| "epoch": 0.83125, |
| "grad_norm": 6.289952754974365, |
| "kl": 0.220703125, |
| "learning_rate": 7.229166666666666e-07, |
| "loss": 0.0002, |
| "reward": 0.78125, |
| "reward_std": 0.24511480331420898, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.5, |
| "step": 399 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 388.21875, |
| "epoch": 0.8333333333333334, |
| "grad_norm": 2.264404296875, |
| "kl": 0.20703125, |
| "learning_rate": 7.222222222222221e-07, |
| "loss": 0.0002, |
| "reward": 0.9375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.5, |
| "step": 400 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 404.875, |
| "epoch": 0.8354166666666667, |
| "grad_norm": 2.491809368133545, |
| "kl": 0.220703125, |
| "learning_rate": 7.215277777777777e-07, |
| "loss": 0.0002, |
| "reward": 1.84375, |
| "reward_std": 0.24511480331420898, |
| "rewards/accuracy_reward": 0.84375, |
| "rewards/format_reward": 1.0, |
| "step": 401 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 411.34375, |
| "epoch": 0.8375, |
| "grad_norm": 1.1848517656326294, |
| "kl": 0.21728515625, |
| "learning_rate": 7.208333333333332e-07, |
| "loss": 0.0002, |
| "reward": 1.71875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 1.0, |
| "step": 402 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 402.21875, |
| "epoch": 0.8395833333333333, |
| "grad_norm": 1.2143757343292236, |
| "kl": 0.22607421875, |
| "learning_rate": 7.201388888888889e-07, |
| "loss": 0.0002, |
| "reward": 0.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.5, |
| "step": 403 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 437.625, |
| "epoch": 0.8416666666666667, |
| "grad_norm": 0.008448748849332333, |
| "kl": 0.208984375, |
| "learning_rate": 7.194444444444445e-07, |
| "loss": 0.0002, |
| "reward": 0.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.25, |
| "step": 404 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 416.96875, |
| "epoch": 0.84375, |
| "grad_norm": 0.007129390258342028, |
| "kl": 0.20654296875, |
| "learning_rate": 7.1875e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 405 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 431.0625, |
| "epoch": 0.8458333333333333, |
| "grad_norm": 2.8381500244140625, |
| "kl": 0.219482421875, |
| "learning_rate": 7.180555555555556e-07, |
| "loss": 0.0002, |
| "reward": 1.65625, |
| "reward_std": 0.4628904387354851, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 1.0, |
| "step": 406 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 428.5, |
| "epoch": 0.8479166666666667, |
| "grad_norm": 2.8964383602142334, |
| "kl": 0.20703125, |
| "learning_rate": 7.173611111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.6875, |
| "reward_std": 0.2925042062997818, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 1.0, |
| "step": 407 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 415.1875, |
| "epoch": 0.85, |
| "grad_norm": 0.03928419202566147, |
| "kl": 0.221435546875, |
| "learning_rate": 7.166666666666667e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 408 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 442.46875, |
| "epoch": 0.8520833333333333, |
| "grad_norm": 1.7243295907974243, |
| "kl": 0.2099609375, |
| "learning_rate": 7.159722222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.34375, |
| "reward_std": 0.24511480331420898, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.75, |
| "step": 409 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 427.1875, |
| "epoch": 0.8541666666666666, |
| "grad_norm": 1.768519401550293, |
| "kl": 0.199951171875, |
| "learning_rate": 7.152777777777778e-07, |
| "loss": 0.0002, |
| "reward": 1.4375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.75, |
| "step": 410 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 396.5, |
| "epoch": 0.85625, |
| "grad_norm": 0.018853794783353806, |
| "kl": 0.239013671875, |
| "learning_rate": 7.145833333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 411 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 421.9375, |
| "epoch": 0.8583333333333333, |
| "grad_norm": 1.988168716430664, |
| "kl": 0.22119140625, |
| "learning_rate": 7.138888888888889e-07, |
| "loss": 0.0002, |
| "reward": 1.09375, |
| "reward_std": 0.24511480331420898, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.75, |
| "step": 412 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 456.15625, |
| "epoch": 0.8604166666666667, |
| "grad_norm": 0.010176747106015682, |
| "kl": 0.214599609375, |
| "learning_rate": 7.131944444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 413 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 437.90625, |
| "epoch": 0.8625, |
| "grad_norm": 2.550144910812378, |
| "kl": 0.22021484375, |
| "learning_rate": 7.125e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.2925042062997818, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.75, |
| "step": 414 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 418.8125, |
| "epoch": 0.8645833333333334, |
| "grad_norm": 0.018261313438415527, |
| "kl": 0.200927734375, |
| "learning_rate": 7.118055555555555e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 415 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 443.71875, |
| "epoch": 0.8666666666666667, |
| "grad_norm": 0.008350728079676628, |
| "kl": 0.212646484375, |
| "learning_rate": 7.111111111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 416 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 456.125, |
| "epoch": 0.86875, |
| "grad_norm": 2.7766783237457275, |
| "kl": 0.21484375, |
| "learning_rate": 7.104166666666667e-07, |
| "loss": 0.0002, |
| "reward": 1.34375, |
| "reward_std": 0.22201896458864212, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.75, |
| "step": 417 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 434.53125, |
| "epoch": 0.8708333333333333, |
| "grad_norm": 1.4548749923706055, |
| "kl": 0.215087890625, |
| "learning_rate": 7.097222222222223e-07, |
| "loss": 0.0002, |
| "reward": 1.4375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.75, |
| "step": 418 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 434.1875, |
| "epoch": 0.8729166666666667, |
| "grad_norm": 0.007628277875483036, |
| "kl": 0.211181640625, |
| "learning_rate": 7.090277777777778e-07, |
| "loss": 0.0002, |
| "reward": 0.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.25, |
| "step": 419 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 454.5, |
| "epoch": 0.875, |
| "grad_norm": 2.389847993850708, |
| "kl": 0.203125, |
| "learning_rate": 7.083333333333334e-07, |
| "loss": 0.0002, |
| "reward": 0.4375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.25, |
| "step": 420 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 438.71875, |
| "epoch": 0.8770833333333333, |
| "grad_norm": 1.6909396648406982, |
| "kl": 0.21923828125, |
| "learning_rate": 7.076388888888889e-07, |
| "loss": 0.0002, |
| "reward": 1.8125, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.8125, |
| "rewards/format_reward": 1.0, |
| "step": 421 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 459.84375, |
| "epoch": 0.8791666666666667, |
| "grad_norm": 2.283431053161621, |
| "kl": 0.2138671875, |
| "learning_rate": 7.069444444444445e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.2587745785713196, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 1.0, |
| "step": 422 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 434.28125, |
| "epoch": 0.88125, |
| "grad_norm": 1.1481057405471802, |
| "kl": 0.2109375, |
| "learning_rate": 7.0625e-07, |
| "loss": 0.0002, |
| "reward": 1.9375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.9375, |
| "rewards/format_reward": 1.0, |
| "step": 423 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 477.03125, |
| "epoch": 0.8833333333333333, |
| "grad_norm": 0.01631513424217701, |
| "kl": 0.22802734375, |
| "learning_rate": 7.055555555555556e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 424 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 445.1875, |
| "epoch": 0.8854166666666666, |
| "grad_norm": 1.1521072387695312, |
| "kl": 0.21484375, |
| "learning_rate": 7.048611111111111e-07, |
| "loss": 0.0002, |
| "reward": 0.9375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.5, |
| "step": 425 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 469.78125, |
| "epoch": 0.8875, |
| "grad_norm": 4.860723495483398, |
| "kl": 0.211181640625, |
| "learning_rate": 7.041666666666667e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 1.0, |
| "step": 426 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 426.84375, |
| "epoch": 0.8895833333333333, |
| "grad_norm": 0.011157250963151455, |
| "kl": 0.23193359375, |
| "learning_rate": 7.034722222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 1.0, |
| "step": 427 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 470.78125, |
| "epoch": 0.8916666666666667, |
| "grad_norm": 1.62472665309906, |
| "kl": 0.231201171875, |
| "learning_rate": 7.027777777777777e-07, |
| "loss": 0.0002, |
| "reward": 0.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.75, |
| "step": 428 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 433.65625, |
| "epoch": 0.89375, |
| "grad_norm": 2.0805013179779053, |
| "kl": 0.21826171875, |
| "learning_rate": 7.020833333333332e-07, |
| "loss": 0.0002, |
| "reward": 1.40625, |
| "reward_std": 0.2651650384068489, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.75, |
| "step": 429 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 425.59375, |
| "epoch": 0.8958333333333334, |
| "grad_norm": 0.012136026285588741, |
| "kl": 0.23193359375, |
| "learning_rate": 7.013888888888888e-07, |
| "loss": 0.0002, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 430 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 445.21875, |
| "epoch": 0.8979166666666667, |
| "grad_norm": 1.2128183841705322, |
| "kl": 0.22119140625, |
| "learning_rate": 7.006944444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.28125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.75, |
| "step": 431 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 446.1875, |
| "epoch": 0.9, |
| "grad_norm": 1.5345137119293213, |
| "kl": 0.217529296875, |
| "learning_rate": 7e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 432 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 388.125, |
| "epoch": 0.9020833333333333, |
| "grad_norm": 1.3869545459747314, |
| "kl": 0.236083984375, |
| "learning_rate": 6.993055555555555e-07, |
| "loss": 0.0002, |
| "reward": 1.4375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.75, |
| "step": 433 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 438.90625, |
| "epoch": 0.9041666666666667, |
| "grad_norm": 0.009476087987422943, |
| "kl": 0.215576171875, |
| "learning_rate": 6.986111111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 434 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 445.125, |
| "epoch": 0.90625, |
| "grad_norm": 0.010908279567956924, |
| "kl": 0.235107421875, |
| "learning_rate": 6.979166666666666e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 435 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 472.53125, |
| "epoch": 0.9083333333333333, |
| "grad_norm": 2.5724964141845703, |
| "kl": 0.225830078125, |
| "learning_rate": 6.972222222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.3125, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.75, |
| "step": 436 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 421.3125, |
| "epoch": 0.9104166666666667, |
| "grad_norm": 1.8228105306625366, |
| "kl": 0.2275390625, |
| "learning_rate": 6.965277777777777e-07, |
| "loss": 0.0002, |
| "reward": 1.15625, |
| "reward_std": 0.22201896458864212, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.75, |
| "step": 437 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 468.625, |
| "epoch": 0.9125, |
| "grad_norm": 0.011684931814670563, |
| "kl": 0.217529296875, |
| "learning_rate": 6.958333333333333e-07, |
| "loss": 0.0002, |
| "reward": 0.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.25, |
| "step": 438 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 444.46875, |
| "epoch": 0.9145833333333333, |
| "grad_norm": 3.1498048305511475, |
| "kl": 0.257568359375, |
| "learning_rate": 6.951388888888888e-07, |
| "loss": 0.0003, |
| "reward": 1.25, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 439 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 445.75, |
| "epoch": 0.9166666666666666, |
| "grad_norm": 3.2582144737243652, |
| "kl": 0.224609375, |
| "learning_rate": 6.944444444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.78125, |
| "reward_std": 0.3061639815568924, |
| "rewards/accuracy_reward": 0.78125, |
| "rewards/format_reward": 1.0, |
| "step": 440 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 430.375, |
| "epoch": 0.91875, |
| "grad_norm": 3.094038963317871, |
| "kl": 0.238037109375, |
| "learning_rate": 6.937499999999999e-07, |
| "loss": 0.0002, |
| "reward": 0.875, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/format_reward": 0.5, |
| "step": 441 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 424.59375, |
| "epoch": 0.9208333333333333, |
| "grad_norm": 7.279547214508057, |
| "kl": 0.23828125, |
| "learning_rate": 6.930555555555555e-07, |
| "loss": 0.0002, |
| "reward": 1.28125, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.75, |
| "step": 442 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 450.5625, |
| "epoch": 0.9229166666666667, |
| "grad_norm": 3.03475022315979, |
| "kl": 0.219970703125, |
| "learning_rate": 6.92361111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.21875, |
| "reward_std": 0.3471629247069359, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 443 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 485.6875, |
| "epoch": 0.925, |
| "grad_norm": 2.076587677001953, |
| "kl": 0.222412109375, |
| "learning_rate": 6.916666666666666e-07, |
| "loss": 0.0002, |
| "reward": 1.375, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.71875, |
| "step": 444 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 439.34375, |
| "epoch": 0.9270833333333334, |
| "grad_norm": 1.3405368328094482, |
| "kl": 0.236572265625, |
| "learning_rate": 6.909722222222222e-07, |
| "loss": 0.0002, |
| "reward": 0.3125, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.25, |
| "step": 445 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 419.75, |
| "epoch": 0.9291666666666667, |
| "grad_norm": 2.1806628704071045, |
| "kl": 0.23779296875, |
| "learning_rate": 6.902777777777778e-07, |
| "loss": 0.0002, |
| "reward": 0.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.25, |
| "step": 446 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 419.5, |
| "epoch": 0.93125, |
| "grad_norm": 3.1020021438598633, |
| "kl": 0.2451171875, |
| "learning_rate": 6.895833333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.84375, |
| "reward_std": 0.22201896458864212, |
| "rewards/accuracy_reward": 0.84375, |
| "rewards/format_reward": 1.0, |
| "step": 447 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 459.1875, |
| "epoch": 0.9333333333333333, |
| "grad_norm": 0.0076973834075033665, |
| "kl": 0.21533203125, |
| "learning_rate": 6.888888888888889e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 448 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 414.625, |
| "epoch": 0.9354166666666667, |
| "grad_norm": 0.010062651708722115, |
| "kl": 0.24658203125, |
| "learning_rate": 6.881944444444444e-07, |
| "loss": 0.0002, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 449 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 417.28125, |
| "epoch": 0.9375, |
| "grad_norm": 4.19633674621582, |
| "kl": 0.237060546875, |
| "learning_rate": 6.875e-07, |
| "loss": 0.0002, |
| "reward": 1.1875, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.75, |
| "step": 450 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 411.34375, |
| "epoch": 0.9395833333333333, |
| "grad_norm": 2.232433795928955, |
| "kl": 0.251953125, |
| "learning_rate": 6.868055555555555e-07, |
| "loss": 0.0003, |
| "reward": 1.09375, |
| "reward_std": 0.24511480331420898, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.75, |
| "step": 451 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 412.59375, |
| "epoch": 0.9416666666666667, |
| "grad_norm": 1.2692770957946777, |
| "kl": 0.24658203125, |
| "learning_rate": 6.861111111111111e-07, |
| "loss": 0.0002, |
| "reward": 0.3125, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.25, |
| "step": 452 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 404.96875, |
| "epoch": 0.94375, |
| "grad_norm": 0.007977345958352089, |
| "kl": 0.247314453125, |
| "learning_rate": 6.854166666666666e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 453 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 415.03125, |
| "epoch": 0.9458333333333333, |
| "grad_norm": 1.4225736856460571, |
| "kl": 0.239013671875, |
| "learning_rate": 6.847222222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.71875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 1.0, |
| "step": 454 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 441.4375, |
| "epoch": 0.9479166666666666, |
| "grad_norm": 1.563883662223816, |
| "kl": 0.228759765625, |
| "learning_rate": 6.840277777777777e-07, |
| "loss": 0.0002, |
| "reward": 1.03125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.75, |
| "step": 455 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 413.8125, |
| "epoch": 0.95, |
| "grad_norm": 1.4256101846694946, |
| "kl": 0.240966796875, |
| "learning_rate": 6.833333333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.21875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 456 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 448.03125, |
| "epoch": 0.9520833333333333, |
| "grad_norm": 1.4623489379882812, |
| "kl": 0.255859375, |
| "learning_rate": 6.826388888888888e-07, |
| "loss": 0.0003, |
| "reward": 0.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.5, |
| "step": 457 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 420.75, |
| "epoch": 0.9541666666666667, |
| "grad_norm": 1.2219001054763794, |
| "kl": 0.244140625, |
| "learning_rate": 6.819444444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.09375, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.75, |
| "step": 458 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 412.90625, |
| "epoch": 0.95625, |
| "grad_norm": 3.5887980461120605, |
| "kl": 0.24169921875, |
| "learning_rate": 6.8125e-07, |
| "loss": 0.0002, |
| "reward": 0.8125, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/format_reward": 0.5, |
| "step": 459 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 427.4375, |
| "epoch": 0.9583333333333334, |
| "grad_norm": 1.426658272743225, |
| "kl": 0.246826171875, |
| "learning_rate": 6.805555555555556e-07, |
| "loss": 0.0002, |
| "reward": 1.15625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.75, |
| "step": 460 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 466.71875, |
| "epoch": 0.9604166666666667, |
| "grad_norm": 1.7160536050796509, |
| "kl": 0.23779296875, |
| "learning_rate": 6.798611111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.6875, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 1.0, |
| "step": 461 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 432.8125, |
| "epoch": 0.9625, |
| "grad_norm": 1.2215250730514526, |
| "kl": 0.25537109375, |
| "learning_rate": 6.791666666666667e-07, |
| "loss": 0.0003, |
| "reward": 1.34375, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.75, |
| "step": 462 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 499.6875, |
| "epoch": 0.9645833333333333, |
| "grad_norm": 0.016700007021427155, |
| "kl": 0.20458984375, |
| "learning_rate": 6.784722222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 463 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 462.03125, |
| "epoch": 0.9666666666666667, |
| "grad_norm": 1.8133171796798706, |
| "kl": 0.23779296875, |
| "learning_rate": 6.777777777777778e-07, |
| "loss": 0.0002, |
| "reward": 1.1875, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.75, |
| "step": 464 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 493.59375, |
| "epoch": 0.96875, |
| "grad_norm": 1.1255253553390503, |
| "kl": 0.225830078125, |
| "learning_rate": 6.770833333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.21875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 465 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 527.71875, |
| "epoch": 0.9708333333333333, |
| "grad_norm": 0.08303828537464142, |
| "kl": 0.3046875, |
| "learning_rate": 6.763888888888889e-07, |
| "loss": 0.0003, |
| "reward": 0.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.25, |
| "step": 466 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 520.34375, |
| "epoch": 0.9729166666666667, |
| "grad_norm": 2.6234073638916016, |
| "kl": 0.22119140625, |
| "learning_rate": 6.756944444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.71875, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 1.0, |
| "step": 467 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 536.96875, |
| "epoch": 0.975, |
| "grad_norm": 1.0526636838912964, |
| "kl": 0.232177734375, |
| "learning_rate": 6.75e-07, |
| "loss": 0.0002, |
| "reward": 1.21875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 468 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 523.46875, |
| "epoch": 0.9770833333333333, |
| "grad_norm": 1.8160419464111328, |
| "kl": 0.21826171875, |
| "learning_rate": 6.743055555555555e-07, |
| "loss": 0.0002, |
| "reward": 0.75, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 469 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 600.875, |
| "epoch": 0.9791666666666666, |
| "grad_norm": 2.412604331970215, |
| "kl": 0.236083984375, |
| "learning_rate": 6.736111111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.09375, |
| "reward_std": 0.22201896458864212, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.75, |
| "step": 470 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 551.78125, |
| "epoch": 0.98125, |
| "grad_norm": 0.0071896640583872795, |
| "kl": 0.21728515625, |
| "learning_rate": 6.729166666666666e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 471 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 583.75, |
| "epoch": 0.9833333333333333, |
| "grad_norm": 2.7071609497070312, |
| "kl": 0.215087890625, |
| "learning_rate": 6.722222222222222e-07, |
| "loss": 0.0002, |
| "reward": 0.71875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.5, |
| "step": 472 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 632.25, |
| "epoch": 0.9854166666666667, |
| "grad_norm": 1.6044063568115234, |
| "kl": 0.208251953125, |
| "learning_rate": 6.715277777777776e-07, |
| "loss": 0.0002, |
| "reward": 1.5625, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 1.0, |
| "step": 473 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 601.0, |
| "epoch": 0.9875, |
| "grad_norm": 1.518963098526001, |
| "kl": 0.22314453125, |
| "learning_rate": 6.708333333333333e-07, |
| "loss": 0.0002, |
| "reward": 0.34375, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.25, |
| "step": 474 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 638.34375, |
| "epoch": 0.9895833333333334, |
| "grad_norm": 14.804647445678711, |
| "kl": 0.2041015625, |
| "learning_rate": 6.701388888888888e-07, |
| "loss": 0.0002, |
| "reward": 0.875, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.75, |
| "step": 475 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 631.0, |
| "epoch": 0.9916666666666667, |
| "grad_norm": 1.5678462982177734, |
| "kl": 0.20166015625, |
| "learning_rate": 6.694444444444444e-07, |
| "loss": 0.0002, |
| "reward": 0.84375, |
| "reward_std": 0.2651650384068489, |
| "rewards/accuracy_reward": 0.09375, |
| "rewards/format_reward": 0.75, |
| "step": 476 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 617.875, |
| "epoch": 0.99375, |
| "grad_norm": 3.285810947418213, |
| "kl": 0.208251953125, |
| "learning_rate": 6.6875e-07, |
| "loss": 0.0002, |
| "reward": 1.6875, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 1.0, |
| "step": 477 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 620.03125, |
| "epoch": 0.9958333333333333, |
| "grad_norm": 1.3398975133895874, |
| "kl": 0.207275390625, |
| "learning_rate": 6.680555555555555e-07, |
| "loss": 0.0002, |
| "reward": 0.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.25, |
| "step": 478 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 626.5625, |
| "epoch": 0.9979166666666667, |
| "grad_norm": 1.2645913362503052, |
| "kl": 0.207763671875, |
| "learning_rate": 6.67361111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.34375, |
| "reward_std": 0.22201896458864212, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.75, |
| "step": 479 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 601.625, |
| "epoch": 1.0, |
| "grad_norm": 3.012862205505371, |
| "kl": 0.208984375, |
| "learning_rate": 6.666666666666666e-07, |
| "loss": 0.0002, |
| "reward": 1.375, |
| "reward_std": 0.2925042062997818, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.75, |
| "step": 480 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 644.46875, |
| "epoch": 1.0020833333333334, |
| "grad_norm": 0.9152086973190308, |
| "kl": 0.20849609375, |
| "learning_rate": 6.659722222222222e-07, |
| "loss": 0.0002, |
| "reward": 0.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.75, |
| "step": 481 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 682.34375, |
| "epoch": 1.0041666666666667, |
| "grad_norm": 2.6818764209747314, |
| "kl": 0.206787109375, |
| "learning_rate": 6.652777777777777e-07, |
| "loss": 0.0002, |
| "reward": 1.9375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.9375, |
| "rewards/format_reward": 1.0, |
| "step": 482 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 630.6875, |
| "epoch": 1.00625, |
| "grad_norm": 1.9887526035308838, |
| "kl": 0.211669921875, |
| "learning_rate": 6.645833333333333e-07, |
| "loss": 0.0002, |
| "reward": 0.84375, |
| "reward_std": 0.24511480331420898, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.5, |
| "step": 483 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 689.90625, |
| "epoch": 1.0083333333333333, |
| "grad_norm": 1.1005487442016602, |
| "kl": 0.20654296875, |
| "learning_rate": 6.638888888888888e-07, |
| "loss": 0.0002, |
| "reward": 1.40625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.75, |
| "step": 484 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 625.53125, |
| "epoch": 1.0104166666666667, |
| "grad_norm": 1.0807424783706665, |
| "kl": 0.2275390625, |
| "learning_rate": 6.631944444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.875, |
| "reward_std": 0.13363061845302582, |
| "rewards/accuracy_reward": 0.875, |
| "rewards/format_reward": 1.0, |
| "step": 485 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 620.4375, |
| "epoch": 1.0125, |
| "grad_norm": 0.008356544189155102, |
| "kl": 0.21875, |
| "learning_rate": 6.624999999999999e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 486 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 627.375, |
| "epoch": 1.0145833333333334, |
| "grad_norm": 2.017381191253662, |
| "kl": 0.22216796875, |
| "learning_rate": 6.618055555555555e-07, |
| "loss": 0.0002, |
| "reward": 0.84375, |
| "reward_std": 0.24511480331420898, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.5, |
| "step": 487 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 634.78125, |
| "epoch": 1.0166666666666666, |
| "grad_norm": 2.344238042831421, |
| "kl": 0.21923828125, |
| "learning_rate": 6.611111111111111e-07, |
| "loss": 0.0002, |
| "reward": 0.6875, |
| "reward_std": 0.249358132481575, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.5, |
| "step": 488 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 664.0625, |
| "epoch": 1.01875, |
| "grad_norm": 2.2267184257507324, |
| "kl": 0.2236328125, |
| "learning_rate": 6.604166666666667e-07, |
| "loss": 0.0002, |
| "reward": 1.21875, |
| "reward_std": 0.3471629247069359, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.71875, |
| "step": 489 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 618.3125, |
| "epoch": 1.0208333333333333, |
| "grad_norm": 5.060998916625977, |
| "kl": 0.228515625, |
| "learning_rate": 6.597222222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.59375, |
| "reward_std": 0.24511480331420898, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 1.0, |
| "step": 490 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 603.3125, |
| "epoch": 1.0229166666666667, |
| "grad_norm": 1.5494803190231323, |
| "kl": 0.236083984375, |
| "learning_rate": 6.590277777777778e-07, |
| "loss": 0.0002, |
| "reward": 1.90625, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.90625, |
| "rewards/format_reward": 1.0, |
| "step": 491 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 577.59375, |
| "epoch": 1.025, |
| "grad_norm": 0.006925213150680065, |
| "kl": 0.22021484375, |
| "learning_rate": 6.583333333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 492 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 595.25, |
| "epoch": 1.0270833333333333, |
| "grad_norm": 1.8279296159744263, |
| "kl": 0.21240234375, |
| "learning_rate": 6.576388888888889e-07, |
| "loss": 0.0002, |
| "reward": 1.03125, |
| "reward_std": 0.2630179077386856, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.75, |
| "step": 493 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 596.3125, |
| "epoch": 1.0291666666666666, |
| "grad_norm": 1.733322262763977, |
| "kl": 0.22802734375, |
| "learning_rate": 6.569444444444444e-07, |
| "loss": 0.0002, |
| "reward": 0.78125, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.5, |
| "step": 494 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 576.6875, |
| "epoch": 1.03125, |
| "grad_norm": 2.433555841445923, |
| "kl": 0.227294921875, |
| "learning_rate": 6.5625e-07, |
| "loss": 0.0002, |
| "reward": 1.75, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 1.0, |
| "step": 495 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 502.59375, |
| "epoch": 1.0333333333333334, |
| "grad_norm": 0.009394297376275063, |
| "kl": 0.25244140625, |
| "learning_rate": 6.555555555555555e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 496 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 567.5625, |
| "epoch": 1.0354166666666667, |
| "grad_norm": 1.5645465850830078, |
| "kl": 0.248779296875, |
| "learning_rate": 6.548611111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.21875, |
| "reward_std": 0.24511480331420898, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 497 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 567.0, |
| "epoch": 1.0375, |
| "grad_norm": 1.6025605201721191, |
| "kl": 0.2392578125, |
| "learning_rate": 6.541666666666666e-07, |
| "loss": 0.0002, |
| "reward": 1.9375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.9375, |
| "rewards/format_reward": 1.0, |
| "step": 498 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 555.65625, |
| "epoch": 1.0395833333333333, |
| "grad_norm": 1.1154199838638306, |
| "kl": 0.228271484375, |
| "learning_rate": 6.534722222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.4375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.75, |
| "step": 499 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 552.65625, |
| "epoch": 1.0416666666666667, |
| "grad_norm": 1.7574169635772705, |
| "kl": 0.26318359375, |
| "learning_rate": 6.527777777777777e-07, |
| "loss": 0.0003, |
| "reward": 0.78125, |
| "reward_std": 0.2630179077386856, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.5, |
| "step": 500 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 554.09375, |
| "epoch": 1.04375, |
| "grad_norm": 1.7386115789413452, |
| "kl": 0.24609375, |
| "learning_rate": 6.520833333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.2630179077386856, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 1.0, |
| "step": 501 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 512.1875, |
| "epoch": 1.0458333333333334, |
| "grad_norm": 1.2160353660583496, |
| "kl": 0.255615234375, |
| "learning_rate": 6.513888888888889e-07, |
| "loss": 0.0003, |
| "reward": 0.875, |
| "reward_std": 0.13363061845302582, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/format_reward": 0.5, |
| "step": 502 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 508.53125, |
| "epoch": 1.0479166666666666, |
| "grad_norm": 2.100022792816162, |
| "kl": 0.267333984375, |
| "learning_rate": 6.506944444444445e-07, |
| "loss": 0.0003, |
| "reward": 1.4375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.75, |
| "step": 503 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 535.59375, |
| "epoch": 1.05, |
| "grad_norm": 3.8708770275115967, |
| "kl": 0.24755859375, |
| "learning_rate": 6.5e-07, |
| "loss": 0.0002, |
| "reward": 1.15625, |
| "reward_std": 0.3377464786171913, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.75, |
| "step": 504 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 502.59375, |
| "epoch": 1.0520833333333333, |
| "grad_norm": 0.009273013100028038, |
| "kl": 0.2431640625, |
| "learning_rate": 6.493055555555556e-07, |
| "loss": 0.0002, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 505 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 548.78125, |
| "epoch": 1.0541666666666667, |
| "grad_norm": 1.5016146898269653, |
| "kl": 0.240234375, |
| "learning_rate": 6.486111111111111e-07, |
| "loss": 0.0002, |
| "reward": 0.8125, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.75, |
| "step": 506 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 529.96875, |
| "epoch": 1.05625, |
| "grad_norm": 1.255817174911499, |
| "kl": 0.255859375, |
| "learning_rate": 6.479166666666667e-07, |
| "loss": 0.0003, |
| "reward": 1.28125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.75, |
| "step": 507 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 500.96875, |
| "epoch": 1.0583333333333333, |
| "grad_norm": 2.7088470458984375, |
| "kl": 0.26611328125, |
| "learning_rate": 6.472222222222222e-07, |
| "loss": 0.0003, |
| "reward": 0.875, |
| "reward_std": 0.2314550280570984, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/format_reward": 0.5, |
| "step": 508 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 516.90625, |
| "epoch": 1.0604166666666666, |
| "grad_norm": 1.8195452690124512, |
| "kl": 0.25927734375, |
| "learning_rate": 6.465277777777778e-07, |
| "loss": 0.0003, |
| "reward": 1.34375, |
| "reward_std": 0.22201896458864212, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 1.0, |
| "step": 509 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 501.3125, |
| "epoch": 1.0625, |
| "grad_norm": 1.773526668548584, |
| "kl": 0.245361328125, |
| "learning_rate": 6.458333333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.71875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 1.0, |
| "step": 510 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 479.90625, |
| "epoch": 1.0645833333333334, |
| "grad_norm": 1.7840821743011475, |
| "kl": 0.2734375, |
| "learning_rate": 6.451388888888889e-07, |
| "loss": 0.0003, |
| "reward": 1.15625, |
| "reward_std": 0.22201896458864212, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 1.0, |
| "step": 511 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 455.40625, |
| "epoch": 1.0666666666666667, |
| "grad_norm": 2.6982321739196777, |
| "kl": 0.2841796875, |
| "learning_rate": 6.444444444444444e-07, |
| "loss": 0.0003, |
| "reward": 0.875, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/format_reward": 0.5, |
| "step": 512 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 487.375, |
| "epoch": 1.06875, |
| "grad_norm": 1.1204781532287598, |
| "kl": 0.2626953125, |
| "learning_rate": 6.4375e-07, |
| "loss": 0.0003, |
| "reward": 0.875, |
| "reward_std": 0.13363061845302582, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/format_reward": 0.5, |
| "step": 513 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 461.46875, |
| "epoch": 1.0708333333333333, |
| "grad_norm": 1.2831130027770996, |
| "kl": 0.263671875, |
| "learning_rate": 6.430555555555555e-07, |
| "loss": 0.0003, |
| "reward": 0.6875, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.5, |
| "step": 514 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 434.28125, |
| "epoch": 1.0729166666666667, |
| "grad_norm": 2.457568407058716, |
| "kl": 0.26123046875, |
| "learning_rate": 6.423611111111112e-07, |
| "loss": 0.0003, |
| "reward": 1.875, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.875, |
| "rewards/format_reward": 1.0, |
| "step": 515 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 484.15625, |
| "epoch": 1.075, |
| "grad_norm": 3.24466609954834, |
| "kl": 0.25537109375, |
| "learning_rate": 6.416666666666667e-07, |
| "loss": 0.0003, |
| "reward": 1.53125, |
| "reward_std": 0.3061639815568924, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 1.0, |
| "step": 516 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 398.5625, |
| "epoch": 1.0770833333333334, |
| "grad_norm": 2.705453634262085, |
| "kl": 0.29150390625, |
| "learning_rate": 6.409722222222223e-07, |
| "loss": 0.0003, |
| "reward": 1.09375, |
| "reward_std": 0.3061639815568924, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.75, |
| "step": 517 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 446.875, |
| "epoch": 1.0791666666666666, |
| "grad_norm": 1.0492925643920898, |
| "kl": 0.26123046875, |
| "learning_rate": 6.402777777777777e-07, |
| "loss": 0.0003, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 1.0, |
| "step": 518 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 401.84375, |
| "epoch": 1.08125, |
| "grad_norm": 2.4395976066589355, |
| "kl": 0.2763671875, |
| "learning_rate": 6.395833333333333e-07, |
| "loss": 0.0003, |
| "reward": 1.6875, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 1.0, |
| "step": 519 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 412.21875, |
| "epoch": 1.0833333333333333, |
| "grad_norm": 0.008863838389515877, |
| "kl": 0.27783203125, |
| "learning_rate": 6.388888888888888e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 520 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 405.125, |
| "epoch": 1.0854166666666667, |
| "grad_norm": 1.1511051654815674, |
| "kl": 0.26416015625, |
| "learning_rate": 6.381944444444444e-07, |
| "loss": 0.0003, |
| "reward": 0.40625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.25, |
| "step": 521 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 385.125, |
| "epoch": 1.0875, |
| "grad_norm": 0.013866711407899857, |
| "kl": 0.287841796875, |
| "learning_rate": 6.374999999999999e-07, |
| "loss": 0.0003, |
| "reward": 0.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.25, |
| "step": 522 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 385.40625, |
| "epoch": 1.0895833333333333, |
| "grad_norm": 1.231552004814148, |
| "kl": 0.28515625, |
| "learning_rate": 6.368055555555555e-07, |
| "loss": 0.0003, |
| "reward": 1.40625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.75, |
| "step": 523 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 380.28125, |
| "epoch": 1.0916666666666666, |
| "grad_norm": 5.2175092697143555, |
| "kl": 0.28759765625, |
| "learning_rate": 6.36111111111111e-07, |
| "loss": 0.0003, |
| "reward": 1.21875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 524 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 372.625, |
| "epoch": 1.09375, |
| "grad_norm": 16.90829086303711, |
| "kl": 0.31396484375, |
| "learning_rate": 6.354166666666666e-07, |
| "loss": 0.0003, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 525 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 360.71875, |
| "epoch": 1.0958333333333334, |
| "grad_norm": 1.8970509767532349, |
| "kl": 0.29345703125, |
| "learning_rate": 6.347222222222221e-07, |
| "loss": 0.0003, |
| "reward": 1.34375, |
| "reward_std": 0.24511480331420898, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.75, |
| "step": 526 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 355.28125, |
| "epoch": 1.0979166666666667, |
| "grad_norm": 1.165528416633606, |
| "kl": 0.3154296875, |
| "learning_rate": 6.340277777777777e-07, |
| "loss": 0.0003, |
| "reward": 1.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.96875, |
| "rewards/format_reward": 1.0, |
| "step": 527 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 349.09375, |
| "epoch": 1.1, |
| "grad_norm": 1.5739704370498657, |
| "kl": 0.326171875, |
| "learning_rate": 6.333333333333332e-07, |
| "loss": 0.0003, |
| "reward": 1.28125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 1.0, |
| "step": 528 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 341.78125, |
| "epoch": 1.1020833333333333, |
| "grad_norm": 2.9738852977752686, |
| "kl": 0.29833984375, |
| "learning_rate": 6.326388888888888e-07, |
| "loss": 0.0003, |
| "reward": 1.90625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.90625, |
| "rewards/format_reward": 1.0, |
| "step": 529 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 350.75, |
| "epoch": 1.1041666666666667, |
| "grad_norm": 0.0143277607858181, |
| "kl": 0.2939453125, |
| "learning_rate": 6.319444444444444e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 530 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 351.5, |
| "epoch": 1.10625, |
| "grad_norm": 2.137756824493408, |
| "kl": 0.31005859375, |
| "learning_rate": 6.3125e-07, |
| "loss": 0.0003, |
| "reward": 1.6875, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 1.0, |
| "step": 531 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 325.71875, |
| "epoch": 1.1083333333333334, |
| "grad_norm": 1.2957160472869873, |
| "kl": 0.32470703125, |
| "learning_rate": 6.305555555555555e-07, |
| "loss": 0.0003, |
| "reward": 1.4375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.75, |
| "step": 532 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 326.4375, |
| "epoch": 1.1104166666666666, |
| "grad_norm": 2.687209367752075, |
| "kl": 0.33447265625, |
| "learning_rate": 6.298611111111111e-07, |
| "loss": 0.0003, |
| "reward": 1.71875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 1.0, |
| "step": 533 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 310.15625, |
| "epoch": 1.1125, |
| "grad_norm": 2.23334002494812, |
| "kl": 0.3720703125, |
| "learning_rate": 6.291666666666666e-07, |
| "loss": 0.0004, |
| "reward": 1.21875, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 534 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 311.375, |
| "epoch": 1.1145833333333333, |
| "grad_norm": 2.0771384239196777, |
| "kl": 0.31884765625, |
| "learning_rate": 6.284722222222222e-07, |
| "loss": 0.0003, |
| "reward": 1.4375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.75, |
| "step": 535 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 325.09375, |
| "epoch": 1.1166666666666667, |
| "grad_norm": 3.610607624053955, |
| "kl": 0.2958984375, |
| "learning_rate": 6.277777777777777e-07, |
| "loss": 0.0003, |
| "reward": 1.6875, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 1.0, |
| "step": 536 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 319.0, |
| "epoch": 1.11875, |
| "grad_norm": 2.343539237976074, |
| "kl": 0.3251953125, |
| "learning_rate": 6.270833333333333e-07, |
| "loss": 0.0003, |
| "reward": 1.71875, |
| "reward_std": 0.24511480331420898, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 1.0, |
| "step": 537 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 312.25, |
| "epoch": 1.1208333333333333, |
| "grad_norm": 0.011663875542581081, |
| "kl": 0.32763671875, |
| "learning_rate": 6.263888888888888e-07, |
| "loss": 0.0003, |
| "reward": 0.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.25, |
| "step": 538 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 319.78125, |
| "epoch": 1.1229166666666666, |
| "grad_norm": 1.8692626953125, |
| "kl": 0.31982421875, |
| "learning_rate": 6.256944444444444e-07, |
| "loss": 0.0003, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 539 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 316.53125, |
| "epoch": 1.125, |
| "grad_norm": 1.2807447910308838, |
| "kl": 0.328125, |
| "learning_rate": 6.249999999999999e-07, |
| "loss": 0.0003, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 540 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 316.71875, |
| "epoch": 1.1270833333333332, |
| "grad_norm": 1.4935797452926636, |
| "kl": 0.33740234375, |
| "learning_rate": 6.243055555555555e-07, |
| "loss": 0.0003, |
| "reward": 1.375, |
| "reward_std": 0.13363061845302582, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.75, |
| "step": 541 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 303.125, |
| "epoch": 1.1291666666666667, |
| "grad_norm": 1.423712968826294, |
| "kl": 0.34033203125, |
| "learning_rate": 6.23611111111111e-07, |
| "loss": 0.0003, |
| "reward": 1.90625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.90625, |
| "rewards/format_reward": 1.0, |
| "step": 542 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 302.28125, |
| "epoch": 1.13125, |
| "grad_norm": 1.2090955972671509, |
| "kl": 0.35791015625, |
| "learning_rate": 6.229166666666666e-07, |
| "loss": 0.0004, |
| "reward": 0.9375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.5, |
| "step": 543 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 328.78125, |
| "epoch": 1.1333333333333333, |
| "grad_norm": 0.017780767753720284, |
| "kl": 0.337890625, |
| "learning_rate": 6.222222222222223e-07, |
| "loss": 0.0003, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 544 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 330.375, |
| "epoch": 1.1354166666666667, |
| "grad_norm": 0.01260452438145876, |
| "kl": 0.33544921875, |
| "learning_rate": 6.215277777777778e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.75, |
| "step": 545 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 294.5625, |
| "epoch": 1.1375, |
| "grad_norm": 0.07112853229045868, |
| "kl": 0.40478515625, |
| "learning_rate": 6.208333333333334e-07, |
| "loss": 0.0004, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 1.0, |
| "step": 546 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 292.90625, |
| "epoch": 1.1395833333333334, |
| "grad_norm": 0.010367254726588726, |
| "kl": 0.341796875, |
| "learning_rate": 6.201388888888889e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 547 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 312.90625, |
| "epoch": 1.1416666666666666, |
| "grad_norm": 0.013733302243053913, |
| "kl": 0.32421875, |
| "learning_rate": 6.194444444444445e-07, |
| "loss": 0.0003, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.75, |
| "step": 548 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 311.03125, |
| "epoch": 1.14375, |
| "grad_norm": 2.095188856124878, |
| "kl": 0.33154296875, |
| "learning_rate": 6.1875e-07, |
| "loss": 0.0003, |
| "reward": 1.65625, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 1.0, |
| "step": 549 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 318.1875, |
| "epoch": 1.1458333333333333, |
| "grad_norm": 0.011102787218987942, |
| "kl": 0.31787109375, |
| "learning_rate": 6.180555555555556e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 550 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 309.59375, |
| "epoch": 1.1479166666666667, |
| "grad_norm": 0.011943946592509747, |
| "kl": 0.3408203125, |
| "learning_rate": 6.173611111111111e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 551 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 315.75, |
| "epoch": 1.15, |
| "grad_norm": 1.2020156383514404, |
| "kl": 0.32568359375, |
| "learning_rate": 6.166666666666667e-07, |
| "loss": 0.0003, |
| "reward": 0.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.25, |
| "step": 552 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 326.40625, |
| "epoch": 1.1520833333333333, |
| "grad_norm": 2.2801804542541504, |
| "kl": 0.34716796875, |
| "learning_rate": 6.159722222222222e-07, |
| "loss": 0.0003, |
| "reward": 1.3125, |
| "reward_std": 0.249358132481575, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.75, |
| "step": 553 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 312.3125, |
| "epoch": 1.1541666666666668, |
| "grad_norm": 0.00982726365327835, |
| "kl": 0.32421875, |
| "learning_rate": 6.152777777777778e-07, |
| "loss": 0.0003, |
| "reward": 0.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.25, |
| "step": 554 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 317.625, |
| "epoch": 1.15625, |
| "grad_norm": 2.7104454040527344, |
| "kl": 0.31591796875, |
| "learning_rate": 6.145833333333333e-07, |
| "loss": 0.0003, |
| "reward": 1.4375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.75, |
| "step": 555 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 312.65625, |
| "epoch": 1.1583333333333332, |
| "grad_norm": 3.31844162940979, |
| "kl": 0.33349609375, |
| "learning_rate": 6.138888888888889e-07, |
| "loss": 0.0003, |
| "reward": 1.34375, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.75, |
| "step": 556 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 308.46875, |
| "epoch": 1.1604166666666667, |
| "grad_norm": 0.021200576797127724, |
| "kl": 0.373046875, |
| "learning_rate": 6.131944444444444e-07, |
| "loss": 0.0004, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.0, |
| "step": 557 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 315.53125, |
| "epoch": 1.1625, |
| "grad_norm": 1.4671574831008911, |
| "kl": 0.333984375, |
| "learning_rate": 6.125000000000001e-07, |
| "loss": 0.0003, |
| "reward": 1.90625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.90625, |
| "rewards/format_reward": 1.0, |
| "step": 558 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 301.25, |
| "epoch": 1.1645833333333333, |
| "grad_norm": 0.013326307758688927, |
| "kl": 0.35498046875, |
| "learning_rate": 6.118055555555556e-07, |
| "loss": 0.0004, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 559 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 307.40625, |
| "epoch": 1.1666666666666667, |
| "grad_norm": 1.3512459993362427, |
| "kl": 0.35302734375, |
| "learning_rate": 6.111111111111112e-07, |
| "loss": 0.0004, |
| "reward": 1.1875, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.75, |
| "step": 560 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 313.90625, |
| "epoch": 1.16875, |
| "grad_norm": 1.6833953857421875, |
| "kl": 0.33740234375, |
| "learning_rate": 6.104166666666667e-07, |
| "loss": 0.0003, |
| "reward": 1.21875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 561 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 320.6875, |
| "epoch": 1.1708333333333334, |
| "grad_norm": 0.08547837287187576, |
| "kl": 0.33349609375, |
| "learning_rate": 6.097222222222223e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 562 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 310.53125, |
| "epoch": 1.1729166666666666, |
| "grad_norm": 0.01078501995652914, |
| "kl": 0.32470703125, |
| "learning_rate": 6.090277777777777e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 563 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 296.25, |
| "epoch": 1.175, |
| "grad_norm": 1.2430768013000488, |
| "kl": 0.35400390625, |
| "learning_rate": 6.083333333333333e-07, |
| "loss": 0.0004, |
| "reward": 1.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.96875, |
| "rewards/format_reward": 1.0, |
| "step": 564 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 321.9375, |
| "epoch": 1.1770833333333333, |
| "grad_norm": 1.3310391902923584, |
| "kl": 0.30419921875, |
| "learning_rate": 6.076388888888888e-07, |
| "loss": 0.0003, |
| "reward": 1.28125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 1.0, |
| "step": 565 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 299.1875, |
| "epoch": 1.1791666666666667, |
| "grad_norm": 0.012161072343587875, |
| "kl": 0.32958984375, |
| "learning_rate": 6.069444444444444e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 566 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 324.65625, |
| "epoch": 1.18125, |
| "grad_norm": 0.011912676505744457, |
| "kl": 0.32958984375, |
| "learning_rate": 6.062499999999999e-07, |
| "loss": 0.0003, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 567 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 301.5625, |
| "epoch": 1.1833333333333333, |
| "grad_norm": 0.01320314034819603, |
| "kl": 0.3642578125, |
| "learning_rate": 6.055555555555555e-07, |
| "loss": 0.0004, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 568 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 303.46875, |
| "epoch": 1.1854166666666668, |
| "grad_norm": 3.1924169063568115, |
| "kl": 0.33984375, |
| "learning_rate": 6.04861111111111e-07, |
| "loss": 0.0003, |
| "reward": 1.65625, |
| "reward_std": 0.2651650384068489, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.9375, |
| "step": 569 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 325.4375, |
| "epoch": 1.1875, |
| "grad_norm": 0.012552078813314438, |
| "kl": 0.32080078125, |
| "learning_rate": 6.041666666666666e-07, |
| "loss": 0.0003, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.0, |
| "rewards/format_reward": 1.0, |
| "step": 570 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 304.125, |
| "epoch": 1.1895833333333332, |
| "grad_norm": 3.063969373703003, |
| "kl": 0.3232421875, |
| "learning_rate": 6.034722222222221e-07, |
| "loss": 0.0003, |
| "reward": 1.625, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 1.0, |
| "step": 571 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 314.15625, |
| "epoch": 1.1916666666666667, |
| "grad_norm": 1.8180301189422607, |
| "kl": 0.3349609375, |
| "learning_rate": 6.027777777777778e-07, |
| "loss": 0.0003, |
| "reward": 1.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.96875, |
| "rewards/format_reward": 1.0, |
| "step": 572 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 303.96875, |
| "epoch": 1.19375, |
| "grad_norm": 1.8491965532302856, |
| "kl": 0.33935546875, |
| "learning_rate": 6.020833333333333e-07, |
| "loss": 0.0003, |
| "reward": 0.28125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.25, |
| "step": 573 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 308.6875, |
| "epoch": 1.1958333333333333, |
| "grad_norm": 1.7229721546173096, |
| "kl": 0.33203125, |
| "learning_rate": 6.013888888888889e-07, |
| "loss": 0.0003, |
| "reward": 1.03125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.75, |
| "step": 574 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 302.625, |
| "epoch": 1.1979166666666667, |
| "grad_norm": 3.754805564880371, |
| "kl": 0.32080078125, |
| "learning_rate": 6.006944444444444e-07, |
| "loss": 0.0003, |
| "reward": 0.96875, |
| "reward_std": 0.2630179077386856, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.75, |
| "step": 575 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 294.40625, |
| "epoch": 1.2, |
| "grad_norm": 1.3297032117843628, |
| "kl": 0.33056640625, |
| "learning_rate": 6e-07, |
| "loss": 0.0003, |
| "reward": 0.4375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.25, |
| "step": 576 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 290.125, |
| "epoch": 1.2020833333333334, |
| "grad_norm": 1.3171557188034058, |
| "kl": 0.32080078125, |
| "learning_rate": 5.993055555555555e-07, |
| "loss": 0.0003, |
| "reward": 1.21875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 577 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 292.28125, |
| "epoch": 1.2041666666666666, |
| "grad_norm": 0.010273904539644718, |
| "kl": 0.3369140625, |
| "learning_rate": 5.986111111111111e-07, |
| "loss": 0.0003, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 578 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 277.84375, |
| "epoch": 1.20625, |
| "grad_norm": 3.0093436241149902, |
| "kl": 0.3671875, |
| "learning_rate": 5.979166666666666e-07, |
| "loss": 0.0004, |
| "reward": 0.90625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.5, |
| "step": 579 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 282.15625, |
| "epoch": 1.2083333333333333, |
| "grad_norm": 1.7877336740493774, |
| "kl": 0.36328125, |
| "learning_rate": 5.972222222222222e-07, |
| "loss": 0.0004, |
| "reward": 1.9375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.9375, |
| "rewards/format_reward": 1.0, |
| "step": 580 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 273.3125, |
| "epoch": 1.2104166666666667, |
| "grad_norm": 1.9994487762451172, |
| "kl": 0.34375, |
| "learning_rate": 5.965277777777777e-07, |
| "loss": 0.0003, |
| "reward": 1.25, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 581 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 249.65625, |
| "epoch": 1.2125, |
| "grad_norm": 1.301345705986023, |
| "kl": 0.37646484375, |
| "learning_rate": 5.958333333333333e-07, |
| "loss": 0.0004, |
| "reward": 1.78125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.78125, |
| "rewards/format_reward": 1.0, |
| "step": 582 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 282.96875, |
| "epoch": 1.2145833333333333, |
| "grad_norm": 2.8755271434783936, |
| "kl": 0.32958984375, |
| "learning_rate": 5.951388888888888e-07, |
| "loss": 0.0003, |
| "reward": 1.75, |
| "reward_std": 0.3335031494498253, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 1.0, |
| "step": 583 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 268.9375, |
| "epoch": 1.2166666666666668, |
| "grad_norm": 2.305840253829956, |
| "kl": 0.37548828125, |
| "learning_rate": 5.944444444444444e-07, |
| "loss": 0.0004, |
| "reward": 1.25, |
| "reward_std": 0.2925042062997818, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 584 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 272.46875, |
| "epoch": 1.21875, |
| "grad_norm": 0.011355455964803696, |
| "kl": 0.33740234375, |
| "learning_rate": 5.937499999999999e-07, |
| "loss": 0.0003, |
| "reward": 0.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.25, |
| "step": 585 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 279.375, |
| "epoch": 1.2208333333333332, |
| "grad_norm": 1.3942476511001587, |
| "kl": 0.3388671875, |
| "learning_rate": 5.930555555555556e-07, |
| "loss": 0.0003, |
| "reward": 1.1875, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.75, |
| "step": 586 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 284.28125, |
| "epoch": 1.2229166666666667, |
| "grad_norm": 1.5005948543548584, |
| "kl": 0.349609375, |
| "learning_rate": 5.923611111111111e-07, |
| "loss": 0.0003, |
| "reward": 1.09375, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.75, |
| "step": 587 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 270.5, |
| "epoch": 1.225, |
| "grad_norm": 0.01924080029129982, |
| "kl": 0.357421875, |
| "learning_rate": 5.916666666666667e-07, |
| "loss": 0.0004, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 588 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 267.71875, |
| "epoch": 1.2270833333333333, |
| "grad_norm": 1.5551037788391113, |
| "kl": 0.951171875, |
| "learning_rate": 5.909722222222222e-07, |
| "loss": 0.001, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 589 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 274.03125, |
| "epoch": 1.2291666666666667, |
| "grad_norm": 1.4413371086120605, |
| "kl": 0.36376953125, |
| "learning_rate": 5.902777777777778e-07, |
| "loss": 0.0004, |
| "reward": 1.4375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.75, |
| "step": 590 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 271.34375, |
| "epoch": 1.23125, |
| "grad_norm": 1.7674862146377563, |
| "kl": 0.373046875, |
| "learning_rate": 5.895833333333333e-07, |
| "loss": 0.0004, |
| "reward": 1.125, |
| "reward_std": 0.13363061845302582, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/format_reward": 0.75, |
| "step": 591 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 273.125, |
| "epoch": 1.2333333333333334, |
| "grad_norm": 8.082505226135254, |
| "kl": 0.3798828125, |
| "learning_rate": 5.888888888888889e-07, |
| "loss": 0.0004, |
| "reward": 1.09375, |
| "reward_std": 0.18600594997406006, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.96875, |
| "step": 592 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 259.65625, |
| "epoch": 1.2354166666666666, |
| "grad_norm": 1.4379527568817139, |
| "kl": 0.39208984375, |
| "learning_rate": 5.881944444444444e-07, |
| "loss": 0.0004, |
| "reward": 1.28125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 1.0, |
| "step": 593 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 258.15625, |
| "epoch": 1.2375, |
| "grad_norm": 0.6245195269584656, |
| "kl": 0.38427734375, |
| "learning_rate": 5.875e-07, |
| "loss": 0.0004, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 594 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 260.8125, |
| "epoch": 1.2395833333333333, |
| "grad_norm": 0.010753520764410496, |
| "kl": 0.36669921875, |
| "learning_rate": 5.868055555555555e-07, |
| "loss": 0.0004, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 595 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 256.78125, |
| "epoch": 1.2416666666666667, |
| "grad_norm": 1.4395649433135986, |
| "kl": 0.3857421875, |
| "learning_rate": 5.861111111111111e-07, |
| "loss": 0.0004, |
| "reward": 0.71875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.5, |
| "step": 596 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 262.0625, |
| "epoch": 1.24375, |
| "grad_norm": 1.6355886459350586, |
| "kl": 0.365234375, |
| "learning_rate": 5.854166666666666e-07, |
| "loss": 0.0004, |
| "reward": 1.71875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 1.0, |
| "step": 597 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 270.3125, |
| "epoch": 1.2458333333333333, |
| "grad_norm": 0.01275028195232153, |
| "kl": 0.3564453125, |
| "learning_rate": 5.847222222222222e-07, |
| "loss": 0.0004, |
| "reward": 0.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.25, |
| "step": 598 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 254.625, |
| "epoch": 1.2479166666666668, |
| "grad_norm": 1.4088735580444336, |
| "kl": 0.37939453125, |
| "learning_rate": 5.840277777777777e-07, |
| "loss": 0.0004, |
| "reward": 0.9375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.5, |
| "step": 599 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 254.25, |
| "epoch": 1.25, |
| "grad_norm": 0.02105838432908058, |
| "kl": 0.3818359375, |
| "learning_rate": 5.833333333333334e-07, |
| "loss": 0.0004, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 600 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 258.0, |
| "epoch": 1.2520833333333332, |
| "grad_norm": 3.5220136642456055, |
| "kl": 0.37451171875, |
| "learning_rate": 5.826388888888889e-07, |
| "loss": 0.0004, |
| "reward": 1.34375, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.75, |
| "step": 601 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 252.4375, |
| "epoch": 1.2541666666666667, |
| "grad_norm": 0.013569949194788933, |
| "kl": 0.375, |
| "learning_rate": 5.819444444444445e-07, |
| "loss": 0.0004, |
| "reward": 0.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.25, |
| "step": 602 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 252.5625, |
| "epoch": 1.25625, |
| "grad_norm": 1.4851642847061157, |
| "kl": 0.3671875, |
| "learning_rate": 5.8125e-07, |
| "loss": 0.0004, |
| "reward": 1.375, |
| "reward_std": 0.13363061845302582, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.75, |
| "step": 603 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 262.8125, |
| "epoch": 1.2583333333333333, |
| "grad_norm": 1.887512445449829, |
| "kl": 0.33935546875, |
| "learning_rate": 5.805555555555556e-07, |
| "loss": 0.0003, |
| "reward": 1.875, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.875, |
| "rewards/format_reward": 1.0, |
| "step": 604 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 252.3125, |
| "epoch": 1.2604166666666667, |
| "grad_norm": 1.4296413660049438, |
| "kl": 0.3642578125, |
| "learning_rate": 5.798611111111111e-07, |
| "loss": 0.0004, |
| "reward": 1.3125, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.75, |
| "step": 605 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 239.375, |
| "epoch": 1.2625, |
| "grad_norm": 0.010707022622227669, |
| "kl": 0.35888671875, |
| "learning_rate": 5.791666666666667e-07, |
| "loss": 0.0004, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 606 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 248.625, |
| "epoch": 1.2645833333333334, |
| "grad_norm": 8.736783981323242, |
| "kl": 0.353515625, |
| "learning_rate": 5.784722222222222e-07, |
| "loss": 0.0004, |
| "reward": 0.625, |
| "reward_std": 0.13363061845302582, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.5, |
| "step": 607 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 252.25, |
| "epoch": 1.2666666666666666, |
| "grad_norm": 0.012715312652289867, |
| "kl": 0.36669921875, |
| "learning_rate": 5.777777777777777e-07, |
| "loss": 0.0004, |
| "reward": 0.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.25, |
| "step": 608 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 249.34375, |
| "epoch": 1.26875, |
| "grad_norm": 1.8922051191329956, |
| "kl": 0.39208984375, |
| "learning_rate": 5.770833333333332e-07, |
| "loss": 0.0004, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 609 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 258.375, |
| "epoch": 1.2708333333333333, |
| "grad_norm": 0.028345325961709023, |
| "kl": 0.36767578125, |
| "learning_rate": 5.763888888888888e-07, |
| "loss": 0.0004, |
| "reward": 1.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 1.0, |
| "step": 610 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 243.96875, |
| "epoch": 1.2729166666666667, |
| "grad_norm": 0.015319216065108776, |
| "kl": 0.37646484375, |
| "learning_rate": 5.756944444444443e-07, |
| "loss": 0.0004, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.75, |
| "step": 611 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 247.65625, |
| "epoch": 1.275, |
| "grad_norm": 2.6803572177886963, |
| "kl": 0.3662109375, |
| "learning_rate": 5.749999999999999e-07, |
| "loss": 0.0004, |
| "reward": 1.21875, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 612 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 265.9375, |
| "epoch": 1.2770833333333333, |
| "grad_norm": 6.572545051574707, |
| "kl": 0.361328125, |
| "learning_rate": 5.743055555555554e-07, |
| "loss": 0.0004, |
| "reward": 1.78125, |
| "reward_std": 0.24511480331420898, |
| "rewards/accuracy_reward": 0.78125, |
| "rewards/format_reward": 1.0, |
| "step": 613 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 246.75, |
| "epoch": 1.2791666666666668, |
| "grad_norm": 2.399791717529297, |
| "kl": 0.3876953125, |
| "learning_rate": 5.73611111111111e-07, |
| "loss": 0.0004, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 1.0, |
| "step": 614 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 268.625, |
| "epoch": 1.28125, |
| "grad_norm": 1.9824419021606445, |
| "kl": 0.388671875, |
| "learning_rate": 5.729166666666667e-07, |
| "loss": 0.0004, |
| "reward": 1.9375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.9375, |
| "rewards/format_reward": 1.0, |
| "step": 615 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 242.3125, |
| "epoch": 1.2833333333333332, |
| "grad_norm": 1.5071178674697876, |
| "kl": 0.3486328125, |
| "learning_rate": 5.722222222222222e-07, |
| "loss": 0.0003, |
| "reward": 1.5625, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 1.0, |
| "step": 616 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 258.65625, |
| "epoch": 1.2854166666666667, |
| "grad_norm": 0.010986674576997757, |
| "kl": 0.3466796875, |
| "learning_rate": 5.715277777777778e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 617 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 254.0625, |
| "epoch": 1.2875, |
| "grad_norm": 1.5354124307632446, |
| "kl": 0.3525390625, |
| "learning_rate": 5.708333333333333e-07, |
| "loss": 0.0004, |
| "reward": 1.78125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.78125, |
| "rewards/format_reward": 1.0, |
| "step": 618 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 244.96875, |
| "epoch": 1.2895833333333333, |
| "grad_norm": 2.0228993892669678, |
| "kl": 0.4052734375, |
| "learning_rate": 5.701388888888889e-07, |
| "loss": 0.0004, |
| "reward": 1.25, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 619 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 251.5625, |
| "epoch": 1.2916666666666667, |
| "grad_norm": 2.1687755584716797, |
| "kl": 0.3505859375, |
| "learning_rate": 5.694444444444444e-07, |
| "loss": 0.0004, |
| "reward": 1.21875, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 620 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 268.8125, |
| "epoch": 1.29375, |
| "grad_norm": 2.1152918338775635, |
| "kl": 0.3662109375, |
| "learning_rate": 5.6875e-07, |
| "loss": 0.0004, |
| "reward": 1.78125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.78125, |
| "rewards/format_reward": 1.0, |
| "step": 621 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 273.78125, |
| "epoch": 1.2958333333333334, |
| "grad_norm": 0.011094697751104832, |
| "kl": 0.35546875, |
| "learning_rate": 5.680555555555555e-07, |
| "loss": 0.0004, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 622 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 249.28125, |
| "epoch": 1.2979166666666666, |
| "grad_norm": 0.02124650590121746, |
| "kl": 0.36181640625, |
| "learning_rate": 5.673611111111111e-07, |
| "loss": 0.0004, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 623 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 267.9375, |
| "epoch": 1.3, |
| "grad_norm": 0.012937084771692753, |
| "kl": 0.35693359375, |
| "learning_rate": 5.666666666666666e-07, |
| "loss": 0.0004, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 624 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 300.34375, |
| "epoch": 1.3020833333333333, |
| "grad_norm": 0.011210695840418339, |
| "kl": 0.32861328125, |
| "learning_rate": 5.659722222222222e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 625 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 290.375, |
| "epoch": 1.3041666666666667, |
| "grad_norm": 0.035728760063648224, |
| "kl": 0.34716796875, |
| "learning_rate": 5.652777777777777e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 626 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 265.625, |
| "epoch": 1.30625, |
| "grad_norm": 1.3605639934539795, |
| "kl": 0.353515625, |
| "learning_rate": 5.645833333333333e-07, |
| "loss": 0.0004, |
| "reward": 1.09375, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.75, |
| "step": 627 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 302.53125, |
| "epoch": 1.3083333333333333, |
| "grad_norm": 0.012040174566209316, |
| "kl": 0.3154296875, |
| "learning_rate": 5.638888888888888e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 628 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 280.96875, |
| "epoch": 1.3104166666666668, |
| "grad_norm": 0.009581932798027992, |
| "kl": 0.3369140625, |
| "learning_rate": 5.631944444444445e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 629 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 298.03125, |
| "epoch": 1.3125, |
| "grad_norm": 3.3300986289978027, |
| "kl": 0.3271484375, |
| "learning_rate": 5.625e-07, |
| "loss": 0.0003, |
| "reward": 1.15625, |
| "reward_std": 0.22201896458864212, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.75, |
| "step": 630 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 287.59375, |
| "epoch": 1.3145833333333332, |
| "grad_norm": 1.4340507984161377, |
| "kl": 0.3232421875, |
| "learning_rate": 5.618055555555556e-07, |
| "loss": 0.0003, |
| "reward": 1.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.96875, |
| "rewards/format_reward": 1.0, |
| "step": 631 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 285.09375, |
| "epoch": 1.3166666666666667, |
| "grad_norm": 0.015347130596637726, |
| "kl": 0.36181640625, |
| "learning_rate": 5.611111111111111e-07, |
| "loss": 0.0004, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 632 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 292.75, |
| "epoch": 1.31875, |
| "grad_norm": 0.16325514018535614, |
| "kl": 0.328125, |
| "learning_rate": 5.604166666666667e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 633 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 286.9375, |
| "epoch": 1.3208333333333333, |
| "grad_norm": 0.013178522698581219, |
| "kl": 0.34130859375, |
| "learning_rate": 5.597222222222222e-07, |
| "loss": 0.0003, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 634 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 299.375, |
| "epoch": 1.3229166666666667, |
| "grad_norm": 1.6165486574172974, |
| "kl": 0.34130859375, |
| "learning_rate": 5.590277777777778e-07, |
| "loss": 0.0003, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 635 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 299.0625, |
| "epoch": 1.325, |
| "grad_norm": 0.013300875201821327, |
| "kl": 0.34521484375, |
| "learning_rate": 5.583333333333333e-07, |
| "loss": 0.0003, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.0, |
| "rewards/format_reward": 1.0, |
| "step": 636 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 283.28125, |
| "epoch": 1.3270833333333334, |
| "grad_norm": 1.5510921478271484, |
| "kl": 0.318359375, |
| "learning_rate": 5.576388888888889e-07, |
| "loss": 0.0003, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 637 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 297.03125, |
| "epoch": 1.3291666666666666, |
| "grad_norm": 1.1744096279144287, |
| "kl": 0.3017578125, |
| "learning_rate": 5.569444444444444e-07, |
| "loss": 0.0003, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 638 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 300.8125, |
| "epoch": 1.33125, |
| "grad_norm": 0.009618040174245834, |
| "kl": 0.3095703125, |
| "learning_rate": 5.5625e-07, |
| "loss": 0.0003, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 639 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 307.09375, |
| "epoch": 1.3333333333333333, |
| "grad_norm": 0.017916277050971985, |
| "kl": 0.3583984375, |
| "learning_rate": 5.555555555555555e-07, |
| "loss": 0.0004, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 640 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 297.53125, |
| "epoch": 1.3354166666666667, |
| "grad_norm": 0.011587453074753284, |
| "kl": 0.32958984375, |
| "learning_rate": 5.548611111111111e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 641 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 304.40625, |
| "epoch": 1.3375, |
| "grad_norm": 1.2505037784576416, |
| "kl": 0.3388671875, |
| "learning_rate": 5.541666666666666e-07, |
| "loss": 0.0003, |
| "reward": 1.3125, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.75, |
| "step": 642 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 306.21875, |
| "epoch": 1.3395833333333333, |
| "grad_norm": 0.009628149680793285, |
| "kl": 0.3154296875, |
| "learning_rate": 5.534722222222223e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 643 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 298.90625, |
| "epoch": 1.3416666666666668, |
| "grad_norm": 0.012987801805138588, |
| "kl": 0.32666015625, |
| "learning_rate": 5.527777777777778e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 644 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 303.75, |
| "epoch": 1.34375, |
| "grad_norm": 0.009420250542461872, |
| "kl": 0.314453125, |
| "learning_rate": 5.520833333333334e-07, |
| "loss": 0.0003, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 645 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 284.8125, |
| "epoch": 1.3458333333333332, |
| "grad_norm": 0.012694254517555237, |
| "kl": 0.3291015625, |
| "learning_rate": 5.513888888888889e-07, |
| "loss": 0.0003, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 646 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 316.40625, |
| "epoch": 1.3479166666666667, |
| "grad_norm": 0.009924824349582195, |
| "kl": 0.30517578125, |
| "learning_rate": 5.506944444444445e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 647 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 318.9375, |
| "epoch": 1.35, |
| "grad_norm": 0.010647875256836414, |
| "kl": 0.31396484375, |
| "learning_rate": 5.5e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 1.0, |
| "step": 648 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 315.1875, |
| "epoch": 1.3520833333333333, |
| "grad_norm": 1.8801367282867432, |
| "kl": 0.310546875, |
| "learning_rate": 5.493055555555556e-07, |
| "loss": 0.0003, |
| "reward": 1.25, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 649 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 323.875, |
| "epoch": 1.3541666666666667, |
| "grad_norm": 1.416724443435669, |
| "kl": 0.306640625, |
| "learning_rate": 5.486111111111111e-07, |
| "loss": 0.0003, |
| "reward": 0.9375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.5, |
| "step": 650 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 316.875, |
| "epoch": 1.35625, |
| "grad_norm": 1.2208101749420166, |
| "kl": 0.31689453125, |
| "learning_rate": 5.479166666666667e-07, |
| "loss": 0.0003, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 651 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 309.75, |
| "epoch": 1.3583333333333334, |
| "grad_norm": 1.3379855155944824, |
| "kl": 0.322265625, |
| "learning_rate": 5.472222222222222e-07, |
| "loss": 0.0003, |
| "reward": 1.4375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.75, |
| "step": 652 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 301.28125, |
| "epoch": 1.3604166666666666, |
| "grad_norm": 0.012338054366409779, |
| "kl": 0.32177734375, |
| "learning_rate": 5.465277777777777e-07, |
| "loss": 0.0003, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.0, |
| "rewards/format_reward": 1.0, |
| "step": 653 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 308.125, |
| "epoch": 1.3625, |
| "grad_norm": 1.8774542808532715, |
| "kl": 0.31494140625, |
| "learning_rate": 5.458333333333332e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.75, |
| "step": 654 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 324.125, |
| "epoch": 1.3645833333333333, |
| "grad_norm": 1.324816107749939, |
| "kl": 0.30126953125, |
| "learning_rate": 5.451388888888888e-07, |
| "loss": 0.0003, |
| "reward": 0.3125, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.25, |
| "step": 655 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 308.375, |
| "epoch": 1.3666666666666667, |
| "grad_norm": 2.735501766204834, |
| "kl": 0.30419921875, |
| "learning_rate": 5.444444444444443e-07, |
| "loss": 0.0003, |
| "reward": 1.0625, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/format_reward": 0.75, |
| "step": 656 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 322.0625, |
| "epoch": 1.36875, |
| "grad_norm": 4.300278663635254, |
| "kl": 0.333984375, |
| "learning_rate": 5.4375e-07, |
| "loss": 0.0003, |
| "reward": 1.6875, |
| "reward_std": 0.2925042062997818, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 1.0, |
| "step": 657 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 314.375, |
| "epoch": 1.3708333333333333, |
| "grad_norm": 0.017509862780570984, |
| "kl": 0.30517578125, |
| "learning_rate": 5.430555555555555e-07, |
| "loss": 0.0003, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 658 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 295.53125, |
| "epoch": 1.3729166666666668, |
| "grad_norm": 0.011455986648797989, |
| "kl": 0.33056640625, |
| "learning_rate": 5.423611111111111e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 659 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 313.8125, |
| "epoch": 1.375, |
| "grad_norm": 3.3278894424438477, |
| "kl": 0.326171875, |
| "learning_rate": 5.416666666666666e-07, |
| "loss": 0.0003, |
| "reward": 1.375, |
| "reward_std": 0.2314550280570984, |
| "rewards/accuracy_reward": 0.625, |
| "rewards/format_reward": 0.75, |
| "step": 660 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 311.125, |
| "epoch": 1.3770833333333332, |
| "grad_norm": 1.3124842643737793, |
| "kl": 0.31884765625, |
| "learning_rate": 5.409722222222222e-07, |
| "loss": 0.0003, |
| "reward": 1.21875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 661 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 323.625, |
| "epoch": 1.3791666666666667, |
| "grad_norm": 6.085054874420166, |
| "kl": 0.29736328125, |
| "learning_rate": 5.402777777777777e-07, |
| "loss": 0.0003, |
| "reward": 1.25, |
| "reward_std": 0.2925042062997818, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 1.0, |
| "step": 662 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 306.3125, |
| "epoch": 1.38125, |
| "grad_norm": 0.010650291107594967, |
| "kl": 0.30908203125, |
| "learning_rate": 5.395833333333333e-07, |
| "loss": 0.0003, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 663 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 345.1875, |
| "epoch": 1.3833333333333333, |
| "grad_norm": 0.019253544509410858, |
| "kl": 0.3056640625, |
| "learning_rate": 5.388888888888888e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 664 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 313.0625, |
| "epoch": 1.3854166666666667, |
| "grad_norm": 1.2056688070297241, |
| "kl": 0.29931640625, |
| "learning_rate": 5.381944444444444e-07, |
| "loss": 0.0003, |
| "reward": 0.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.5, |
| "step": 665 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 358.15625, |
| "epoch": 1.3875, |
| "grad_norm": 0.012783597223460674, |
| "kl": 0.31298828125, |
| "learning_rate": 5.374999999999999e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 666 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 316.28125, |
| "epoch": 1.3895833333333334, |
| "grad_norm": 2.7306346893310547, |
| "kl": 0.3095703125, |
| "learning_rate": 5.368055555555555e-07, |
| "loss": 0.0003, |
| "reward": 1.40625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.75, |
| "step": 667 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 329.71875, |
| "epoch": 1.3916666666666666, |
| "grad_norm": 1.4300445318222046, |
| "kl": 0.2900390625, |
| "learning_rate": 5.36111111111111e-07, |
| "loss": 0.0003, |
| "reward": 1.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.96875, |
| "rewards/format_reward": 1.0, |
| "step": 668 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 330.03125, |
| "epoch": 1.39375, |
| "grad_norm": 5.7081193923950195, |
| "kl": 0.28271484375, |
| "learning_rate": 5.354166666666666e-07, |
| "loss": 0.0003, |
| "reward": 0.875, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/format_reward": 0.5, |
| "step": 669 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 334.9375, |
| "epoch": 1.3958333333333333, |
| "grad_norm": 1.2781238555908203, |
| "kl": 0.31591796875, |
| "learning_rate": 5.347222222222221e-07, |
| "loss": 0.0003, |
| "reward": 1.4375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.75, |
| "step": 670 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 354.84375, |
| "epoch": 1.3979166666666667, |
| "grad_norm": 1.4650616645812988, |
| "kl": 0.29638671875, |
| "learning_rate": 5.340277777777778e-07, |
| "loss": 0.0003, |
| "reward": 0.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.25, |
| "step": 671 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 338.78125, |
| "epoch": 1.4, |
| "grad_norm": 0.009217875078320503, |
| "kl": 0.30224609375, |
| "learning_rate": 5.333333333333333e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 672 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 324.53125, |
| "epoch": 1.4020833333333333, |
| "grad_norm": 1.2819490432739258, |
| "kl": 0.33154296875, |
| "learning_rate": 5.326388888888889e-07, |
| "loss": 0.0003, |
| "reward": 1.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.96875, |
| "rewards/format_reward": 1.0, |
| "step": 673 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 323.34375, |
| "epoch": 1.4041666666666668, |
| "grad_norm": 0.014084805734455585, |
| "kl": 0.32080078125, |
| "learning_rate": 5.319444444444444e-07, |
| "loss": 0.0003, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 674 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 349.9375, |
| "epoch": 1.40625, |
| "grad_norm": 0.008332760073244572, |
| "kl": 0.28271484375, |
| "learning_rate": 5.3125e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 675 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 325.9375, |
| "epoch": 1.4083333333333332, |
| "grad_norm": 0.018420930951833725, |
| "kl": 0.32373046875, |
| "learning_rate": 5.305555555555555e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 676 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 332.625, |
| "epoch": 1.4104166666666667, |
| "grad_norm": 1.9230810403823853, |
| "kl": 0.30126953125, |
| "learning_rate": 5.298611111111111e-07, |
| "loss": 0.0003, |
| "reward": 1.1875, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.75, |
| "step": 677 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 361.53125, |
| "epoch": 1.4125, |
| "grad_norm": 0.009829960763454437, |
| "kl": 0.2998046875, |
| "learning_rate": 5.291666666666666e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 678 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 350.1875, |
| "epoch": 1.4145833333333333, |
| "grad_norm": 0.009316562674939632, |
| "kl": 0.2978515625, |
| "learning_rate": 5.284722222222222e-07, |
| "loss": 0.0003, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.0, |
| "rewards/format_reward": 1.0, |
| "step": 679 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 373.125, |
| "epoch": 1.4166666666666667, |
| "grad_norm": 0.010273626074194908, |
| "kl": 0.28125, |
| "learning_rate": 5.277777777777777e-07, |
| "loss": 0.0003, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 680 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 340.875, |
| "epoch": 1.41875, |
| "grad_norm": 0.018658263608813286, |
| "kl": 0.3154296875, |
| "learning_rate": 5.270833333333333e-07, |
| "loss": 0.0003, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 681 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 374.53125, |
| "epoch": 1.4208333333333334, |
| "grad_norm": 1.121277928352356, |
| "kl": 0.29736328125, |
| "learning_rate": 5.263888888888888e-07, |
| "loss": 0.0003, |
| "reward": 1.28125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.75, |
| "step": 682 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 384.21875, |
| "epoch": 1.4229166666666666, |
| "grad_norm": 0.0887700691819191, |
| "kl": 0.2900390625, |
| "learning_rate": 5.256944444444444e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 683 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 371.84375, |
| "epoch": 1.425, |
| "grad_norm": 0.009055635891854763, |
| "kl": 0.2861328125, |
| "learning_rate": 5.25e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 684 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 374.34375, |
| "epoch": 1.4270833333333333, |
| "grad_norm": 1.2657846212387085, |
| "kl": 0.296875, |
| "learning_rate": 5.243055555555556e-07, |
| "loss": 0.0003, |
| "reward": 1.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.96875, |
| "rewards/format_reward": 1.0, |
| "step": 685 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 359.65625, |
| "epoch": 1.4291666666666667, |
| "grad_norm": 1.931689739227295, |
| "kl": 0.28369140625, |
| "learning_rate": 5.236111111111112e-07, |
| "loss": 0.0003, |
| "reward": 1.09375, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.75, |
| "step": 686 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 393.28125, |
| "epoch": 1.43125, |
| "grad_norm": 1.098535418510437, |
| "kl": 0.25341796875, |
| "learning_rate": 5.229166666666667e-07, |
| "loss": 0.0003, |
| "reward": 1.4375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.75, |
| "step": 687 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 382.15625, |
| "epoch": 1.4333333333333333, |
| "grad_norm": 0.008603488095104694, |
| "kl": 0.28125, |
| "learning_rate": 5.222222222222223e-07, |
| "loss": 0.0003, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 688 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 412.46875, |
| "epoch": 1.4354166666666668, |
| "grad_norm": 1.1769787073135376, |
| "kl": 0.254150390625, |
| "learning_rate": 5.215277777777778e-07, |
| "loss": 0.0003, |
| "reward": 1.84375, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.84375, |
| "rewards/format_reward": 1.0, |
| "step": 689 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 412.78125, |
| "epoch": 1.4375, |
| "grad_norm": 0.009760401211678982, |
| "kl": 0.2783203125, |
| "learning_rate": 5.208333333333334e-07, |
| "loss": 0.0003, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 690 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 410.53125, |
| "epoch": 1.4395833333333332, |
| "grad_norm": 0.009675565175712109, |
| "kl": 0.26318359375, |
| "learning_rate": 5.201388888888889e-07, |
| "loss": 0.0003, |
| "reward": 1.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 1.0, |
| "step": 691 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 391.59375, |
| "epoch": 1.4416666666666667, |
| "grad_norm": 0.03820687532424927, |
| "kl": 0.260498046875, |
| "learning_rate": 5.194444444444445e-07, |
| "loss": 0.0003, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.0, |
| "rewards/format_reward": 1.0, |
| "step": 692 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 440.03125, |
| "epoch": 1.44375, |
| "grad_norm": 0.00820316094905138, |
| "kl": 0.249755859375, |
| "learning_rate": 5.1875e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 693 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 426.1875, |
| "epoch": 1.4458333333333333, |
| "grad_norm": 0.00827599223703146, |
| "kl": 0.254638671875, |
| "learning_rate": 5.180555555555556e-07, |
| "loss": 0.0003, |
| "reward": 0.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.25, |
| "step": 694 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 413.3125, |
| "epoch": 1.4479166666666667, |
| "grad_norm": 1.2021926641464233, |
| "kl": 0.2626953125, |
| "learning_rate": 5.173611111111111e-07, |
| "loss": 0.0003, |
| "reward": 1.03125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.75, |
| "step": 695 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 399.59375, |
| "epoch": 1.45, |
| "grad_norm": 2.0005180835723877, |
| "kl": 0.2646484375, |
| "learning_rate": 5.166666666666667e-07, |
| "loss": 0.0003, |
| "reward": 1.125, |
| "reward_std": 0.2925042062997818, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.71875, |
| "step": 696 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 451.5, |
| "epoch": 1.4520833333333334, |
| "grad_norm": 1.7120201587677002, |
| "kl": 0.232177734375, |
| "learning_rate": 5.159722222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 697 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 410.46875, |
| "epoch": 1.4541666666666666, |
| "grad_norm": 4.130648136138916, |
| "kl": 0.2626953125, |
| "learning_rate": 5.152777777777777e-07, |
| "loss": 0.0003, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 698 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 443.03125, |
| "epoch": 1.45625, |
| "grad_norm": 0.008720851503312588, |
| "kl": 0.248779296875, |
| "learning_rate": 5.145833333333332e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 699 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 435.875, |
| "epoch": 1.4583333333333333, |
| "grad_norm": 0.007419127505272627, |
| "kl": 0.248046875, |
| "learning_rate": 5.138888888888889e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 700 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 458.8125, |
| "epoch": 1.4604166666666667, |
| "grad_norm": 0.010546655394136906, |
| "kl": 0.256591796875, |
| "learning_rate": 5.131944444444444e-07, |
| "loss": 0.0003, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 701 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 458.5625, |
| "epoch": 1.4625, |
| "grad_norm": 0.007757688872516155, |
| "kl": 0.258544921875, |
| "learning_rate": 5.125e-07, |
| "loss": 0.0003, |
| "reward": 0.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.25, |
| "step": 702 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 443.3125, |
| "epoch": 1.4645833333333333, |
| "grad_norm": 1.2824113368988037, |
| "kl": 0.240234375, |
| "learning_rate": 5.118055555555555e-07, |
| "loss": 0.0002, |
| "reward": 1.78125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.78125, |
| "rewards/format_reward": 1.0, |
| "step": 703 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 440.53125, |
| "epoch": 1.4666666666666668, |
| "grad_norm": 0.010234753601253033, |
| "kl": 0.2685546875, |
| "learning_rate": 5.111111111111111e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 704 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 449.8125, |
| "epoch": 1.46875, |
| "grad_norm": 0.020811883732676506, |
| "kl": 0.262939453125, |
| "learning_rate": 5.104166666666666e-07, |
| "loss": 0.0003, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 705 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 480.28125, |
| "epoch": 1.4708333333333332, |
| "grad_norm": 0.017371410503983498, |
| "kl": 0.24267578125, |
| "learning_rate": 5.097222222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 1.0, |
| "step": 706 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 446.5, |
| "epoch": 1.4729166666666667, |
| "grad_norm": 0.008478788658976555, |
| "kl": 0.246826171875, |
| "learning_rate": 5.090277777777777e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 707 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 444.0625, |
| "epoch": 1.475, |
| "grad_norm": 1.548750877380371, |
| "kl": 0.252685546875, |
| "learning_rate": 5.083333333333333e-07, |
| "loss": 0.0003, |
| "reward": 1.40625, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.75, |
| "step": 708 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 460.4375, |
| "epoch": 1.4770833333333333, |
| "grad_norm": 1.1799025535583496, |
| "kl": 0.25927734375, |
| "learning_rate": 5.076388888888888e-07, |
| "loss": 0.0003, |
| "reward": 1.28125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.75, |
| "step": 709 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 437.78125, |
| "epoch": 1.4791666666666667, |
| "grad_norm": 7.808667182922363, |
| "kl": 0.256591796875, |
| "learning_rate": 5.069444444444444e-07, |
| "loss": 0.0003, |
| "reward": 1.03125, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.75, |
| "step": 710 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 416.9375, |
| "epoch": 1.48125, |
| "grad_norm": 0.009599937126040459, |
| "kl": 0.27587890625, |
| "learning_rate": 5.062499999999999e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 711 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 449.3125, |
| "epoch": 1.4833333333333334, |
| "grad_norm": 0.014995035715401173, |
| "kl": 0.254638671875, |
| "learning_rate": 5.055555555555555e-07, |
| "loss": 0.0003, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.0, |
| "rewards/format_reward": 1.0, |
| "step": 712 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 439.21875, |
| "epoch": 1.4854166666666666, |
| "grad_norm": 0.008672283962368965, |
| "kl": 0.25390625, |
| "learning_rate": 5.04861111111111e-07, |
| "loss": 0.0003, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.0, |
| "step": 713 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 426.34375, |
| "epoch": 1.4875, |
| "grad_norm": 1.2915209531784058, |
| "kl": 0.2451171875, |
| "learning_rate": 5.041666666666667e-07, |
| "loss": 0.0002, |
| "reward": 1.4375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.75, |
| "step": 714 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 427.6875, |
| "epoch": 1.4895833333333333, |
| "grad_norm": 0.011465529911220074, |
| "kl": 0.26611328125, |
| "learning_rate": 5.034722222222222e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 715 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 435.0, |
| "epoch": 1.4916666666666667, |
| "grad_norm": 1.1036456823349, |
| "kl": 0.28173828125, |
| "learning_rate": 5.027777777777778e-07, |
| "loss": 0.0003, |
| "reward": 0.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.5, |
| "step": 716 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 463.15625, |
| "epoch": 1.49375, |
| "grad_norm": 1.2302424907684326, |
| "kl": 0.2587890625, |
| "learning_rate": 5.020833333333333e-07, |
| "loss": 0.0003, |
| "reward": 1.8125, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.8125, |
| "rewards/format_reward": 1.0, |
| "step": 717 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 453.78125, |
| "epoch": 1.4958333333333333, |
| "grad_norm": 2.1474807262420654, |
| "kl": 0.240966796875, |
| "learning_rate": 5.013888888888889e-07, |
| "loss": 0.0002, |
| "reward": 1.4375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.75, |
| "step": 718 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 398.46875, |
| "epoch": 1.4979166666666668, |
| "grad_norm": 0.00977454986423254, |
| "kl": 0.26806640625, |
| "learning_rate": 5.006944444444444e-07, |
| "loss": 0.0003, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 719 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 419.625, |
| "epoch": 1.5, |
| "grad_norm": 1.7190418243408203, |
| "kl": 0.2734375, |
| "learning_rate": 5e-07, |
| "loss": 0.0003, |
| "reward": 0.71875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.5, |
| "step": 720 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 418.5, |
| "epoch": 1.5020833333333332, |
| "grad_norm": 1.1904847621917725, |
| "kl": 0.272705078125, |
| "learning_rate": 4.993055555555555e-07, |
| "loss": 0.0003, |
| "reward": 1.28125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.75, |
| "step": 721 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 419.1875, |
| "epoch": 1.5041666666666667, |
| "grad_norm": 0.009021605364978313, |
| "kl": 0.261474609375, |
| "learning_rate": 4.986111111111111e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 722 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 418.375, |
| "epoch": 1.50625, |
| "grad_norm": 2.251453399658203, |
| "kl": 0.26171875, |
| "learning_rate": 4.979166666666666e-07, |
| "loss": 0.0003, |
| "reward": 1.03125, |
| "reward_std": 0.2651650384068489, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.75, |
| "step": 723 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 423.65625, |
| "epoch": 1.5083333333333333, |
| "grad_norm": 0.008535100147128105, |
| "kl": 0.2646484375, |
| "learning_rate": 4.972222222222222e-07, |
| "loss": 0.0003, |
| "reward": 0.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.25, |
| "step": 724 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 439.71875, |
| "epoch": 1.5104166666666665, |
| "grad_norm": 1.3646361827850342, |
| "kl": 0.249755859375, |
| "learning_rate": 4.965277777777777e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 725 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 392.09375, |
| "epoch": 1.5125, |
| "grad_norm": 1.2811659574508667, |
| "kl": 0.275390625, |
| "learning_rate": 4.958333333333333e-07, |
| "loss": 0.0003, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 726 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 428.65625, |
| "epoch": 1.5145833333333334, |
| "grad_norm": 1.3191869258880615, |
| "kl": 0.2470703125, |
| "learning_rate": 4.951388888888889e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 727 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 433.78125, |
| "epoch": 1.5166666666666666, |
| "grad_norm": 0.009647144004702568, |
| "kl": 0.262451171875, |
| "learning_rate": 4.944444444444445e-07, |
| "loss": 0.0003, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 728 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 408.34375, |
| "epoch": 1.51875, |
| "grad_norm": 1.3683720827102661, |
| "kl": 0.263671875, |
| "learning_rate": 4.9375e-07, |
| "loss": 0.0003, |
| "reward": 1.34375, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.75, |
| "step": 729 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 445.59375, |
| "epoch": 1.5208333333333335, |
| "grad_norm": 1.4309377670288086, |
| "kl": 0.263427734375, |
| "learning_rate": 4.930555555555556e-07, |
| "loss": 0.0003, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 730 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 426.78125, |
| "epoch": 1.5229166666666667, |
| "grad_norm": 0.009884395636618137, |
| "kl": 0.243408203125, |
| "learning_rate": 4.923611111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 731 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 404.0625, |
| "epoch": 1.525, |
| "grad_norm": 0.011238432489335537, |
| "kl": 0.274169921875, |
| "learning_rate": 4.916666666666666e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 732 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 428.6875, |
| "epoch": 1.5270833333333333, |
| "grad_norm": 0.008788060396909714, |
| "kl": 0.25244140625, |
| "learning_rate": 4.909722222222221e-07, |
| "loss": 0.0003, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.0, |
| "rewards/format_reward": 1.0, |
| "step": 733 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 448.5625, |
| "epoch": 1.5291666666666668, |
| "grad_norm": 0.026375016197562218, |
| "kl": 0.2587890625, |
| "learning_rate": 4.902777777777777e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 734 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 418.0, |
| "epoch": 1.53125, |
| "grad_norm": 0.00788712315261364, |
| "kl": 0.2666015625, |
| "learning_rate": 4.895833333333333e-07, |
| "loss": 0.0003, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 735 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 436.71875, |
| "epoch": 1.5333333333333332, |
| "grad_norm": 0.012409915216267109, |
| "kl": 0.248046875, |
| "learning_rate": 4.888888888888889e-07, |
| "loss": 0.0002, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.0, |
| "rewards/format_reward": 1.0, |
| "step": 736 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 405.75, |
| "epoch": 1.5354166666666667, |
| "grad_norm": 1.6673911809921265, |
| "kl": 0.268310546875, |
| "learning_rate": 4.881944444444444e-07, |
| "loss": 0.0003, |
| "reward": 1.21875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 737 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 426.15625, |
| "epoch": 1.5375, |
| "grad_norm": 14.521210670471191, |
| "kl": 0.26953125, |
| "learning_rate": 4.875e-07, |
| "loss": 0.0003, |
| "reward": 0.71875, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.5, |
| "step": 738 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 402.6875, |
| "epoch": 1.5395833333333333, |
| "grad_norm": 0.008589212782680988, |
| "kl": 0.271484375, |
| "learning_rate": 4.868055555555555e-07, |
| "loss": 0.0003, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.0, |
| "rewards/format_reward": 1.0, |
| "step": 739 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 439.5625, |
| "epoch": 1.5416666666666665, |
| "grad_norm": 0.00834252592176199, |
| "kl": 0.24853515625, |
| "learning_rate": 4.861111111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 740 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 456.375, |
| "epoch": 1.54375, |
| "grad_norm": 0.0074338242411613464, |
| "kl": 0.25732421875, |
| "learning_rate": 4.854166666666666e-07, |
| "loss": 0.0003, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 741 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 439.125, |
| "epoch": 1.5458333333333334, |
| "grad_norm": 0.00910673663020134, |
| "kl": 0.260986328125, |
| "learning_rate": 4.847222222222222e-07, |
| "loss": 0.0003, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 742 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 439.125, |
| "epoch": 1.5479166666666666, |
| "grad_norm": 1.3869645595550537, |
| "kl": 0.267333984375, |
| "learning_rate": 4.840277777777777e-07, |
| "loss": 0.0003, |
| "reward": 0.71875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.5, |
| "step": 743 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 405.84375, |
| "epoch": 1.55, |
| "grad_norm": 2.4322197437286377, |
| "kl": 0.267822265625, |
| "learning_rate": 4.833333333333333e-07, |
| "loss": 0.0003, |
| "reward": 1.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.96875, |
| "rewards/format_reward": 1.0, |
| "step": 744 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 429.34375, |
| "epoch": 1.5520833333333335, |
| "grad_norm": 1.4897021055221558, |
| "kl": 0.237548828125, |
| "learning_rate": 4.826388888888888e-07, |
| "loss": 0.0002, |
| "reward": 1.21875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 745 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 453.5625, |
| "epoch": 1.5541666666666667, |
| "grad_norm": 1.5762288570404053, |
| "kl": 0.23681640625, |
| "learning_rate": 4.819444444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.28125, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.75, |
| "step": 746 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 444.53125, |
| "epoch": 1.55625, |
| "grad_norm": 0.022547965869307518, |
| "kl": 0.251220703125, |
| "learning_rate": 4.812499999999999e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 747 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 460.375, |
| "epoch": 1.5583333333333333, |
| "grad_norm": 0.009397338144481182, |
| "kl": 0.25732421875, |
| "learning_rate": 4.805555555555555e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 748 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 452.625, |
| "epoch": 1.5604166666666668, |
| "grad_norm": 0.013823941349983215, |
| "kl": 0.2705078125, |
| "learning_rate": 4.798611111111112e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 749 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 467.40625, |
| "epoch": 1.5625, |
| "grad_norm": 0.008257027715444565, |
| "kl": 0.24951171875, |
| "learning_rate": 4.791666666666667e-07, |
| "loss": 0.0002, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 750 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 464.71875, |
| "epoch": 1.5645833333333332, |
| "grad_norm": 1.193889856338501, |
| "kl": 0.254150390625, |
| "learning_rate": 4.784722222222223e-07, |
| "loss": 0.0003, |
| "reward": 1.71875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 1.0, |
| "step": 751 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 451.96875, |
| "epoch": 1.5666666666666667, |
| "grad_norm": 1.2460548877716064, |
| "kl": 0.25048828125, |
| "learning_rate": 4.777777777777778e-07, |
| "loss": 0.0003, |
| "reward": 0.9375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.5, |
| "step": 752 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 510.15625, |
| "epoch": 1.56875, |
| "grad_norm": 1.2181428670883179, |
| "kl": 0.24365234375, |
| "learning_rate": 4.770833333333334e-07, |
| "loss": 0.0002, |
| "reward": 1.6875, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 1.0, |
| "step": 753 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 485.40625, |
| "epoch": 1.5708333333333333, |
| "grad_norm": 0.007452361285686493, |
| "kl": 0.2451171875, |
| "learning_rate": 4.7638888888888885e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 754 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 523.6875, |
| "epoch": 1.5729166666666665, |
| "grad_norm": 1.5803804397583008, |
| "kl": 0.23583984375, |
| "learning_rate": 4.756944444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.4375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.75, |
| "step": 755 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 499.4375, |
| "epoch": 1.575, |
| "grad_norm": 1.030873417854309, |
| "kl": 0.259765625, |
| "learning_rate": 4.7499999999999995e-07, |
| "loss": 0.0003, |
| "reward": 1.71875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 1.0, |
| "step": 756 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 498.875, |
| "epoch": 1.5770833333333334, |
| "grad_norm": 0.008705949410796165, |
| "kl": 0.248779296875, |
| "learning_rate": 4.743055555555555e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 757 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 474.09375, |
| "epoch": 1.5791666666666666, |
| "grad_norm": 1.3716362714767456, |
| "kl": 0.236328125, |
| "learning_rate": 4.7361111111111105e-07, |
| "loss": 0.0002, |
| "reward": 0.90625, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.71875, |
| "step": 758 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 477.0, |
| "epoch": 1.58125, |
| "grad_norm": 0.007176287472248077, |
| "kl": 0.23779296875, |
| "learning_rate": 4.7291666666666666e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 759 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 482.21875, |
| "epoch": 1.5833333333333335, |
| "grad_norm": 1.3382846117019653, |
| "kl": 0.238525390625, |
| "learning_rate": 4.722222222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.9375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.9375, |
| "rewards/format_reward": 1.0, |
| "step": 760 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 522.5625, |
| "epoch": 1.5854166666666667, |
| "grad_norm": 0.007611650042235851, |
| "kl": 0.23583984375, |
| "learning_rate": 4.7152777777777776e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 1.0, |
| "step": 761 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 567.78125, |
| "epoch": 1.5875, |
| "grad_norm": 0.4260139465332031, |
| "kl": 0.2158203125, |
| "learning_rate": 4.708333333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.1875, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.71875, |
| "step": 762 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 502.65625, |
| "epoch": 1.5895833333333333, |
| "grad_norm": 0.011225441470742226, |
| "kl": 0.253173828125, |
| "learning_rate": 4.7013888888888886e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 763 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 500.5, |
| "epoch": 1.5916666666666668, |
| "grad_norm": 0.008185843005776405, |
| "kl": 0.244140625, |
| "learning_rate": 4.694444444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 764 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 456.3125, |
| "epoch": 1.59375, |
| "grad_norm": 1.2148336172103882, |
| "kl": 0.236572265625, |
| "learning_rate": 4.6874999999999996e-07, |
| "loss": 0.0002, |
| "reward": 0.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.5, |
| "step": 765 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 485.75, |
| "epoch": 1.5958333333333332, |
| "grad_norm": 0.009695029817521572, |
| "kl": 0.255859375, |
| "learning_rate": 4.6805555555555556e-07, |
| "loss": 0.0003, |
| "reward": 1.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 1.0, |
| "step": 766 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 477.15625, |
| "epoch": 1.5979166666666667, |
| "grad_norm": 1.706971287727356, |
| "kl": 0.252685546875, |
| "learning_rate": 4.673611111111111e-07, |
| "loss": 0.0003, |
| "reward": 1.40625, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.75, |
| "step": 767 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 520.9375, |
| "epoch": 1.6, |
| "grad_norm": 0.025292934849858284, |
| "kl": 0.259765625, |
| "learning_rate": 4.6666666666666666e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 768 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 490.34375, |
| "epoch": 1.6020833333333333, |
| "grad_norm": 1.5667407512664795, |
| "kl": 0.23681640625, |
| "learning_rate": 4.659722222222222e-07, |
| "loss": 0.0002, |
| "reward": 0.90625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.5, |
| "step": 769 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 460.59375, |
| "epoch": 1.6041666666666665, |
| "grad_norm": 0.011945800855755806, |
| "kl": 0.24755859375, |
| "learning_rate": 4.6527777777777776e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 770 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 504.65625, |
| "epoch": 1.60625, |
| "grad_norm": 0.009756573475897312, |
| "kl": 0.246826171875, |
| "learning_rate": 4.645833333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 771 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 472.53125, |
| "epoch": 1.6083333333333334, |
| "grad_norm": 4.669853687286377, |
| "kl": 0.27197265625, |
| "learning_rate": 4.6388888888888886e-07, |
| "loss": 0.0003, |
| "reward": 1.1875, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.75, |
| "step": 772 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 482.53125, |
| "epoch": 1.6104166666666666, |
| "grad_norm": 0.007559712044894695, |
| "kl": 0.246337890625, |
| "learning_rate": 4.6319444444444447e-07, |
| "loss": 0.0002, |
| "reward": 1.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 1.0, |
| "step": 773 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 485.75, |
| "epoch": 1.6125, |
| "grad_norm": 0.01486685499548912, |
| "kl": 0.25146484375, |
| "learning_rate": 4.625e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 774 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 468.71875, |
| "epoch": 1.6145833333333335, |
| "grad_norm": 0.009248960763216019, |
| "kl": 0.2490234375, |
| "learning_rate": 4.6180555555555557e-07, |
| "loss": 0.0002, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 775 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 473.84375, |
| "epoch": 1.6166666666666667, |
| "grad_norm": 0.010325673967599869, |
| "kl": 0.248046875, |
| "learning_rate": 4.611111111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 776 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 479.3125, |
| "epoch": 1.61875, |
| "grad_norm": 1.281834602355957, |
| "kl": 0.23974609375, |
| "learning_rate": 4.604166666666666e-07, |
| "loss": 0.0002, |
| "reward": 1.78125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.78125, |
| "rewards/format_reward": 1.0, |
| "step": 777 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 462.375, |
| "epoch": 1.6208333333333333, |
| "grad_norm": 1.2257119417190552, |
| "kl": 0.25634765625, |
| "learning_rate": 4.5972222222222217e-07, |
| "loss": 0.0003, |
| "reward": 0.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.25, |
| "step": 778 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 467.65625, |
| "epoch": 1.6229166666666668, |
| "grad_norm": 1.4265564680099487, |
| "kl": 0.245849609375, |
| "learning_rate": 4.590277777777777e-07, |
| "loss": 0.0002, |
| "reward": 1.21875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 779 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 494.78125, |
| "epoch": 1.625, |
| "grad_norm": 0.008513258770108223, |
| "kl": 0.253662109375, |
| "learning_rate": 4.5833333333333327e-07, |
| "loss": 0.0003, |
| "reward": 0.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.25, |
| "step": 780 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 476.125, |
| "epoch": 1.6270833333333332, |
| "grad_norm": 1.229500412940979, |
| "kl": 0.24609375, |
| "learning_rate": 4.5763888888888887e-07, |
| "loss": 0.0002, |
| "reward": 1.3125, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.75, |
| "step": 781 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 476.46875, |
| "epoch": 1.6291666666666667, |
| "grad_norm": 1.1059486865997314, |
| "kl": 0.248779296875, |
| "learning_rate": 4.569444444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.21875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 782 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 472.34375, |
| "epoch": 1.63125, |
| "grad_norm": 0.015273602679371834, |
| "kl": 0.265869140625, |
| "learning_rate": 4.5624999999999997e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.75, |
| "step": 783 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 465.9375, |
| "epoch": 1.6333333333333333, |
| "grad_norm": 0.009410897269845009, |
| "kl": 0.261962890625, |
| "learning_rate": 4.555555555555555e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 1.0, |
| "step": 784 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 498.96875, |
| "epoch": 1.6354166666666665, |
| "grad_norm": 1.280187726020813, |
| "kl": 0.245361328125, |
| "learning_rate": 4.548611111111111e-07, |
| "loss": 0.0002, |
| "reward": 0.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.25, |
| "step": 785 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 495.65625, |
| "epoch": 1.6375, |
| "grad_norm": 0.017177268862724304, |
| "kl": 0.235595703125, |
| "learning_rate": 4.541666666666666e-07, |
| "loss": 0.0002, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 786 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 437.84375, |
| "epoch": 1.6395833333333334, |
| "grad_norm": 0.008518899790942669, |
| "kl": 0.239501953125, |
| "learning_rate": 4.534722222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 787 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 506.125, |
| "epoch": 1.6416666666666666, |
| "grad_norm": 0.008767511695623398, |
| "kl": 0.2353515625, |
| "learning_rate": 4.527777777777778e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 788 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 489.53125, |
| "epoch": 1.64375, |
| "grad_norm": 1.5510228872299194, |
| "kl": 0.24658203125, |
| "learning_rate": 4.5208333333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.40625, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.65625, |
| "rewards/format_reward": 0.75, |
| "step": 789 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 491.375, |
| "epoch": 1.6458333333333335, |
| "grad_norm": 2.2926583290100098, |
| "kl": 0.24755859375, |
| "learning_rate": 4.513888888888889e-07, |
| "loss": 0.0002, |
| "reward": 0.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.5, |
| "step": 790 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 537.40625, |
| "epoch": 1.6479166666666667, |
| "grad_norm": 0.01068432629108429, |
| "kl": 0.226806640625, |
| "learning_rate": 4.5069444444444443e-07, |
| "loss": 0.0002, |
| "reward": 1.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 1.0, |
| "step": 791 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 480.5, |
| "epoch": 1.65, |
| "grad_norm": 0.007735088467597961, |
| "kl": 0.244873046875, |
| "learning_rate": 4.5e-07, |
| "loss": 0.0002, |
| "reward": 1.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 1.0, |
| "step": 792 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 524.65625, |
| "epoch": 1.6520833333333333, |
| "grad_norm": 0.9811253547668457, |
| "kl": 0.242431640625, |
| "learning_rate": 4.4930555555555553e-07, |
| "loss": 0.0002, |
| "reward": 0.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.5, |
| "step": 793 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 508.4375, |
| "epoch": 1.6541666666666668, |
| "grad_norm": 1.101609468460083, |
| "kl": 0.236328125, |
| "learning_rate": 4.486111111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 1.0, |
| "step": 794 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 544.1875, |
| "epoch": 1.65625, |
| "grad_norm": 10.281743049621582, |
| "kl": 0.244873046875, |
| "learning_rate": 4.479166666666667e-07, |
| "loss": 0.0002, |
| "reward": 1.6875, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 1.0, |
| "step": 795 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 530.8125, |
| "epoch": 1.6583333333333332, |
| "grad_norm": 1.3257184028625488, |
| "kl": 0.244384765625, |
| "learning_rate": 4.4722222222222223e-07, |
| "loss": 0.0002, |
| "reward": 1.21875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 796 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 524.6875, |
| "epoch": 1.6604166666666667, |
| "grad_norm": 1.5066584348678589, |
| "kl": 0.243896484375, |
| "learning_rate": 4.465277777777778e-07, |
| "loss": 0.0002, |
| "reward": 1.5625, |
| "reward_std": 0.2177756354212761, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 1.0, |
| "step": 797 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 523.4375, |
| "epoch": 1.6625, |
| "grad_norm": 0.010895702056586742, |
| "kl": 0.2431640625, |
| "learning_rate": 4.4583333333333334e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 798 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 500.21875, |
| "epoch": 1.6645833333333333, |
| "grad_norm": 0.007907292805612087, |
| "kl": 0.24853515625, |
| "learning_rate": 4.4513888888888883e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 799 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 558.1875, |
| "epoch": 1.6666666666666665, |
| "grad_norm": 1.0693315267562866, |
| "kl": 0.233154296875, |
| "learning_rate": 4.444444444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 800 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 538.3125, |
| "epoch": 1.66875, |
| "grad_norm": 0.028570212423801422, |
| "kl": 0.235107421875, |
| "learning_rate": 4.4374999999999993e-07, |
| "loss": 0.0002, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.0, |
| "step": 801 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 580.0, |
| "epoch": 1.6708333333333334, |
| "grad_norm": 0.02017505094408989, |
| "kl": 0.254150390625, |
| "learning_rate": 4.4305555555555554e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 802 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 564.25, |
| "epoch": 1.6729166666666666, |
| "grad_norm": 3.2598605155944824, |
| "kl": 0.226806640625, |
| "learning_rate": 4.423611111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 1.0, |
| "step": 803 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 517.1875, |
| "epoch": 1.675, |
| "grad_norm": 1.6672528982162476, |
| "kl": 0.24658203125, |
| "learning_rate": 4.4166666666666664e-07, |
| "loss": 0.0002, |
| "reward": 1.65625, |
| "reward_std": 0.2651650384068489, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.96875, |
| "step": 804 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 549.15625, |
| "epoch": 1.6770833333333335, |
| "grad_norm": 0.006843992974609137, |
| "kl": 0.23779296875, |
| "learning_rate": 4.409722222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 805 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 590.25, |
| "epoch": 1.6791666666666667, |
| "grad_norm": 0.009703945368528366, |
| "kl": 0.2314453125, |
| "learning_rate": 4.4027777777777774e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 806 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 553.375, |
| "epoch": 1.68125, |
| "grad_norm": 0.9394215941429138, |
| "kl": 0.2373046875, |
| "learning_rate": 4.395833333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 807 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 565.5, |
| "epoch": 1.6833333333333333, |
| "grad_norm": 0.009763069450855255, |
| "kl": 0.237060546875, |
| "learning_rate": 4.3888888888888884e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 808 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 550.1875, |
| "epoch": 1.6854166666666668, |
| "grad_norm": 1.487508773803711, |
| "kl": 0.23876953125, |
| "learning_rate": 4.3819444444444444e-07, |
| "loss": 0.0002, |
| "reward": 0.5625, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.5, |
| "step": 809 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 575.03125, |
| "epoch": 1.6875, |
| "grad_norm": 1.174086332321167, |
| "kl": 0.220458984375, |
| "learning_rate": 4.375e-07, |
| "loss": 0.0002, |
| "reward": 0.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.25, |
| "step": 810 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 547.21875, |
| "epoch": 1.6895833333333332, |
| "grad_norm": 0.010807516053318977, |
| "kl": 0.236572265625, |
| "learning_rate": 4.3680555555555554e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 1.0, |
| "step": 811 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 601.96875, |
| "epoch": 1.6916666666666667, |
| "grad_norm": 1.2976332902908325, |
| "kl": 0.2353515625, |
| "learning_rate": 4.361111111111111e-07, |
| "loss": 0.0002, |
| "reward": 0.78125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.75, |
| "step": 812 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 587.5, |
| "epoch": 1.69375, |
| "grad_norm": 0.006790046114474535, |
| "kl": 0.226318359375, |
| "learning_rate": 4.3541666666666664e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 813 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 587.125, |
| "epoch": 1.6958333333333333, |
| "grad_norm": 0.9818464517593384, |
| "kl": 0.23876953125, |
| "learning_rate": 4.347222222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.71875, |
| "step": 814 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 567.78125, |
| "epoch": 1.6979166666666665, |
| "grad_norm": 0.008597256615757942, |
| "kl": 0.23388671875, |
| "learning_rate": 4.3402777777777775e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 815 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 594.125, |
| "epoch": 1.7, |
| "grad_norm": 0.008699624799191952, |
| "kl": 0.22705078125, |
| "learning_rate": 4.3333333333333335e-07, |
| "loss": 0.0002, |
| "reward": 1.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 1.0, |
| "step": 816 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 572.375, |
| "epoch": 1.7020833333333334, |
| "grad_norm": 0.00947485025972128, |
| "kl": 0.25390625, |
| "learning_rate": 4.326388888888889e-07, |
| "loss": 0.0003, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.0, |
| "rewards/format_reward": 1.0, |
| "step": 817 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 566.21875, |
| "epoch": 1.7041666666666666, |
| "grad_norm": 0.008000546135008335, |
| "kl": 0.236572265625, |
| "learning_rate": 4.3194444444444445e-07, |
| "loss": 0.0002, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.0, |
| "rewards/format_reward": 1.0, |
| "step": 818 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 555.25, |
| "epoch": 1.70625, |
| "grad_norm": 1.0371057987213135, |
| "kl": 0.2333984375, |
| "learning_rate": 4.3125e-07, |
| "loss": 0.0002, |
| "reward": 1.28125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.75, |
| "step": 819 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 588.8125, |
| "epoch": 1.7083333333333335, |
| "grad_norm": 0.834037184715271, |
| "kl": 0.23779296875, |
| "learning_rate": 4.3055555555555555e-07, |
| "loss": 0.0002, |
| "reward": 1.28125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.53125, |
| "rewards/format_reward": 0.75, |
| "step": 820 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 573.34375, |
| "epoch": 1.7104166666666667, |
| "grad_norm": 0.00836244411766529, |
| "kl": 0.239990234375, |
| "learning_rate": 4.298611111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 821 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 562.40625, |
| "epoch": 1.7125, |
| "grad_norm": 0.00962614081799984, |
| "kl": 0.231689453125, |
| "learning_rate": 4.291666666666666e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 822 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 553.0625, |
| "epoch": 1.7145833333333333, |
| "grad_norm": 0.008219579234719276, |
| "kl": 0.23388671875, |
| "learning_rate": 4.284722222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 1.0, |
| "step": 823 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 554.125, |
| "epoch": 1.7166666666666668, |
| "grad_norm": 0.012971649877727032, |
| "kl": 0.24169921875, |
| "learning_rate": 4.2777777777777775e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 824 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 551.53125, |
| "epoch": 1.71875, |
| "grad_norm": 1.1302229166030884, |
| "kl": 0.2275390625, |
| "learning_rate": 4.270833333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.71875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 1.0, |
| "step": 825 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 566.25, |
| "epoch": 1.7208333333333332, |
| "grad_norm": 0.010292713530361652, |
| "kl": 0.22119140625, |
| "learning_rate": 4.2638888888888885e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 826 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 549.5625, |
| "epoch": 1.7229166666666667, |
| "grad_norm": 1.4789761304855347, |
| "kl": 0.241943359375, |
| "learning_rate": 4.256944444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.84375, |
| "reward_std": 0.22201896458864212, |
| "rewards/accuracy_reward": 0.84375, |
| "rewards/format_reward": 1.0, |
| "step": 827 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 569.78125, |
| "epoch": 1.725, |
| "grad_norm": 1.0435210466384888, |
| "kl": 0.2333984375, |
| "learning_rate": 4.2499999999999995e-07, |
| "loss": 0.0002, |
| "reward": 0.78125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.5, |
| "step": 828 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 575.15625, |
| "epoch": 1.7270833333333333, |
| "grad_norm": 0.012116172350943089, |
| "kl": 0.216064453125, |
| "learning_rate": 4.243055555555555e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 829 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 493.59375, |
| "epoch": 1.7291666666666665, |
| "grad_norm": 0.007295367773622274, |
| "kl": 0.2412109375, |
| "learning_rate": 4.236111111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 830 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 549.15625, |
| "epoch": 1.73125, |
| "grad_norm": 0.9141552448272705, |
| "kl": 0.227783203125, |
| "learning_rate": 4.2291666666666666e-07, |
| "loss": 0.0002, |
| "reward": 0.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.25, |
| "step": 831 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 514.8125, |
| "epoch": 1.7333333333333334, |
| "grad_norm": 1.5482479333877563, |
| "kl": 0.240478515625, |
| "learning_rate": 4.222222222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.9375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.9375, |
| "rewards/format_reward": 1.0, |
| "step": 832 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 543.96875, |
| "epoch": 1.7354166666666666, |
| "grad_norm": 0.013914127834141254, |
| "kl": 0.24267578125, |
| "learning_rate": 4.2152777777777776e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 833 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 515.0, |
| "epoch": 1.7375, |
| "grad_norm": 1.0192515850067139, |
| "kl": 0.22802734375, |
| "learning_rate": 4.208333333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.6875, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 1.0, |
| "step": 834 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 564.8125, |
| "epoch": 1.7395833333333335, |
| "grad_norm": 0.00643956009298563, |
| "kl": 0.2177734375, |
| "learning_rate": 4.2013888888888886e-07, |
| "loss": 0.0002, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.0, |
| "rewards/format_reward": 1.0, |
| "step": 835 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 508.90625, |
| "epoch": 1.7416666666666667, |
| "grad_norm": 0.008250262588262558, |
| "kl": 0.24462890625, |
| "learning_rate": 4.194444444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.75, |
| "step": 836 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 531.65625, |
| "epoch": 1.74375, |
| "grad_norm": 0.017665982246398926, |
| "kl": 0.261962890625, |
| "learning_rate": 4.1875e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 837 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 547.0, |
| "epoch": 1.7458333333333333, |
| "grad_norm": 0.01037755236029625, |
| "kl": 0.2333984375, |
| "learning_rate": 4.1805555555555556e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 838 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 534.09375, |
| "epoch": 1.7479166666666668, |
| "grad_norm": 1.4672600030899048, |
| "kl": 0.2333984375, |
| "learning_rate": 4.173611111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 839 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 528.3125, |
| "epoch": 1.75, |
| "grad_norm": 1.229001522064209, |
| "kl": 0.202880859375, |
| "learning_rate": 4.1666666666666667e-07, |
| "loss": 0.0002, |
| "reward": 1.03125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.75, |
| "step": 840 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 502.15625, |
| "epoch": 1.7520833333333332, |
| "grad_norm": 0.010309334844350815, |
| "kl": 0.255126953125, |
| "learning_rate": 4.159722222222222e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 841 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 558.59375, |
| "epoch": 1.7541666666666667, |
| "grad_norm": 0.0066090915352106094, |
| "kl": 0.232177734375, |
| "learning_rate": 4.1527777777777777e-07, |
| "loss": 0.0002, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.0, |
| "rewards/format_reward": 1.0, |
| "step": 842 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 505.65625, |
| "epoch": 1.75625, |
| "grad_norm": 0.007594072259962559, |
| "kl": 0.236083984375, |
| "learning_rate": 4.145833333333333e-07, |
| "loss": 0.0002, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.0, |
| "rewards/format_reward": 1.0, |
| "step": 843 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 506.71875, |
| "epoch": 1.7583333333333333, |
| "grad_norm": 1.4959443807601929, |
| "kl": 0.229248046875, |
| "learning_rate": 4.1388888888888887e-07, |
| "loss": 0.0002, |
| "reward": 0.78125, |
| "reward_std": 0.2041158601641655, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.5, |
| "step": 844 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 509.3125, |
| "epoch": 1.7604166666666665, |
| "grad_norm": 1.0887670516967773, |
| "kl": 0.2236328125, |
| "learning_rate": 4.131944444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.71875, |
| "step": 845 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 519.4375, |
| "epoch": 1.7625, |
| "grad_norm": 1.3897308111190796, |
| "kl": 0.23974609375, |
| "learning_rate": 4.1249999999999997e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 846 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 486.75, |
| "epoch": 1.7645833333333334, |
| "grad_norm": 0.009728828445076942, |
| "kl": 0.251953125, |
| "learning_rate": 4.118055555555555e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 847 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 522.875, |
| "epoch": 1.7666666666666666, |
| "grad_norm": 0.00988440215587616, |
| "kl": 0.2412109375, |
| "learning_rate": 4.1111111111111107e-07, |
| "loss": 0.0002, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 848 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 515.125, |
| "epoch": 1.76875, |
| "grad_norm": 0.9975308179855347, |
| "kl": 0.2294921875, |
| "learning_rate": 4.104166666666666e-07, |
| "loss": 0.0002, |
| "reward": 1.03125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.75, |
| "step": 849 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 518.65625, |
| "epoch": 1.7708333333333335, |
| "grad_norm": 0.007184633985161781, |
| "kl": 0.22998046875, |
| "learning_rate": 4.0972222222222217e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 850 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 519.5, |
| "epoch": 1.7729166666666667, |
| "grad_norm": 1.1164164543151855, |
| "kl": 0.232421875, |
| "learning_rate": 4.0902777777777777e-07, |
| "loss": 0.0002, |
| "reward": 1.875, |
| "reward_std": 0.13363061845302582, |
| "rewards/accuracy_reward": 0.875, |
| "rewards/format_reward": 1.0, |
| "step": 851 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 507.84375, |
| "epoch": 1.775, |
| "grad_norm": 0.007584977429360151, |
| "kl": 0.233642578125, |
| "learning_rate": 4.083333333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 852 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 562.03125, |
| "epoch": 1.7770833333333333, |
| "grad_norm": 0.007155933883041143, |
| "kl": 0.22705078125, |
| "learning_rate": 4.076388888888889e-07, |
| "loss": 0.0002, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.0, |
| "step": 853 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 523.09375, |
| "epoch": 1.7791666666666668, |
| "grad_norm": 0.007546697277575731, |
| "kl": 0.22900390625, |
| "learning_rate": 4.069444444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 854 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 505.09375, |
| "epoch": 1.78125, |
| "grad_norm": 1.150571584701538, |
| "kl": 0.22412109375, |
| "learning_rate": 4.0625e-07, |
| "loss": 0.0002, |
| "reward": 1.4375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.75, |
| "step": 855 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 555.40625, |
| "epoch": 1.7833333333333332, |
| "grad_norm": 1.047773838043213, |
| "kl": 0.239990234375, |
| "learning_rate": 4.055555555555555e-07, |
| "loss": 0.0002, |
| "reward": 1.4375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.75, |
| "step": 856 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 509.46875, |
| "epoch": 1.7854166666666667, |
| "grad_norm": 1.4127116203308105, |
| "kl": 0.240966796875, |
| "learning_rate": 4.048611111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 1.0, |
| "step": 857 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 543.9375, |
| "epoch": 1.7875, |
| "grad_norm": 0.8850436806678772, |
| "kl": 0.256103515625, |
| "learning_rate": 4.041666666666667e-07, |
| "loss": 0.0003, |
| "reward": 1.21875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 858 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 499.8125, |
| "epoch": 1.7895833333333333, |
| "grad_norm": 0.007401830516755581, |
| "kl": 0.222900390625, |
| "learning_rate": 4.0347222222222223e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 859 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 535.75, |
| "epoch": 1.7916666666666665, |
| "grad_norm": 1.490172266960144, |
| "kl": 0.234619140625, |
| "learning_rate": 4.027777777777778e-07, |
| "loss": 0.0002, |
| "reward": 1.03125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.75, |
| "step": 860 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 548.375, |
| "epoch": 1.79375, |
| "grad_norm": 3.965214490890503, |
| "kl": 0.228271484375, |
| "learning_rate": 4.0208333333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 861 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 505.21875, |
| "epoch": 1.7958333333333334, |
| "grad_norm": 0.00795065052807331, |
| "kl": 0.24658203125, |
| "learning_rate": 4.013888888888889e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 862 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 526.40625, |
| "epoch": 1.7979166666666666, |
| "grad_norm": 0.01374655682593584, |
| "kl": 0.212646484375, |
| "learning_rate": 4.0069444444444443e-07, |
| "loss": 0.0002, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.75, |
| "step": 863 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 533.15625, |
| "epoch": 1.8, |
| "grad_norm": 0.008126798085868359, |
| "kl": 0.227783203125, |
| "learning_rate": 4e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 864 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 531.6875, |
| "epoch": 1.8020833333333335, |
| "grad_norm": 0.007166092284023762, |
| "kl": 0.231689453125, |
| "learning_rate": 3.993055555555556e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 865 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 550.3125, |
| "epoch": 1.8041666666666667, |
| "grad_norm": 1.0180275440216064, |
| "kl": 0.22607421875, |
| "learning_rate": 3.9861111111111114e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 866 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 545.71875, |
| "epoch": 1.80625, |
| "grad_norm": 0.012762402184307575, |
| "kl": 0.242431640625, |
| "learning_rate": 3.9791666666666663e-07, |
| "loss": 0.0002, |
| "reward": 0.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.25, |
| "step": 867 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 509.65625, |
| "epoch": 1.8083333333333333, |
| "grad_norm": 1.4062516689300537, |
| "kl": 0.232421875, |
| "learning_rate": 3.972222222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.9375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.9375, |
| "rewards/format_reward": 1.0, |
| "step": 868 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 519.0, |
| "epoch": 1.8104166666666668, |
| "grad_norm": 1.635831356048584, |
| "kl": 0.243896484375, |
| "learning_rate": 3.9652777777777773e-07, |
| "loss": 0.0002, |
| "reward": 1.9375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.9375, |
| "rewards/format_reward": 1.0, |
| "step": 869 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 546.5625, |
| "epoch": 1.8125, |
| "grad_norm": 0.007981624454259872, |
| "kl": 0.21728515625, |
| "learning_rate": 3.958333333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 1.0, |
| "step": 870 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 547.34375, |
| "epoch": 1.8145833333333332, |
| "grad_norm": 2.526054620742798, |
| "kl": 0.22802734375, |
| "learning_rate": 3.9513888888888883e-07, |
| "loss": 0.0002, |
| "reward": 1.3125, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.75, |
| "step": 871 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 548.3125, |
| "epoch": 1.8166666666666667, |
| "grad_norm": 1.5642163753509521, |
| "kl": 0.2314453125, |
| "learning_rate": 3.9444444444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.4375, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 0.75, |
| "step": 872 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 528.625, |
| "epoch": 1.81875, |
| "grad_norm": 0.007630742155015469, |
| "kl": 0.227783203125, |
| "learning_rate": 3.9375e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 873 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 561.75, |
| "epoch": 1.8208333333333333, |
| "grad_norm": 0.0069867093116045, |
| "kl": 0.230712890625, |
| "learning_rate": 3.9305555555555554e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 874 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 547.40625, |
| "epoch": 1.8229166666666665, |
| "grad_norm": 0.02551759034395218, |
| "kl": 0.217041015625, |
| "learning_rate": 3.923611111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 875 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 514.4375, |
| "epoch": 1.825, |
| "grad_norm": 0.012089493684470654, |
| "kl": 0.23583984375, |
| "learning_rate": 3.9166666666666664e-07, |
| "loss": 0.0002, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.0, |
| "rewards/format_reward": 1.0, |
| "step": 876 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 526.3125, |
| "epoch": 1.8270833333333334, |
| "grad_norm": 0.03711254894733429, |
| "kl": 0.2236328125, |
| "learning_rate": 3.909722222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 877 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 501.5625, |
| "epoch": 1.8291666666666666, |
| "grad_norm": 0.008239555172622204, |
| "kl": 0.24169921875, |
| "learning_rate": 3.9027777777777774e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 878 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 556.0, |
| "epoch": 1.83125, |
| "grad_norm": 0.007976679131388664, |
| "kl": 0.235107421875, |
| "learning_rate": 3.8958333333333334e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 879 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 502.875, |
| "epoch": 1.8333333333333335, |
| "grad_norm": 0.009058798663318157, |
| "kl": 0.248046875, |
| "learning_rate": 3.888888888888889e-07, |
| "loss": 0.0002, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 880 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 508.4375, |
| "epoch": 1.8354166666666667, |
| "grad_norm": 0.007584620267152786, |
| "kl": 0.22900390625, |
| "learning_rate": 3.8819444444444445e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 881 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 544.65625, |
| "epoch": 1.8375, |
| "grad_norm": 0.007870173081755638, |
| "kl": 0.219970703125, |
| "learning_rate": 3.875e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 882 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 520.96875, |
| "epoch": 1.8395833333333333, |
| "grad_norm": 1.4032890796661377, |
| "kl": 0.2431640625, |
| "learning_rate": 3.8680555555555555e-07, |
| "loss": 0.0002, |
| "reward": 1.71875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 1.0, |
| "step": 883 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 553.78125, |
| "epoch": 1.8416666666666668, |
| "grad_norm": 1.04135000705719, |
| "kl": 0.256103515625, |
| "learning_rate": 3.861111111111111e-07, |
| "loss": 0.0003, |
| "reward": 1.1875, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.75, |
| "step": 884 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 508.3125, |
| "epoch": 1.84375, |
| "grad_norm": 0.006909274961799383, |
| "kl": 0.230224609375, |
| "learning_rate": 3.8541666666666665e-07, |
| "loss": 0.0002, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 885 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 550.96875, |
| "epoch": 1.8458333333333332, |
| "grad_norm": 0.006952146999537945, |
| "kl": 0.222900390625, |
| "learning_rate": 3.8472222222222225e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 886 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 540.875, |
| "epoch": 1.8479166666666667, |
| "grad_norm": 1.0288861989974976, |
| "kl": 0.226318359375, |
| "learning_rate": 3.840277777777778e-07, |
| "loss": 0.0002, |
| "reward": 0.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.5, |
| "step": 887 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 531.28125, |
| "epoch": 1.85, |
| "grad_norm": 0.007326650433242321, |
| "kl": 0.228515625, |
| "learning_rate": 3.8333333333333335e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 888 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 539.40625, |
| "epoch": 1.8520833333333333, |
| "grad_norm": 0.015359265729784966, |
| "kl": 0.226318359375, |
| "learning_rate": 3.8263888888888885e-07, |
| "loss": 0.0002, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.0, |
| "rewards/format_reward": 1.0, |
| "step": 889 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 511.40625, |
| "epoch": 1.8541666666666665, |
| "grad_norm": 5.617523193359375, |
| "kl": 0.23681640625, |
| "learning_rate": 3.819444444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.875, |
| "reward_std": 0.2925042062997818, |
| "rewards/accuracy_reward": 0.875, |
| "rewards/format_reward": 1.0, |
| "step": 890 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 551.21875, |
| "epoch": 1.85625, |
| "grad_norm": 1.0431323051452637, |
| "kl": 0.235595703125, |
| "learning_rate": 3.8124999999999995e-07, |
| "loss": 0.0002, |
| "reward": 0.71875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.5, |
| "step": 891 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 514.46875, |
| "epoch": 1.8583333333333334, |
| "grad_norm": 1.8033677339553833, |
| "kl": 0.232421875, |
| "learning_rate": 3.805555555555555e-07, |
| "loss": 0.0002, |
| "reward": 1.28125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 1.0, |
| "step": 892 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 508.65625, |
| "epoch": 1.8604166666666666, |
| "grad_norm": 0.006438506301492453, |
| "kl": 0.214111328125, |
| "learning_rate": 3.7986111111111105e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 893 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 541.71875, |
| "epoch": 1.8625, |
| "grad_norm": 0.9842865467071533, |
| "kl": 0.224365234375, |
| "learning_rate": 3.7916666666666665e-07, |
| "loss": 0.0002, |
| "reward": 1.34375, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/format_reward": 0.75, |
| "step": 894 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 551.65625, |
| "epoch": 1.8645833333333335, |
| "grad_norm": 0.007584480568766594, |
| "kl": 0.2353515625, |
| "learning_rate": 3.784722222222222e-07, |
| "loss": 0.0002, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 895 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 571.71875, |
| "epoch": 1.8666666666666667, |
| "grad_norm": 0.8712323904037476, |
| "kl": 0.220703125, |
| "learning_rate": 3.7777777777777775e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 1.0, |
| "step": 896 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 539.375, |
| "epoch": 1.86875, |
| "grad_norm": 0.014179886318743229, |
| "kl": 0.240234375, |
| "learning_rate": 3.770833333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 897 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 520.78125, |
| "epoch": 1.8708333333333333, |
| "grad_norm": 1.2494723796844482, |
| "kl": 0.2431640625, |
| "learning_rate": 3.7638888888888886e-07, |
| "loss": 0.0002, |
| "reward": 0.9375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.5, |
| "step": 898 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 540.75, |
| "epoch": 1.8729166666666668, |
| "grad_norm": 0.007599582429975271, |
| "kl": 0.221923828125, |
| "learning_rate": 3.756944444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 899 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 531.375, |
| "epoch": 1.875, |
| "grad_norm": 1.021194338798523, |
| "kl": 0.239013671875, |
| "learning_rate": 3.75e-07, |
| "loss": 0.0002, |
| "reward": 1.71875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 1.0, |
| "step": 900 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 511.53125, |
| "epoch": 1.8770833333333332, |
| "grad_norm": 1.638185977935791, |
| "kl": 0.22607421875, |
| "learning_rate": 3.7430555555555556e-07, |
| "loss": 0.0002, |
| "reward": 1.3125, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.75, |
| "step": 901 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 482.1875, |
| "epoch": 1.8791666666666667, |
| "grad_norm": 1.0922802686691284, |
| "kl": 0.268310546875, |
| "learning_rate": 3.736111111111111e-07, |
| "loss": 0.0003, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.71875, |
| "step": 902 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 500.375, |
| "epoch": 1.88125, |
| "grad_norm": 0.011066813953220844, |
| "kl": 0.23828125, |
| "learning_rate": 3.7291666666666666e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 903 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 481.46875, |
| "epoch": 1.8833333333333333, |
| "grad_norm": 0.00842176005244255, |
| "kl": 0.2294921875, |
| "learning_rate": 3.722222222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.75, |
| "step": 904 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 509.0, |
| "epoch": 1.8854166666666665, |
| "grad_norm": 0.007877349853515625, |
| "kl": 0.237548828125, |
| "learning_rate": 3.7152777777777776e-07, |
| "loss": 0.0002, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.0, |
| "rewards/format_reward": 1.0, |
| "step": 905 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 537.78125, |
| "epoch": 1.8875, |
| "grad_norm": 1.380505084991455, |
| "kl": 0.22900390625, |
| "learning_rate": 3.708333333333333e-07, |
| "loss": 0.0002, |
| "reward": 0.4375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.25, |
| "step": 906 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 486.90625, |
| "epoch": 1.8895833333333334, |
| "grad_norm": 0.9170504808425903, |
| "kl": 0.244873046875, |
| "learning_rate": 3.701388888888889e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 907 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 494.3125, |
| "epoch": 1.8916666666666666, |
| "grad_norm": 0.008912342600524426, |
| "kl": 0.23681640625, |
| "learning_rate": 3.6944444444444447e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 908 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 484.75, |
| "epoch": 1.89375, |
| "grad_norm": 1.8576105833053589, |
| "kl": 0.234375, |
| "learning_rate": 3.6875e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.75, |
| "step": 909 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 487.9375, |
| "epoch": 1.8958333333333335, |
| "grad_norm": 0.006501065567135811, |
| "kl": 0.225830078125, |
| "learning_rate": 3.6805555555555557e-07, |
| "loss": 0.0002, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.0, |
| "step": 910 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 509.4375, |
| "epoch": 1.8979166666666667, |
| "grad_norm": 1.175131916999817, |
| "kl": 0.23388671875, |
| "learning_rate": 3.673611111111111e-07, |
| "loss": 0.0002, |
| "reward": 0.53125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.5, |
| "step": 911 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 508.4375, |
| "epoch": 1.9, |
| "grad_norm": 0.008509078063070774, |
| "kl": 0.252685546875, |
| "learning_rate": 3.666666666666666e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 912 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 476.5625, |
| "epoch": 1.9020833333333333, |
| "grad_norm": 1.2172304391860962, |
| "kl": 0.23876953125, |
| "learning_rate": 3.6597222222222217e-07, |
| "loss": 0.0002, |
| "reward": 1.15625, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.75, |
| "step": 913 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 485.21875, |
| "epoch": 1.9041666666666668, |
| "grad_norm": 0.006972001399844885, |
| "kl": 0.22998046875, |
| "learning_rate": 3.652777777777777e-07, |
| "loss": 0.0002, |
| "reward": 1.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 1.0, |
| "step": 914 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 492.84375, |
| "epoch": 1.90625, |
| "grad_norm": 1.4839038848876953, |
| "kl": 0.235595703125, |
| "learning_rate": 3.645833333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 915 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 509.9375, |
| "epoch": 1.9083333333333332, |
| "grad_norm": 0.011867961846292019, |
| "kl": 0.24609375, |
| "learning_rate": 3.6388888888888887e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 916 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 492.71875, |
| "epoch": 1.9104166666666667, |
| "grad_norm": 1.2053366899490356, |
| "kl": 0.2373046875, |
| "learning_rate": 3.631944444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.96875, |
| "rewards/format_reward": 1.0, |
| "step": 917 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 477.03125, |
| "epoch": 1.9125, |
| "grad_norm": 0.0455983504652977, |
| "kl": 0.222412109375, |
| "learning_rate": 3.6249999999999997e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 918 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 495.4375, |
| "epoch": 1.9145833333333333, |
| "grad_norm": 1.0417066812515259, |
| "kl": 0.245361328125, |
| "learning_rate": 3.618055555555555e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 919 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 487.75, |
| "epoch": 1.9166666666666665, |
| "grad_norm": 0.0077906264923512936, |
| "kl": 0.23583984375, |
| "learning_rate": 3.6111111111111107e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 920 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 469.28125, |
| "epoch": 1.91875, |
| "grad_norm": 0.00771137373521924, |
| "kl": 0.238525390625, |
| "learning_rate": 3.604166666666666e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 921 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 432.125, |
| "epoch": 1.9208333333333334, |
| "grad_norm": 2.136435031890869, |
| "kl": 0.256103515625, |
| "learning_rate": 3.597222222222222e-07, |
| "loss": 0.0003, |
| "reward": 0.78125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.03125, |
| "rewards/format_reward": 0.75, |
| "step": 922 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 452.0, |
| "epoch": 1.9229166666666666, |
| "grad_norm": 0.006540779490023851, |
| "kl": 0.22509765625, |
| "learning_rate": 3.590277777777778e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 923 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 451.8125, |
| "epoch": 1.925, |
| "grad_norm": 1.0841056108474731, |
| "kl": 0.244873046875, |
| "learning_rate": 3.583333333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.1875, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.71875, |
| "step": 924 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 460.5, |
| "epoch": 1.9270833333333335, |
| "grad_norm": 0.008191017434000969, |
| "kl": 0.240478515625, |
| "learning_rate": 3.576388888888889e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 925 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 465.875, |
| "epoch": 1.9291666666666667, |
| "grad_norm": 0.007460338994860649, |
| "kl": 0.2529296875, |
| "learning_rate": 3.5694444444444443e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 926 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 465.4375, |
| "epoch": 1.93125, |
| "grad_norm": 0.014311104081571102, |
| "kl": 0.2529296875, |
| "learning_rate": 3.5625e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 927 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 445.125, |
| "epoch": 1.9333333333333333, |
| "grad_norm": 1.2025405168533325, |
| "kl": 0.250244140625, |
| "learning_rate": 3.5555555555555553e-07, |
| "loss": 0.0002, |
| "reward": 0.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.5, |
| "step": 928 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 428.5, |
| "epoch": 1.9354166666666668, |
| "grad_norm": 0.007025923114269972, |
| "kl": 0.23974609375, |
| "learning_rate": 3.5486111111111113e-07, |
| "loss": 0.0002, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.0, |
| "rewards/format_reward": 1.0, |
| "step": 929 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 449.0625, |
| "epoch": 1.9375, |
| "grad_norm": 0.008093161508440971, |
| "kl": 0.2373046875, |
| "learning_rate": 3.541666666666667e-07, |
| "loss": 0.0002, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.0, |
| "rewards/format_reward": 1.0, |
| "step": 930 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 440.9375, |
| "epoch": 1.9395833333333332, |
| "grad_norm": 0.013698437251150608, |
| "kl": 0.2470703125, |
| "learning_rate": 3.5347222222222223e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 931 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 478.96875, |
| "epoch": 1.9416666666666667, |
| "grad_norm": 0.00795261561870575, |
| "kl": 0.240966796875, |
| "learning_rate": 3.527777777777778e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 1.0, |
| "step": 932 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 430.5625, |
| "epoch": 1.94375, |
| "grad_norm": 0.008631990291178226, |
| "kl": 0.239990234375, |
| "learning_rate": 3.5208333333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 933 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 481.09375, |
| "epoch": 1.9458333333333333, |
| "grad_norm": 0.008530229330062866, |
| "kl": 0.252197265625, |
| "learning_rate": 3.5138888888888883e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 934 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 447.09375, |
| "epoch": 1.9479166666666665, |
| "grad_norm": 0.010614525526762009, |
| "kl": 0.254150390625, |
| "learning_rate": 3.506944444444444e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 935 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 437.78125, |
| "epoch": 1.95, |
| "grad_norm": 1.1804996728897095, |
| "kl": 0.243408203125, |
| "learning_rate": 3.5e-07, |
| "loss": 0.0002, |
| "reward": 1.71875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 1.0, |
| "step": 936 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 435.0625, |
| "epoch": 1.9520833333333334, |
| "grad_norm": 1.2165803909301758, |
| "kl": 0.244384765625, |
| "learning_rate": 3.4930555555555553e-07, |
| "loss": 0.0002, |
| "reward": 0.78125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.5, |
| "step": 937 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 465.53125, |
| "epoch": 1.9541666666666666, |
| "grad_norm": 1.3811777830123901, |
| "kl": 0.258056640625, |
| "learning_rate": 3.486111111111111e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 1.0, |
| "step": 938 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 463.96875, |
| "epoch": 1.95625, |
| "grad_norm": 0.007179384585469961, |
| "kl": 0.237060546875, |
| "learning_rate": 3.4791666666666664e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 939 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 416.0625, |
| "epoch": 1.9583333333333335, |
| "grad_norm": 1.3711605072021484, |
| "kl": 0.3095703125, |
| "learning_rate": 3.472222222222222e-07, |
| "loss": 0.0003, |
| "reward": 1.09375, |
| "reward_std": 0.1293872892856598, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.75, |
| "step": 940 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 441.84375, |
| "epoch": 1.9604166666666667, |
| "grad_norm": 0.016370367258787155, |
| "kl": 0.2802734375, |
| "learning_rate": 3.4652777777777774e-07, |
| "loss": 0.0003, |
| "reward": 1.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 1.0, |
| "step": 941 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 427.09375, |
| "epoch": 1.9625, |
| "grad_norm": 1.2782434225082397, |
| "kl": 0.275390625, |
| "learning_rate": 3.458333333333333e-07, |
| "loss": 0.0003, |
| "reward": 1.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.96875, |
| "rewards/format_reward": 1.0, |
| "step": 942 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 445.6875, |
| "epoch": 1.9645833333333333, |
| "grad_norm": 1.1922998428344727, |
| "kl": 0.248046875, |
| "learning_rate": 3.451388888888889e-07, |
| "loss": 0.0002, |
| "reward": 1.71875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 1.0, |
| "step": 943 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 475.03125, |
| "epoch": 1.9666666666666668, |
| "grad_norm": 1.0037637948989868, |
| "kl": 0.243896484375, |
| "learning_rate": 3.4444444444444444e-07, |
| "loss": 0.0002, |
| "reward": 0.78125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.5, |
| "step": 944 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 426.03125, |
| "epoch": 1.96875, |
| "grad_norm": 0.013457462191581726, |
| "kl": 0.25634765625, |
| "learning_rate": 3.4375e-07, |
| "loss": 0.0003, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 945 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 448.9375, |
| "epoch": 1.9708333333333332, |
| "grad_norm": 0.008211650885641575, |
| "kl": 0.254150390625, |
| "learning_rate": 3.4305555555555554e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 946 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 440.15625, |
| "epoch": 1.9729166666666667, |
| "grad_norm": 1.1255629062652588, |
| "kl": 0.257568359375, |
| "learning_rate": 3.423611111111111e-07, |
| "loss": 0.0003, |
| "reward": 0.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.5, |
| "step": 947 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 451.625, |
| "epoch": 1.975, |
| "grad_norm": 1.566752552986145, |
| "kl": 0.25927734375, |
| "learning_rate": 3.4166666666666664e-07, |
| "loss": 0.0003, |
| "reward": 0.71875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.5, |
| "step": 948 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 425.75, |
| "epoch": 1.9770833333333333, |
| "grad_norm": 1.967722773551941, |
| "kl": 0.250244140625, |
| "learning_rate": 3.409722222222222e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.75, |
| "step": 949 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 433.15625, |
| "epoch": 1.9791666666666665, |
| "grad_norm": 1.687232255935669, |
| "kl": 0.24951171875, |
| "learning_rate": 3.402777777777778e-07, |
| "loss": 0.0002, |
| "reward": 1.0625, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 1.0, |
| "step": 950 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 454.75, |
| "epoch": 1.98125, |
| "grad_norm": 0.010934803634881973, |
| "kl": 0.253662109375, |
| "learning_rate": 3.3958333333333335e-07, |
| "loss": 0.0003, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.0, |
| "rewards/format_reward": 1.0, |
| "step": 951 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 437.3125, |
| "epoch": 1.9833333333333334, |
| "grad_norm": 0.014278494752943516, |
| "kl": 0.259521484375, |
| "learning_rate": 3.388888888888889e-07, |
| "loss": 0.0003, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 952 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 471.875, |
| "epoch": 1.9854166666666666, |
| "grad_norm": 0.008121107704937458, |
| "kl": 0.25244140625, |
| "learning_rate": 3.3819444444444445e-07, |
| "loss": 0.0003, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 953 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 456.09375, |
| "epoch": 1.9875, |
| "grad_norm": 0.00813852995634079, |
| "kl": 0.253173828125, |
| "learning_rate": 3.375e-07, |
| "loss": 0.0003, |
| "reward": 0.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.25, |
| "step": 954 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 442.3125, |
| "epoch": 1.9895833333333335, |
| "grad_norm": 0.009365586563944817, |
| "kl": 0.260498046875, |
| "learning_rate": 3.3680555555555555e-07, |
| "loss": 0.0003, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.0, |
| "rewards/format_reward": 1.0, |
| "step": 955 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 437.5625, |
| "epoch": 1.9916666666666667, |
| "grad_norm": 1.130090594291687, |
| "kl": 0.24072265625, |
| "learning_rate": 3.361111111111111e-07, |
| "loss": 0.0002, |
| "reward": 0.78125, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.5, |
| "step": 956 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 447.9375, |
| "epoch": 1.99375, |
| "grad_norm": 0.0085580600425601, |
| "kl": 0.256103515625, |
| "learning_rate": 3.3541666666666665e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 957 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 456.75, |
| "epoch": 1.9958333333333333, |
| "grad_norm": 0.010669535025954247, |
| "kl": 0.244140625, |
| "learning_rate": 3.347222222222222e-07, |
| "loss": 0.0002, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.75, |
| "step": 958 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 449.84375, |
| "epoch": 1.9979166666666668, |
| "grad_norm": 0.01216217689216137, |
| "kl": 0.26416015625, |
| "learning_rate": 3.3402777777777775e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 959 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 456.8125, |
| "epoch": 2.0, |
| "grad_norm": 0.007194915786385536, |
| "kl": 0.24365234375, |
| "learning_rate": 3.333333333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 960 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 459.1875, |
| "epoch": 2.002083333333333, |
| "grad_norm": 0.007638991344720125, |
| "kl": 0.243408203125, |
| "learning_rate": 3.3263888888888885e-07, |
| "loss": 0.0002, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.0, |
| "step": 961 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 439.03125, |
| "epoch": 2.004166666666667, |
| "grad_norm": 0.00782975647598505, |
| "kl": 0.250732421875, |
| "learning_rate": 3.319444444444444e-07, |
| "loss": 0.0003, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.0, |
| "rewards/format_reward": 1.0, |
| "step": 962 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 411.875, |
| "epoch": 2.00625, |
| "grad_norm": 3.1766531467437744, |
| "kl": 0.244384765625, |
| "learning_rate": 3.3124999999999995e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 963 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 448.84375, |
| "epoch": 2.0083333333333333, |
| "grad_norm": 1.5000786781311035, |
| "kl": 0.253173828125, |
| "learning_rate": 3.3055555555555556e-07, |
| "loss": 0.0003, |
| "reward": 0.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.75, |
| "step": 964 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 472.75, |
| "epoch": 2.0104166666666665, |
| "grad_norm": 1.7209105491638184, |
| "kl": 0.2392578125, |
| "learning_rate": 3.298611111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.75, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 1.0, |
| "step": 965 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 450.90625, |
| "epoch": 2.0125, |
| "grad_norm": 1.6055102348327637, |
| "kl": 0.24072265625, |
| "learning_rate": 3.2916666666666666e-07, |
| "loss": 0.0002, |
| "reward": 1.6875, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.6875, |
| "rewards/format_reward": 1.0, |
| "step": 966 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 457.4375, |
| "epoch": 2.0145833333333334, |
| "grad_norm": 1.1209073066711426, |
| "kl": 0.248291015625, |
| "learning_rate": 3.284722222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.21875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 967 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 444.40625, |
| "epoch": 2.0166666666666666, |
| "grad_norm": 0.009695399552583694, |
| "kl": 0.26416015625, |
| "learning_rate": 3.2777777777777776e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 968 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 451.6875, |
| "epoch": 2.01875, |
| "grad_norm": 1.2274049520492554, |
| "kl": 0.2587890625, |
| "learning_rate": 3.270833333333333e-07, |
| "loss": 0.0003, |
| "reward": 1.21875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 969 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 445.4375, |
| "epoch": 2.0208333333333335, |
| "grad_norm": 0.008248819038271904, |
| "kl": 0.244140625, |
| "learning_rate": 3.2638888888888886e-07, |
| "loss": 0.0002, |
| "reward": 1.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 1.0, |
| "step": 970 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 446.28125, |
| "epoch": 2.0229166666666667, |
| "grad_norm": 1.0125038623809814, |
| "kl": 0.23828125, |
| "learning_rate": 3.2569444444444446e-07, |
| "loss": 0.0002, |
| "reward": 1.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 0.75, |
| "step": 971 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 438.46875, |
| "epoch": 2.025, |
| "grad_norm": 0.007286733016371727, |
| "kl": 0.2431640625, |
| "learning_rate": 3.25e-07, |
| "loss": 0.0002, |
| "reward": 0.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.25, |
| "step": 972 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 478.0625, |
| "epoch": 2.027083333333333, |
| "grad_norm": 0.018277425318956375, |
| "kl": 0.2314453125, |
| "learning_rate": 3.2430555555555556e-07, |
| "loss": 0.0002, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 973 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 456.9375, |
| "epoch": 2.029166666666667, |
| "grad_norm": 0.007195204496383667, |
| "kl": 0.24560546875, |
| "learning_rate": 3.236111111111111e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 974 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 419.4375, |
| "epoch": 2.03125, |
| "grad_norm": 0.008570291101932526, |
| "kl": 0.2607421875, |
| "learning_rate": 3.2291666666666666e-07, |
| "loss": 0.0003, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 975 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 447.1875, |
| "epoch": 2.033333333333333, |
| "grad_norm": 0.007945443503558636, |
| "kl": 0.244873046875, |
| "learning_rate": 3.222222222222222e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 976 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 451.53125, |
| "epoch": 2.035416666666667, |
| "grad_norm": 1.0743132829666138, |
| "kl": 0.26318359375, |
| "learning_rate": 3.2152777777777776e-07, |
| "loss": 0.0003, |
| "reward": 0.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.25, |
| "step": 977 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 468.875, |
| "epoch": 2.0375, |
| "grad_norm": 1.5062912702560425, |
| "kl": 0.2529296875, |
| "learning_rate": 3.2083333333333337e-07, |
| "loss": 0.0003, |
| "reward": 1.75, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 1.0, |
| "step": 978 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 460.0, |
| "epoch": 2.0395833333333333, |
| "grad_norm": 0.0329461507499218, |
| "kl": 0.2919921875, |
| "learning_rate": 3.2013888888888886e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 1.0, |
| "step": 979 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 437.625, |
| "epoch": 2.0416666666666665, |
| "grad_norm": 1.2301101684570312, |
| "kl": 0.24560546875, |
| "learning_rate": 3.194444444444444e-07, |
| "loss": 0.0002, |
| "reward": 1.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.96875, |
| "rewards/format_reward": 1.0, |
| "step": 980 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 465.21875, |
| "epoch": 2.04375, |
| "grad_norm": 1.5222910642623901, |
| "kl": 0.246826171875, |
| "learning_rate": 3.1874999999999997e-07, |
| "loss": 0.0002, |
| "reward": 1.71875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.71875, |
| "rewards/format_reward": 1.0, |
| "step": 981 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 459.4375, |
| "epoch": 2.0458333333333334, |
| "grad_norm": 0.010759882628917694, |
| "kl": 0.23876953125, |
| "learning_rate": 3.180555555555555e-07, |
| "loss": 0.0002, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5, |
| "step": 982 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 419.53125, |
| "epoch": 2.0479166666666666, |
| "grad_norm": 0.007903813384473324, |
| "kl": 0.251953125, |
| "learning_rate": 3.1736111111111107e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 983 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 444.15625, |
| "epoch": 2.05, |
| "grad_norm": 1.4439796209335327, |
| "kl": 0.262451171875, |
| "learning_rate": 3.166666666666666e-07, |
| "loss": 0.0003, |
| "reward": 1.9375, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.9375, |
| "rewards/format_reward": 1.0, |
| "step": 984 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 433.90625, |
| "epoch": 2.0520833333333335, |
| "grad_norm": 0.029047423973679543, |
| "kl": 0.266845703125, |
| "learning_rate": 3.159722222222222e-07, |
| "loss": 0.0003, |
| "reward": 1.25, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.75, |
| "step": 985 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 470.84375, |
| "epoch": 2.0541666666666667, |
| "grad_norm": 1.1269396543502808, |
| "kl": 0.2607421875, |
| "learning_rate": 3.1527777777777777e-07, |
| "loss": 0.0003, |
| "reward": 1.3125, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.5625, |
| "rewards/format_reward": 0.75, |
| "step": 986 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 471.25, |
| "epoch": 2.05625, |
| "grad_norm": 0.007986017502844334, |
| "kl": 0.2509765625, |
| "learning_rate": 3.145833333333333e-07, |
| "loss": 0.0003, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.0, |
| "rewards/format_reward": 1.0, |
| "step": 987 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 449.75, |
| "epoch": 2.058333333333333, |
| "grad_norm": 0.007769202347844839, |
| "kl": 0.2490234375, |
| "learning_rate": 3.1388888888888887e-07, |
| "loss": 0.0002, |
| "reward": 1.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 1.0, |
| "step": 988 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 466.09375, |
| "epoch": 2.060416666666667, |
| "grad_norm": 1.3015766143798828, |
| "kl": 0.24951171875, |
| "learning_rate": 3.131944444444444e-07, |
| "loss": 0.0002, |
| "reward": 0.46875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.25, |
| "step": 989 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 476.4375, |
| "epoch": 2.0625, |
| "grad_norm": 1.3159650564193726, |
| "kl": 0.251953125, |
| "learning_rate": 3.1249999999999997e-07, |
| "loss": 0.0003, |
| "reward": 1.1875, |
| "reward_std": 0.1157275140285492, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.75, |
| "step": 990 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 463.0, |
| "epoch": 2.064583333333333, |
| "grad_norm": 0.010242442600429058, |
| "kl": 0.256591796875, |
| "learning_rate": 3.118055555555555e-07, |
| "loss": 0.0003, |
| "reward": 1.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 1.0, |
| "step": 991 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 462.53125, |
| "epoch": 2.066666666666667, |
| "grad_norm": 1.5279037952423096, |
| "kl": 0.26611328125, |
| "learning_rate": 3.111111111111111e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.1767766922712326, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.75, |
| "step": 992 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 461.21875, |
| "epoch": 2.06875, |
| "grad_norm": 0.01802394911646843, |
| "kl": 0.258544921875, |
| "learning_rate": 3.104166666666667e-07, |
| "loss": 0.0003, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.75, |
| "rewards/format_reward": 0.75, |
| "step": 993 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 458.53125, |
| "epoch": 2.0708333333333333, |
| "grad_norm": 1.1756128072738647, |
| "kl": 0.2548828125, |
| "learning_rate": 3.0972222222222223e-07, |
| "loss": 0.0003, |
| "reward": 0.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.5, |
| "step": 994 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 462.03125, |
| "epoch": 2.0729166666666665, |
| "grad_norm": 1.3839462995529175, |
| "kl": 0.251220703125, |
| "learning_rate": 3.090277777777778e-07, |
| "loss": 0.0003, |
| "reward": 0.96875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.5, |
| "step": 995 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 451.9375, |
| "epoch": 2.075, |
| "grad_norm": 1.179376482963562, |
| "kl": 0.243408203125, |
| "learning_rate": 3.0833333333333333e-07, |
| "loss": 0.0002, |
| "reward": 1.21875, |
| "reward_std": 0.0883883461356163, |
| "rewards/accuracy_reward": 0.46875, |
| "rewards/format_reward": 0.75, |
| "step": 996 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 423.6875, |
| "epoch": 2.0770833333333334, |
| "grad_norm": 0.011729571036994457, |
| "kl": 0.235107421875, |
| "learning_rate": 3.076388888888889e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 997 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 460.0, |
| "epoch": 2.0791666666666666, |
| "grad_norm": 0.007601437624543905, |
| "kl": 0.240234375, |
| "learning_rate": 3.0694444444444443e-07, |
| "loss": 0.0002, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.75, |
| "step": 998 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 452.53125, |
| "epoch": 2.08125, |
| "grad_norm": 0.007883809506893158, |
| "kl": 0.255859375, |
| "learning_rate": 3.0625000000000003e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.5, |
| "step": 999 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 464.34375, |
| "epoch": 2.0833333333333335, |
| "grad_norm": 0.01050996221601963, |
| "kl": 0.250244140625, |
| "learning_rate": 3.055555555555556e-07, |
| "loss": 0.0003, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.75, |
| "step": 1000 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 1440, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 200, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|