| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.5, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "advantages": -2.60770320892334e-08, | |
| "completion_length": 256.0, | |
| "epoch": 0.001, | |
| "grad_norm": 3.7380807399749756, | |
| "kl": 0.0, | |
| "learning_rate": 9.989999999999999e-07, | |
| "loss": 0.0637, | |
| "reward": 0.7604166865348816, | |
| "reward_mean": 0.7604166865348816, | |
| "reward_std": 0.42027419805526733, | |
| "rewards/accuracy_reward": 0.5104166865348816, | |
| "rewards/format_reward": 0.25, | |
| "step": 1 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 201.0625, | |
| "epoch": 0.002, | |
| "grad_norm": 5.145930290222168, | |
| "kl": 0.00118255615234375, | |
| "learning_rate": 9.98e-07, | |
| "loss": -0.0282, | |
| "reward": 0.7708333730697632, | |
| "reward_mean": 0.7708333730697632, | |
| "reward_std": 0.7378304600715637, | |
| "rewards/accuracy_reward": 0.2708333432674408, | |
| "rewards/format_reward": 0.5, | |
| "step": 2 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 232.0, | |
| "epoch": 0.003, | |
| "grad_norm": 3.798980474472046, | |
| "kl": 0.003448486328125, | |
| "learning_rate": 9.97e-07, | |
| "loss": 0.0955, | |
| "reward": 1.1875, | |
| "reward_mean": 1.1875, | |
| "reward_std": 0.7253239154815674, | |
| "rewards/accuracy_reward": 0.375, | |
| "rewards/format_reward": 0.8125, | |
| "step": 3 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-08, | |
| "completion_length": 245.25, | |
| "epoch": 0.004, | |
| "grad_norm": 4.136316299438477, | |
| "kl": 0.00421142578125, | |
| "learning_rate": 9.959999999999999e-07, | |
| "loss": 0.0824, | |
| "reward": 1.125, | |
| "reward_mean": 1.125, | |
| "reward_std": 0.598172664642334, | |
| "rewards/accuracy_reward": 0.3125, | |
| "rewards/format_reward": 0.8125, | |
| "step": 4 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 229.5, | |
| "epoch": 0.005, | |
| "grad_norm": 3.53371000289917, | |
| "kl": 0.00439453125, | |
| "learning_rate": 9.95e-07, | |
| "loss": 0.0099, | |
| "reward": 1.21875, | |
| "reward_mean": 1.21875, | |
| "reward_std": 0.2041158676147461, | |
| "rewards/accuracy_reward": 0.21875, | |
| "rewards/format_reward": 1.0, | |
| "step": 5 | |
| }, | |
| { | |
| "advantages": -5.960464477539063e-08, | |
| "completion_length": 191.125, | |
| "epoch": 0.006, | |
| "grad_norm": 4.162847518920898, | |
| "kl": 0.00921630859375, | |
| "learning_rate": 9.94e-07, | |
| "loss": 0.0321, | |
| "reward": 1.3020833730697632, | |
| "reward_mean": 1.3020833730697632, | |
| "reward_std": 0.41478484869003296, | |
| "rewards/accuracy_reward": 0.4270833432674408, | |
| "rewards/format_reward": 0.875, | |
| "step": 6 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 212.5, | |
| "epoch": 0.007, | |
| "grad_norm": 3.7386105060577393, | |
| "kl": 0.01312255859375, | |
| "learning_rate": 9.929999999999999e-07, | |
| "loss": 0.0328, | |
| "reward": 1.1041667461395264, | |
| "reward_mean": 1.1041667461395264, | |
| "reward_std": 0.349293053150177, | |
| "rewards/accuracy_reward": 0.2291666865348816, | |
| "rewards/format_reward": 0.875, | |
| "step": 7 | |
| }, | |
| { | |
| "advantages": -1.1920928955078125e-07, | |
| "completion_length": 229.5625, | |
| "epoch": 0.008, | |
| "grad_norm": 3.4274792671203613, | |
| "kl": 0.01171875, | |
| "learning_rate": 9.92e-07, | |
| "loss": -0.0102, | |
| "reward": 1.3125, | |
| "reward_mean": 1.3125, | |
| "reward_std": 0.28498581051826477, | |
| "rewards/accuracy_reward": 0.3125, | |
| "rewards/format_reward": 1.0, | |
| "step": 8 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 187.4375, | |
| "epoch": 0.009, | |
| "grad_norm": 5.677432537078857, | |
| "kl": 0.01123046875, | |
| "learning_rate": 9.91e-07, | |
| "loss": 0.1596, | |
| "reward": 1.125, | |
| "reward_mean": 1.125, | |
| "reward_std": 0.4972116947174072, | |
| "rewards/accuracy_reward": 0.1875, | |
| "rewards/format_reward": 0.9375, | |
| "step": 9 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 196.25, | |
| "epoch": 0.01, | |
| "grad_norm": 4.712809085845947, | |
| "kl": 0.0247802734375, | |
| "learning_rate": 9.9e-07, | |
| "loss": -0.1289, | |
| "reward": 1.3333333730697632, | |
| "reward_mean": 1.3333333730697632, | |
| "reward_std": 0.37918925285339355, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 10 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 116.0625, | |
| "epoch": 0.011, | |
| "grad_norm": 4.270755767822266, | |
| "kl": 0.0751953125, | |
| "learning_rate": 9.89e-07, | |
| "loss": -0.0513, | |
| "reward": 1.25, | |
| "reward_mean": 1.25, | |
| "reward_std": 0.26726123690605164, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 1.0, | |
| "step": 11 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 170.0625, | |
| "epoch": 0.012, | |
| "grad_norm": 3.437450408935547, | |
| "kl": 0.04638671875, | |
| "learning_rate": 9.88e-07, | |
| "loss": -0.0187, | |
| "reward": 1.0625, | |
| "reward_mean": 1.0625, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.0625, | |
| "rewards/format_reward": 1.0, | |
| "step": 12 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 86.5, | |
| "epoch": 0.013, | |
| "grad_norm": 4.844762802124023, | |
| "kl": 0.0419921875, | |
| "learning_rate": 9.87e-07, | |
| "loss": 0.0026, | |
| "reward": 1.25, | |
| "reward_mean": 1.25, | |
| "reward_std": 0.26726123690605164, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 1.0, | |
| "step": 13 | |
| }, | |
| { | |
| "advantages": -6.332993507385254e-08, | |
| "completion_length": 186.875, | |
| "epoch": 0.014, | |
| "grad_norm": 4.118823528289795, | |
| "kl": 0.04638671875, | |
| "learning_rate": 9.86e-07, | |
| "loss": -0.0292, | |
| "reward": 1.5833333730697632, | |
| "reward_mean": 1.5833333730697632, | |
| "reward_std": 0.32946425676345825, | |
| "rewards/accuracy_reward": 0.5833333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 14 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 95.75, | |
| "epoch": 0.015, | |
| "grad_norm": 4.095740795135498, | |
| "kl": 0.0703125, | |
| "learning_rate": 9.849999999999999e-07, | |
| "loss": 0.1122, | |
| "reward": 1.5625, | |
| "reward_mean": 1.5625, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 1.0, | |
| "step": 15 | |
| }, | |
| { | |
| "advantages": -9.313225746154785e-08, | |
| "completion_length": 136.9375, | |
| "epoch": 0.016, | |
| "grad_norm": 5.639898777008057, | |
| "kl": 0.0859375, | |
| "learning_rate": 9.84e-07, | |
| "loss": -0.0948, | |
| "reward": 1.5833333730697632, | |
| "reward_mean": 1.5833333730697632, | |
| "reward_std": 0.3827785551548004, | |
| "rewards/accuracy_reward": 0.5833333134651184, | |
| "rewards/format_reward": 1.0, | |
| "step": 16 | |
| }, | |
| { | |
| "advantages": 2.2351741790771484e-08, | |
| "completion_length": 134.1875, | |
| "epoch": 0.017, | |
| "grad_norm": 5.9321980476379395, | |
| "kl": 0.0654296875, | |
| "learning_rate": 9.83e-07, | |
| "loss": -0.126, | |
| "reward": 1.34375, | |
| "reward_mean": 1.34375, | |
| "reward_std": 0.47921282052993774, | |
| "rewards/accuracy_reward": 0.34375, | |
| "rewards/format_reward": 1.0, | |
| "step": 17 | |
| }, | |
| { | |
| "advantages": 5.21540641784668e-08, | |
| "completion_length": 124.375, | |
| "epoch": 0.018, | |
| "grad_norm": 7.639815807342529, | |
| "kl": 0.052734375, | |
| "learning_rate": 9.819999999999999e-07, | |
| "loss": 0.0134, | |
| "reward": 1.25, | |
| "reward_mean": 1.25, | |
| "reward_std": 0.5096293687820435, | |
| "rewards/accuracy_reward": 0.3125000298023224, | |
| "rewards/format_reward": 0.9375, | |
| "step": 18 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 160.6875, | |
| "epoch": 0.019, | |
| "grad_norm": 4.792241096496582, | |
| "kl": 0.064453125, | |
| "learning_rate": 9.81e-07, | |
| "loss": 0.1099, | |
| "reward": 1.125, | |
| "reward_mean": 1.125, | |
| "reward_std": 0.3535533845424652, | |
| "rewards/accuracy_reward": 0.125, | |
| "rewards/format_reward": 1.0, | |
| "step": 19 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 161.0625, | |
| "epoch": 0.02, | |
| "grad_norm": 3.4121909141540527, | |
| "kl": 0.064453125, | |
| "learning_rate": 9.8e-07, | |
| "loss": 0.0186, | |
| "reward": 1.125, | |
| "reward_mean": 1.125, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/accuracy_reward": 0.125, | |
| "rewards/format_reward": 1.0, | |
| "step": 20 | |
| }, | |
| { | |
| "advantages": -2.60770320892334e-07, | |
| "completion_length": 204.0625, | |
| "epoch": 0.021, | |
| "grad_norm": 3.9842889308929443, | |
| "kl": 0.0703125, | |
| "learning_rate": 9.789999999999999e-07, | |
| "loss": -0.0411, | |
| "reward": 1.4166667461395264, | |
| "reward_mean": 1.4166667461395264, | |
| "reward_std": 0.18292954564094543, | |
| "rewards/accuracy_reward": 0.4166666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 21 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 140.875, | |
| "epoch": 0.022, | |
| "grad_norm": 6.302048206329346, | |
| "kl": 0.050537109375, | |
| "learning_rate": 9.78e-07, | |
| "loss": 0.1697, | |
| "reward": 1.3125, | |
| "reward_mean": 1.3125, | |
| "reward_std": 0.3471825420856476, | |
| "rewards/accuracy_reward": 0.3125, | |
| "rewards/format_reward": 1.0, | |
| "step": 22 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 111.5625, | |
| "epoch": 0.023, | |
| "grad_norm": 8.212870597839355, | |
| "kl": 0.1103515625, | |
| "learning_rate": 9.77e-07, | |
| "loss": 0.1932, | |
| "reward": 1.6875, | |
| "reward_mean": 1.6875, | |
| "reward_std": 0.44403791427612305, | |
| "rewards/accuracy_reward": 0.6875, | |
| "rewards/format_reward": 1.0, | |
| "step": 23 | |
| }, | |
| { | |
| "advantages": -9.685754776000977e-08, | |
| "completion_length": 180.25, | |
| "epoch": 0.024, | |
| "grad_norm": 4.934231758117676, | |
| "kl": 0.0791015625, | |
| "learning_rate": 9.759999999999998e-07, | |
| "loss": 0.2501, | |
| "reward": 1.4270833730697632, | |
| "reward_mean": 1.4270833730697632, | |
| "reward_std": 0.31544241309165955, | |
| "rewards/accuracy_reward": 0.4270833432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 24 | |
| }, | |
| { | |
| "advantages": -7.82310962677002e-08, | |
| "completion_length": 117.625, | |
| "epoch": 0.025, | |
| "grad_norm": 6.088715076446533, | |
| "kl": 0.126953125, | |
| "learning_rate": 9.75e-07, | |
| "loss": -0.0576, | |
| "reward": 1.3333333730697632, | |
| "reward_mean": 1.3333333730697632, | |
| "reward_std": 0.36558622121810913, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 25 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 93.5625, | |
| "epoch": 0.026, | |
| "grad_norm": 7.816601753234863, | |
| "kl": 0.1103515625, | |
| "learning_rate": 9.74e-07, | |
| "loss": 0.1863, | |
| "reward": 1.5625, | |
| "reward_mean": 1.5625, | |
| "reward_std": 0.4082317352294922, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 1.0, | |
| "step": 26 | |
| }, | |
| { | |
| "advantages": -4.842877388000488e-08, | |
| "completion_length": 150.6875, | |
| "epoch": 0.027, | |
| "grad_norm": 5.928228378295898, | |
| "kl": 0.162109375, | |
| "learning_rate": 9.729999999999998e-07, | |
| "loss": -0.2134, | |
| "reward": 1.3645833730697632, | |
| "reward_mean": 1.3645833730697632, | |
| "reward_std": 0.28207486867904663, | |
| "rewards/accuracy_reward": 0.4270833730697632, | |
| "rewards/format_reward": 0.9375, | |
| "step": 27 | |
| }, | |
| { | |
| "advantages": -2.2351741790771484e-08, | |
| "completion_length": 98.0, | |
| "epoch": 0.028, | |
| "grad_norm": 6.595263481140137, | |
| "kl": 0.1005859375, | |
| "learning_rate": 9.72e-07, | |
| "loss": -0.0471, | |
| "reward": 1.625, | |
| "reward_mean": 1.625, | |
| "reward_std": 0.2925041913986206, | |
| "rewards/accuracy_reward": 0.625, | |
| "rewards/format_reward": 1.0, | |
| "step": 28 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-08, | |
| "completion_length": 119.75, | |
| "epoch": 0.029, | |
| "grad_norm": 4.980852127075195, | |
| "kl": 0.1279296875, | |
| "learning_rate": 9.709999999999999e-07, | |
| "loss": 0.0662, | |
| "reward": 1.1875, | |
| "reward_mean": 1.1875, | |
| "reward_std": 0.2587745785713196, | |
| "rewards/accuracy_reward": 0.1875, | |
| "rewards/format_reward": 1.0, | |
| "step": 29 | |
| }, | |
| { | |
| "advantages": 7.078051567077637e-08, | |
| "completion_length": 112.875, | |
| "epoch": 0.03, | |
| "grad_norm": 6.368246555328369, | |
| "kl": 0.166015625, | |
| "learning_rate": 9.7e-07, | |
| "loss": 0.0972, | |
| "reward": 1.5729167461395264, | |
| "reward_mean": 1.5729167461395264, | |
| "reward_std": 0.27226415276527405, | |
| "rewards/accuracy_reward": 0.5729166865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 30 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 109.4375, | |
| "epoch": 0.031, | |
| "grad_norm": 3.693007469177246, | |
| "kl": 0.1259765625, | |
| "learning_rate": 9.69e-07, | |
| "loss": -0.0644, | |
| "reward": 1.9375, | |
| "reward_mean": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.9375, | |
| "rewards/format_reward": 1.0, | |
| "step": 31 | |
| }, | |
| { | |
| "advantages": 2.9802322387695312e-08, | |
| "completion_length": 99.0, | |
| "epoch": 0.032, | |
| "grad_norm": 7.649048805236816, | |
| "kl": 0.150390625, | |
| "learning_rate": 9.679999999999999e-07, | |
| "loss": -0.0374, | |
| "reward": 1.5625, | |
| "reward_mean": 1.5625, | |
| "reward_std": 0.41912031173706055, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 1.0, | |
| "step": 32 | |
| }, | |
| { | |
| "advantages": 1.0803341865539551e-07, | |
| "completion_length": 107.25, | |
| "epoch": 0.033, | |
| "grad_norm": 7.077078819274902, | |
| "kl": 0.1416015625, | |
| "learning_rate": 9.67e-07, | |
| "loss": -0.0125, | |
| "reward": 1.4791667461395264, | |
| "reward_mean": 1.4791667461395264, | |
| "reward_std": 0.27867573499679565, | |
| "rewards/accuracy_reward": 0.4791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 33 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 102.875, | |
| "epoch": 0.034, | |
| "grad_norm": 7.0495991706848145, | |
| "kl": 0.125, | |
| "learning_rate": 9.66e-07, | |
| "loss": -0.1118, | |
| "reward": 1.71875, | |
| "reward_mean": 1.71875, | |
| "reward_std": 0.2651650309562683, | |
| "rewards/accuracy_reward": 0.71875, | |
| "rewards/format_reward": 1.0, | |
| "step": 34 | |
| }, | |
| { | |
| "advantages": -2.2351741790771484e-08, | |
| "completion_length": 102.5625, | |
| "epoch": 0.035, | |
| "grad_norm": 6.116304874420166, | |
| "kl": 0.1328125, | |
| "learning_rate": 9.649999999999999e-07, | |
| "loss": -0.023, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.4972116947174072, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 0.9375, | |
| "step": 35 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 121.9375, | |
| "epoch": 0.036, | |
| "grad_norm": 5.6247453689575195, | |
| "kl": 0.25, | |
| "learning_rate": 9.64e-07, | |
| "loss": 0.0057, | |
| "reward": 1.5625, | |
| "reward_mean": 1.5625, | |
| "reward_std": 0.5260357856750488, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 1.0, | |
| "step": 36 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-08, | |
| "completion_length": 114.0, | |
| "epoch": 0.037, | |
| "grad_norm": 5.942628860473633, | |
| "kl": 0.0859375, | |
| "learning_rate": 9.63e-07, | |
| "loss": 0.0132, | |
| "reward": 1.53125, | |
| "reward_mean": 1.53125, | |
| "reward_std": 0.24511480331420898, | |
| "rewards/accuracy_reward": 0.53125, | |
| "rewards/format_reward": 1.0, | |
| "step": 37 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-08, | |
| "completion_length": 120.9375, | |
| "epoch": 0.038, | |
| "grad_norm": 6.8025312423706055, | |
| "kl": 0.09375, | |
| "learning_rate": 9.619999999999999e-07, | |
| "loss": 0.298, | |
| "reward": 1.71875, | |
| "reward_mean": 1.71875, | |
| "reward_std": 0.3608423173427582, | |
| "rewards/accuracy_reward": 0.71875, | |
| "rewards/format_reward": 1.0, | |
| "step": 38 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 95.0, | |
| "epoch": 0.039, | |
| "grad_norm": 4.515552520751953, | |
| "kl": 0.05712890625, | |
| "learning_rate": 9.61e-07, | |
| "loss": -0.0356, | |
| "reward": 1.7291667461395264, | |
| "reward_mean": 1.7291667461395264, | |
| "reward_std": 0.12400396168231964, | |
| "rewards/accuracy_reward": 0.7291666269302368, | |
| "rewards/format_reward": 1.0, | |
| "step": 39 | |
| }, | |
| { | |
| "advantages": -1.6391277313232422e-07, | |
| "completion_length": 115.6875, | |
| "epoch": 0.04, | |
| "grad_norm": 3.725029706954956, | |
| "kl": 0.0947265625, | |
| "learning_rate": 9.6e-07, | |
| "loss": 0.0472, | |
| "reward": 1.3333333730697632, | |
| "reward_mean": 1.3333333730697632, | |
| "reward_std": 0.08908708393573761, | |
| "rewards/accuracy_reward": 0.3333333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 40 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 108.125, | |
| "epoch": 0.041, | |
| "grad_norm": 4.70515775680542, | |
| "kl": 0.11328125, | |
| "learning_rate": 9.589999999999998e-07, | |
| "loss": 0.0818, | |
| "reward": 1.6666667461395264, | |
| "reward_mean": 1.6666667461395264, | |
| "reward_std": 0.08908706903457642, | |
| "rewards/accuracy_reward": 0.6666666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 41 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-07, | |
| "completion_length": 116.1875, | |
| "epoch": 0.042, | |
| "grad_norm": 6.178482532501221, | |
| "kl": 0.125, | |
| "learning_rate": 9.58e-07, | |
| "loss": -0.0352, | |
| "reward": 1.4166667461395264, | |
| "reward_mean": 1.4166667461395264, | |
| "reward_std": 0.18292953073978424, | |
| "rewards/accuracy_reward": 0.4166666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 42 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 114.5625, | |
| "epoch": 0.043, | |
| "grad_norm": 3.4989614486694336, | |
| "kl": 0.087890625, | |
| "learning_rate": 9.57e-07, | |
| "loss": -0.0562, | |
| "reward": 1.875, | |
| "reward_mean": 1.875, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/accuracy_reward": 0.875, | |
| "rewards/format_reward": 1.0, | |
| "step": 43 | |
| }, | |
| { | |
| "advantages": -5.960464477539063e-08, | |
| "completion_length": 111.4375, | |
| "epoch": 0.044, | |
| "grad_norm": 5.46613883972168, | |
| "kl": 0.11181640625, | |
| "learning_rate": 9.559999999999998e-07, | |
| "loss": 0.0258, | |
| "reward": 1.4895833730697632, | |
| "reward_mean": 1.4895833730697632, | |
| "reward_std": 0.2651650309562683, | |
| "rewards/accuracy_reward": 0.4895833730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 44 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 145.5625, | |
| "epoch": 0.045, | |
| "grad_norm": 4.604372501373291, | |
| "kl": 0.140625, | |
| "learning_rate": 9.55e-07, | |
| "loss": -0.1122, | |
| "reward": 1.65625, | |
| "reward_mean": 1.65625, | |
| "reward_std": 0.2041158676147461, | |
| "rewards/accuracy_reward": 0.65625, | |
| "rewards/format_reward": 1.0, | |
| "step": 45 | |
| }, | |
| { | |
| "advantages": -1.0803341865539551e-07, | |
| "completion_length": 133.6875, | |
| "epoch": 0.046, | |
| "grad_norm": 5.48823881149292, | |
| "kl": 0.08935546875, | |
| "learning_rate": 9.539999999999999e-07, | |
| "loss": 0.0234, | |
| "reward": 1.5416667461395264, | |
| "reward_mean": 1.5416667461395264, | |
| "reward_std": 0.24800795316696167, | |
| "rewards/accuracy_reward": 0.5416666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 46 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 113.8125, | |
| "epoch": 0.047, | |
| "grad_norm": 3.9073755741119385, | |
| "kl": 0.12451171875, | |
| "learning_rate": 9.529999999999999e-07, | |
| "loss": 0.0176, | |
| "reward": 1.4583333730697632, | |
| "reward_mean": 1.4583333730697632, | |
| "reward_std": 0.18898223340511322, | |
| "rewards/accuracy_reward": 0.4583333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 47 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 145.6875, | |
| "epoch": 0.048, | |
| "grad_norm": 5.332810878753662, | |
| "kl": 0.08740234375, | |
| "learning_rate": 9.52e-07, | |
| "loss": -0.0314, | |
| "reward": 1.59375, | |
| "reward_mean": 1.59375, | |
| "reward_std": 0.22201895713806152, | |
| "rewards/accuracy_reward": 0.59375, | |
| "rewards/format_reward": 1.0, | |
| "step": 48 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 126.875, | |
| "epoch": 0.049, | |
| "grad_norm": 5.358933925628662, | |
| "kl": 0.150390625, | |
| "learning_rate": 9.509999999999999e-07, | |
| "loss": 0.1055, | |
| "reward": 1.5625, | |
| "reward_mean": 1.5625, | |
| "reward_std": 0.249358132481575, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 1.0, | |
| "step": 49 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 138.875, | |
| "epoch": 0.05, | |
| "grad_norm": 5.692139625549316, | |
| "kl": 0.0859375, | |
| "learning_rate": 9.499999999999999e-07, | |
| "loss": -0.1248, | |
| "reward": 1.4479167461395264, | |
| "reward_mean": 1.4479167461395264, | |
| "reward_std": 0.17747542262077332, | |
| "rewards/accuracy_reward": 0.4479166865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 50 | |
| }, | |
| { | |
| "advantages": 2.60770320892334e-08, | |
| "completion_length": 146.875, | |
| "epoch": 0.051, | |
| "grad_norm": 5.381588459014893, | |
| "kl": 0.08935546875, | |
| "learning_rate": 9.489999999999999e-07, | |
| "loss": -0.129, | |
| "reward": 1.46875, | |
| "reward_mean": 1.46875, | |
| "reward_std": 0.41746097803115845, | |
| "rewards/accuracy_reward": 0.46875, | |
| "rewards/format_reward": 1.0, | |
| "step": 51 | |
| }, | |
| { | |
| "advantages": 2.2351741790771484e-08, | |
| "completion_length": 150.875, | |
| "epoch": 0.052, | |
| "grad_norm": 5.1832451820373535, | |
| "kl": 0.083984375, | |
| "learning_rate": 9.479999999999999e-07, | |
| "loss": -0.0529, | |
| "reward": 1.375, | |
| "reward_mean": 1.375, | |
| "reward_std": 0.2925041913986206, | |
| "rewards/accuracy_reward": 0.375, | |
| "rewards/format_reward": 1.0, | |
| "step": 52 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 124.8125, | |
| "epoch": 0.053, | |
| "grad_norm": 0.0, | |
| "kl": 0.1689453125, | |
| "learning_rate": 9.469999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.25, | |
| "reward_mean": 1.25, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 1.0, | |
| "step": 53 | |
| }, | |
| { | |
| "advantages": -2.9802322387695312e-08, | |
| "completion_length": 126.0, | |
| "epoch": 0.054, | |
| "grad_norm": 6.027964115142822, | |
| "kl": 0.07421875, | |
| "learning_rate": 9.459999999999999e-07, | |
| "loss": 0.1005, | |
| "reward": 1.8125, | |
| "reward_mean": 1.8125, | |
| "reward_std": 0.3471825420856476, | |
| "rewards/accuracy_reward": 0.8125, | |
| "rewards/format_reward": 1.0, | |
| "step": 54 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 144.75, | |
| "epoch": 0.055, | |
| "grad_norm": 4.921864986419678, | |
| "kl": 0.07568359375, | |
| "learning_rate": 9.45e-07, | |
| "loss": 0.0249, | |
| "reward": 1.4375, | |
| "reward_mean": 1.4375, | |
| "reward_std": 0.3047097325325012, | |
| "rewards/accuracy_reward": 0.4375, | |
| "rewards/format_reward": 1.0, | |
| "step": 55 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-08, | |
| "completion_length": 125.125, | |
| "epoch": 0.056, | |
| "grad_norm": 5.077033042907715, | |
| "kl": 0.0673828125, | |
| "learning_rate": 9.439999999999999e-07, | |
| "loss": -0.045, | |
| "reward": 1.5625, | |
| "reward_mean": 1.5625, | |
| "reward_std": 0.5260357856750488, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 1.0, | |
| "step": 56 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-08, | |
| "completion_length": 131.625, | |
| "epoch": 0.057, | |
| "grad_norm": 4.984986782073975, | |
| "kl": 0.1015625, | |
| "learning_rate": 9.429999999999999e-07, | |
| "loss": -0.0956, | |
| "reward": 1.46875, | |
| "reward_mean": 1.46875, | |
| "reward_std": 0.3608423173427582, | |
| "rewards/accuracy_reward": 0.46875, | |
| "rewards/format_reward": 1.0, | |
| "step": 57 | |
| }, | |
| { | |
| "advantages": 4.470348358154297e-08, | |
| "completion_length": 175.25, | |
| "epoch": 0.058, | |
| "grad_norm": 3.103456974029541, | |
| "kl": 0.0810546875, | |
| "learning_rate": 9.419999999999999e-07, | |
| "loss": -0.0034, | |
| "reward": 1.2291667461395264, | |
| "reward_mean": 1.2291667461395264, | |
| "reward_std": 0.12400396913290024, | |
| "rewards/accuracy_reward": 0.2291666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 58 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-08, | |
| "completion_length": 147.25, | |
| "epoch": 0.059, | |
| "grad_norm": 4.62039852142334, | |
| "kl": 0.07763671875, | |
| "learning_rate": 9.409999999999999e-07, | |
| "loss": 0.1301, | |
| "reward": 1.625, | |
| "reward_mean": 1.625, | |
| "reward_std": 0.37714511156082153, | |
| "rewards/accuracy_reward": 0.625, | |
| "rewards/format_reward": 1.0, | |
| "step": 59 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 167.25, | |
| "epoch": 0.06, | |
| "grad_norm": 4.27726411819458, | |
| "kl": 0.111328125, | |
| "learning_rate": 9.399999999999999e-07, | |
| "loss": -0.0044, | |
| "reward": 1.125, | |
| "reward_mean": 1.125, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/accuracy_reward": 0.125, | |
| "rewards/format_reward": 1.0, | |
| "step": 60 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 147.25, | |
| "epoch": 0.061, | |
| "grad_norm": 4.738762855529785, | |
| "kl": 0.087890625, | |
| "learning_rate": 9.389999999999999e-07, | |
| "loss": -0.0094, | |
| "reward": 1.375, | |
| "reward_mean": 1.375, | |
| "reward_std": 0.40089184045791626, | |
| "rewards/accuracy_reward": 0.375, | |
| "rewards/format_reward": 1.0, | |
| "step": 61 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-08, | |
| "completion_length": 163.75, | |
| "epoch": 0.062, | |
| "grad_norm": 4.51692008972168, | |
| "kl": 0.07421875, | |
| "learning_rate": 9.379999999999998e-07, | |
| "loss": 0.1029, | |
| "reward": 1.375, | |
| "reward_mean": 1.375, | |
| "reward_std": 0.48037588596343994, | |
| "rewards/accuracy_reward": 0.375, | |
| "rewards/format_reward": 1.0, | |
| "step": 62 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 128.5625, | |
| "epoch": 0.063, | |
| "grad_norm": 3.7537429332733154, | |
| "kl": 0.06396484375, | |
| "learning_rate": 9.37e-07, | |
| "loss": 0.0021, | |
| "reward": 1.21875, | |
| "reward_mean": 1.21875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.21875, | |
| "rewards/format_reward": 1.0, | |
| "step": 63 | |
| }, | |
| { | |
| "advantages": -8.195638656616211e-08, | |
| "completion_length": 183.0625, | |
| "epoch": 0.064, | |
| "grad_norm": 4.70877742767334, | |
| "kl": 0.0732421875, | |
| "learning_rate": 9.36e-07, | |
| "loss": -0.0168, | |
| "reward": 1.1041667461395264, | |
| "reward_mean": 1.1041667461395264, | |
| "reward_std": 0.25392839312553406, | |
| "rewards/accuracy_reward": 0.1041666716337204, | |
| "rewards/format_reward": 1.0, | |
| "step": 64 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 169.0625, | |
| "epoch": 0.065, | |
| "grad_norm": 2.69047212600708, | |
| "kl": 0.0927734375, | |
| "learning_rate": 9.35e-07, | |
| "loss": -0.0626, | |
| "reward": 1.59375, | |
| "reward_mean": 1.59375, | |
| "reward_std": 0.2651650309562683, | |
| "rewards/accuracy_reward": 0.65625, | |
| "rewards/format_reward": 0.9375, | |
| "step": 65 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-08, | |
| "completion_length": 125.9375, | |
| "epoch": 0.066, | |
| "grad_norm": 5.199371814727783, | |
| "kl": 0.109375, | |
| "learning_rate": 9.34e-07, | |
| "loss": 0.0566, | |
| "reward": 1.53125, | |
| "reward_mean": 1.53125, | |
| "reward_std": 0.24511480331420898, | |
| "rewards/accuracy_reward": 0.53125, | |
| "rewards/format_reward": 1.0, | |
| "step": 66 | |
| }, | |
| { | |
| "advantages": 7.078051567077637e-08, | |
| "completion_length": 161.5, | |
| "epoch": 0.067, | |
| "grad_norm": 4.959042549133301, | |
| "kl": 0.1240234375, | |
| "learning_rate": 9.33e-07, | |
| "loss": -0.0491, | |
| "reward": 1.5416667461395264, | |
| "reward_mean": 1.5416667461395264, | |
| "reward_std": 0.20198571681976318, | |
| "rewards/accuracy_reward": 0.5416666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 67 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 131.1875, | |
| "epoch": 0.068, | |
| "grad_norm": 0.0, | |
| "kl": 0.064453125, | |
| "learning_rate": 9.32e-07, | |
| "loss": 0.0, | |
| "reward": 1.25, | |
| "reward_mean": 1.25, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 1.0, | |
| "step": 68 | |
| }, | |
| { | |
| "advantages": -5.587935447692871e-08, | |
| "completion_length": 149.3125, | |
| "epoch": 0.069, | |
| "grad_norm": 5.306145668029785, | |
| "kl": 0.0908203125, | |
| "learning_rate": 9.31e-07, | |
| "loss": 0.1358, | |
| "reward": 1.7291667461395264, | |
| "reward_mean": 1.7291667461395264, | |
| "reward_std": 0.384762704372406, | |
| "rewards/accuracy_reward": 0.7291666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 69 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 158.8125, | |
| "epoch": 0.07, | |
| "grad_norm": 4.328370571136475, | |
| "kl": 0.107421875, | |
| "learning_rate": 9.3e-07, | |
| "loss": -0.0507, | |
| "reward": 1.4375, | |
| "reward_mean": 1.4375, | |
| "reward_std": 0.3204349875450134, | |
| "rewards/accuracy_reward": 0.4375, | |
| "rewards/format_reward": 1.0, | |
| "step": 70 | |
| }, | |
| { | |
| "advantages": -2.2351741790771484e-08, | |
| "completion_length": 130.5, | |
| "epoch": 0.071, | |
| "grad_norm": 5.123632431030273, | |
| "kl": 0.1259765625, | |
| "learning_rate": 9.29e-07, | |
| "loss": -0.0134, | |
| "reward": 1.8125, | |
| "reward_mean": 1.8125, | |
| "reward_std": 0.4082317352294922, | |
| "rewards/accuracy_reward": 0.8125, | |
| "rewards/format_reward": 1.0, | |
| "step": 71 | |
| }, | |
| { | |
| "advantages": 2.9802322387695312e-08, | |
| "completion_length": 180.6875, | |
| "epoch": 0.072, | |
| "grad_norm": 2.0722601413726807, | |
| "kl": 0.0810546875, | |
| "learning_rate": 9.28e-07, | |
| "loss": -0.02, | |
| "reward": 1.4791667461395264, | |
| "reward_mean": 1.4791667461395264, | |
| "reward_std": 0.15268757939338684, | |
| "rewards/accuracy_reward": 0.4791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 72 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 138.25, | |
| "epoch": 0.073, | |
| "grad_norm": 6.854410648345947, | |
| "kl": 0.1533203125, | |
| "learning_rate": 9.27e-07, | |
| "loss": -0.1078, | |
| "reward": 1.34375, | |
| "reward_mean": 1.34375, | |
| "reward_std": 0.3369941711425781, | |
| "rewards/accuracy_reward": 0.34375, | |
| "rewards/format_reward": 1.0, | |
| "step": 73 | |
| }, | |
| { | |
| "advantages": 1.564621925354004e-07, | |
| "completion_length": 159.75, | |
| "epoch": 0.074, | |
| "grad_norm": 5.237288951873779, | |
| "kl": 0.11328125, | |
| "learning_rate": 9.26e-07, | |
| "loss": -0.0132, | |
| "reward": 1.6875, | |
| "reward_mean": 1.6875, | |
| "reward_std": 0.2658637762069702, | |
| "rewards/accuracy_reward": 0.6875, | |
| "rewards/format_reward": 1.0, | |
| "step": 74 | |
| }, | |
| { | |
| "advantages": -1.0803341865539551e-07, | |
| "completion_length": 158.4375, | |
| "epoch": 0.075, | |
| "grad_norm": 4.552402496337891, | |
| "kl": 0.11328125, | |
| "learning_rate": 9.25e-07, | |
| "loss": 0.078, | |
| "reward": 1.5520833730697632, | |
| "reward_mean": 1.5520833730697632, | |
| "reward_std": 0.20653896033763885, | |
| "rewards/accuracy_reward": 0.5520833730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 75 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 172.8125, | |
| "epoch": 0.076, | |
| "grad_norm": 4.742386817932129, | |
| "kl": 0.146484375, | |
| "learning_rate": 9.24e-07, | |
| "loss": -0.016, | |
| "reward": 1.40625, | |
| "reward_mean": 1.40625, | |
| "reward_std": 0.3198433816432953, | |
| "rewards/accuracy_reward": 0.40625, | |
| "rewards/format_reward": 1.0, | |
| "step": 76 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 125.3125, | |
| "epoch": 0.077, | |
| "grad_norm": 3.670785903930664, | |
| "kl": 0.10791015625, | |
| "learning_rate": 9.23e-07, | |
| "loss": -0.0209, | |
| "reward": 1.4375, | |
| "reward_mean": 1.4375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.4375, | |
| "rewards/format_reward": 1.0, | |
| "step": 77 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 132.4375, | |
| "epoch": 0.078, | |
| "grad_norm": 3.141366958618164, | |
| "kl": 0.17578125, | |
| "learning_rate": 9.22e-07, | |
| "loss": 0.0153, | |
| "reward": 1.28125, | |
| "reward_mean": 1.28125, | |
| "reward_std": 0.1602174937725067, | |
| "rewards/accuracy_reward": 0.28125, | |
| "rewards/format_reward": 1.0, | |
| "step": 78 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 155.0, | |
| "epoch": 0.079, | |
| "grad_norm": 4.80902099609375, | |
| "kl": 0.16796875, | |
| "learning_rate": 9.21e-07, | |
| "loss": 0.013, | |
| "reward": 1.6875, | |
| "reward_mean": 1.6875, | |
| "reward_std": 0.44403791427612305, | |
| "rewards/accuracy_reward": 0.6875, | |
| "rewards/format_reward": 1.0, | |
| "step": 79 | |
| }, | |
| { | |
| "advantages": 5.587935447692871e-08, | |
| "completion_length": 151.1875, | |
| "epoch": 0.08, | |
| "grad_norm": 2.728870391845703, | |
| "kl": 0.150390625, | |
| "learning_rate": 9.2e-07, | |
| "loss": -0.0162, | |
| "reward": 1.2291667461395264, | |
| "reward_mean": 1.2291667461395264, | |
| "reward_std": 0.08625819534063339, | |
| "rewards/accuracy_reward": 0.2291666716337204, | |
| "rewards/format_reward": 1.0, | |
| "step": 80 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 135.375, | |
| "epoch": 0.081, | |
| "grad_norm": 0.0, | |
| "kl": 0.140625, | |
| "learning_rate": 9.19e-07, | |
| "loss": 0.0, | |
| "reward": 1.25, | |
| "reward_mean": 1.25, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 1.0, | |
| "step": 81 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 131.0625, | |
| "epoch": 0.082, | |
| "grad_norm": 5.101280212402344, | |
| "kl": 0.119140625, | |
| "learning_rate": 9.18e-07, | |
| "loss": 0.0076, | |
| "reward": 1.875, | |
| "reward_mean": 1.875, | |
| "reward_std": 0.3535533845424652, | |
| "rewards/accuracy_reward": 0.875, | |
| "rewards/format_reward": 1.0, | |
| "step": 82 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 149.8125, | |
| "epoch": 0.083, | |
| "grad_norm": 6.299161911010742, | |
| "kl": 0.1328125, | |
| "learning_rate": 9.17e-07, | |
| "loss": 0.1103, | |
| "reward": 1.34375, | |
| "reward_mean": 1.34375, | |
| "reward_std": 0.5430608987808228, | |
| "rewards/accuracy_reward": 0.40625, | |
| "rewards/format_reward": 0.9375, | |
| "step": 83 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 158.8125, | |
| "epoch": 0.084, | |
| "grad_norm": 5.345361232757568, | |
| "kl": 0.16015625, | |
| "learning_rate": 9.16e-07, | |
| "loss": -0.0678, | |
| "reward": 1.3125, | |
| "reward_mean": 1.3125, | |
| "reward_std": 0.3471825420856476, | |
| "rewards/accuracy_reward": 0.3125, | |
| "rewards/format_reward": 1.0, | |
| "step": 84 | |
| }, | |
| { | |
| "advantages": 8.940696716308594e-08, | |
| "completion_length": 147.0, | |
| "epoch": 0.085, | |
| "grad_norm": 4.728163719177246, | |
| "kl": 0.1328125, | |
| "learning_rate": 9.15e-07, | |
| "loss": 0.0498, | |
| "reward": 1.5104167461395264, | |
| "reward_mean": 1.5104167461395264, | |
| "reward_std": 0.16554003953933716, | |
| "rewards/accuracy_reward": 0.5104166865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 85 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-08, | |
| "completion_length": 135.4375, | |
| "epoch": 0.086, | |
| "grad_norm": 5.456924915313721, | |
| "kl": 0.146484375, | |
| "learning_rate": 9.14e-07, | |
| "loss": -0.0519, | |
| "reward": 1.4375, | |
| "reward_mean": 1.4375, | |
| "reward_std": 0.3924052119255066, | |
| "rewards/accuracy_reward": 0.4375, | |
| "rewards/format_reward": 1.0, | |
| "step": 86 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 141.875, | |
| "epoch": 0.087, | |
| "grad_norm": 3.1715574264526367, | |
| "kl": 0.2578125, | |
| "learning_rate": 9.13e-07, | |
| "loss": 0.0425, | |
| "reward": 1.84375, | |
| "reward_mean": 1.84375, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/accuracy_reward": 0.84375, | |
| "rewards/format_reward": 1.0, | |
| "step": 87 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 136.25, | |
| "epoch": 0.088, | |
| "grad_norm": 6.198694705963135, | |
| "kl": 0.154296875, | |
| "learning_rate": 9.12e-07, | |
| "loss": 0.148, | |
| "reward": 1.0, | |
| "reward_mean": 1.0, | |
| "reward_std": 0.3535533845424652, | |
| "rewards/accuracy_reward": 0.0625, | |
| "rewards/format_reward": 0.9375, | |
| "step": 88 | |
| }, | |
| { | |
| "advantages": -1.1175870895385742e-08, | |
| "completion_length": 150.3125, | |
| "epoch": 0.089, | |
| "grad_norm": 5.361752510070801, | |
| "kl": 0.244140625, | |
| "learning_rate": 9.109999999999999e-07, | |
| "loss": -0.1088, | |
| "reward": 1.78125, | |
| "reward_mean": 1.78125, | |
| "reward_std": 0.3608423173427582, | |
| "rewards/accuracy_reward": 0.78125, | |
| "rewards/format_reward": 1.0, | |
| "step": 89 | |
| }, | |
| { | |
| "advantages": 1.2665987014770508e-07, | |
| "completion_length": 162.5625, | |
| "epoch": 0.09, | |
| "grad_norm": 3.3533167839050293, | |
| "kl": 0.15625, | |
| "learning_rate": 9.1e-07, | |
| "loss": 0.0269, | |
| "reward": 1.3541667461395264, | |
| "reward_mean": 1.3541667461395264, | |
| "reward_std": 0.05892554670572281, | |
| "rewards/accuracy_reward": 0.3541666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 90 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 154.125, | |
| "epoch": 0.091, | |
| "grad_norm": 4.257230281829834, | |
| "kl": 0.138671875, | |
| "learning_rate": 9.09e-07, | |
| "loss": -0.0494, | |
| "reward": 1.65625, | |
| "reward_mean": 1.65625, | |
| "reward_std": 0.3369941711425781, | |
| "rewards/accuracy_reward": 0.65625, | |
| "rewards/format_reward": 1.0, | |
| "step": 91 | |
| }, | |
| { | |
| "advantages": 5.587935447692871e-08, | |
| "completion_length": 153.5625, | |
| "epoch": 0.092, | |
| "grad_norm": 3.06853985786438, | |
| "kl": 0.146484375, | |
| "learning_rate": 9.08e-07, | |
| "loss": -0.0155, | |
| "reward": 1.7291667461395264, | |
| "reward_mean": 1.7291667461395264, | |
| "reward_std": 0.08625819534063339, | |
| "rewards/accuracy_reward": 0.7291666269302368, | |
| "rewards/format_reward": 1.0, | |
| "step": 92 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 173.5625, | |
| "epoch": 0.093, | |
| "grad_norm": 4.021603584289551, | |
| "kl": 0.150390625, | |
| "learning_rate": 9.07e-07, | |
| "loss": 0.1183, | |
| "reward": 1.34375, | |
| "reward_mean": 1.34375, | |
| "reward_std": 0.3369941711425781, | |
| "rewards/accuracy_reward": 0.34375, | |
| "rewards/format_reward": 1.0, | |
| "step": 93 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 137.125, | |
| "epoch": 0.094, | |
| "grad_norm": 3.569105625152588, | |
| "kl": 0.1728515625, | |
| "learning_rate": 9.06e-07, | |
| "loss": -0.0621, | |
| "reward": 1.3125, | |
| "reward_mean": 1.3125, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/accuracy_reward": 0.3125, | |
| "rewards/format_reward": 1.0, | |
| "step": 94 | |
| }, | |
| { | |
| "advantages": -3.3527612686157227e-08, | |
| "completion_length": 151.4375, | |
| "epoch": 0.095, | |
| "grad_norm": 4.879980564117432, | |
| "kl": 0.1416015625, | |
| "learning_rate": 9.05e-07, | |
| "loss": -0.0726, | |
| "reward": 1.3958333730697632, | |
| "reward_mean": 1.3958333730697632, | |
| "reward_std": 0.3177132308483124, | |
| "rewards/accuracy_reward": 0.3958333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 95 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 138.9375, | |
| "epoch": 0.096, | |
| "grad_norm": 3.0653719902038574, | |
| "kl": 0.201171875, | |
| "learning_rate": 9.039999999999999e-07, | |
| "loss": 0.0793, | |
| "reward": 1.0625, | |
| "reward_mean": 1.0625, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/accuracy_reward": 0.0625, | |
| "rewards/format_reward": 1.0, | |
| "step": 96 | |
| }, | |
| { | |
| "advantages": -2.2351741790771484e-08, | |
| "completion_length": 165.0, | |
| "epoch": 0.097, | |
| "grad_norm": 5.2285027503967285, | |
| "kl": 0.17578125, | |
| "learning_rate": 9.03e-07, | |
| "loss": -0.0091, | |
| "reward": 1.5833333730697632, | |
| "reward_mean": 1.5833333730697632, | |
| "reward_std": 0.5487886071205139, | |
| "rewards/accuracy_reward": 0.7083333730697632, | |
| "rewards/format_reward": 0.875, | |
| "step": 97 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-08, | |
| "completion_length": 187.9375, | |
| "epoch": 0.098, | |
| "grad_norm": 5.323462963104248, | |
| "kl": 0.13671875, | |
| "learning_rate": 9.02e-07, | |
| "loss": 0.1548, | |
| "reward": 1.78125, | |
| "reward_mean": 1.78125, | |
| "reward_std": 0.3471629321575165, | |
| "rewards/accuracy_reward": 0.78125, | |
| "rewards/format_reward": 1.0, | |
| "step": 98 | |
| }, | |
| { | |
| "advantages": -1.043081283569336e-07, | |
| "completion_length": 203.3125, | |
| "epoch": 0.099, | |
| "grad_norm": 4.417811870574951, | |
| "kl": 0.1181640625, | |
| "learning_rate": 9.01e-07, | |
| "loss": -0.0588, | |
| "reward": 1.6666667461395264, | |
| "reward_mean": 1.6666667461395264, | |
| "reward_std": 0.21507522463798523, | |
| "rewards/accuracy_reward": 0.6666666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 99 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 205.1875, | |
| "epoch": 0.1, | |
| "grad_norm": 5.388199329376221, | |
| "kl": 0.1484375, | |
| "learning_rate": 9e-07, | |
| "loss": 0.2401, | |
| "reward": 1.4375, | |
| "reward_mean": 1.4375, | |
| "reward_std": 0.2486058473587036, | |
| "rewards/accuracy_reward": 0.4375, | |
| "rewards/format_reward": 1.0, | |
| "step": 100 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 128.9375, | |
| "epoch": 0.101, | |
| "grad_norm": 0.0, | |
| "kl": 0.13671875, | |
| "learning_rate": 8.99e-07, | |
| "loss": 0.0, | |
| "reward": 1.0, | |
| "reward_mean": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 101 | |
| }, | |
| { | |
| "advantages": -1.0803341865539551e-07, | |
| "completion_length": 199.75, | |
| "epoch": 0.102, | |
| "grad_norm": 4.06512975692749, | |
| "kl": 0.12109375, | |
| "learning_rate": 8.98e-07, | |
| "loss": 0.0036, | |
| "reward": 1.3541667461395264, | |
| "reward_mean": 1.3541667461395264, | |
| "reward_std": 0.33592626452445984, | |
| "rewards/accuracy_reward": 0.3541666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 102 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 177.625, | |
| "epoch": 0.103, | |
| "grad_norm": 4.752602577209473, | |
| "kl": 0.140625, | |
| "learning_rate": 8.969999999999999e-07, | |
| "loss": 0.0261, | |
| "reward": 1.4583333730697632, | |
| "reward_mean": 1.4583333730697632, | |
| "reward_std": 0.27215445041656494, | |
| "rewards/accuracy_reward": 0.4583333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 103 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 123.625, | |
| "epoch": 0.104, | |
| "grad_norm": 5.437667369842529, | |
| "kl": 0.177734375, | |
| "learning_rate": 8.96e-07, | |
| "loss": -0.0163, | |
| "reward": 1.4375, | |
| "reward_mean": 1.4375, | |
| "reward_std": 0.4082317352294922, | |
| "rewards/accuracy_reward": 0.4375, | |
| "rewards/format_reward": 1.0, | |
| "step": 104 | |
| }, | |
| { | |
| "advantages": 1.1175870895385742e-08, | |
| "completion_length": 162.4375, | |
| "epoch": 0.105, | |
| "grad_norm": 5.383893013000488, | |
| "kl": 0.1123046875, | |
| "learning_rate": 8.95e-07, | |
| "loss": 0.0715, | |
| "reward": 1.375, | |
| "reward_mean": 1.375, | |
| "reward_std": 0.3535533845424652, | |
| "rewards/accuracy_reward": 0.375, | |
| "rewards/format_reward": 1.0, | |
| "step": 105 | |
| }, | |
| { | |
| "advantages": -1.30385160446167e-07, | |
| "completion_length": 178.75, | |
| "epoch": 0.106, | |
| "grad_norm": 4.805429935455322, | |
| "kl": 0.142578125, | |
| "learning_rate": 8.939999999999999e-07, | |
| "loss": -0.0543, | |
| "reward": 1.375, | |
| "reward_mean": 1.375, | |
| "reward_std": 0.1451837718486786, | |
| "rewards/accuracy_reward": 0.3750000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 106 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 170.75, | |
| "epoch": 0.107, | |
| "grad_norm": 2.5841424465179443, | |
| "kl": 0.28125, | |
| "learning_rate": 8.93e-07, | |
| "loss": -0.0829, | |
| "reward": 1.9375, | |
| "reward_mean": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.9375, | |
| "rewards/format_reward": 1.0, | |
| "step": 107 | |
| }, | |
| { | |
| "advantages": -5.587935447692871e-08, | |
| "completion_length": 188.6875, | |
| "epoch": 0.108, | |
| "grad_norm": 4.707062244415283, | |
| "kl": 0.15234375, | |
| "learning_rate": 8.92e-07, | |
| "loss": -0.0264, | |
| "reward": 1.6458333730697632, | |
| "reward_mean": 1.6458333730697632, | |
| "reward_std": 0.31493228673934937, | |
| "rewards/accuracy_reward": 0.6458333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 108 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 190.9375, | |
| "epoch": 0.109, | |
| "grad_norm": 5.0554022789001465, | |
| "kl": 0.1396484375, | |
| "learning_rate": 8.91e-07, | |
| "loss": 0.0244, | |
| "reward": 1.71875, | |
| "reward_mean": 1.71875, | |
| "reward_std": 0.2651650309562683, | |
| "rewards/accuracy_reward": 0.71875, | |
| "rewards/format_reward": 1.0, | |
| "step": 109 | |
| }, | |
| { | |
| "advantages": 9.685754776000977e-08, | |
| "completion_length": 186.0, | |
| "epoch": 0.11, | |
| "grad_norm": 5.242109298706055, | |
| "kl": 0.142578125, | |
| "learning_rate": 8.9e-07, | |
| "loss": -0.0693, | |
| "reward": 1.3541667461395264, | |
| "reward_mean": 1.3541667461395264, | |
| "reward_std": 0.25392839312553406, | |
| "rewards/accuracy_reward": 0.3541666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 110 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-08, | |
| "completion_length": 176.25, | |
| "epoch": 0.111, | |
| "grad_norm": 4.1332621574401855, | |
| "kl": 0.1494140625, | |
| "learning_rate": 8.89e-07, | |
| "loss": -0.0321, | |
| "reward": 1.4375, | |
| "reward_mean": 1.4375, | |
| "reward_std": 0.12400396913290024, | |
| "rewards/accuracy_reward": 0.4375, | |
| "rewards/format_reward": 1.0, | |
| "step": 111 | |
| }, | |
| { | |
| "advantages": 3.725290298461914e-09, | |
| "completion_length": 144.4375, | |
| "epoch": 0.112, | |
| "grad_norm": 3.3191065788269043, | |
| "kl": 0.15234375, | |
| "learning_rate": 8.88e-07, | |
| "loss": -0.0225, | |
| "reward": 1.90625, | |
| "reward_mean": 1.90625, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/accuracy_reward": 0.90625, | |
| "rewards/format_reward": 1.0, | |
| "step": 112 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 176.5, | |
| "epoch": 0.113, | |
| "grad_norm": 4.919429302215576, | |
| "kl": 0.14453125, | |
| "learning_rate": 8.869999999999999e-07, | |
| "loss": 0.0628, | |
| "reward": 1.71875, | |
| "reward_mean": 1.71875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.71875, | |
| "rewards/format_reward": 1.0, | |
| "step": 113 | |
| }, | |
| { | |
| "advantages": 7.078051567077637e-08, | |
| "completion_length": 189.375, | |
| "epoch": 0.114, | |
| "grad_norm": 4.54962682723999, | |
| "kl": 0.14453125, | |
| "learning_rate": 8.86e-07, | |
| "loss": 0.0199, | |
| "reward": 1.6041667461395264, | |
| "reward_mean": 1.6041667461395264, | |
| "reward_std": 0.33592626452445984, | |
| "rewards/accuracy_reward": 0.6041666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 114 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 199.25, | |
| "epoch": 0.115, | |
| "grad_norm": 3.2728166580200195, | |
| "kl": 0.1640625, | |
| "learning_rate": 8.85e-07, | |
| "loss": -0.0123, | |
| "reward": 1.3958333730697632, | |
| "reward_mean": 1.3958333730697632, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/accuracy_reward": 0.3958333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 115 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 192.375, | |
| "epoch": 0.116, | |
| "grad_norm": 3.095080614089966, | |
| "kl": 0.146484375, | |
| "learning_rate": 8.839999999999999e-07, | |
| "loss": -0.0267, | |
| "reward": 1.0625, | |
| "reward_mean": 1.0625, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.0625, | |
| "rewards/format_reward": 1.0, | |
| "step": 116 | |
| }, | |
| { | |
| "advantages": 3.3527612686157227e-08, | |
| "completion_length": 152.625, | |
| "epoch": 0.117, | |
| "grad_norm": 5.22807502746582, | |
| "kl": 0.1669921875, | |
| "learning_rate": 8.83e-07, | |
| "loss": -0.0066, | |
| "reward": 1.3125, | |
| "reward_mean": 1.3125, | |
| "reward_std": 0.49022960662841797, | |
| "rewards/accuracy_reward": 0.3125, | |
| "rewards/format_reward": 1.0, | |
| "step": 117 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 155.375, | |
| "epoch": 0.118, | |
| "grad_norm": 0.0, | |
| "kl": 0.150390625, | |
| "learning_rate": 8.82e-07, | |
| "loss": 0.0, | |
| "reward": 1.25, | |
| "reward_mean": 1.25, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 1.0, | |
| "step": 118 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 143.375, | |
| "epoch": 0.119, | |
| "grad_norm": 0.0, | |
| "kl": 0.1943359375, | |
| "learning_rate": 8.81e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 119 | |
| }, | |
| { | |
| "advantages": 3.725290298461914e-08, | |
| "completion_length": 164.5625, | |
| "epoch": 0.12, | |
| "grad_norm": 3.923374891281128, | |
| "kl": 0.162109375, | |
| "learning_rate": 8.799999999999999e-07, | |
| "loss": -0.0003, | |
| "reward": 1.3541667461395264, | |
| "reward_mean": 1.3541667461395264, | |
| "reward_std": 0.26346173882484436, | |
| "rewards/accuracy_reward": 0.3541666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 120 | |
| }, | |
| { | |
| "advantages": 3.725290298461914e-09, | |
| "completion_length": 180.0625, | |
| "epoch": 0.121, | |
| "grad_norm": 4.817902565002441, | |
| "kl": 0.169921875, | |
| "learning_rate": 8.79e-07, | |
| "loss": -0.0816, | |
| "reward": 1.5625, | |
| "reward_mean": 1.5625, | |
| "reward_std": 0.49022960662841797, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 1.0, | |
| "step": 121 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 159.5625, | |
| "epoch": 0.122, | |
| "grad_norm": 3.3247320652008057, | |
| "kl": 0.3046875, | |
| "learning_rate": 8.78e-07, | |
| "loss": 0.0201, | |
| "reward": 1.4375, | |
| "reward_mean": 1.4375, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/accuracy_reward": 0.4375, | |
| "rewards/format_reward": 1.0, | |
| "step": 122 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 114.375, | |
| "epoch": 0.123, | |
| "grad_norm": 4.4976091384887695, | |
| "kl": 0.2001953125, | |
| "learning_rate": 8.769999999999999e-07, | |
| "loss": -0.0023, | |
| "reward": 1.8125, | |
| "reward_mean": 1.8125, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/accuracy_reward": 0.8125, | |
| "rewards/format_reward": 1.0, | |
| "step": 123 | |
| }, | |
| { | |
| "advantages": -3.203749656677246e-07, | |
| "completion_length": 151.625, | |
| "epoch": 0.124, | |
| "grad_norm": 4.727357387542725, | |
| "kl": 0.162109375, | |
| "learning_rate": 8.76e-07, | |
| "loss": -0.1441, | |
| "reward": 1.5208333730697632, | |
| "reward_mean": 1.5208333730697632, | |
| "reward_std": 0.058925580233335495, | |
| "rewards/accuracy_reward": 0.5208333134651184, | |
| "rewards/format_reward": 1.0, | |
| "step": 124 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 154.625, | |
| "epoch": 0.125, | |
| "grad_norm": 5.586273670196533, | |
| "kl": 0.212890625, | |
| "learning_rate": 8.75e-07, | |
| "loss": -0.0295, | |
| "reward": 1.375, | |
| "reward_mean": 1.375, | |
| "reward_std": 0.3745020925998688, | |
| "rewards/accuracy_reward": 0.375, | |
| "rewards/format_reward": 1.0, | |
| "step": 125 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 210.4375, | |
| "epoch": 0.126, | |
| "grad_norm": 3.2797603607177734, | |
| "kl": 0.154296875, | |
| "learning_rate": 8.739999999999999e-07, | |
| "loss": -0.0005, | |
| "reward": 1.375, | |
| "reward_mean": 1.375, | |
| "reward_std": 0.18898223340511322, | |
| "rewards/accuracy_reward": 0.375, | |
| "rewards/format_reward": 1.0, | |
| "step": 126 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 149.25, | |
| "epoch": 0.127, | |
| "grad_norm": 0.0, | |
| "kl": 0.171875, | |
| "learning_rate": 8.729999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.25, | |
| "reward_mean": 1.25, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 1.0, | |
| "step": 127 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 142.9375, | |
| "epoch": 0.128, | |
| "grad_norm": 0.0, | |
| "kl": 0.2158203125, | |
| "learning_rate": 8.72e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 128 | |
| }, | |
| { | |
| "advantages": 1.0058283805847168e-07, | |
| "completion_length": 214.9375, | |
| "epoch": 0.129, | |
| "grad_norm": 4.753498554229736, | |
| "kl": 0.12890625, | |
| "learning_rate": 8.71e-07, | |
| "loss": 0.0282, | |
| "reward": 1.5208333730697632, | |
| "reward_mean": 1.5208333730697632, | |
| "reward_std": 0.2298392653465271, | |
| "rewards/accuracy_reward": 0.5208333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 129 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-08, | |
| "completion_length": 169.9375, | |
| "epoch": 0.13, | |
| "grad_norm": 2.9244163036346436, | |
| "kl": 0.146484375, | |
| "learning_rate": 8.699999999999999e-07, | |
| "loss": -0.0707, | |
| "reward": 1.8333333730697632, | |
| "reward_mean": 1.8333333730697632, | |
| "reward_std": 0.08908706903457642, | |
| "rewards/accuracy_reward": 0.8333333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 130 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 140.1875, | |
| "epoch": 0.131, | |
| "grad_norm": 3.525092840194702, | |
| "kl": 0.15625, | |
| "learning_rate": 8.69e-07, | |
| "loss": 0.0386, | |
| "reward": 1.875, | |
| "reward_mean": 1.875, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/accuracy_reward": 0.875, | |
| "rewards/format_reward": 1.0, | |
| "step": 131 | |
| }, | |
| { | |
| "advantages": 3.725290298461914e-09, | |
| "completion_length": 150.75, | |
| "epoch": 0.132, | |
| "grad_norm": 3.3508222103118896, | |
| "kl": 0.150390625, | |
| "learning_rate": 8.68e-07, | |
| "loss": -0.0317, | |
| "reward": 1.90625, | |
| "reward_mean": 1.90625, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/accuracy_reward": 0.90625, | |
| "rewards/format_reward": 1.0, | |
| "step": 132 | |
| }, | |
| { | |
| "advantages": -1.2293457984924316e-07, | |
| "completion_length": 207.0625, | |
| "epoch": 0.133, | |
| "grad_norm": 4.877673625946045, | |
| "kl": 0.15625, | |
| "learning_rate": 8.669999999999999e-07, | |
| "loss": -0.0678, | |
| "reward": 1.6666667461395264, | |
| "reward_mean": 1.6666667461395264, | |
| "reward_std": 0.17251640558242798, | |
| "rewards/accuracy_reward": 0.6666666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 133 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 114.5625, | |
| "epoch": 0.134, | |
| "grad_norm": 4.1597580909729, | |
| "kl": 0.1904296875, | |
| "learning_rate": 8.659999999999999e-07, | |
| "loss": -0.0128, | |
| "reward": 1.5625, | |
| "reward_mean": 1.5625, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 1.0, | |
| "step": 134 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 171.625, | |
| "epoch": 0.135, | |
| "grad_norm": 0.0, | |
| "kl": 0.16015625, | |
| "learning_rate": 8.65e-07, | |
| "loss": 0.0, | |
| "reward": 1.3333333730697632, | |
| "reward_mean": 1.3333333730697632, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 135 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-08, | |
| "completion_length": 193.3125, | |
| "epoch": 0.136, | |
| "grad_norm": 3.460597276687622, | |
| "kl": 0.1650390625, | |
| "learning_rate": 8.639999999999999e-07, | |
| "loss": -0.0345, | |
| "reward": 1.8125, | |
| "reward_mean": 1.8125, | |
| "reward_std": 0.2587745785713196, | |
| "rewards/accuracy_reward": 0.8125, | |
| "rewards/format_reward": 1.0, | |
| "step": 136 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 164.75, | |
| "epoch": 0.137, | |
| "grad_norm": 3.3782408237457275, | |
| "kl": 0.16015625, | |
| "learning_rate": 8.629999999999999e-07, | |
| "loss": 0.0302, | |
| "reward": 1.9375, | |
| "reward_mean": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.9375, | |
| "rewards/format_reward": 1.0, | |
| "step": 137 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 159.6875, | |
| "epoch": 0.138, | |
| "grad_norm": 6.104968547821045, | |
| "kl": 0.162109375, | |
| "learning_rate": 8.62e-07, | |
| "loss": 0.064, | |
| "reward": 1.5625, | |
| "reward_mean": 1.5625, | |
| "reward_std": 0.249358132481575, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 1.0, | |
| "step": 138 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 204.4375, | |
| "epoch": 0.139, | |
| "grad_norm": 4.3379902839660645, | |
| "kl": 0.1484375, | |
| "learning_rate": 8.61e-07, | |
| "loss": 0.0819, | |
| "reward": 1.4791667461395264, | |
| "reward_mean": 1.4791667461395264, | |
| "reward_std": 0.3759046792984009, | |
| "rewards/accuracy_reward": 0.4791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 139 | |
| }, | |
| { | |
| "advantages": -6.146728992462158e-08, | |
| "completion_length": 184.6875, | |
| "epoch": 0.14, | |
| "grad_norm": 2.6453442573547363, | |
| "kl": 0.1630859375, | |
| "learning_rate": 8.599999999999999e-07, | |
| "loss": -0.0203, | |
| "reward": 1.7708333730697632, | |
| "reward_mean": 1.7708333730697632, | |
| "reward_std": 0.19795583188533783, | |
| "rewards/accuracy_reward": 0.7708333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 140 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 154.0625, | |
| "epoch": 0.141, | |
| "grad_norm": 3.7319183349609375, | |
| "kl": 0.15234375, | |
| "learning_rate": 8.59e-07, | |
| "loss": -0.011, | |
| "reward": 1.78125, | |
| "reward_mean": 1.78125, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.78125, | |
| "rewards/format_reward": 1.0, | |
| "step": 141 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 148.25, | |
| "epoch": 0.142, | |
| "grad_norm": 0.0, | |
| "kl": 0.171875, | |
| "learning_rate": 8.58e-07, | |
| "loss": 0.0, | |
| "reward": 1.8333333730697632, | |
| "reward_mean": 1.8333333730697632, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.8333333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 142 | |
| }, | |
| { | |
| "advantages": 1.1175870895385742e-08, | |
| "completion_length": 149.1875, | |
| "epoch": 0.143, | |
| "grad_norm": 3.001418113708496, | |
| "kl": 0.1650390625, | |
| "learning_rate": 8.569999999999999e-07, | |
| "loss": -0.0812, | |
| "reward": 1.7291667461395264, | |
| "reward_mean": 1.7291667461395264, | |
| "reward_std": 0.15268756449222565, | |
| "rewards/accuracy_reward": 0.7291667461395264, | |
| "rewards/format_reward": 1.0, | |
| "step": 143 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 176.3125, | |
| "epoch": 0.144, | |
| "grad_norm": 4.027692794799805, | |
| "kl": 0.1669921875, | |
| "learning_rate": 8.559999999999999e-07, | |
| "loss": 0.037, | |
| "reward": 1.375, | |
| "reward_mean": 1.375, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/accuracy_reward": 0.375, | |
| "rewards/format_reward": 1.0, | |
| "step": 144 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 156.0, | |
| "epoch": 0.145, | |
| "grad_norm": 0.0, | |
| "kl": 0.19921875, | |
| "learning_rate": 8.55e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 145 | |
| }, | |
| { | |
| "advantages": -2.942979335784912e-07, | |
| "completion_length": 190.5625, | |
| "epoch": 0.146, | |
| "grad_norm": 4.445368766784668, | |
| "kl": 0.15625, | |
| "learning_rate": 8.539999999999999e-07, | |
| "loss": -0.0205, | |
| "reward": 1.25, | |
| "reward_mean": 1.25, | |
| "reward_std": 0.18292956054210663, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 1.0, | |
| "step": 146 | |
| }, | |
| { | |
| "advantages": 4.470348358154297e-07, | |
| "completion_length": 192.4375, | |
| "epoch": 0.147, | |
| "grad_norm": 5.451050758361816, | |
| "kl": 0.169921875, | |
| "learning_rate": 8.529999999999999e-07, | |
| "loss": -0.0488, | |
| "reward": 1.6666667461395264, | |
| "reward_mean": 1.6666667461395264, | |
| "reward_std": 0.117851123213768, | |
| "rewards/accuracy_reward": 0.6666666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 147 | |
| }, | |
| { | |
| "advantages": -9.685754776000977e-08, | |
| "completion_length": 175.25, | |
| "epoch": 0.148, | |
| "grad_norm": 4.9530205726623535, | |
| "kl": 0.169921875, | |
| "learning_rate": 8.52e-07, | |
| "loss": -0.0464, | |
| "reward": 1.5416667461395264, | |
| "reward_mean": 1.5416667461395264, | |
| "reward_std": 0.20693820714950562, | |
| "rewards/accuracy_reward": 0.5416666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 148 | |
| }, | |
| { | |
| "advantages": -5.587935447692871e-08, | |
| "completion_length": 177.75, | |
| "epoch": 0.149, | |
| "grad_norm": 6.3942551612854, | |
| "kl": 0.158203125, | |
| "learning_rate": 8.51e-07, | |
| "loss": -0.1093, | |
| "reward": 1.7083333730697632, | |
| "reward_mean": 1.7083333730697632, | |
| "reward_std": 0.2630349099636078, | |
| "rewards/accuracy_reward": 0.7083333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 149 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 198.8125, | |
| "epoch": 0.15, | |
| "grad_norm": 4.109989166259766, | |
| "kl": 0.1533203125, | |
| "learning_rate": 8.499999999999999e-07, | |
| "loss": -0.0619, | |
| "reward": 1.03125, | |
| "reward_mean": 1.03125, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.03125, | |
| "rewards/format_reward": 1.0, | |
| "step": 150 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 163.8125, | |
| "epoch": 0.151, | |
| "grad_norm": 0.0, | |
| "kl": 0.158203125, | |
| "learning_rate": 8.489999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 151 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 147.3125, | |
| "epoch": 0.152, | |
| "grad_norm": 3.5985171794891357, | |
| "kl": 0.173828125, | |
| "learning_rate": 8.48e-07, | |
| "loss": 0.0055, | |
| "reward": 1.9375, | |
| "reward_mean": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.9375, | |
| "rewards/format_reward": 1.0, | |
| "step": 152 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 150.875, | |
| "epoch": 0.153, | |
| "grad_norm": 4.749815940856934, | |
| "kl": 0.1767578125, | |
| "learning_rate": 8.469999999999999e-07, | |
| "loss": -0.0527, | |
| "reward": 1.5625, | |
| "reward_mean": 1.5625, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 1.0, | |
| "step": 153 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 165.75, | |
| "epoch": 0.154, | |
| "grad_norm": 0.0, | |
| "kl": 0.1962890625, | |
| "learning_rate": 8.459999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.75, | |
| "reward_mean": 1.75, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 154 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 184.1875, | |
| "epoch": 0.155, | |
| "grad_norm": 5.736739635467529, | |
| "kl": 0.18359375, | |
| "learning_rate": 8.45e-07, | |
| "loss": 0.1821, | |
| "reward": 1.75, | |
| "reward_mean": 1.75, | |
| "reward_std": 0.3535533845424652, | |
| "rewards/accuracy_reward": 0.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 155 | |
| }, | |
| { | |
| "advantages": 3.203749656677246e-07, | |
| "completion_length": 214.9375, | |
| "epoch": 0.156, | |
| "grad_norm": 3.542316436767578, | |
| "kl": 0.208984375, | |
| "learning_rate": 8.439999999999999e-07, | |
| "loss": 0.033, | |
| "reward": 1.3125, | |
| "reward_mean": 1.3125, | |
| "reward_std": 0.058925580233335495, | |
| "rewards/accuracy_reward": 0.3125, | |
| "rewards/format_reward": 1.0, | |
| "step": 156 | |
| }, | |
| { | |
| "advantages": 2.60770320892334e-08, | |
| "completion_length": 204.625, | |
| "epoch": 0.157, | |
| "grad_norm": 4.769254684448242, | |
| "kl": 0.173828125, | |
| "learning_rate": 8.429999999999999e-07, | |
| "loss": 0.0339, | |
| "reward": 1.46875, | |
| "reward_mean": 1.46875, | |
| "reward_std": 0.4189920723438263, | |
| "rewards/accuracy_reward": 0.46875, | |
| "rewards/format_reward": 1.0, | |
| "step": 157 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 178.25, | |
| "epoch": 0.158, | |
| "grad_norm": 2.834043264389038, | |
| "kl": 0.1796875, | |
| "learning_rate": 8.419999999999999e-07, | |
| "loss": 0.0343, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.13363061845302582, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 158 | |
| }, | |
| { | |
| "advantages": 3.725290298461914e-09, | |
| "completion_length": 192.75, | |
| "epoch": 0.159, | |
| "grad_norm": 3.128997802734375, | |
| "kl": 0.150390625, | |
| "learning_rate": 8.41e-07, | |
| "loss": 0.008, | |
| "reward": 1.90625, | |
| "reward_mean": 1.90625, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/accuracy_reward": 0.90625, | |
| "rewards/format_reward": 1.0, | |
| "step": 159 | |
| }, | |
| { | |
| "advantages": 1.1920928955078125e-07, | |
| "completion_length": 183.6875, | |
| "epoch": 0.16, | |
| "grad_norm": 5.255495071411133, | |
| "kl": 0.171875, | |
| "learning_rate": 8.399999999999999e-07, | |
| "loss": 0.1298, | |
| "reward": 1.40625, | |
| "reward_mean": 1.40625, | |
| "reward_std": 0.3250930905342102, | |
| "rewards/accuracy_reward": 0.4062500298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 160 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 192.5625, | |
| "epoch": 0.161, | |
| "grad_norm": 3.1085081100463867, | |
| "kl": 0.18359375, | |
| "learning_rate": 8.389999999999999e-07, | |
| "loss": -0.0476, | |
| "reward": 1.875, | |
| "reward_mean": 1.875, | |
| "reward_std": 0.18898223340511322, | |
| "rewards/accuracy_reward": 0.875, | |
| "rewards/format_reward": 1.0, | |
| "step": 161 | |
| }, | |
| { | |
| "advantages": 2.60770320892334e-08, | |
| "completion_length": 149.125, | |
| "epoch": 0.162, | |
| "grad_norm": 5.676258563995361, | |
| "kl": 0.208984375, | |
| "learning_rate": 8.38e-07, | |
| "loss": -0.0042, | |
| "reward": 1.46875, | |
| "reward_mean": 1.46875, | |
| "reward_std": 0.4189920723438263, | |
| "rewards/accuracy_reward": 0.46875, | |
| "rewards/format_reward": 1.0, | |
| "step": 162 | |
| }, | |
| { | |
| "advantages": 2.60770320892334e-08, | |
| "completion_length": 202.4375, | |
| "epoch": 0.163, | |
| "grad_norm": 3.1146128177642822, | |
| "kl": 0.19140625, | |
| "learning_rate": 8.369999999999999e-07, | |
| "loss": -0.0357, | |
| "reward": 1.7291667461395264, | |
| "reward_mean": 1.7291667461395264, | |
| "reward_std": 0.12400397658348083, | |
| "rewards/accuracy_reward": 0.7291666269302368, | |
| "rewards/format_reward": 1.0, | |
| "step": 163 | |
| }, | |
| { | |
| "advantages": -1.2665987014770508e-07, | |
| "completion_length": 190.375, | |
| "epoch": 0.164, | |
| "grad_norm": 4.653083324432373, | |
| "kl": 0.251953125, | |
| "learning_rate": 8.359999999999999e-07, | |
| "loss": 0.0293, | |
| "reward": 1.5104167461395264, | |
| "reward_mean": 1.5104167461395264, | |
| "reward_std": 0.1473138928413391, | |
| "rewards/accuracy_reward": 0.5104166865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 164 | |
| }, | |
| { | |
| "advantages": 2.9802322387695312e-08, | |
| "completion_length": 208.0625, | |
| "epoch": 0.165, | |
| "grad_norm": 3.3702642917633057, | |
| "kl": 0.185546875, | |
| "learning_rate": 8.349999999999999e-07, | |
| "loss": -0.001, | |
| "reward": 1.8541667461395264, | |
| "reward_mean": 1.8541667461395264, | |
| "reward_std": 0.10681165009737015, | |
| "rewards/accuracy_reward": 0.8541666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 165 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-08, | |
| "completion_length": 166.375, | |
| "epoch": 0.166, | |
| "grad_norm": 4.143738746643066, | |
| "kl": 0.1865234375, | |
| "learning_rate": 8.34e-07, | |
| "loss": -0.0756, | |
| "reward": 1.1875, | |
| "reward_mean": 1.1875, | |
| "reward_std": 0.2587745785713196, | |
| "rewards/accuracy_reward": 0.1875, | |
| "rewards/format_reward": 1.0, | |
| "step": 166 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 153.25, | |
| "epoch": 0.167, | |
| "grad_norm": 3.872225522994995, | |
| "kl": 0.185546875, | |
| "learning_rate": 8.329999999999999e-07, | |
| "loss": -0.0012, | |
| "reward": 1.875, | |
| "reward_mean": 1.875, | |
| "reward_std": 0.18898223340511322, | |
| "rewards/accuracy_reward": 0.875, | |
| "rewards/format_reward": 1.0, | |
| "step": 167 | |
| }, | |
| { | |
| "advantages": -4.470348358154297e-08, | |
| "completion_length": 197.3125, | |
| "epoch": 0.168, | |
| "grad_norm": 4.1173319816589355, | |
| "kl": 0.1953125, | |
| "learning_rate": 8.319999999999999e-07, | |
| "loss": 0.1437, | |
| "reward": 1.3333333730697632, | |
| "reward_mean": 1.3333333730697632, | |
| "reward_std": 0.15430334210395813, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 168 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 159.75, | |
| "epoch": 0.169, | |
| "grad_norm": 2.953240156173706, | |
| "kl": 0.18359375, | |
| "learning_rate": 8.31e-07, | |
| "loss": 0.0664, | |
| "reward": 1.5625, | |
| "reward_mean": 1.5625, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 1.0, | |
| "step": 169 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 153.4375, | |
| "epoch": 0.17, | |
| "grad_norm": 4.36264705657959, | |
| "kl": 0.18359375, | |
| "learning_rate": 8.299999999999999e-07, | |
| "loss": -0.0093, | |
| "reward": 1.625, | |
| "reward_mean": 1.625, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/accuracy_reward": 0.625, | |
| "rewards/format_reward": 1.0, | |
| "step": 170 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 163.1875, | |
| "epoch": 0.171, | |
| "grad_norm": 3.9977848529815674, | |
| "kl": 0.240234375, | |
| "learning_rate": 8.289999999999999e-07, | |
| "loss": 0.0138, | |
| "reward": 1.46875, | |
| "reward_mean": 1.46875, | |
| "reward_std": 0.24775780737400055, | |
| "rewards/accuracy_reward": 0.46875, | |
| "rewards/format_reward": 1.0, | |
| "step": 171 | |
| }, | |
| { | |
| "advantages": 3.725290298461914e-09, | |
| "completion_length": 174.875, | |
| "epoch": 0.172, | |
| "grad_norm": 4.679101467132568, | |
| "kl": 0.189453125, | |
| "learning_rate": 8.28e-07, | |
| "loss": 0.1979, | |
| "reward": 1.4166667461395264, | |
| "reward_mean": 1.4166667461395264, | |
| "reward_std": 0.34194856882095337, | |
| "rewards/accuracy_reward": 0.4166666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 172 | |
| }, | |
| { | |
| "advantages": 7.82310962677002e-08, | |
| "completion_length": 205.75, | |
| "epoch": 0.173, | |
| "grad_norm": 5.067877292633057, | |
| "kl": 0.22265625, | |
| "learning_rate": 8.269999999999999e-07, | |
| "loss": -0.1021, | |
| "reward": 1.3854167461395264, | |
| "reward_mean": 1.3854167461395264, | |
| "reward_std": 0.30385708808898926, | |
| "rewards/accuracy_reward": 0.3854166865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 173 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 155.3125, | |
| "epoch": 0.174, | |
| "grad_norm": 6.50193977355957, | |
| "kl": 0.2412109375, | |
| "learning_rate": 8.259999999999999e-07, | |
| "loss": 0.0323, | |
| "reward": 1.90625, | |
| "reward_mean": 1.90625, | |
| "reward_std": 0.2651650309562683, | |
| "rewards/accuracy_reward": 0.90625, | |
| "rewards/format_reward": 1.0, | |
| "step": 174 | |
| }, | |
| { | |
| "advantages": -7.078051567077637e-08, | |
| "completion_length": 215.9375, | |
| "epoch": 0.175, | |
| "grad_norm": 4.612828731536865, | |
| "kl": 0.18359375, | |
| "learning_rate": 8.249999999999999e-07, | |
| "loss": -0.0642, | |
| "reward": 1.6458333730697632, | |
| "reward_mean": 1.6458333730697632, | |
| "reward_std": 0.35351940989494324, | |
| "rewards/accuracy_reward": 0.6458333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 175 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 166.5, | |
| "epoch": 0.176, | |
| "grad_norm": 5.000982761383057, | |
| "kl": 0.1982421875, | |
| "learning_rate": 8.24e-07, | |
| "loss": 0.0311, | |
| "reward": 1.6875, | |
| "reward_mean": 1.6875, | |
| "reward_std": 0.44403791427612305, | |
| "rewards/accuracy_reward": 0.6875, | |
| "rewards/format_reward": 1.0, | |
| "step": 176 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 122.5625, | |
| "epoch": 0.177, | |
| "grad_norm": 4.273613929748535, | |
| "kl": 0.2578125, | |
| "learning_rate": 8.229999999999999e-07, | |
| "loss": 0.0966, | |
| "reward": 1.53125, | |
| "reward_mean": 1.53125, | |
| "reward_std": 0.24775780737400055, | |
| "rewards/accuracy_reward": 0.53125, | |
| "rewards/format_reward": 1.0, | |
| "step": 177 | |
| }, | |
| { | |
| "advantages": -7.078051567077637e-08, | |
| "completion_length": 191.5625, | |
| "epoch": 0.178, | |
| "grad_norm": 4.648405075073242, | |
| "kl": 0.193359375, | |
| "learning_rate": 8.219999999999999e-07, | |
| "loss": 0.0292, | |
| "reward": 1.8645833730697632, | |
| "reward_mean": 1.8645833730697632, | |
| "reward_std": 0.1746465265750885, | |
| "rewards/accuracy_reward": 0.8645833730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 178 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 173.9375, | |
| "epoch": 0.179, | |
| "grad_norm": 3.65451717376709, | |
| "kl": 0.2119140625, | |
| "learning_rate": 8.21e-07, | |
| "loss": -0.0106, | |
| "reward": 1.875, | |
| "reward_mean": 1.875, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/accuracy_reward": 0.875, | |
| "rewards/format_reward": 1.0, | |
| "step": 179 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 126.375, | |
| "epoch": 0.18, | |
| "grad_norm": 5.720065116882324, | |
| "kl": 0.73828125, | |
| "learning_rate": 8.199999999999999e-07, | |
| "loss": -0.1224, | |
| "reward": 1.53125, | |
| "reward_mean": 1.53125, | |
| "reward_std": 0.35564959049224854, | |
| "rewards/accuracy_reward": 0.53125, | |
| "rewards/format_reward": 1.0, | |
| "step": 180 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 149.0, | |
| "epoch": 0.181, | |
| "grad_norm": 4.597268581390381, | |
| "kl": 0.21875, | |
| "learning_rate": 8.189999999999999e-07, | |
| "loss": 0.0604, | |
| "reward": 1.375, | |
| "reward_mean": 1.375, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/accuracy_reward": 0.375, | |
| "rewards/format_reward": 1.0, | |
| "step": 181 | |
| }, | |
| { | |
| "advantages": 3.725290298461914e-09, | |
| "completion_length": 148.6875, | |
| "epoch": 0.182, | |
| "grad_norm": 3.944310188293457, | |
| "kl": 0.232421875, | |
| "learning_rate": 8.179999999999999e-07, | |
| "loss": 0.0728, | |
| "reward": 1.90625, | |
| "reward_mean": 1.90625, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/accuracy_reward": 0.90625, | |
| "rewards/format_reward": 1.0, | |
| "step": 182 | |
| }, | |
| { | |
| "advantages": -1.043081283569336e-07, | |
| "completion_length": 128.1875, | |
| "epoch": 0.183, | |
| "grad_norm": 5.491823673248291, | |
| "kl": 0.2314453125, | |
| "learning_rate": 8.169999999999999e-07, | |
| "loss": -0.0613, | |
| "reward": 1.7083333730697632, | |
| "reward_mean": 1.7083333730697632, | |
| "reward_std": 0.07715167105197906, | |
| "rewards/accuracy_reward": 0.7083332538604736, | |
| "rewards/format_reward": 1.0, | |
| "step": 183 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 141.625, | |
| "epoch": 0.184, | |
| "grad_norm": 0.0, | |
| "kl": 0.2451171875, | |
| "learning_rate": 8.159999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.0, | |
| "reward_mean": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 184 | |
| }, | |
| { | |
| "advantages": -2.9802322387695312e-08, | |
| "completion_length": 161.125, | |
| "epoch": 0.185, | |
| "grad_norm": 5.50625467300415, | |
| "kl": 0.2421875, | |
| "learning_rate": 8.149999999999999e-07, | |
| "loss": -0.0947, | |
| "reward": 1.3958333730697632, | |
| "reward_mean": 1.3958333730697632, | |
| "reward_std": 0.43129098415374756, | |
| "rewards/accuracy_reward": 0.3958333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 185 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 143.0625, | |
| "epoch": 0.186, | |
| "grad_norm": 6.193937301635742, | |
| "kl": 0.271484375, | |
| "learning_rate": 8.14e-07, | |
| "loss": -0.0267, | |
| "reward": 1.84375, | |
| "reward_mean": 1.84375, | |
| "reward_std": 0.3061639666557312, | |
| "rewards/accuracy_reward": 0.84375, | |
| "rewards/format_reward": 1.0, | |
| "step": 186 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 149.125, | |
| "epoch": 0.187, | |
| "grad_norm": 0.0, | |
| "kl": 0.2490234375, | |
| "learning_rate": 8.129999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 187 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 111.75, | |
| "epoch": 0.188, | |
| "grad_norm": 4.042557716369629, | |
| "kl": 0.275390625, | |
| "learning_rate": 8.12e-07, | |
| "loss": -0.0301, | |
| "reward": 1.46875, | |
| "reward_mean": 1.46875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.46875, | |
| "rewards/format_reward": 1.0, | |
| "step": 188 | |
| }, | |
| { | |
| "advantages": 3.3527612686157227e-08, | |
| "completion_length": 129.0, | |
| "epoch": 0.189, | |
| "grad_norm": 5.769200325012207, | |
| "kl": 0.38671875, | |
| "learning_rate": 8.11e-07, | |
| "loss": 0.0278, | |
| "reward": 1.40625, | |
| "reward_mean": 1.40625, | |
| "reward_std": 0.5065323710441589, | |
| "rewards/accuracy_reward": 0.40625, | |
| "rewards/format_reward": 1.0, | |
| "step": 189 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 194.125, | |
| "epoch": 0.19, | |
| "grad_norm": 4.864434242248535, | |
| "kl": 0.228515625, | |
| "learning_rate": 8.1e-07, | |
| "loss": 0.0537, | |
| "reward": 1.5208333730697632, | |
| "reward_mean": 1.5208333730697632, | |
| "reward_std": 0.38895100355148315, | |
| "rewards/accuracy_reward": 0.5208333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 190 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 130.5, | |
| "epoch": 0.191, | |
| "grad_norm": 3.6852304935455322, | |
| "kl": 0.2490234375, | |
| "learning_rate": 8.09e-07, | |
| "loss": -0.0524, | |
| "reward": 1.875, | |
| "reward_mean": 1.875, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/accuracy_reward": 0.875, | |
| "rewards/format_reward": 1.0, | |
| "step": 191 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 158.9375, | |
| "epoch": 0.192, | |
| "grad_norm": 4.945519924163818, | |
| "kl": 0.2373046875, | |
| "learning_rate": 8.08e-07, | |
| "loss": -0.0072, | |
| "reward": 1.8125, | |
| "reward_mean": 1.8125, | |
| "reward_std": 0.3657589256763458, | |
| "rewards/accuracy_reward": 0.8125, | |
| "rewards/format_reward": 1.0, | |
| "step": 192 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 164.4375, | |
| "epoch": 0.193, | |
| "grad_norm": 0.0, | |
| "kl": 0.25, | |
| "learning_rate": 8.070000000000001e-07, | |
| "loss": 0.0, | |
| "reward": 1.75, | |
| "reward_mean": 1.75, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 193 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 118.75, | |
| "epoch": 0.194, | |
| "grad_norm": 4.313383102416992, | |
| "kl": 0.2734375, | |
| "learning_rate": 8.06e-07, | |
| "loss": 0.0147, | |
| "reward": 1.3125, | |
| "reward_mean": 1.3125, | |
| "reward_std": 0.22160130739212036, | |
| "rewards/accuracy_reward": 0.3125, | |
| "rewards/format_reward": 1.0, | |
| "step": 194 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 156.125, | |
| "epoch": 0.195, | |
| "grad_norm": 3.259519577026367, | |
| "kl": 0.26171875, | |
| "learning_rate": 8.05e-07, | |
| "loss": -0.0172, | |
| "reward": 1.625, | |
| "reward_mean": 1.625, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/accuracy_reward": 0.625, | |
| "rewards/format_reward": 1.0, | |
| "step": 195 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 128.1875, | |
| "epoch": 0.196, | |
| "grad_norm": 0.0, | |
| "kl": 0.2216796875, | |
| "learning_rate": 8.04e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 196 | |
| }, | |
| { | |
| "advantages": 8.195638656616211e-08, | |
| "completion_length": 156.0, | |
| "epoch": 0.197, | |
| "grad_norm": 5.883679389953613, | |
| "kl": 0.283203125, | |
| "learning_rate": 8.03e-07, | |
| "loss": -0.0908, | |
| "reward": 1.6041667461395264, | |
| "reward_mean": 1.6041667461395264, | |
| "reward_std": 0.2335786670446396, | |
| "rewards/accuracy_reward": 0.6041667461395264, | |
| "rewards/format_reward": 1.0, | |
| "step": 197 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 122.625, | |
| "epoch": 0.198, | |
| "grad_norm": 3.723879814147949, | |
| "kl": 0.3203125, | |
| "learning_rate": 8.02e-07, | |
| "loss": -0.0498, | |
| "reward": 1.875, | |
| "reward_mean": 1.875, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/accuracy_reward": 0.875, | |
| "rewards/format_reward": 1.0, | |
| "step": 198 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 135.375, | |
| "epoch": 0.199, | |
| "grad_norm": 4.400403022766113, | |
| "kl": 0.30078125, | |
| "learning_rate": 8.01e-07, | |
| "loss": -0.0618, | |
| "reward": 1.4583333730697632, | |
| "reward_mean": 1.4583333730697632, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/accuracy_reward": 0.4583333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 199 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 176.5625, | |
| "epoch": 0.2, | |
| "grad_norm": 5.768075942993164, | |
| "kl": 0.2578125, | |
| "learning_rate": 8e-07, | |
| "loss": -0.0495, | |
| "reward": 1.59375, | |
| "reward_mean": 1.59375, | |
| "reward_std": 0.2041158676147461, | |
| "rewards/accuracy_reward": 0.59375, | |
| "rewards/format_reward": 1.0, | |
| "step": 200 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 131.25, | |
| "epoch": 0.201, | |
| "grad_norm": 5.473985195159912, | |
| "kl": 0.259765625, | |
| "learning_rate": 7.99e-07, | |
| "loss": 0.0693, | |
| "reward": 1.9375, | |
| "reward_mean": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.9375, | |
| "rewards/format_reward": 1.0, | |
| "step": 201 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 141.25, | |
| "epoch": 0.202, | |
| "grad_norm": 5.938058853149414, | |
| "kl": 0.26171875, | |
| "learning_rate": 7.98e-07, | |
| "loss": -0.0473, | |
| "reward": 1.65625, | |
| "reward_mean": 1.65625, | |
| "reward_std": 0.44478052854537964, | |
| "rewards/accuracy_reward": 0.65625, | |
| "rewards/format_reward": 1.0, | |
| "step": 202 | |
| }, | |
| { | |
| "advantages": 1.1175870895385742e-08, | |
| "completion_length": 163.875, | |
| "epoch": 0.203, | |
| "grad_norm": 5.90596342086792, | |
| "kl": 0.263671875, | |
| "learning_rate": 7.970000000000001e-07, | |
| "loss": 0.2037, | |
| "reward": 1.625, | |
| "reward_mean": 1.625, | |
| "reward_std": 0.4355512857437134, | |
| "rewards/accuracy_reward": 0.625, | |
| "rewards/format_reward": 1.0, | |
| "step": 203 | |
| }, | |
| { | |
| "advantages": 1.825392246246338e-07, | |
| "completion_length": 126.875, | |
| "epoch": 0.204, | |
| "grad_norm": 6.201707363128662, | |
| "kl": 0.259765625, | |
| "learning_rate": 7.96e-07, | |
| "loss": -0.0593, | |
| "reward": 1.4375, | |
| "reward_mean": 1.4375, | |
| "reward_std": 0.3478616774082184, | |
| "rewards/accuracy_reward": 0.4375, | |
| "rewards/format_reward": 1.0, | |
| "step": 204 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 137.0, | |
| "epoch": 0.205, | |
| "grad_norm": 0.0, | |
| "kl": 0.2578125, | |
| "learning_rate": 7.95e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 205 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 130.375, | |
| "epoch": 0.206, | |
| "grad_norm": 3.8991453647613525, | |
| "kl": 0.3203125, | |
| "learning_rate": 7.94e-07, | |
| "loss": -0.0435, | |
| "reward": 1.875, | |
| "reward_mean": 1.875, | |
| "reward_std": 0.13363061845302582, | |
| "rewards/accuracy_reward": 0.875, | |
| "rewards/format_reward": 1.0, | |
| "step": 206 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-08, | |
| "completion_length": 141.1875, | |
| "epoch": 0.207, | |
| "grad_norm": 5.12335205078125, | |
| "kl": 0.28515625, | |
| "learning_rate": 7.93e-07, | |
| "loss": -0.134, | |
| "reward": 1.7708333730697632, | |
| "reward_mean": 1.7708333730697632, | |
| "reward_std": 0.12400396913290024, | |
| "rewards/accuracy_reward": 0.7708333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 207 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 155.9375, | |
| "epoch": 0.208, | |
| "grad_norm": 0.0, | |
| "kl": 0.28125, | |
| "learning_rate": 7.92e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 208 | |
| }, | |
| { | |
| "advantages": 3.725290298461914e-08, | |
| "completion_length": 158.9375, | |
| "epoch": 0.209, | |
| "grad_norm": 4.06764030456543, | |
| "kl": 0.333984375, | |
| "learning_rate": 7.91e-07, | |
| "loss": 0.0656, | |
| "reward": 1.3541667461395264, | |
| "reward_mean": 1.3541667461395264, | |
| "reward_std": 0.10681164264678955, | |
| "rewards/accuracy_reward": 0.3541666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 209 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 180.9375, | |
| "epoch": 0.21, | |
| "grad_norm": 0.0, | |
| "kl": 0.2451171875, | |
| "learning_rate": 7.9e-07, | |
| "loss": 0.0, | |
| "reward": 1.8333333730697632, | |
| "reward_mean": 1.8333333730697632, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.8333333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 210 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 152.1875, | |
| "epoch": 0.211, | |
| "grad_norm": 3.491947650909424, | |
| "kl": 0.28515625, | |
| "learning_rate": 7.89e-07, | |
| "loss": -0.0244, | |
| "reward": 1.3541667461395264, | |
| "reward_mean": 1.3541667461395264, | |
| "reward_std": 0.16517186164855957, | |
| "rewards/accuracy_reward": 0.3541666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 211 | |
| }, | |
| { | |
| "advantages": 1.2665987014770508e-07, | |
| "completion_length": 175.125, | |
| "epoch": 0.212, | |
| "grad_norm": 5.700802326202393, | |
| "kl": 0.29296875, | |
| "learning_rate": 7.88e-07, | |
| "loss": 0.0936, | |
| "reward": 1.6041667461395264, | |
| "reward_mean": 1.6041667461395264, | |
| "reward_std": 0.32618677616119385, | |
| "rewards/accuracy_reward": 0.6041666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 212 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 164.0, | |
| "epoch": 0.213, | |
| "grad_norm": 4.179893493652344, | |
| "kl": 0.25390625, | |
| "learning_rate": 7.87e-07, | |
| "loss": -0.0081, | |
| "reward": 1.96875, | |
| "reward_mean": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.96875, | |
| "rewards/format_reward": 1.0, | |
| "step": 213 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 137.125, | |
| "epoch": 0.214, | |
| "grad_norm": 4.17854118347168, | |
| "kl": 0.2470703125, | |
| "learning_rate": 7.86e-07, | |
| "loss": 0.0071, | |
| "reward": 1.625, | |
| "reward_mean": 1.625, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/accuracy_reward": 0.625, | |
| "rewards/format_reward": 1.0, | |
| "step": 214 | |
| }, | |
| { | |
| "advantages": 1.4156103134155273e-07, | |
| "completion_length": 189.875, | |
| "epoch": 0.215, | |
| "grad_norm": 4.708441734313965, | |
| "kl": 0.30859375, | |
| "learning_rate": 7.85e-07, | |
| "loss": -0.0134, | |
| "reward": 1.4791667461395264, | |
| "reward_mean": 1.4791667461395264, | |
| "reward_std": 0.2903805673122406, | |
| "rewards/accuracy_reward": 0.4791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 215 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 132.0, | |
| "epoch": 0.216, | |
| "grad_norm": 0.0, | |
| "kl": 0.30078125, | |
| "learning_rate": 7.84e-07, | |
| "loss": 0.0, | |
| "reward": 1.75, | |
| "reward_mean": 1.75, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 216 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 145.3125, | |
| "epoch": 0.217, | |
| "grad_norm": 3.518888473510742, | |
| "kl": 0.34765625, | |
| "learning_rate": 7.83e-07, | |
| "loss": 0.0506, | |
| "reward": 1.5625, | |
| "reward_mean": 1.5625, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 1.0, | |
| "step": 217 | |
| }, | |
| { | |
| "advantages": -5.587935447692871e-08, | |
| "completion_length": 165.9375, | |
| "epoch": 0.218, | |
| "grad_norm": 5.872474193572998, | |
| "kl": 0.28125, | |
| "learning_rate": 7.82e-07, | |
| "loss": 0.0886, | |
| "reward": 1.6770833730697632, | |
| "reward_mean": 1.6770833730697632, | |
| "reward_std": 0.541657567024231, | |
| "rewards/accuracy_reward": 0.7395833730697632, | |
| "rewards/format_reward": 0.9375, | |
| "step": 218 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 142.6875, | |
| "epoch": 0.219, | |
| "grad_norm": 5.611164093017578, | |
| "kl": 0.298828125, | |
| "learning_rate": 7.81e-07, | |
| "loss": -0.0346, | |
| "reward": 1.34375, | |
| "reward_mean": 1.34375, | |
| "reward_std": 0.2651650309562683, | |
| "rewards/accuracy_reward": 0.34375, | |
| "rewards/format_reward": 1.0, | |
| "step": 219 | |
| }, | |
| { | |
| "advantages": -5.21540641784668e-08, | |
| "completion_length": 152.9375, | |
| "epoch": 0.22, | |
| "grad_norm": 5.375847816467285, | |
| "kl": 0.287109375, | |
| "learning_rate": 7.799999999999999e-07, | |
| "loss": 0.0275, | |
| "reward": 1.3541667461395264, | |
| "reward_mean": 1.3541667461395264, | |
| "reward_std": 0.2335786670446396, | |
| "rewards/accuracy_reward": 0.3541666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 220 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 125.8125, | |
| "epoch": 0.221, | |
| "grad_norm": 5.226174354553223, | |
| "kl": 0.328125, | |
| "learning_rate": 7.79e-07, | |
| "loss": 0.1148, | |
| "reward": 1.1875, | |
| "reward_mean": 1.1875, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/accuracy_reward": 0.3125, | |
| "rewards/format_reward": 0.875, | |
| "step": 221 | |
| }, | |
| { | |
| "advantages": -1.2665987014770508e-07, | |
| "completion_length": 138.4375, | |
| "epoch": 0.222, | |
| "grad_norm": 4.286291122436523, | |
| "kl": 0.3515625, | |
| "learning_rate": 7.78e-07, | |
| "loss": 0.0188, | |
| "reward": 1.9791667461395264, | |
| "reward_mean": 1.9791667461395264, | |
| "reward_std": 0.05892554670572281, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 222 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 133.9375, | |
| "epoch": 0.223, | |
| "grad_norm": 5.600376605987549, | |
| "kl": 0.39453125, | |
| "learning_rate": 7.77e-07, | |
| "loss": -0.0054, | |
| "reward": 1.7083333730697632, | |
| "reward_mean": 1.7083333730697632, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/accuracy_reward": 0.7083333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 223 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 162.125, | |
| "epoch": 0.224, | |
| "grad_norm": 4.409877300262451, | |
| "kl": 0.34375, | |
| "learning_rate": 7.76e-07, | |
| "loss": 0.0028, | |
| "reward": 1.25, | |
| "reward_mean": 1.25, | |
| "reward_std": 0.13363061845302582, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 1.0, | |
| "step": 224 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 115.5625, | |
| "epoch": 0.225, | |
| "grad_norm": 0.0, | |
| "kl": 0.291015625, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 225 | |
| }, | |
| { | |
| "advantages": 5.587935447692871e-08, | |
| "completion_length": 133.25, | |
| "epoch": 0.226, | |
| "grad_norm": 6.370109558105469, | |
| "kl": 0.30859375, | |
| "learning_rate": 7.74e-07, | |
| "loss": -0.1677, | |
| "reward": 1.75, | |
| "reward_mean": 1.75, | |
| "reward_std": 0.3450327515602112, | |
| "rewards/accuracy_reward": 0.7500000596046448, | |
| "rewards/format_reward": 1.0, | |
| "step": 226 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 101.8125, | |
| "epoch": 0.227, | |
| "grad_norm": 0.0, | |
| "kl": 0.3828125, | |
| "learning_rate": 7.729999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.0, | |
| "reward_mean": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 227 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 136.875, | |
| "epoch": 0.228, | |
| "grad_norm": 3.316059112548828, | |
| "kl": 0.44921875, | |
| "learning_rate": 7.72e-07, | |
| "loss": 0.0161, | |
| "reward": 1.0625, | |
| "reward_mean": 1.0625, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.0625, | |
| "rewards/format_reward": 1.0, | |
| "step": 228 | |
| }, | |
| { | |
| "advantages": -1.4156103134155273e-07, | |
| "completion_length": 131.5625, | |
| "epoch": 0.229, | |
| "grad_norm": 5.905332088470459, | |
| "kl": 0.349609375, | |
| "learning_rate": 7.71e-07, | |
| "loss": 0.0514, | |
| "reward": 1.8541667461395264, | |
| "reward_mean": 1.8541667461395264, | |
| "reward_std": 0.2903805673122406, | |
| "rewards/accuracy_reward": 0.8541666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 229 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 146.5, | |
| "epoch": 0.23, | |
| "grad_norm": 4.266251564025879, | |
| "kl": 0.322265625, | |
| "learning_rate": 7.699999999999999e-07, | |
| "loss": 0.0306, | |
| "reward": 1.9375, | |
| "reward_mean": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.9375, | |
| "rewards/format_reward": 1.0, | |
| "step": 230 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 127.9375, | |
| "epoch": 0.231, | |
| "grad_norm": 0.0, | |
| "kl": 0.30078125, | |
| "learning_rate": 7.69e-07, | |
| "loss": 0.0, | |
| "reward": 1.0, | |
| "reward_mean": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 231 | |
| }, | |
| { | |
| "advantages": -5.960464477539063e-08, | |
| "completion_length": 113.8125, | |
| "epoch": 0.232, | |
| "grad_norm": 6.29781436920166, | |
| "kl": 0.34765625, | |
| "learning_rate": 7.68e-07, | |
| "loss": -0.009, | |
| "reward": 1.7708333730697632, | |
| "reward_mean": 1.7708333730697632, | |
| "reward_std": 0.2048145979642868, | |
| "rewards/accuracy_reward": 0.7708333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 232 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 112.5625, | |
| "epoch": 0.233, | |
| "grad_norm": 4.406736373901367, | |
| "kl": 0.298828125, | |
| "learning_rate": 7.67e-07, | |
| "loss": -0.0023, | |
| "reward": 1.5625, | |
| "reward_mean": 1.5625, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 1.0, | |
| "step": 233 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 100.125, | |
| "epoch": 0.234, | |
| "grad_norm": 0.0, | |
| "kl": 0.3359375, | |
| "learning_rate": 7.66e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 234 | |
| }, | |
| { | |
| "advantages": -6.705522537231445e-08, | |
| "completion_length": 120.125, | |
| "epoch": 0.235, | |
| "grad_norm": 3.8253743648529053, | |
| "kl": 0.3359375, | |
| "learning_rate": 7.65e-07, | |
| "loss": 0.0283, | |
| "reward": 1.0416667461395264, | |
| "reward_mean": 1.0416667461395264, | |
| "reward_std": 0.1178511530160904, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/format_reward": 1.0, | |
| "step": 235 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 134.6875, | |
| "epoch": 0.236, | |
| "grad_norm": 3.9648969173431396, | |
| "kl": 0.373046875, | |
| "learning_rate": 7.64e-07, | |
| "loss": 0.0765, | |
| "reward": 1.53125, | |
| "reward_mean": 1.53125, | |
| "reward_std": 0.24775780737400055, | |
| "rewards/accuracy_reward": 0.53125, | |
| "rewards/format_reward": 1.0, | |
| "step": 236 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 132.0625, | |
| "epoch": 0.237, | |
| "grad_norm": 7.067671775817871, | |
| "kl": 0.36328125, | |
| "learning_rate": 7.629999999999999e-07, | |
| "loss": -0.1304, | |
| "reward": 1.6979167461395264, | |
| "reward_mean": 1.6979167461395264, | |
| "reward_std": 0.28634417057037354, | |
| "rewards/accuracy_reward": 0.6979166865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 237 | |
| }, | |
| { | |
| "advantages": -1.2665987014770508e-07, | |
| "completion_length": 116.75, | |
| "epoch": 0.238, | |
| "grad_norm": 5.4808030128479, | |
| "kl": 0.375, | |
| "learning_rate": 7.62e-07, | |
| "loss": 0.0004, | |
| "reward": 1.7604167461395264, | |
| "reward_mean": 1.7604167461395264, | |
| "reward_std": 0.1473138928413391, | |
| "rewards/accuracy_reward": 0.7604166865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 238 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 110.3125, | |
| "epoch": 0.239, | |
| "grad_norm": 4.075715065002441, | |
| "kl": 0.31640625, | |
| "learning_rate": 7.61e-07, | |
| "loss": 0.0067, | |
| "reward": 1.9375, | |
| "reward_mean": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 0.9375, | |
| "step": 239 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 103.3125, | |
| "epoch": 0.24, | |
| "grad_norm": 0.0, | |
| "kl": 0.396484375, | |
| "learning_rate": 7.599999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 240 | |
| }, | |
| { | |
| "advantages": 1.6391277313232422e-07, | |
| "completion_length": 109.4375, | |
| "epoch": 0.241, | |
| "grad_norm": 5.156554222106934, | |
| "kl": 0.421875, | |
| "learning_rate": 7.59e-07, | |
| "loss": -0.0393, | |
| "reward": 1.75, | |
| "reward_mean": 1.75, | |
| "reward_std": 0.08908708393573761, | |
| "rewards/accuracy_reward": 0.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 241 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 119.125, | |
| "epoch": 0.242, | |
| "grad_norm": 4.303339004516602, | |
| "kl": 0.3984375, | |
| "learning_rate": 7.58e-07, | |
| "loss": 0.1162, | |
| "reward": 1.625, | |
| "reward_mean": 1.625, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/accuracy_reward": 0.625, | |
| "rewards/format_reward": 1.0, | |
| "step": 242 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 111.1875, | |
| "epoch": 0.243, | |
| "grad_norm": 4.342909336090088, | |
| "kl": 0.439453125, | |
| "learning_rate": 7.57e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8125, | |
| "reward_mean": 1.8125, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/accuracy_reward": 0.8125, | |
| "rewards/format_reward": 1.0, | |
| "step": 243 | |
| }, | |
| { | |
| "advantages": 4.470348358154297e-08, | |
| "completion_length": 118.4375, | |
| "epoch": 0.244, | |
| "grad_norm": 7.558548450469971, | |
| "kl": 0.37890625, | |
| "learning_rate": 7.559999999999999e-07, | |
| "loss": -0.1255, | |
| "reward": 1.375, | |
| "reward_mean": 1.375, | |
| "reward_std": 0.4149954617023468, | |
| "rewards/accuracy_reward": 0.3750000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 244 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 106.875, | |
| "epoch": 0.245, | |
| "grad_norm": 4.70227575302124, | |
| "kl": 0.36328125, | |
| "learning_rate": 7.55e-07, | |
| "loss": 0.0574, | |
| "reward": 1.34375, | |
| "reward_mean": 1.34375, | |
| "reward_std": 0.22903135418891907, | |
| "rewards/accuracy_reward": 0.34375, | |
| "rewards/format_reward": 1.0, | |
| "step": 245 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 129.9375, | |
| "epoch": 0.246, | |
| "grad_norm": 0.0, | |
| "kl": 0.37109375, | |
| "learning_rate": 7.54e-07, | |
| "loss": 0.0, | |
| "reward": 1.8333333730697632, | |
| "reward_mean": 1.8333333730697632, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.8333333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 246 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 118.5625, | |
| "epoch": 0.247, | |
| "grad_norm": 4.569678783416748, | |
| "kl": 0.421875, | |
| "learning_rate": 7.529999999999999e-07, | |
| "loss": -0.0377, | |
| "reward": 1.84375, | |
| "reward_mean": 1.84375, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/accuracy_reward": 0.84375, | |
| "rewards/format_reward": 1.0, | |
| "step": 247 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 126.25, | |
| "epoch": 0.248, | |
| "grad_norm": 4.764584064483643, | |
| "kl": 0.33203125, | |
| "learning_rate": 7.52e-07, | |
| "loss": 0.0018, | |
| "reward": 1.125, | |
| "reward_mean": 1.125, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/accuracy_reward": 0.125, | |
| "rewards/format_reward": 1.0, | |
| "step": 248 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 125.3125, | |
| "epoch": 0.249, | |
| "grad_norm": 5.263643264770508, | |
| "kl": 0.384765625, | |
| "learning_rate": 7.51e-07, | |
| "loss": 0.0607, | |
| "reward": 1.96875, | |
| "reward_mean": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.96875, | |
| "rewards/format_reward": 1.0, | |
| "step": 249 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 120.75, | |
| "epoch": 0.25, | |
| "grad_norm": 4.139052867889404, | |
| "kl": 0.38671875, | |
| "learning_rate": 7.5e-07, | |
| "loss": 0.0403, | |
| "reward": 1.4375, | |
| "reward_mean": 1.4375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.4375, | |
| "rewards/format_reward": 1.0, | |
| "step": 250 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 113.0625, | |
| "epoch": 0.251, | |
| "grad_norm": 4.267086029052734, | |
| "kl": 0.40234375, | |
| "learning_rate": 7.489999999999999e-07, | |
| "loss": -0.0034, | |
| "reward": 1.9375, | |
| "reward_mean": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.9375, | |
| "rewards/format_reward": 1.0, | |
| "step": 251 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 110.9375, | |
| "epoch": 0.252, | |
| "grad_norm": 0.0, | |
| "kl": 0.44140625, | |
| "learning_rate": 7.48e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 252 | |
| }, | |
| { | |
| "advantages": 1.2665987014770508e-07, | |
| "completion_length": 125.0, | |
| "epoch": 0.253, | |
| "grad_norm": 4.108771324157715, | |
| "kl": 0.375, | |
| "learning_rate": 7.47e-07, | |
| "loss": 0.0269, | |
| "reward": 1.8541667461395264, | |
| "reward_mean": 1.8541667461395264, | |
| "reward_std": 0.05892554670572281, | |
| "rewards/accuracy_reward": 0.8541667461395264, | |
| "rewards/format_reward": 1.0, | |
| "step": 253 | |
| }, | |
| { | |
| "advantages": 3.725290298461914e-09, | |
| "completion_length": 109.4375, | |
| "epoch": 0.254, | |
| "grad_norm": 6.75657320022583, | |
| "kl": 0.53125, | |
| "learning_rate": 7.459999999999999e-07, | |
| "loss": -0.0183, | |
| "reward": 1.5625, | |
| "reward_mean": 1.5625, | |
| "reward_std": 0.49022960662841797, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 1.0, | |
| "step": 254 | |
| }, | |
| { | |
| "advantages": 1.043081283569336e-07, | |
| "completion_length": 125.5625, | |
| "epoch": 0.255, | |
| "grad_norm": 6.262571334838867, | |
| "kl": 0.443359375, | |
| "learning_rate": 7.45e-07, | |
| "loss": 0.1093, | |
| "reward": 1.5625, | |
| "reward_mean": 1.5625, | |
| "reward_std": 0.13908715546131134, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 1.0, | |
| "step": 255 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-08, | |
| "completion_length": 114.25, | |
| "epoch": 0.256, | |
| "grad_norm": 4.935299396514893, | |
| "kl": 0.5078125, | |
| "learning_rate": 7.44e-07, | |
| "loss": -0.0599, | |
| "reward": 1.8125, | |
| "reward_mean": 1.8125, | |
| "reward_std": 0.2587745785713196, | |
| "rewards/accuracy_reward": 0.8125, | |
| "rewards/format_reward": 1.0, | |
| "step": 256 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 141.25, | |
| "epoch": 0.257, | |
| "grad_norm": 5.354793548583984, | |
| "kl": 0.419921875, | |
| "learning_rate": 7.429999999999999e-07, | |
| "loss": 0.0394, | |
| "reward": 1.71875, | |
| "reward_mean": 1.71875, | |
| "reward_std": 0.22201895713806152, | |
| "rewards/accuracy_reward": 0.71875, | |
| "rewards/format_reward": 1.0, | |
| "step": 257 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 95.3125, | |
| "epoch": 0.258, | |
| "grad_norm": 4.425192832946777, | |
| "kl": 0.40234375, | |
| "learning_rate": 7.42e-07, | |
| "loss": 0.0065, | |
| "reward": 1.9375, | |
| "reward_mean": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.9375, | |
| "rewards/format_reward": 1.0, | |
| "step": 258 | |
| }, | |
| { | |
| "advantages": -7.078051567077637e-08, | |
| "completion_length": 114.4375, | |
| "epoch": 0.259, | |
| "grad_norm": 6.3800835609436035, | |
| "kl": 0.41015625, | |
| "learning_rate": 7.41e-07, | |
| "loss": -0.017, | |
| "reward": 1.8645833730697632, | |
| "reward_mean": 1.8645833730697632, | |
| "reward_std": 0.1746465265750885, | |
| "rewards/accuracy_reward": 0.8645833730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 259 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 132.375, | |
| "epoch": 0.26, | |
| "grad_norm": 4.138468265533447, | |
| "kl": 0.41015625, | |
| "learning_rate": 7.4e-07, | |
| "loss": 0.0899, | |
| "reward": 1.8125, | |
| "reward_mean": 1.8125, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/accuracy_reward": 0.8125, | |
| "rewards/format_reward": 1.0, | |
| "step": 260 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-08, | |
| "completion_length": 127.375, | |
| "epoch": 0.261, | |
| "grad_norm": 5.36328649520874, | |
| "kl": 0.490234375, | |
| "learning_rate": 7.389999999999999e-07, | |
| "loss": -0.1071, | |
| "reward": 1.7083333730697632, | |
| "reward_mean": 1.7083333730697632, | |
| "reward_std": 0.2136232852935791, | |
| "rewards/accuracy_reward": 0.7083333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 261 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-08, | |
| "completion_length": 119.875, | |
| "epoch": 0.262, | |
| "grad_norm": 4.338840007781982, | |
| "kl": 0.451171875, | |
| "learning_rate": 7.38e-07, | |
| "loss": -0.0061, | |
| "reward": 1.6875, | |
| "reward_mean": 1.6875, | |
| "reward_std": 0.2587745785713196, | |
| "rewards/accuracy_reward": 0.75, | |
| "rewards/format_reward": 0.9375, | |
| "step": 262 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 126.5625, | |
| "epoch": 0.263, | |
| "grad_norm": 4.404613971710205, | |
| "kl": 0.5078125, | |
| "learning_rate": 7.37e-07, | |
| "loss": -0.0745, | |
| "reward": 1.875, | |
| "reward_mean": 1.875, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/accuracy_reward": 0.875, | |
| "rewards/format_reward": 1.0, | |
| "step": 263 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 117.5625, | |
| "epoch": 0.264, | |
| "grad_norm": 0.0, | |
| "kl": 0.37109375, | |
| "learning_rate": 7.359999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.75, | |
| "reward_mean": 1.75, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 264 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 123.875, | |
| "epoch": 0.265, | |
| "grad_norm": 0.0, | |
| "kl": 0.404296875, | |
| "learning_rate": 7.35e-07, | |
| "loss": 0.0, | |
| "reward": 1.3333333730697632, | |
| "reward_mean": 1.3333333730697632, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 265 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 117.4375, | |
| "epoch": 0.266, | |
| "grad_norm": 5.04351282119751, | |
| "kl": 0.4375, | |
| "learning_rate": 7.34e-07, | |
| "loss": 0.0671, | |
| "reward": 1.40625, | |
| "reward_mean": 1.40625, | |
| "reward_std": 0.18600594997406006, | |
| "rewards/accuracy_reward": 0.40625, | |
| "rewards/format_reward": 1.0, | |
| "step": 266 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 98.3125, | |
| "epoch": 0.267, | |
| "grad_norm": 4.765639305114746, | |
| "kl": 0.421875, | |
| "learning_rate": 7.329999999999999e-07, | |
| "loss": 0.0228, | |
| "reward": 1.0625, | |
| "reward_mean": 1.0625, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.0625, | |
| "rewards/format_reward": 1.0, | |
| "step": 267 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 132.8125, | |
| "epoch": 0.268, | |
| "grad_norm": 0.0, | |
| "kl": 0.4296875, | |
| "learning_rate": 7.319999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.3333333730697632, | |
| "reward_mean": 1.3333333730697632, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 268 | |
| }, | |
| { | |
| "advantages": -1.9371509552001953e-07, | |
| "completion_length": 118.875, | |
| "epoch": 0.269, | |
| "grad_norm": 4.1043314933776855, | |
| "kl": 0.392578125, | |
| "learning_rate": 7.31e-07, | |
| "loss": -0.0282, | |
| "reward": 1.7083333730697632, | |
| "reward_mean": 1.7083333730697632, | |
| "reward_std": 0.07715165615081787, | |
| "rewards/accuracy_reward": 0.7083333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 269 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 137.8125, | |
| "epoch": 0.27, | |
| "grad_norm": 4.980680465698242, | |
| "kl": 0.41015625, | |
| "learning_rate": 7.3e-07, | |
| "loss": 0.0036, | |
| "reward": 1.5625, | |
| "reward_mean": 1.5625, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 1.0, | |
| "step": 270 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 134.1875, | |
| "epoch": 0.271, | |
| "grad_norm": 0.0, | |
| "kl": 0.421875, | |
| "learning_rate": 7.289999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 271 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 93.8125, | |
| "epoch": 0.272, | |
| "grad_norm": 5.348329544067383, | |
| "kl": 0.46484375, | |
| "learning_rate": 7.28e-07, | |
| "loss": 0.0097, | |
| "reward": 1.75, | |
| "reward_mean": 1.75, | |
| "reward_std": 0.26726123690605164, | |
| "rewards/accuracy_reward": 0.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 272 | |
| }, | |
| { | |
| "advantages": 3.203749656677246e-07, | |
| "completion_length": 134.125, | |
| "epoch": 0.273, | |
| "grad_norm": 3.749969244003296, | |
| "kl": 0.4375, | |
| "learning_rate": 7.27e-07, | |
| "loss": -0.062, | |
| "reward": 1.8125, | |
| "reward_mean": 1.8125, | |
| "reward_std": 0.058925580233335495, | |
| "rewards/accuracy_reward": 0.8125, | |
| "rewards/format_reward": 1.0, | |
| "step": 273 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 116.875, | |
| "epoch": 0.274, | |
| "grad_norm": 4.896990776062012, | |
| "kl": 0.455078125, | |
| "learning_rate": 7.259999999999999e-07, | |
| "loss": 0.0062, | |
| "reward": 1.0, | |
| "reward_mean": 1.0, | |
| "reward_std": 0.26726123690605164, | |
| "rewards/accuracy_reward": 0.0625, | |
| "rewards/format_reward": 0.9375, | |
| "step": 274 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 98.25, | |
| "epoch": 0.275, | |
| "grad_norm": 5.642269611358643, | |
| "kl": 0.5, | |
| "learning_rate": 7.249999999999999e-07, | |
| "loss": -0.0376, | |
| "reward": 1.28125, | |
| "reward_mean": 1.28125, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.28125, | |
| "rewards/format_reward": 1.0, | |
| "step": 275 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 140.1875, | |
| "epoch": 0.276, | |
| "grad_norm": 3.443995714187622, | |
| "kl": 0.392578125, | |
| "learning_rate": 7.24e-07, | |
| "loss": -0.0466, | |
| "reward": 1.875, | |
| "reward_mean": 1.875, | |
| "reward_std": 0.18898223340511322, | |
| "rewards/accuracy_reward": 0.875, | |
| "rewards/format_reward": 1.0, | |
| "step": 276 | |
| }, | |
| { | |
| "advantages": 2.60770320892334e-08, | |
| "completion_length": 96.4375, | |
| "epoch": 0.277, | |
| "grad_norm": 7.930581092834473, | |
| "kl": 0.38671875, | |
| "learning_rate": 7.229999999999999e-07, | |
| "loss": -0.138, | |
| "reward": 1.25, | |
| "reward_mean": 1.25, | |
| "reward_std": 0.4355512857437134, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 1.0, | |
| "step": 277 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 125.25, | |
| "epoch": 0.278, | |
| "grad_norm": 0.0, | |
| "kl": 0.3984375, | |
| "learning_rate": 7.219999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 278 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 108.0625, | |
| "epoch": 0.279, | |
| "grad_norm": 6.782789707183838, | |
| "kl": 0.375, | |
| "learning_rate": 7.21e-07, | |
| "loss": 0.0696, | |
| "reward": 1.4375, | |
| "reward_mean": 1.4375, | |
| "reward_std": 0.4082317352294922, | |
| "rewards/accuracy_reward": 0.4375, | |
| "rewards/format_reward": 1.0, | |
| "step": 279 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 106.5, | |
| "epoch": 0.28, | |
| "grad_norm": 4.9994611740112305, | |
| "kl": 0.443359375, | |
| "learning_rate": 7.2e-07, | |
| "loss": -0.0264, | |
| "reward": 1.59375, | |
| "reward_mean": 1.59375, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/accuracy_reward": 0.59375, | |
| "rewards/format_reward": 1.0, | |
| "step": 280 | |
| }, | |
| { | |
| "advantages": -1.2665987014770508e-07, | |
| "completion_length": 143.5625, | |
| "epoch": 0.281, | |
| "grad_norm": 6.117532253265381, | |
| "kl": 0.640625, | |
| "learning_rate": 7.189999999999999e-07, | |
| "loss": 0.0769, | |
| "reward": 1.9479167461395264, | |
| "reward_mean": 1.9479167461395264, | |
| "reward_std": 0.1473138928413391, | |
| "rewards/accuracy_reward": 0.9479166865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 281 | |
| }, | |
| { | |
| "advantages": -7.078051567077637e-08, | |
| "completion_length": 157.4375, | |
| "epoch": 0.282, | |
| "grad_norm": 3.375563859939575, | |
| "kl": 0.39453125, | |
| "learning_rate": 7.179999999999999e-07, | |
| "loss": -0.0054, | |
| "reward": 1.7083333730697632, | |
| "reward_mean": 1.7083333730697632, | |
| "reward_std": 0.1178511381149292, | |
| "rewards/accuracy_reward": 0.7083333134651184, | |
| "rewards/format_reward": 1.0, | |
| "step": 282 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 121.9375, | |
| "epoch": 0.283, | |
| "grad_norm": 0.0, | |
| "kl": 0.41015625, | |
| "learning_rate": 7.17e-07, | |
| "loss": 0.0, | |
| "reward": 1.6666667461395264, | |
| "reward_mean": 1.6666667461395264, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.6666666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 283 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 100.875, | |
| "epoch": 0.284, | |
| "grad_norm": 6.159756183624268, | |
| "kl": 0.41015625, | |
| "learning_rate": 7.159999999999999e-07, | |
| "loss": 0.0617, | |
| "reward": 1.3125, | |
| "reward_mean": 1.3125, | |
| "reward_std": 0.3104073107242584, | |
| "rewards/accuracy_reward": 0.3125, | |
| "rewards/format_reward": 1.0, | |
| "step": 284 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 111.3125, | |
| "epoch": 0.285, | |
| "grad_norm": 5.778822898864746, | |
| "kl": 0.412109375, | |
| "learning_rate": 7.149999999999999e-07, | |
| "loss": -0.0822, | |
| "reward": 1.90625, | |
| "reward_mean": 1.90625, | |
| "reward_std": 0.2651650309562683, | |
| "rewards/accuracy_reward": 0.90625, | |
| "rewards/format_reward": 1.0, | |
| "step": 285 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 110.1875, | |
| "epoch": 0.286, | |
| "grad_norm": 0.0, | |
| "kl": 0.40234375, | |
| "learning_rate": 7.14e-07, | |
| "loss": 0.0, | |
| "reward": 1.8333333730697632, | |
| "reward_mean": 1.8333333730697632, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.8333333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 286 | |
| }, | |
| { | |
| "advantages": -1.2665987014770508e-07, | |
| "completion_length": 121.25, | |
| "epoch": 0.287, | |
| "grad_norm": 4.658452987670898, | |
| "kl": 0.42578125, | |
| "learning_rate": 7.129999999999999e-07, | |
| "loss": -0.01, | |
| "reward": 1.9791667461395264, | |
| "reward_mean": 1.9791667461395264, | |
| "reward_std": 0.05892554670572281, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 287 | |
| }, | |
| { | |
| "advantages": -1.2665987014770508e-07, | |
| "completion_length": 151.0, | |
| "epoch": 0.288, | |
| "grad_norm": 3.2589261531829834, | |
| "kl": 0.45703125, | |
| "learning_rate": 7.119999999999999e-07, | |
| "loss": -0.0573, | |
| "reward": 1.4791667461395264, | |
| "reward_mean": 1.4791667461395264, | |
| "reward_std": 0.05892554670572281, | |
| "rewards/accuracy_reward": 0.4791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 288 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-08, | |
| "completion_length": 112.9375, | |
| "epoch": 0.289, | |
| "grad_norm": 4.990071773529053, | |
| "kl": 0.400390625, | |
| "learning_rate": 7.11e-07, | |
| "loss": -0.0088, | |
| "reward": 1.9166667461395264, | |
| "reward_mean": 1.9166667461395264, | |
| "reward_std": 0.12598814070224762, | |
| "rewards/accuracy_reward": 0.9166666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 289 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 123.6875, | |
| "epoch": 0.29, | |
| "grad_norm": 4.007847309112549, | |
| "kl": 0.4375, | |
| "learning_rate": 7.1e-07, | |
| "loss": 0.0285, | |
| "reward": 1.78125, | |
| "reward_mean": 1.78125, | |
| "reward_std": 0.1602174937725067, | |
| "rewards/accuracy_reward": 0.78125, | |
| "rewards/format_reward": 1.0, | |
| "step": 290 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 108.4375, | |
| "epoch": 0.291, | |
| "grad_norm": 4.9294867515563965, | |
| "kl": 0.443359375, | |
| "learning_rate": 7.089999999999999e-07, | |
| "loss": -0.0249, | |
| "reward": 1.6875, | |
| "reward_mean": 1.6875, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/accuracy_reward": 0.6875, | |
| "rewards/format_reward": 1.0, | |
| "step": 291 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 135.375, | |
| "epoch": 0.292, | |
| "grad_norm": 4.507473945617676, | |
| "kl": 0.3984375, | |
| "learning_rate": 7.079999999999999e-07, | |
| "loss": 0.0089, | |
| "reward": 1.34375, | |
| "reward_mean": 1.34375, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/accuracy_reward": 0.34375, | |
| "rewards/format_reward": 1.0, | |
| "step": 292 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 122.25, | |
| "epoch": 0.293, | |
| "grad_norm": 0.0, | |
| "kl": 0.41796875, | |
| "learning_rate": 7.07e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 293 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 135.625, | |
| "epoch": 0.294, | |
| "grad_norm": 5.223430633544922, | |
| "kl": 0.466796875, | |
| "learning_rate": 7.059999999999999e-07, | |
| "loss": 0.0928, | |
| "reward": 1.46875, | |
| "reward_mean": 1.46875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.46875, | |
| "rewards/format_reward": 1.0, | |
| "step": 294 | |
| }, | |
| { | |
| "advantages": -1.2665987014770508e-07, | |
| "completion_length": 105.6875, | |
| "epoch": 0.295, | |
| "grad_norm": 5.42147970199585, | |
| "kl": 0.44140625, | |
| "learning_rate": 7.049999999999999e-07, | |
| "loss": 0.0288, | |
| "reward": 1.9791667461395264, | |
| "reward_mean": 1.9791667461395264, | |
| "reward_std": 0.05892554670572281, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 295 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 126.6875, | |
| "epoch": 0.296, | |
| "grad_norm": 3.41044545173645, | |
| "kl": 0.51953125, | |
| "learning_rate": 7.04e-07, | |
| "loss": -0.0624, | |
| "reward": 1.71875, | |
| "reward_mean": 1.71875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.71875, | |
| "rewards/format_reward": 1.0, | |
| "step": 296 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 126.1875, | |
| "epoch": 0.297, | |
| "grad_norm": 0.0, | |
| "kl": 0.42578125, | |
| "learning_rate": 7.029999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 297 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 107.6875, | |
| "epoch": 0.298, | |
| "grad_norm": 4.168430328369141, | |
| "kl": 0.390625, | |
| "learning_rate": 7.019999999999999e-07, | |
| "loss": 0.0322, | |
| "reward": 1.0625, | |
| "reward_mean": 1.0625, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.0625, | |
| "rewards/format_reward": 1.0, | |
| "step": 298 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 135.125, | |
| "epoch": 0.299, | |
| "grad_norm": 0.0, | |
| "kl": 0.4375, | |
| "learning_rate": 7.009999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 299 | |
| }, | |
| { | |
| "advantages": -2.2351741790771484e-08, | |
| "completion_length": 123.4375, | |
| "epoch": 0.3, | |
| "grad_norm": 7.8173346519470215, | |
| "kl": 0.45703125, | |
| "learning_rate": 7e-07, | |
| "loss": 0.2183, | |
| "reward": 1.8125, | |
| "reward_mean": 1.8125, | |
| "reward_std": 0.4082317352294922, | |
| "rewards/accuracy_reward": 0.8125, | |
| "rewards/format_reward": 1.0, | |
| "step": 300 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 112.4375, | |
| "epoch": 0.301, | |
| "grad_norm": 4.600705623626709, | |
| "kl": 0.46484375, | |
| "learning_rate": 6.989999999999999e-07, | |
| "loss": 0.0528, | |
| "reward": 1.6458333730697632, | |
| "reward_mean": 1.6458333730697632, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/accuracy_reward": 0.6458333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 301 | |
| }, | |
| { | |
| "advantages": 7.078051567077637e-08, | |
| "completion_length": 119.1875, | |
| "epoch": 0.302, | |
| "grad_norm": 4.796161651611328, | |
| "kl": 0.53515625, | |
| "learning_rate": 6.979999999999999e-07, | |
| "loss": 0.057, | |
| "reward": 1.9375, | |
| "reward_mean": 1.9375, | |
| "reward_std": 0.0862581804394722, | |
| "rewards/accuracy_reward": 0.9375000596046448, | |
| "rewards/format_reward": 1.0, | |
| "step": 302 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 116.0, | |
| "epoch": 0.303, | |
| "grad_norm": 4.70276403427124, | |
| "kl": 0.515625, | |
| "learning_rate": 6.97e-07, | |
| "loss": -0.0333, | |
| "reward": 1.46875, | |
| "reward_mean": 1.46875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.46875, | |
| "rewards/format_reward": 1.0, | |
| "step": 303 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 123.875, | |
| "epoch": 0.304, | |
| "grad_norm": 4.684284687042236, | |
| "kl": 0.43359375, | |
| "learning_rate": 6.959999999999999e-07, | |
| "loss": -0.0197, | |
| "reward": 1.96875, | |
| "reward_mean": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.96875, | |
| "rewards/format_reward": 1.0, | |
| "step": 304 | |
| }, | |
| { | |
| "advantages": 3.725290298461914e-09, | |
| "completion_length": 104.625, | |
| "epoch": 0.305, | |
| "grad_norm": 4.7765889167785645, | |
| "kl": 0.5078125, | |
| "learning_rate": 6.949999999999999e-07, | |
| "loss": -0.0436, | |
| "reward": 1.90625, | |
| "reward_mean": 1.90625, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/accuracy_reward": 0.90625, | |
| "rewards/format_reward": 1.0, | |
| "step": 305 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 112.125, | |
| "epoch": 0.306, | |
| "grad_norm": 0.0, | |
| "kl": 0.419921875, | |
| "learning_rate": 6.939999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 306 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 111.5, | |
| "epoch": 0.307, | |
| "grad_norm": 8.246498107910156, | |
| "kl": 0.4765625, | |
| "learning_rate": 6.929999999999999e-07, | |
| "loss": 0.1117, | |
| "reward": 1.6875, | |
| "reward_mean": 1.6875, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.6875, | |
| "rewards/format_reward": 1.0, | |
| "step": 307 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 116.6875, | |
| "epoch": 0.308, | |
| "grad_norm": 0.0, | |
| "kl": 0.39453125, | |
| "learning_rate": 6.919999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.0, | |
| "reward_mean": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 308 | |
| }, | |
| { | |
| "advantages": -2.60770320892334e-08, | |
| "completion_length": 105.5, | |
| "epoch": 0.309, | |
| "grad_norm": 8.390800476074219, | |
| "kl": 0.470703125, | |
| "learning_rate": 6.909999999999999e-07, | |
| "loss": -0.0529, | |
| "reward": 1.25, | |
| "reward_mean": 1.25, | |
| "reward_std": 0.4355512857437134, | |
| "rewards/accuracy_reward": 0.3125, | |
| "rewards/format_reward": 0.9375, | |
| "step": 309 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 108.0, | |
| "epoch": 0.31, | |
| "grad_norm": 0.0, | |
| "kl": 0.482421875, | |
| "learning_rate": 6.9e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 310 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 112.5625, | |
| "epoch": 0.311, | |
| "grad_norm": 0.0, | |
| "kl": 0.51953125, | |
| "learning_rate": 6.889999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.0, | |
| "reward_mean": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 311 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 105.8125, | |
| "epoch": 0.312, | |
| "grad_norm": 0.0, | |
| "kl": 0.41796875, | |
| "learning_rate": 6.879999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 312 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 95.5625, | |
| "epoch": 0.313, | |
| "grad_norm": 5.6205830574035645, | |
| "kl": 0.5390625, | |
| "learning_rate": 6.87e-07, | |
| "loss": -0.0113, | |
| "reward": 1.875, | |
| "reward_mean": 1.875, | |
| "reward_std": 0.3535533845424652, | |
| "rewards/accuracy_reward": 0.9375, | |
| "rewards/format_reward": 0.9375, | |
| "step": 313 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 111.5, | |
| "epoch": 0.314, | |
| "grad_norm": 0.0, | |
| "kl": 0.416015625, | |
| "learning_rate": 6.86e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 314 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 123.125, | |
| "epoch": 0.315, | |
| "grad_norm": 0.0, | |
| "kl": 0.46875, | |
| "learning_rate": 6.85e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 315 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 113.3125, | |
| "epoch": 0.316, | |
| "grad_norm": 0.0, | |
| "kl": 0.419921875, | |
| "learning_rate": 6.84e-07, | |
| "loss": 0.0, | |
| "reward": 1.0, | |
| "reward_mean": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 316 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-08, | |
| "completion_length": 116.9375, | |
| "epoch": 0.317, | |
| "grad_norm": 5.153122901916504, | |
| "kl": 0.453125, | |
| "learning_rate": 6.830000000000001e-07, | |
| "loss": 0.0341, | |
| "reward": 1.6875, | |
| "reward_mean": 1.6875, | |
| "reward_std": 0.2587745785713196, | |
| "rewards/accuracy_reward": 0.6875, | |
| "rewards/format_reward": 1.0, | |
| "step": 317 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 144.5625, | |
| "epoch": 0.318, | |
| "grad_norm": 0.0, | |
| "kl": 0.4375, | |
| "learning_rate": 6.82e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 318 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 102.3125, | |
| "epoch": 0.319, | |
| "grad_norm": 0.0, | |
| "kl": 0.5234375, | |
| "learning_rate": 6.81e-07, | |
| "loss": 0.0, | |
| "reward": 1.0, | |
| "reward_mean": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 319 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 109.75, | |
| "epoch": 0.32, | |
| "grad_norm": 0.0, | |
| "kl": 0.5078125, | |
| "learning_rate": 6.800000000000001e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 320 | |
| }, | |
| { | |
| "advantages": -1.2665987014770508e-07, | |
| "completion_length": 104.5625, | |
| "epoch": 0.321, | |
| "grad_norm": 4.661564826965332, | |
| "kl": 0.53125, | |
| "learning_rate": 6.79e-07, | |
| "loss": -0.0622, | |
| "reward": 1.9791667461395264, | |
| "reward_mean": 1.9791667461395264, | |
| "reward_std": 0.05892554670572281, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 321 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 121.375, | |
| "epoch": 0.322, | |
| "grad_norm": 5.486865043640137, | |
| "kl": 0.49609375, | |
| "learning_rate": 6.78e-07, | |
| "loss": 0.0983, | |
| "reward": 1.25, | |
| "reward_mean": 1.25, | |
| "reward_std": 0.26726123690605164, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 1.0, | |
| "step": 322 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 128.4375, | |
| "epoch": 0.323, | |
| "grad_norm": 3.9005072116851807, | |
| "kl": 0.4453125, | |
| "learning_rate": 6.77e-07, | |
| "loss": -0.041, | |
| "reward": 1.9375, | |
| "reward_mean": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.9375, | |
| "rewards/format_reward": 1.0, | |
| "step": 323 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 115.8125, | |
| "epoch": 0.324, | |
| "grad_norm": 0.0, | |
| "kl": 0.44140625, | |
| "learning_rate": 6.76e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 324 | |
| }, | |
| { | |
| "advantages": 1.1175870895385742e-08, | |
| "completion_length": 125.4375, | |
| "epoch": 0.325, | |
| "grad_norm": 5.992334842681885, | |
| "kl": 0.40625, | |
| "learning_rate": 6.75e-07, | |
| "loss": 0.0715, | |
| "reward": 1.3125, | |
| "reward_mean": 1.3125, | |
| "reward_std": 0.447756826877594, | |
| "rewards/accuracy_reward": 0.3125, | |
| "rewards/format_reward": 1.0, | |
| "step": 325 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 117.0, | |
| "epoch": 0.326, | |
| "grad_norm": 0.0, | |
| "kl": 0.482421875, | |
| "learning_rate": 6.74e-07, | |
| "loss": 0.0, | |
| "reward": 1.0, | |
| "reward_mean": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 326 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 121.6875, | |
| "epoch": 0.327, | |
| "grad_norm": 5.490609169006348, | |
| "kl": 0.5078125, | |
| "learning_rate": 6.730000000000001e-07, | |
| "loss": -0.0609, | |
| "reward": 1.5625, | |
| "reward_mean": 1.5625, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 1.0, | |
| "step": 327 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 125.5, | |
| "epoch": 0.328, | |
| "grad_norm": 4.8279337882995605, | |
| "kl": 0.41796875, | |
| "learning_rate": 6.72e-07, | |
| "loss": -0.0221, | |
| "reward": 1.25, | |
| "reward_mean": 1.25, | |
| "reward_std": 0.13363061845302582, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 1.0, | |
| "step": 328 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 115.3125, | |
| "epoch": 0.329, | |
| "grad_norm": 0.0, | |
| "kl": 1.2578125, | |
| "learning_rate": 6.71e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 329 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 124.5, | |
| "epoch": 0.33, | |
| "grad_norm": 0.0, | |
| "kl": 0.49609375, | |
| "learning_rate": 6.7e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 330 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 117.375, | |
| "epoch": 0.331, | |
| "grad_norm": 0.0, | |
| "kl": 0.4453125, | |
| "learning_rate": 6.69e-07, | |
| "loss": 0.0, | |
| "reward": 1.3333333730697632, | |
| "reward_mean": 1.3333333730697632, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 331 | |
| }, | |
| { | |
| "advantages": -2.2351741790771484e-08, | |
| "completion_length": 113.0, | |
| "epoch": 0.332, | |
| "grad_norm": 6.589673042297363, | |
| "kl": 0.390625, | |
| "learning_rate": 6.68e-07, | |
| "loss": 0.0033, | |
| "reward": 1.3125, | |
| "reward_mean": 1.3125, | |
| "reward_std": 0.4082317352294922, | |
| "rewards/accuracy_reward": 0.375, | |
| "rewards/format_reward": 0.9375, | |
| "step": 332 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 116.5, | |
| "epoch": 0.333, | |
| "grad_norm": 0.0, | |
| "kl": 0.48828125, | |
| "learning_rate": 6.67e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 333 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-08, | |
| "completion_length": 125.5625, | |
| "epoch": 0.334, | |
| "grad_norm": 4.017887592315674, | |
| "kl": 0.625, | |
| "learning_rate": 6.66e-07, | |
| "loss": -0.0525, | |
| "reward": 1.8333333730697632, | |
| "reward_mean": 1.8333333730697632, | |
| "reward_std": 0.2182178944349289, | |
| "rewards/accuracy_reward": 0.8333333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 334 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 118.5, | |
| "epoch": 0.335, | |
| "grad_norm": 5.249420166015625, | |
| "kl": 0.453125, | |
| "learning_rate": 6.65e-07, | |
| "loss": 0.0686, | |
| "reward": 1.875, | |
| "reward_mean": 1.875, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/accuracy_reward": 0.875, | |
| "rewards/format_reward": 1.0, | |
| "step": 335 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 111.25, | |
| "epoch": 0.336, | |
| "grad_norm": 0.0, | |
| "kl": 0.44921875, | |
| "learning_rate": 6.64e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 336 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 121.125, | |
| "epoch": 0.337, | |
| "grad_norm": 0.0, | |
| "kl": 0.4609375, | |
| "learning_rate": 6.63e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 337 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 116.375, | |
| "epoch": 0.338, | |
| "grad_norm": 0.0, | |
| "kl": 0.466796875, | |
| "learning_rate": 6.62e-07, | |
| "loss": 0.0, | |
| "reward": 1.0, | |
| "reward_mean": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 338 | |
| }, | |
| { | |
| "advantages": 3.725290298461914e-09, | |
| "completion_length": 110.5, | |
| "epoch": 0.339, | |
| "grad_norm": 4.943254470825195, | |
| "kl": 0.48828125, | |
| "learning_rate": 6.61e-07, | |
| "loss": 0.0704, | |
| "reward": 1.15625, | |
| "reward_mean": 1.15625, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/accuracy_reward": 0.15625, | |
| "rewards/format_reward": 1.0, | |
| "step": 339 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 114.625, | |
| "epoch": 0.34, | |
| "grad_norm": 4.797520637512207, | |
| "kl": 0.4921875, | |
| "learning_rate": 6.6e-07, | |
| "loss": 0.0547, | |
| "reward": 1.78125, | |
| "reward_mean": 1.78125, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.78125, | |
| "rewards/format_reward": 1.0, | |
| "step": 340 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 123.1875, | |
| "epoch": 0.341, | |
| "grad_norm": 5.215485095977783, | |
| "kl": 0.447265625, | |
| "learning_rate": 6.59e-07, | |
| "loss": 0.0024, | |
| "reward": 1.6770833730697632, | |
| "reward_mean": 1.6770833730697632, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/accuracy_reward": 0.6770833730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 341 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 132.0625, | |
| "epoch": 0.342, | |
| "grad_norm": 0.0, | |
| "kl": 0.48828125, | |
| "learning_rate": 6.58e-07, | |
| "loss": 0.0, | |
| "reward": 1.0, | |
| "reward_mean": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 342 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 100.0, | |
| "epoch": 0.343, | |
| "grad_norm": 0.0, | |
| "kl": 0.43359375, | |
| "learning_rate": 6.57e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 343 | |
| }, | |
| { | |
| "advantages": -1.2665987014770508e-07, | |
| "completion_length": 124.125, | |
| "epoch": 0.344, | |
| "grad_norm": 5.066404819488525, | |
| "kl": 0.46484375, | |
| "learning_rate": 6.56e-07, | |
| "loss": 0.0417, | |
| "reward": 1.9791667461395264, | |
| "reward_mean": 1.9791667461395264, | |
| "reward_std": 0.05892554670572281, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 344 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 111.9375, | |
| "epoch": 0.345, | |
| "grad_norm": 0.0, | |
| "kl": 0.5390625, | |
| "learning_rate": 6.55e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 345 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 125.25, | |
| "epoch": 0.346, | |
| "grad_norm": 5.6763505935668945, | |
| "kl": 0.4765625, | |
| "learning_rate": 6.54e-07, | |
| "loss": 0.0047, | |
| "reward": 1.25, | |
| "reward_mean": 1.25, | |
| "reward_std": 0.26726123690605164, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 1.0, | |
| "step": 346 | |
| }, | |
| { | |
| "advantages": 3.725290298461914e-09, | |
| "completion_length": 108.8125, | |
| "epoch": 0.347, | |
| "grad_norm": 5.239328384399414, | |
| "kl": 0.71484375, | |
| "learning_rate": 6.53e-07, | |
| "loss": 0.0418, | |
| "reward": 1.90625, | |
| "reward_mean": 1.90625, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/accuracy_reward": 0.90625, | |
| "rewards/format_reward": 1.0, | |
| "step": 347 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 113.0, | |
| "epoch": 0.348, | |
| "grad_norm": 0.0, | |
| "kl": 0.4609375, | |
| "learning_rate": 6.52e-07, | |
| "loss": 0.0, | |
| "reward": 1.25, | |
| "reward_mean": 1.25, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 1.0, | |
| "step": 348 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 116.125, | |
| "epoch": 0.349, | |
| "grad_norm": 0.0, | |
| "kl": 0.4765625, | |
| "learning_rate": 6.51e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 349 | |
| }, | |
| { | |
| "advantages": -2.2351741790771484e-08, | |
| "completion_length": 105.3125, | |
| "epoch": 0.35, | |
| "grad_norm": 7.965950012207031, | |
| "kl": 0.447265625, | |
| "learning_rate": 6.5e-07, | |
| "loss": -0.0053, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.6307864785194397, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 0.9375, | |
| "step": 350 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 135.9375, | |
| "epoch": 0.351, | |
| "grad_norm": 0.0, | |
| "kl": 0.625, | |
| "learning_rate": 6.49e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 351 | |
| }, | |
| { | |
| "advantages": 1.9371509552001953e-07, | |
| "completion_length": 142.0, | |
| "epoch": 0.352, | |
| "grad_norm": 3.862729787826538, | |
| "kl": 0.453125, | |
| "learning_rate": 6.48e-07, | |
| "loss": -0.0357, | |
| "reward": 1.875, | |
| "reward_mean": 1.875, | |
| "reward_std": 0.07715165615081787, | |
| "rewards/accuracy_reward": 0.875, | |
| "rewards/format_reward": 1.0, | |
| "step": 352 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 109.625, | |
| "epoch": 0.353, | |
| "grad_norm": 0.0, | |
| "kl": 0.57421875, | |
| "learning_rate": 6.47e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 353 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 106.125, | |
| "epoch": 0.354, | |
| "grad_norm": 0.0, | |
| "kl": 0.5078125, | |
| "learning_rate": 6.46e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 354 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-08, | |
| "completion_length": 113.3125, | |
| "epoch": 0.355, | |
| "grad_norm": 5.779082775115967, | |
| "kl": 0.52734375, | |
| "learning_rate": 6.45e-07, | |
| "loss": -0.0804, | |
| "reward": 1.1875, | |
| "reward_mean": 1.1875, | |
| "reward_std": 0.2587745785713196, | |
| "rewards/accuracy_reward": 0.1875, | |
| "rewards/format_reward": 1.0, | |
| "step": 355 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 112.4375, | |
| "epoch": 0.356, | |
| "grad_norm": 0.0, | |
| "kl": 0.42578125, | |
| "learning_rate": 6.44e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 356 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 117.6875, | |
| "epoch": 0.357, | |
| "grad_norm": 6.260042190551758, | |
| "kl": 0.458984375, | |
| "learning_rate": 6.43e-07, | |
| "loss": -0.081, | |
| "reward": 1.625, | |
| "reward_mean": 1.625, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/accuracy_reward": 0.625, | |
| "rewards/format_reward": 1.0, | |
| "step": 357 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 133.5625, | |
| "epoch": 0.358, | |
| "grad_norm": 0.0, | |
| "kl": 0.46875, | |
| "learning_rate": 6.42e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 358 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 123.0625, | |
| "epoch": 0.359, | |
| "grad_norm": 0.0, | |
| "kl": 1.40625, | |
| "learning_rate": 6.41e-07, | |
| "loss": 0.0, | |
| "reward": 1.0, | |
| "reward_mean": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 359 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 115.9375, | |
| "epoch": 0.36, | |
| "grad_norm": 0.0, | |
| "kl": 0.4453125, | |
| "learning_rate": 6.4e-07, | |
| "loss": 0.0, | |
| "reward": 1.25, | |
| "reward_mean": 1.25, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 1.0, | |
| "step": 360 | |
| }, | |
| { | |
| "advantages": -7.078051567077637e-08, | |
| "completion_length": 125.375, | |
| "epoch": 0.361, | |
| "grad_norm": 4.933547019958496, | |
| "kl": 0.48828125, | |
| "learning_rate": 6.389999999999999e-07, | |
| "loss": -0.1138, | |
| "reward": 1.8958333730697632, | |
| "reward_mean": 1.8958333730697632, | |
| "reward_std": 0.0862581804394722, | |
| "rewards/accuracy_reward": 0.8958333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 361 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 131.9375, | |
| "epoch": 0.362, | |
| "grad_norm": 5.297484874725342, | |
| "kl": 0.458984375, | |
| "learning_rate": 6.38e-07, | |
| "loss": -0.0348, | |
| "reward": 1.625, | |
| "reward_mean": 1.625, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/accuracy_reward": 0.625, | |
| "rewards/format_reward": 1.0, | |
| "step": 362 | |
| }, | |
| { | |
| "advantages": 6.705522537231445e-08, | |
| "completion_length": 124.125, | |
| "epoch": 0.363, | |
| "grad_norm": 6.302598476409912, | |
| "kl": 0.4140625, | |
| "learning_rate": 6.37e-07, | |
| "loss": -0.0008, | |
| "reward": 1.7604167461395264, | |
| "reward_mean": 1.7604167461395264, | |
| "reward_std": 0.2062394917011261, | |
| "rewards/accuracy_reward": 0.7604167461395264, | |
| "rewards/format_reward": 1.0, | |
| "step": 363 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 135.25, | |
| "epoch": 0.364, | |
| "grad_norm": 3.608915328979492, | |
| "kl": 0.390625, | |
| "learning_rate": 6.36e-07, | |
| "loss": -0.0212, | |
| "reward": 1.4375, | |
| "reward_mean": 1.4375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.4375, | |
| "rewards/format_reward": 1.0, | |
| "step": 364 | |
| }, | |
| { | |
| "advantages": -6.705522537231445e-08, | |
| "completion_length": 140.625, | |
| "epoch": 0.365, | |
| "grad_norm": 5.799376964569092, | |
| "kl": 0.4296875, | |
| "learning_rate": 6.35e-07, | |
| "loss": 0.0235, | |
| "reward": 1.4583333730697632, | |
| "reward_mean": 1.4583333730697632, | |
| "reward_std": 0.2630348801612854, | |
| "rewards/accuracy_reward": 0.4583333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 365 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 129.8125, | |
| "epoch": 0.366, | |
| "grad_norm": 0.0, | |
| "kl": 0.4453125, | |
| "learning_rate": 6.34e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 366 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 137.6875, | |
| "epoch": 0.367, | |
| "grad_norm": 4.999783039093018, | |
| "kl": 0.44140625, | |
| "learning_rate": 6.33e-07, | |
| "loss": 0.0351, | |
| "reward": 1.625, | |
| "reward_mean": 1.625, | |
| "reward_std": 0.13363061845302582, | |
| "rewards/accuracy_reward": 0.625, | |
| "rewards/format_reward": 1.0, | |
| "step": 367 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 125.875, | |
| "epoch": 0.368, | |
| "grad_norm": 0.0, | |
| "kl": 0.3671875, | |
| "learning_rate": 6.319999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 368 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 134.875, | |
| "epoch": 0.369, | |
| "grad_norm": 0.0, | |
| "kl": 0.43359375, | |
| "learning_rate": 6.31e-07, | |
| "loss": 0.0, | |
| "reward": 1.0, | |
| "reward_mean": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 369 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 136.8125, | |
| "epoch": 0.37, | |
| "grad_norm": 0.0, | |
| "kl": 0.427734375, | |
| "learning_rate": 6.3e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 370 | |
| }, | |
| { | |
| "advantages": -8.195638656616211e-08, | |
| "completion_length": 142.375, | |
| "epoch": 0.371, | |
| "grad_norm": 6.962843418121338, | |
| "kl": 0.40625, | |
| "learning_rate": 6.289999999999999e-07, | |
| "loss": -0.0594, | |
| "reward": 1.8333333730697632, | |
| "reward_mean": 1.8333333730697632, | |
| "reward_std": 0.2630348801612854, | |
| "rewards/accuracy_reward": 0.8333333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 371 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 136.625, | |
| "epoch": 0.372, | |
| "grad_norm": 6.798043251037598, | |
| "kl": 0.447265625, | |
| "learning_rate": 6.28e-07, | |
| "loss": -0.0675, | |
| "reward": 1.46875, | |
| "reward_mean": 1.46875, | |
| "reward_std": 0.2651650309562683, | |
| "rewards/accuracy_reward": 0.46875, | |
| "rewards/format_reward": 1.0, | |
| "step": 372 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 130.625, | |
| "epoch": 0.373, | |
| "grad_norm": 5.091549396514893, | |
| "kl": 0.447265625, | |
| "learning_rate": 6.27e-07, | |
| "loss": 0.0482, | |
| "reward": 1.96875, | |
| "reward_mean": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.96875, | |
| "rewards/format_reward": 1.0, | |
| "step": 373 | |
| }, | |
| { | |
| "advantages": 3.725290298461914e-09, | |
| "completion_length": 131.5625, | |
| "epoch": 0.374, | |
| "grad_norm": 5.158649444580078, | |
| "kl": 0.4453125, | |
| "learning_rate": 6.26e-07, | |
| "loss": 0.0248, | |
| "reward": 1.90625, | |
| "reward_mean": 1.90625, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/accuracy_reward": 0.90625, | |
| "rewards/format_reward": 1.0, | |
| "step": 374 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 150.875, | |
| "epoch": 0.375, | |
| "grad_norm": 4.258111953735352, | |
| "kl": 0.392578125, | |
| "learning_rate": 6.249999999999999e-07, | |
| "loss": -0.0959, | |
| "reward": 1.34375, | |
| "reward_mean": 1.34375, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/accuracy_reward": 0.34375, | |
| "rewards/format_reward": 1.0, | |
| "step": 375 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 128.4375, | |
| "epoch": 0.376, | |
| "grad_norm": 4.292641639709473, | |
| "kl": 0.40625, | |
| "learning_rate": 6.24e-07, | |
| "loss": 0.0573, | |
| "reward": 1.625, | |
| "reward_mean": 1.625, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/accuracy_reward": 0.625, | |
| "rewards/format_reward": 1.0, | |
| "step": 376 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 129.8125, | |
| "epoch": 0.377, | |
| "grad_norm": 0.0, | |
| "kl": 0.41796875, | |
| "learning_rate": 6.23e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 377 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 161.0, | |
| "epoch": 0.378, | |
| "grad_norm": 0.0, | |
| "kl": 0.390625, | |
| "learning_rate": 6.219999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.8333333730697632, | |
| "reward_mean": 1.8333333730697632, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.8333333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 378 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 148.5625, | |
| "epoch": 0.379, | |
| "grad_norm": 4.622002124786377, | |
| "kl": 0.42578125, | |
| "learning_rate": 6.21e-07, | |
| "loss": 0.0414, | |
| "reward": 1.9166667461395264, | |
| "reward_mean": 1.9166667461395264, | |
| "reward_std": 0.08908706158399582, | |
| "rewards/accuracy_reward": 0.9166666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 379 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 130.8125, | |
| "epoch": 0.38, | |
| "grad_norm": 6.805364608764648, | |
| "kl": 0.4453125, | |
| "learning_rate": 6.2e-07, | |
| "loss": 0.1685, | |
| "reward": 1.96875, | |
| "reward_mean": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.96875, | |
| "rewards/format_reward": 1.0, | |
| "step": 380 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 146.0, | |
| "epoch": 0.381, | |
| "grad_norm": 4.019841194152832, | |
| "kl": 0.416015625, | |
| "learning_rate": 6.189999999999999e-07, | |
| "loss": -0.0306, | |
| "reward": 1.75, | |
| "reward_mean": 1.75, | |
| "reward_std": 0.26726123690605164, | |
| "rewards/accuracy_reward": 0.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 381 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 133.1875, | |
| "epoch": 0.382, | |
| "grad_norm": 0.0, | |
| "kl": 0.54296875, | |
| "learning_rate": 6.18e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 382 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 129.8125, | |
| "epoch": 0.383, | |
| "grad_norm": 4.163370132446289, | |
| "kl": 0.435546875, | |
| "learning_rate": 6.17e-07, | |
| "loss": -0.0085, | |
| "reward": 1.4375, | |
| "reward_mean": 1.4375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.4375, | |
| "rewards/format_reward": 1.0, | |
| "step": 383 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 170.8125, | |
| "epoch": 0.384, | |
| "grad_norm": 3.4316840171813965, | |
| "kl": 0.4453125, | |
| "learning_rate": 6.16e-07, | |
| "loss": -0.1353, | |
| "reward": 1.875, | |
| "reward_mean": 1.875, | |
| "reward_std": 0.18898223340511322, | |
| "rewards/accuracy_reward": 0.875, | |
| "rewards/format_reward": 1.0, | |
| "step": 384 | |
| }, | |
| { | |
| "advantages": 6.705522537231445e-08, | |
| "completion_length": 181.375, | |
| "epoch": 0.385, | |
| "grad_norm": 3.732250690460205, | |
| "kl": 0.486328125, | |
| "learning_rate": 6.149999999999999e-07, | |
| "loss": -0.0222, | |
| "reward": 1.2916667461395264, | |
| "reward_mean": 1.2916667461395264, | |
| "reward_std": 0.1178511530160904, | |
| "rewards/accuracy_reward": 0.2916666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 385 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 143.1875, | |
| "epoch": 0.386, | |
| "grad_norm": 4.219268321990967, | |
| "kl": 0.404296875, | |
| "learning_rate": 6.14e-07, | |
| "loss": 0.0139, | |
| "reward": 1.9375, | |
| "reward_mean": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.9375, | |
| "rewards/format_reward": 1.0, | |
| "step": 386 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-08, | |
| "completion_length": 145.0625, | |
| "epoch": 0.387, | |
| "grad_norm": 4.608545780181885, | |
| "kl": 0.5234375, | |
| "learning_rate": 6.13e-07, | |
| "loss": 0.0689, | |
| "reward": 1.0833333730697632, | |
| "reward_mean": 1.0833333730697632, | |
| "reward_std": 0.15430335700511932, | |
| "rewards/accuracy_reward": 0.0833333358168602, | |
| "rewards/format_reward": 1.0, | |
| "step": 387 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-08, | |
| "completion_length": 124.1875, | |
| "epoch": 0.388, | |
| "grad_norm": 5.094681262969971, | |
| "kl": 0.4140625, | |
| "learning_rate": 6.119999999999999e-07, | |
| "loss": -0.0963, | |
| "reward": 1.1875, | |
| "reward_mean": 1.1875, | |
| "reward_std": 0.2587745785713196, | |
| "rewards/accuracy_reward": 0.1875, | |
| "rewards/format_reward": 1.0, | |
| "step": 388 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 158.1875, | |
| "epoch": 0.389, | |
| "grad_norm": 4.464499473571777, | |
| "kl": 0.4296875, | |
| "learning_rate": 6.11e-07, | |
| "loss": -0.0446, | |
| "reward": 1.0625, | |
| "reward_mean": 1.0625, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.0625, | |
| "rewards/format_reward": 1.0, | |
| "step": 389 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 182.6875, | |
| "epoch": 0.39, | |
| "grad_norm": 0.0, | |
| "kl": 0.3984375, | |
| "learning_rate": 6.1e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 390 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 154.5625, | |
| "epoch": 0.391, | |
| "grad_norm": 0.0, | |
| "kl": 0.75, | |
| "learning_rate": 6.089999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 391 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 149.4375, | |
| "epoch": 0.392, | |
| "grad_norm": 0.0, | |
| "kl": 0.46484375, | |
| "learning_rate": 6.079999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 392 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 136.8125, | |
| "epoch": 0.393, | |
| "grad_norm": 0.0, | |
| "kl": 0.421875, | |
| "learning_rate": 6.07e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 393 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 169.1875, | |
| "epoch": 0.394, | |
| "grad_norm": 0.0, | |
| "kl": 0.4296875, | |
| "learning_rate": 6.06e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 394 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 164.6875, | |
| "epoch": 0.395, | |
| "grad_norm": 0.0, | |
| "kl": 0.416015625, | |
| "learning_rate": 6.049999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 395 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 128.9375, | |
| "epoch": 0.396, | |
| "grad_norm": 0.0, | |
| "kl": 0.4296875, | |
| "learning_rate": 6.04e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 396 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 153.375, | |
| "epoch": 0.397, | |
| "grad_norm": 0.0, | |
| "kl": 0.392578125, | |
| "learning_rate": 6.03e-07, | |
| "loss": 0.0, | |
| "reward": 1.3333333730697632, | |
| "reward_mean": 1.3333333730697632, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 397 | |
| }, | |
| { | |
| "advantages": 8.195638656616211e-08, | |
| "completion_length": 154.6875, | |
| "epoch": 0.398, | |
| "grad_norm": 5.41452169418335, | |
| "kl": 0.671875, | |
| "learning_rate": 6.019999999999999e-07, | |
| "loss": -0.184, | |
| "reward": 1.5625, | |
| "reward_mean": 1.5625, | |
| "reward_std": 0.3382667005062103, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 1.0, | |
| "step": 398 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 154.3125, | |
| "epoch": 0.399, | |
| "grad_norm": 4.080648899078369, | |
| "kl": 0.453125, | |
| "learning_rate": 6.009999999999999e-07, | |
| "loss": 0.0161, | |
| "reward": 1.6875, | |
| "reward_mean": 1.6875, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/accuracy_reward": 0.6875, | |
| "rewards/format_reward": 1.0, | |
| "step": 399 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 169.75, | |
| "epoch": 0.4, | |
| "grad_norm": 0.0, | |
| "kl": 0.4453125, | |
| "learning_rate": 6e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 400 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 170.5, | |
| "epoch": 0.401, | |
| "grad_norm": 0.0, | |
| "kl": 0.54296875, | |
| "learning_rate": 5.989999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 401 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 141.875, | |
| "epoch": 0.402, | |
| "grad_norm": 0.0, | |
| "kl": 0.4375, | |
| "learning_rate": 5.979999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.0, | |
| "reward_mean": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 402 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 186.8125, | |
| "epoch": 0.403, | |
| "grad_norm": 4.032413959503174, | |
| "kl": 0.375, | |
| "learning_rate": 5.97e-07, | |
| "loss": 0.0794, | |
| "reward": 1.25, | |
| "reward_mean": 1.25, | |
| "reward_std": 0.26726123690605164, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 1.0, | |
| "step": 403 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 142.25, | |
| "epoch": 0.404, | |
| "grad_norm": 4.112726211547852, | |
| "kl": 0.4140625, | |
| "learning_rate": 5.96e-07, | |
| "loss": -0.1048, | |
| "reward": 1.375, | |
| "reward_mean": 1.375, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/accuracy_reward": 0.375, | |
| "rewards/format_reward": 1.0, | |
| "step": 404 | |
| }, | |
| { | |
| "advantages": 6.705522537231445e-08, | |
| "completion_length": 190.5625, | |
| "epoch": 0.405, | |
| "grad_norm": 3.8361196517944336, | |
| "kl": 0.357421875, | |
| "learning_rate": 5.949999999999999e-07, | |
| "loss": 0.0243, | |
| "reward": 1.625, | |
| "reward_mean": 1.625, | |
| "reward_std": 0.1178511530160904, | |
| "rewards/accuracy_reward": 0.6250000596046448, | |
| "rewards/format_reward": 1.0, | |
| "step": 405 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 168.5625, | |
| "epoch": 0.406, | |
| "grad_norm": 0.0, | |
| "kl": 0.390625, | |
| "learning_rate": 5.939999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 406 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 144.75, | |
| "epoch": 0.407, | |
| "grad_norm": 0.0, | |
| "kl": 0.396484375, | |
| "learning_rate": 5.93e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 407 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 200.375, | |
| "epoch": 0.408, | |
| "grad_norm": 0.0, | |
| "kl": 0.3828125, | |
| "learning_rate": 5.919999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.25, | |
| "reward_mean": 1.25, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 1.0, | |
| "step": 408 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 177.75, | |
| "epoch": 0.409, | |
| "grad_norm": 0.0, | |
| "kl": 0.44140625, | |
| "learning_rate": 5.909999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.8333333730697632, | |
| "reward_mean": 1.8333333730697632, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.8333333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 409 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 150.4375, | |
| "epoch": 0.41, | |
| "grad_norm": 0.0, | |
| "kl": 0.40625, | |
| "learning_rate": 5.9e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 410 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-08, | |
| "completion_length": 191.9375, | |
| "epoch": 0.411, | |
| "grad_norm": 3.727123737335205, | |
| "kl": 0.404296875, | |
| "learning_rate": 5.89e-07, | |
| "loss": 0.0212, | |
| "reward": 1.9375, | |
| "reward_mean": 1.9375, | |
| "reward_std": 0.0862581878900528, | |
| "rewards/accuracy_reward": 0.9375000596046448, | |
| "rewards/format_reward": 1.0, | |
| "step": 411 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-08, | |
| "completion_length": 202.25, | |
| "epoch": 0.412, | |
| "grad_norm": 3.3219895362854004, | |
| "kl": 0.37890625, | |
| "learning_rate": 5.879999999999999e-07, | |
| "loss": -0.0367, | |
| "reward": 1.3125, | |
| "reward_mean": 1.3125, | |
| "reward_std": 0.2587745785713196, | |
| "rewards/accuracy_reward": 0.3125, | |
| "rewards/format_reward": 1.0, | |
| "step": 412 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 218.8125, | |
| "epoch": 0.413, | |
| "grad_norm": 3.1788170337677, | |
| "kl": 0.3515625, | |
| "learning_rate": 5.87e-07, | |
| "loss": -0.0693, | |
| "reward": 1.4375, | |
| "reward_mean": 1.4375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.4375, | |
| "rewards/format_reward": 1.0, | |
| "step": 413 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 229.3125, | |
| "epoch": 0.414, | |
| "grad_norm": 0.0, | |
| "kl": 0.36328125, | |
| "learning_rate": 5.86e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 414 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 226.9375, | |
| "epoch": 0.415, | |
| "grad_norm": 2.9099948406219482, | |
| "kl": 0.388671875, | |
| "learning_rate": 5.849999999999999e-07, | |
| "loss": 0.0902, | |
| "reward": 1.0625, | |
| "reward_mean": 1.0625, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.0625, | |
| "rewards/format_reward": 1.0, | |
| "step": 415 | |
| }, | |
| { | |
| "advantages": -2.9802322387695312e-08, | |
| "completion_length": 260.3125, | |
| "epoch": 0.416, | |
| "grad_norm": 4.535805702209473, | |
| "kl": 0.37109375, | |
| "learning_rate": 5.839999999999999e-07, | |
| "loss": -0.0929, | |
| "reward": 1.71875, | |
| "reward_mean": 1.71875, | |
| "reward_std": 0.44478052854537964, | |
| "rewards/accuracy_reward": 0.71875, | |
| "rewards/format_reward": 1.0, | |
| "step": 416 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 213.5625, | |
| "epoch": 0.417, | |
| "grad_norm": 3.6784188747406006, | |
| "kl": 0.39453125, | |
| "learning_rate": 5.83e-07, | |
| "loss": 0.0398, | |
| "reward": 1.6770833730697632, | |
| "reward_mean": 1.6770833730697632, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/accuracy_reward": 0.6770833730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 417 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 169.125, | |
| "epoch": 0.418, | |
| "grad_norm": 0.0, | |
| "kl": 0.34375, | |
| "learning_rate": 5.819999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 418 | |
| }, | |
| { | |
| "advantages": -7.078051567077637e-08, | |
| "completion_length": 241.4375, | |
| "epoch": 0.419, | |
| "grad_norm": 3.5229690074920654, | |
| "kl": 0.380859375, | |
| "learning_rate": 5.809999999999999e-07, | |
| "loss": 0.0437, | |
| "reward": 1.3958333730697632, | |
| "reward_mean": 1.3958333730697632, | |
| "reward_std": 0.0862581804394722, | |
| "rewards/accuracy_reward": 0.3958333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 419 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 199.5, | |
| "epoch": 0.42, | |
| "grad_norm": 0.0, | |
| "kl": 0.392578125, | |
| "learning_rate": 5.8e-07, | |
| "loss": 0.0, | |
| "reward": 1.3333333730697632, | |
| "reward_mean": 1.3333333730697632, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 420 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 234.625, | |
| "epoch": 0.421, | |
| "grad_norm": 4.843015193939209, | |
| "kl": 0.40234375, | |
| "learning_rate": 5.79e-07, | |
| "loss": -0.0363, | |
| "reward": 1.7916667461395264, | |
| "reward_mean": 1.7916667461395264, | |
| "reward_std": 0.3205420970916748, | |
| "rewards/accuracy_reward": 0.7916667461395264, | |
| "rewards/format_reward": 1.0, | |
| "step": 421 | |
| }, | |
| { | |
| "advantages": 1.2665987014770508e-07, | |
| "completion_length": 218.4375, | |
| "epoch": 0.422, | |
| "grad_norm": 5.061634540557861, | |
| "kl": 0.37890625, | |
| "learning_rate": 5.779999999999999e-07, | |
| "loss": 0.0372, | |
| "reward": 1.3854167461395264, | |
| "reward_mean": 1.3854167461395264, | |
| "reward_std": 0.1473138928413391, | |
| "rewards/accuracy_reward": 0.3854166865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 422 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 150.4375, | |
| "epoch": 0.423, | |
| "grad_norm": 4.365501880645752, | |
| "kl": 0.44140625, | |
| "learning_rate": 5.769999999999999e-07, | |
| "loss": 0.0644, | |
| "reward": 1.375, | |
| "reward_mean": 1.375, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/accuracy_reward": 0.375, | |
| "rewards/format_reward": 1.0, | |
| "step": 423 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 184.5, | |
| "epoch": 0.424, | |
| "grad_norm": 0.0, | |
| "kl": 0.380859375, | |
| "learning_rate": 5.76e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 424 | |
| }, | |
| { | |
| "advantages": 3.725290298461914e-09, | |
| "completion_length": 170.8125, | |
| "epoch": 0.425, | |
| "grad_norm": 3.8927226066589355, | |
| "kl": 0.38671875, | |
| "learning_rate": 5.749999999999999e-07, | |
| "loss": -0.0322, | |
| "reward": 1.40625, | |
| "reward_mean": 1.40625, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/accuracy_reward": 0.40625, | |
| "rewards/format_reward": 1.0, | |
| "step": 425 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 201.1875, | |
| "epoch": 0.426, | |
| "grad_norm": 0.0, | |
| "kl": 0.390625, | |
| "learning_rate": 5.739999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 426 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 238.875, | |
| "epoch": 0.427, | |
| "grad_norm": 2.8062853813171387, | |
| "kl": 0.37890625, | |
| "learning_rate": 5.73e-07, | |
| "loss": -0.0371, | |
| "reward": 1.9375, | |
| "reward_mean": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.9375, | |
| "rewards/format_reward": 1.0, | |
| "step": 427 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 191.8125, | |
| "epoch": 0.428, | |
| "grad_norm": 3.564711570739746, | |
| "kl": 0.39453125, | |
| "learning_rate": 5.719999999999999e-07, | |
| "loss": -0.012, | |
| "reward": 1.4375, | |
| "reward_mean": 1.4375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 0.9375, | |
| "step": 428 | |
| }, | |
| { | |
| "advantages": -2.60770320892334e-08, | |
| "completion_length": 237.375, | |
| "epoch": 0.429, | |
| "grad_norm": 5.137650012969971, | |
| "kl": 0.35546875, | |
| "learning_rate": 5.709999999999999e-07, | |
| "loss": -0.0172, | |
| "reward": 1.75, | |
| "reward_mean": 1.75, | |
| "reward_std": 0.4355512857437134, | |
| "rewards/accuracy_reward": 0.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 429 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 228.375, | |
| "epoch": 0.43, | |
| "grad_norm": 0.0, | |
| "kl": 0.40625, | |
| "learning_rate": 5.699999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 430 | |
| }, | |
| { | |
| "advantages": 3.203749656677246e-07, | |
| "completion_length": 189.4375, | |
| "epoch": 0.431, | |
| "grad_norm": 3.582122325897217, | |
| "kl": 0.3828125, | |
| "learning_rate": 5.69e-07, | |
| "loss": 0.0219, | |
| "reward": 1.8125, | |
| "reward_mean": 1.8125, | |
| "reward_std": 0.058925580233335495, | |
| "rewards/accuracy_reward": 0.8125, | |
| "rewards/format_reward": 1.0, | |
| "step": 431 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 153.125, | |
| "epoch": 0.432, | |
| "grad_norm": 0.0, | |
| "kl": 0.3671875, | |
| "learning_rate": 5.679999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 432 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 226.8125, | |
| "epoch": 0.433, | |
| "grad_norm": 0.0, | |
| "kl": 0.38671875, | |
| "learning_rate": 5.669999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 433 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 194.875, | |
| "epoch": 0.434, | |
| "grad_norm": 0.0, | |
| "kl": 0.38671875, | |
| "learning_rate": 5.66e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 434 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 135.875, | |
| "epoch": 0.435, | |
| "grad_norm": 0.0, | |
| "kl": 0.396484375, | |
| "learning_rate": 5.649999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 435 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 173.5, | |
| "epoch": 0.436, | |
| "grad_norm": 0.0, | |
| "kl": 0.390625, | |
| "learning_rate": 5.639999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 436 | |
| }, | |
| { | |
| "advantages": -1.2665987014770508e-07, | |
| "completion_length": 221.3125, | |
| "epoch": 0.437, | |
| "grad_norm": 4.741230010986328, | |
| "kl": 0.3515625, | |
| "learning_rate": 5.629999999999999e-07, | |
| "loss": -0.0242, | |
| "reward": 1.7291667461395264, | |
| "reward_mean": 1.7291667461395264, | |
| "reward_std": 0.32618677616119385, | |
| "rewards/accuracy_reward": 0.7291666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 437 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 229.0625, | |
| "epoch": 0.438, | |
| "grad_norm": 0.0, | |
| "kl": 0.359375, | |
| "learning_rate": 5.620000000000001e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 438 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 217.1875, | |
| "epoch": 0.439, | |
| "grad_norm": 0.0, | |
| "kl": 0.40625, | |
| "learning_rate": 5.61e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 439 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 221.0625, | |
| "epoch": 0.44, | |
| "grad_norm": 0.0, | |
| "kl": 0.49609375, | |
| "learning_rate": 5.6e-07, | |
| "loss": 0.0, | |
| "reward": 1.3333333730697632, | |
| "reward_mean": 1.3333333730697632, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 440 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 192.6875, | |
| "epoch": 0.441, | |
| "grad_norm": 2.999258041381836, | |
| "kl": 0.47265625, | |
| "learning_rate": 5.590000000000001e-07, | |
| "loss": 0.0634, | |
| "reward": 1.5625, | |
| "reward_mean": 1.5625, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 1.0, | |
| "step": 441 | |
| }, | |
| { | |
| "advantages": 1.2665987014770508e-07, | |
| "completion_length": 195.4375, | |
| "epoch": 0.442, | |
| "grad_norm": 4.589319705963135, | |
| "kl": 0.37109375, | |
| "learning_rate": 5.58e-07, | |
| "loss": -0.1397, | |
| "reward": 1.8541667461395264, | |
| "reward_mean": 1.8541667461395264, | |
| "reward_std": 0.05892554670572281, | |
| "rewards/accuracy_reward": 0.8541666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 442 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 204.5625, | |
| "epoch": 0.443, | |
| "grad_norm": 3.8165395259857178, | |
| "kl": 0.46484375, | |
| "learning_rate": 5.57e-07, | |
| "loss": 0.122, | |
| "reward": 1.9166667461395264, | |
| "reward_mean": 1.9166667461395264, | |
| "reward_std": 0.08908707648515701, | |
| "rewards/accuracy_reward": 0.9166667461395264, | |
| "rewards/format_reward": 1.0, | |
| "step": 443 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 235.3125, | |
| "epoch": 0.444, | |
| "grad_norm": 3.3493289947509766, | |
| "kl": 0.40234375, | |
| "learning_rate": 5.560000000000001e-07, | |
| "loss": -0.0344, | |
| "reward": 1.4583333730697632, | |
| "reward_mean": 1.4583333730697632, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/accuracy_reward": 0.4583333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 444 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 173.0625, | |
| "epoch": 0.445, | |
| "grad_norm": 3.844341278076172, | |
| "kl": 0.52734375, | |
| "learning_rate": 5.55e-07, | |
| "loss": 0.0581, | |
| "reward": 1.90625, | |
| "reward_mean": 1.90625, | |
| "reward_std": 0.18600594997406006, | |
| "rewards/accuracy_reward": 0.90625, | |
| "rewards/format_reward": 1.0, | |
| "step": 445 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 164.0, | |
| "epoch": 0.446, | |
| "grad_norm": 0.0, | |
| "kl": 0.421875, | |
| "learning_rate": 5.54e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 446 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 160.0625, | |
| "epoch": 0.447, | |
| "grad_norm": 3.5318963527679443, | |
| "kl": 0.37890625, | |
| "learning_rate": 5.53e-07, | |
| "loss": 0.0321, | |
| "reward": 1.5625, | |
| "reward_mean": 1.5625, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 1.0, | |
| "step": 447 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 207.6875, | |
| "epoch": 0.448, | |
| "grad_norm": 5.7608489990234375, | |
| "kl": 0.359375, | |
| "learning_rate": 5.520000000000001e-07, | |
| "loss": 0.0509, | |
| "reward": 1.375, | |
| "reward_mean": 1.375, | |
| "reward_std": 0.49871626496315, | |
| "rewards/accuracy_reward": 0.375, | |
| "rewards/format_reward": 1.0, | |
| "step": 448 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 242.75, | |
| "epoch": 0.449, | |
| "grad_norm": 4.810704708099365, | |
| "kl": 0.46875, | |
| "learning_rate": 5.51e-07, | |
| "loss": -0.0634, | |
| "reward": 1.71875, | |
| "reward_mean": 1.71875, | |
| "reward_std": 0.2651650309562683, | |
| "rewards/accuracy_reward": 0.78125, | |
| "rewards/format_reward": 0.9375, | |
| "step": 449 | |
| }, | |
| { | |
| "advantages": 2.60770320892334e-08, | |
| "completion_length": 161.0625, | |
| "epoch": 0.45, | |
| "grad_norm": 7.428137302398682, | |
| "kl": 0.3984375, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0819, | |
| "reward": 1.25, | |
| "reward_mean": 1.25, | |
| "reward_std": 0.4355512857437134, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 1.0, | |
| "step": 450 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 193.1875, | |
| "epoch": 0.451, | |
| "grad_norm": 0.0, | |
| "kl": 0.388671875, | |
| "learning_rate": 5.490000000000001e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 451 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-07, | |
| "completion_length": 235.75, | |
| "epoch": 0.452, | |
| "grad_norm": 3.380284547805786, | |
| "kl": 0.36328125, | |
| "learning_rate": 5.48e-07, | |
| "loss": -0.0072, | |
| "reward": 1.4375, | |
| "reward_mean": 1.4375, | |
| "reward_std": 0.0589255690574646, | |
| "rewards/accuracy_reward": 0.4375, | |
| "rewards/format_reward": 1.0, | |
| "step": 452 | |
| }, | |
| { | |
| "advantages": 7.078051567077637e-08, | |
| "completion_length": 177.75, | |
| "epoch": 0.453, | |
| "grad_norm": 4.489373683929443, | |
| "kl": 0.40234375, | |
| "learning_rate": 5.47e-07, | |
| "loss": -0.1424, | |
| "reward": 1.9375, | |
| "reward_mean": 1.9375, | |
| "reward_std": 0.0862581804394722, | |
| "rewards/accuracy_reward": 0.9375, | |
| "rewards/format_reward": 1.0, | |
| "step": 453 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 161.125, | |
| "epoch": 0.454, | |
| "grad_norm": 0.0, | |
| "kl": 0.40625, | |
| "learning_rate": 5.46e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 454 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-08, | |
| "completion_length": 179.1875, | |
| "epoch": 0.455, | |
| "grad_norm": 4.947906017303467, | |
| "kl": 0.419921875, | |
| "learning_rate": 5.45e-07, | |
| "loss": -0.1043, | |
| "reward": 1.6875, | |
| "reward_mean": 1.6875, | |
| "reward_std": 0.2587745785713196, | |
| "rewards/accuracy_reward": 0.6875, | |
| "rewards/format_reward": 1.0, | |
| "step": 455 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 155.6875, | |
| "epoch": 0.456, | |
| "grad_norm": 0.0, | |
| "kl": 0.4140625, | |
| "learning_rate": 5.44e-07, | |
| "loss": 0.0, | |
| "reward": 1.0, | |
| "reward_mean": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 456 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 136.625, | |
| "epoch": 0.457, | |
| "grad_norm": 0.0, | |
| "kl": 0.361328125, | |
| "learning_rate": 5.43e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 457 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 149.5625, | |
| "epoch": 0.458, | |
| "grad_norm": 0.0, | |
| "kl": 0.421875, | |
| "learning_rate": 5.420000000000001e-07, | |
| "loss": 0.0, | |
| "reward": 1.75, | |
| "reward_mean": 1.75, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 458 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 185.8125, | |
| "epoch": 0.459, | |
| "grad_norm": 0.0, | |
| "kl": 0.3984375, | |
| "learning_rate": 5.41e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 459 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 180.25, | |
| "epoch": 0.46, | |
| "grad_norm": 4.1631245613098145, | |
| "kl": 0.3671875, | |
| "learning_rate": 5.4e-07, | |
| "loss": 0.0139, | |
| "reward": 1.125, | |
| "reward_mean": 1.125, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/accuracy_reward": 0.125, | |
| "rewards/format_reward": 1.0, | |
| "step": 460 | |
| }, | |
| { | |
| "advantages": -1.2665987014770508e-07, | |
| "completion_length": 190.875, | |
| "epoch": 0.461, | |
| "grad_norm": 4.054723262786865, | |
| "kl": 0.37109375, | |
| "learning_rate": 5.39e-07, | |
| "loss": 0.0473, | |
| "reward": 1.9791667461395264, | |
| "reward_mean": 1.9791667461395264, | |
| "reward_std": 0.05892554670572281, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 461 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 192.8125, | |
| "epoch": 0.462, | |
| "grad_norm": 4.4658122062683105, | |
| "kl": 0.447265625, | |
| "learning_rate": 5.38e-07, | |
| "loss": 0.0566, | |
| "reward": 1.5833333730697632, | |
| "reward_mean": 1.5833333730697632, | |
| "reward_std": 0.26726123690605164, | |
| "rewards/accuracy_reward": 0.5833333134651184, | |
| "rewards/format_reward": 1.0, | |
| "step": 462 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 192.625, | |
| "epoch": 0.463, | |
| "grad_norm": 0.0, | |
| "kl": 0.400390625, | |
| "learning_rate": 5.37e-07, | |
| "loss": 0.0, | |
| "reward": 1.6666667461395264, | |
| "reward_mean": 1.6666667461395264, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.6666666269302368, | |
| "rewards/format_reward": 1.0, | |
| "step": 463 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 161.0, | |
| "epoch": 0.464, | |
| "grad_norm": 0.0, | |
| "kl": 0.36328125, | |
| "learning_rate": 5.36e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 464 | |
| }, | |
| { | |
| "advantages": 1.6391277313232422e-07, | |
| "completion_length": 180.875, | |
| "epoch": 0.465, | |
| "grad_norm": 4.05633544921875, | |
| "kl": 0.384765625, | |
| "learning_rate": 5.35e-07, | |
| "loss": 0.0124, | |
| "reward": 1.75, | |
| "reward_mean": 1.75, | |
| "reward_std": 0.08908708393573761, | |
| "rewards/accuracy_reward": 0.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 465 | |
| }, | |
| { | |
| "advantages": -7.82310962677002e-08, | |
| "completion_length": 189.9375, | |
| "epoch": 0.466, | |
| "grad_norm": 5.274670124053955, | |
| "kl": 0.3828125, | |
| "learning_rate": 5.34e-07, | |
| "loss": -0.021, | |
| "reward": 1.8333333730697632, | |
| "reward_mean": 1.8333333730697632, | |
| "reward_std": 0.2630348801612854, | |
| "rewards/accuracy_reward": 0.8958333730697632, | |
| "rewards/format_reward": 0.9375, | |
| "step": 466 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 219.5, | |
| "epoch": 0.467, | |
| "grad_norm": 0.0, | |
| "kl": 0.384765625, | |
| "learning_rate": 5.33e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 467 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 193.125, | |
| "epoch": 0.468, | |
| "grad_norm": 0.0, | |
| "kl": 0.369140625, | |
| "learning_rate": 5.32e-07, | |
| "loss": 0.0, | |
| "reward": 1.0, | |
| "reward_mean": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 468 | |
| }, | |
| { | |
| "advantages": -2.2351741790771484e-08, | |
| "completion_length": 181.75, | |
| "epoch": 0.469, | |
| "grad_norm": 5.394594669342041, | |
| "kl": 0.42578125, | |
| "learning_rate": 5.31e-07, | |
| "loss": 0.0852, | |
| "reward": 1.6458333730697632, | |
| "reward_mean": 1.6458333730697632, | |
| "reward_std": 0.4082317352294922, | |
| "rewards/accuracy_reward": 0.7708333730697632, | |
| "rewards/format_reward": 0.875, | |
| "step": 469 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 162.875, | |
| "epoch": 0.47, | |
| "grad_norm": 5.32183837890625, | |
| "kl": 0.40234375, | |
| "learning_rate": 5.3e-07, | |
| "loss": 0.1006, | |
| "reward": 1.9375, | |
| "reward_mean": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.9375, | |
| "rewards/format_reward": 1.0, | |
| "step": 470 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 171.9375, | |
| "epoch": 0.471, | |
| "grad_norm": 0.0, | |
| "kl": 0.40625, | |
| "learning_rate": 5.29e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 471 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 156.0, | |
| "epoch": 0.472, | |
| "grad_norm": 0.0, | |
| "kl": 0.390625, | |
| "learning_rate": 5.28e-07, | |
| "loss": 0.0, | |
| "reward": 1.75, | |
| "reward_mean": 1.75, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 472 | |
| }, | |
| { | |
| "advantages": -1.2665987014770508e-07, | |
| "completion_length": 153.8125, | |
| "epoch": 0.473, | |
| "grad_norm": 5.101055145263672, | |
| "kl": 0.42578125, | |
| "learning_rate": 5.27e-07, | |
| "loss": 0.0903, | |
| "reward": 1.9791667461395264, | |
| "reward_mean": 1.9791667461395264, | |
| "reward_std": 0.05892554670572281, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 473 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 186.9375, | |
| "epoch": 0.474, | |
| "grad_norm": 0.0, | |
| "kl": 0.41796875, | |
| "learning_rate": 5.26e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 474 | |
| }, | |
| { | |
| "advantages": 3.725290298461914e-09, | |
| "completion_length": 185.75, | |
| "epoch": 0.475, | |
| "grad_norm": 4.348298072814941, | |
| "kl": 0.390625, | |
| "learning_rate": 5.25e-07, | |
| "loss": 0.0827, | |
| "reward": 1.15625, | |
| "reward_mean": 1.15625, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/accuracy_reward": 0.15625, | |
| "rewards/format_reward": 1.0, | |
| "step": 475 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 199.0625, | |
| "epoch": 0.476, | |
| "grad_norm": 3.2782394886016846, | |
| "kl": 0.375, | |
| "learning_rate": 5.24e-07, | |
| "loss": -0.0661, | |
| "reward": 1.90625, | |
| "reward_mean": 1.90625, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/accuracy_reward": 0.90625, | |
| "rewards/format_reward": 1.0, | |
| "step": 476 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 170.6875, | |
| "epoch": 0.477, | |
| "grad_norm": 4.559285640716553, | |
| "kl": 0.40234375, | |
| "learning_rate": 5.23e-07, | |
| "loss": -0.0058, | |
| "reward": 1.8125, | |
| "reward_mean": 1.8125, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/accuracy_reward": 0.8125, | |
| "rewards/format_reward": 1.0, | |
| "step": 477 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-08, | |
| "completion_length": 182.0, | |
| "epoch": 0.478, | |
| "grad_norm": 3.9179017543792725, | |
| "kl": 0.44140625, | |
| "learning_rate": 5.22e-07, | |
| "loss": 0.0788, | |
| "reward": 1.8125, | |
| "reward_mean": 1.8125, | |
| "reward_std": 0.2587745785713196, | |
| "rewards/accuracy_reward": 0.8125, | |
| "rewards/format_reward": 1.0, | |
| "step": 478 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 174.9375, | |
| "epoch": 0.479, | |
| "grad_norm": 4.1898298263549805, | |
| "kl": 0.3984375, | |
| "learning_rate": 5.21e-07, | |
| "loss": -0.058, | |
| "reward": 1.875, | |
| "reward_mean": 1.875, | |
| "reward_std": 0.13363061845302582, | |
| "rewards/accuracy_reward": 0.875, | |
| "rewards/format_reward": 1.0, | |
| "step": 479 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 158.5625, | |
| "epoch": 0.48, | |
| "grad_norm": 3.0333094596862793, | |
| "kl": 0.3828125, | |
| "learning_rate": 5.2e-07, | |
| "loss": 0.0462, | |
| "reward": 1.78125, | |
| "reward_mean": 1.78125, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.78125, | |
| "rewards/format_reward": 1.0, | |
| "step": 480 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 142.5, | |
| "epoch": 0.481, | |
| "grad_norm": 0.0, | |
| "kl": 0.39453125, | |
| "learning_rate": 5.19e-07, | |
| "loss": 0.0, | |
| "reward": 1.25, | |
| "reward_mean": 1.25, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 1.0, | |
| "step": 481 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 162.3125, | |
| "epoch": 0.482, | |
| "grad_norm": 0.0, | |
| "kl": 0.4375, | |
| "learning_rate": 5.18e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 482 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 152.4375, | |
| "epoch": 0.483, | |
| "grad_norm": 4.092982292175293, | |
| "kl": 0.3984375, | |
| "learning_rate": 5.17e-07, | |
| "loss": -0.008, | |
| "reward": 1.71875, | |
| "reward_mean": 1.71875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.71875, | |
| "rewards/format_reward": 1.0, | |
| "step": 483 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 175.5, | |
| "epoch": 0.484, | |
| "grad_norm": 0.0, | |
| "kl": 0.40625, | |
| "learning_rate": 5.16e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 484 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 166.8125, | |
| "epoch": 0.485, | |
| "grad_norm": 0.0, | |
| "kl": 0.4375, | |
| "learning_rate": 5.149999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 485 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-08, | |
| "completion_length": 151.8125, | |
| "epoch": 0.486, | |
| "grad_norm": 5.26322078704834, | |
| "kl": 0.41015625, | |
| "learning_rate": 5.14e-07, | |
| "loss": -0.0091, | |
| "reward": 1.6875, | |
| "reward_mean": 1.6875, | |
| "reward_std": 0.2587745785713196, | |
| "rewards/accuracy_reward": 0.6875, | |
| "rewards/format_reward": 1.0, | |
| "step": 486 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 182.5625, | |
| "epoch": 0.487, | |
| "grad_norm": 0.0, | |
| "kl": 0.42578125, | |
| "learning_rate": 5.13e-07, | |
| "loss": 0.0, | |
| "reward": 1.25, | |
| "reward_mean": 1.25, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 1.0, | |
| "step": 487 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 158.125, | |
| "epoch": 0.488, | |
| "grad_norm": 0.0, | |
| "kl": 0.44140625, | |
| "learning_rate": 5.12e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 488 | |
| }, | |
| { | |
| "advantages": 1.4901161193847656e-08, | |
| "completion_length": 202.3125, | |
| "epoch": 0.489, | |
| "grad_norm": 6.207299709320068, | |
| "kl": 0.33984375, | |
| "learning_rate": 5.11e-07, | |
| "loss": -0.1083, | |
| "reward": 1.59375, | |
| "reward_mean": 1.59375, | |
| "reward_std": 0.2041158676147461, | |
| "rewards/accuracy_reward": 0.59375, | |
| "rewards/format_reward": 1.0, | |
| "step": 489 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 167.625, | |
| "epoch": 0.49, | |
| "grad_norm": 3.2399141788482666, | |
| "kl": 0.6015625, | |
| "learning_rate": 5.1e-07, | |
| "loss": -0.042, | |
| "reward": 1.4375, | |
| "reward_mean": 1.4375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.4375, | |
| "rewards/format_reward": 1.0, | |
| "step": 490 | |
| }, | |
| { | |
| "advantages": 3.203749656677246e-07, | |
| "completion_length": 148.0625, | |
| "epoch": 0.491, | |
| "grad_norm": 4.004068851470947, | |
| "kl": 0.48046875, | |
| "learning_rate": 5.09e-07, | |
| "loss": 0.0066, | |
| "reward": 1.8125, | |
| "reward_mean": 1.8125, | |
| "reward_std": 0.058925580233335495, | |
| "rewards/accuracy_reward": 0.8125, | |
| "rewards/format_reward": 1.0, | |
| "step": 491 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 189.375, | |
| "epoch": 0.492, | |
| "grad_norm": 0.0, | |
| "kl": 0.40234375, | |
| "learning_rate": 5.079999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 492 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 157.5625, | |
| "epoch": 0.493, | |
| "grad_norm": 0.0, | |
| "kl": 0.40234375, | |
| "learning_rate": 5.07e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 493 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 145.6875, | |
| "epoch": 0.494, | |
| "grad_norm": 0.0, | |
| "kl": 0.494140625, | |
| "learning_rate": 5.06e-07, | |
| "loss": 0.0, | |
| "reward": 1.25, | |
| "reward_mean": 1.25, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 1.0, | |
| "step": 494 | |
| }, | |
| { | |
| "advantages": 1.6391277313232422e-07, | |
| "completion_length": 151.25, | |
| "epoch": 0.495, | |
| "grad_norm": 4.36698579788208, | |
| "kl": 0.427734375, | |
| "learning_rate": 5.049999999999999e-07, | |
| "loss": -0.0001, | |
| "reward": 1.75, | |
| "reward_mean": 1.75, | |
| "reward_std": 0.0890870913863182, | |
| "rewards/accuracy_reward": 0.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 495 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 179.875, | |
| "epoch": 0.496, | |
| "grad_norm": 4.543258190155029, | |
| "kl": 0.3828125, | |
| "learning_rate": 5.04e-07, | |
| "loss": -0.0217, | |
| "reward": 1.84375, | |
| "reward_mean": 1.84375, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/accuracy_reward": 0.84375, | |
| "rewards/format_reward": 1.0, | |
| "step": 496 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 188.625, | |
| "epoch": 0.497, | |
| "grad_norm": 0.0, | |
| "kl": 0.41796875, | |
| "learning_rate": 5.03e-07, | |
| "loss": 0.0, | |
| "reward": 1.25, | |
| "reward_mean": 1.25, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 1.0, | |
| "step": 497 | |
| }, | |
| { | |
| "advantages": -1.4901161193847656e-08, | |
| "completion_length": 148.8125, | |
| "epoch": 0.498, | |
| "grad_norm": 4.038569450378418, | |
| "kl": 0.4609375, | |
| "learning_rate": 5.02e-07, | |
| "loss": -0.0576, | |
| "reward": 1.9375, | |
| "reward_mean": 1.9375, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/accuracy_reward": 0.9375, | |
| "rewards/format_reward": 1.0, | |
| "step": 498 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 129.25, | |
| "epoch": 0.499, | |
| "grad_norm": 0.0, | |
| "kl": 0.42578125, | |
| "learning_rate": 5.009999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 499 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 148.125, | |
| "epoch": 0.5, | |
| "grad_norm": 5.02844762802124, | |
| "kl": 0.4453125, | |
| "learning_rate": 5e-07, | |
| "loss": -0.0852, | |
| "reward": 1.0625, | |
| "reward_mean": 1.0625, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.0625, | |
| "rewards/format_reward": 1.0, | |
| "step": 500 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 1000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |