diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,78033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 45.11194029850746, + "eval_steps": 500, + "global_step": 6000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 197.55555725097656, + "epoch": 0.007462686567164179, + "grad_norm": 0.5631081961063045, + "learning_rate": 3.7313432835820896e-10, + "loss": -0.0004, + "reward": 0.5, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.5, + "epoch": 0.014925373134328358, + "grad_norm": 0.5038824401687312, + "learning_rate": 7.462686567164179e-10, + "loss": 0.0043, + "reward": 0.3055555522441864, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 2 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.38888549804688, + "epoch": 0.022388059701492536, + "grad_norm": 1.0134161995696207, + "learning_rate": 1.1194029850746268e-09, + "loss": -0.0079, + "reward": 0.6111111044883728, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 3 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.72222900390625, + "epoch": 0.029850746268656716, + "grad_norm": 0.8103940303804281, + "learning_rate": 1.4925373134328358e-09, + "loss": 0.0077, + "reward": 0.4166666567325592, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 4 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.72222900390625, + "epoch": 0.03731343283582089, + "grad_norm": 1.2603892692308314, + "learning_rate": 1.8656716417910446e-09, + "loss": -0.0186, + "reward": 0.3611111044883728, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 5 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.72222900390625, + "epoch": 0.04477611940298507, + "grad_norm": 0.8824953975666499, + "learning_rate": 2.2388059701492537e-09, + "loss": -0.0302, + "reward": 0.5, + "reward_std": 0.36771121621131897, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.86111450195312, + "epoch": 0.05223880597014925, + "grad_norm": 0.8003647841785024, + "learning_rate": 2.6119402985074627e-09, + "loss": 0.0038, + "reward": 0.4166666567325592, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 7 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.9166717529297, + "epoch": 0.05970149253731343, + "grad_norm": 0.9393410861724302, + "learning_rate": 2.9850746268656717e-09, + "loss": 0.0015, + "reward": 0.472222238779068, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 8 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.5833282470703, + "epoch": 0.06716417910447761, + "grad_norm": 0.42494207336785655, + "learning_rate": 3.3582089552238803e-09, + "loss": -0.0006, + "reward": 0.3888888955116272, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 9 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.5, + "epoch": 0.07462686567164178, + "grad_norm": 0.779541101455777, + "learning_rate": 3.731343283582089e-09, + "loss": 0.0002, + "reward": 0.5277777910232544, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 10 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.4166717529297, + "epoch": 0.08208955223880597, + "grad_norm": 0.5024152503000584, + "learning_rate": 4.104477611940298e-09, + "loss": 0.0033, + "reward": 0.5833333134651184, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 11 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.4166717529297, + "epoch": 0.08955223880597014, + "grad_norm": 1.2594784062286446, + "learning_rate": 4.477611940298507e-09, + "loss": 0.0049, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 12 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.86111450195312, + "epoch": 0.09701492537313433, + "grad_norm": 0.7928989227276712, + "learning_rate": 4.850746268656716e-09, + "loss": 0.005, + "reward": 0.5277777910232544, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 13 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.0833282470703, + "epoch": 0.1044776119402985, + "grad_norm": 1.4534540407685688, + "learning_rate": 5.223880597014925e-09, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 14 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.5833282470703, + "epoch": 0.11194029850746269, + "grad_norm": 0.7999321503728164, + "learning_rate": 5.5970149253731335e-09, + "loss": 0.0295, + "reward": 0.5833333134651184, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 15 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.6666717529297, + "epoch": 0.11940298507462686, + "grad_norm": 0.9889772278492106, + "learning_rate": 5.970149253731343e-09, + "loss": 0.0023, + "reward": 0.5555555820465088, + "reward_std": 0.47882235050201416, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 16 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.36111450195312, + "epoch": 0.12686567164179105, + "grad_norm": 0.7616569924906974, + "learning_rate": 6.3432835820895516e-09, + "loss": -0.0012, + "reward": 0.472222238779068, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 17 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.69444274902344, + "epoch": 0.13432835820895522, + "grad_norm": 1.7576858537592845, + "learning_rate": 6.7164179104477606e-09, + "loss": 0.046, + "reward": 0.5555555820465088, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 18 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.5, + "epoch": 0.1417910447761194, + "grad_norm": 0.4173145624258726, + "learning_rate": 7.08955223880597e-09, + "loss": 0.0004, + "reward": 0.472222238779068, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 19 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.44444274902344, + "epoch": 0.14925373134328357, + "grad_norm": 0.7521143101962191, + "learning_rate": 7.462686567164179e-09, + "loss": -0.0001, + "reward": 0.694444477558136, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 20 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.05555725097656, + "epoch": 0.15671641791044777, + "grad_norm": 0.2664895694135841, + "learning_rate": 7.835820895522388e-09, + "loss": -0.0009, + "reward": 0.1944444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.1944444477558136, + "rewards/format_reward": 0.0, + "step": 21 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.0, + "epoch": 0.16417910447761194, + "grad_norm": 0.7514515174220607, + "learning_rate": 8.208955223880597e-09, + "loss": 0.0036, + "reward": 0.5833333134651184, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 22 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.7777862548828, + "epoch": 0.17164179104477612, + "grad_norm": 0.5107932855212023, + "learning_rate": 8.582089552238806e-09, + "loss": -0.0031, + "reward": 0.4166666567325592, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 23 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.1666717529297, + "epoch": 0.1791044776119403, + "grad_norm": 0.49475419087206207, + "learning_rate": 8.955223880597015e-09, + "loss": -0.0002, + "reward": 0.5833333134651184, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 24 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.72222900390625, + "epoch": 0.1865671641791045, + "grad_norm": 0.9739785670881288, + "learning_rate": 9.328358208955222e-09, + "loss": -0.0086, + "reward": 0.4444444477558136, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 25 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.44444274902344, + "epoch": 0.19402985074626866, + "grad_norm": 0.7222336513253671, + "learning_rate": 9.701492537313433e-09, + "loss": -0.0023, + "reward": 0.5555555820465088, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 26 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.6666717529297, + "epoch": 0.20149253731343283, + "grad_norm": 1.3038961482249225, + "learning_rate": 1.0074626865671642e-08, + "loss": -0.0117, + "reward": 0.5833333134651184, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 27 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.22222900390625, + "epoch": 0.208955223880597, + "grad_norm": 1.1618891668602875, + "learning_rate": 1.044776119402985e-08, + "loss": 0.0062, + "reward": 0.6388888955116272, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 28 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.5, + "epoch": 0.21641791044776118, + "grad_norm": 1.23355504982039, + "learning_rate": 1.082089552238806e-08, + "loss": 0.0039, + "reward": 0.6388888955116272, + "reward_std": 0.4060778021812439, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 29 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.88888549804688, + "epoch": 0.22388059701492538, + "grad_norm": 0.9549599710512591, + "learning_rate": 1.1194029850746267e-08, + "loss": 0.0045, + "reward": 0.4444444477558136, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 30 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.69444274902344, + "epoch": 0.23134328358208955, + "grad_norm": 0.44456829702469963, + "learning_rate": 1.1567164179104476e-08, + "loss": 0.0006, + "reward": 0.3611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 31 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.5833282470703, + "epoch": 0.23880597014925373, + "grad_norm": 1.3012163138312864, + "learning_rate": 1.1940298507462687e-08, + "loss": 0.0048, + "reward": 0.5555555820465088, + "reward_std": 0.36771121621131897, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 32 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.63888549804688, + "epoch": 0.2462686567164179, + "grad_norm": 0.94548262808444, + "learning_rate": 1.2313432835820896e-08, + "loss": 0.0076, + "reward": 0.5833333134651184, + "reward_std": 0.3591167628765106, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 33 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.13888549804688, + "epoch": 0.2537313432835821, + "grad_norm": 0.6718461055776338, + "learning_rate": 1.2686567164179103e-08, + "loss": -0.0007, + "reward": 0.6111111044883728, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 34 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.1666717529297, + "epoch": 0.26119402985074625, + "grad_norm": 0.7487182083041063, + "learning_rate": 1.3059701492537312e-08, + "loss": 0.0009, + "reward": 0.6666666865348816, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 35 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.0, + "epoch": 0.26865671641791045, + "grad_norm": 1.11171240249481, + "learning_rate": 1.3432835820895521e-08, + "loss": 0.009, + "reward": 0.694444477558136, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 36 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.8333282470703, + "epoch": 0.27611940298507465, + "grad_norm": 0.472364324124779, + "learning_rate": 1.3805970149253732e-08, + "loss": 0.0157, + "reward": 0.7777777910232544, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 37 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.1666717529297, + "epoch": 0.2835820895522388, + "grad_norm": 0.5935518424146652, + "learning_rate": 1.417910447761194e-08, + "loss": -0.0006, + "reward": 0.3611111044883728, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 38 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.44444274902344, + "epoch": 0.291044776119403, + "grad_norm": 0.6870277385397338, + "learning_rate": 1.4552238805970148e-08, + "loss": -0.0031, + "reward": 0.4166666567325592, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 39 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.19444274902344, + "epoch": 0.29850746268656714, + "grad_norm": 1.8733429486844386, + "learning_rate": 1.4925373134328357e-08, + "loss": -0.0072, + "reward": 0.7222222089767456, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 40 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.63888549804688, + "epoch": 0.30597014925373134, + "grad_norm": 0.9196086950133876, + "learning_rate": 1.5298507462686568e-08, + "loss": -0.0084, + "reward": 0.5277777910232544, + "reward_std": 0.4060778021812439, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 41 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.8333282470703, + "epoch": 0.31343283582089554, + "grad_norm": 1.0768128442949263, + "learning_rate": 1.5671641791044775e-08, + "loss": -0.0072, + "reward": 0.4444444477558136, + "reward_std": 0.3505222499370575, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 42 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.5, + "epoch": 0.3208955223880597, + "grad_norm": 0.4201863640844439, + "learning_rate": 1.6044776119402983e-08, + "loss": 0.0011, + "reward": 0.4166666567325592, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 43 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.63888549804688, + "epoch": 0.3283582089552239, + "grad_norm": 0.46268134989961557, + "learning_rate": 1.6417910447761193e-08, + "loss": 0.0, + "reward": 0.6111111044883728, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 44 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.30555725097656, + "epoch": 0.3358208955223881, + "grad_norm": 1.0137822399595888, + "learning_rate": 1.67910447761194e-08, + "loss": 0.0177, + "reward": 0.2777777910232544, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.2777777910232544, + "rewards/format_reward": 0.0, + "step": 45 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.7777862548828, + "epoch": 0.34328358208955223, + "grad_norm": 0.3888334754177038, + "learning_rate": 1.716417910447761e-08, + "loss": -0.0004, + "reward": 0.5555555820465088, + "reward_std": 0.25660011172294617, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 46 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.6666717529297, + "epoch": 0.35074626865671643, + "grad_norm": 1.2446140567650146, + "learning_rate": 1.7537313432835822e-08, + "loss": 0.0014, + "reward": 0.4444444477558136, + "reward_std": 0.39748334884643555, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 47 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.88888549804688, + "epoch": 0.3582089552238806, + "grad_norm": 0.40785831862181066, + "learning_rate": 1.791044776119403e-08, + "loss": 0.0008, + "reward": 0.3055555522441864, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 48 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.36111450195312, + "epoch": 0.3656716417910448, + "grad_norm": 0.7205486289518113, + "learning_rate": 1.828358208955224e-08, + "loss": -0.0006, + "reward": 0.2777777910232544, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.2777777910232544, + "rewards/format_reward": 0.0, + "step": 49 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.36111450195312, + "epoch": 0.373134328358209, + "grad_norm": 0.8954099273526168, + "learning_rate": 1.8656716417910444e-08, + "loss": 0.0015, + "reward": 0.5833333134651184, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 50 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.94444274902344, + "epoch": 0.3805970149253731, + "grad_norm": 0.5363104646247921, + "learning_rate": 1.9029850746268655e-08, + "loss": -0.001, + "reward": 0.3888888955116272, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 51 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.72222900390625, + "epoch": 0.3880597014925373, + "grad_norm": 0.36121351418153913, + "learning_rate": 1.9402985074626865e-08, + "loss": -0.0033, + "reward": 0.5277777910232544, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 52 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.94444274902344, + "epoch": 0.39552238805970147, + "grad_norm": 1.0088070110620406, + "learning_rate": 1.9776119402985073e-08, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 53 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.86111450195312, + "epoch": 0.40298507462686567, + "grad_norm": 1.1624876823251244, + "learning_rate": 2.0149253731343283e-08, + "loss": -0.0229, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 54 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.25, + "epoch": 0.41044776119402987, + "grad_norm": 0.8027374957845462, + "learning_rate": 2.052238805970149e-08, + "loss": -0.0007, + "reward": 0.3888888955116272, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 55 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.13888549804688, + "epoch": 0.417910447761194, + "grad_norm": 1.3372544444256025, + "learning_rate": 2.08955223880597e-08, + "loss": 0.0197, + "reward": 0.5, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 56 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.72222900390625, + "epoch": 0.4253731343283582, + "grad_norm": 0.5498831625052575, + "learning_rate": 2.1268656716417912e-08, + "loss": 0.0014, + "reward": 0.5555555820465088, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 57 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.22222900390625, + "epoch": 0.43283582089552236, + "grad_norm": 0.6015524726563576, + "learning_rate": 2.164179104477612e-08, + "loss": 0.0102, + "reward": 0.3055555522441864, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 58 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.36111450195312, + "epoch": 0.44029850746268656, + "grad_norm": 0.9822789952685754, + "learning_rate": 2.2014925373134327e-08, + "loss": -0.0064, + "reward": 0.694444477558136, + "reward_std": 0.3591167628765106, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 59 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.0277862548828, + "epoch": 0.44776119402985076, + "grad_norm": 1.5316833298300077, + "learning_rate": 2.2388059701492534e-08, + "loss": 0.0001, + "reward": 0.5277777910232544, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.80555725097656, + "epoch": 0.4552238805970149, + "grad_norm": 0.4521828495097594, + "learning_rate": 2.2761194029850745e-08, + "loss": 0.0017, + "reward": 0.3611111044883728, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 61 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.05555725097656, + "epoch": 0.4626865671641791, + "grad_norm": 0.47868111772870336, + "learning_rate": 2.3134328358208952e-08, + "loss": -0.0012, + "reward": 0.5833333134651184, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 62 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.5277862548828, + "epoch": 0.4701492537313433, + "grad_norm": 1.7393411076975418, + "learning_rate": 2.3507462686567163e-08, + "loss": 0.0001, + "reward": 0.7777777910232544, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 63 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.5833282470703, + "epoch": 0.47761194029850745, + "grad_norm": 1.1221527782279836, + "learning_rate": 2.3880597014925373e-08, + "loss": 0.0042, + "reward": 0.6388888955116272, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 64 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.0833282470703, + "epoch": 0.48507462686567165, + "grad_norm": 1.7044572598668821, + "learning_rate": 2.425373134328358e-08, + "loss": 0.0085, + "reward": 0.5, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 65 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.19444274902344, + "epoch": 0.4925373134328358, + "grad_norm": 2.4780161687842264, + "learning_rate": 2.462686567164179e-08, + "loss": -0.0015, + "reward": 0.4444444477558136, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 66 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.4166717529297, + "epoch": 0.5, + "grad_norm": 1.5126126790183594, + "learning_rate": 2.5e-08, + "loss": -0.0035, + "reward": 0.5833333134651184, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 67 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.3333282470703, + "epoch": 0.5074626865671642, + "grad_norm": 0.4016817433284776, + "learning_rate": 2.5373134328358206e-08, + "loss": 0.0005, + "reward": 0.3333333432674408, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 68 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.75, + "epoch": 0.5149253731343284, + "grad_norm": 1.2452173989405944, + "learning_rate": 2.5746268656716417e-08, + "loss": 0.0494, + "reward": 0.6666666865348816, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 69 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.08333587646484, + "epoch": 0.5223880597014925, + "grad_norm": 0.4205474717974056, + "learning_rate": 2.6119402985074624e-08, + "loss": 0.0004, + "reward": 0.7222222089767456, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 70 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.86111450195312, + "epoch": 0.5298507462686567, + "grad_norm": 0.5777207477933705, + "learning_rate": 2.6492537313432835e-08, + "loss": -0.0005, + "reward": 0.7222222089767456, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 71 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.36111450195312, + "epoch": 0.5373134328358209, + "grad_norm": 1.1334233916791134, + "learning_rate": 2.6865671641791042e-08, + "loss": -0.0027, + "reward": 0.5, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 72 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.13888549804688, + "epoch": 0.5447761194029851, + "grad_norm": 0.9303616561457156, + "learning_rate": 2.7238805970149253e-08, + "loss": 0.0013, + "reward": 0.8611111044883728, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 73 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.9166717529297, + "epoch": 0.5522388059701493, + "grad_norm": 1.167517967167216, + "learning_rate": 2.7611940298507464e-08, + "loss": 0.0034, + "reward": 0.3055555522441864, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 74 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.05555725097656, + "epoch": 0.5597014925373134, + "grad_norm": 0.22491206214017717, + "learning_rate": 2.798507462686567e-08, + "loss": 0.0003, + "reward": 0.472222238779068, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 75 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.47222900390625, + "epoch": 0.5671641791044776, + "grad_norm": 1.2780135505285333, + "learning_rate": 2.835820895522388e-08, + "loss": -0.0139, + "reward": 0.5555555820465088, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 76 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.2777862548828, + "epoch": 0.5746268656716418, + "grad_norm": 0.7319175327049431, + "learning_rate": 2.8731343283582086e-08, + "loss": 0.0001, + "reward": 0.5, + "reward_std": 0.4318612813949585, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 77 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.75, + "epoch": 0.582089552238806, + "grad_norm": 0.9270266853734483, + "learning_rate": 2.9104477611940296e-08, + "loss": -0.0202, + "reward": 0.5277777910232544, + "reward_std": 0.3591167628765106, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 78 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.55555725097656, + "epoch": 0.5895522388059702, + "grad_norm": 1.2320898403916178, + "learning_rate": 2.9477611940298504e-08, + "loss": -0.0086, + "reward": 0.6666666865348816, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 79 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.69444274902344, + "epoch": 0.5970149253731343, + "grad_norm": 0.9600111757660571, + "learning_rate": 2.9850746268656714e-08, + "loss": 0.0249, + "reward": 0.3888888955116272, + "reward_std": 0.4318612813949585, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 80 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.0, + "epoch": 0.6044776119402985, + "grad_norm": 1.3844001129953967, + "learning_rate": 3.022388059701492e-08, + "loss": 0.0003, + "reward": 0.694444477558136, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 81 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.38888549804688, + "epoch": 0.6119402985074627, + "grad_norm": 0.37620215380502214, + "learning_rate": 3.0597014925373136e-08, + "loss": -0.0023, + "reward": 0.3888888955116272, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 82 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.0, + "epoch": 0.6194029850746269, + "grad_norm": 0.5457168241322372, + "learning_rate": 3.097014925373134e-08, + "loss": -0.0004, + "reward": 0.6666666865348816, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 83 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.8333282470703, + "epoch": 0.6268656716417911, + "grad_norm": 0.48597967206483, + "learning_rate": 3.134328358208955e-08, + "loss": -0.0016, + "reward": 0.5277777910232544, + "reward_std": 0.2949666976928711, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 84 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.55555725097656, + "epoch": 0.6343283582089553, + "grad_norm": 0.9937342130029043, + "learning_rate": 3.1716417910447764e-08, + "loss": 0.0046, + "reward": 0.5277777910232544, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 85 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.8333282470703, + "epoch": 0.6417910447761194, + "grad_norm": 0.7164461340299342, + "learning_rate": 3.2089552238805965e-08, + "loss": 0.0009, + "reward": 0.5, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 86 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.5833282470703, + "epoch": 0.6492537313432836, + "grad_norm": 0.9292515185344566, + "learning_rate": 3.246268656716418e-08, + "loss": -0.0002, + "reward": 0.5833333134651184, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 87 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.47222900390625, + "epoch": 0.6567164179104478, + "grad_norm": 2.1836686984184466, + "learning_rate": 3.2835820895522386e-08, + "loss": 0.0113, + "reward": 0.5277777910232544, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 88 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.5277862548828, + "epoch": 0.664179104477612, + "grad_norm": 0.839237853756395, + "learning_rate": 3.32089552238806e-08, + "loss": -0.0021, + "reward": 0.5833333134651184, + "reward_std": 0.3591167628765106, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 89 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.61111450195312, + "epoch": 0.6716417910447762, + "grad_norm": 0.4512541235664152, + "learning_rate": 3.35820895522388e-08, + "loss": -0.0002, + "reward": 0.5, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 90 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.2777862548828, + "epoch": 0.6791044776119403, + "grad_norm": 0.8609518088129657, + "learning_rate": 3.395522388059701e-08, + "loss": -0.0022, + "reward": 0.5555555820465088, + "reward_std": 0.47882235050201416, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 91 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.36111450195312, + "epoch": 0.6865671641791045, + "grad_norm": 1.9687677485492001, + "learning_rate": 3.432835820895522e-08, + "loss": 0.0034, + "reward": 0.472222238779068, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 92 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.97222900390625, + "epoch": 0.6940298507462687, + "grad_norm": 1.0623121357684706, + "learning_rate": 3.470149253731343e-08, + "loss": 0.0103, + "reward": 0.8055555820465088, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 93 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.5277862548828, + "epoch": 0.7014925373134329, + "grad_norm": 1.0940002439164866, + "learning_rate": 3.5074626865671644e-08, + "loss": 0.0033, + "reward": 0.5277777910232544, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 94 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.5, + "epoch": 0.7089552238805971, + "grad_norm": 0.780793150064705, + "learning_rate": 3.5447761194029845e-08, + "loss": -0.0036, + "reward": 0.5, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 95 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.30555725097656, + "epoch": 0.7164179104477612, + "grad_norm": 1.2759536349295197, + "learning_rate": 3.582089552238806e-08, + "loss": 0.0635, + "reward": 0.4166666567325592, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 96 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.94444274902344, + "epoch": 0.7238805970149254, + "grad_norm": 1.006271870985759, + "learning_rate": 3.6194029850746266e-08, + "loss": 0.0038, + "reward": 0.694444477558136, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 97 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.2777862548828, + "epoch": 0.7313432835820896, + "grad_norm": 0.5595659175765553, + "learning_rate": 3.656716417910448e-08, + "loss": -0.0014, + "reward": 0.3333333432674408, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 98 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.69444274902344, + "epoch": 0.7388059701492538, + "grad_norm": 0.6454756984319348, + "learning_rate": 3.694029850746269e-08, + "loss": 0.0078, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 99 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.72222900390625, + "epoch": 0.746268656716418, + "grad_norm": 0.984173728741099, + "learning_rate": 3.731343283582089e-08, + "loss": 0.0018, + "reward": 0.472222238779068, + "reward_std": 0.42326679825782776, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.75, + "epoch": 0.753731343283582, + "grad_norm": 0.34482971775748483, + "learning_rate": 3.76865671641791e-08, + "loss": 0.0, + "reward": 0.4166666567325592, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 101 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.86111450195312, + "epoch": 0.7611940298507462, + "grad_norm": 0.3799280072420946, + "learning_rate": 3.805970149253731e-08, + "loss": -0.0002, + "reward": 0.6666666865348816, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 102 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.80555725097656, + "epoch": 0.7686567164179104, + "grad_norm": 0.9141860679623273, + "learning_rate": 3.8432835820895523e-08, + "loss": 0.028, + "reward": 0.472222238779068, + "reward_std": 0.3591167628765106, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 103 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.88888549804688, + "epoch": 0.7761194029850746, + "grad_norm": 1.0008747788195913, + "learning_rate": 3.880597014925373e-08, + "loss": -0.0014, + "reward": 0.472222238779068, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 104 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.86111450195312, + "epoch": 0.7835820895522388, + "grad_norm": 0.6354606331052948, + "learning_rate": 3.917910447761194e-08, + "loss": 0.0007, + "reward": 0.5555555820465088, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 105 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.5277862548828, + "epoch": 0.7910447761194029, + "grad_norm": 1.3873456178478731, + "learning_rate": 3.9552238805970145e-08, + "loss": 0.0009, + "reward": 0.694444477558136, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 106 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.22222900390625, + "epoch": 0.7985074626865671, + "grad_norm": 0.20069139389840554, + "learning_rate": 3.992537313432836e-08, + "loss": -0.0002, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 107 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.30555725097656, + "epoch": 0.8059701492537313, + "grad_norm": 1.2230955752129038, + "learning_rate": 4.029850746268657e-08, + "loss": -0.0077, + "reward": 0.6111111044883728, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 108 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.8333282470703, + "epoch": 0.8134328358208955, + "grad_norm": 1.3693560970759273, + "learning_rate": 4.0671641791044774e-08, + "loss": 0.0201, + "reward": 0.5277777910232544, + "reward_std": 0.3591167628765106, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 109 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.61111450195312, + "epoch": 0.8208955223880597, + "grad_norm": 0.2787400328159651, + "learning_rate": 4.104477611940298e-08, + "loss": 0.0001, + "reward": 0.7777777910232544, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 110 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.86111450195312, + "epoch": 0.8283582089552238, + "grad_norm": 0.6384693893751411, + "learning_rate": 4.141791044776119e-08, + "loss": -0.0104, + "reward": 0.4444444477558136, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 111 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.22222900390625, + "epoch": 0.835820895522388, + "grad_norm": 0.7052688389983904, + "learning_rate": 4.17910447761194e-08, + "loss": 0.0024, + "reward": 0.3611111044883728, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 112 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.38888549804688, + "epoch": 0.8432835820895522, + "grad_norm": 0.7137904599259128, + "learning_rate": 4.216417910447761e-08, + "loss": -0.0002, + "reward": 0.5555555820465088, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 113 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.72222900390625, + "epoch": 0.8507462686567164, + "grad_norm": 0.6142688539481657, + "learning_rate": 4.2537313432835824e-08, + "loss": 0.003, + "reward": 0.5277777910232544, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 114 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.11111450195312, + "epoch": 0.8582089552238806, + "grad_norm": 0.5892242525412239, + "learning_rate": 4.2910447761194025e-08, + "loss": 0.0002, + "reward": 0.3333333432674408, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 115 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.25, + "epoch": 0.8656716417910447, + "grad_norm": 0.7675142970059907, + "learning_rate": 4.328358208955224e-08, + "loss": -0.0036, + "reward": 0.472222238779068, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 116 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.9166717529297, + "epoch": 0.8731343283582089, + "grad_norm": 0.47016244296290627, + "learning_rate": 4.3656716417910446e-08, + "loss": 0.006, + "reward": 0.2777777910232544, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.2777777910232544, + "rewards/format_reward": 0.0, + "step": 117 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.69444274902344, + "epoch": 0.8805970149253731, + "grad_norm": 0.945607391095875, + "learning_rate": 4.4029850746268654e-08, + "loss": -0.0083, + "reward": 0.4444444477558136, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 118 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.22222900390625, + "epoch": 0.8880597014925373, + "grad_norm": 1.3058361685902509, + "learning_rate": 4.440298507462686e-08, + "loss": -0.0085, + "reward": 0.5555555820465088, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 119 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.36111450195312, + "epoch": 0.8955223880597015, + "grad_norm": 1.0054552550231002, + "learning_rate": 4.477611940298507e-08, + "loss": -0.0015, + "reward": 0.4166666567325592, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.86111450195312, + "epoch": 0.9029850746268657, + "grad_norm": 0.5455748468210248, + "learning_rate": 4.514925373134328e-08, + "loss": -0.0059, + "reward": 0.6388888955116272, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 121 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.30555725097656, + "epoch": 0.9104477611940298, + "grad_norm": 0.4418641245043197, + "learning_rate": 4.552238805970149e-08, + "loss": 0.0, + "reward": 0.75, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 122 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.75, + "epoch": 0.917910447761194, + "grad_norm": 1.38088451467929, + "learning_rate": 4.5895522388059704e-08, + "loss": -0.0292, + "reward": 0.5, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 123 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.80555725097656, + "epoch": 0.9253731343283582, + "grad_norm": 0.34146686411904353, + "learning_rate": 4.6268656716417904e-08, + "loss": -0.0007, + "reward": 0.4166666567325592, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 124 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.19444274902344, + "epoch": 0.9328358208955224, + "grad_norm": 1.8596592587171634, + "learning_rate": 4.664179104477612e-08, + "loss": -0.0023, + "reward": 0.694444477558136, + "reward_std": 0.4702278673648834, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 125 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.77777862548828, + "epoch": 0.9402985074626866, + "grad_norm": 4.546501605862894, + "learning_rate": 4.7014925373134326e-08, + "loss": -0.0027, + "reward": 0.3888888955116272, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 126 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.7777862548828, + "epoch": 0.9477611940298507, + "grad_norm": 0.5813408165850864, + "learning_rate": 4.738805970149253e-08, + "loss": 0.0003, + "reward": 0.5833333134651184, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 127 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.0, + "epoch": 0.9552238805970149, + "grad_norm": 0.7231031759259423, + "learning_rate": 4.776119402985075e-08, + "loss": 0.0, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 128 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.25, + "epoch": 0.9626865671641791, + "grad_norm": 0.7728192657499862, + "learning_rate": 4.813432835820895e-08, + "loss": 0.0027, + "reward": 0.472222238779068, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 129 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.63888549804688, + "epoch": 0.9701492537313433, + "grad_norm": 0.5477065589651177, + "learning_rate": 4.850746268656716e-08, + "loss": -0.0021, + "reward": 0.472222238779068, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 130 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.0833282470703, + "epoch": 0.9776119402985075, + "grad_norm": 0.7457399292840936, + "learning_rate": 4.888059701492537e-08, + "loss": -0.001, + "reward": 0.472222238779068, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 131 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.1666717529297, + "epoch": 0.9850746268656716, + "grad_norm": 0.41035086694671996, + "learning_rate": 4.925373134328358e-08, + "loss": -0.0016, + "reward": 0.2222222238779068, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.2222222238779068, + "rewards/format_reward": 0.0, + "step": 132 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.4166717529297, + "epoch": 0.9925373134328358, + "grad_norm": 0.8582675958041729, + "learning_rate": 4.962686567164179e-08, + "loss": -0.0137, + "reward": 0.5277777910232544, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 133 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.38888549804688, + "epoch": 1.007462686567164, + "grad_norm": 2.799329535399177, + "learning_rate": 5e-08, + "loss": 0.0043, + "reward": 0.694444477558136, + "reward_std": 0.4060778021812439, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 134 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.55555725097656, + "epoch": 1.0149253731343284, + "grad_norm": 0.7201606187158188, + "learning_rate": 5.0373134328358205e-08, + "loss": 0.001, + "reward": 0.5, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 135 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.2777862548828, + "epoch": 1.0223880597014925, + "grad_norm": 0.6209799621030211, + "learning_rate": 5.074626865671641e-08, + "loss": -0.003, + "reward": 0.3888888955116272, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 136 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.63888549804688, + "epoch": 1.0298507462686568, + "grad_norm": 2.271939694399643, + "learning_rate": 5.1119402985074626e-08, + "loss": -0.0021, + "reward": 0.5277777910232544, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 137 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.5277862548828, + "epoch": 1.037313432835821, + "grad_norm": 2.5651451362471818, + "learning_rate": 5.1492537313432834e-08, + "loss": -0.0233, + "reward": 0.7222222089767456, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 138 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.55555725097656, + "epoch": 1.044776119402985, + "grad_norm": 0.23013949421132024, + "learning_rate": 5.186567164179104e-08, + "loss": 0.0007, + "reward": 0.5555555820465088, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 139 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.13888549804688, + "epoch": 1.0522388059701493, + "grad_norm": 0.7431444625931214, + "learning_rate": 5.223880597014925e-08, + "loss": 0.0007, + "reward": 0.3611111044883728, + "reward_std": 0.31215566396713257, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.97222900390625, + "epoch": 1.0597014925373134, + "grad_norm": 0.952162037935231, + "learning_rate": 5.261194029850746e-08, + "loss": 0.067, + "reward": 0.7222222089767456, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 141 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.1666717529297, + "epoch": 1.0671641791044777, + "grad_norm": 0.668430007105298, + "learning_rate": 5.298507462686567e-08, + "loss": -0.0027, + "reward": 0.4444444477558136, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 142 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.44444274902344, + "epoch": 1.0746268656716418, + "grad_norm": 0.5070402779688151, + "learning_rate": 5.3358208955223884e-08, + "loss": 0.0003, + "reward": 0.5277777910232544, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 143 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.88888549804688, + "epoch": 1.0820895522388059, + "grad_norm": 1.3260400235059706, + "learning_rate": 5.3731343283582085e-08, + "loss": 0.002, + "reward": 0.6111111044883728, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 144 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.11111450195312, + "epoch": 1.0895522388059702, + "grad_norm": 2.0750029257476372, + "learning_rate": 5.410447761194029e-08, + "loss": 0.0025, + "reward": 0.3333333432674408, + "reward_std": 0.36771121621131897, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 145 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.69444274902344, + "epoch": 1.0970149253731343, + "grad_norm": 1.7702863992388642, + "learning_rate": 5.4477611940298506e-08, + "loss": 0.0067, + "reward": 0.4444444477558136, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 146 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.97222900390625, + "epoch": 1.1044776119402986, + "grad_norm": 0.6415619108257816, + "learning_rate": 5.485074626865671e-08, + "loss": 0.0053, + "reward": 0.5, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 147 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.25, + "epoch": 1.1119402985074627, + "grad_norm": 1.087477302903533, + "learning_rate": 5.522388059701493e-08, + "loss": -0.0015, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 148 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.94444274902344, + "epoch": 1.1194029850746268, + "grad_norm": 0.9574844747280943, + "learning_rate": 5.559701492537313e-08, + "loss": -0.0018, + "reward": 0.3611111044883728, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 149 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.69444274902344, + "epoch": 1.126865671641791, + "grad_norm": 0.5314179817412721, + "learning_rate": 5.597014925373134e-08, + "loss": 0.0019, + "reward": 0.1944444477558136, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.1944444477558136, + "rewards/format_reward": 0.0, + "step": 150 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.0833282470703, + "epoch": 1.1343283582089552, + "grad_norm": 1.4317245351686614, + "learning_rate": 5.634328358208955e-08, + "loss": -0.0021, + "reward": 0.5833333134651184, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 151 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.5, + "epoch": 1.1417910447761195, + "grad_norm": 2.448707441552141, + "learning_rate": 5.671641791044776e-08, + "loss": -0.0028, + "reward": 0.472222238779068, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 152 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.13888549804688, + "epoch": 1.1492537313432836, + "grad_norm": 0.9431844976980338, + "learning_rate": 5.7089552238805964e-08, + "loss": -0.007, + "reward": 0.472222238779068, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 153 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.38888549804688, + "epoch": 1.1567164179104479, + "grad_norm": 3.47412191142643, + "learning_rate": 5.746268656716417e-08, + "loss": -0.011, + "reward": 0.4166666567325592, + "reward_std": 0.3591167628765106, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 154 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.8333282470703, + "epoch": 1.164179104477612, + "grad_norm": 1.615317966206997, + "learning_rate": 5.7835820895522385e-08, + "loss": 0.0141, + "reward": 0.5, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 155 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.63888549804688, + "epoch": 1.171641791044776, + "grad_norm": 0.5276358771628825, + "learning_rate": 5.820895522388059e-08, + "loss": 0.0029, + "reward": 0.5833333134651184, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 156 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.13888549804688, + "epoch": 1.1791044776119404, + "grad_norm": 0.9923411938598559, + "learning_rate": 5.8582089552238807e-08, + "loss": 0.0045, + "reward": 0.5555555820465088, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 157 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.69444274902344, + "epoch": 1.1865671641791045, + "grad_norm": 0.6066273213848927, + "learning_rate": 5.895522388059701e-08, + "loss": -0.0002, + "reward": 0.3333333432674408, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 158 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.7777862548828, + "epoch": 1.1940298507462686, + "grad_norm": 0.895317143492697, + "learning_rate": 5.932835820895522e-08, + "loss": -0.0043, + "reward": 0.6666666865348816, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 159 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.2777862548828, + "epoch": 1.2014925373134329, + "grad_norm": 0.9935900904575643, + "learning_rate": 5.970149253731343e-08, + "loss": -0.0072, + "reward": 0.694444477558136, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.7777862548828, + "epoch": 1.208955223880597, + "grad_norm": 0.46767529847130207, + "learning_rate": 6.007462686567164e-08, + "loss": 0.0003, + "reward": 0.5277777910232544, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 161 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.8333282470703, + "epoch": 1.2164179104477613, + "grad_norm": 0.7423074606892169, + "learning_rate": 6.044776119402984e-08, + "loss": 0.0005, + "reward": 0.694444477558136, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 162 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.2777862548828, + "epoch": 1.2238805970149254, + "grad_norm": 1.1406650871433337, + "learning_rate": 6.082089552238805e-08, + "loss": 0.006, + "reward": 0.8333333134651184, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 163 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.0, + "epoch": 1.2313432835820897, + "grad_norm": 0.4399999469097477, + "learning_rate": 6.119402985074627e-08, + "loss": -0.001, + "reward": 0.5277777910232544, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 164 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.22222900390625, + "epoch": 1.2388059701492538, + "grad_norm": 0.21897743948225007, + "learning_rate": 6.156716417910447e-08, + "loss": -0.0008, + "reward": 0.3888888955116272, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 165 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.1666717529297, + "epoch": 1.2462686567164178, + "grad_norm": 0.8560554462775481, + "learning_rate": 6.194029850746269e-08, + "loss": 0.0298, + "reward": 0.472222238779068, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 166 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.7777862548828, + "epoch": 1.2537313432835822, + "grad_norm": 1.3496669476915888, + "learning_rate": 6.23134328358209e-08, + "loss": 0.0178, + "reward": 0.3888888955116272, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 167 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.5, + "epoch": 1.2611940298507462, + "grad_norm": 0.9401923383038809, + "learning_rate": 6.26865671641791e-08, + "loss": 0.0007, + "reward": 0.5833333134651184, + "reward_std": 0.4060778021812439, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 168 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.22222900390625, + "epoch": 1.2686567164179103, + "grad_norm": 1.1302477402738056, + "learning_rate": 6.305970149253731e-08, + "loss": 0.0259, + "reward": 0.3888888955116272, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 169 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.72222900390625, + "epoch": 1.2761194029850746, + "grad_norm": 0.8803801354382012, + "learning_rate": 6.343283582089553e-08, + "loss": -0.0367, + "reward": 0.5277777910232544, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 170 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.88888549804688, + "epoch": 1.2835820895522387, + "grad_norm": 0.7982845968033118, + "learning_rate": 6.380597014925374e-08, + "loss": 0.0012, + "reward": 0.5833333134651184, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 171 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.72222900390625, + "epoch": 1.291044776119403, + "grad_norm": 0.7799988565966472, + "learning_rate": 6.417910447761193e-08, + "loss": -0.0133, + "reward": 0.75, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 172 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.88888549804688, + "epoch": 1.2985074626865671, + "grad_norm": 0.9605478795817078, + "learning_rate": 6.455223880597015e-08, + "loss": -0.0163, + "reward": 0.6666666865348816, + "reward_std": 0.47882235050201416, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 173 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.94444274902344, + "epoch": 1.3059701492537314, + "grad_norm": 0.4728146013547414, + "learning_rate": 6.492537313432836e-08, + "loss": 0.0013, + "reward": 0.6111111044883728, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 174 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.22222900390625, + "epoch": 1.3134328358208955, + "grad_norm": 1.1005197525617143, + "learning_rate": 6.529850746268655e-08, + "loss": -0.0046, + "reward": 0.6388888955116272, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 175 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.55555725097656, + "epoch": 1.3208955223880596, + "grad_norm": 1.1113306275559722, + "learning_rate": 6.567164179104477e-08, + "loss": 0.0075, + "reward": 0.4166666567325592, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 176 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.3333282470703, + "epoch": 1.328358208955224, + "grad_norm": 0.8246411040136626, + "learning_rate": 6.604477611940298e-08, + "loss": 0.0028, + "reward": 0.6666666865348816, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 177 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.88888549804688, + "epoch": 1.335820895522388, + "grad_norm": 1.1807930042629948, + "learning_rate": 6.64179104477612e-08, + "loss": 0.0049, + "reward": 0.5833333134651184, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 178 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.9166717529297, + "epoch": 1.3432835820895521, + "grad_norm": 1.3901824755777965, + "learning_rate": 6.67910447761194e-08, + "loss": 0.0246, + "reward": 0.4166666567325592, + "reward_std": 0.4060778021812439, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 179 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.11111450195312, + "epoch": 1.3507462686567164, + "grad_norm": 1.1631915721093633, + "learning_rate": 6.71641791044776e-08, + "loss": -0.0246, + "reward": 0.4166666567325592, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.80555725097656, + "epoch": 1.3582089552238805, + "grad_norm": 1.0917720501066932, + "learning_rate": 6.753731343283582e-08, + "loss": -0.0008, + "reward": 0.472222238779068, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 181 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.88888549804688, + "epoch": 1.3656716417910448, + "grad_norm": 0.6235878848592473, + "learning_rate": 6.791044776119402e-08, + "loss": 0.008, + "reward": 0.3611111044883728, + "reward_std": 0.24800565838813782, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 182 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.6666717529297, + "epoch": 1.373134328358209, + "grad_norm": 1.085376094454201, + "learning_rate": 6.828358208955224e-08, + "loss": -0.0033, + "reward": 0.5833333134651184, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 183 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.5277862548828, + "epoch": 1.3805970149253732, + "grad_norm": 0.7279844747607321, + "learning_rate": 6.865671641791045e-08, + "loss": -0.0121, + "reward": 0.5833333134651184, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 184 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.94444274902344, + "epoch": 1.3880597014925373, + "grad_norm": 0.6758320266138823, + "learning_rate": 6.902985074626865e-08, + "loss": -0.0156, + "reward": 0.5555555820465088, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 185 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.63888549804688, + "epoch": 1.3955223880597014, + "grad_norm": 0.6798397507804114, + "learning_rate": 6.940298507462686e-08, + "loss": 0.0004, + "reward": 0.5833333134651184, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 186 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.9166717529297, + "epoch": 1.4029850746268657, + "grad_norm": 0.7026913041445959, + "learning_rate": 6.977611940298507e-08, + "loss": -0.0028, + "reward": 0.5, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 187 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.25, + "epoch": 1.4104477611940298, + "grad_norm": 1.3502005047921115, + "learning_rate": 7.014925373134329e-08, + "loss": -0.0113, + "reward": 0.6666666865348816, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 188 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.5277862548828, + "epoch": 1.417910447761194, + "grad_norm": 0.7289078250937877, + "learning_rate": 7.05223880597015e-08, + "loss": -0.0067, + "reward": 0.4444444477558136, + "reward_std": 0.36771121621131897, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 189 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.2777862548828, + "epoch": 1.4253731343283582, + "grad_norm": 0.5425474797150042, + "learning_rate": 7.089552238805969e-08, + "loss": 0.0011, + "reward": 0.5277777910232544, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 190 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.9166717529297, + "epoch": 1.4328358208955223, + "grad_norm": 0.9913443373743331, + "learning_rate": 7.126865671641791e-08, + "loss": 0.0008, + "reward": 0.5555555820465088, + "reward_std": 0.3505222499370575, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 191 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.47222900390625, + "epoch": 1.4402985074626866, + "grad_norm": 1.098548562850326, + "learning_rate": 7.164179104477612e-08, + "loss": -0.0283, + "reward": 0.5277777910232544, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 192 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.38888549804688, + "epoch": 1.4477611940298507, + "grad_norm": 0.606922388896709, + "learning_rate": 7.201492537313432e-08, + "loss": 0.0015, + "reward": 0.3888888955116272, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 193 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.5, + "epoch": 1.455223880597015, + "grad_norm": 1.767507260164952, + "learning_rate": 7.238805970149253e-08, + "loss": 0.0032, + "reward": 0.5833333134651184, + "reward_std": 0.4060778319835663, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 194 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.0, + "epoch": 1.462686567164179, + "grad_norm": 0.630244479602814, + "learning_rate": 7.276119402985074e-08, + "loss": -0.0001, + "reward": 0.4444444477558136, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 195 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.19444274902344, + "epoch": 1.4701492537313432, + "grad_norm": 0.8922489841917355, + "learning_rate": 7.313432835820896e-08, + "loss": -0.0003, + "reward": 0.8055555820465088, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 196 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.05555725097656, + "epoch": 1.4776119402985075, + "grad_norm": 1.68671189587131, + "learning_rate": 7.350746268656715e-08, + "loss": 0.0057, + "reward": 0.3333333432674408, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 197 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.19444274902344, + "epoch": 1.4850746268656716, + "grad_norm": 0.7907022115820024, + "learning_rate": 7.388059701492537e-08, + "loss": 0.0153, + "reward": 0.5555555820465088, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 198 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.44444274902344, + "epoch": 1.4925373134328357, + "grad_norm": 1.020997830792687, + "learning_rate": 7.425373134328358e-08, + "loss": 0.0071, + "reward": 0.472222238779068, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 199 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.63888549804688, + "epoch": 1.5, + "grad_norm": 0.534717777643113, + "learning_rate": 7.462686567164178e-08, + "loss": -0.001, + "reward": 0.5555555820465088, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 200 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.2777862548828, + "epoch": 1.5074626865671643, + "grad_norm": 0.9036973194199066, + "learning_rate": 7.5e-08, + "loss": -0.0173, + "reward": 0.6388888955116272, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 201 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.61111450195312, + "epoch": 1.5149253731343284, + "grad_norm": 1.2446544211183803, + "learning_rate": 7.53731343283582e-08, + "loss": 0.0024, + "reward": 0.25, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.0, + "step": 202 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.11111450195312, + "epoch": 1.5223880597014925, + "grad_norm": 0.699871888118135, + "learning_rate": 7.574626865671642e-08, + "loss": 0.013, + "reward": 0.3333333432674408, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 203 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.0, + "epoch": 1.5298507462686568, + "grad_norm": 0.43869545093590684, + "learning_rate": 7.611940298507462e-08, + "loss": 0.0036, + "reward": 0.4166666567325592, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 204 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.22222137451172, + "epoch": 1.537313432835821, + "grad_norm": 1.1766203005329536, + "learning_rate": 7.649253731343283e-08, + "loss": -0.0024, + "reward": 0.472222238779068, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 205 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.86111450195312, + "epoch": 1.544776119402985, + "grad_norm": 1.0653313795984878, + "learning_rate": 7.686567164179105e-08, + "loss": 0.0005, + "reward": 0.4444444477558136, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 206 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.13888549804688, + "epoch": 1.5522388059701493, + "grad_norm": 0.4121697307914053, + "learning_rate": 7.723880597014925e-08, + "loss": -0.0086, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 207 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.94444274902344, + "epoch": 1.5597014925373134, + "grad_norm": 0.4564746516436987, + "learning_rate": 7.761194029850746e-08, + "loss": 0.0009, + "reward": 0.4166666567325592, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 208 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.44444274902344, + "epoch": 1.5671641791044775, + "grad_norm": 0.9454043909786164, + "learning_rate": 7.798507462686567e-08, + "loss": 0.0004, + "reward": 0.6111111044883728, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 209 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.47222900390625, + "epoch": 1.5746268656716418, + "grad_norm": 0.8066956577912607, + "learning_rate": 7.835820895522388e-08, + "loss": 0.0039, + "reward": 0.4444444477558136, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 210 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.63888549804688, + "epoch": 1.582089552238806, + "grad_norm": 0.7535617060282043, + "learning_rate": 7.873134328358208e-08, + "loss": 0.0009, + "reward": 0.472222238779068, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 211 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.25, + "epoch": 1.5895522388059702, + "grad_norm": 1.279347324092183, + "learning_rate": 7.910447761194029e-08, + "loss": -0.0099, + "reward": 0.3055555522441864, + "reward_std": 0.42326679825782776, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 212 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.2777862548828, + "epoch": 1.5970149253731343, + "grad_norm": 0.3411398808150024, + "learning_rate": 7.947761194029851e-08, + "loss": -0.0006, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 213 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.19444274902344, + "epoch": 1.6044776119402986, + "grad_norm": 1.1250991273903954, + "learning_rate": 7.985074626865672e-08, + "loss": 0.001, + "reward": 0.3055555522441864, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 214 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.75, + "epoch": 1.6119402985074627, + "grad_norm": 1.0984755107721347, + "learning_rate": 8.022388059701491e-08, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.4616333544254303, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 215 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.72222900390625, + "epoch": 1.6194029850746268, + "grad_norm": 0.9072729381930402, + "learning_rate": 8.059701492537313e-08, + "loss": -0.0004, + "reward": 0.3888888955116272, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 216 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.22222900390625, + "epoch": 1.626865671641791, + "grad_norm": 1.4771085028698203, + "learning_rate": 8.097014925373134e-08, + "loss": 0.0004, + "reward": 0.5833333134651184, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 217 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.88888549804688, + "epoch": 1.6343283582089554, + "grad_norm": 0.3973677613945088, + "learning_rate": 8.134328358208955e-08, + "loss": 0.0023, + "reward": 0.3055555522441864, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 218 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.7777862548828, + "epoch": 1.6417910447761193, + "grad_norm": 0.45051421689212956, + "learning_rate": 8.171641791044776e-08, + "loss": 0.001, + "reward": 0.4166666567325592, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 219 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.94444274902344, + "epoch": 1.6492537313432836, + "grad_norm": 1.2546619833088213, + "learning_rate": 8.208955223880596e-08, + "loss": 0.0157, + "reward": 0.5277777910232544, + "reward_std": 0.5515668988227844, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 220 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.86111450195312, + "epoch": 1.6567164179104479, + "grad_norm": 0.8752463441277103, + "learning_rate": 8.246268656716418e-08, + "loss": 0.0008, + "reward": 0.7222222089767456, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 221 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.47222900390625, + "epoch": 1.664179104477612, + "grad_norm": 2.2937663163077606, + "learning_rate": 8.283582089552238e-08, + "loss": 0.022, + "reward": 0.472222238779068, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 222 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.47222900390625, + "epoch": 1.671641791044776, + "grad_norm": 0.9119195425467292, + "learning_rate": 8.32089552238806e-08, + "loss": -0.0157, + "reward": 0.6111111044883728, + "reward_std": 0.414672315120697, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 223 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.11111450195312, + "epoch": 1.6791044776119404, + "grad_norm": 0.6361976770890705, + "learning_rate": 8.35820895522388e-08, + "loss": -0.0009, + "reward": 0.6111111044883728, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 224 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.44444274902344, + "epoch": 1.6865671641791045, + "grad_norm": 0.7092702252485561, + "learning_rate": 8.395522388059701e-08, + "loss": -0.0002, + "reward": 0.472222238779068, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 225 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.0, + "epoch": 1.6940298507462686, + "grad_norm": 0.9980688000419133, + "learning_rate": 8.432835820895522e-08, + "loss": 0.0031, + "reward": 0.3888888955116272, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 226 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.97222900390625, + "epoch": 1.7014925373134329, + "grad_norm": 1.0468548278081884, + "learning_rate": 8.470149253731343e-08, + "loss": -0.0003, + "reward": 0.5277777910232544, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 227 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.5833282470703, + "epoch": 1.7089552238805972, + "grad_norm": 1.1567118443153994, + "learning_rate": 8.507462686567165e-08, + "loss": -0.001, + "reward": 0.25, + "reward_std": 0.24800565838813782, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.0, + "step": 228 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.75, + "epoch": 1.716417910447761, + "grad_norm": 0.6196319073701687, + "learning_rate": 8.544776119402984e-08, + "loss": 0.0057, + "reward": 0.6111111044883728, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 229 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.0277862548828, + "epoch": 1.7238805970149254, + "grad_norm": 1.3333602840521872, + "learning_rate": 8.582089552238805e-08, + "loss": 0.0024, + "reward": 0.4166666567325592, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 230 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.97222900390625, + "epoch": 1.7313432835820897, + "grad_norm": 0.9937770480073449, + "learning_rate": 8.619402985074627e-08, + "loss": -0.0008, + "reward": 0.5, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 231 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.63888549804688, + "epoch": 1.7388059701492538, + "grad_norm": 0.6770689223333507, + "learning_rate": 8.656716417910448e-08, + "loss": 0.0009, + "reward": 0.472222238779068, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 232 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.2777862548828, + "epoch": 1.7462686567164178, + "grad_norm": 0.5315605669527473, + "learning_rate": 8.694029850746267e-08, + "loss": 0.001, + "reward": 0.472222238779068, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 233 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.69444274902344, + "epoch": 1.7537313432835822, + "grad_norm": 0.44536253369635703, + "learning_rate": 8.731343283582089e-08, + "loss": 0.0018, + "reward": 0.4166666567325592, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 234 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.0, + "epoch": 1.7611940298507462, + "grad_norm": 1.5675157412548217, + "learning_rate": 8.76865671641791e-08, + "loss": -0.0, + "reward": 0.5833333134651184, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 235 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.02777862548828, + "epoch": 1.7686567164179103, + "grad_norm": 0.983020053222029, + "learning_rate": 8.805970149253731e-08, + "loss": -0.0025, + "reward": 0.472222238779068, + "reward_std": 0.4702278673648834, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 236 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.72222900390625, + "epoch": 1.7761194029850746, + "grad_norm": 0.5567998903403788, + "learning_rate": 8.843283582089551e-08, + "loss": -0.0008, + "reward": 0.5833333134651184, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 237 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.77777862548828, + "epoch": 1.783582089552239, + "grad_norm": 1.2521900445944862, + "learning_rate": 8.880597014925372e-08, + "loss": 0.0127, + "reward": 0.5277777910232544, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 238 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.4166717529297, + "epoch": 1.7910447761194028, + "grad_norm": 0.7265715650287723, + "learning_rate": 8.917910447761194e-08, + "loss": 0.012, + "reward": 0.694444477558136, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 239 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.86111450195312, + "epoch": 1.7985074626865671, + "grad_norm": 1.1810165725606057, + "learning_rate": 8.955223880597014e-08, + "loss": -0.0108, + "reward": 0.3055555522441864, + "reward_std": 0.24800565838813782, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 240 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.4166717529297, + "epoch": 1.8059701492537314, + "grad_norm": 1.2362595341218703, + "learning_rate": 8.992537313432836e-08, + "loss": -0.0355, + "reward": 0.4444444477558136, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 241 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.2777862548828, + "epoch": 1.8134328358208955, + "grad_norm": 2.1366144556128766, + "learning_rate": 9.029850746268656e-08, + "loss": 0.0142, + "reward": 0.5277777910232544, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 242 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.5, + "epoch": 1.8208955223880596, + "grad_norm": 1.3927446006848412, + "learning_rate": 9.067164179104477e-08, + "loss": 0.0085, + "reward": 0.4166666567325592, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 243 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.97222900390625, + "epoch": 1.828358208955224, + "grad_norm": 1.643850993398488, + "learning_rate": 9.104477611940298e-08, + "loss": -0.029, + "reward": 0.5277777910232544, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 244 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.19444274902344, + "epoch": 1.835820895522388, + "grad_norm": 0.5232431803879087, + "learning_rate": 9.141791044776119e-08, + "loss": -0.0014, + "reward": 0.4444444477558136, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 245 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.4166717529297, + "epoch": 1.8432835820895521, + "grad_norm": 1.1497624281311731, + "learning_rate": 9.179104477611941e-08, + "loss": 0.018, + "reward": 0.4444444477558136, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 246 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.38888549804688, + "epoch": 1.8507462686567164, + "grad_norm": 0.510553682781255, + "learning_rate": 9.21641791044776e-08, + "loss": -0.0141, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 247 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.6666717529297, + "epoch": 1.8582089552238807, + "grad_norm": 0.3600874550277717, + "learning_rate": 9.253731343283581e-08, + "loss": 0.0004, + "reward": 0.4444444477558136, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 248 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.05555725097656, + "epoch": 1.8656716417910446, + "grad_norm": 0.5354943595795134, + "learning_rate": 9.291044776119403e-08, + "loss": 0.0008, + "reward": 0.5277777910232544, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 249 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.19444274902344, + "epoch": 1.873134328358209, + "grad_norm": 0.5766965321176167, + "learning_rate": 9.328358208955224e-08, + "loss": 0.0013, + "reward": 0.5, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 250 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.19444274902344, + "epoch": 1.8805970149253732, + "grad_norm": 0.5116714405554367, + "learning_rate": 9.365671641791044e-08, + "loss": -0.0009, + "reward": 0.6111111044883728, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 251 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.69444274902344, + "epoch": 1.8880597014925373, + "grad_norm": 1.2012328302790263, + "learning_rate": 9.402985074626865e-08, + "loss": 0.0122, + "reward": 0.6111111044883728, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.02777777798473835, + "step": 252 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.5833282470703, + "epoch": 1.8955223880597014, + "grad_norm": 1.728637859709939, + "learning_rate": 9.440298507462686e-08, + "loss": -0.0279, + "reward": 0.75, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 253 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.55555725097656, + "epoch": 1.9029850746268657, + "grad_norm": 0.6643294446995719, + "learning_rate": 9.477611940298507e-08, + "loss": -0.0019, + "reward": 0.5555555820465088, + "reward_std": 0.25660011172294617, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 254 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.69444274902344, + "epoch": 1.9104477611940298, + "grad_norm": 0.4631786900309044, + "learning_rate": 9.514925373134327e-08, + "loss": 0.0002, + "reward": 0.5277777910232544, + "reward_std": 0.2949666976928711, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 255 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.86111450195312, + "epoch": 1.917910447761194, + "grad_norm": 0.4299328312550903, + "learning_rate": 9.55223880597015e-08, + "loss": 0.0019, + "reward": 0.6111111044883728, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 256 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.0, + "epoch": 1.9253731343283582, + "grad_norm": 0.20689030063212102, + "learning_rate": 9.58955223880597e-08, + "loss": 0.0002, + "reward": 0.5, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 257 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.22222900390625, + "epoch": 1.9328358208955225, + "grad_norm": 0.7860875851093956, + "learning_rate": 9.62686567164179e-08, + "loss": -0.0073, + "reward": 0.7222222089767456, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 258 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.72222900390625, + "epoch": 1.9402985074626866, + "grad_norm": 0.5091104564423133, + "learning_rate": 9.664179104477612e-08, + "loss": 0.0004, + "reward": 0.5555555820465088, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 259 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.8333282470703, + "epoch": 1.9477611940298507, + "grad_norm": 0.4249460521221764, + "learning_rate": 9.701492537313432e-08, + "loss": 0.0006, + "reward": 0.694444477558136, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 260 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.13888549804688, + "epoch": 1.955223880597015, + "grad_norm": 0.9201447763448197, + "learning_rate": 9.738805970149254e-08, + "loss": -0.0003, + "reward": 0.5555555820465088, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 261 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.72222900390625, + "epoch": 1.962686567164179, + "grad_norm": 0.6950009286019873, + "learning_rate": 9.776119402985074e-08, + "loss": -0.0073, + "reward": 0.5, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 262 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.3333282470703, + "epoch": 1.9701492537313432, + "grad_norm": 1.0665160541365881, + "learning_rate": 9.813432835820895e-08, + "loss": 0.0008, + "reward": 0.6666666865348816, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 263 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.6666717529297, + "epoch": 1.9776119402985075, + "grad_norm": 0.9658722689679419, + "learning_rate": 9.850746268656717e-08, + "loss": 0.0007, + "reward": 0.6111111044883728, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 264 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.94444274902344, + "epoch": 1.9850746268656716, + "grad_norm": 0.6390131004647505, + "learning_rate": 9.888059701492536e-08, + "loss": -0.0032, + "reward": 0.5, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 265 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.1666717529297, + "epoch": 1.9925373134328357, + "grad_norm": 0.749879610405361, + "learning_rate": 9.925373134328358e-08, + "loss": -0.0214, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 266 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.5833282470703, + "epoch": 2.0074626865671643, + "grad_norm": 0.6255661602149306, + "learning_rate": 9.962686567164179e-08, + "loss": 0.0008, + "reward": 0.5833333134651184, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 267 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.94444274902344, + "epoch": 2.014925373134328, + "grad_norm": 0.2676407436972521, + "learning_rate": 1e-07, + "loss": 0.0004, + "reward": 0.472222238779068, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 268 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.6666717529297, + "epoch": 2.0223880597014925, + "grad_norm": 0.8319004491715992, + "learning_rate": 1.003731343283582e-07, + "loss": 0.0188, + "reward": 0.5555555820465088, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 269 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.1666717529297, + "epoch": 2.029850746268657, + "grad_norm": 0.9195085730097275, + "learning_rate": 1.0074626865671641e-07, + "loss": 0.0004, + "reward": 0.7222222089767456, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 270 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.86111450195312, + "epoch": 2.0373134328358207, + "grad_norm": 0.9342688195983914, + "learning_rate": 1.0111940298507463e-07, + "loss": 0.0173, + "reward": 0.5555555820465088, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 271 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.5277862548828, + "epoch": 2.044776119402985, + "grad_norm": 0.9458517666104721, + "learning_rate": 1.0149253731343282e-07, + "loss": 0.0379, + "reward": 0.6111111044883728, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 272 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.47222900390625, + "epoch": 2.0522388059701493, + "grad_norm": 0.7420252682289704, + "learning_rate": 1.0186567164179103e-07, + "loss": 0.012, + "reward": 0.4444444477558136, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 273 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.13888549804688, + "epoch": 2.0597014925373136, + "grad_norm": 0.9114027738213052, + "learning_rate": 1.0223880597014925e-07, + "loss": 0.0007, + "reward": 0.3888888955116272, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 274 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.0277862548828, + "epoch": 2.0671641791044775, + "grad_norm": 2.0666770805638763, + "learning_rate": 1.0261194029850746e-07, + "loss": 0.0645, + "reward": 0.6666666865348816, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 275 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.5, + "epoch": 2.074626865671642, + "grad_norm": 0.8355004177794821, + "learning_rate": 1.0298507462686567e-07, + "loss": -0.0016, + "reward": 0.6111111044883728, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 276 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.9166717529297, + "epoch": 2.082089552238806, + "grad_norm": 0.8444594234301113, + "learning_rate": 1.0335820895522387e-07, + "loss": -0.0, + "reward": 0.6111111044883728, + "reward_std": 0.36771121621131897, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 277 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.7777862548828, + "epoch": 2.08955223880597, + "grad_norm": 0.5061052228174773, + "learning_rate": 1.0373134328358208e-07, + "loss": -0.0024, + "reward": 0.6111111044883728, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 278 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.2777862548828, + "epoch": 2.0970149253731343, + "grad_norm": 0.9447851400922491, + "learning_rate": 1.041044776119403e-07, + "loss": 0.0035, + "reward": 0.5, + "reward_std": 0.414672315120697, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 279 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.61111450195312, + "epoch": 2.1044776119402986, + "grad_norm": 0.23593326273081974, + "learning_rate": 1.044776119402985e-07, + "loss": -0.0009, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 280 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.47222900390625, + "epoch": 2.111940298507463, + "grad_norm": 0.5940046368225097, + "learning_rate": 1.0485074626865672e-07, + "loss": 0.0007, + "reward": 0.5555555820465088, + "reward_std": 0.36771121621131897, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 281 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.75, + "epoch": 2.1194029850746268, + "grad_norm": 0.8922157577203326, + "learning_rate": 1.0522388059701492e-07, + "loss": -0.0059, + "reward": 0.3888888955116272, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 282 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.97222900390625, + "epoch": 2.126865671641791, + "grad_norm": 0.46495942842939536, + "learning_rate": 1.0559701492537312e-07, + "loss": 0.0004, + "reward": 0.5277777910232544, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 283 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.86111450195312, + "epoch": 2.1343283582089554, + "grad_norm": 0.47985875628113966, + "learning_rate": 1.0597014925373134e-07, + "loss": 0.0003, + "reward": 0.5, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 284 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.44444274902344, + "epoch": 2.1417910447761193, + "grad_norm": 0.8113688506670526, + "learning_rate": 1.0634328358208955e-07, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 285 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.0833282470703, + "epoch": 2.1492537313432836, + "grad_norm": 1.1479338677815163, + "learning_rate": 1.0671641791044777e-07, + "loss": 0.0002, + "reward": 0.5, + "reward_std": 0.19245009124279022, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 286 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.94444274902344, + "epoch": 2.156716417910448, + "grad_norm": 0.7302893110893656, + "learning_rate": 1.0708955223880596e-07, + "loss": -0.0002, + "reward": 0.5833333134651184, + "reward_std": 0.24800565838813782, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 287 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.94444274902344, + "epoch": 2.1641791044776117, + "grad_norm": 0.4132176470394809, + "learning_rate": 1.0746268656716417e-07, + "loss": -0.0004, + "reward": 0.5, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 288 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.44444274902344, + "epoch": 2.171641791044776, + "grad_norm": 0.5783161967085073, + "learning_rate": 1.0783582089552239e-07, + "loss": 0.0003, + "reward": 0.6111111044883728, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 289 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.58333587646484, + "epoch": 2.1791044776119404, + "grad_norm": 0.6871935944994115, + "learning_rate": 1.0820895522388058e-07, + "loss": 0.0009, + "reward": 0.6388888955116272, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 290 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.13888549804688, + "epoch": 2.1865671641791047, + "grad_norm": 1.5187459292686512, + "learning_rate": 1.085820895522388e-07, + "loss": -0.0211, + "reward": 0.25, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.0, + "step": 291 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.6666717529297, + "epoch": 2.1940298507462686, + "grad_norm": 0.9529511238428625, + "learning_rate": 1.0895522388059701e-07, + "loss": -0.0075, + "reward": 0.3055555522441864, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 292 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.0, + "epoch": 2.201492537313433, + "grad_norm": 0.5855580360272308, + "learning_rate": 1.0932835820895522e-07, + "loss": 0.0003, + "reward": 0.4444444477558136, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 293 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.22222900390625, + "epoch": 2.208955223880597, + "grad_norm": 0.931636753285744, + "learning_rate": 1.0970149253731343e-07, + "loss": -0.0056, + "reward": 0.4444444477558136, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 294 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.5, + "epoch": 2.216417910447761, + "grad_norm": 0.7239127627793425, + "learning_rate": 1.1007462686567163e-07, + "loss": 0.0014, + "reward": 0.5, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 295 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.25, + "epoch": 2.2238805970149254, + "grad_norm": 1.4213395822935952, + "learning_rate": 1.1044776119402985e-07, + "loss": 0.0099, + "reward": 0.7222222089767456, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 296 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.3333282470703, + "epoch": 2.2313432835820897, + "grad_norm": 0.6498344594444871, + "learning_rate": 1.1082089552238806e-07, + "loss": 0.0016, + "reward": 0.4444444477558136, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 297 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.5, + "epoch": 2.2388059701492535, + "grad_norm": 0.8877821117094779, + "learning_rate": 1.1119402985074626e-07, + "loss": -0.0064, + "reward": 0.5277777910232544, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 298 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.72222900390625, + "epoch": 2.246268656716418, + "grad_norm": 0.5845650873313559, + "learning_rate": 1.1156716417910448e-07, + "loss": -0.0005, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 299 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.2777862548828, + "epoch": 2.253731343283582, + "grad_norm": 0.8147123545730173, + "learning_rate": 1.1194029850746268e-07, + "loss": -0.0004, + "reward": 0.5833333134651184, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 300 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.0833282470703, + "epoch": 2.2611940298507465, + "grad_norm": 0.9037493575516136, + "learning_rate": 1.1231343283582088e-07, + "loss": 0.0009, + "reward": 0.3055555522441864, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 301 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.0277862548828, + "epoch": 2.2686567164179103, + "grad_norm": 0.848981789475869, + "learning_rate": 1.126865671641791e-07, + "loss": -0.015, + "reward": 0.5277777910232544, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 302 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.5277862548828, + "epoch": 2.2761194029850746, + "grad_norm": 0.9129287630236713, + "learning_rate": 1.130597014925373e-07, + "loss": 0.0031, + "reward": 0.4444444477558136, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 303 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.61111450195312, + "epoch": 2.283582089552239, + "grad_norm": 0.860897285472136, + "learning_rate": 1.1343283582089553e-07, + "loss": 0.0048, + "reward": 0.6666666865348816, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 304 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.0833282470703, + "epoch": 2.291044776119403, + "grad_norm": 1.3079404067875404, + "learning_rate": 1.1380597014925372e-07, + "loss": -0.0008, + "reward": 0.5277777910232544, + "reward_std": 0.31215566396713257, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 305 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.5, + "epoch": 2.298507462686567, + "grad_norm": 0.894486006625319, + "learning_rate": 1.1417910447761193e-07, + "loss": -0.0242, + "reward": 0.3611111044883728, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 306 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.6666717529297, + "epoch": 2.3059701492537314, + "grad_norm": 1.0003947015118688, + "learning_rate": 1.1455223880597015e-07, + "loss": -0.005, + "reward": 0.5555555820465088, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 307 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.22222900390625, + "epoch": 2.3134328358208958, + "grad_norm": 0.9864825839254732, + "learning_rate": 1.1492537313432834e-07, + "loss": -0.0122, + "reward": 0.7222222089767456, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 308 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.36111450195312, + "epoch": 2.3208955223880596, + "grad_norm": 0.6631516157258819, + "learning_rate": 1.1529850746268656e-07, + "loss": -0.0008, + "reward": 0.6388888955116272, + "reward_std": 0.2949666976928711, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 309 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.1666717529297, + "epoch": 2.328358208955224, + "grad_norm": 0.4330830352038163, + "learning_rate": 1.1567164179104477e-07, + "loss": 0.0012, + "reward": 0.4444444477558136, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 310 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.75, + "epoch": 2.3358208955223883, + "grad_norm": 3.3060770348421364, + "learning_rate": 1.1604477611940298e-07, + "loss": 0.0085, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 311 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.13888549804688, + "epoch": 2.343283582089552, + "grad_norm": 0.7269980234501239, + "learning_rate": 1.1641791044776119e-07, + "loss": 0.0007, + "reward": 0.3888888955116272, + "reward_std": 0.19245009124279022, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 312 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.1666717529297, + "epoch": 2.3507462686567164, + "grad_norm": 1.2758297991546488, + "learning_rate": 1.1679104477611939e-07, + "loss": 0.0019, + "reward": 0.472222238779068, + "reward_std": 0.2949666976928711, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 313 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.47222137451172, + "epoch": 2.3582089552238807, + "grad_norm": 0.739147970050267, + "learning_rate": 1.1716417910447761e-07, + "loss": -0.0014, + "reward": 0.5833333134651184, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 314 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.13888549804688, + "epoch": 2.3656716417910446, + "grad_norm": 4.5056939763099715, + "learning_rate": 1.1753731343283582e-07, + "loss": -0.062, + "reward": 0.4444444477558136, + "reward_std": 0.47882235050201416, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 315 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.2777862548828, + "epoch": 2.373134328358209, + "grad_norm": 0.6394332401365043, + "learning_rate": 1.1791044776119401e-07, + "loss": 0.0005, + "reward": 0.8888888955116272, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 316 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.44444274902344, + "epoch": 2.3805970149253732, + "grad_norm": 0.387304110866356, + "learning_rate": 1.1828358208955224e-07, + "loss": 0.0002, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 317 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.4166717529297, + "epoch": 2.388059701492537, + "grad_norm": 0.7343457914149706, + "learning_rate": 1.1865671641791044e-07, + "loss": -0.0045, + "reward": 0.4166666567325592, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 318 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.11111450195312, + "epoch": 2.3955223880597014, + "grad_norm": 0.8271925651252346, + "learning_rate": 1.1902985074626865e-07, + "loss": -0.0003, + "reward": 0.3055555522441864, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 319 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.5277862548828, + "epoch": 2.4029850746268657, + "grad_norm": 1.9285061635921048, + "learning_rate": 1.1940298507462686e-07, + "loss": 0.0245, + "reward": 0.4166666567325592, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 320 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.36111450195312, + "epoch": 2.41044776119403, + "grad_norm": 0.7207565043206827, + "learning_rate": 1.1977611940298506e-07, + "loss": 0.0003, + "reward": 0.3888888955116272, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 321 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.0277862548828, + "epoch": 2.417910447761194, + "grad_norm": 0.9008642856876821, + "learning_rate": 1.2014925373134327e-07, + "loss": -0.0072, + "reward": 0.6388888955116272, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 322 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.55555725097656, + "epoch": 2.425373134328358, + "grad_norm": 0.3777830065819705, + "learning_rate": 1.2052238805970148e-07, + "loss": 0.0008, + "reward": 0.7222222089767456, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 323 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.36111450195312, + "epoch": 2.4328358208955225, + "grad_norm": 0.9435307488225212, + "learning_rate": 1.208955223880597e-07, + "loss": 0.0157, + "reward": 0.3611111044883728, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 324 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.8333282470703, + "epoch": 2.4402985074626864, + "grad_norm": 0.7132004727500433, + "learning_rate": 1.2126865671641792e-07, + "loss": 0.0016, + "reward": 0.6666666865348816, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 325 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.9166717529297, + "epoch": 2.4477611940298507, + "grad_norm": 0.9835609148086228, + "learning_rate": 1.216417910447761e-07, + "loss": -0.0009, + "reward": 0.4166666567325592, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 326 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.19444274902344, + "epoch": 2.455223880597015, + "grad_norm": 0.8935361517972893, + "learning_rate": 1.220149253731343e-07, + "loss": 0.0086, + "reward": 0.6666666865348816, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 327 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.5833282470703, + "epoch": 2.4626865671641793, + "grad_norm": 0.9119389668897341, + "learning_rate": 1.2238805970149254e-07, + "loss": 0.0066, + "reward": 0.5, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 328 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.1666717529297, + "epoch": 2.470149253731343, + "grad_norm": 0.4755112412822099, + "learning_rate": 1.2276119402985075e-07, + "loss": 0.0033, + "reward": 0.694444477558136, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 329 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.5833282470703, + "epoch": 2.4776119402985075, + "grad_norm": 1.0380341005307838, + "learning_rate": 1.2313432835820893e-07, + "loss": 0.0295, + "reward": 0.472222238779068, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 330 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.72222900390625, + "epoch": 2.485074626865672, + "grad_norm": 1.6890954783354768, + "learning_rate": 1.2350746268656716e-07, + "loss": 0.0049, + "reward": 0.7777777910232544, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 331 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.5, + "epoch": 2.4925373134328357, + "grad_norm": 1.0470865475378264, + "learning_rate": 1.2388059701492537e-07, + "loss": 0.0012, + "reward": 0.5, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 332 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.5277862548828, + "epoch": 2.5, + "grad_norm": 0.5616184341535164, + "learning_rate": 1.2425373134328358e-07, + "loss": 0.0037, + "reward": 0.5833333134651184, + "reward_std": 0.24800565838813782, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 333 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.6666717529297, + "epoch": 2.5074626865671643, + "grad_norm": 0.6028919347347339, + "learning_rate": 1.246268656716418e-07, + "loss": 0.0009, + "reward": 0.4444444477558136, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 334 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.9166717529297, + "epoch": 2.5149253731343286, + "grad_norm": 0.817076212339797, + "learning_rate": 1.25e-07, + "loss": -0.0002, + "reward": 0.3333333432674408, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 335 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.5277862548828, + "epoch": 2.5223880597014925, + "grad_norm": 0.9209984304509901, + "learning_rate": 1.253731343283582e-07, + "loss": 0.0216, + "reward": 0.3888888955116272, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 336 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.1666717529297, + "epoch": 2.529850746268657, + "grad_norm": 0.5937234891920685, + "learning_rate": 1.257462686567164e-07, + "loss": -0.0155, + "reward": 0.3333333432674408, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 337 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.13888549804688, + "epoch": 2.5373134328358207, + "grad_norm": 2.8159875033340755, + "learning_rate": 1.2611940298507462e-07, + "loss": 0.0108, + "reward": 0.5833333134651184, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 338 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.0277862548828, + "epoch": 2.544776119402985, + "grad_norm": 1.2534143893501775, + "learning_rate": 1.2649253731343282e-07, + "loss": -0.0037, + "reward": 0.4166666567325592, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 339 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.08333587646484, + "epoch": 2.5522388059701493, + "grad_norm": 0.6085776982357631, + "learning_rate": 1.2686567164179106e-07, + "loss": -0.0008, + "reward": 0.472222238779068, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 340 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.47222900390625, + "epoch": 2.5597014925373136, + "grad_norm": 0.7216269301121704, + "learning_rate": 1.2723880597014924e-07, + "loss": 0.0009, + "reward": 0.5833333134651184, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 341 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.75, + "epoch": 2.5671641791044775, + "grad_norm": 0.6068037869135169, + "learning_rate": 1.2761194029850747e-07, + "loss": 0.0006, + "reward": 0.472222238779068, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 342 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.9166717529297, + "epoch": 2.574626865671642, + "grad_norm": 0.8124231821986206, + "learning_rate": 1.2798507462686568e-07, + "loss": 0.0004, + "reward": 0.75, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 343 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.6666717529297, + "epoch": 2.582089552238806, + "grad_norm": 0.5271663715334634, + "learning_rate": 1.2835820895522386e-07, + "loss": -0.0002, + "reward": 0.5833333134651184, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 344 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.8333282470703, + "epoch": 2.58955223880597, + "grad_norm": 1.6434945099018408, + "learning_rate": 1.287313432835821e-07, + "loss": 0.0013, + "reward": 0.5277777910232544, + "reward_std": 0.4702278673648834, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 345 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.16666412353516, + "epoch": 2.5970149253731343, + "grad_norm": 1.506210148786564, + "learning_rate": 1.291044776119403e-07, + "loss": -0.0273, + "reward": 0.5, + "reward_std": 0.3505222499370575, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 346 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.22222900390625, + "epoch": 2.6044776119402986, + "grad_norm": 1.3750684917148879, + "learning_rate": 1.2947761194029848e-07, + "loss": -0.0042, + "reward": 0.75, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 347 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.0, + "epoch": 2.611940298507463, + "grad_norm": 0.6885097491570175, + "learning_rate": 1.2985074626865672e-07, + "loss": 0.0044, + "reward": 0.3888888955116272, + "reward_std": 0.19245009124279022, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 348 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.5, + "epoch": 2.6194029850746268, + "grad_norm": 0.5471393976450394, + "learning_rate": 1.3022388059701492e-07, + "loss": 0.0074, + "reward": 0.3611111044883728, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 349 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.88888549804688, + "epoch": 2.626865671641791, + "grad_norm": 1.1327252067660243, + "learning_rate": 1.305970149253731e-07, + "loss": -0.0114, + "reward": 0.6666666865348816, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 350 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.88888549804688, + "epoch": 2.6343283582089554, + "grad_norm": 0.8643417008263201, + "learning_rate": 1.3097014925373134e-07, + "loss": 0.0002, + "reward": 0.5833333134651184, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 351 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.69444274902344, + "epoch": 2.6417910447761193, + "grad_norm": 2.019476340259835, + "learning_rate": 1.3134328358208955e-07, + "loss": -0.009, + "reward": 0.6666666865348816, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 352 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.13888549804688, + "epoch": 2.6492537313432836, + "grad_norm": 0.800302450151764, + "learning_rate": 1.3171641791044778e-07, + "loss": -0.0011, + "reward": 0.4444444477558136, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 353 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.75, + "epoch": 2.656716417910448, + "grad_norm": 0.438752380781134, + "learning_rate": 1.3208955223880596e-07, + "loss": 0.0012, + "reward": 0.4166666567325592, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 354 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.4166717529297, + "epoch": 2.664179104477612, + "grad_norm": 1.0777178109206198, + "learning_rate": 1.3246268656716417e-07, + "loss": -0.0132, + "reward": 0.3055555522441864, + "reward_std": 0.24800565838813782, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 355 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.61111450195312, + "epoch": 2.671641791044776, + "grad_norm": 0.6064904990531479, + "learning_rate": 1.328358208955224e-07, + "loss": 0.0003, + "reward": 0.5833333134651184, + "reward_std": 0.24800565838813782, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 356 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.1666717529297, + "epoch": 2.6791044776119404, + "grad_norm": 1.2193654372313107, + "learning_rate": 1.3320895522388058e-07, + "loss": 0.0247, + "reward": 0.5833333134651184, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 357 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.55555725097656, + "epoch": 2.6865671641791042, + "grad_norm": 0.9454697392060731, + "learning_rate": 1.335820895522388e-07, + "loss": 0.065, + "reward": 0.472222238779068, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 358 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.86111450195312, + "epoch": 2.6940298507462686, + "grad_norm": 0.9262544075223099, + "learning_rate": 1.3395522388059702e-07, + "loss": -0.0046, + "reward": 0.8333333134651184, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 359 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.75, + "epoch": 2.701492537313433, + "grad_norm": 0.8528278873952209, + "learning_rate": 1.343283582089552e-07, + "loss": -0.0157, + "reward": 0.3888888955116272, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 360 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.25, + "epoch": 2.708955223880597, + "grad_norm": 0.7636849685434622, + "learning_rate": 1.347014925373134e-07, + "loss": -0.0042, + "reward": 0.5, + "reward_std": 0.3505222499370575, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 361 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.38888549804688, + "epoch": 2.716417910447761, + "grad_norm": 1.3050764757789686, + "learning_rate": 1.3507462686567165e-07, + "loss": 0.0189, + "reward": 0.694444477558136, + "reward_std": 0.42326679825782776, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 362 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.7777862548828, + "epoch": 2.7238805970149254, + "grad_norm": 0.7976434483070527, + "learning_rate": 1.3544776119402985e-07, + "loss": -0.0082, + "reward": 0.3611111044883728, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 363 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.6666717529297, + "epoch": 2.7313432835820897, + "grad_norm": 1.0141880710308406, + "learning_rate": 1.3582089552238803e-07, + "loss": 0.0071, + "reward": 0.5277777910232544, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 364 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.75, + "epoch": 2.7388059701492535, + "grad_norm": 0.25455706080706125, + "learning_rate": 1.3619402985074627e-07, + "loss": 0.0007, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 365 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.72222900390625, + "epoch": 2.746268656716418, + "grad_norm": 0.6343217951894543, + "learning_rate": 1.3656716417910448e-07, + "loss": 0.0002, + "reward": 0.3611111044883728, + "reward_std": 0.31215566396713257, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 366 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.97222900390625, + "epoch": 2.753731343283582, + "grad_norm": 1.5657565500379413, + "learning_rate": 1.3694029850746268e-07, + "loss": -0.1062, + "reward": 0.5833333134651184, + "reward_std": 0.35911673307418823, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 367 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.88888549804688, + "epoch": 2.7611940298507465, + "grad_norm": 2.41218931150842, + "learning_rate": 1.373134328358209e-07, + "loss": 0.0032, + "reward": 0.6111111044883728, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 368 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.30555725097656, + "epoch": 2.7686567164179103, + "grad_norm": 0.7632949733429577, + "learning_rate": 1.376865671641791e-07, + "loss": 0.0079, + "reward": 0.3611111044883728, + "reward_std": 0.24800565838813782, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 369 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.88888549804688, + "epoch": 2.7761194029850746, + "grad_norm": 0.42941727596531776, + "learning_rate": 1.380597014925373e-07, + "loss": -0.0035, + "reward": 0.3888888955116272, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 370 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.47222900390625, + "epoch": 2.783582089552239, + "grad_norm": 0.5696465823496082, + "learning_rate": 1.384328358208955e-07, + "loss": 0.012, + "reward": 0.25, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.0, + "step": 371 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.25, + "epoch": 2.791044776119403, + "grad_norm": 0.0, + "learning_rate": 1.3880597014925372e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 372 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.63888549804688, + "epoch": 2.798507462686567, + "grad_norm": 0.9034874825505514, + "learning_rate": 1.3917910447761195e-07, + "loss": 0.0156, + "reward": 0.7222222089767456, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 373 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.0833282470703, + "epoch": 2.8059701492537314, + "grad_norm": 0.6101258637257191, + "learning_rate": 1.3955223880597013e-07, + "loss": -0.0001, + "reward": 0.5833333134651184, + "reward_std": 0.4060778319835663, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 374 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.44444274902344, + "epoch": 2.8134328358208958, + "grad_norm": 0.7381568104263113, + "learning_rate": 1.3992537313432834e-07, + "loss": -0.0002, + "reward": 0.6388888955116272, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 375 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.22222900390625, + "epoch": 2.8208955223880596, + "grad_norm": 1.0512286624443579, + "learning_rate": 1.4029850746268658e-07, + "loss": -0.0031, + "reward": 0.472222238779068, + "reward_std": 0.24800565838813782, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 376 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.88888549804688, + "epoch": 2.828358208955224, + "grad_norm": 0.8108467751680759, + "learning_rate": 1.4067164179104476e-07, + "loss": 0.0189, + "reward": 0.2777777910232544, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.2777777910232544, + "rewards/format_reward": 0.0, + "step": 377 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.30555725097656, + "epoch": 2.835820895522388, + "grad_norm": 0.8330860525713535, + "learning_rate": 1.41044776119403e-07, + "loss": 0.0077, + "reward": 0.4166666567325592, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 378 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.6388931274414, + "epoch": 2.843283582089552, + "grad_norm": 1.0627181910053614, + "learning_rate": 1.414179104477612e-07, + "loss": 0.0138, + "reward": 0.6388888955116272, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 379 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.94444274902344, + "epoch": 2.8507462686567164, + "grad_norm": 0.6874876933927447, + "learning_rate": 1.4179104477611938e-07, + "loss": -0.0001, + "reward": 0.8055555820465088, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 380 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.6666717529297, + "epoch": 2.8582089552238807, + "grad_norm": 0.5953540603752323, + "learning_rate": 1.421641791044776e-07, + "loss": -0.0001, + "reward": 0.5, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 381 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.2777862548828, + "epoch": 2.8656716417910446, + "grad_norm": 0.9006378867707123, + "learning_rate": 1.4253731343283582e-07, + "loss": -0.0059, + "reward": 0.472222238779068, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 382 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.05555725097656, + "epoch": 2.873134328358209, + "grad_norm": 3.6699488126889275, + "learning_rate": 1.4291044776119403e-07, + "loss": 0.0012, + "reward": 0.694444477558136, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 383 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.1666717529297, + "epoch": 2.8805970149253732, + "grad_norm": 1.0344504096218445, + "learning_rate": 1.4328358208955223e-07, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 384 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.88888549804688, + "epoch": 2.888059701492537, + "grad_norm": 0.9781445050890373, + "learning_rate": 1.4365671641791044e-07, + "loss": -0.008, + "reward": 0.5555555820465088, + "reward_std": 0.3333333432674408, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 385 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.8333282470703, + "epoch": 2.8955223880597014, + "grad_norm": 1.2517591311066179, + "learning_rate": 1.4402985074626865e-07, + "loss": -0.0119, + "reward": 0.5555555820465088, + "reward_std": 0.414672315120697, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 386 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.88888549804688, + "epoch": 2.9029850746268657, + "grad_norm": 0.6360643970245703, + "learning_rate": 1.4440298507462686e-07, + "loss": -0.0049, + "reward": 0.5, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 387 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.2777862548828, + "epoch": 2.91044776119403, + "grad_norm": 1.261827636362542, + "learning_rate": 1.4477611940298506e-07, + "loss": -0.0095, + "reward": 0.5, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 388 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.75, + "epoch": 2.917910447761194, + "grad_norm": 0.638239803844168, + "learning_rate": 1.451492537313433e-07, + "loss": 0.0011, + "reward": 0.5555555820465088, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 389 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.75, + "epoch": 2.925373134328358, + "grad_norm": 2.9926443056681484, + "learning_rate": 1.4552238805970148e-07, + "loss": 0.0008, + "reward": 0.4166666567325592, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 390 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.8333282470703, + "epoch": 2.9328358208955225, + "grad_norm": 0.9583274348525217, + "learning_rate": 1.4589552238805969e-07, + "loss": -0.0038, + "reward": 0.3611111044883728, + "reward_std": 0.3591167628765106, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 391 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.1666717529297, + "epoch": 2.9402985074626864, + "grad_norm": 0.397102276397263, + "learning_rate": 1.4626865671641792e-07, + "loss": -0.0004, + "reward": 0.4444444477558136, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 392 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.61111450195312, + "epoch": 2.9477611940298507, + "grad_norm": 0.5389844203175517, + "learning_rate": 1.4664179104477613e-07, + "loss": -0.0013, + "reward": 0.3888888955116272, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 393 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.11111450195312, + "epoch": 2.955223880597015, + "grad_norm": 1.2393715909863812, + "learning_rate": 1.470149253731343e-07, + "loss": 0.0021, + "reward": 0.3333333432674408, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 394 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.36111450195312, + "epoch": 2.9626865671641793, + "grad_norm": 0.5492102323075478, + "learning_rate": 1.4738805970149254e-07, + "loss": 0.0006, + "reward": 0.4444444477558136, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 395 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.72222900390625, + "epoch": 2.970149253731343, + "grad_norm": 0.6676257130236477, + "learning_rate": 1.4776119402985075e-07, + "loss": -0.0003, + "reward": 0.4444444477558136, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 396 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.61111450195312, + "epoch": 2.9776119402985075, + "grad_norm": 1.5118763678870824, + "learning_rate": 1.4813432835820893e-07, + "loss": -0.0018, + "reward": 0.5555555820465088, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 397 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.30555725097656, + "epoch": 2.9850746268656714, + "grad_norm": 0.8280685645026818, + "learning_rate": 1.4850746268656716e-07, + "loss": 0.0001, + "reward": 0.5555555820465088, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 398 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.1666717529297, + "epoch": 2.9925373134328357, + "grad_norm": 3.6619343589286153, + "learning_rate": 1.4888059701492537e-07, + "loss": 0.0006, + "reward": 0.5555555820465088, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 399 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.36111450195312, + "epoch": 3.0074626865671643, + "grad_norm": 0.5638286449959626, + "learning_rate": 1.4925373134328355e-07, + "loss": 0.005, + "reward": 0.6388888955116272, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 400 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.94444274902344, + "epoch": 3.014925373134328, + "grad_norm": 0.9566843934702487, + "learning_rate": 1.4962686567164179e-07, + "loss": 0.0138, + "reward": 0.5277777910232544, + "reward_std": 0.44045570492744446, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 401 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.30555725097656, + "epoch": 3.0223880597014925, + "grad_norm": 0.6191114555452076, + "learning_rate": 1.5e-07, + "loss": 0.0003, + "reward": 0.4166666567325592, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 402 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.8333282470703, + "epoch": 3.029850746268657, + "grad_norm": 0.9508991977187893, + "learning_rate": 1.5037313432835823e-07, + "loss": 0.0137, + "reward": 0.5277777910232544, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 403 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.7777862548828, + "epoch": 3.0373134328358207, + "grad_norm": 0.6921150053217313, + "learning_rate": 1.507462686567164e-07, + "loss": 0.0085, + "reward": 0.3611111044883728, + "reward_std": 0.24800565838813782, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 404 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.4166717529297, + "epoch": 3.044776119402985, + "grad_norm": 0.40882613004279583, + "learning_rate": 1.5111940298507462e-07, + "loss": -0.0007, + "reward": 0.4444444477558136, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 405 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.88888549804688, + "epoch": 3.0522388059701493, + "grad_norm": 1.9930651346718793, + "learning_rate": 1.5149253731343285e-07, + "loss": 0.0125, + "reward": 0.5833333134651184, + "reward_std": 0.4060778021812439, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 406 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.0277862548828, + "epoch": 3.0597014925373136, + "grad_norm": 0.901486847203177, + "learning_rate": 1.5186567164179103e-07, + "loss": 0.0118, + "reward": 0.3611111044883728, + "reward_std": 0.24800565838813782, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 407 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.2777862548828, + "epoch": 3.0671641791044775, + "grad_norm": 1.055893178028207, + "learning_rate": 1.5223880597014924e-07, + "loss": -0.0057, + "reward": 0.6388888955116272, + "reward_std": 0.4060778021812439, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 408 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.63888549804688, + "epoch": 3.074626865671642, + "grad_norm": 0.6612597893764923, + "learning_rate": 1.5261194029850747e-07, + "loss": 0.0168, + "reward": 0.25, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.0, + "step": 409 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.75, + "epoch": 3.082089552238806, + "grad_norm": 0.8834894735155102, + "learning_rate": 1.5298507462686565e-07, + "loss": 0.0012, + "reward": 0.472222238779068, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 410 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.30555725097656, + "epoch": 3.08955223880597, + "grad_norm": 0.8033408054814034, + "learning_rate": 1.5335820895522386e-07, + "loss": -0.0, + "reward": 0.3888888955116272, + "reward_std": 0.19245009124279022, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 411 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.94444274902344, + "epoch": 3.0970149253731343, + "grad_norm": 0.16373411407576366, + "learning_rate": 1.537313432835821e-07, + "loss": -0.0005, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 412 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.8888931274414, + "epoch": 3.1044776119402986, + "grad_norm": 0.5202301711699289, + "learning_rate": 1.5410447761194027e-07, + "loss": 0.0012, + "reward": 0.5277777910232544, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 413 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.5, + "epoch": 3.111940298507463, + "grad_norm": 1.205265004510679, + "learning_rate": 1.544776119402985e-07, + "loss": 0.008, + "reward": 0.6666666865348816, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 414 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.3333282470703, + "epoch": 3.1194029850746268, + "grad_norm": 1.5007768230932288, + "learning_rate": 1.5485074626865672e-07, + "loss": 0.0005, + "reward": 0.5277777910232544, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 415 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.47222900390625, + "epoch": 3.126865671641791, + "grad_norm": 0.5889081359450927, + "learning_rate": 1.5522388059701492e-07, + "loss": 0.0004, + "reward": 0.6388888955116272, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 416 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.3333282470703, + "epoch": 3.1343283582089554, + "grad_norm": 0.5341670311264196, + "learning_rate": 1.5559701492537313e-07, + "loss": -0.0003, + "reward": 0.5833333134651184, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 417 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.0, + "epoch": 3.1417910447761193, + "grad_norm": 0.6653896431197764, + "learning_rate": 1.5597014925373134e-07, + "loss": 0.0011, + "reward": 0.5, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 418 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.3333282470703, + "epoch": 3.1492537313432836, + "grad_norm": 1.0100196508825778, + "learning_rate": 1.5634328358208954e-07, + "loss": -0.0054, + "reward": 0.5555555820465088, + "reward_std": 0.39748334884643555, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 419 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.25, + "epoch": 3.156716417910448, + "grad_norm": 0.7782913618759119, + "learning_rate": 1.5671641791044775e-07, + "loss": -0.0014, + "reward": 0.472222238779068, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 420 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.47222900390625, + "epoch": 3.1641791044776117, + "grad_norm": 0.9283041402481265, + "learning_rate": 1.5708955223880596e-07, + "loss": 0.0197, + "reward": 0.3055555522441864, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 421 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.69444274902344, + "epoch": 3.171641791044776, + "grad_norm": 0.6373252602958563, + "learning_rate": 1.5746268656716417e-07, + "loss": -0.0002, + "reward": 0.6388888955116272, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 422 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.6666717529297, + "epoch": 3.1791044776119404, + "grad_norm": 0.5130521014538767, + "learning_rate": 1.5783582089552237e-07, + "loss": -0.0012, + "reward": 0.75, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 423 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.5833282470703, + "epoch": 3.1865671641791047, + "grad_norm": 0.622997133342563, + "learning_rate": 1.5820895522388058e-07, + "loss": -0.0001, + "reward": 0.6666666865348816, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 424 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.47222137451172, + "epoch": 3.1940298507462686, + "grad_norm": 1.4116483221695437, + "learning_rate": 1.5858208955223882e-07, + "loss": 0.0005, + "reward": 0.7222222089767456, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 425 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.0833282470703, + "epoch": 3.201492537313433, + "grad_norm": 0.5152049870348934, + "learning_rate": 1.5895522388059702e-07, + "loss": 0.0015, + "reward": 0.4444444477558136, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 426 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.19444274902344, + "epoch": 3.208955223880597, + "grad_norm": 0.9937123693814396, + "learning_rate": 1.593283582089552e-07, + "loss": 0.0016, + "reward": 0.5833333134651184, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 427 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.22222900390625, + "epoch": 3.216417910447761, + "grad_norm": 1.0178684590869829, + "learning_rate": 1.5970149253731344e-07, + "loss": 0.0002, + "reward": 0.4166666567325592, + "reward_std": 0.31215566396713257, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 428 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.69444274902344, + "epoch": 3.2238805970149254, + "grad_norm": 1.5175054503822238, + "learning_rate": 1.6007462686567164e-07, + "loss": -0.0007, + "reward": 0.3333333432674408, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 429 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.30555725097656, + "epoch": 3.2313432835820897, + "grad_norm": 0.6037039831187315, + "learning_rate": 1.6044776119402983e-07, + "loss": -0.0016, + "reward": 0.3888888955116272, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 430 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.47222900390625, + "epoch": 3.2388059701492535, + "grad_norm": 0.9517141306535732, + "learning_rate": 1.6082089552238806e-07, + "loss": 0.0018, + "reward": 0.4444444477558136, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 431 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.5, + "epoch": 3.246268656716418, + "grad_norm": 0.5993441711697698, + "learning_rate": 1.6119402985074627e-07, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 432 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.25, + "epoch": 3.253731343283582, + "grad_norm": 0.8593142386462923, + "learning_rate": 1.6156716417910445e-07, + "loss": 0.0015, + "reward": 0.5277777910232544, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 433 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.5277862548828, + "epoch": 3.2611940298507465, + "grad_norm": 0.638864288251366, + "learning_rate": 1.6194029850746268e-07, + "loss": 0.0023, + "reward": 0.4166666567325592, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 434 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.3333282470703, + "epoch": 3.2686567164179103, + "grad_norm": 1.7601592795620526, + "learning_rate": 1.623134328358209e-07, + "loss": 0.0247, + "reward": 0.5277777910232544, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 435 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.5277862548828, + "epoch": 3.2761194029850746, + "grad_norm": 0.7237025255583344, + "learning_rate": 1.626865671641791e-07, + "loss": 0.0067, + "reward": 0.5, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 436 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.55555725097656, + "epoch": 3.283582089552239, + "grad_norm": 1.3364408542103823, + "learning_rate": 1.630597014925373e-07, + "loss": 0.0351, + "reward": 0.6666666865348816, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 437 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.30555725097656, + "epoch": 3.291044776119403, + "grad_norm": 0.909944697529495, + "learning_rate": 1.634328358208955e-07, + "loss": 0.0146, + "reward": 0.5555555820465088, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 438 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.5833282470703, + "epoch": 3.298507462686567, + "grad_norm": 0.749784190441871, + "learning_rate": 1.6380597014925374e-07, + "loss": -0.0003, + "reward": 0.5277777910232544, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 439 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.55555725097656, + "epoch": 3.3059701492537314, + "grad_norm": 1.5364030707300307, + "learning_rate": 1.6417910447761193e-07, + "loss": 0.0373, + "reward": 0.5555555820465088, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 440 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.1666717529297, + "epoch": 3.3134328358208958, + "grad_norm": 0.44112767618163473, + "learning_rate": 1.6455223880597013e-07, + "loss": 0.0012, + "reward": 0.3055555522441864, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 441 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.0, + "epoch": 3.3208955223880596, + "grad_norm": 0.6131777083145812, + "learning_rate": 1.6492537313432837e-07, + "loss": -0.001, + "reward": 0.5277777910232544, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 442 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.2777862548828, + "epoch": 3.328358208955224, + "grad_norm": 1.376340956731594, + "learning_rate": 1.6529850746268655e-07, + "loss": 0.0066, + "reward": 0.5555555820465088, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 443 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.44444274902344, + "epoch": 3.3358208955223883, + "grad_norm": 0.6459684015289551, + "learning_rate": 1.6567164179104476e-07, + "loss": 0.0005, + "reward": 0.694444477558136, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 444 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.7777862548828, + "epoch": 3.343283582089552, + "grad_norm": 0.5729073759303793, + "learning_rate": 1.66044776119403e-07, + "loss": -0.0006, + "reward": 0.5833333134651184, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 445 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.58333587646484, + "epoch": 3.3507462686567164, + "grad_norm": 0.6124129430794577, + "learning_rate": 1.664179104477612e-07, + "loss": 0.0006, + "reward": 0.5, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 446 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.6666717529297, + "epoch": 3.3582089552238807, + "grad_norm": 0.43720661275751005, + "learning_rate": 1.6679104477611938e-07, + "loss": -0.001, + "reward": 0.472222238779068, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 447 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.3333282470703, + "epoch": 3.3656716417910446, + "grad_norm": 0.2817840254342188, + "learning_rate": 1.671641791044776e-07, + "loss": 0.0004, + "reward": 0.4166666567325592, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 448 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.75, + "epoch": 3.373134328358209, + "grad_norm": 0.3844096484438781, + "learning_rate": 1.6753731343283582e-07, + "loss": 0.0004, + "reward": 0.5, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 449 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.25, + "epoch": 3.3805970149253732, + "grad_norm": 0.5996677883951778, + "learning_rate": 1.6791044776119403e-07, + "loss": -0.0004, + "reward": 0.6388888955116272, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 450 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.94444274902344, + "epoch": 3.388059701492537, + "grad_norm": 4.242464012991042, + "learning_rate": 1.6828358208955223e-07, + "loss": -0.0009, + "reward": 0.5555555820465088, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 451 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.36111450195312, + "epoch": 3.3955223880597014, + "grad_norm": 0.4655449696269699, + "learning_rate": 1.6865671641791044e-07, + "loss": -0.0004, + "reward": 0.8055555820465088, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 452 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.94444274902344, + "epoch": 3.4029850746268657, + "grad_norm": 0.6378918178635415, + "learning_rate": 1.6902985074626865e-07, + "loss": -0.0017, + "reward": 0.6666666865348816, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 453 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.5833282470703, + "epoch": 3.41044776119403, + "grad_norm": 0.6893618707900386, + "learning_rate": 1.6940298507462686e-07, + "loss": -0.001, + "reward": 0.5, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 454 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.5277862548828, + "epoch": 3.417910447761194, + "grad_norm": 0.47694583630827436, + "learning_rate": 1.6977611940298506e-07, + "loss": 0.0009, + "reward": 0.5, + "reward_std": 0.3505222499370575, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 455 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.4166717529297, + "epoch": 3.425373134328358, + "grad_norm": 2.5569055697504526, + "learning_rate": 1.701492537313433e-07, + "loss": 0.0008, + "reward": 0.6111111044883728, + "reward_std": 0.36771121621131897, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 456 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.0, + "epoch": 3.4328358208955225, + "grad_norm": 0.7762245861849785, + "learning_rate": 1.7052238805970148e-07, + "loss": 0.002, + "reward": 0.6111111044883728, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 457 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.72222900390625, + "epoch": 3.4402985074626864, + "grad_norm": 0.26025696978548135, + "learning_rate": 1.7089552238805968e-07, + "loss": 0.0003, + "reward": 0.8333333134651184, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 458 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.5277862548828, + "epoch": 3.4477611940298507, + "grad_norm": 0.3630124121658344, + "learning_rate": 1.7126865671641792e-07, + "loss": 0.0002, + "reward": 0.4166666567325592, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 459 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.94444274902344, + "epoch": 3.455223880597015, + "grad_norm": 0.4810418056183535, + "learning_rate": 1.716417910447761e-07, + "loss": -0.001, + "reward": 0.5277777910232544, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 460 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.88888549804688, + "epoch": 3.4626865671641793, + "grad_norm": 0.5272084658974728, + "learning_rate": 1.7201492537313433e-07, + "loss": 0.0011, + "reward": 0.3888888955116272, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 461 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.2777862548828, + "epoch": 3.470149253731343, + "grad_norm": 0.7698109141213716, + "learning_rate": 1.7238805970149254e-07, + "loss": 0.0008, + "reward": 0.5, + "reward_std": 0.414672315120697, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 462 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.22222900390625, + "epoch": 3.4776119402985075, + "grad_norm": 1.720287080914198, + "learning_rate": 1.7276119402985072e-07, + "loss": -0.005, + "reward": 0.4166666567325592, + "reward_std": 0.3591167628765106, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 463 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.69444274902344, + "epoch": 3.485074626865672, + "grad_norm": 0.568433931887591, + "learning_rate": 1.7313432835820896e-07, + "loss": 0.0036, + "reward": 0.6388888955116272, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 464 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.83333587646484, + "epoch": 3.4925373134328357, + "grad_norm": 0.45025275461775827, + "learning_rate": 1.7350746268656716e-07, + "loss": 0.0008, + "reward": 0.3611111044883728, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 465 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.86111450195312, + "epoch": 3.5, + "grad_norm": 0.994437954459213, + "learning_rate": 1.7388059701492534e-07, + "loss": 0.0149, + "reward": 0.6111111044883728, + "reward_std": 0.41467228531837463, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 466 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.11111450195312, + "epoch": 3.5074626865671643, + "grad_norm": 0.7092555027784698, + "learning_rate": 1.7425373134328358e-07, + "loss": -0.0012, + "reward": 0.5555555820465088, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 467 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.0277862548828, + "epoch": 3.5149253731343286, + "grad_norm": 0.6352039677407706, + "learning_rate": 1.7462686567164178e-07, + "loss": -0.0, + "reward": 0.4444444477558136, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 468 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.30555725097656, + "epoch": 3.5223880597014925, + "grad_norm": 1.2452234557354613, + "learning_rate": 1.75e-07, + "loss": 0.0004, + "reward": 0.6111111044883728, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 469 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.30555725097656, + "epoch": 3.529850746268657, + "grad_norm": 1.1825206693699366, + "learning_rate": 1.753731343283582e-07, + "loss": 0.0001, + "reward": 0.2777777910232544, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.2777777910232544, + "rewards/format_reward": 0.0, + "step": 470 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.05555725097656, + "epoch": 3.5373134328358207, + "grad_norm": 3.255384614842195, + "learning_rate": 1.757462686567164e-07, + "loss": -0.0013, + "reward": 0.5833333134651184, + "reward_std": 0.24800565838813782, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 471 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.7777862548828, + "epoch": 3.544776119402985, + "grad_norm": 1.4692210845856999, + "learning_rate": 1.7611940298507461e-07, + "loss": 0.0004, + "reward": 0.3888888955116272, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 472 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.6666717529297, + "epoch": 3.5522388059701493, + "grad_norm": 0.9099523039557104, + "learning_rate": 1.7649253731343282e-07, + "loss": 0.0199, + "reward": 0.5277777910232544, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 473 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.75, + "epoch": 3.5597014925373136, + "grad_norm": 1.1735329351764103, + "learning_rate": 1.7686567164179103e-07, + "loss": 0.0007, + "reward": 0.1944444477558136, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.1944444477558136, + "rewards/format_reward": 0.0, + "step": 474 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.36111450195312, + "epoch": 3.5671641791044775, + "grad_norm": 0.5598890888046779, + "learning_rate": 1.7723880597014926e-07, + "loss": 0.0001, + "reward": 0.5555555820465088, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 475 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.44444274902344, + "epoch": 3.574626865671642, + "grad_norm": 1.1148223686532834, + "learning_rate": 1.7761194029850744e-07, + "loss": 0.0006, + "reward": 0.6388888955116272, + "reward_std": 0.31215566396713257, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 476 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.13888549804688, + "epoch": 3.582089552238806, + "grad_norm": 1.0496726912981724, + "learning_rate": 1.7798507462686565e-07, + "loss": -0.0056, + "reward": 0.472222238779068, + "reward_std": 0.4702278673648834, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 477 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.44444274902344, + "epoch": 3.58955223880597, + "grad_norm": 1.0315040820177208, + "learning_rate": 1.7835820895522388e-07, + "loss": -0.0013, + "reward": 0.6388888955116272, + "reward_std": 0.3591167628765106, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 478 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.8333282470703, + "epoch": 3.5970149253731343, + "grad_norm": 0.6927407081760717, + "learning_rate": 1.787313432835821e-07, + "loss": 0.0046, + "reward": 0.6388888955116272, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 479 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.75, + "epoch": 3.6044776119402986, + "grad_norm": 0.9480027780357247, + "learning_rate": 1.7910447761194027e-07, + "loss": 0.001, + "reward": 0.6388888955116272, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 480 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.44444274902344, + "epoch": 3.611940298507463, + "grad_norm": 6.942824337431915, + "learning_rate": 1.794776119402985e-07, + "loss": -0.001, + "reward": 0.7222222089767456, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 481 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.61111450195312, + "epoch": 3.6194029850746268, + "grad_norm": 0.3508870989391064, + "learning_rate": 1.7985074626865671e-07, + "loss": 0.0002, + "reward": 0.4166666567325592, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 482 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.36111450195312, + "epoch": 3.626865671641791, + "grad_norm": 0.8031623746560911, + "learning_rate": 1.802238805970149e-07, + "loss": -0.0003, + "reward": 0.5555555820465088, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 483 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.0, + "epoch": 3.6343283582089554, + "grad_norm": 0.6649424030508002, + "learning_rate": 1.8059701492537313e-07, + "loss": 0.0003, + "reward": 0.5555555820465088, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 484 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.80555725097656, + "epoch": 3.6417910447761193, + "grad_norm": 0.3761452849049308, + "learning_rate": 1.8097014925373134e-07, + "loss": 0.0122, + "reward": 0.4444444477558136, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 485 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.25, + "epoch": 3.6492537313432836, + "grad_norm": 1.9118218269978677, + "learning_rate": 1.8134328358208954e-07, + "loss": -0.0001, + "reward": 0.472222238779068, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 486 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.69444274902344, + "epoch": 3.656716417910448, + "grad_norm": 0.33041276150868576, + "learning_rate": 1.8171641791044775e-07, + "loss": 0.0003, + "reward": 0.5, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 487 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.9166717529297, + "epoch": 3.664179104477612, + "grad_norm": 0.8593644265847272, + "learning_rate": 1.8208955223880596e-07, + "loss": 0.0013, + "reward": 0.694444477558136, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 488 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.94444274902344, + "epoch": 3.671641791044776, + "grad_norm": 0.7863052960231789, + "learning_rate": 1.824626865671642e-07, + "loss": 0.0071, + "reward": 0.4444444477558136, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 489 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.0833282470703, + "epoch": 3.6791044776119404, + "grad_norm": 0.3523498701960021, + "learning_rate": 1.8283582089552237e-07, + "loss": -0.0003, + "reward": 0.3333333432674408, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 490 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.11111450195312, + "epoch": 3.6865671641791042, + "grad_norm": 0.9701994691902537, + "learning_rate": 1.8320895522388058e-07, + "loss": -0.0204, + "reward": 0.5277777910232544, + "reward_std": 0.42326679825782776, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 491 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.36111450195312, + "epoch": 3.6940298507462686, + "grad_norm": 0.6310465262081325, + "learning_rate": 1.8358208955223881e-07, + "loss": -0.0118, + "reward": 0.472222238779068, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 492 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.1666717529297, + "epoch": 3.701492537313433, + "grad_norm": 0.7884661722861092, + "learning_rate": 1.83955223880597e-07, + "loss": -0.0153, + "reward": 0.5277777910232544, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 493 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.1666717529297, + "epoch": 3.708955223880597, + "grad_norm": 0.7144529505217109, + "learning_rate": 1.843283582089552e-07, + "loss": -0.0004, + "reward": 0.3888888955116272, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 494 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.25, + "epoch": 3.716417910447761, + "grad_norm": 1.2631405242293328, + "learning_rate": 1.8470149253731344e-07, + "loss": -0.003, + "reward": 0.5277777910232544, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 495 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.8333282470703, + "epoch": 3.7238805970149254, + "grad_norm": 1.0268131257803932, + "learning_rate": 1.8507462686567162e-07, + "loss": 0.0009, + "reward": 0.3611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 496 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.63888549804688, + "epoch": 3.7313432835820897, + "grad_norm": 1.8922484907892423, + "learning_rate": 1.8544776119402985e-07, + "loss": 0.011, + "reward": 0.5, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 497 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.19444274902344, + "epoch": 3.7388059701492535, + "grad_norm": 2.429658084989208, + "learning_rate": 1.8582089552238806e-07, + "loss": 0.006, + "reward": 0.7222222089767456, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 498 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.0277862548828, + "epoch": 3.746268656716418, + "grad_norm": 0.5565727949615487, + "learning_rate": 1.8619402985074627e-07, + "loss": -0.0012, + "reward": 0.8333333134651184, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 499 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.3333282470703, + "epoch": 3.753731343283582, + "grad_norm": 0.7713277252567337, + "learning_rate": 1.8656716417910447e-07, + "loss": 0.0008, + "reward": 0.5833333134651184, + "reward_std": 0.24800565838813782, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 500 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.0277862548828, + "epoch": 3.7611940298507465, + "grad_norm": 0.8835290976813063, + "learning_rate": 1.8694029850746268e-07, + "loss": -0.0018, + "reward": 0.6388888955116272, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 501 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.3333282470703, + "epoch": 3.7686567164179103, + "grad_norm": 0.35342076607159595, + "learning_rate": 1.873134328358209e-07, + "loss": -0.0012, + "reward": 0.694444477558136, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 502 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.36111450195312, + "epoch": 3.7761194029850746, + "grad_norm": 1.031711915046143, + "learning_rate": 1.876865671641791e-07, + "loss": -0.0011, + "reward": 0.75, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 503 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.4166717529297, + "epoch": 3.783582089552239, + "grad_norm": 0.6130934883205137, + "learning_rate": 1.880597014925373e-07, + "loss": 0.0012, + "reward": 0.3333333432674408, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 504 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.61111450195312, + "epoch": 3.791044776119403, + "grad_norm": 0.8088265947474628, + "learning_rate": 1.884328358208955e-07, + "loss": 0.0008, + "reward": 0.3333333432674408, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 505 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.0, + "epoch": 3.798507462686567, + "grad_norm": 0.2405002541183557, + "learning_rate": 1.8880597014925372e-07, + "loss": 0.0005, + "reward": 0.472222238779068, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 506 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.7777862548828, + "epoch": 3.8059701492537314, + "grad_norm": 0.39659464532046373, + "learning_rate": 1.8917910447761192e-07, + "loss": 0.0, + "reward": 0.2777777910232544, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.2777777910232544, + "rewards/format_reward": 0.0, + "step": 507 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.36111450195312, + "epoch": 3.8134328358208958, + "grad_norm": 1.757196634973051, + "learning_rate": 1.8955223880597013e-07, + "loss": -0.0033, + "reward": 0.5277777910232544, + "reward_std": 0.3888888955116272, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 508 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.5833282470703, + "epoch": 3.8208955223880596, + "grad_norm": 0.6482855888340876, + "learning_rate": 1.8992537313432837e-07, + "loss": -0.0028, + "reward": 0.6666666865348816, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 509 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.6666717529297, + "epoch": 3.828358208955224, + "grad_norm": 0.3303091252691927, + "learning_rate": 1.9029850746268655e-07, + "loss": -0.0005, + "reward": 0.6666666865348816, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 510 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.80555725097656, + "epoch": 3.835820895522388, + "grad_norm": 0.8785421199411269, + "learning_rate": 1.9067164179104478e-07, + "loss": -0.0026, + "reward": 0.3611111044883728, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 511 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.47222900390625, + "epoch": 3.843283582089552, + "grad_norm": 0.732757236241978, + "learning_rate": 1.91044776119403e-07, + "loss": 0.0001, + "reward": 0.4444444477558136, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 512 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.80555725097656, + "epoch": 3.8507462686567164, + "grad_norm": 0.9072261042744254, + "learning_rate": 1.9141791044776117e-07, + "loss": 0.0014, + "reward": 0.5833333134651184, + "reward_std": 0.4060778021812439, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 513 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.5277862548828, + "epoch": 3.8582089552238807, + "grad_norm": 1.0525179683268486, + "learning_rate": 1.917910447761194e-07, + "loss": 0.0117, + "reward": 0.5, + "reward_std": 0.36771121621131897, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 514 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.1666717529297, + "epoch": 3.8656716417910446, + "grad_norm": 0.9674601872335378, + "learning_rate": 1.921641791044776e-07, + "loss": -0.0107, + "reward": 0.3611111044883728, + "reward_std": 0.3763057291507721, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 515 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.6666717529297, + "epoch": 3.873134328358209, + "grad_norm": 0.4893566179741302, + "learning_rate": 1.925373134328358e-07, + "loss": 0.0004, + "reward": 0.7222222089767456, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 516 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.25, + "epoch": 3.8805970149253732, + "grad_norm": 0.7299648756213316, + "learning_rate": 1.9291044776119402e-07, + "loss": -0.0006, + "reward": 0.472222238779068, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 517 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.61111450195312, + "epoch": 3.888059701492537, + "grad_norm": 0.6888136901304034, + "learning_rate": 1.9328358208955223e-07, + "loss": 0.0011, + "reward": 0.5555555820465088, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 518 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.25, + "epoch": 3.8955223880597014, + "grad_norm": 0.5227057040303975, + "learning_rate": 1.9365671641791044e-07, + "loss": 0.0013, + "reward": 0.472222238779068, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 519 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.1666717529297, + "epoch": 3.9029850746268657, + "grad_norm": 0.8463371888856244, + "learning_rate": 1.9402985074626865e-07, + "loss": 0.002, + "reward": 0.5277777910232544, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 520 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.36111450195312, + "epoch": 3.91044776119403, + "grad_norm": 0.7544401505331759, + "learning_rate": 1.9440298507462685e-07, + "loss": 0.0471, + "reward": 0.7222222089767456, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 521 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.94444274902344, + "epoch": 3.917910447761194, + "grad_norm": 1.2529606352977352, + "learning_rate": 1.947761194029851e-07, + "loss": 0.0034, + "reward": 0.4444444477558136, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 522 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.97222900390625, + "epoch": 3.925373134328358, + "grad_norm": 0.9862968888576135, + "learning_rate": 1.9514925373134327e-07, + "loss": 0.0043, + "reward": 0.3611111044883728, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 523 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.88888549804688, + "epoch": 3.9328358208955225, + "grad_norm": 0.6332533562395162, + "learning_rate": 1.9552238805970148e-07, + "loss": -0.0037, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 524 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.8888931274414, + "epoch": 3.9402985074626864, + "grad_norm": 0.785514746723502, + "learning_rate": 1.958955223880597e-07, + "loss": 0.0, + "reward": 0.5277777910232544, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 525 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.19444274902344, + "epoch": 3.9477611940298507, + "grad_norm": 0.8120780848283214, + "learning_rate": 1.962686567164179e-07, + "loss": -0.0029, + "reward": 0.5555555820465088, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 526 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.69444274902344, + "epoch": 3.955223880597015, + "grad_norm": 1.1430588180036352, + "learning_rate": 1.966417910447761e-07, + "loss": -0.0052, + "reward": 0.4444444477558136, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 527 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.9166717529297, + "epoch": 3.9626865671641793, + "grad_norm": 1.1894243839580374, + "learning_rate": 1.9701492537313433e-07, + "loss": 0.012, + "reward": 0.4166666567325592, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 528 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.19444274902344, + "epoch": 3.970149253731343, + "grad_norm": 0.5136179505051328, + "learning_rate": 1.9738805970149254e-07, + "loss": -0.0002, + "reward": 0.3888888955116272, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 529 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.11111450195312, + "epoch": 3.9776119402985075, + "grad_norm": 0.34203011957227897, + "learning_rate": 1.9776119402985072e-07, + "loss": -0.0002, + "reward": 0.6111111044883728, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 530 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.6666717529297, + "epoch": 3.9850746268656714, + "grad_norm": 1.401529110334628, + "learning_rate": 1.9813432835820895e-07, + "loss": 0.0025, + "reward": 0.75, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 531 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.08334350585938, + "epoch": 3.9925373134328357, + "grad_norm": 0.802567682516199, + "learning_rate": 1.9850746268656716e-07, + "loss": -0.0016, + "reward": 0.5555555820465088, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 532 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.8333282470703, + "epoch": 4.007462686567164, + "grad_norm": 0.3376411459040491, + "learning_rate": 1.9888059701492537e-07, + "loss": -0.0005, + "reward": 0.6111111044883728, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 533 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.8333282470703, + "epoch": 4.014925373134329, + "grad_norm": 0.6747315686818585, + "learning_rate": 1.9925373134328358e-07, + "loss": -0.0, + "reward": 0.6666666865348816, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 534 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.5833282470703, + "epoch": 4.022388059701493, + "grad_norm": 0.5373217400171213, + "learning_rate": 1.9962686567164178e-07, + "loss": -0.0001, + "reward": 0.8333333134651184, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 535 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.7777862548828, + "epoch": 4.029850746268656, + "grad_norm": 0.9645249655812984, + "learning_rate": 2e-07, + "loss": 0.0115, + "reward": 0.6666666865348816, + "reward_std": 0.5085944533348083, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 536 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.38888549804688, + "epoch": 4.037313432835821, + "grad_norm": 0.5646510535491045, + "learning_rate": 2.003731343283582e-07, + "loss": -0.0025, + "reward": 0.5555555820465088, + "reward_std": 0.25660011172294617, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 537 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.5277862548828, + "epoch": 4.044776119402985, + "grad_norm": 0.7524892457230887, + "learning_rate": 2.007462686567164e-07, + "loss": 0.0004, + "reward": 0.5833333134651184, + "reward_std": 0.42326679825782776, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 538 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.13888549804688, + "epoch": 4.052238805970149, + "grad_norm": 1.4277913514165743, + "learning_rate": 2.0111940298507464e-07, + "loss": 0.0006, + "reward": 0.25, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.0, + "step": 539 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.19444274902344, + "epoch": 4.059701492537314, + "grad_norm": 0.554583239803449, + "learning_rate": 2.0149253731343282e-07, + "loss": 0.0007, + "reward": 0.6111111044883728, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 540 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.9166717529297, + "epoch": 4.067164179104478, + "grad_norm": 0.6404940881118903, + "learning_rate": 2.0186567164179103e-07, + "loss": 0.0018, + "reward": 0.3888888955116272, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 541 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.44444274902344, + "epoch": 4.074626865671641, + "grad_norm": 0.4001987873439748, + "learning_rate": 2.0223880597014926e-07, + "loss": -0.0008, + "reward": 0.7222222089767456, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 542 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.5277862548828, + "epoch": 4.082089552238806, + "grad_norm": 0.468566783425521, + "learning_rate": 2.0261194029850744e-07, + "loss": -0.0001, + "reward": 0.1944444477558136, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.1944444477558136, + "rewards/format_reward": 0.0, + "step": 543 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.25, + "epoch": 4.08955223880597, + "grad_norm": 0.9184174214568214, + "learning_rate": 2.0298507462686565e-07, + "loss": 0.0442, + "reward": 0.472222238779068, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 544 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.88888549804688, + "epoch": 4.097014925373134, + "grad_norm": 0.5707751573896132, + "learning_rate": 2.0335820895522388e-07, + "loss": 0.0, + "reward": 0.6388888955116272, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 545 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.3333282470703, + "epoch": 4.104477611940299, + "grad_norm": 0.5245605920424958, + "learning_rate": 2.0373134328358206e-07, + "loss": -0.0002, + "reward": 0.5, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 546 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.0, + "epoch": 4.111940298507463, + "grad_norm": 0.671771680423517, + "learning_rate": 2.041044776119403e-07, + "loss": 0.0013, + "reward": 0.5277777910232544, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 547 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.8333282470703, + "epoch": 4.119402985074627, + "grad_norm": 1.2076003056848494, + "learning_rate": 2.044776119402985e-07, + "loss": 0.0062, + "reward": 0.5277777910232544, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 548 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.69444274902344, + "epoch": 4.126865671641791, + "grad_norm": 0.538446340950092, + "learning_rate": 2.0485074626865669e-07, + "loss": -0.0014, + "reward": 0.3611111044883728, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 549 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.47222900390625, + "epoch": 4.134328358208955, + "grad_norm": 0.9144377020230732, + "learning_rate": 2.0522388059701492e-07, + "loss": -0.0075, + "reward": 0.5, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 550 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.13888549804688, + "epoch": 4.141791044776119, + "grad_norm": 0.7902870875312578, + "learning_rate": 2.0559701492537313e-07, + "loss": -0.0002, + "reward": 0.5, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 551 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.05555725097656, + "epoch": 4.149253731343284, + "grad_norm": 0.539914224709972, + "learning_rate": 2.0597014925373134e-07, + "loss": 0.0013, + "reward": 0.5277777910232544, + "reward_std": 0.2949666976928711, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 552 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.4166717529297, + "epoch": 4.156716417910448, + "grad_norm": 0.27949238952475375, + "learning_rate": 2.0634328358208954e-07, + "loss": 0.0005, + "reward": 0.472222238779068, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 553 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.36111450195312, + "epoch": 4.164179104477612, + "grad_norm": 0.46940607569629333, + "learning_rate": 2.0671641791044775e-07, + "loss": 0.0001, + "reward": 0.6111111044883728, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 554 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.38888549804688, + "epoch": 4.1716417910447765, + "grad_norm": 1.002311584143356, + "learning_rate": 2.0708955223880596e-07, + "loss": 0.0005, + "reward": 0.6666666865348816, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 555 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.47222900390625, + "epoch": 4.17910447761194, + "grad_norm": 0.8074993005914306, + "learning_rate": 2.0746268656716416e-07, + "loss": -0.0011, + "reward": 0.3333333432674408, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 556 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.0277862548828, + "epoch": 4.186567164179104, + "grad_norm": 0.3541508726897797, + "learning_rate": 2.0783582089552237e-07, + "loss": 0.0011, + "reward": 0.4166666567325592, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 557 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.94444274902344, + "epoch": 4.1940298507462686, + "grad_norm": 0.2654717385592948, + "learning_rate": 2.082089552238806e-07, + "loss": 0.0004, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 558 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.36111450195312, + "epoch": 4.201492537313433, + "grad_norm": 0.38958655300237155, + "learning_rate": 2.0858208955223879e-07, + "loss": -0.0002, + "reward": 0.3333333432674408, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 559 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.1666717529297, + "epoch": 4.208955223880597, + "grad_norm": 0.6823566454172773, + "learning_rate": 2.08955223880597e-07, + "loss": -0.0003, + "reward": 0.472222238779068, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 560 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.5277862548828, + "epoch": 4.2164179104477615, + "grad_norm": 0.5321407475680179, + "learning_rate": 2.0932835820895523e-07, + "loss": -0.0001, + "reward": 0.4166666567325592, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 561 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.30555725097656, + "epoch": 4.223880597014926, + "grad_norm": 0.603828646195959, + "learning_rate": 2.0970149253731344e-07, + "loss": -0.0003, + "reward": 0.4444444477558136, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 562 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.3333282470703, + "epoch": 4.231343283582089, + "grad_norm": 0.7321852232379786, + "learning_rate": 2.1007462686567162e-07, + "loss": 0.0019, + "reward": 0.5555555820465088, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 563 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.5, + "epoch": 4.2388059701492535, + "grad_norm": 1.1637187064792582, + "learning_rate": 2.1044776119402985e-07, + "loss": 0.0043, + "reward": 0.6666666865348816, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 564 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.44444274902344, + "epoch": 4.246268656716418, + "grad_norm": 0.48915753241318327, + "learning_rate": 2.1082089552238806e-07, + "loss": -0.0004, + "reward": 0.694444477558136, + "reward_std": 0.24800565838813782, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 565 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.22222900390625, + "epoch": 4.253731343283582, + "grad_norm": 0.43204338494766387, + "learning_rate": 2.1119402985074624e-07, + "loss": 0.0006, + "reward": 0.3611111044883728, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 566 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.5833282470703, + "epoch": 4.2611940298507465, + "grad_norm": 0.5871160924366974, + "learning_rate": 2.1156716417910447e-07, + "loss": -0.001, + "reward": 0.5, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 567 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.80555725097656, + "epoch": 4.268656716417911, + "grad_norm": 0.4832460272913426, + "learning_rate": 2.1194029850746268e-07, + "loss": 0.0004, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 568 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.63888549804688, + "epoch": 4.276119402985074, + "grad_norm": 2.0614891597901277, + "learning_rate": 2.1231343283582089e-07, + "loss": 0.0025, + "reward": 0.3611111044883728, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 569 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.97222900390625, + "epoch": 4.2835820895522385, + "grad_norm": 0.6829193072344408, + "learning_rate": 2.126865671641791e-07, + "loss": -0.001, + "reward": 0.5833333134651184, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 570 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.86111450195312, + "epoch": 4.291044776119403, + "grad_norm": 0.7789181481487603, + "learning_rate": 2.130597014925373e-07, + "loss": -0.0002, + "reward": 0.5, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 571 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.97222900390625, + "epoch": 4.298507462686567, + "grad_norm": 0.6131286604334807, + "learning_rate": 2.1343283582089554e-07, + "loss": -0.0012, + "reward": 0.4444444477558136, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 572 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.05555725097656, + "epoch": 4.3059701492537314, + "grad_norm": 0.7590807706829223, + "learning_rate": 2.1380597014925372e-07, + "loss": 0.0052, + "reward": 0.6111111044883728, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 573 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.4166717529297, + "epoch": 4.313432835820896, + "grad_norm": 0.8993946733049966, + "learning_rate": 2.1417910447761192e-07, + "loss": 0.0004, + "reward": 0.5277777910232544, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 574 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.88888549804688, + "epoch": 4.32089552238806, + "grad_norm": 1.0064132559372927, + "learning_rate": 2.1455223880597016e-07, + "loss": -0.0008, + "reward": 0.8611111044883728, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 575 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.22222900390625, + "epoch": 4.3283582089552235, + "grad_norm": 0.6524524712846979, + "learning_rate": 2.1492537313432834e-07, + "loss": -0.0018, + "reward": 0.3611111044883728, + "reward_std": 0.24800565838813782, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 576 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.5, + "epoch": 4.335820895522388, + "grad_norm": 0.5217505872687526, + "learning_rate": 2.1529850746268655e-07, + "loss": 0.0005, + "reward": 0.472222238779068, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 577 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.1666717529297, + "epoch": 4.343283582089552, + "grad_norm": 0.6810316219073826, + "learning_rate": 2.1567164179104478e-07, + "loss": -0.0009, + "reward": 0.5555555820465088, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 578 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.80555725097656, + "epoch": 4.350746268656716, + "grad_norm": 0.8376767264086693, + "learning_rate": 2.1604477611940296e-07, + "loss": 0.001, + "reward": 0.4166666567325592, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 579 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.72222900390625, + "epoch": 4.358208955223881, + "grad_norm": 0.44449478150109595, + "learning_rate": 2.1641791044776117e-07, + "loss": 0.0007, + "reward": 0.5, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 580 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.1666717529297, + "epoch": 4.365671641791045, + "grad_norm": 0.5631321671347562, + "learning_rate": 2.167910447761194e-07, + "loss": -0.0009, + "reward": 0.472222238779068, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 581 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.4166717529297, + "epoch": 4.373134328358209, + "grad_norm": 0.4749391287832556, + "learning_rate": 2.171641791044776e-07, + "loss": -0.0002, + "reward": 0.4444444477558136, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 582 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.1666717529297, + "epoch": 4.380597014925373, + "grad_norm": 0.718349964712305, + "learning_rate": 2.1753731343283582e-07, + "loss": 0.0002, + "reward": 0.2222222238779068, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.2222222238779068, + "rewards/format_reward": 0.0, + "step": 583 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.19444274902344, + "epoch": 4.388059701492537, + "grad_norm": 1.1924192286106718, + "learning_rate": 2.1791044776119402e-07, + "loss": 0.0002, + "reward": 0.5555555820465088, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 584 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.25, + "epoch": 4.395522388059701, + "grad_norm": 0.43083827309836914, + "learning_rate": 2.1828358208955223e-07, + "loss": -0.0007, + "reward": 0.5555555820465088, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 585 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.6666717529297, + "epoch": 4.402985074626866, + "grad_norm": 0.4266868176808778, + "learning_rate": 2.1865671641791044e-07, + "loss": 0.0013, + "reward": 0.3888888955116272, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 586 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.4166717529297, + "epoch": 4.41044776119403, + "grad_norm": 0.6398467203845685, + "learning_rate": 2.1902985074626865e-07, + "loss": -0.0009, + "reward": 0.6666666865348816, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 587 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.55555725097656, + "epoch": 4.417910447761194, + "grad_norm": 0.7307522842025604, + "learning_rate": 2.1940298507462685e-07, + "loss": -0.0019, + "reward": 0.6111111044883728, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 588 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.97222900390625, + "epoch": 4.425373134328359, + "grad_norm": 0.4411589684073964, + "learning_rate": 2.1977611940298506e-07, + "loss": -0.0004, + "reward": 0.3055555522441864, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 589 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.5833282470703, + "epoch": 4.432835820895522, + "grad_norm": 2.283736505164162, + "learning_rate": 2.2014925373134327e-07, + "loss": -0.0019, + "reward": 0.6666666865348816, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 590 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.0, + "epoch": 4.440298507462686, + "grad_norm": 0.4053627954797374, + "learning_rate": 2.2052238805970148e-07, + "loss": 0.0006, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 591 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.47222900390625, + "epoch": 4.447761194029851, + "grad_norm": 0.8676965999675361, + "learning_rate": 2.208955223880597e-07, + "loss": 0.0002, + "reward": 0.5, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 592 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.0277862548828, + "epoch": 4.455223880597015, + "grad_norm": 0.5152037073042781, + "learning_rate": 2.212686567164179e-07, + "loss": 0.0011, + "reward": 0.472222238779068, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 593 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.22222900390625, + "epoch": 4.462686567164179, + "grad_norm": 0.5768983304854786, + "learning_rate": 2.2164179104477612e-07, + "loss": 0.0008, + "reward": 0.5277777910232544, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 594 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.7777862548828, + "epoch": 4.470149253731344, + "grad_norm": 1.107228477068871, + "learning_rate": 2.2201492537313433e-07, + "loss": 0.0014, + "reward": 0.6111111044883728, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 595 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.3888931274414, + "epoch": 4.477611940298507, + "grad_norm": 0.7271962073374681, + "learning_rate": 2.223880597014925e-07, + "loss": 0.0011, + "reward": 0.5555555820465088, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 596 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.88888549804688, + "epoch": 4.485074626865671, + "grad_norm": 0.8294500118973961, + "learning_rate": 2.2276119402985075e-07, + "loss": -0.0001, + "reward": 0.6111111044883728, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 597 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.47222900390625, + "epoch": 4.492537313432836, + "grad_norm": 0.6024710500763076, + "learning_rate": 2.2313432835820895e-07, + "loss": 0.0003, + "reward": 0.5833333134651184, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 598 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.2777862548828, + "epoch": 4.5, + "grad_norm": 0.5163898679200584, + "learning_rate": 2.2350746268656713e-07, + "loss": -0.0001, + "reward": 0.5555555820465088, + "reward_std": 0.25660011172294617, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 599 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.38888549804688, + "epoch": 4.507462686567164, + "grad_norm": 0.9555408537371715, + "learning_rate": 2.2388059701492537e-07, + "loss": 0.0011, + "reward": 0.3333333432674408, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 600 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.19444274902344, + "epoch": 4.514925373134329, + "grad_norm": 0.9081608820881446, + "learning_rate": 2.2425373134328358e-07, + "loss": -0.0001, + "reward": 0.5833333134651184, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 601 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.7777862548828, + "epoch": 4.522388059701493, + "grad_norm": 0.44624053140613384, + "learning_rate": 2.2462686567164176e-07, + "loss": 0.0015, + "reward": 0.4166666567325592, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 602 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.11111450195312, + "epoch": 4.529850746268656, + "grad_norm": 0.8192976223648916, + "learning_rate": 2.25e-07, + "loss": 0.0049, + "reward": 0.694444477558136, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 603 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.0833282470703, + "epoch": 4.537313432835821, + "grad_norm": 2.266610484800678, + "learning_rate": 2.253731343283582e-07, + "loss": 0.0029, + "reward": 0.4444444477558136, + "reward_std": 0.3333333432674408, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 604 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.4166717529297, + "epoch": 4.544776119402985, + "grad_norm": 0.6137873087086044, + "learning_rate": 2.2574626865671643e-07, + "loss": 0.0009, + "reward": 0.472222238779068, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 605 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.94444274902344, + "epoch": 4.552238805970149, + "grad_norm": 0.5470706502544691, + "learning_rate": 2.261194029850746e-07, + "loss": 0.0025, + "reward": 0.472222238779068, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 606 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.4166717529297, + "epoch": 4.559701492537314, + "grad_norm": 0.7714308763414546, + "learning_rate": 2.2649253731343282e-07, + "loss": -0.001, + "reward": 0.3888888955116272, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 607 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.0833282470703, + "epoch": 4.567164179104478, + "grad_norm": 0.7525020464750954, + "learning_rate": 2.2686567164179105e-07, + "loss": 0.0159, + "reward": 0.5277777910232544, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 608 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.13888549804688, + "epoch": 4.574626865671641, + "grad_norm": 0.4702504121379181, + "learning_rate": 2.2723880597014923e-07, + "loss": -0.0017, + "reward": 0.3333333432674408, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 609 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.94444274902344, + "epoch": 4.582089552238806, + "grad_norm": 0.4874144194696151, + "learning_rate": 2.2761194029850744e-07, + "loss": 0.001, + "reward": 0.75, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 610 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.69444274902344, + "epoch": 4.58955223880597, + "grad_norm": 0.5840184579326914, + "learning_rate": 2.2798507462686568e-07, + "loss": 0.0015, + "reward": 0.6388888955116272, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 611 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.75, + "epoch": 4.597014925373134, + "grad_norm": 0.785752036800956, + "learning_rate": 2.2835820895522386e-07, + "loss": 0.0008, + "reward": 0.5, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 612 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.61111450195312, + "epoch": 4.604477611940299, + "grad_norm": 0.901739521284438, + "learning_rate": 2.2873134328358206e-07, + "loss": 0.0019, + "reward": 0.7222222089767456, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 613 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.4166717529297, + "epoch": 4.611940298507463, + "grad_norm": 2.0156566522240347, + "learning_rate": 2.291044776119403e-07, + "loss": -0.0008, + "reward": 0.694444477558136, + "reward_std": 0.24800565838813782, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 614 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.30555725097656, + "epoch": 4.619402985074627, + "grad_norm": 0.6345350066175628, + "learning_rate": 2.294776119402985e-07, + "loss": -0.0002, + "reward": 0.5833333134651184, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 615 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.2777862548828, + "epoch": 4.6268656716417915, + "grad_norm": 2.2734244399204395, + "learning_rate": 2.2985074626865669e-07, + "loss": 0.0166, + "reward": 0.6666666865348816, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 616 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.8333282470703, + "epoch": 4.634328358208955, + "grad_norm": 0.7407440221278461, + "learning_rate": 2.3022388059701492e-07, + "loss": 0.0007, + "reward": 0.6111111044883728, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 617 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.97222900390625, + "epoch": 4.641791044776119, + "grad_norm": 1.389104728956685, + "learning_rate": 2.3059701492537313e-07, + "loss": -0.002, + "reward": 0.3333333432674408, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 618 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.55555725097656, + "epoch": 4.649253731343284, + "grad_norm": 0.8692293368815878, + "learning_rate": 2.3097014925373133e-07, + "loss": 0.0, + "reward": 0.75, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 619 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.36111450195312, + "epoch": 4.656716417910448, + "grad_norm": 0.3092326732392671, + "learning_rate": 2.3134328358208954e-07, + "loss": -0.0004, + "reward": 0.2777777910232544, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.2777777910232544, + "rewards/format_reward": 0.0, + "step": 620 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.4166717529297, + "epoch": 4.664179104477612, + "grad_norm": 0.23580874917411032, + "learning_rate": 2.3171641791044775e-07, + "loss": 0.0003, + "reward": 0.1944444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.1944444477558136, + "rewards/format_reward": 0.0, + "step": 621 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.69444274902344, + "epoch": 4.6716417910447765, + "grad_norm": 0.2498693186126049, + "learning_rate": 2.3208955223880596e-07, + "loss": -0.0003, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 622 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.5, + "epoch": 4.67910447761194, + "grad_norm": 0.5205495273455459, + "learning_rate": 2.3246268656716416e-07, + "loss": -0.0007, + "reward": 0.4166666567325592, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 623 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.61111450195312, + "epoch": 4.686567164179104, + "grad_norm": 0.8736158789007084, + "learning_rate": 2.3283582089552237e-07, + "loss": -0.0007, + "reward": 0.5277777910232544, + "reward_std": 0.35911673307418823, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 624 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.0833282470703, + "epoch": 4.6940298507462686, + "grad_norm": 1.034983208493177, + "learning_rate": 2.332089552238806e-07, + "loss": -0.001, + "reward": 0.5833333134651184, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 625 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.6666717529297, + "epoch": 4.701492537313433, + "grad_norm": 0.6339894034353002, + "learning_rate": 2.3358208955223879e-07, + "loss": -0.0007, + "reward": 0.6388888955116272, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 626 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.47222900390625, + "epoch": 4.708955223880597, + "grad_norm": 0.7105122845546399, + "learning_rate": 2.33955223880597e-07, + "loss": -0.0016, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 627 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.38888549804688, + "epoch": 4.7164179104477615, + "grad_norm": 1.4060908168013129, + "learning_rate": 2.3432835820895523e-07, + "loss": -0.0011, + "reward": 0.5555555820465088, + "reward_std": 0.39748334884643555, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 628 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.30555725097656, + "epoch": 4.723880597014926, + "grad_norm": 1.2969713290292255, + "learning_rate": 2.347014925373134e-07, + "loss": -0.0026, + "reward": 0.5, + "reward_std": 0.47882235050201416, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 629 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.69444274902344, + "epoch": 4.731343283582089, + "grad_norm": 0.5178798818662593, + "learning_rate": 2.3507462686567164e-07, + "loss": 0.0024, + "reward": 0.6666666865348816, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 630 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.9166717529297, + "epoch": 4.7388059701492535, + "grad_norm": 0.7234719801212561, + "learning_rate": 2.3544776119402985e-07, + "loss": 0.0003, + "reward": 0.5555555820465088, + "reward_std": 0.38490018248558044, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 631 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.97222137451172, + "epoch": 4.746268656716418, + "grad_norm": 0.6052021838682412, + "learning_rate": 2.3582089552238803e-07, + "loss": -0.0018, + "reward": 0.5833333134651184, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 632 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.9166717529297, + "epoch": 4.753731343283582, + "grad_norm": 1.8367965246641251, + "learning_rate": 2.3619402985074626e-07, + "loss": -0.0, + "reward": 0.5555555820465088, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 633 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.36111450195312, + "epoch": 4.7611940298507465, + "grad_norm": 0.6827650263800729, + "learning_rate": 2.3656716417910447e-07, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 634 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.7777862548828, + "epoch": 4.768656716417911, + "grad_norm": 1.3005358425769238, + "learning_rate": 2.3694029850746268e-07, + "loss": -0.0006, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 635 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.6666717529297, + "epoch": 4.776119402985074, + "grad_norm": 0.2444022246618089, + "learning_rate": 2.3731343283582089e-07, + "loss": -0.0007, + "reward": 0.5555555820465088, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 636 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.25, + "epoch": 4.7835820895522385, + "grad_norm": 0.5703094428480726, + "learning_rate": 2.376865671641791e-07, + "loss": 0.0008, + "reward": 0.7222222089767456, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 637 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.5, + "epoch": 4.791044776119403, + "grad_norm": 0.6101799372412259, + "learning_rate": 2.380597014925373e-07, + "loss": -0.0002, + "reward": 0.5277777910232544, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 638 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.0833282470703, + "epoch": 4.798507462686567, + "grad_norm": 0.5811633836683378, + "learning_rate": 2.3843283582089553e-07, + "loss": 0.0005, + "reward": 0.5277777910232544, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 639 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.11111450195312, + "epoch": 4.8059701492537314, + "grad_norm": 0.4966818496461614, + "learning_rate": 2.388059701492537e-07, + "loss": -0.0002, + "reward": 0.472222238779068, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 640 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.55555725097656, + "epoch": 4.813432835820896, + "grad_norm": 0.6848614388589097, + "learning_rate": 2.3917910447761195e-07, + "loss": 0.0012, + "reward": 0.5, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 641 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.80555725097656, + "epoch": 4.82089552238806, + "grad_norm": 0.577758770707119, + "learning_rate": 2.3955223880597013e-07, + "loss": -0.0004, + "reward": 0.6666666865348816, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 642 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.36111450195312, + "epoch": 4.8283582089552235, + "grad_norm": 1.8153881780786025, + "learning_rate": 2.3992537313432836e-07, + "loss": -0.0004, + "reward": 0.2777777910232544, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.2777777910232544, + "rewards/format_reward": 0.0, + "step": 643 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.88888549804688, + "epoch": 4.835820895522388, + "grad_norm": 0.5923111483144103, + "learning_rate": 2.4029850746268654e-07, + "loss": -0.0087, + "reward": 0.7222222089767456, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 644 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.80555725097656, + "epoch": 4.843283582089552, + "grad_norm": 0.8792045944349791, + "learning_rate": 2.406716417910448e-07, + "loss": -0.0001, + "reward": 0.5555555820465088, + "reward_std": 0.3333333432674408, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 645 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.36111450195312, + "epoch": 4.850746268656716, + "grad_norm": 0.6003673909684791, + "learning_rate": 2.4104477611940296e-07, + "loss": -0.0008, + "reward": 0.694444477558136, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 646 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.6666717529297, + "epoch": 4.858208955223881, + "grad_norm": 0.4200942054463653, + "learning_rate": 2.414179104477612e-07, + "loss": 0.0009, + "reward": 0.3611111044883728, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 647 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.0, + "epoch": 4.865671641791045, + "grad_norm": 0.3994518402113344, + "learning_rate": 2.417910447761194e-07, + "loss": -0.0005, + "reward": 0.4444444477558136, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 648 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.8333282470703, + "epoch": 4.8731343283582085, + "grad_norm": 1.0771847281461189, + "learning_rate": 2.421641791044776e-07, + "loss": -0.0003, + "reward": 0.6666666865348816, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 649 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.0, + "epoch": 4.880597014925373, + "grad_norm": 0.7752684209491728, + "learning_rate": 2.4253731343283584e-07, + "loss": 0.0529, + "reward": 0.3333333432674408, + "reward_std": 0.3333333432674408, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 650 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.86111450195312, + "epoch": 4.888059701492537, + "grad_norm": 0.3339722811245741, + "learning_rate": 2.42910447761194e-07, + "loss": 0.0005, + "reward": 0.5555555820465088, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 651 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.69444274902344, + "epoch": 4.895522388059701, + "grad_norm": 1.2772086134669272, + "learning_rate": 2.432835820895522e-07, + "loss": -0.001, + "reward": 0.694444477558136, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 652 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.22222900390625, + "epoch": 4.902985074626866, + "grad_norm": 1.192525200962345, + "learning_rate": 2.4365671641791044e-07, + "loss": -0.0026, + "reward": 0.3888888955116272, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 653 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.5, + "epoch": 4.91044776119403, + "grad_norm": 0.0, + "learning_rate": 2.440298507462686e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 654 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.80555725097656, + "epoch": 4.917910447761194, + "grad_norm": 1.3787769213769778, + "learning_rate": 2.4440298507462685e-07, + "loss": 0.0023, + "reward": 0.5555555820465088, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 655 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.11111450195312, + "epoch": 4.925373134328359, + "grad_norm": 0.20132725074468671, + "learning_rate": 2.447761194029851e-07, + "loss": -0.0011, + "reward": 0.4166666567325592, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 656 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.75, + "epoch": 4.932835820895522, + "grad_norm": 0.87554144858686, + "learning_rate": 2.4514925373134327e-07, + "loss": -0.0016, + "reward": 0.5833333134651184, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 657 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.1666717529297, + "epoch": 4.940298507462686, + "grad_norm": 0.5818339586867857, + "learning_rate": 2.455223880597015e-07, + "loss": 0.0006, + "reward": 0.7222222089767456, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 658 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.22222900390625, + "epoch": 4.947761194029851, + "grad_norm": 1.4353189107731223, + "learning_rate": 2.458955223880597e-07, + "loss": 0.0004, + "reward": 0.7777777910232544, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 659 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.0, + "epoch": 4.955223880597015, + "grad_norm": 0.3932724512328728, + "learning_rate": 2.4626865671641786e-07, + "loss": -0.0003, + "reward": 0.4166666567325592, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 660 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.05555725097656, + "epoch": 4.962686567164179, + "grad_norm": 1.7392196427659314, + "learning_rate": 2.466417910447761e-07, + "loss": 0.0016, + "reward": 0.5833333134651184, + "reward_std": 0.24800565838813782, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 661 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.0833282470703, + "epoch": 4.970149253731344, + "grad_norm": 0.3260290431487908, + "learning_rate": 2.4701492537313433e-07, + "loss": -0.0002, + "reward": 0.0833333358168602, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 662 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.69444274902344, + "epoch": 4.977611940298507, + "grad_norm": 0.5148294360624733, + "learning_rate": 2.473880597014925e-07, + "loss": 0.01, + "reward": 0.4166666567325592, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 663 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.88888549804688, + "epoch": 4.985074626865671, + "grad_norm": 0.6120927432751146, + "learning_rate": 2.4776119402985074e-07, + "loss": 0.0012, + "reward": 0.5, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 664 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.08334350585938, + "epoch": 4.992537313432836, + "grad_norm": 0.7330435224453775, + "learning_rate": 2.481343283582089e-07, + "loss": -0.0006, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 665 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.0277862548828, + "epoch": 5.007462686567164, + "grad_norm": 0.24073981211387468, + "learning_rate": 2.4850746268656716e-07, + "loss": 0.0002, + "reward": 0.5555555820465088, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 666 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.5, + "epoch": 5.014925373134329, + "grad_norm": 0.5359377025520434, + "learning_rate": 2.4888059701492534e-07, + "loss": 0.0028, + "reward": 0.4166666567325592, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 667 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.69444274902344, + "epoch": 5.022388059701493, + "grad_norm": 1.0387112588983995, + "learning_rate": 2.492537313432836e-07, + "loss": 0.001, + "reward": 0.4444444477558136, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 668 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.1666717529297, + "epoch": 5.029850746268656, + "grad_norm": 0.5245162432676078, + "learning_rate": 2.496268656716418e-07, + "loss": -0.0059, + "reward": 0.5277777910232544, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 669 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.22222900390625, + "epoch": 5.037313432835821, + "grad_norm": 0.5684501265157992, + "learning_rate": 2.5e-07, + "loss": -0.0011, + "reward": 0.6666666865348816, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 670 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.7777862548828, + "epoch": 5.044776119402985, + "grad_norm": 0.9219787383128463, + "learning_rate": 2.503731343283582e-07, + "loss": -0.0011, + "reward": 0.5833333134651184, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 671 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.13888549804688, + "epoch": 5.052238805970149, + "grad_norm": 0.40449177255162927, + "learning_rate": 2.507462686567164e-07, + "loss": 0.0005, + "reward": 0.3333333432674408, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 672 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.19444274902344, + "epoch": 5.059701492537314, + "grad_norm": 0.7472077495347224, + "learning_rate": 2.5111940298507464e-07, + "loss": -0.001, + "reward": 0.6111111044883728, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 673 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.11111450195312, + "epoch": 5.067164179104478, + "grad_norm": 0.846152419515396, + "learning_rate": 2.514925373134328e-07, + "loss": 0.0012, + "reward": 0.5833333134651184, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 674 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.1666717529297, + "epoch": 5.074626865671641, + "grad_norm": 0.4475399909345128, + "learning_rate": 2.51865671641791e-07, + "loss": 0.0008, + "reward": 0.6388888955116272, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 675 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.58333587646484, + "epoch": 5.082089552238806, + "grad_norm": 0.5461213012494088, + "learning_rate": 2.5223880597014923e-07, + "loss": -0.0007, + "reward": 0.6111111044883728, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 676 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.86111450195312, + "epoch": 5.08955223880597, + "grad_norm": 0.5009482857534705, + "learning_rate": 2.5261194029850747e-07, + "loss": 0.0001, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 677 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.44444274902344, + "epoch": 5.097014925373134, + "grad_norm": 0.9981390822885589, + "learning_rate": 2.5298507462686565e-07, + "loss": -0.0024, + "reward": 0.5833333134651184, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 678 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.38888549804688, + "epoch": 5.104477611940299, + "grad_norm": 0.22471111256928958, + "learning_rate": 2.533582089552239e-07, + "loss": 0.0001, + "reward": 0.5, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 679 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.0277862548828, + "epoch": 5.111940298507463, + "grad_norm": 0.42344644371782353, + "learning_rate": 2.537313432835821e-07, + "loss": 0.001, + "reward": 0.472222238779068, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 680 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.36111450195312, + "epoch": 5.119402985074627, + "grad_norm": 0.9593820144059656, + "learning_rate": 2.5410447761194024e-07, + "loss": -0.0003, + "reward": 0.3888888955116272, + "reward_std": 0.19245009124279022, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 681 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.19444274902344, + "epoch": 5.126865671641791, + "grad_norm": 1.078163571070975, + "learning_rate": 2.544776119402985e-07, + "loss": 0.0002, + "reward": 0.5833333134651184, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 682 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.1388931274414, + "epoch": 5.134328358208955, + "grad_norm": 0.7332899543155982, + "learning_rate": 2.548507462686567e-07, + "loss": 0.0045, + "reward": 0.472222238779068, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 683 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.19444274902344, + "epoch": 5.141791044776119, + "grad_norm": 0.7650373044902825, + "learning_rate": 2.5522388059701494e-07, + "loss": 0.001, + "reward": 0.5555555820465088, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 684 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.19444274902344, + "epoch": 5.149253731343284, + "grad_norm": 0.5234994686108878, + "learning_rate": 2.555970149253731e-07, + "loss": 0.0002, + "reward": 0.694444477558136, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 685 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.9166717529297, + "epoch": 5.156716417910448, + "grad_norm": 0.5112629878311581, + "learning_rate": 2.5597014925373136e-07, + "loss": -0.001, + "reward": 0.6388888955116272, + "reward_std": 0.24800565838813782, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 686 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.1666717529297, + "epoch": 5.164179104477612, + "grad_norm": 0.32370023369888856, + "learning_rate": 2.5634328358208954e-07, + "loss": -0.0002, + "reward": 0.5555555820465088, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 687 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.75, + "epoch": 5.1716417910447765, + "grad_norm": 0.5344879165550467, + "learning_rate": 2.567164179104477e-07, + "loss": 0.0023, + "reward": 0.6388888955116272, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 688 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.63888549804688, + "epoch": 5.17910447761194, + "grad_norm": 0.6138035382767774, + "learning_rate": 2.5708955223880595e-07, + "loss": -0.0013, + "reward": 0.25, + "reward_std": 0.24800565838813782, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.0, + "step": 689 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.25, + "epoch": 5.186567164179104, + "grad_norm": 1.241243984067431, + "learning_rate": 2.574626865671642e-07, + "loss": 0.0012, + "reward": 0.5277777910232544, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 690 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.1388931274414, + "epoch": 5.1940298507462686, + "grad_norm": 0.37976577207936224, + "learning_rate": 2.5783582089552237e-07, + "loss": 0.0011, + "reward": 0.6388888955116272, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 691 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.38888549804688, + "epoch": 5.201492537313433, + "grad_norm": 0.6233833347154024, + "learning_rate": 2.582089552238806e-07, + "loss": -0.0006, + "reward": 0.2777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.2777777910232544, + "rewards/format_reward": 0.0, + "step": 692 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.97222900390625, + "epoch": 5.208955223880597, + "grad_norm": 4.049649670655894, + "learning_rate": 2.5858208955223884e-07, + "loss": -0.0002, + "reward": 0.472222238779068, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 693 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.22222900390625, + "epoch": 5.2164179104477615, + "grad_norm": 0.7204929902881987, + "learning_rate": 2.5895522388059697e-07, + "loss": 0.0017, + "reward": 0.472222238779068, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 694 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.88888549804688, + "epoch": 5.223880597014926, + "grad_norm": 0.49308995228262187, + "learning_rate": 2.593283582089552e-07, + "loss": -0.0009, + "reward": 0.4444444477558136, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 695 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.38888549804688, + "epoch": 5.231343283582089, + "grad_norm": 1.9242189512649692, + "learning_rate": 2.5970149253731343e-07, + "loss": 0.0048, + "reward": 0.3611111044883728, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 696 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.86111450195312, + "epoch": 5.2388059701492535, + "grad_norm": 0.8291132472355451, + "learning_rate": 2.600746268656716e-07, + "loss": -0.0, + "reward": 0.6111111044883728, + "reward_std": 0.19245009124279022, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 697 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.80555725097656, + "epoch": 5.246268656716418, + "grad_norm": 0.666609428040099, + "learning_rate": 2.6044776119402985e-07, + "loss": -0.0, + "reward": 0.472222238779068, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 698 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.1666717529297, + "epoch": 5.253731343283582, + "grad_norm": 0.7077870768420402, + "learning_rate": 2.608208955223881e-07, + "loss": 0.0001, + "reward": 0.4166666567325592, + "reward_std": 0.3591167628765106, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 699 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.47222900390625, + "epoch": 5.2611940298507465, + "grad_norm": 0.7989710162706711, + "learning_rate": 2.611940298507462e-07, + "loss": 0.0008, + "reward": 0.6111111044883728, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 700 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.94444274902344, + "epoch": 5.268656716417911, + "grad_norm": 0.8748670157657817, + "learning_rate": 2.6156716417910444e-07, + "loss": 0.0054, + "reward": 0.5, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 701 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.80555725097656, + "epoch": 5.276119402985074, + "grad_norm": 0.8590029407496412, + "learning_rate": 2.619402985074627e-07, + "loss": 0.0011, + "reward": 0.4166666567325592, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 702 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.47222900390625, + "epoch": 5.2835820895522385, + "grad_norm": 0.7025814064326916, + "learning_rate": 2.6231343283582086e-07, + "loss": 0.0006, + "reward": 0.5833333134651184, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 703 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.72222900390625, + "epoch": 5.291044776119403, + "grad_norm": 0.6459950401863618, + "learning_rate": 2.626865671641791e-07, + "loss": 0.001, + "reward": 0.5555555820465088, + "reward_std": 0.41467228531837463, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 704 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.1666717529297, + "epoch": 5.298507462686567, + "grad_norm": 0.6527354901367163, + "learning_rate": 2.630597014925373e-07, + "loss": 0.0008, + "reward": 0.5555555820465088, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 705 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.5833282470703, + "epoch": 5.3059701492537314, + "grad_norm": 0.6333840994634707, + "learning_rate": 2.6343283582089556e-07, + "loss": -0.0012, + "reward": 0.7222222089767456, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 706 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.38888549804688, + "epoch": 5.313432835820896, + "grad_norm": 0.9755653394830833, + "learning_rate": 2.638059701492537e-07, + "loss": 0.0012, + "reward": 0.5555555820465088, + "reward_std": 0.39748334884643555, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 707 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.44444274902344, + "epoch": 5.32089552238806, + "grad_norm": 0.4413577983412106, + "learning_rate": 2.641791044776119e-07, + "loss": -0.0006, + "reward": 0.7222222089767456, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 708 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.25, + "epoch": 5.3283582089552235, + "grad_norm": 0.558873008971742, + "learning_rate": 2.6455223880597016e-07, + "loss": 0.0002, + "reward": 0.5555555820465088, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 709 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.1666717529297, + "epoch": 5.335820895522388, + "grad_norm": 0.6743873293272615, + "learning_rate": 2.6492537313432834e-07, + "loss": 0.0005, + "reward": 0.3055555522441864, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 710 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.72222137451172, + "epoch": 5.343283582089552, + "grad_norm": 0.3014445475590417, + "learning_rate": 2.6529850746268657e-07, + "loss": 0.0, + "reward": 0.472222238779068, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 711 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.1666717529297, + "epoch": 5.350746268656716, + "grad_norm": 0.6311798764158023, + "learning_rate": 2.656716417910448e-07, + "loss": 0.0015, + "reward": 0.5, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 712 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.05555725097656, + "epoch": 5.358208955223881, + "grad_norm": 0.7085612928207926, + "learning_rate": 2.66044776119403e-07, + "loss": -0.0024, + "reward": 0.6111111044883728, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 713 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.25, + "epoch": 5.365671641791045, + "grad_norm": 0.7655636433765078, + "learning_rate": 2.6641791044776117e-07, + "loss": 0.0014, + "reward": 0.472222238779068, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 714 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.9166717529297, + "epoch": 5.373134328358209, + "grad_norm": 0.8877210347092175, + "learning_rate": 2.667910447761194e-07, + "loss": -0.0021, + "reward": 0.5277777910232544, + "reward_std": 0.42326679825782776, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 715 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.5, + "epoch": 5.380597014925373, + "grad_norm": 1.1161454347326618, + "learning_rate": 2.671641791044776e-07, + "loss": 0.0028, + "reward": 0.6111111044883728, + "reward_std": 0.3505222499370575, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 716 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.3333282470703, + "epoch": 5.388059701492537, + "grad_norm": 1.1515210020940088, + "learning_rate": 2.675373134328358e-07, + "loss": -0.0011, + "reward": 0.5, + "reward_std": 0.41467228531837463, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 717 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.72222900390625, + "epoch": 5.395522388059701, + "grad_norm": 0.5638256810174849, + "learning_rate": 2.6791044776119405e-07, + "loss": 0.0014, + "reward": 0.472222238779068, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 718 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.05555725097656, + "epoch": 5.402985074626866, + "grad_norm": 0.8398206503728883, + "learning_rate": 2.6828358208955223e-07, + "loss": -0.0, + "reward": 0.694444477558136, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 719 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.72222900390625, + "epoch": 5.41044776119403, + "grad_norm": 0.42782774838911686, + "learning_rate": 2.686567164179104e-07, + "loss": -0.0002, + "reward": 0.472222238779068, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 720 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.97222900390625, + "epoch": 5.417910447761194, + "grad_norm": 0.55955481591451, + "learning_rate": 2.6902985074626864e-07, + "loss": 0.0005, + "reward": 0.694444477558136, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 721 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.6666717529297, + "epoch": 5.425373134328359, + "grad_norm": 0.4705139428170534, + "learning_rate": 2.694029850746268e-07, + "loss": 0.0007, + "reward": 0.1944444477558136, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.1944444477558136, + "rewards/format_reward": 0.0, + "step": 722 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.72222900390625, + "epoch": 5.432835820895522, + "grad_norm": 1.132705793784276, + "learning_rate": 2.6977611940298506e-07, + "loss": 0.0002, + "reward": 0.4166666567325592, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 723 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.2777862548828, + "epoch": 5.440298507462686, + "grad_norm": 0.9101560733220188, + "learning_rate": 2.701492537313433e-07, + "loss": 0.0004, + "reward": 0.694444477558136, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 724 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.72222900390625, + "epoch": 5.447761194029851, + "grad_norm": 0.7801137354249836, + "learning_rate": 2.7052238805970147e-07, + "loss": 0.0002, + "reward": 0.6388888955116272, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 725 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.61111450195312, + "epoch": 5.455223880597015, + "grad_norm": 0.4195889696516327, + "learning_rate": 2.708955223880597e-07, + "loss": -0.0004, + "reward": 0.6111111044883728, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 726 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.5, + "epoch": 5.462686567164179, + "grad_norm": 1.8296203339036041, + "learning_rate": 2.712686567164179e-07, + "loss": -0.0014, + "reward": 0.5, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 727 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.22222900390625, + "epoch": 5.470149253731344, + "grad_norm": 0.6422933342480195, + "learning_rate": 2.7164179104477607e-07, + "loss": -0.0021, + "reward": 0.5277777910232544, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 728 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.61111450195312, + "epoch": 5.477611940298507, + "grad_norm": 0.46800688586998596, + "learning_rate": 2.720149253731343e-07, + "loss": 0.0012, + "reward": 0.3888888955116272, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 729 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.5, + "epoch": 5.485074626865671, + "grad_norm": 0.48391810693435583, + "learning_rate": 2.7238805970149254e-07, + "loss": -0.0013, + "reward": 0.3055555522441864, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 730 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.7777862548828, + "epoch": 5.492537313432836, + "grad_norm": 0.4162062032049081, + "learning_rate": 2.7276119402985077e-07, + "loss": -0.0002, + "reward": 0.7777777910232544, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 731 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.05555725097656, + "epoch": 5.5, + "grad_norm": 0.7098157676876136, + "learning_rate": 2.7313432835820895e-07, + "loss": -0.0004, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 732 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.8333282470703, + "epoch": 5.507462686567164, + "grad_norm": 0.43402510630943963, + "learning_rate": 2.735074626865672e-07, + "loss": -0.0003, + "reward": 0.3055555522441864, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 733 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.4166717529297, + "epoch": 5.514925373134329, + "grad_norm": 0.5196238994503612, + "learning_rate": 2.7388059701492537e-07, + "loss": 0.001, + "reward": 0.7222222089767456, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 734 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.08333587646484, + "epoch": 5.522388059701493, + "grad_norm": 0.5433264989466529, + "learning_rate": 2.7425373134328355e-07, + "loss": -0.0001, + "reward": 0.4166666567325592, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 735 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.5277862548828, + "epoch": 5.529850746268656, + "grad_norm": 1.1749381788237963, + "learning_rate": 2.746268656716418e-07, + "loss": 0.0021, + "reward": 0.8611111044883728, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 736 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.0833282470703, + "epoch": 5.537313432835821, + "grad_norm": 0.9615380070258092, + "learning_rate": 2.75e-07, + "loss": -0.0002, + "reward": 0.6388888955116272, + "reward_std": 0.4702278673648834, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 737 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.22222900390625, + "epoch": 5.544776119402985, + "grad_norm": 0.7257774713445008, + "learning_rate": 2.753731343283582e-07, + "loss": 0.0001, + "reward": 0.4444444477558136, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 738 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.94444274902344, + "epoch": 5.552238805970149, + "grad_norm": 0.625456307941831, + "learning_rate": 2.7574626865671643e-07, + "loss": -0.0, + "reward": 0.5, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 739 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.1666717529297, + "epoch": 5.559701492537314, + "grad_norm": 0.40131170034064256, + "learning_rate": 2.761194029850746e-07, + "loss": -0.0013, + "reward": 0.4444444477558136, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 740 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.55555725097656, + "epoch": 5.567164179104478, + "grad_norm": 0.5553130884895625, + "learning_rate": 2.764925373134328e-07, + "loss": 0.0008, + "reward": 0.75, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 741 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.13888549804688, + "epoch": 5.574626865671641, + "grad_norm": 0.3130990581218142, + "learning_rate": 2.76865671641791e-07, + "loss": 0.0003, + "reward": 0.6388888955116272, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 742 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.19444274902344, + "epoch": 5.582089552238806, + "grad_norm": 0.5737347705468742, + "learning_rate": 2.7723880597014926e-07, + "loss": 0.0, + "reward": 0.5277777910232544, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 743 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.22222900390625, + "epoch": 5.58955223880597, + "grad_norm": 0.9525077491769968, + "learning_rate": 2.7761194029850744e-07, + "loss": 0.0108, + "reward": 0.6666666865348816, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 744 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.5, + "epoch": 5.597014925373134, + "grad_norm": 0.6332858440344618, + "learning_rate": 2.7798507462686567e-07, + "loss": -0.0003, + "reward": 0.3888888955116272, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 745 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.22222900390625, + "epoch": 5.604477611940299, + "grad_norm": 0.6002138270147898, + "learning_rate": 2.783582089552239e-07, + "loss": 0.0009, + "reward": 0.8055555820465088, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 746 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.97222900390625, + "epoch": 5.611940298507463, + "grad_norm": 1.1297477497903108, + "learning_rate": 2.7873134328358203e-07, + "loss": -0.0004, + "reward": 0.6111111044883728, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 747 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.8333282470703, + "epoch": 5.619402985074627, + "grad_norm": 0.6318538417740931, + "learning_rate": 2.7910447761194027e-07, + "loss": 0.0009, + "reward": 0.5555555820465088, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 748 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.7777862548828, + "epoch": 5.6268656716417915, + "grad_norm": 1.1953294852194671, + "learning_rate": 2.794776119402985e-07, + "loss": -0.001, + "reward": 0.75, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 749 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.1666717529297, + "epoch": 5.634328358208955, + "grad_norm": 0.4513156278556452, + "learning_rate": 2.798507462686567e-07, + "loss": 0.0007, + "reward": 0.5555555820465088, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 750 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.11111450195312, + "epoch": 5.641791044776119, + "grad_norm": 1.5463706382244997, + "learning_rate": 2.802238805970149e-07, + "loss": -0.0001, + "reward": 0.694444477558136, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 751 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.69444274902344, + "epoch": 5.649253731343284, + "grad_norm": 0.557299179480191, + "learning_rate": 2.8059701492537315e-07, + "loss": 0.0009, + "reward": 0.3888888955116272, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 752 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.5833282470703, + "epoch": 5.656716417910448, + "grad_norm": 0.7007249733499371, + "learning_rate": 2.809701492537313e-07, + "loss": 0.0002, + "reward": 0.5277777910232544, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 753 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.7777862548828, + "epoch": 5.664179104477612, + "grad_norm": 0.40617537000008047, + "learning_rate": 2.813432835820895e-07, + "loss": -0.0014, + "reward": 0.8333333134651184, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 754 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.8333282470703, + "epoch": 5.6716417910447765, + "grad_norm": 0.8272131961375812, + "learning_rate": 2.8171641791044775e-07, + "loss": 0.0003, + "reward": 0.5555555820465088, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 755 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.80555725097656, + "epoch": 5.67910447761194, + "grad_norm": 0.32174187394980786, + "learning_rate": 2.82089552238806e-07, + "loss": 0.0004, + "reward": 0.3611111044883728, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 756 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.2777862548828, + "epoch": 5.686567164179104, + "grad_norm": 1.0135065392756037, + "learning_rate": 2.8246268656716416e-07, + "loss": 0.0006, + "reward": 0.4444444477558136, + "reward_std": 0.3505222499370575, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 757 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.5833282470703, + "epoch": 5.6940298507462686, + "grad_norm": 0.6214144542420277, + "learning_rate": 2.828358208955224e-07, + "loss": 0.0003, + "reward": 0.6666666865348816, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 758 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.9166717529297, + "epoch": 5.701492537313433, + "grad_norm": 0.6863166141464792, + "learning_rate": 2.8320895522388063e-07, + "loss": 0.0004, + "reward": 0.694444477558136, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 759 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.25, + "epoch": 5.708955223880597, + "grad_norm": 0.5502526409962936, + "learning_rate": 2.8358208955223876e-07, + "loss": -0.0008, + "reward": 0.6388888955116272, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 760 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.63888549804688, + "epoch": 5.7164179104477615, + "grad_norm": 0.4788245688522374, + "learning_rate": 2.83955223880597e-07, + "loss": 0.0019, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 761 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.69444274902344, + "epoch": 5.723880597014926, + "grad_norm": 3.3676394901430555, + "learning_rate": 2.843283582089552e-07, + "loss": -0.0004, + "reward": 0.6666666865348816, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 762 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.4166717529297, + "epoch": 5.731343283582089, + "grad_norm": 0.5217068100717336, + "learning_rate": 2.847014925373134e-07, + "loss": -0.0007, + "reward": 0.6666666865348816, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 763 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.72222900390625, + "epoch": 5.7388059701492535, + "grad_norm": 0.841850559578863, + "learning_rate": 2.8507462686567164e-07, + "loss": -0.0003, + "reward": 0.694444477558136, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 764 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.3333282470703, + "epoch": 5.746268656716418, + "grad_norm": 0.7381412754582084, + "learning_rate": 2.8544776119402987e-07, + "loss": -0.0012, + "reward": 0.5555555820465088, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 765 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.69444274902344, + "epoch": 5.753731343283582, + "grad_norm": 2.935778054745247, + "learning_rate": 2.8582089552238805e-07, + "loss": 0.0019, + "reward": 0.694444477558136, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 766 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.5277862548828, + "epoch": 5.7611940298507465, + "grad_norm": 0.36491669466280346, + "learning_rate": 2.8619402985074623e-07, + "loss": -0.0004, + "reward": 0.4444444477558136, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 767 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.30555725097656, + "epoch": 5.768656716417911, + "grad_norm": 0.6272944346802753, + "learning_rate": 2.8656716417910447e-07, + "loss": -0.0006, + "reward": 0.472222238779068, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 768 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.94444274902344, + "epoch": 5.776119402985074, + "grad_norm": 0.6372039551673992, + "learning_rate": 2.8694029850746265e-07, + "loss": -0.0007, + "reward": 0.75, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 769 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.4166717529297, + "epoch": 5.7835820895522385, + "grad_norm": 0.37207516132670027, + "learning_rate": 2.873134328358209e-07, + "loss": 0.0004, + "reward": 0.8888888955116272, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 770 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.5833282470703, + "epoch": 5.791044776119403, + "grad_norm": 0.46882177387665497, + "learning_rate": 2.876865671641791e-07, + "loss": 0.0005, + "reward": 0.5833333134651184, + "reward_std": 0.31215566396713257, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 771 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.22222900390625, + "epoch": 5.798507462686567, + "grad_norm": 0.2703869982397011, + "learning_rate": 2.880597014925373e-07, + "loss": 0.0, + "reward": 0.694444477558136, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 772 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.86111450195312, + "epoch": 5.8059701492537314, + "grad_norm": 0.7426897503378797, + "learning_rate": 2.884328358208955e-07, + "loss": -0.0002, + "reward": 0.5277777910232544, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 773 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.69444274902344, + "epoch": 5.813432835820896, + "grad_norm": 0.5382189600744001, + "learning_rate": 2.888059701492537e-07, + "loss": 0.0, + "reward": 0.6388888955116272, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 774 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.8888931274414, + "epoch": 5.82089552238806, + "grad_norm": 0.668195050436922, + "learning_rate": 2.891791044776119e-07, + "loss": 0.0, + "reward": 0.6388888955116272, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 775 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.4166717529297, + "epoch": 5.8283582089552235, + "grad_norm": 0.4000201604548237, + "learning_rate": 2.8955223880597013e-07, + "loss": -0.0004, + "reward": 0.6388888955116272, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 776 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.63888549804688, + "epoch": 5.835820895522388, + "grad_norm": 0.8082843754748689, + "learning_rate": 2.8992537313432836e-07, + "loss": -0.0, + "reward": 0.3333333432674408, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 777 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.88888549804688, + "epoch": 5.843283582089552, + "grad_norm": 0.4942703808852002, + "learning_rate": 2.902985074626866e-07, + "loss": -0.0004, + "reward": 0.6111111044883728, + "reward_std": 0.19245009124279022, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 778 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.72222900390625, + "epoch": 5.850746268656716, + "grad_norm": 0.8004354714543961, + "learning_rate": 2.906716417910448e-07, + "loss": -0.0001, + "reward": 0.694444477558136, + "reward_std": 0.3591167628765106, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 779 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.36111450195312, + "epoch": 5.858208955223881, + "grad_norm": 0.9782995880924998, + "learning_rate": 2.9104477611940296e-07, + "loss": 0.0011, + "reward": 0.5555555820465088, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 780 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.5, + "epoch": 5.865671641791045, + "grad_norm": 0.6824936121321888, + "learning_rate": 2.914179104477612e-07, + "loss": -0.0004, + "reward": 0.5555555820465088, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 781 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.97222900390625, + "epoch": 5.8731343283582085, + "grad_norm": 0.45768125268954674, + "learning_rate": 2.9179104477611937e-07, + "loss": 0.0002, + "reward": 0.6666666865348816, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 782 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.9166717529297, + "epoch": 5.880597014925373, + "grad_norm": 0.5246941173063537, + "learning_rate": 2.921641791044776e-07, + "loss": -0.0004, + "reward": 0.3055555522441864, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 783 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.22222900390625, + "epoch": 5.888059701492537, + "grad_norm": 0.3446543428146729, + "learning_rate": 2.9253731343283584e-07, + "loss": -0.0008, + "reward": 0.3888888955116272, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 784 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.19444274902344, + "epoch": 5.895522388059701, + "grad_norm": 1.1434251159765205, + "learning_rate": 2.92910447761194e-07, + "loss": -0.0002, + "reward": 0.5555555820465088, + "reward_std": 0.3505222499370575, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 785 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.30555725097656, + "epoch": 5.902985074626866, + "grad_norm": 1.8871599134259325, + "learning_rate": 2.9328358208955225e-07, + "loss": -0.0006, + "reward": 0.472222238779068, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 786 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.5833282470703, + "epoch": 5.91044776119403, + "grad_norm": 0.5381751281224132, + "learning_rate": 2.9365671641791043e-07, + "loss": 0.0013, + "reward": 0.6666666865348816, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 787 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.94444274902344, + "epoch": 5.917910447761194, + "grad_norm": 1.0033799997975572, + "learning_rate": 2.940298507462686e-07, + "loss": -0.0007, + "reward": 0.4166666567325592, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 788 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.30555725097656, + "epoch": 5.925373134328359, + "grad_norm": 0.6665163538960748, + "learning_rate": 2.9440298507462685e-07, + "loss": -0.0012, + "reward": 0.8055555820465088, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 789 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.69444274902344, + "epoch": 5.932835820895522, + "grad_norm": 0.36511722670728114, + "learning_rate": 2.947761194029851e-07, + "loss": 0.0004, + "reward": 0.4166666567325592, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 790 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.19444274902344, + "epoch": 5.940298507462686, + "grad_norm": 0.2572045067756279, + "learning_rate": 2.9514925373134326e-07, + "loss": 0.0007, + "reward": 0.3055555522441864, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 791 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.55555725097656, + "epoch": 5.947761194029851, + "grad_norm": 0.404926686750617, + "learning_rate": 2.955223880597015e-07, + "loss": 0.0001, + "reward": 0.4166666567325592, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 792 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.5, + "epoch": 5.955223880597015, + "grad_norm": 0.8125856300593941, + "learning_rate": 2.958955223880597e-07, + "loss": -0.0003, + "reward": 0.3333333432674408, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 793 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.2777862548828, + "epoch": 5.962686567164179, + "grad_norm": 0.6129517422159235, + "learning_rate": 2.9626865671641786e-07, + "loss": 0.0007, + "reward": 0.5833333134651184, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 794 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.13888549804688, + "epoch": 5.970149253731344, + "grad_norm": 0.1416088442367799, + "learning_rate": 2.966417910447761e-07, + "loss": 0.0001, + "reward": 0.4166666567325592, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 795 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.36111450195312, + "epoch": 5.977611940298507, + "grad_norm": 0.5669035836258952, + "learning_rate": 2.9701492537313433e-07, + "loss": 0.0007, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 796 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.44444274902344, + "epoch": 5.985074626865671, + "grad_norm": 0.7342025579580627, + "learning_rate": 2.973880597014925e-07, + "loss": 0.0061, + "reward": 0.6388888955116272, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 797 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.08334350585938, + "epoch": 5.992537313432836, + "grad_norm": 0.45860594767708845, + "learning_rate": 2.9776119402985074e-07, + "loss": 0.002, + "reward": 0.8611111044883728, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 798 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.0, + "epoch": 6.007462686567164, + "grad_norm": 0.28857711509704287, + "learning_rate": 2.98134328358209e-07, + "loss": -0.0006, + "reward": 0.472222238779068, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 799 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.97222900390625, + "epoch": 6.014925373134329, + "grad_norm": 0.5030845790059609, + "learning_rate": 2.985074626865671e-07, + "loss": -0.0005, + "reward": 0.7222222089767456, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 800 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.13888549804688, + "epoch": 6.022388059701493, + "grad_norm": 1.511301301602685, + "learning_rate": 2.9888059701492534e-07, + "loss": 0.0001, + "reward": 0.694444477558136, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 801 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.22222900390625, + "epoch": 6.029850746268656, + "grad_norm": 0.6718817451600587, + "learning_rate": 2.9925373134328357e-07, + "loss": 0.0008, + "reward": 0.472222238779068, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 802 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.4166717529297, + "epoch": 6.037313432835821, + "grad_norm": 0.3334784292087923, + "learning_rate": 2.996268656716418e-07, + "loss": -0.0011, + "reward": 0.472222238779068, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 803 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.36111450195312, + "epoch": 6.044776119402985, + "grad_norm": 0.21935664210770647, + "learning_rate": 3e-07, + "loss": -0.0001, + "reward": 0.472222238779068, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 804 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.2777862548828, + "epoch": 6.052238805970149, + "grad_norm": 0.4110107358214317, + "learning_rate": 3.003731343283582e-07, + "loss": -0.0003, + "reward": 0.6111111044883728, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 805 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.6666717529297, + "epoch": 6.059701492537314, + "grad_norm": 1.1097242625669281, + "learning_rate": 3.0074626865671645e-07, + "loss": -0.0001, + "reward": 0.5833333134651184, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 806 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.86111450195312, + "epoch": 6.067164179104478, + "grad_norm": 0.30672783474865917, + "learning_rate": 3.011194029850746e-07, + "loss": -0.0002, + "reward": 0.5277777910232544, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 807 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.4166717529297, + "epoch": 6.074626865671641, + "grad_norm": 0.8518500951010933, + "learning_rate": 3.014925373134328e-07, + "loss": 0.0021, + "reward": 0.4444444477558136, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 808 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.30555725097656, + "epoch": 6.082089552238806, + "grad_norm": 0.9965864499685619, + "learning_rate": 3.0186567164179105e-07, + "loss": 0.0013, + "reward": 0.5833333134651184, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 809 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.11111450195312, + "epoch": 6.08955223880597, + "grad_norm": 0.42046137420268215, + "learning_rate": 3.0223880597014923e-07, + "loss": 0.0001, + "reward": 0.5277777910232544, + "reward_std": 0.24800565838813782, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 810 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.83333587646484, + "epoch": 6.097014925373134, + "grad_norm": 0.8049923890764543, + "learning_rate": 3.0261194029850746e-07, + "loss": -0.0002, + "reward": 0.5833333134651184, + "reward_std": 0.24800565838813782, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 811 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.19444274902344, + "epoch": 6.104477611940299, + "grad_norm": 0.5405546711126664, + "learning_rate": 3.029850746268657e-07, + "loss": 0.0002, + "reward": 0.4166666567325592, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 812 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.5, + "epoch": 6.111940298507463, + "grad_norm": 0.7247251782397557, + "learning_rate": 3.033582089552238e-07, + "loss": -0.0002, + "reward": 0.5277777910232544, + "reward_std": 0.3591167628765106, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 813 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.5833282470703, + "epoch": 6.119402985074627, + "grad_norm": 0.5725647944270323, + "learning_rate": 3.0373134328358206e-07, + "loss": 0.0013, + "reward": 0.5555555820465088, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 814 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.97222137451172, + "epoch": 6.126865671641791, + "grad_norm": 0.28767236134998825, + "learning_rate": 3.041044776119403e-07, + "loss": 0.0004, + "reward": 0.25, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.0, + "step": 815 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.6666717529297, + "epoch": 6.134328358208955, + "grad_norm": 0.5296751710399202, + "learning_rate": 3.044776119402985e-07, + "loss": -0.0015, + "reward": 0.472222238779068, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 816 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.0833282470703, + "epoch": 6.141791044776119, + "grad_norm": 0.5845171634137076, + "learning_rate": 3.048507462686567e-07, + "loss": -0.0062, + "reward": 0.6666666865348816, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 817 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.8888931274414, + "epoch": 6.149253731343284, + "grad_norm": 0.5218219165501611, + "learning_rate": 3.0522388059701494e-07, + "loss": -0.0001, + "reward": 0.6111111044883728, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 818 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.1666717529297, + "epoch": 6.156716417910448, + "grad_norm": 0.5982432958967095, + "learning_rate": 3.055970149253731e-07, + "loss": -0.0003, + "reward": 0.5, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 819 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.9166717529297, + "epoch": 6.164179104477612, + "grad_norm": 0.30345021780917714, + "learning_rate": 3.059701492537313e-07, + "loss": -0.0, + "reward": 0.472222238779068, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 820 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.6666717529297, + "epoch": 6.1716417910447765, + "grad_norm": 0.7217362659065683, + "learning_rate": 3.0634328358208954e-07, + "loss": 0.0007, + "reward": 0.7222222089767456, + "reward_std": 0.36771121621131897, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 821 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.19444274902344, + "epoch": 6.17910447761194, + "grad_norm": 0.47951649864429824, + "learning_rate": 3.067164179104477e-07, + "loss": -0.0007, + "reward": 0.5555555820465088, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 822 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.1666717529297, + "epoch": 6.186567164179104, + "grad_norm": 0.6060642130355822, + "learning_rate": 3.0708955223880595e-07, + "loss": 0.0003, + "reward": 0.5, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 823 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.69444274902344, + "epoch": 6.1940298507462686, + "grad_norm": 0.32591584327353523, + "learning_rate": 3.074626865671642e-07, + "loss": 0.0004, + "reward": 0.6388888955116272, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 824 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.38888549804688, + "epoch": 6.201492537313433, + "grad_norm": 0.27083834144880353, + "learning_rate": 3.0783582089552237e-07, + "loss": 0.0005, + "reward": 0.694444477558136, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 825 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.0277862548828, + "epoch": 6.208955223880597, + "grad_norm": 0.18743742358234525, + "learning_rate": 3.0820895522388055e-07, + "loss": 0.0003, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 826 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.13888549804688, + "epoch": 6.2164179104477615, + "grad_norm": 0.5703864704698696, + "learning_rate": 3.085820895522388e-07, + "loss": 0.0014, + "reward": 0.6666666865348816, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 827 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.75, + "epoch": 6.223880597014926, + "grad_norm": 0.5847709057773991, + "learning_rate": 3.08955223880597e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 828 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.22222900390625, + "epoch": 6.231343283582089, + "grad_norm": 0.7669769871606118, + "learning_rate": 3.093283582089552e-07, + "loss": -0.001, + "reward": 0.8333333134651184, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 829 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.0833282470703, + "epoch": 6.2388059701492535, + "grad_norm": 0.7205936399633583, + "learning_rate": 3.0970149253731343e-07, + "loss": -0.0019, + "reward": 0.7777777910232544, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 830 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.5, + "epoch": 6.246268656716418, + "grad_norm": 0.6171050366088318, + "learning_rate": 3.1007462686567166e-07, + "loss": -0.0007, + "reward": 0.6666666865348816, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 831 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.22222900390625, + "epoch": 6.253731343283582, + "grad_norm": 0.4452926317608126, + "learning_rate": 3.1044776119402985e-07, + "loss": -0.0001, + "reward": 0.7222222089767456, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 832 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.55555725097656, + "epoch": 6.2611940298507465, + "grad_norm": 0.8187657330219434, + "learning_rate": 3.10820895522388e-07, + "loss": -0.0004, + "reward": 0.4444444477558136, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 833 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.72222900390625, + "epoch": 6.268656716417911, + "grad_norm": 0.5442412025445268, + "learning_rate": 3.1119402985074626e-07, + "loss": 0.0008, + "reward": 0.3888888955116272, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 834 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.19444274902344, + "epoch": 6.276119402985074, + "grad_norm": 0.5022011251544449, + "learning_rate": 3.1156716417910444e-07, + "loss": -0.0006, + "reward": 0.472222238779068, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 835 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.88888549804688, + "epoch": 6.2835820895522385, + "grad_norm": 0.5237507835534335, + "learning_rate": 3.119402985074627e-07, + "loss": 0.0013, + "reward": 0.694444477558136, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 836 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.75, + "epoch": 6.291044776119403, + "grad_norm": 0.5692022247061234, + "learning_rate": 3.123134328358209e-07, + "loss": 0.0014, + "reward": 0.2777777910232544, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.2777777910232544, + "rewards/format_reward": 0.0, + "step": 837 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.5277862548828, + "epoch": 6.298507462686567, + "grad_norm": 1.2221314696122545, + "learning_rate": 3.126865671641791e-07, + "loss": 0.0018, + "reward": 0.4166666567325592, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 838 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.69444274902344, + "epoch": 6.3059701492537314, + "grad_norm": 1.1194202972038347, + "learning_rate": 3.130597014925373e-07, + "loss": 0.0018, + "reward": 0.6666666865348816, + "reward_std": 0.3333333432674408, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 839 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.97222900390625, + "epoch": 6.313432835820896, + "grad_norm": 0.6100977415351281, + "learning_rate": 3.134328358208955e-07, + "loss": -0.0008, + "reward": 0.4444444477558136, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 840 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.30555725097656, + "epoch": 6.32089552238806, + "grad_norm": 1.6155940317610762, + "learning_rate": 3.138059701492537e-07, + "loss": 0.0011, + "reward": 0.5, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 841 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.3333282470703, + "epoch": 6.3283582089552235, + "grad_norm": 0.25403137735963216, + "learning_rate": 3.141791044776119e-07, + "loss": 0.0001, + "reward": 0.5277777910232544, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 842 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.94444274902344, + "epoch": 6.335820895522388, + "grad_norm": 0.41283335749962863, + "learning_rate": 3.1455223880597015e-07, + "loss": -0.0001, + "reward": 0.472222238779068, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 843 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.75, + "epoch": 6.343283582089552, + "grad_norm": 0.7327313479676646, + "learning_rate": 3.1492537313432833e-07, + "loss": 0.0011, + "reward": 0.8055555820465088, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 844 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.5277862548828, + "epoch": 6.350746268656716, + "grad_norm": 0.0, + "learning_rate": 3.1529850746268657e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 845 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.61111450195312, + "epoch": 6.358208955223881, + "grad_norm": 0.7792319517526697, + "learning_rate": 3.1567164179104475e-07, + "loss": -0.0, + "reward": 0.3055555522441864, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 846 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.61111450195312, + "epoch": 6.365671641791045, + "grad_norm": 0.25402370796491147, + "learning_rate": 3.1604477611940293e-07, + "loss": -0.0001, + "reward": 0.3611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 847 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.3888931274414, + "epoch": 6.373134328358209, + "grad_norm": 0.0, + "learning_rate": 3.1641791044776116e-07, + "loss": 0.0, + "reward": 0.3333333432674408, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 848 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.2777862548828, + "epoch": 6.380597014925373, + "grad_norm": 0.4363070317293884, + "learning_rate": 3.167910447761194e-07, + "loss": 0.0015, + "reward": 0.4444444477558136, + "reward_std": 0.36771121621131897, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 849 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.05555725097656, + "epoch": 6.388059701492537, + "grad_norm": 0.724035653204462, + "learning_rate": 3.1716417910447763e-07, + "loss": 0.0005, + "reward": 0.694444477558136, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 850 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.13888549804688, + "epoch": 6.395522388059701, + "grad_norm": 0.5678130661167686, + "learning_rate": 3.175373134328358e-07, + "loss": 0.0015, + "reward": 0.4166666567325592, + "reward_std": 0.31215566396713257, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 851 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.72222900390625, + "epoch": 6.402985074626866, + "grad_norm": 0.5717810460830539, + "learning_rate": 3.1791044776119405e-07, + "loss": 0.0005, + "reward": 0.6666666865348816, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 852 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.22222900390625, + "epoch": 6.41044776119403, + "grad_norm": 0.30069645535333617, + "learning_rate": 3.182835820895522e-07, + "loss": 0.0009, + "reward": 0.5277777910232544, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 853 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.0, + "epoch": 6.417910447761194, + "grad_norm": 0.4068471824401456, + "learning_rate": 3.186567164179104e-07, + "loss": 0.0009, + "reward": 0.6111111044883728, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 854 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.63888549804688, + "epoch": 6.425373134328359, + "grad_norm": 0.9539706535600485, + "learning_rate": 3.1902985074626864e-07, + "loss": 0.0012, + "reward": 0.75, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 855 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.5833282470703, + "epoch": 6.432835820895522, + "grad_norm": 1.1663999417864759, + "learning_rate": 3.194029850746269e-07, + "loss": 0.0511, + "reward": 0.4166666567325592, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 856 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.55555725097656, + "epoch": 6.440298507462686, + "grad_norm": 0.6015825161013132, + "learning_rate": 3.1977611940298506e-07, + "loss": -0.0006, + "reward": 0.6111111044883728, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 857 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.5277862548828, + "epoch": 6.447761194029851, + "grad_norm": 0.6751835841273415, + "learning_rate": 3.201492537313433e-07, + "loss": 0.0001, + "reward": 0.5, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 858 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.7777862548828, + "epoch": 6.455223880597015, + "grad_norm": 0.30142065229228493, + "learning_rate": 3.205223880597015e-07, + "loss": -0.0, + "reward": 0.472222238779068, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 859 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.61111450195312, + "epoch": 6.462686567164179, + "grad_norm": 0.6241908816097965, + "learning_rate": 3.2089552238805965e-07, + "loss": -0.0007, + "reward": 0.5833333134651184, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 860 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.86111450195312, + "epoch": 6.470149253731344, + "grad_norm": 0.6871845450234303, + "learning_rate": 3.212686567164179e-07, + "loss": 0.0004, + "reward": 0.5, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 861 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.19444274902344, + "epoch": 6.477611940298507, + "grad_norm": 0.8253927231349081, + "learning_rate": 3.216417910447761e-07, + "loss": 0.0368, + "reward": 0.3611111044883728, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 862 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.47222900390625, + "epoch": 6.485074626865671, + "grad_norm": 0.5135627429975638, + "learning_rate": 3.220149253731343e-07, + "loss": 0.0011, + "reward": 0.5, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 863 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.44444274902344, + "epoch": 6.492537313432836, + "grad_norm": 0.72893829662553, + "learning_rate": 3.2238805970149253e-07, + "loss": -0.0001, + "reward": 0.5833333134651184, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 864 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.0833282470703, + "epoch": 6.5, + "grad_norm": 0.3161432010086615, + "learning_rate": 3.2276119402985077e-07, + "loss": -0.0003, + "reward": 0.4166666567325592, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 865 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.11111450195312, + "epoch": 6.507462686567164, + "grad_norm": 0.44469189857948954, + "learning_rate": 3.231343283582089e-07, + "loss": -0.0001, + "reward": 0.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 866 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.1666717529297, + "epoch": 6.514925373134329, + "grad_norm": 0.35191153750614435, + "learning_rate": 3.2350746268656713e-07, + "loss": -0.0015, + "reward": 0.3611111044883728, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 867 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.94444274902344, + "epoch": 6.522388059701493, + "grad_norm": 0.437734193559192, + "learning_rate": 3.2388059701492536e-07, + "loss": 0.0002, + "reward": 0.6388888955116272, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 868 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.72222900390625, + "epoch": 6.529850746268656, + "grad_norm": 1.2443343407532304, + "learning_rate": 3.2425373134328354e-07, + "loss": -0.0014, + "reward": 0.5277777910232544, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 869 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.0277862548828, + "epoch": 6.537313432835821, + "grad_norm": 0.5190004306769375, + "learning_rate": 3.246268656716418e-07, + "loss": -0.0001, + "reward": 0.6666666865348816, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 870 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.2777862548828, + "epoch": 6.544776119402985, + "grad_norm": 0.5800323602033822, + "learning_rate": 3.25e-07, + "loss": -0.0006, + "reward": 0.4444444477558136, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 871 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.75, + "epoch": 6.552238805970149, + "grad_norm": 0.5555053946020634, + "learning_rate": 3.253731343283582e-07, + "loss": 0.0008, + "reward": 0.472222238779068, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 872 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.69444274902344, + "epoch": 6.559701492537314, + "grad_norm": 1.0858773791479708, + "learning_rate": 3.257462686567164e-07, + "loss": 0.0009, + "reward": 0.7222222089767456, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 873 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.6666717529297, + "epoch": 6.567164179104478, + "grad_norm": 0.806769734881186, + "learning_rate": 3.261194029850746e-07, + "loss": 0.0018, + "reward": 0.8055555820465088, + "reward_std": 0.24800565838813782, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 874 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.80555725097656, + "epoch": 6.574626865671641, + "grad_norm": 0.6790444300095406, + "learning_rate": 3.2649253731343284e-07, + "loss": 0.0011, + "reward": 0.694444477558136, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 875 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.1666717529297, + "epoch": 6.582089552238806, + "grad_norm": 1.4037952029782652, + "learning_rate": 3.26865671641791e-07, + "loss": 0.0014, + "reward": 0.5555555820465088, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 876 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.19444274902344, + "epoch": 6.58955223880597, + "grad_norm": 0.3235122593051687, + "learning_rate": 3.2723880597014926e-07, + "loss": 0.0001, + "reward": 0.472222238779068, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 877 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.3333282470703, + "epoch": 6.597014925373134, + "grad_norm": 0.30772143978440036, + "learning_rate": 3.276119402985075e-07, + "loss": 0.0002, + "reward": 0.8055555820465088, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 878 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.47222900390625, + "epoch": 6.604477611940299, + "grad_norm": 0.27228574712821246, + "learning_rate": 3.279850746268656e-07, + "loss": 0.0002, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 879 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.80555725097656, + "epoch": 6.611940298507463, + "grad_norm": 1.673326130442715, + "learning_rate": 3.2835820895522385e-07, + "loss": 0.0011, + "reward": 0.6666666865348816, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 880 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.72222900390625, + "epoch": 6.619402985074627, + "grad_norm": 0.6198466131101574, + "learning_rate": 3.287313432835821e-07, + "loss": -0.0008, + "reward": 0.694444477558136, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 881 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.5, + "epoch": 6.6268656716417915, + "grad_norm": 0.0, + "learning_rate": 3.2910447761194027e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 882 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.0, + "epoch": 6.634328358208955, + "grad_norm": 0.4140955202602445, + "learning_rate": 3.294776119402985e-07, + "loss": -0.0005, + "reward": 0.694444477558136, + "reward_std": 0.2949666976928711, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 883 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.88888549804688, + "epoch": 6.641791044776119, + "grad_norm": 0.0, + "learning_rate": 3.2985074626865673e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 884 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.75, + "epoch": 6.649253731343284, + "grad_norm": 0.3823781312223135, + "learning_rate": 3.302238805970149e-07, + "loss": -0.0008, + "reward": 0.6388888955116272, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 885 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.30555725097656, + "epoch": 6.656716417910448, + "grad_norm": 0.4281332364414811, + "learning_rate": 3.305970149253731e-07, + "loss": -0.0005, + "reward": 0.8333333134651184, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 886 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.88888549804688, + "epoch": 6.664179104477612, + "grad_norm": 0.3665846587944162, + "learning_rate": 3.3097014925373133e-07, + "loss": -0.0005, + "reward": 0.472222238779068, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 887 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.38888549804688, + "epoch": 6.6716417910447765, + "grad_norm": 0.5268833664251815, + "learning_rate": 3.313432835820895e-07, + "loss": 0.0012, + "reward": 0.4166666567325592, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 888 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.7777862548828, + "epoch": 6.67910447761194, + "grad_norm": 0.38225760022993854, + "learning_rate": 3.3171641791044774e-07, + "loss": 0.0002, + "reward": 0.5833333134651184, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 889 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.4166717529297, + "epoch": 6.686567164179104, + "grad_norm": 1.0945668034907639, + "learning_rate": 3.32089552238806e-07, + "loss": 0.0021, + "reward": 0.5277777910232544, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 890 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.94444274902344, + "epoch": 6.6940298507462686, + "grad_norm": 0.5589129125989432, + "learning_rate": 3.3246268656716416e-07, + "loss": 0.0008, + "reward": 0.5277777910232544, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 891 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.05555725097656, + "epoch": 6.701492537313433, + "grad_norm": 0.5476779338967912, + "learning_rate": 3.328358208955224e-07, + "loss": 0.001, + "reward": 0.5277777910232544, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 892 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.22222900390625, + "epoch": 6.708955223880597, + "grad_norm": 0.714010618441079, + "learning_rate": 3.332089552238806e-07, + "loss": 0.0027, + "reward": 0.694444477558136, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 893 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.80555725097656, + "epoch": 6.7164179104477615, + "grad_norm": 0.6928067305495725, + "learning_rate": 3.3358208955223875e-07, + "loss": -0.0012, + "reward": 0.6388888955116272, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 894 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.69444274902344, + "epoch": 6.723880597014926, + "grad_norm": 0.6128029000107553, + "learning_rate": 3.33955223880597e-07, + "loss": -0.0021, + "reward": 0.5, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 895 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.13888549804688, + "epoch": 6.731343283582089, + "grad_norm": 0.18130102724223918, + "learning_rate": 3.343283582089552e-07, + "loss": 0.0002, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 896 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.80555725097656, + "epoch": 6.7388059701492535, + "grad_norm": 0.4668906414446711, + "learning_rate": 3.347014925373134e-07, + "loss": 0.0001, + "reward": 0.6111111044883728, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 897 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.0277862548828, + "epoch": 6.746268656716418, + "grad_norm": 1.1497133295871413, + "learning_rate": 3.3507462686567164e-07, + "loss": -0.0005, + "reward": 0.4444444477558136, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 898 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.55555725097656, + "epoch": 6.753731343283582, + "grad_norm": 0.6214688458814269, + "learning_rate": 3.354477611940298e-07, + "loss": 0.0002, + "reward": 0.3611111044883728, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 899 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.2777862548828, + "epoch": 6.7611940298507465, + "grad_norm": 0.5440047950971272, + "learning_rate": 3.3582089552238805e-07, + "loss": -0.0007, + "reward": 0.5833333134651184, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 900 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.19444274902344, + "epoch": 6.768656716417911, + "grad_norm": 0.44853630583995, + "learning_rate": 3.3619402985074623e-07, + "loss": 0.0002, + "reward": 0.5277777910232544, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 901 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.44444274902344, + "epoch": 6.776119402985074, + "grad_norm": 0.4506434761009768, + "learning_rate": 3.3656716417910447e-07, + "loss": 0.0009, + "reward": 0.6666666865348816, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 902 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.1666717529297, + "epoch": 6.7835820895522385, + "grad_norm": 0.7048404302473171, + "learning_rate": 3.369402985074627e-07, + "loss": 0.0002, + "reward": 0.5, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 903 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.97222900390625, + "epoch": 6.791044776119403, + "grad_norm": 0.6786019531715062, + "learning_rate": 3.373134328358209e-07, + "loss": 0.001, + "reward": 0.5833333134651184, + "reward_std": 0.3591167628765106, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 904 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.5833282470703, + "epoch": 6.798507462686567, + "grad_norm": 0.5592549161104607, + "learning_rate": 3.376865671641791e-07, + "loss": 0.0009, + "reward": 0.6666666865348816, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 905 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.13888549804688, + "epoch": 6.8059701492537314, + "grad_norm": 0.699453339202755, + "learning_rate": 3.380597014925373e-07, + "loss": 0.0011, + "reward": 0.4166666567325592, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 906 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.47222900390625, + "epoch": 6.813432835820896, + "grad_norm": 0.455904866635887, + "learning_rate": 3.384328358208955e-07, + "loss": -0.0003, + "reward": 0.6111111044883728, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 907 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.38888549804688, + "epoch": 6.82089552238806, + "grad_norm": 0.8775086168080256, + "learning_rate": 3.388059701492537e-07, + "loss": 0.0022, + "reward": 0.6666666865348816, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 908 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.6666717529297, + "epoch": 6.8283582089552235, + "grad_norm": 0.8119906878202771, + "learning_rate": 3.3917910447761194e-07, + "loss": -0.0009, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 909 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.0, + "epoch": 6.835820895522388, + "grad_norm": 0.9422564142856097, + "learning_rate": 3.395522388059701e-07, + "loss": -0.0005, + "reward": 0.3888888955116272, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 910 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.86111450195312, + "epoch": 6.843283582089552, + "grad_norm": 0.36653234492953574, + "learning_rate": 3.3992537313432836e-07, + "loss": 0.0, + "reward": 0.8333333134651184, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 911 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.88888549804688, + "epoch": 6.850746268656716, + "grad_norm": 1.0585017373124557, + "learning_rate": 3.402985074626866e-07, + "loss": -0.0006, + "reward": 0.7222222089767456, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 912 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.9166717529297, + "epoch": 6.858208955223881, + "grad_norm": 0.4080829063602073, + "learning_rate": 3.406716417910447e-07, + "loss": -0.0005, + "reward": 0.6111111044883728, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 913 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.11111450195312, + "epoch": 6.865671641791045, + "grad_norm": 0.803205307569062, + "learning_rate": 3.4104477611940295e-07, + "loss": 0.0005, + "reward": 0.6111111044883728, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 914 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.5833282470703, + "epoch": 6.8731343283582085, + "grad_norm": 0.4208176718639878, + "learning_rate": 3.414179104477612e-07, + "loss": -0.0007, + "reward": 0.3333333432674408, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 915 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.61111450195312, + "epoch": 6.880597014925373, + "grad_norm": 0.589782192652892, + "learning_rate": 3.4179104477611937e-07, + "loss": -0.0004, + "reward": 0.4444444477558136, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 916 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.86111450195312, + "epoch": 6.888059701492537, + "grad_norm": 0.5644833205897645, + "learning_rate": 3.421641791044776e-07, + "loss": -0.0002, + "reward": 0.5, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 917 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.30555725097656, + "epoch": 6.895522388059701, + "grad_norm": 1.1383543016235462, + "learning_rate": 3.4253731343283584e-07, + "loss": -0.001, + "reward": 0.5555555820465088, + "reward_std": 0.25660011172294617, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 918 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.63888549804688, + "epoch": 6.902985074626866, + "grad_norm": 0.5491604947612096, + "learning_rate": 3.4291044776119396e-07, + "loss": -0.0021, + "reward": 0.472222238779068, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 919 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.47222900390625, + "epoch": 6.91044776119403, + "grad_norm": 1.2996753340448015, + "learning_rate": 3.432835820895522e-07, + "loss": 0.0006, + "reward": 0.25, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.0, + "step": 920 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.22222900390625, + "epoch": 6.917910447761194, + "grad_norm": 1.0683859031341938, + "learning_rate": 3.4365671641791043e-07, + "loss": 0.0003, + "reward": 0.7777777910232544, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 921 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.4166717529297, + "epoch": 6.925373134328359, + "grad_norm": 0.7544885795370516, + "learning_rate": 3.4402985074626867e-07, + "loss": -0.0073, + "reward": 0.5833333134651184, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 922 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.6666717529297, + "epoch": 6.932835820895522, + "grad_norm": 0.5644387706372216, + "learning_rate": 3.4440298507462685e-07, + "loss": 0.0015, + "reward": 0.472222238779068, + "reward_std": 0.24800565838813782, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 923 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.9166717529297, + "epoch": 6.940298507462686, + "grad_norm": 0.29777327172028917, + "learning_rate": 3.447761194029851e-07, + "loss": 0.0007, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 924 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.86111450195312, + "epoch": 6.947761194029851, + "grad_norm": 0.5342746551521812, + "learning_rate": 3.451492537313433e-07, + "loss": -0.0021, + "reward": 0.5277777910232544, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 925 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.6666717529297, + "epoch": 6.955223880597015, + "grad_norm": 0.8361239476429438, + "learning_rate": 3.4552238805970144e-07, + "loss": -0.0004, + "reward": 0.5833333134651184, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 926 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.75, + "epoch": 6.962686567164179, + "grad_norm": 0.9724823340248171, + "learning_rate": 3.458955223880597e-07, + "loss": 0.0014, + "reward": 0.5277777910232544, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 927 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.72222900390625, + "epoch": 6.970149253731344, + "grad_norm": 0.535012012098503, + "learning_rate": 3.462686567164179e-07, + "loss": -0.0001, + "reward": 0.694444477558136, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 928 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.58333587646484, + "epoch": 6.977611940298507, + "grad_norm": 0.2698629215190165, + "learning_rate": 3.466417910447761e-07, + "loss": 0.0004, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 929 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.6666717529297, + "epoch": 6.985074626865671, + "grad_norm": 0.5805352234029224, + "learning_rate": 3.470149253731343e-07, + "loss": 0.0, + "reward": 0.4166666567325592, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 930 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.83334350585938, + "epoch": 6.992537313432836, + "grad_norm": 0.46155767156158445, + "learning_rate": 3.4738805970149256e-07, + "loss": 0.0007, + "reward": 0.694444477558136, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 931 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.77777862548828, + "epoch": 7.007462686567164, + "grad_norm": 0.9666724926803923, + "learning_rate": 3.477611940298507e-07, + "loss": 0.002, + "reward": 0.7777777910232544, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 932 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.0833282470703, + "epoch": 7.014925373134329, + "grad_norm": 1.3191342407852429, + "learning_rate": 3.481343283582089e-07, + "loss": 0.0027, + "reward": 0.75, + "reward_std": 0.3591167628765106, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 933 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.97222900390625, + "epoch": 7.022388059701493, + "grad_norm": 0.25143569851570674, + "learning_rate": 3.4850746268656715e-07, + "loss": 0.0001, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 934 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.94444274902344, + "epoch": 7.029850746268656, + "grad_norm": 0.666644541654436, + "learning_rate": 3.4888059701492534e-07, + "loss": -0.0004, + "reward": 0.6666666865348816, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 935 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.05555725097656, + "epoch": 7.037313432835821, + "grad_norm": 0.3500840021145202, + "learning_rate": 3.4925373134328357e-07, + "loss": 0.0001, + "reward": 0.3055555522441864, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 936 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.05555725097656, + "epoch": 7.044776119402985, + "grad_norm": 0.5120881911982862, + "learning_rate": 3.496268656716418e-07, + "loss": -0.0009, + "reward": 0.5, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 937 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.3333282470703, + "epoch": 7.052238805970149, + "grad_norm": 0.44795803434566983, + "learning_rate": 3.5e-07, + "loss": 0.0006, + "reward": 0.4444444477558136, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 938 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.4166717529297, + "epoch": 7.059701492537314, + "grad_norm": 0.43011417735414925, + "learning_rate": 3.5037313432835817e-07, + "loss": 0.0003, + "reward": 0.5277777910232544, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 939 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.80555725097656, + "epoch": 7.067164179104478, + "grad_norm": 0.7120414053564085, + "learning_rate": 3.507462686567164e-07, + "loss": -0.001, + "reward": 0.5555555820465088, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 940 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.52777862548828, + "epoch": 7.074626865671641, + "grad_norm": 0.742219434141354, + "learning_rate": 3.511194029850746e-07, + "loss": -0.0, + "reward": 0.4444444477558136, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 941 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.11111450195312, + "epoch": 7.082089552238806, + "grad_norm": 0.39699792675438234, + "learning_rate": 3.514925373134328e-07, + "loss": 0.0005, + "reward": 0.6388888955116272, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 942 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.44444274902344, + "epoch": 7.08955223880597, + "grad_norm": 0.4170407694072715, + "learning_rate": 3.5186567164179105e-07, + "loss": 0.0011, + "reward": 0.5, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 943 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.94444274902344, + "epoch": 7.097014925373134, + "grad_norm": 0.4025139365688662, + "learning_rate": 3.5223880597014923e-07, + "loss": -0.0001, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 944 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.0, + "epoch": 7.104477611940299, + "grad_norm": 0.9649813093245444, + "learning_rate": 3.5261194029850746e-07, + "loss": 0.0009, + "reward": 0.6666666865348816, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 945 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.30555725097656, + "epoch": 7.111940298507463, + "grad_norm": 0.6390448435116549, + "learning_rate": 3.5298507462686564e-07, + "loss": -0.0001, + "reward": 0.694444477558136, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 946 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.13888549804688, + "epoch": 7.119402985074627, + "grad_norm": 0.4759844263787752, + "learning_rate": 3.533582089552239e-07, + "loss": 0.0001, + "reward": 0.3888888955116272, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 947 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.3333282470703, + "epoch": 7.126865671641791, + "grad_norm": 0.9770273678750572, + "learning_rate": 3.5373134328358206e-07, + "loss": 0.0001, + "reward": 0.5277777910232544, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 948 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.47222900390625, + "epoch": 7.134328358208955, + "grad_norm": 0.6011477089166477, + "learning_rate": 3.541044776119403e-07, + "loss": 0.0008, + "reward": 0.4166666567325592, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 949 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.30555725097656, + "epoch": 7.141791044776119, + "grad_norm": 0.42258373828993756, + "learning_rate": 3.544776119402985e-07, + "loss": 0.0, + "reward": 0.472222238779068, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 950 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.5833282470703, + "epoch": 7.149253731343284, + "grad_norm": 0.7073686594740538, + "learning_rate": 3.548507462686567e-07, + "loss": 0.0027, + "reward": 0.3888888955116272, + "reward_std": 0.39748334884643555, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 951 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.8333282470703, + "epoch": 7.156716417910448, + "grad_norm": 0.47165988002973236, + "learning_rate": 3.552238805970149e-07, + "loss": 0.0004, + "reward": 0.3333333432674408, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 952 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.69444274902344, + "epoch": 7.164179104477612, + "grad_norm": 0.35333271509406566, + "learning_rate": 3.555970149253731e-07, + "loss": -0.0001, + "reward": 0.6111111044883728, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 953 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.7777862548828, + "epoch": 7.1716417910447765, + "grad_norm": 0.48746750037588726, + "learning_rate": 3.559701492537313e-07, + "loss": 0.0022, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 954 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.72222137451172, + "epoch": 7.17910447761194, + "grad_norm": 0.24625035684178595, + "learning_rate": 3.5634328358208954e-07, + "loss": -0.0004, + "reward": 0.472222238779068, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 955 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.55555725097656, + "epoch": 7.186567164179104, + "grad_norm": 0.5015325829881636, + "learning_rate": 3.5671641791044777e-07, + "loss": -0.0008, + "reward": 0.6666666865348816, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 956 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.5, + "epoch": 7.1940298507462686, + "grad_norm": 0.9177161378254864, + "learning_rate": 3.5708955223880595e-07, + "loss": 0.0003, + "reward": 0.75, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 957 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.44444274902344, + "epoch": 7.201492537313433, + "grad_norm": 0.41468266283583877, + "learning_rate": 3.574626865671642e-07, + "loss": 0.0008, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 958 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.1666717529297, + "epoch": 7.208955223880597, + "grad_norm": 2.1643165305943506, + "learning_rate": 3.5783582089552237e-07, + "loss": -0.0011, + "reward": 0.6111111044883728, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 959 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.61111450195312, + "epoch": 7.2164179104477615, + "grad_norm": 0.4445719445304595, + "learning_rate": 3.5820895522388055e-07, + "loss": -0.0003, + "reward": 0.5833333134651184, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 960 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.5277862548828, + "epoch": 7.223880597014926, + "grad_norm": 0.35033637905847204, + "learning_rate": 3.585820895522388e-07, + "loss": 0.0005, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 961 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.47222900390625, + "epoch": 7.231343283582089, + "grad_norm": 0.26394698306291425, + "learning_rate": 3.58955223880597e-07, + "loss": 0.0001, + "reward": 0.6666666865348816, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 962 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.44444274902344, + "epoch": 7.2388059701492535, + "grad_norm": 0.2739764707114894, + "learning_rate": 3.593283582089552e-07, + "loss": 0.0006, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 963 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.25, + "epoch": 7.246268656716418, + "grad_norm": 0.41958893730789343, + "learning_rate": 3.5970149253731343e-07, + "loss": 0.0001, + "reward": 0.694444477558136, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 964 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.5277862548828, + "epoch": 7.253731343283582, + "grad_norm": 0.3062996721277127, + "learning_rate": 3.6007462686567166e-07, + "loss": 0.0002, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 965 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.8333282470703, + "epoch": 7.2611940298507465, + "grad_norm": 0.4451361167947593, + "learning_rate": 3.604477611940298e-07, + "loss": 0.0001, + "reward": 0.694444477558136, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 966 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.2777862548828, + "epoch": 7.268656716417911, + "grad_norm": 0.6572374376331671, + "learning_rate": 3.60820895522388e-07, + "loss": -0.0009, + "reward": 0.75, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 967 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.44444274902344, + "epoch": 7.276119402985074, + "grad_norm": 0.31925702858383415, + "learning_rate": 3.6119402985074626e-07, + "loss": 0.0004, + "reward": 0.1666666716337204, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 968 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.86111450195312, + "epoch": 7.2835820895522385, + "grad_norm": 0.0, + "learning_rate": 3.6156716417910444e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 969 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.19444274902344, + "epoch": 7.291044776119403, + "grad_norm": 0.5087152124350465, + "learning_rate": 3.6194029850746267e-07, + "loss": -0.0, + "reward": 0.4444444477558136, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 970 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.88888549804688, + "epoch": 7.298507462686567, + "grad_norm": 0.6622331866836744, + "learning_rate": 3.623134328358209e-07, + "loss": -0.0006, + "reward": 0.472222238779068, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 971 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.05555725097656, + "epoch": 7.3059701492537314, + "grad_norm": 0.3746993260377292, + "learning_rate": 3.626865671641791e-07, + "loss": 0.0, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 972 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.1666717529297, + "epoch": 7.313432835820896, + "grad_norm": 0.7531146468223188, + "learning_rate": 3.6305970149253727e-07, + "loss": -0.0005, + "reward": 0.4444444477558136, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 973 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.55555725097656, + "epoch": 7.32089552238806, + "grad_norm": 1.0031370365985477, + "learning_rate": 3.634328358208955e-07, + "loss": 0.0006, + "reward": 0.5, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 974 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.19444274902344, + "epoch": 7.3283582089552235, + "grad_norm": 0.3117938985266183, + "learning_rate": 3.6380597014925374e-07, + "loss": -0.0008, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 975 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.69444274902344, + "epoch": 7.335820895522388, + "grad_norm": 1.3304645530528145, + "learning_rate": 3.641791044776119e-07, + "loss": -0.0001, + "reward": 0.5277777910232544, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 976 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.63888549804688, + "epoch": 7.343283582089552, + "grad_norm": 0.0, + "learning_rate": 3.6455223880597015e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 977 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.94444274902344, + "epoch": 7.350746268656716, + "grad_norm": 0.5709790049886416, + "learning_rate": 3.649253731343284e-07, + "loss": -0.0019, + "reward": 0.6111111044883728, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 978 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.8333282470703, + "epoch": 7.358208955223881, + "grad_norm": 0.6597059898111094, + "learning_rate": 3.652985074626865e-07, + "loss": -0.0001, + "reward": 0.472222238779068, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 979 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.72222900390625, + "epoch": 7.365671641791045, + "grad_norm": 0.8953383301726151, + "learning_rate": 3.6567164179104475e-07, + "loss": -0.002, + "reward": 0.5833333134651184, + "reward_std": 0.4060778021812439, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 980 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.22222900390625, + "epoch": 7.373134328358209, + "grad_norm": 0.5970876940201696, + "learning_rate": 3.66044776119403e-07, + "loss": 0.0005, + "reward": 0.5555555820465088, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 981 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.75, + "epoch": 7.380597014925373, + "grad_norm": 0.5038531879124916, + "learning_rate": 3.6641791044776116e-07, + "loss": 0.001, + "reward": 0.5555555820465088, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 982 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.36111450195312, + "epoch": 7.388059701492537, + "grad_norm": 0.3810624009572768, + "learning_rate": 3.667910447761194e-07, + "loss": 0.0011, + "reward": 0.6111111044883728, + "reward_std": 0.19245009124279022, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 983 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.0833282470703, + "epoch": 7.395522388059701, + "grad_norm": 0.3066555422717254, + "learning_rate": 3.6716417910447763e-07, + "loss": -0.0004, + "reward": 0.6111111044883728, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 984 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.75, + "epoch": 7.402985074626866, + "grad_norm": 0.3336826644993506, + "learning_rate": 3.6753731343283576e-07, + "loss": 0.0006, + "reward": 0.8333333134651184, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 985 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.30555725097656, + "epoch": 7.41044776119403, + "grad_norm": 0.5204748877633243, + "learning_rate": 3.67910447761194e-07, + "loss": 0.0003, + "reward": 0.7777777910232544, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 986 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.22222900390625, + "epoch": 7.417910447761194, + "grad_norm": 0.4712083528396498, + "learning_rate": 3.682835820895522e-07, + "loss": -0.0003, + "reward": 0.5833333134651184, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 987 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.63888549804688, + "epoch": 7.425373134328359, + "grad_norm": 0.3428921193287051, + "learning_rate": 3.686567164179104e-07, + "loss": -0.0004, + "reward": 0.694444477558136, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 988 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.1666717529297, + "epoch": 7.432835820895522, + "grad_norm": 0.1511745074992687, + "learning_rate": 3.6902985074626864e-07, + "loss": 0.0003, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 989 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.86111450195312, + "epoch": 7.440298507462686, + "grad_norm": 0.4442230113692002, + "learning_rate": 3.6940298507462687e-07, + "loss": 0.0007, + "reward": 0.6666666865348816, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 990 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.2777862548828, + "epoch": 7.447761194029851, + "grad_norm": 0.4509243947381531, + "learning_rate": 3.6977611940298505e-07, + "loss": 0.0003, + "reward": 0.5555555820465088, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 991 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.0, + "epoch": 7.455223880597015, + "grad_norm": 0.4025704357608743, + "learning_rate": 3.7014925373134323e-07, + "loss": -0.0009, + "reward": 0.3888888955116272, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 992 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.94444274902344, + "epoch": 7.462686567164179, + "grad_norm": 0.6820984672173313, + "learning_rate": 3.7052238805970147e-07, + "loss": 0.0002, + "reward": 0.4166666567325592, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 993 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.7777862548828, + "epoch": 7.470149253731344, + "grad_norm": 0.4107302880044209, + "learning_rate": 3.708955223880597e-07, + "loss": -0.0, + "reward": 0.5277777910232544, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 994 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.7777862548828, + "epoch": 7.477611940298507, + "grad_norm": 0.6469774103592335, + "learning_rate": 3.712686567164179e-07, + "loss": -0.0012, + "reward": 0.5277777910232544, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 995 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.6666717529297, + "epoch": 7.485074626865671, + "grad_norm": 0.6339073133078686, + "learning_rate": 3.716417910447761e-07, + "loss": -0.0006, + "reward": 0.5555555820465088, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 996 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.5277862548828, + "epoch": 7.492537313432836, + "grad_norm": 0.0, + "learning_rate": 3.7201492537313435e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 997 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.19444274902344, + "epoch": 7.5, + "grad_norm": 0.30041398848546375, + "learning_rate": 3.7238805970149253e-07, + "loss": 0.0007, + "reward": 0.8333333134651184, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 998 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.55555725097656, + "epoch": 7.507462686567164, + "grad_norm": 0.6438543324910457, + "learning_rate": 3.727611940298507e-07, + "loss": 0.0001, + "reward": 0.4444444477558136, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 999 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.22222900390625, + "epoch": 7.514925373134329, + "grad_norm": 0.6588991408290651, + "learning_rate": 3.7313432835820895e-07, + "loss": -0.0002, + "reward": 0.694444477558136, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1000 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.13888549804688, + "epoch": 7.522388059701493, + "grad_norm": 0.0, + "learning_rate": 3.7350746268656713e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1001 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.97222900390625, + "epoch": 7.529850746268656, + "grad_norm": 0.4639830490755882, + "learning_rate": 3.7388059701492536e-07, + "loss": 0.0002, + "reward": 0.4166666567325592, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 1002 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.19444274902344, + "epoch": 7.537313432835821, + "grad_norm": 0.8131562493550923, + "learning_rate": 3.742537313432836e-07, + "loss": 0.0025, + "reward": 0.6111111044883728, + "reward_std": 0.3505222797393799, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1003 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.86111450195312, + "epoch": 7.544776119402985, + "grad_norm": 0.6167093287852556, + "learning_rate": 3.746268656716418e-07, + "loss": -0.0002, + "reward": 0.6111111044883728, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1004 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.05555725097656, + "epoch": 7.552238805970149, + "grad_norm": 0.3765560925561395, + "learning_rate": 3.75e-07, + "loss": -0.0001, + "reward": 0.7777777910232544, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1005 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.3333282470703, + "epoch": 7.559701492537314, + "grad_norm": 0.612845012725802, + "learning_rate": 3.753731343283582e-07, + "loss": 0.0002, + "reward": 0.694444477558136, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1006 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.0, + "epoch": 7.567164179104478, + "grad_norm": 0.8822037040015835, + "learning_rate": 3.7574626865671637e-07, + "loss": -0.0008, + "reward": 0.5277777910232544, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1007 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.25, + "epoch": 7.574626865671641, + "grad_norm": 0.45622346026441846, + "learning_rate": 3.761194029850746e-07, + "loss": -0.0006, + "reward": 0.3611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 1008 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.0, + "epoch": 7.582089552238806, + "grad_norm": 0.4278174743936225, + "learning_rate": 3.7649253731343284e-07, + "loss": 0.0002, + "reward": 0.6111111044883728, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1009 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.13888549804688, + "epoch": 7.58955223880597, + "grad_norm": 0.5037884185304428, + "learning_rate": 3.76865671641791e-07, + "loss": -0.0001, + "reward": 0.8333333134651184, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 1010 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.88888549804688, + "epoch": 7.597014925373134, + "grad_norm": 0.5181161695814072, + "learning_rate": 3.7723880597014925e-07, + "loss": 0.0001, + "reward": 0.4444444477558136, + "reward_std": 0.3035612106323242, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1011 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.88888549804688, + "epoch": 7.604477611940299, + "grad_norm": 0.8845677474177355, + "learning_rate": 3.7761194029850743e-07, + "loss": -0.0004, + "reward": 0.694444477558136, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1012 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.44444274902344, + "epoch": 7.611940298507463, + "grad_norm": 0.3480721317048067, + "learning_rate": 3.779850746268656e-07, + "loss": -0.0003, + "reward": 0.6666666865348816, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1013 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.22222900390625, + "epoch": 7.619402985074627, + "grad_norm": 0.9669315125194367, + "learning_rate": 3.7835820895522385e-07, + "loss": 0.0008, + "reward": 0.6111111044883728, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1014 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.1666717529297, + "epoch": 7.6268656716417915, + "grad_norm": 0.37289474379429605, + "learning_rate": 3.787313432835821e-07, + "loss": -0.0003, + "reward": 0.6666666865348816, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1015 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.0833282470703, + "epoch": 7.634328358208955, + "grad_norm": 2.564744576782617, + "learning_rate": 3.7910447761194026e-07, + "loss": -0.0007, + "reward": 0.4166666567325592, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 1016 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.22222900390625, + "epoch": 7.641791044776119, + "grad_norm": 0.5021525068781497, + "learning_rate": 3.794776119402985e-07, + "loss": 0.0011, + "reward": 0.8055555820465088, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1017 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.05555725097656, + "epoch": 7.649253731343284, + "grad_norm": 0.36857192048673976, + "learning_rate": 3.7985074626865673e-07, + "loss": -0.0009, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1018 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.13888549804688, + "epoch": 7.656716417910448, + "grad_norm": 0.406409126786863, + "learning_rate": 3.802238805970149e-07, + "loss": -0.0011, + "reward": 0.6111111044883728, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1019 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.5277862548828, + "epoch": 7.664179104477612, + "grad_norm": 0.3140969169794396, + "learning_rate": 3.805970149253731e-07, + "loss": -0.0006, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1020 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.33333587646484, + "epoch": 7.6716417910447765, + "grad_norm": 0.9693702863055479, + "learning_rate": 3.8097014925373133e-07, + "loss": 0.0029, + "reward": 0.6666666865348816, + "reward_std": 0.41467228531837463, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1021 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.86111450195312, + "epoch": 7.67910447761194, + "grad_norm": 0.7259050991643109, + "learning_rate": 3.8134328358208956e-07, + "loss": -0.0012, + "reward": 0.6666666865348816, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1022 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.5833282470703, + "epoch": 7.686567164179104, + "grad_norm": 0.5558726468806928, + "learning_rate": 3.8171641791044774e-07, + "loss": -0.001, + "reward": 0.3333333432674408, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 1023 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.5833282470703, + "epoch": 7.6940298507462686, + "grad_norm": 0.3711226561801313, + "learning_rate": 3.82089552238806e-07, + "loss": -0.0007, + "reward": 0.944444477558136, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 0.0, + "step": 1024 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.5833282470703, + "epoch": 7.701492537313433, + "grad_norm": 0.6907658759046305, + "learning_rate": 3.824626865671642e-07, + "loss": 0.0017, + "reward": 0.472222238779068, + "reward_std": 0.4060778021812439, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1025 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.36111450195312, + "epoch": 7.708955223880597, + "grad_norm": 0.658706357332171, + "learning_rate": 3.8283582089552234e-07, + "loss": -0.0003, + "reward": 0.5833333134651184, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1026 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.30555725097656, + "epoch": 7.7164179104477615, + "grad_norm": 0.16319733627641816, + "learning_rate": 3.8320895522388057e-07, + "loss": 0.0005, + "reward": 0.5, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1027 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.3333282470703, + "epoch": 7.723880597014926, + "grad_norm": 0.241852024907241, + "learning_rate": 3.835820895522388e-07, + "loss": -0.0007, + "reward": 0.3611111044883728, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 1028 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.19444274902344, + "epoch": 7.731343283582089, + "grad_norm": 0.0, + "learning_rate": 3.83955223880597e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1029 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.4166717529297, + "epoch": 7.7388059701492535, + "grad_norm": 0.6486997938607454, + "learning_rate": 3.843283582089552e-07, + "loss": -0.0005, + "reward": 0.7222222089767456, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1030 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.38888549804688, + "epoch": 7.746268656716418, + "grad_norm": 0.2607554172177004, + "learning_rate": 3.8470149253731345e-07, + "loss": 0.0004, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1031 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.5833282470703, + "epoch": 7.753731343283582, + "grad_norm": 0.2929396224809059, + "learning_rate": 3.850746268656716e-07, + "loss": -0.0007, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1032 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.88888549804688, + "epoch": 7.7611940298507465, + "grad_norm": 0.2324809391726758, + "learning_rate": 3.854477611940298e-07, + "loss": 0.0002, + "reward": 0.1388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.1388888955116272, + "rewards/format_reward": 0.0, + "step": 1033 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.1666717529297, + "epoch": 7.768656716417911, + "grad_norm": 0.3969436586203512, + "learning_rate": 3.8582089552238805e-07, + "loss": -0.0001, + "reward": 0.694444477558136, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1034 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.55555725097656, + "epoch": 7.776119402985074, + "grad_norm": 0.2428806530037641, + "learning_rate": 3.8619402985074623e-07, + "loss": 0.0002, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1035 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.88888549804688, + "epoch": 7.7835820895522385, + "grad_norm": 0.48123258794956575, + "learning_rate": 3.8656716417910446e-07, + "loss": 0.0, + "reward": 0.6111111044883728, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1036 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.0277862548828, + "epoch": 7.791044776119403, + "grad_norm": 0.3615231977662641, + "learning_rate": 3.869402985074627e-07, + "loss": 0.0008, + "reward": 0.7777777910232544, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1037 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.22222900390625, + "epoch": 7.798507462686567, + "grad_norm": 0.5062678577535898, + "learning_rate": 3.873134328358209e-07, + "loss": 0.0002, + "reward": 0.5833333134651184, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1038 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.63888549804688, + "epoch": 7.8059701492537314, + "grad_norm": 0.39183389682313746, + "learning_rate": 3.8768656716417906e-07, + "loss": -0.001, + "reward": 0.4166666567325592, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 1039 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.86111450195312, + "epoch": 7.813432835820896, + "grad_norm": 0.9476614799169715, + "learning_rate": 3.880597014925373e-07, + "loss": -0.0002, + "reward": 0.4166666567325592, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 1040 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.11111450195312, + "epoch": 7.82089552238806, + "grad_norm": 0.4165385876237843, + "learning_rate": 3.8843283582089553e-07, + "loss": 0.0002, + "reward": 0.8333333134651184, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 1041 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.8333282470703, + "epoch": 7.8283582089552235, + "grad_norm": 0.5331284227745757, + "learning_rate": 3.888059701492537e-07, + "loss": -0.0008, + "reward": 0.7222222089767456, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1042 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.11111450195312, + "epoch": 7.835820895522388, + "grad_norm": 0.5257034335030552, + "learning_rate": 3.8917910447761194e-07, + "loss": 0.001, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1043 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.47222900390625, + "epoch": 7.843283582089552, + "grad_norm": 0.10899062335483174, + "learning_rate": 3.895522388059702e-07, + "loss": -0.0, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1044 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.75, + "epoch": 7.850746268656716, + "grad_norm": 0.6921052318258505, + "learning_rate": 3.899253731343283e-07, + "loss": 0.002, + "reward": 0.4444444477558136, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1045 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.80555725097656, + "epoch": 7.858208955223881, + "grad_norm": 1.0099156591591107, + "learning_rate": 3.9029850746268654e-07, + "loss": 0.0008, + "reward": 0.694444477558136, + "reward_std": 0.24800565838813782, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1046 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.2777862548828, + "epoch": 7.865671641791045, + "grad_norm": 0.6558200824211942, + "learning_rate": 3.9067164179104477e-07, + "loss": -0.0004, + "reward": 0.6388888955116272, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1047 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.36111450195312, + "epoch": 7.8731343283582085, + "grad_norm": 0.4287163423862619, + "learning_rate": 3.9104477611940295e-07, + "loss": 0.0013, + "reward": 0.8055555820465088, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1048 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.30555725097656, + "epoch": 7.880597014925373, + "grad_norm": 0.838763116391159, + "learning_rate": 3.914179104477612e-07, + "loss": -0.0027, + "reward": 0.4444444477558136, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1049 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.47222900390625, + "epoch": 7.888059701492537, + "grad_norm": 0.9333008764829813, + "learning_rate": 3.917910447761194e-07, + "loss": 0.0021, + "reward": 0.5555555820465088, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1050 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.1666717529297, + "epoch": 7.895522388059701, + "grad_norm": 0.651740901931895, + "learning_rate": 3.921641791044776e-07, + "loss": -0.0004, + "reward": 0.5555555820465088, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1051 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.55555725097656, + "epoch": 7.902985074626866, + "grad_norm": 0.5207510201441514, + "learning_rate": 3.925373134328358e-07, + "loss": 0.0005, + "reward": 0.5, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1052 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.88888549804688, + "epoch": 7.91044776119403, + "grad_norm": 0.4318746220678453, + "learning_rate": 3.92910447761194e-07, + "loss": -0.0007, + "reward": 0.6388888955116272, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1053 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.30555725097656, + "epoch": 7.917910447761194, + "grad_norm": 2.088608475940932, + "learning_rate": 3.932835820895522e-07, + "loss": 0.0002, + "reward": 0.5555555820465088, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1054 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.6666717529297, + "epoch": 7.925373134328359, + "grad_norm": 0.46390329347529496, + "learning_rate": 3.9365671641791043e-07, + "loss": -0.0002, + "reward": 0.4444444477558136, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1055 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.38888549804688, + "epoch": 7.932835820895522, + "grad_norm": 0.12979632627621343, + "learning_rate": 3.9402985074626866e-07, + "loss": -0.0, + "reward": 0.1388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.1388888955116272, + "rewards/format_reward": 0.0, + "step": 1056 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.69444274902344, + "epoch": 7.940298507462686, + "grad_norm": 0.7423395458179957, + "learning_rate": 3.9440298507462685e-07, + "loss": -0.0006, + "reward": 0.6666666865348816, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1057 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.3333282470703, + "epoch": 7.947761194029851, + "grad_norm": 0.6288127237267186, + "learning_rate": 3.947761194029851e-07, + "loss": 0.0017, + "reward": 0.6388888955116272, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1058 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.44444274902344, + "epoch": 7.955223880597015, + "grad_norm": 0.8741143068344117, + "learning_rate": 3.9514925373134326e-07, + "loss": -0.0001, + "reward": 0.6388888955116272, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1059 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.86111450195312, + "epoch": 7.962686567164179, + "grad_norm": 0.7620112294046407, + "learning_rate": 3.9552238805970144e-07, + "loss": -0.0003, + "reward": 0.3055555522441864, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 1060 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.36111450195312, + "epoch": 7.970149253731344, + "grad_norm": 0.4591420852685496, + "learning_rate": 3.958955223880597e-07, + "loss": 0.0002, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1061 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.05555725097656, + "epoch": 7.977611940298507, + "grad_norm": 1.8791892850518477, + "learning_rate": 3.962686567164179e-07, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1062 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.13888549804688, + "epoch": 7.985074626865671, + "grad_norm": 0.50342788160517, + "learning_rate": 3.966417910447761e-07, + "loss": 0.0, + "reward": 0.3888888955116272, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 1063 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.6666717529297, + "epoch": 7.992537313432836, + "grad_norm": 0.7245137053166542, + "learning_rate": 3.970149253731343e-07, + "loss": 0.0004, + "reward": 0.472222238779068, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1064 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.11111450195312, + "epoch": 8.007462686567164, + "grad_norm": 0.8211293756951371, + "learning_rate": 3.973880597014925e-07, + "loss": 0.0006, + "reward": 0.3888888955116272, + "reward_std": 0.3505222499370575, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 1065 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.5277862548828, + "epoch": 8.014925373134329, + "grad_norm": 0.5222390117822618, + "learning_rate": 3.9776119402985074e-07, + "loss": -0.0006, + "reward": 0.75, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1066 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.3333282470703, + "epoch": 8.022388059701493, + "grad_norm": 0.4297804230719302, + "learning_rate": 3.981343283582089e-07, + "loss": -0.0012, + "reward": 0.5277777910232544, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1067 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.11111450195312, + "epoch": 8.029850746268657, + "grad_norm": 1.0363276602257674, + "learning_rate": 3.9850746268656715e-07, + "loss": 0.0004, + "reward": 0.5277777910232544, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1068 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.7777862548828, + "epoch": 8.037313432835822, + "grad_norm": 0.6015737408971722, + "learning_rate": 3.988805970149254e-07, + "loss": -0.0005, + "reward": 0.5, + "reward_std": 0.36771121621131897, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1069 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.88888549804688, + "epoch": 8.044776119402986, + "grad_norm": 0.4589528046681521, + "learning_rate": 3.9925373134328357e-07, + "loss": 0.0012, + "reward": 0.6666666865348816, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1070 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.05555725097656, + "epoch": 8.052238805970148, + "grad_norm": 0.7719394312314516, + "learning_rate": 3.996268656716418e-07, + "loss": 0.0004, + "reward": 0.8055555820465088, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1071 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.86111450195312, + "epoch": 8.059701492537313, + "grad_norm": 0.5394455101500519, + "learning_rate": 4e-07, + "loss": -0.0002, + "reward": 0.7777777910232544, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1072 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.6666717529297, + "epoch": 8.067164179104477, + "grad_norm": 5.802873207178597, + "learning_rate": 4.0037313432835816e-07, + "loss": 0.0016, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1073 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.7777862548828, + "epoch": 8.074626865671641, + "grad_norm": 0.2382442697330808, + "learning_rate": 4.007462686567164e-07, + "loss": -0.0003, + "reward": 0.4444444477558136, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1074 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.6388931274414, + "epoch": 8.082089552238806, + "grad_norm": 0.5930282471395378, + "learning_rate": 4.0111940298507463e-07, + "loss": -0.0003, + "reward": 0.3888888955116272, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 1075 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.30555725097656, + "epoch": 8.08955223880597, + "grad_norm": 1.2629824953484148, + "learning_rate": 4.014925373134328e-07, + "loss": 0.0004, + "reward": 0.6388888955116272, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1076 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.86111450195312, + "epoch": 8.097014925373134, + "grad_norm": 0.556835482280392, + "learning_rate": 4.0186567164179105e-07, + "loss": -0.0005, + "reward": 0.5, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1077 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.61111450195312, + "epoch": 8.104477611940299, + "grad_norm": 1.7707146588204778, + "learning_rate": 4.022388059701493e-07, + "loss": 0.0008, + "reward": 0.6111111044883728, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1078 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.80555725097656, + "epoch": 8.111940298507463, + "grad_norm": 0.6330852443726012, + "learning_rate": 4.026119402985074e-07, + "loss": 0.0009, + "reward": 0.5555555820465088, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1079 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.0833282470703, + "epoch": 8.119402985074627, + "grad_norm": 0.479376790112319, + "learning_rate": 4.0298507462686564e-07, + "loss": 0.0011, + "reward": 0.5555555820465088, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1080 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.75, + "epoch": 8.126865671641792, + "grad_norm": 0.9223272630709759, + "learning_rate": 4.033582089552239e-07, + "loss": -0.0006, + "reward": 0.9166666865348816, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 0.0, + "step": 1081 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.19444274902344, + "epoch": 8.134328358208956, + "grad_norm": 0.5257838201277928, + "learning_rate": 4.0373134328358206e-07, + "loss": 0.0012, + "reward": 0.3611111044883728, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 1082 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.22222900390625, + "epoch": 8.14179104477612, + "grad_norm": 0.4168446712916031, + "learning_rate": 4.041044776119403e-07, + "loss": -0.0005, + "reward": 0.75, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1083 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.0833282470703, + "epoch": 8.149253731343283, + "grad_norm": 0.7721585823984723, + "learning_rate": 4.044776119402985e-07, + "loss": 0.0029, + "reward": 0.472222238779068, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1084 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.7777862548828, + "epoch": 8.156716417910447, + "grad_norm": 0.8277400397440627, + "learning_rate": 4.0485074626865665e-07, + "loss": -0.0001, + "reward": 0.5, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1085 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.0, + "epoch": 8.164179104477611, + "grad_norm": 1.0583221976715624, + "learning_rate": 4.052238805970149e-07, + "loss": 0.0002, + "reward": 0.6111111044883728, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1086 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.19444274902344, + "epoch": 8.171641791044776, + "grad_norm": 0.4469435373882556, + "learning_rate": 4.055970149253731e-07, + "loss": -0.0012, + "reward": 0.5, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1087 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.44444274902344, + "epoch": 8.17910447761194, + "grad_norm": 0.6248071445426214, + "learning_rate": 4.059701492537313e-07, + "loss": 0.0008, + "reward": 0.6111111044883728, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1088 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.5277862548828, + "epoch": 8.186567164179104, + "grad_norm": 1.0099776585964264, + "learning_rate": 4.0634328358208953e-07, + "loss": -0.001, + "reward": 0.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1089 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.5277862548828, + "epoch": 8.194029850746269, + "grad_norm": 0.4830361871010502, + "learning_rate": 4.0671641791044777e-07, + "loss": -0.0012, + "reward": 0.7777777910232544, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1090 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.88888549804688, + "epoch": 8.201492537313433, + "grad_norm": 1.1398197508815346, + "learning_rate": 4.07089552238806e-07, + "loss": -0.0014, + "reward": 0.8611111044883728, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1091 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.1666717529297, + "epoch": 8.208955223880597, + "grad_norm": 0.44015683555033447, + "learning_rate": 4.0746268656716413e-07, + "loss": -0.0003, + "reward": 0.5, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1092 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.9166717529297, + "epoch": 8.216417910447761, + "grad_norm": 0.3359219880667002, + "learning_rate": 4.0783582089552236e-07, + "loss": -0.0003, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1093 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.3333282470703, + "epoch": 8.223880597014926, + "grad_norm": 0.4823730893058819, + "learning_rate": 4.082089552238806e-07, + "loss": -0.0, + "reward": 0.5, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1094 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.72222900390625, + "epoch": 8.23134328358209, + "grad_norm": 2.2382573316601686, + "learning_rate": 4.085820895522388e-07, + "loss": 0.0015, + "reward": 0.5277777910232544, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1095 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.75, + "epoch": 8.238805970149254, + "grad_norm": 0.48020782221187214, + "learning_rate": 4.08955223880597e-07, + "loss": -0.0005, + "reward": 0.4166666567325592, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 1096 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.69444274902344, + "epoch": 8.246268656716419, + "grad_norm": 0.6000458123541951, + "learning_rate": 4.0932835820895525e-07, + "loss": -0.0009, + "reward": 0.5, + "reward_std": 0.3333333432674408, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1097 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.61111450195312, + "epoch": 8.253731343283581, + "grad_norm": 0.4269795823484252, + "learning_rate": 4.0970149253731337e-07, + "loss": 0.0006, + "reward": 0.4166666567325592, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 1098 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.30555725097656, + "epoch": 8.261194029850746, + "grad_norm": 0.6975039550263236, + "learning_rate": 4.100746268656716e-07, + "loss": 0.0004, + "reward": 0.5, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1099 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.5, + "epoch": 8.26865671641791, + "grad_norm": 0.24873272784816194, + "learning_rate": 4.1044776119402984e-07, + "loss": 0.0004, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1100 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.3333282470703, + "epoch": 8.276119402985074, + "grad_norm": 0.805340542620429, + "learning_rate": 4.10820895522388e-07, + "loss": 0.0013, + "reward": 0.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1101 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.97222900390625, + "epoch": 8.283582089552239, + "grad_norm": 0.9489728492677098, + "learning_rate": 4.1119402985074626e-07, + "loss": -0.0006, + "reward": 0.7222222089767456, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1102 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.0277862548828, + "epoch": 8.291044776119403, + "grad_norm": 0.3155239598803428, + "learning_rate": 4.115671641791045e-07, + "loss": -0.0005, + "reward": 0.5277777910232544, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1103 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.80555725097656, + "epoch": 8.298507462686567, + "grad_norm": 0.4029662185758968, + "learning_rate": 4.1194029850746267e-07, + "loss": 0.0006, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1104 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.38888549804688, + "epoch": 8.305970149253731, + "grad_norm": 0.443383799660861, + "learning_rate": 4.1231343283582085e-07, + "loss": -0.0129, + "reward": 0.6111111044883728, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1105 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.0, + "epoch": 8.313432835820896, + "grad_norm": 0.5683157292886962, + "learning_rate": 4.126865671641791e-07, + "loss": -0.0002, + "reward": 0.5, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1106 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.13888549804688, + "epoch": 8.32089552238806, + "grad_norm": 0.31020531165671605, + "learning_rate": 4.1305970149253727e-07, + "loss": 0.0002, + "reward": 0.8055555820465088, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1107 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.1666717529297, + "epoch": 8.328358208955224, + "grad_norm": 0.41247172958938366, + "learning_rate": 4.134328358208955e-07, + "loss": 0.0004, + "reward": 0.472222238779068, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1108 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.05555725097656, + "epoch": 8.335820895522389, + "grad_norm": 1.173991343053717, + "learning_rate": 4.1380597014925373e-07, + "loss": -0.0008, + "reward": 0.75, + "reward_std": 0.34192779660224915, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1109 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.9166717529297, + "epoch": 8.343283582089553, + "grad_norm": 0.0, + "learning_rate": 4.141791044776119e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1110 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.94444274902344, + "epoch": 8.350746268656717, + "grad_norm": 1.0354628829089583, + "learning_rate": 4.1455223880597015e-07, + "loss": -0.0012, + "reward": 0.5833333134651184, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1111 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.44444274902344, + "epoch": 8.35820895522388, + "grad_norm": 0.6569376844871327, + "learning_rate": 4.1492537313432833e-07, + "loss": -0.0011, + "reward": 0.75, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1112 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.0833282470703, + "epoch": 8.365671641791044, + "grad_norm": 0.42628576766263065, + "learning_rate": 4.1529850746268656e-07, + "loss": -0.0008, + "reward": 0.2777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.2777777910232544, + "rewards/format_reward": 0.0, + "step": 1113 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.55555725097656, + "epoch": 8.373134328358208, + "grad_norm": 0.7174963138277326, + "learning_rate": 4.1567164179104474e-07, + "loss": 0.0073, + "reward": 0.6111111044883728, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1114 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.2777862548828, + "epoch": 8.380597014925373, + "grad_norm": 0.510135645822684, + "learning_rate": 4.16044776119403e-07, + "loss": 0.0025, + "reward": 0.6388888955116272, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1115 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.30555725097656, + "epoch": 8.388059701492537, + "grad_norm": 1.2845369813273515, + "learning_rate": 4.164179104477612e-07, + "loss": 0.0021, + "reward": 0.4444444477558136, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1116 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.8333282470703, + "epoch": 8.395522388059701, + "grad_norm": 0.40477670071322264, + "learning_rate": 4.167910447761194e-07, + "loss": 0.0008, + "reward": 0.8333333134651184, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 1117 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.0, + "epoch": 8.402985074626866, + "grad_norm": 1.3961449858368264, + "learning_rate": 4.1716417910447757e-07, + "loss": 0.0001, + "reward": 0.5277777910232544, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1118 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.13888549804688, + "epoch": 8.41044776119403, + "grad_norm": 0.19080961236804578, + "learning_rate": 4.175373134328358e-07, + "loss": 0.0, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1119 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.94444274902344, + "epoch": 8.417910447761194, + "grad_norm": 0.19471431575505221, + "learning_rate": 4.17910447761194e-07, + "loss": -0.0, + "reward": 0.472222238779068, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1120 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.0833282470703, + "epoch": 8.425373134328359, + "grad_norm": 0.6402301554514715, + "learning_rate": 4.182835820895522e-07, + "loss": 0.0, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1121 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.05555725097656, + "epoch": 8.432835820895523, + "grad_norm": 0.3962577904922646, + "learning_rate": 4.1865671641791046e-07, + "loss": -0.0003, + "reward": 0.6388888955116272, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1122 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.11111450195312, + "epoch": 8.440298507462687, + "grad_norm": 0.8263969630145349, + "learning_rate": 4.1902985074626864e-07, + "loss": 0.001, + "reward": 0.6388888955116272, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1123 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.94444274902344, + "epoch": 8.447761194029852, + "grad_norm": 0.30919123614801114, + "learning_rate": 4.1940298507462687e-07, + "loss": -0.0, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1124 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.19444274902344, + "epoch": 8.455223880597014, + "grad_norm": 0.8390655984688168, + "learning_rate": 4.1977611940298505e-07, + "loss": -0.0002, + "reward": 0.5555555820465088, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1125 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.2777862548828, + "epoch": 8.462686567164178, + "grad_norm": 0.28864584330221277, + "learning_rate": 4.2014925373134323e-07, + "loss": 0.0005, + "reward": 0.4166666567325592, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 1126 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.72222900390625, + "epoch": 8.470149253731343, + "grad_norm": 0.9197137936365984, + "learning_rate": 4.2052238805970147e-07, + "loss": -0.0, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1127 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.1666717529297, + "epoch": 8.477611940298507, + "grad_norm": 0.6268374057091163, + "learning_rate": 4.208955223880597e-07, + "loss": -0.0005, + "reward": 0.3055555522441864, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 1128 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.7777862548828, + "epoch": 8.485074626865671, + "grad_norm": 0.4772127447358929, + "learning_rate": 4.212686567164179e-07, + "loss": 0.0011, + "reward": 0.472222238779068, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1129 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.52777862548828, + "epoch": 8.492537313432836, + "grad_norm": 0.8470196267601131, + "learning_rate": 4.216417910447761e-07, + "loss": -0.0013, + "reward": 0.694444477558136, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1130 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.97222900390625, + "epoch": 8.5, + "grad_norm": 0.9475191908492642, + "learning_rate": 4.2201492537313435e-07, + "loss": -0.0006, + "reward": 0.5833333134651184, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1131 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.0833282470703, + "epoch": 8.507462686567164, + "grad_norm": 0.5926366902416939, + "learning_rate": 4.223880597014925e-07, + "loss": -0.0011, + "reward": 0.5833333134651184, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1132 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.0833282470703, + "epoch": 8.514925373134329, + "grad_norm": 0.6173086292230675, + "learning_rate": 4.227611940298507e-07, + "loss": 0.0009, + "reward": 0.75, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1133 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.88888549804688, + "epoch": 8.522388059701493, + "grad_norm": 0.3932252124132601, + "learning_rate": 4.2313432835820894e-07, + "loss": 0.0012, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1134 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.47222900390625, + "epoch": 8.529850746268657, + "grad_norm": 0.33208210279276895, + "learning_rate": 4.235074626865671e-07, + "loss": -0.001, + "reward": 0.5555555820465088, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1135 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.44444274902344, + "epoch": 8.537313432835822, + "grad_norm": 0.15341428207397298, + "learning_rate": 4.2388059701492536e-07, + "loss": 0.0001, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1136 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.5277862548828, + "epoch": 8.544776119402986, + "grad_norm": 0.0, + "learning_rate": 4.242537313432836e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1137 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.5277862548828, + "epoch": 8.552238805970148, + "grad_norm": 0.7836706007856209, + "learning_rate": 4.2462686567164177e-07, + "loss": 0.0003, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1138 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.3333282470703, + "epoch": 8.559701492537313, + "grad_norm": 0.329464409076312, + "learning_rate": 4.2499999999999995e-07, + "loss": -0.0008, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1139 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.11111450195312, + "epoch": 8.567164179104477, + "grad_norm": 1.0822604544244134, + "learning_rate": 4.253731343283582e-07, + "loss": 0.0003, + "reward": 0.694444477558136, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1140 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.5833282470703, + "epoch": 8.574626865671641, + "grad_norm": 0.3984714757063083, + "learning_rate": 4.257462686567164e-07, + "loss": -0.0004, + "reward": 0.6666666865348816, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1141 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.25, + "epoch": 8.582089552238806, + "grad_norm": 0.48859044882785924, + "learning_rate": 4.261194029850746e-07, + "loss": -0.0005, + "reward": 0.5, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1142 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.11111450195312, + "epoch": 8.58955223880597, + "grad_norm": 0.7186403501137055, + "learning_rate": 4.2649253731343284e-07, + "loss": 0.0006, + "reward": 0.7222222089767456, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1143 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.13888549804688, + "epoch": 8.597014925373134, + "grad_norm": 0.4037405479345655, + "learning_rate": 4.2686567164179107e-07, + "loss": -0.0006, + "reward": 0.6111111044883728, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1144 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.3333282470703, + "epoch": 8.604477611940299, + "grad_norm": 0.528484725076422, + "learning_rate": 4.272388059701492e-07, + "loss": -0.0012, + "reward": 0.5277777910232544, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1145 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.69444274902344, + "epoch": 8.611940298507463, + "grad_norm": 0.8220555388664068, + "learning_rate": 4.2761194029850743e-07, + "loss": -0.0002, + "reward": 0.694444477558136, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1146 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.22222900390625, + "epoch": 8.619402985074627, + "grad_norm": 0.46292027037383227, + "learning_rate": 4.2798507462686567e-07, + "loss": 0.0001, + "reward": 0.7222222089767456, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1147 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.11111450195312, + "epoch": 8.626865671641792, + "grad_norm": 0.2924619556241581, + "learning_rate": 4.2835820895522385e-07, + "loss": -0.0001, + "reward": 0.472222238779068, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1148 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.69444274902344, + "epoch": 8.634328358208956, + "grad_norm": 0.19517284999794016, + "learning_rate": 4.287313432835821e-07, + "loss": -0.0003, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1149 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.4166717529297, + "epoch": 8.64179104477612, + "grad_norm": 0.43754030336566935, + "learning_rate": 4.291044776119403e-07, + "loss": 0.0001, + "reward": 0.5, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1150 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.9166717529297, + "epoch": 8.649253731343283, + "grad_norm": 1.3165565481481627, + "learning_rate": 4.2947761194029844e-07, + "loss": -0.0004, + "reward": 0.6111111044883728, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1151 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.0277862548828, + "epoch": 8.656716417910447, + "grad_norm": 0.38998906850159976, + "learning_rate": 4.298507462686567e-07, + "loss": -0.0007, + "reward": 0.5, + "reward_std": 0.19245009124279022, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1152 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.0, + "epoch": 8.664179104477611, + "grad_norm": 1.0786407524994261, + "learning_rate": 4.302238805970149e-07, + "loss": -0.002, + "reward": 0.6666666865348816, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1153 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.6666717529297, + "epoch": 8.671641791044776, + "grad_norm": 1.263057071287373, + "learning_rate": 4.305970149253731e-07, + "loss": 0.0009, + "reward": 0.6111111044883728, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1154 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.22222900390625, + "epoch": 8.67910447761194, + "grad_norm": 0.4294022243685175, + "learning_rate": 4.309701492537313e-07, + "loss": 0.0006, + "reward": 0.472222238779068, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1155 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.66666412353516, + "epoch": 8.686567164179104, + "grad_norm": 0.38566274517017507, + "learning_rate": 4.3134328358208956e-07, + "loss": -0.0006, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1156 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.4166717529297, + "epoch": 8.694029850746269, + "grad_norm": 1.197917743114398, + "learning_rate": 4.3171641791044774e-07, + "loss": 0.0006, + "reward": 0.8333333134651184, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 1157 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.5833282470703, + "epoch": 8.701492537313433, + "grad_norm": 0.49823342997042, + "learning_rate": 4.320895522388059e-07, + "loss": 0.0002, + "reward": 0.6666666865348816, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1158 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.5, + "epoch": 8.708955223880597, + "grad_norm": 0.41689041782433667, + "learning_rate": 4.3246268656716415e-07, + "loss": -0.0003, + "reward": 0.4166666567325592, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 1159 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.11111450195312, + "epoch": 8.716417910447761, + "grad_norm": 0.4760592775323472, + "learning_rate": 4.3283582089552234e-07, + "loss": 0.0008, + "reward": 0.5833333134651184, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1160 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.44444274902344, + "epoch": 8.723880597014926, + "grad_norm": 0.31401967796455893, + "learning_rate": 4.3320895522388057e-07, + "loss": 0.0005, + "reward": 0.4166666567325592, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 1161 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.5833282470703, + "epoch": 8.73134328358209, + "grad_norm": 0.5572730831631165, + "learning_rate": 4.335820895522388e-07, + "loss": -0.0012, + "reward": 0.4444444477558136, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1162 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.97222900390625, + "epoch": 8.738805970149254, + "grad_norm": 0.15464536112330093, + "learning_rate": 4.3395522388059704e-07, + "loss": 0.0002, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1163 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.47222900390625, + "epoch": 8.746268656716419, + "grad_norm": 1.5909682582336155, + "learning_rate": 4.343283582089552e-07, + "loss": 0.0003, + "reward": 0.1944444477558136, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.1944444477558136, + "rewards/format_reward": 0.0, + "step": 1164 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.30555725097656, + "epoch": 8.753731343283581, + "grad_norm": 0.3862101991225094, + "learning_rate": 4.347014925373134e-07, + "loss": 0.0001, + "reward": 0.5555555820465088, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1165 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.55555725097656, + "epoch": 8.761194029850746, + "grad_norm": 0.398636046279684, + "learning_rate": 4.3507462686567163e-07, + "loss": -0.0002, + "reward": 0.8055555820465088, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1166 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.13888549804688, + "epoch": 8.76865671641791, + "grad_norm": 0.5946741572859694, + "learning_rate": 4.354477611940298e-07, + "loss": 0.0009, + "reward": 0.6388888955116272, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1167 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.47222137451172, + "epoch": 8.776119402985074, + "grad_norm": 0.4191435753725259, + "learning_rate": 4.3582089552238805e-07, + "loss": -0.0, + "reward": 0.6111111044883728, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1168 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.75, + "epoch": 8.783582089552239, + "grad_norm": 0.44197128760889803, + "learning_rate": 4.361940298507463e-07, + "loss": 0.0015, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1169 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.80555725097656, + "epoch": 8.791044776119403, + "grad_norm": 0.8667713772635386, + "learning_rate": 4.3656716417910446e-07, + "loss": 0.0018, + "reward": 0.4166666567325592, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 1170 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.5277862548828, + "epoch": 8.798507462686567, + "grad_norm": 0.2524909830871912, + "learning_rate": 4.3694029850746264e-07, + "loss": -0.0001, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1171 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.6666717529297, + "epoch": 8.805970149253731, + "grad_norm": 0.9521406207250872, + "learning_rate": 4.373134328358209e-07, + "loss": 0.0009, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1172 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.97222900390625, + "epoch": 8.813432835820896, + "grad_norm": 0.502694667640737, + "learning_rate": 4.3768656716417906e-07, + "loss": 0.0013, + "reward": 0.3888888955116272, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 1173 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.30555725097656, + "epoch": 8.82089552238806, + "grad_norm": 0.6790225501822224, + "learning_rate": 4.380597014925373e-07, + "loss": -0.0022, + "reward": 0.75, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1174 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.69444274902344, + "epoch": 8.828358208955224, + "grad_norm": 0.514640020620303, + "learning_rate": 4.384328358208955e-07, + "loss": -0.0003, + "reward": 0.3611111044883728, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 1175 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.36111450195312, + "epoch": 8.835820895522389, + "grad_norm": 1.0510472624744234, + "learning_rate": 4.388059701492537e-07, + "loss": -0.0002, + "reward": 0.8611111044883728, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1176 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.36111450195312, + "epoch": 8.843283582089553, + "grad_norm": 0.9223266375690103, + "learning_rate": 4.3917910447761194e-07, + "loss": -0.0011, + "reward": 0.6666666865348816, + "reward_std": 0.4318612813949585, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1177 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.86111450195312, + "epoch": 8.850746268656717, + "grad_norm": 0.6059596456241756, + "learning_rate": 4.395522388059701e-07, + "loss": 0.0004, + "reward": 0.6111111044883728, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1178 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.0, + "epoch": 8.85820895522388, + "grad_norm": 0.4590770541308279, + "learning_rate": 4.399253731343283e-07, + "loss": -0.001, + "reward": 0.4444444477558136, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1179 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.55555725097656, + "epoch": 8.865671641791044, + "grad_norm": 0.45046087075292807, + "learning_rate": 4.4029850746268654e-07, + "loss": -0.0, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1180 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.19444274902344, + "epoch": 8.873134328358208, + "grad_norm": 0.5939792719903634, + "learning_rate": 4.4067164179104477e-07, + "loss": 0.0007, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1181 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.63888549804688, + "epoch": 8.880597014925373, + "grad_norm": 0.574706757360922, + "learning_rate": 4.4104477611940295e-07, + "loss": 0.0011, + "reward": 0.472222238779068, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1182 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.1666717529297, + "epoch": 8.888059701492537, + "grad_norm": 0.5174582454285461, + "learning_rate": 4.414179104477612e-07, + "loss": 0.0002, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1183 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.22222900390625, + "epoch": 8.895522388059701, + "grad_norm": 0.34397426359027083, + "learning_rate": 4.417910447761194e-07, + "loss": -0.0005, + "reward": 0.7777777910232544, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1184 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.6666717529297, + "epoch": 8.902985074626866, + "grad_norm": 0.28075541932752734, + "learning_rate": 4.421641791044776e-07, + "loss": 0.0009, + "reward": 0.6388888955116272, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1185 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.88888549804688, + "epoch": 8.91044776119403, + "grad_norm": 1.460052650127893, + "learning_rate": 4.425373134328358e-07, + "loss": -0.0005, + "reward": 0.5555555820465088, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1186 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.19444274902344, + "epoch": 8.917910447761194, + "grad_norm": 0.48058312601045833, + "learning_rate": 4.42910447761194e-07, + "loss": -0.0005, + "reward": 0.8888888955116272, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1187 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.88888549804688, + "epoch": 8.925373134328359, + "grad_norm": 0.5307647362432687, + "learning_rate": 4.4328358208955225e-07, + "loss": 0.0004, + "reward": 0.694444477558136, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1188 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.80555725097656, + "epoch": 8.932835820895523, + "grad_norm": 0.5665003778802926, + "learning_rate": 4.4365671641791043e-07, + "loss": -0.0001, + "reward": 0.2777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.2777777910232544, + "rewards/format_reward": 0.0, + "step": 1189 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.22222900390625, + "epoch": 8.940298507462687, + "grad_norm": 1.1970693863813848, + "learning_rate": 4.4402985074626866e-07, + "loss": 0.0002, + "reward": 0.6388888955116272, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1190 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.94444274902344, + "epoch": 8.947761194029852, + "grad_norm": 0.5423695346968345, + "learning_rate": 4.4440298507462684e-07, + "loss": 0.0018, + "reward": 0.6666666865348816, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1191 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.97222900390625, + "epoch": 8.955223880597014, + "grad_norm": 0.39787965467097125, + "learning_rate": 4.44776119402985e-07, + "loss": -0.0009, + "reward": 0.6666666865348816, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1192 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.7777862548828, + "epoch": 8.962686567164178, + "grad_norm": 0.4315560592798329, + "learning_rate": 4.4514925373134326e-07, + "loss": 0.0004, + "reward": 0.8888888955116272, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1193 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.80555725097656, + "epoch": 8.970149253731343, + "grad_norm": 0.8415536366635408, + "learning_rate": 4.455223880597015e-07, + "loss": -0.0009, + "reward": 0.3888888955116272, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 1194 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.8333282470703, + "epoch": 8.977611940298507, + "grad_norm": 0.2669885376044344, + "learning_rate": 4.4589552238805967e-07, + "loss": -0.0003, + "reward": 0.472222238779068, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1195 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.4166717529297, + "epoch": 8.985074626865671, + "grad_norm": 0.0, + "learning_rate": 4.462686567164179e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1196 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.6666717529297, + "epoch": 8.992537313432836, + "grad_norm": 0.34387006168424894, + "learning_rate": 4.4664179104477614e-07, + "loss": 0.0009, + "reward": 0.3055555522441864, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 1197 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.8333282470703, + "epoch": 9.007462686567164, + "grad_norm": 0.41071564484856776, + "learning_rate": 4.4701492537313427e-07, + "loss": 0.0002, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1198 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.75, + "epoch": 9.014925373134329, + "grad_norm": 0.4593643474166072, + "learning_rate": 4.473880597014925e-07, + "loss": -0.0024, + "reward": 0.6388888955116272, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1199 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.69444274902344, + "epoch": 9.022388059701493, + "grad_norm": 0.4522291539630712, + "learning_rate": 4.4776119402985074e-07, + "loss": 0.0007, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1200 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.63888549804688, + "epoch": 9.029850746268657, + "grad_norm": 0.6042097984953986, + "learning_rate": 4.481343283582089e-07, + "loss": -0.001, + "reward": 0.75, + "reward_std": 0.3591167628765106, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1201 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.05555725097656, + "epoch": 9.037313432835822, + "grad_norm": 0.22469321610847134, + "learning_rate": 4.4850746268656715e-07, + "loss": 0.0006, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1202 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.25, + "epoch": 9.044776119402986, + "grad_norm": 0.6154684338462825, + "learning_rate": 4.488805970149254e-07, + "loss": -0.0007, + "reward": 0.3888888955116272, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 1203 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.11111450195312, + "epoch": 9.052238805970148, + "grad_norm": 0.36027970110409724, + "learning_rate": 4.492537313432835e-07, + "loss": 0.0004, + "reward": 0.6388888955116272, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1204 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.19444274902344, + "epoch": 9.059701492537313, + "grad_norm": 0.6446815145263272, + "learning_rate": 4.4962686567164175e-07, + "loss": -0.0011, + "reward": 0.472222238779068, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1205 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.2777862548828, + "epoch": 9.067164179104477, + "grad_norm": 0.3877987146527487, + "learning_rate": 4.5e-07, + "loss": 0.0008, + "reward": 0.3611111044883728, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 1206 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.80555725097656, + "epoch": 9.074626865671641, + "grad_norm": 0.3697864269266126, + "learning_rate": 4.5037313432835816e-07, + "loss": -0.0002, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1207 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.13888549804688, + "epoch": 9.082089552238806, + "grad_norm": 0.36274416025632134, + "learning_rate": 4.507462686567164e-07, + "loss": -0.0003, + "reward": 0.25, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.0, + "step": 1208 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.30555725097656, + "epoch": 9.08955223880597, + "grad_norm": 0.7583143907658986, + "learning_rate": 4.5111940298507463e-07, + "loss": -0.0002, + "reward": 0.5277777910232544, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1209 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.88888549804688, + "epoch": 9.097014925373134, + "grad_norm": 0.36778507965306834, + "learning_rate": 4.5149253731343286e-07, + "loss": -0.0008, + "reward": 0.6111111044883728, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1210 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.86111450195312, + "epoch": 9.104477611940299, + "grad_norm": 0.390585690502647, + "learning_rate": 4.51865671641791e-07, + "loss": 0.0003, + "reward": 0.3888888955116272, + "reward_std": 0.19245009124279022, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 1211 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.6666717529297, + "epoch": 9.111940298507463, + "grad_norm": 0.4566170017745334, + "learning_rate": 4.522388059701492e-07, + "loss": 0.0005, + "reward": 0.5, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1212 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.13888549804688, + "epoch": 9.119402985074627, + "grad_norm": 0.32881076989662894, + "learning_rate": 4.5261194029850746e-07, + "loss": -0.0002, + "reward": 0.4444444477558136, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1213 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.5, + "epoch": 9.126865671641792, + "grad_norm": 0.0, + "learning_rate": 4.5298507462686564e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 1214 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.36111450195312, + "epoch": 9.134328358208956, + "grad_norm": 0.484056145778573, + "learning_rate": 4.5335820895522387e-07, + "loss": 0.0007, + "reward": 0.5833333134651184, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1215 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.11111450195312, + "epoch": 9.14179104477612, + "grad_norm": 0.3336486434674043, + "learning_rate": 4.537313432835821e-07, + "loss": 0.0003, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 1216 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.97222900390625, + "epoch": 9.149253731343283, + "grad_norm": 0.36109615666647943, + "learning_rate": 4.541044776119403e-07, + "loss": -0.0017, + "reward": 0.6111111044883728, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1217 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.0277862548828, + "epoch": 9.156716417910447, + "grad_norm": 0.13281057928614784, + "learning_rate": 4.5447761194029847e-07, + "loss": 0.0001, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1218 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.69444274902344, + "epoch": 9.164179104477611, + "grad_norm": 0.6345756069505194, + "learning_rate": 4.548507462686567e-07, + "loss": -0.0006, + "reward": 0.6388888955116272, + "reward_std": 0.24800565838813782, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1219 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.05555725097656, + "epoch": 9.171641791044776, + "grad_norm": 0.38146631237983264, + "learning_rate": 4.552238805970149e-07, + "loss": -0.0004, + "reward": 0.6388888955116272, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1220 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.47222900390625, + "epoch": 9.17910447761194, + "grad_norm": 0.40915789890898613, + "learning_rate": 4.555970149253731e-07, + "loss": -0.0014, + "reward": 0.5277777910232544, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1221 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.4166717529297, + "epoch": 9.186567164179104, + "grad_norm": 1.0154107347098658, + "learning_rate": 4.5597014925373135e-07, + "loss": 0.0002, + "reward": 0.8055555820465088, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1222 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.2777862548828, + "epoch": 9.194029850746269, + "grad_norm": 0.4438181514269448, + "learning_rate": 4.5634328358208953e-07, + "loss": -0.0005, + "reward": 0.694444477558136, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1223 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.55555725097656, + "epoch": 9.201492537313433, + "grad_norm": 0.3542572456599162, + "learning_rate": 4.567164179104477e-07, + "loss": 0.0006, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1224 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.72222900390625, + "epoch": 9.208955223880597, + "grad_norm": 0.33957605948099334, + "learning_rate": 4.5708955223880595e-07, + "loss": 0.0002, + "reward": 0.1666666716337204, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1225 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.0277862548828, + "epoch": 9.216417910447761, + "grad_norm": 0.23149092874133112, + "learning_rate": 4.5746268656716413e-07, + "loss": 0.0002, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1226 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.0, + "epoch": 9.223880597014926, + "grad_norm": 1.1463576152626203, + "learning_rate": 4.5783582089552236e-07, + "loss": -0.0002, + "reward": 0.75, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1227 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.94444274902344, + "epoch": 9.23134328358209, + "grad_norm": 0.0, + "learning_rate": 4.582089552238806e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1228 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.05555725097656, + "epoch": 9.238805970149254, + "grad_norm": 0.0, + "learning_rate": 4.585820895522388e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1229 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.55555725097656, + "epoch": 9.246268656716419, + "grad_norm": 0.47061478086412334, + "learning_rate": 4.58955223880597e-07, + "loss": 0.0003, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1230 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.5, + "epoch": 9.253731343283581, + "grad_norm": 0.3987419101651953, + "learning_rate": 4.593283582089552e-07, + "loss": 0.0002, + "reward": 0.6666666865348816, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1231 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.1666717529297, + "epoch": 9.261194029850746, + "grad_norm": 0.43070988349098205, + "learning_rate": 4.5970149253731337e-07, + "loss": 0.0012, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1232 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.86111450195312, + "epoch": 9.26865671641791, + "grad_norm": 0.3873829993210006, + "learning_rate": 4.600746268656716e-07, + "loss": 0.0002, + "reward": 0.6666666865348816, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1233 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.88888549804688, + "epoch": 9.276119402985074, + "grad_norm": 0.7060860483683244, + "learning_rate": 4.6044776119402984e-07, + "loss": -0.0002, + "reward": 0.8055555820465088, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1234 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.0277862548828, + "epoch": 9.283582089552239, + "grad_norm": 0.5876211737349851, + "learning_rate": 4.6082089552238807e-07, + "loss": -0.001, + "reward": 0.6388888955116272, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1235 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.94444274902344, + "epoch": 9.291044776119403, + "grad_norm": 0.2562576050519166, + "learning_rate": 4.6119402985074625e-07, + "loss": -0.0004, + "reward": 0.6388888955116272, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1236 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.61111450195312, + "epoch": 9.298507462686567, + "grad_norm": 0.1002181727169506, + "learning_rate": 4.615671641791045e-07, + "loss": -0.0004, + "reward": 0.3611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 1237 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.72222900390625, + "epoch": 9.305970149253731, + "grad_norm": 0.49986117267864727, + "learning_rate": 4.6194029850746267e-07, + "loss": 0.0005, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1238 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.5833282470703, + "epoch": 9.313432835820896, + "grad_norm": 0.2043164166063733, + "learning_rate": 4.6231343283582085e-07, + "loss": -0.0005, + "reward": 0.4166666567325592, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 1239 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.22222900390625, + "epoch": 9.32089552238806, + "grad_norm": 0.48259472349848775, + "learning_rate": 4.626865671641791e-07, + "loss": -0.0006, + "reward": 0.5277777910232544, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1240 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.44444274902344, + "epoch": 9.328358208955224, + "grad_norm": 0.31340032833384446, + "learning_rate": 4.630597014925373e-07, + "loss": 0.0007, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1241 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.86111450195312, + "epoch": 9.335820895522389, + "grad_norm": 0.3936067283880468, + "learning_rate": 4.634328358208955e-07, + "loss": 0.0001, + "reward": 0.6666666865348816, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1242 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.7777862548828, + "epoch": 9.343283582089553, + "grad_norm": 0.5573064986392371, + "learning_rate": 4.6380597014925373e-07, + "loss": 0.0002, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1243 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.75, + "epoch": 9.350746268656717, + "grad_norm": 0.4719322799628886, + "learning_rate": 4.641791044776119e-07, + "loss": 0.0013, + "reward": 0.7222222089767456, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1244 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.72222900390625, + "epoch": 9.35820895522388, + "grad_norm": 0.9700242190663177, + "learning_rate": 4.645522388059701e-07, + "loss": -0.0007, + "reward": 0.8611111044883728, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1245 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.55555725097656, + "epoch": 9.365671641791044, + "grad_norm": 0.5782869391815002, + "learning_rate": 4.6492537313432833e-07, + "loss": -0.0007, + "reward": 0.75, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1246 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.0, + "epoch": 9.373134328358208, + "grad_norm": 0.26055212411852746, + "learning_rate": 4.6529850746268656e-07, + "loss": 0.0002, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1247 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.36111450195312, + "epoch": 9.380597014925373, + "grad_norm": 0.49427446171287626, + "learning_rate": 4.6567164179104474e-07, + "loss": 0.0006, + "reward": 0.472222238779068, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1248 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.22222900390625, + "epoch": 9.388059701492537, + "grad_norm": 0.1845580051113106, + "learning_rate": 4.66044776119403e-07, + "loss": 0.0005, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1249 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.9166717529297, + "epoch": 9.395522388059701, + "grad_norm": 0.5169859865782127, + "learning_rate": 4.664179104477612e-07, + "loss": 0.0001, + "reward": 0.5277777910232544, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1250 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.5, + "epoch": 9.402985074626866, + "grad_norm": 0.4195055881377182, + "learning_rate": 4.6679104477611934e-07, + "loss": 0.0006, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1251 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.86111450195312, + "epoch": 9.41044776119403, + "grad_norm": 0.23799435189921378, + "learning_rate": 4.6716417910447757e-07, + "loss": -0.0001, + "reward": 0.6666666865348816, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1252 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.36111450195312, + "epoch": 9.417910447761194, + "grad_norm": 0.5775902474246795, + "learning_rate": 4.675373134328358e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1253 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.3333282470703, + "epoch": 9.425373134328359, + "grad_norm": 0.4910644518985892, + "learning_rate": 4.67910447761194e-07, + "loss": -0.0011, + "reward": 0.3333333432674408, + "reward_std": 0.28637224435806274, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 1254 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.38888549804688, + "epoch": 9.432835820895523, + "grad_norm": 0.8238721774762963, + "learning_rate": 4.682835820895522e-07, + "loss": 0.0002, + "reward": 0.6111111044883728, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1255 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.5, + "epoch": 9.440298507462687, + "grad_norm": 0.6325660527873527, + "learning_rate": 4.6865671641791045e-07, + "loss": 0.0001, + "reward": 0.8055555820465088, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1256 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.5277862548828, + "epoch": 9.447761194029852, + "grad_norm": 0.4751057659224979, + "learning_rate": 4.690298507462687e-07, + "loss": 0.0002, + "reward": 0.6388888955116272, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1257 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.0277862548828, + "epoch": 9.455223880597014, + "grad_norm": 0.5622902413510386, + "learning_rate": 4.694029850746268e-07, + "loss": -0.0008, + "reward": 0.6111111044883728, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1258 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.7777862548828, + "epoch": 9.462686567164178, + "grad_norm": 0.4589577594803114, + "learning_rate": 4.6977611940298505e-07, + "loss": -0.0005, + "reward": 0.3888888955116272, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 1259 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.97222900390625, + "epoch": 9.470149253731343, + "grad_norm": 0.3829355980615764, + "learning_rate": 4.701492537313433e-07, + "loss": -0.0004, + "reward": 0.4166666567325592, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 1260 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.0, + "epoch": 9.477611940298507, + "grad_norm": 0.3520748585529512, + "learning_rate": 4.7052238805970146e-07, + "loss": -0.0006, + "reward": 0.5277777910232544, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1261 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.94444274902344, + "epoch": 9.485074626865671, + "grad_norm": 0.7331720483165676, + "learning_rate": 4.708955223880597e-07, + "loss": 0.0, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1262 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.8333282470703, + "epoch": 9.492537313432836, + "grad_norm": 0.4517686304908789, + "learning_rate": 4.7126865671641793e-07, + "loss": -0.0019, + "reward": 0.5277777910232544, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1263 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.9166717529297, + "epoch": 9.5, + "grad_norm": 0.7924048119675051, + "learning_rate": 4.7164179104477606e-07, + "loss": 0.0001, + "reward": 0.6388888955116272, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1264 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.19444274902344, + "epoch": 9.507462686567164, + "grad_norm": 0.35740955277346215, + "learning_rate": 4.720149253731343e-07, + "loss": 0.0003, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1265 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.30555725097656, + "epoch": 9.514925373134329, + "grad_norm": 0.0, + "learning_rate": 4.7238805970149253e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1266 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.0833282470703, + "epoch": 9.522388059701493, + "grad_norm": 0.6652808633697413, + "learning_rate": 4.727611940298507e-07, + "loss": 0.0001, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 1267 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.2777862548828, + "epoch": 9.529850746268657, + "grad_norm": 0.7504237556811281, + "learning_rate": 4.7313432835820894e-07, + "loss": 0.0, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1268 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.25, + "epoch": 9.537313432835822, + "grad_norm": 0.28318196977918836, + "learning_rate": 4.735074626865672e-07, + "loss": 0.0005, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1269 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.5833282470703, + "epoch": 9.544776119402986, + "grad_norm": 0.3202528571272284, + "learning_rate": 4.7388059701492536e-07, + "loss": -0.0007, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1270 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.3333282470703, + "epoch": 9.552238805970148, + "grad_norm": 0.7445492437598429, + "learning_rate": 4.7425373134328354e-07, + "loss": -0.0004, + "reward": 0.5555555820465088, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1271 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.30555725097656, + "epoch": 9.559701492537313, + "grad_norm": 0.38111986641263, + "learning_rate": 4.7462686567164177e-07, + "loss": -0.0005, + "reward": 0.5555555820465088, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1272 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.4166717529297, + "epoch": 9.567164179104477, + "grad_norm": 0.32632929384922543, + "learning_rate": 4.7499999999999995e-07, + "loss": 0.0008, + "reward": 0.694444477558136, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1273 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.3333282470703, + "epoch": 9.574626865671641, + "grad_norm": 0.0, + "learning_rate": 4.753731343283582e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1274 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.4166717529297, + "epoch": 9.582089552238806, + "grad_norm": 0.392790029417539, + "learning_rate": 4.757462686567164e-07, + "loss": -0.0004, + "reward": 0.6666666865348816, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1275 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.5277862548828, + "epoch": 9.58955223880597, + "grad_norm": 0.5758101506485983, + "learning_rate": 4.761194029850746e-07, + "loss": 0.0017, + "reward": 0.8055555820465088, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1276 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.4166717529297, + "epoch": 9.597014925373134, + "grad_norm": 0.7272312488710028, + "learning_rate": 4.764925373134328e-07, + "loss": 0.0008, + "reward": 0.7222222089767456, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1277 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.7777862548828, + "epoch": 9.604477611940299, + "grad_norm": 0.0, + "learning_rate": 4.768656716417911e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1278 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.44444274902344, + "epoch": 9.611940298507463, + "grad_norm": 0.3101952991893188, + "learning_rate": 4.772388059701492e-07, + "loss": 0.0001, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1279 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.6666717529297, + "epoch": 9.619402985074627, + "grad_norm": 0.6367766750981361, + "learning_rate": 4.776119402985074e-07, + "loss": -0.0007, + "reward": 0.4166666567325592, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 1280 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.1666717529297, + "epoch": 9.626865671641792, + "grad_norm": 0.6409567148925185, + "learning_rate": 4.779850746268657e-07, + "loss": -0.0, + "reward": 0.3888888955116272, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 1281 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.69444274902344, + "epoch": 9.634328358208956, + "grad_norm": 0.16602674238025242, + "learning_rate": 4.783582089552239e-07, + "loss": 0.0001, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1282 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.75, + "epoch": 9.64179104477612, + "grad_norm": 0.29984909539597926, + "learning_rate": 4.78731343283582e-07, + "loss": 0.0007, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1283 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.86111450195312, + "epoch": 9.649253731343283, + "grad_norm": 0.6295701528321788, + "learning_rate": 4.791044776119403e-07, + "loss": 0.0001, + "reward": 0.6666666865348816, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1284 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.94444274902344, + "epoch": 9.656716417910447, + "grad_norm": 0.333570892823129, + "learning_rate": 4.794776119402985e-07, + "loss": -0.0002, + "reward": 0.5277777910232544, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1285 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.44444274902344, + "epoch": 9.664179104477611, + "grad_norm": 0.0, + "learning_rate": 4.798507462686567e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1286 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.75, + "epoch": 9.671641791044776, + "grad_norm": 0.4910849010940057, + "learning_rate": 4.802238805970149e-07, + "loss": -0.001, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1287 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.7777862548828, + "epoch": 9.67910447761194, + "grad_norm": 0.31476682620058616, + "learning_rate": 4.805970149253731e-07, + "loss": -0.0005, + "reward": 0.4444444477558136, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1288 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.86111450195312, + "epoch": 9.686567164179104, + "grad_norm": 0.6887777507758581, + "learning_rate": 4.809701492537313e-07, + "loss": 0.001, + "reward": 0.6388888955116272, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1289 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.0277862548828, + "epoch": 9.694029850746269, + "grad_norm": 0.8419433970160658, + "learning_rate": 4.813432835820896e-07, + "loss": -0.0001, + "reward": 0.6388888955116272, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1290 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.55555725097656, + "epoch": 9.701492537313433, + "grad_norm": 0.7486563564669354, + "learning_rate": 4.817164179104478e-07, + "loss": 0.0013, + "reward": 0.5, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1291 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.9166717529297, + "epoch": 9.708955223880597, + "grad_norm": 1.845547056816163, + "learning_rate": 4.820895522388059e-07, + "loss": -0.0018, + "reward": 0.4166666567325592, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 1292 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.38888549804688, + "epoch": 9.716417910447761, + "grad_norm": 0.4206071950920363, + "learning_rate": 4.824626865671642e-07, + "loss": -0.0003, + "reward": 0.4444444477558136, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1293 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.5, + "epoch": 9.723880597014926, + "grad_norm": 4.441286002139875, + "learning_rate": 4.828358208955224e-07, + "loss": 0.0004, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1294 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.5, + "epoch": 9.73134328358209, + "grad_norm": 0.7179610935799493, + "learning_rate": 4.832089552238805e-07, + "loss": -0.0003, + "reward": 0.8055555820465088, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1295 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.5277862548828, + "epoch": 9.738805970149254, + "grad_norm": 0.23616134325947666, + "learning_rate": 4.835820895522387e-07, + "loss": 0.0, + "reward": 0.4166666567325592, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 1296 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.75, + "epoch": 9.746268656716419, + "grad_norm": 1.5523174750573645, + "learning_rate": 4.83955223880597e-07, + "loss": 0.0003, + "reward": 0.694444477558136, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1297 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.3333282470703, + "epoch": 9.753731343283581, + "grad_norm": 0.6424332861239532, + "learning_rate": 4.843283582089552e-07, + "loss": -0.0002, + "reward": 0.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1298 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.2777862548828, + "epoch": 9.761194029850746, + "grad_norm": 0.6811459167000017, + "learning_rate": 4.847014925373134e-07, + "loss": 0.0001, + "reward": 0.6666666865348816, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1299 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.7777862548828, + "epoch": 9.76865671641791, + "grad_norm": 0.8550452055274147, + "learning_rate": 4.850746268656717e-07, + "loss": -0.0007, + "reward": 0.6666666865348816, + "reward_std": 0.39748334884643555, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1300 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.30555725097656, + "epoch": 9.776119402985074, + "grad_norm": 0.792537747107226, + "learning_rate": 4.854477611940298e-07, + "loss": 0.0004, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1301 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.6666717529297, + "epoch": 9.783582089552239, + "grad_norm": 0.3111371409457674, + "learning_rate": 4.85820895522388e-07, + "loss": 0.0002, + "reward": 0.694444477558136, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1302 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.19444274902344, + "epoch": 9.791044776119403, + "grad_norm": 0.6382988415776253, + "learning_rate": 4.861940298507463e-07, + "loss": 0.0019, + "reward": 0.3333333432674408, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 1303 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.3333282470703, + "epoch": 9.798507462686567, + "grad_norm": 0.8180694625902475, + "learning_rate": 4.865671641791044e-07, + "loss": -0.0007, + "reward": 0.5277777910232544, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1304 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.6666717529297, + "epoch": 9.805970149253731, + "grad_norm": 0.7832137373749383, + "learning_rate": 4.869402985074626e-07, + "loss": 0.0011, + "reward": 0.6388888955116272, + "reward_std": 0.2777777910232544, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1305 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.47222900390625, + "epoch": 9.813432835820896, + "grad_norm": 0.43126168852740426, + "learning_rate": 4.873134328358209e-07, + "loss": 0.0004, + "reward": 0.8055555820465088, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1306 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.0, + "epoch": 9.82089552238806, + "grad_norm": 0.4367593325543559, + "learning_rate": 4.876865671641791e-07, + "loss": 0.0004, + "reward": 0.8055555820465088, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1307 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.55555725097656, + "epoch": 9.828358208955224, + "grad_norm": 0.28402462665465433, + "learning_rate": 4.880597014925372e-07, + "loss": -0.0006, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1308 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.86111450195312, + "epoch": 9.835820895522389, + "grad_norm": 0.6937834384087703, + "learning_rate": 4.884328358208955e-07, + "loss": 0.0004, + "reward": 0.6111111044883728, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1309 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.47222900390625, + "epoch": 9.843283582089553, + "grad_norm": 1.3458219905184012, + "learning_rate": 4.888059701492537e-07, + "loss": 0.0006, + "reward": 0.5555555820465088, + "reward_std": 0.4318612515926361, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1310 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.38888549804688, + "epoch": 9.850746268656717, + "grad_norm": 0.6243426451320767, + "learning_rate": 4.891791044776119e-07, + "loss": 0.0018, + "reward": 0.75, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1311 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.0833282470703, + "epoch": 9.85820895522388, + "grad_norm": 1.190272270737968, + "learning_rate": 4.895522388059702e-07, + "loss": 0.0003, + "reward": 0.694444477558136, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1312 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.3333282470703, + "epoch": 9.865671641791044, + "grad_norm": 0.9055509034812693, + "learning_rate": 4.899253731343284e-07, + "loss": 0.0011, + "reward": 0.75, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1313 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.0, + "epoch": 9.873134328358208, + "grad_norm": 0.45145400446423, + "learning_rate": 4.902985074626865e-07, + "loss": -0.0003, + "reward": 0.694444477558136, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1314 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.11111450195312, + "epoch": 9.880597014925373, + "grad_norm": 0.3269549745679248, + "learning_rate": 4.906716417910448e-07, + "loss": 0.0005, + "reward": 0.5555555820465088, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1315 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.86111450195312, + "epoch": 9.888059701492537, + "grad_norm": 0.3737895527351195, + "learning_rate": 4.91044776119403e-07, + "loss": 0.0007, + "reward": 0.8055555820465088, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1316 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.0277862548828, + "epoch": 9.895522388059701, + "grad_norm": 0.5412946378471611, + "learning_rate": 4.914179104477611e-07, + "loss": 0.0, + "reward": 0.75, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1317 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.6666717529297, + "epoch": 9.902985074626866, + "grad_norm": 1.4483354365810672, + "learning_rate": 4.917910447761194e-07, + "loss": -0.0022, + "reward": 0.694444477558136, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1318 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.61111450195312, + "epoch": 9.91044776119403, + "grad_norm": 0.33789429395692366, + "learning_rate": 4.921641791044776e-07, + "loss": -0.0002, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1319 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.72222900390625, + "epoch": 9.917910447761194, + "grad_norm": 0.28563211652535053, + "learning_rate": 4.925373134328357e-07, + "loss": 0.0001, + "reward": 0.4166666567325592, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 1320 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.5, + "epoch": 9.925373134328359, + "grad_norm": 0.5838674846520785, + "learning_rate": 4.92910447761194e-07, + "loss": 0.0017, + "reward": 0.6388888955116272, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1321 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.6666717529297, + "epoch": 9.932835820895523, + "grad_norm": 0.42796413573267067, + "learning_rate": 4.932835820895522e-07, + "loss": -0.0003, + "reward": 0.6666666865348816, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1322 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.25, + "epoch": 9.940298507462687, + "grad_norm": 0.304892024892549, + "learning_rate": 4.936567164179104e-07, + "loss": 0.0006, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1323 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.1666717529297, + "epoch": 9.947761194029852, + "grad_norm": 0.3550553643560505, + "learning_rate": 4.940298507462687e-07, + "loss": -0.0001, + "reward": 0.944444477558136, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 0.0, + "step": 1324 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.7777862548828, + "epoch": 9.955223880597014, + "grad_norm": 0.9948677898022804, + "learning_rate": 4.944029850746269e-07, + "loss": 0.001, + "reward": 0.472222238779068, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1325 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.6666717529297, + "epoch": 9.962686567164178, + "grad_norm": 0.3322370408543146, + "learning_rate": 4.94776119402985e-07, + "loss": -0.0003, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1326 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.25, + "epoch": 9.970149253731343, + "grad_norm": 0.706613505234307, + "learning_rate": 4.951492537313433e-07, + "loss": 0.0006, + "reward": 0.4444444477558136, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1327 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.6666717529297, + "epoch": 9.977611940298507, + "grad_norm": 0.4421566984068235, + "learning_rate": 4.955223880597015e-07, + "loss": 0.0002, + "reward": 0.6111111044883728, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1328 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.44444274902344, + "epoch": 9.985074626865671, + "grad_norm": 0.3647104530492206, + "learning_rate": 4.958955223880597e-07, + "loss": -0.0004, + "reward": 0.472222238779068, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1329 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.91667175292969, + "epoch": 9.992537313432836, + "grad_norm": 0.4509039536759182, + "learning_rate": 4.962686567164179e-07, + "loss": 0.0005, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1330 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.72222900390625, + "epoch": 10.007462686567164, + "grad_norm": 0.23555933268716794, + "learning_rate": 4.966417910447761e-07, + "loss": -0.0002, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1331 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.25, + "epoch": 10.014925373134329, + "grad_norm": 0.6310449883541014, + "learning_rate": 4.970149253731343e-07, + "loss": 0.0008, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1332 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.3333282470703, + "epoch": 10.022388059701493, + "grad_norm": 0.25753313695106733, + "learning_rate": 4.973880597014926e-07, + "loss": 0.0003, + "reward": 0.8333333134651184, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 1333 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.86111450195312, + "epoch": 10.029850746268657, + "grad_norm": 0.0, + "learning_rate": 4.977611940298507e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1334 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.3333282470703, + "epoch": 10.037313432835822, + "grad_norm": 0.6507432811292775, + "learning_rate": 4.981343283582089e-07, + "loss": -0.0004, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1335 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.02777862548828, + "epoch": 10.044776119402986, + "grad_norm": 0.3093442529863285, + "learning_rate": 4.985074626865671e-07, + "loss": 0.0005, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1336 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.69444274902344, + "epoch": 10.052238805970148, + "grad_norm": 0.39396037812143575, + "learning_rate": 4.988805970149254e-07, + "loss": -0.0001, + "reward": 0.5, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1337 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.19444274902344, + "epoch": 10.059701492537313, + "grad_norm": 0.42774041517388417, + "learning_rate": 4.992537313432836e-07, + "loss": 0.0006, + "reward": 0.6111111044883728, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1338 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.22222900390625, + "epoch": 10.067164179104477, + "grad_norm": 0.26826826110732943, + "learning_rate": 4.996268656716417e-07, + "loss": 0.0007, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1339 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.36111450195312, + "epoch": 10.074626865671641, + "grad_norm": 0.5522252844215468, + "learning_rate": 5e-07, + "loss": -0.0004, + "reward": 0.694444477558136, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1340 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.80555725097656, + "epoch": 10.082089552238806, + "grad_norm": 0.44469021772981737, + "learning_rate": 4.999999915176705e-07, + "loss": 0.0012, + "reward": 0.9166666865348816, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 0.0, + "step": 1341 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.69444274902344, + "epoch": 10.08955223880597, + "grad_norm": 0.3611947658785573, + "learning_rate": 4.999999660706824e-07, + "loss": -0.0, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1342 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.2777862548828, + "epoch": 10.097014925373134, + "grad_norm": 0.7660522744343511, + "learning_rate": 4.999999236590376e-07, + "loss": -0.0003, + "reward": 0.6666666865348816, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1343 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.13888549804688, + "epoch": 10.104477611940299, + "grad_norm": 0.8790592192988999, + "learning_rate": 4.99999864282739e-07, + "loss": -0.0008, + "reward": 0.25, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.0, + "step": 1344 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.44444274902344, + "epoch": 10.111940298507463, + "grad_norm": 0.45141565903040476, + "learning_rate": 4.999997879417906e-07, + "loss": 0.0006, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1345 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.05555725097656, + "epoch": 10.119402985074627, + "grad_norm": 1.0719763967035179, + "learning_rate": 4.999996946361974e-07, + "loss": 0.0004, + "reward": 0.8055555820465088, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1346 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.6666717529297, + "epoch": 10.126865671641792, + "grad_norm": 1.359915868617331, + "learning_rate": 4.999995843659658e-07, + "loss": 0.0015, + "reward": 0.6388888955116272, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1347 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.22222900390625, + "epoch": 10.134328358208956, + "grad_norm": 2.849827109185726, + "learning_rate": 4.999994571311035e-07, + "loss": 0.0008, + "reward": 0.5277777910232544, + "reward_std": 0.24800565838813782, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1348 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.1666717529297, + "epoch": 10.14179104477612, + "grad_norm": 0.0, + "learning_rate": 4.99999312931619e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1349 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.5277862548828, + "epoch": 10.149253731343283, + "grad_norm": 0.43774564168466235, + "learning_rate": 4.999991517675219e-07, + "loss": 0.0, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1350 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.22222137451172, + "epoch": 10.156716417910447, + "grad_norm": 0.3594018166378687, + "learning_rate": 4.999989736388234e-07, + "loss": 0.0003, + "reward": 0.5, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1351 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.63888549804688, + "epoch": 10.164179104477611, + "grad_norm": 0.3303619702876663, + "learning_rate": 4.999987785455355e-07, + "loss": -0.0008, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1352 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.69444274902344, + "epoch": 10.171641791044776, + "grad_norm": 0.0, + "learning_rate": 4.999985664876713e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1353 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.5, + "epoch": 10.17910447761194, + "grad_norm": 0.6911798641616512, + "learning_rate": 4.999983374652454e-07, + "loss": -0.0002, + "reward": 0.6666666865348816, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1354 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.13888549804688, + "epoch": 10.186567164179104, + "grad_norm": 0.1946838682874489, + "learning_rate": 4.999980914782733e-07, + "loss": 0.0, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1355 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.97222900390625, + "epoch": 10.194029850746269, + "grad_norm": 0.3078376801307895, + "learning_rate": 4.999978285267717e-07, + "loss": -0.0004, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1356 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.7777862548828, + "epoch": 10.201492537313433, + "grad_norm": 0.47306972179250295, + "learning_rate": 4.999975486107583e-07, + "loss": -0.0004, + "reward": 0.8333333134651184, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 1357 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.63888549804688, + "epoch": 10.208955223880597, + "grad_norm": 0.0, + "learning_rate": 4.999972517302522e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1358 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.4166717529297, + "epoch": 10.216417910447761, + "grad_norm": 0.5260457332444625, + "learning_rate": 4.999969378852735e-07, + "loss": -0.001, + "reward": 0.694444477558136, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1359 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.5277862548828, + "epoch": 10.223880597014926, + "grad_norm": 1.0081080257959936, + "learning_rate": 4.999966070758436e-07, + "loss": 0.0011, + "reward": 0.6666666865348816, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1360 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.36111450195312, + "epoch": 10.23134328358209, + "grad_norm": 0.0, + "learning_rate": 4.999962593019849e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1361 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.19444274902344, + "epoch": 10.238805970149254, + "grad_norm": 0.7863870043607424, + "learning_rate": 4.99995894563721e-07, + "loss": -0.0005, + "reward": 0.6388888955116272, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1362 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.55555725097656, + "epoch": 10.246268656716419, + "grad_norm": 0.3639747828427113, + "learning_rate": 4.999955128610765e-07, + "loss": -0.0004, + "reward": 0.5555555820465088, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1363 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.5277862548828, + "epoch": 10.253731343283581, + "grad_norm": 0.48157794804490656, + "learning_rate": 4.999951141940775e-07, + "loss": -0.0002, + "reward": 0.6388888955116272, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1364 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.8333282470703, + "epoch": 10.261194029850746, + "grad_norm": 0.47551489035655026, + "learning_rate": 4.99994698562751e-07, + "loss": 0.001, + "reward": 0.6666666865348816, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1365 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.61111450195312, + "epoch": 10.26865671641791, + "grad_norm": 0.3224760718548074, + "learning_rate": 4.999942659671251e-07, + "loss": 0.0005, + "reward": 0.8333333134651184, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 1366 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.38888549804688, + "epoch": 10.276119402985074, + "grad_norm": 0.6935234154350653, + "learning_rate": 4.999938164072294e-07, + "loss": -0.0003, + "reward": 0.5277777910232544, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1367 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.38888549804688, + "epoch": 10.283582089552239, + "grad_norm": 0.23954198207706248, + "learning_rate": 4.999933498830942e-07, + "loss": -0.0005, + "reward": 0.6388888955116272, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1368 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.86111450195312, + "epoch": 10.291044776119403, + "grad_norm": 0.0, + "learning_rate": 4.999928663947512e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1369 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.13888549804688, + "epoch": 10.298507462686567, + "grad_norm": 0.347962308192342, + "learning_rate": 4.999923659422331e-07, + "loss": -0.0005, + "reward": 0.5833333134651184, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1370 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.0277862548828, + "epoch": 10.305970149253731, + "grad_norm": 0.5202050753831072, + "learning_rate": 4.99991848525574e-07, + "loss": 0.0005, + "reward": 0.4444444477558136, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1371 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.88888549804688, + "epoch": 10.313432835820896, + "grad_norm": 0.5282633695853965, + "learning_rate": 4.999913141448092e-07, + "loss": 0.0, + "reward": 0.7222222089767456, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1372 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.27777862548828, + "epoch": 10.32089552238806, + "grad_norm": 0.40471701933836446, + "learning_rate": 4.999907627999746e-07, + "loss": 0.0003, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1373 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.80555725097656, + "epoch": 10.328358208955224, + "grad_norm": 0.6104193415401827, + "learning_rate": 4.999901944911078e-07, + "loss": -0.0005, + "reward": 0.3333333432674408, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 1374 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.38888549804688, + "epoch": 10.335820895522389, + "grad_norm": 0.7481435717675956, + "learning_rate": 4.999896092182473e-07, + "loss": -0.0, + "reward": 0.5555555820465088, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1375 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.4166717529297, + "epoch": 10.343283582089553, + "grad_norm": 0.49159031180877627, + "learning_rate": 4.999890069814328e-07, + "loss": -0.0002, + "reward": 0.6666666865348816, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1376 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.61111450195312, + "epoch": 10.350746268656717, + "grad_norm": 0.8356040915094516, + "learning_rate": 4.999883877807053e-07, + "loss": -0.0002, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1377 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.19444274902344, + "epoch": 10.35820895522388, + "grad_norm": 0.7856475386919116, + "learning_rate": 4.999877516161067e-07, + "loss": 0.0008, + "reward": 0.3888888955116272, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.3888888955116272, + "rewards/format_reward": 0.0, + "step": 1378 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.69444274902344, + "epoch": 10.365671641791044, + "grad_norm": 0.10260022613117116, + "learning_rate": 4.999870984876801e-07, + "loss": 0.0, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1379 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.88888549804688, + "epoch": 10.373134328358208, + "grad_norm": 0.3875426461195343, + "learning_rate": 4.999864283954701e-07, + "loss": 0.0005, + "reward": 0.6666666865348816, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1380 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.94444274902344, + "epoch": 10.380597014925373, + "grad_norm": 8.094422304994216, + "learning_rate": 4.999857413395219e-07, + "loss": 0.0006, + "reward": 0.694444477558136, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1381 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.7777862548828, + "epoch": 10.388059701492537, + "grad_norm": 0.6528465112733723, + "learning_rate": 4.999850373198822e-07, + "loss": 0.0006, + "reward": 0.5833333134651184, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1382 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.44444274902344, + "epoch": 10.395522388059701, + "grad_norm": 0.3035418690961421, + "learning_rate": 4.999843163365989e-07, + "loss": 0.0003, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1383 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.86111450195312, + "epoch": 10.402985074626866, + "grad_norm": 0.9057244309358783, + "learning_rate": 4.999835783897206e-07, + "loss": -0.0008, + "reward": 0.472222238779068, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1384 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.33333587646484, + "epoch": 10.41044776119403, + "grad_norm": 0.9965463639164558, + "learning_rate": 4.999828234792978e-07, + "loss": 0.0007, + "reward": 0.8333333134651184, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 1385 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.69444274902344, + "epoch": 10.417910447761194, + "grad_norm": 0.26474435399423696, + "learning_rate": 4.999820516053814e-07, + "loss": -0.0001, + "reward": 0.4444444477558136, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1386 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.19444274902344, + "epoch": 10.425373134328359, + "grad_norm": 0.5320811508524422, + "learning_rate": 4.999812627680239e-07, + "loss": -0.0003, + "reward": 0.8055555820465088, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1387 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.9166717529297, + "epoch": 10.432835820895523, + "grad_norm": 0.769107000167622, + "learning_rate": 4.999804569672788e-07, + "loss": -0.0006, + "reward": 0.3611111044883728, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 1388 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.0, + "epoch": 10.440298507462687, + "grad_norm": 0.0, + "learning_rate": 4.999796342032009e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1389 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.4166717529297, + "epoch": 10.447761194029852, + "grad_norm": 0.5245203490946377, + "learning_rate": 4.999787944758459e-07, + "loss": -0.0005, + "reward": 0.5277777910232544, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1390 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.88888549804688, + "epoch": 10.455223880597014, + "grad_norm": 0.27785034492094196, + "learning_rate": 4.999779377852708e-07, + "loss": -0.0003, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1391 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.44444274902344, + "epoch": 10.462686567164178, + "grad_norm": 1.977773480144514, + "learning_rate": 4.999770641315337e-07, + "loss": -0.0002, + "reward": 0.8888888955116272, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1392 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.5277862548828, + "epoch": 10.470149253731343, + "grad_norm": 0.675003887338134, + "learning_rate": 4.99976173514694e-07, + "loss": 0.0006, + "reward": 0.694444477558136, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1393 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.63888549804688, + "epoch": 10.477611940298507, + "grad_norm": 0.45733345995347274, + "learning_rate": 4.999752659348121e-07, + "loss": 0.0004, + "reward": 0.8888888955116272, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1394 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.22222900390625, + "epoch": 10.485074626865671, + "grad_norm": 0.43874587487982963, + "learning_rate": 4.999743413919495e-07, + "loss": 0.0006, + "reward": 0.8055555820465088, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1395 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.25, + "epoch": 10.492537313432836, + "grad_norm": 0.5987914641529863, + "learning_rate": 4.999733998861691e-07, + "loss": -0.0021, + "reward": 0.75, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1396 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.2777862548828, + "epoch": 10.5, + "grad_norm": 0.6624689698083571, + "learning_rate": 4.999724414175346e-07, + "loss": -0.0002, + "reward": 0.7222222089767456, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1397 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.11111450195312, + "epoch": 10.507462686567164, + "grad_norm": 0.4754426102443842, + "learning_rate": 4.999714659861111e-07, + "loss": -0.0002, + "reward": 0.6111111044883728, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1398 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.88888549804688, + "epoch": 10.514925373134329, + "grad_norm": 0.5858697798511134, + "learning_rate": 4.999704735919649e-07, + "loss": -0.0005, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1399 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.5, + "epoch": 10.522388059701493, + "grad_norm": 0.518728387560601, + "learning_rate": 4.999694642351632e-07, + "loss": 0.0005, + "reward": 0.6666666865348816, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1400 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.05555725097656, + "epoch": 10.529850746268657, + "grad_norm": 0.26761053032774296, + "learning_rate": 4.999684379157746e-07, + "loss": 0.0008, + "reward": 0.5, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1401 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.63888549804688, + "epoch": 10.537313432835822, + "grad_norm": 0.7920538458521026, + "learning_rate": 4.999673946338687e-07, + "loss": 0.0004, + "reward": 0.6388888955116272, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1402 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.6666717529297, + "epoch": 10.544776119402986, + "grad_norm": 0.0, + "learning_rate": 4.999663343895163e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1403 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.22222900390625, + "epoch": 10.552238805970148, + "grad_norm": 1.0714135699741405, + "learning_rate": 4.999652571827893e-07, + "loss": -0.0008, + "reward": 0.6111111044883728, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1404 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.19444274902344, + "epoch": 10.559701492537313, + "grad_norm": 1.7538764622876721, + "learning_rate": 4.999641630137609e-07, + "loss": 0.0014, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1405 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.19444274902344, + "epoch": 10.567164179104477, + "grad_norm": 0.2087988474413746, + "learning_rate": 4.999630518825052e-07, + "loss": -0.0003, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1406 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.05555725097656, + "epoch": 10.574626865671641, + "grad_norm": 0.486836867014833, + "learning_rate": 4.999619237890978e-07, + "loss": 0.0, + "reward": 0.8333333134651184, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 1407 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.75, + "epoch": 10.582089552238806, + "grad_norm": 0.2660670730966183, + "learning_rate": 4.999607787336151e-07, + "loss": 0.0, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1408 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.05555725097656, + "epoch": 10.58955223880597, + "grad_norm": 0.8161376977633815, + "learning_rate": 4.999596167161348e-07, + "loss": 0.0, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1409 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.97222900390625, + "epoch": 10.597014925373134, + "grad_norm": 0.40715510700040497, + "learning_rate": 4.999584377367359e-07, + "loss": 0.0008, + "reward": 0.5833333134651184, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1410 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.11111450195312, + "epoch": 10.604477611940299, + "grad_norm": 0.5263263989736651, + "learning_rate": 4.999572417954982e-07, + "loss": -0.0005, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1411 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.7777862548828, + "epoch": 10.611940298507463, + "grad_norm": 0.0, + "learning_rate": 4.99956028892503e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1412 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.13888549804688, + "epoch": 10.619402985074627, + "grad_norm": 0.3475508513647095, + "learning_rate": 4.999547990278325e-07, + "loss": 0.0003, + "reward": 0.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1413 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.13888549804688, + "epoch": 10.626865671641792, + "grad_norm": 0.18703724603446015, + "learning_rate": 4.999535522015702e-07, + "loss": 0.0, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1414 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.30555725097656, + "epoch": 10.634328358208956, + "grad_norm": 0.4368203251708063, + "learning_rate": 4.999522884138008e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1415 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.4166717529297, + "epoch": 10.64179104477612, + "grad_norm": 0.36190545277073954, + "learning_rate": 4.999510076646099e-07, + "loss": 0.0007, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1416 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.0, + "epoch": 10.649253731343283, + "grad_norm": 0.41227018366017026, + "learning_rate": 4.999497099540847e-07, + "loss": 0.0007, + "reward": 0.694444477558136, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1417 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.13888549804688, + "epoch": 10.656716417910447, + "grad_norm": 0.0, + "learning_rate": 4.999483952823129e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1418 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.0277862548828, + "epoch": 10.664179104477611, + "grad_norm": 0.0, + "learning_rate": 4.999470636493839e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1419 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.75, + "epoch": 10.671641791044776, + "grad_norm": 0.8609885170613415, + "learning_rate": 4.99945715055388e-07, + "loss": -0.001, + "reward": 0.472222238779068, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1420 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.11111450195312, + "epoch": 10.67910447761194, + "grad_norm": 0.0, + "learning_rate": 4.999443495004167e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1421 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.25, + "epoch": 10.686567164179104, + "grad_norm": 0.9674094251145361, + "learning_rate": 4.999429669845628e-07, + "loss": 0.0003, + "reward": 0.472222238779068, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1422 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.38888549804688, + "epoch": 10.694029850746269, + "grad_norm": 0.23527848516768932, + "learning_rate": 4.999415675079199e-07, + "loss": -0.0002, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1423 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.61111450195312, + "epoch": 10.701492537313433, + "grad_norm": 0.2731608548063141, + "learning_rate": 4.999401510705833e-07, + "loss": 0.0001, + "reward": 0.4166666567325592, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 1424 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.61111450195312, + "epoch": 10.708955223880597, + "grad_norm": 0.2520920774865539, + "learning_rate": 4.999387176726487e-07, + "loss": -0.0001, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1425 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.11111450195312, + "epoch": 10.716417910447761, + "grad_norm": 0.6382089495590939, + "learning_rate": 4.999372673142137e-07, + "loss": 0.0001, + "reward": 0.9166666865348816, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 0.0, + "step": 1426 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.19444274902344, + "epoch": 10.723880597014926, + "grad_norm": 0.2544554623105827, + "learning_rate": 4.999357999953766e-07, + "loss": 0.0, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1427 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.86111450195312, + "epoch": 10.73134328358209, + "grad_norm": 0.5653212883133745, + "learning_rate": 4.999343157162369e-07, + "loss": -0.0014, + "reward": 0.472222238779068, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1428 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.94444274902344, + "epoch": 10.738805970149254, + "grad_norm": 0.19765690595784244, + "learning_rate": 4.999328144768954e-07, + "loss": 0.001, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1429 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.38888549804688, + "epoch": 10.746268656716419, + "grad_norm": 0.0, + "learning_rate": 4.99931296277454e-07, + "loss": 0.0, + "reward": 0.3333333432674408, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 1430 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.7777862548828, + "epoch": 10.753731343283581, + "grad_norm": 0.8838137226614755, + "learning_rate": 4.999297611180157e-07, + "loss": -0.0, + "reward": 0.6666666865348816, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1431 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.38888549804688, + "epoch": 10.761194029850746, + "grad_norm": 0.6778529847265503, + "learning_rate": 4.999282089986845e-07, + "loss": 0.0008, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1432 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.1388931274414, + "epoch": 10.76865671641791, + "grad_norm": 0.48120498348887303, + "learning_rate": 4.99926639919566e-07, + "loss": 0.0006, + "reward": 0.8055555820465088, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1433 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.69444274902344, + "epoch": 10.776119402985074, + "grad_norm": 0.9879720525848757, + "learning_rate": 4.999250538807666e-07, + "loss": -0.0017, + "reward": 0.6666666865348816, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1434 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.2777862548828, + "epoch": 10.783582089552239, + "grad_norm": 0.3878664888663649, + "learning_rate": 4.999234508823937e-07, + "loss": -0.0007, + "reward": 0.5277777910232544, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1435 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.80555725097656, + "epoch": 10.791044776119403, + "grad_norm": 0.3870055511143289, + "learning_rate": 4.999218309245564e-07, + "loss": 0.0003, + "reward": 0.5555555820465088, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1436 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.61111450195312, + "epoch": 10.798507462686567, + "grad_norm": 1.8641440874624327, + "learning_rate": 4.999201940073645e-07, + "loss": -0.0, + "reward": 0.7222222089767456, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1437 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.61111450195312, + "epoch": 10.805970149253731, + "grad_norm": 0.49387950651925017, + "learning_rate": 4.99918540130929e-07, + "loss": 0.0009, + "reward": 0.6111111044883728, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1438 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.33333587646484, + "epoch": 10.813432835820896, + "grad_norm": 0.3346447035361689, + "learning_rate": 4.999168692953623e-07, + "loss": 0.0006, + "reward": 0.8888888955116272, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1439 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.36111450195312, + "epoch": 10.82089552238806, + "grad_norm": 0.1864714102322232, + "learning_rate": 4.999151815007776e-07, + "loss": -0.0003, + "reward": 0.472222238779068, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1440 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.1666717529297, + "epoch": 10.828358208955224, + "grad_norm": 0.4803074018202512, + "learning_rate": 4.999134767472896e-07, + "loss": -0.0009, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1441 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.6666717529297, + "epoch": 10.835820895522389, + "grad_norm": 1.1608677979149395, + "learning_rate": 4.999117550350137e-07, + "loss": -0.0001, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1442 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.36111450195312, + "epoch": 10.843283582089553, + "grad_norm": 0.7332152891250489, + "learning_rate": 4.999100163640671e-07, + "loss": -0.0007, + "reward": 0.6666666865348816, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1443 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.86111450195312, + "epoch": 10.850746268656717, + "grad_norm": 0.0, + "learning_rate": 4.999082607345676e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1444 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.3333282470703, + "epoch": 10.85820895522388, + "grad_norm": 0.5076496426040725, + "learning_rate": 4.999064881466342e-07, + "loss": 0.0003, + "reward": 0.5, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1445 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.4166717529297, + "epoch": 10.865671641791044, + "grad_norm": 0.32429285497304083, + "learning_rate": 4.999046986003874e-07, + "loss": 0.0003, + "reward": 0.6388888955116272, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1446 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.0833282470703, + "epoch": 10.873134328358208, + "grad_norm": 1.1299810109320998, + "learning_rate": 4.999028920959486e-07, + "loss": -0.0007, + "reward": 0.472222238779068, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1447 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.0277862548828, + "epoch": 10.880597014925373, + "grad_norm": 0.31786730548058906, + "learning_rate": 4.999010686334404e-07, + "loss": -0.0008, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1448 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.19444274902344, + "epoch": 10.888059701492537, + "grad_norm": 0.0, + "learning_rate": 4.998992282129864e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1449 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.61111450195312, + "epoch": 10.895522388059701, + "grad_norm": 0.5446862108609136, + "learning_rate": 4.998973708347115e-07, + "loss": 0.0009, + "reward": 0.5, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1450 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.38888549804688, + "epoch": 10.902985074626866, + "grad_norm": 0.24252887124727232, + "learning_rate": 4.998954964987419e-07, + "loss": 0.0003, + "reward": 0.472222238779068, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1451 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.30555725097656, + "epoch": 10.91044776119403, + "grad_norm": 0.0, + "learning_rate": 4.998936052052048e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1452 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.86111450195312, + "epoch": 10.917910447761194, + "grad_norm": 0.5672037017272995, + "learning_rate": 4.998916969542284e-07, + "loss": 0.0007, + "reward": 0.6388888955116272, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1453 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.94444274902344, + "epoch": 10.925373134328359, + "grad_norm": 0.836973864524552, + "learning_rate": 4.998897717459421e-07, + "loss": 0.0003, + "reward": 0.5, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1454 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.22222900390625, + "epoch": 10.932835820895523, + "grad_norm": 0.771789909777077, + "learning_rate": 4.998878295804768e-07, + "loss": 0.0015, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1455 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.16666412353516, + "epoch": 10.940298507462687, + "grad_norm": 1.3404730967735392, + "learning_rate": 4.998858704579642e-07, + "loss": 0.0008, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1456 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.72222900390625, + "epoch": 10.947761194029852, + "grad_norm": 0.23639226221729548, + "learning_rate": 4.998838943785372e-07, + "loss": 0.0003, + "reward": 0.2777777910232544, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.2777777910232544, + "rewards/format_reward": 0.0, + "step": 1457 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.11111450195312, + "epoch": 10.955223880597014, + "grad_norm": 0.663164088255606, + "learning_rate": 4.998819013423298e-07, + "loss": 0.0006, + "reward": 0.5555555820465088, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1458 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.44444274902344, + "epoch": 10.962686567164178, + "grad_norm": 0.42267593296060396, + "learning_rate": 4.998798913494775e-07, + "loss": 0.001, + "reward": 0.5277777910232544, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1459 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.44444274902344, + "epoch": 10.970149253731343, + "grad_norm": 0.22421895658018615, + "learning_rate": 4.998778644001165e-07, + "loss": 0.0002, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1460 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.86111450195312, + "epoch": 10.977611940298507, + "grad_norm": 0.5091154196768861, + "learning_rate": 4.998758204943844e-07, + "loss": -0.0006, + "reward": 0.75, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1461 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.0833282470703, + "epoch": 10.985074626865671, + "grad_norm": 0.2163998054721253, + "learning_rate": 4.998737596324199e-07, + "loss": 0.0003, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1462 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.58334350585938, + "epoch": 10.992537313432836, + "grad_norm": 0.0, + "learning_rate": 4.998716818143627e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1463 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.2777862548828, + "epoch": 11.007462686567164, + "grad_norm": 0.3102456110337327, + "learning_rate": 4.998695870403541e-07, + "loss": 0.0003, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1464 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.44444274902344, + "epoch": 11.014925373134329, + "grad_norm": 1.5465319531013388, + "learning_rate": 4.99867475310536e-07, + "loss": -0.0005, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1465 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.30555725097656, + "epoch": 11.022388059701493, + "grad_norm": 0.6713116167575864, + "learning_rate": 4.998653466250519e-07, + "loss": -0.0004, + "reward": 0.6666666865348816, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1466 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.80555725097656, + "epoch": 11.029850746268657, + "grad_norm": 0.6642271211443411, + "learning_rate": 4.99863200984046e-07, + "loss": -0.0003, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1467 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.69444274902344, + "epoch": 11.037313432835822, + "grad_norm": 3.5262877801093286, + "learning_rate": 4.998610383876641e-07, + "loss": 0.0013, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1468 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.11111450195312, + "epoch": 11.044776119402986, + "grad_norm": 1.224838332498126, + "learning_rate": 4.998588588360529e-07, + "loss": 0.0007, + "reward": 0.6111111044883728, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1469 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.11111450195312, + "epoch": 11.052238805970148, + "grad_norm": 0.46490173609583546, + "learning_rate": 4.998566623293603e-07, + "loss": 0.0006, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1470 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.05555725097656, + "epoch": 11.059701492537313, + "grad_norm": 0.8713383508248121, + "learning_rate": 4.998544488677353e-07, + "loss": -0.0252, + "reward": 0.7777777910232544, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1471 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.8333282470703, + "epoch": 11.067164179104477, + "grad_norm": 0.544326096275739, + "learning_rate": 4.998522184513281e-07, + "loss": -0.0002, + "reward": 0.6111111044883728, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1472 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.4166717529297, + "epoch": 11.074626865671641, + "grad_norm": 0.5969520629152141, + "learning_rate": 4.998499710802901e-07, + "loss": 0.0012, + "reward": 0.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1473 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.86111450195312, + "epoch": 11.082089552238806, + "grad_norm": 0.42557880607658904, + "learning_rate": 4.998477067547739e-07, + "loss": -0.001, + "reward": 0.3333333432674408, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 1474 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.38888549804688, + "epoch": 11.08955223880597, + "grad_norm": 0.4598801192423718, + "learning_rate": 4.998454254749331e-07, + "loss": 0.0001, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1475 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.5833282470703, + "epoch": 11.097014925373134, + "grad_norm": 0.3207554681174847, + "learning_rate": 4.998431272409222e-07, + "loss": -0.0003, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1476 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.0833282470703, + "epoch": 11.104477611940299, + "grad_norm": 0.2967142188801707, + "learning_rate": 4.998408120528976e-07, + "loss": 0.0012, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1477 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.11111450195312, + "epoch": 11.111940298507463, + "grad_norm": 0.3052534471164519, + "learning_rate": 4.998384799110162e-07, + "loss": 0.0, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1478 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.55555725097656, + "epoch": 11.119402985074627, + "grad_norm": 0.7475443786750254, + "learning_rate": 4.998361308154363e-07, + "loss": -0.0009, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1479 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.2777862548828, + "epoch": 11.126865671641792, + "grad_norm": 0.5631620809107314, + "learning_rate": 4.998337647663172e-07, + "loss": -0.0003, + "reward": 0.7777777910232544, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1480 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.36111450195312, + "epoch": 11.134328358208956, + "grad_norm": 0.4144323300834333, + "learning_rate": 4.998313817638197e-07, + "loss": 0.001, + "reward": 0.8333333134651184, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 1481 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.22222900390625, + "epoch": 11.14179104477612, + "grad_norm": 0.5079319362884119, + "learning_rate": 4.998289818081052e-07, + "loss": 0.0001, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1482 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.61111450195312, + "epoch": 11.149253731343283, + "grad_norm": 0.5776908583621652, + "learning_rate": 4.998265648993367e-07, + "loss": 0.0003, + "reward": 0.694444477558136, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1483 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.2777862548828, + "epoch": 11.156716417910447, + "grad_norm": 0.7281639751909577, + "learning_rate": 4.998241310376782e-07, + "loss": 0.0007, + "reward": 0.472222238779068, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1484 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.4166717529297, + "epoch": 11.164179104477611, + "grad_norm": 0.8751801917824992, + "learning_rate": 4.998216802232949e-07, + "loss": -0.0005, + "reward": 0.6666666865348816, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1485 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.8333282470703, + "epoch": 11.171641791044776, + "grad_norm": 0.31823513580594315, + "learning_rate": 4.998192124563531e-07, + "loss": 0.0, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1486 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.5, + "epoch": 11.17910447761194, + "grad_norm": 0.6411255195180007, + "learning_rate": 4.998167277370202e-07, + "loss": 0.001, + "reward": 0.6666666865348816, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1487 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.5833282470703, + "epoch": 11.186567164179104, + "grad_norm": 0.6597316906111331, + "learning_rate": 4.998142260654649e-07, + "loss": 0.0, + "reward": 0.7222222089767456, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1488 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.0, + "epoch": 11.194029850746269, + "grad_norm": 0.5907673497780299, + "learning_rate": 4.998117074418568e-07, + "loss": -0.0003, + "reward": 0.7222222089767456, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1489 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.86111450195312, + "epoch": 11.201492537313433, + "grad_norm": 0.25067611698401576, + "learning_rate": 4.998091718663671e-07, + "loss": 0.0001, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1490 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.63888549804688, + "epoch": 11.208955223880597, + "grad_norm": 0.0, + "learning_rate": 4.998066193391675e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1491 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.55555725097656, + "epoch": 11.216417910447761, + "grad_norm": 1.2639612702343916, + "learning_rate": 4.998040498604315e-07, + "loss": -0.0008, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1492 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.86111450195312, + "epoch": 11.223880597014926, + "grad_norm": 0.6890364106155197, + "learning_rate": 4.998014634303333e-07, + "loss": 0.0002, + "reward": 0.6666666865348816, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1493 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.5277862548828, + "epoch": 11.23134328358209, + "grad_norm": 0.5244110517710893, + "learning_rate": 4.997988600490485e-07, + "loss": -0.0002, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1494 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.6666717529297, + "epoch": 11.238805970149254, + "grad_norm": 0.23609270395766674, + "learning_rate": 4.997962397167536e-07, + "loss": 0.0005, + "reward": 0.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1495 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.19444274902344, + "epoch": 11.246268656716419, + "grad_norm": 0.34311525988312674, + "learning_rate": 4.997936024336267e-07, + "loss": 0.0009, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1496 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.0, + "epoch": 11.253731343283581, + "grad_norm": 0.847385919083214, + "learning_rate": 4.997909481998465e-07, + "loss": -0.0011, + "reward": 0.9166666865348816, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 0.0, + "step": 1497 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.5833282470703, + "epoch": 11.261194029850746, + "grad_norm": 0.0, + "learning_rate": 4.997882770155932e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1498 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.5833282470703, + "epoch": 11.26865671641791, + "grad_norm": 0.7559067291058659, + "learning_rate": 4.997855888810482e-07, + "loss": -0.0014, + "reward": 0.694444477558136, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1499 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.19444274902344, + "epoch": 11.276119402985074, + "grad_norm": 0.5704849222811408, + "learning_rate": 4.997828837963936e-07, + "loss": 0.0015, + "reward": 0.694444477558136, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1500 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.11111450195312, + "epoch": 11.283582089552239, + "grad_norm": 0.0, + "learning_rate": 4.997801617618133e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1501 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.9166717529297, + "epoch": 11.291044776119403, + "grad_norm": 0.32941613590654384, + "learning_rate": 4.997774227774918e-07, + "loss": 0.0008, + "reward": 0.472222238779068, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1502 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.63888549804688, + "epoch": 11.298507462686567, + "grad_norm": 0.0, + "learning_rate": 4.99774666843615e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1503 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.13888549804688, + "epoch": 11.305970149253731, + "grad_norm": 0.35191318533312566, + "learning_rate": 4.9977189396037e-07, + "loss": 0.0004, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1504 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.8333282470703, + "epoch": 11.313432835820896, + "grad_norm": 0.18719139912361082, + "learning_rate": 4.99769104127945e-07, + "loss": 0.0004, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1505 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.22222900390625, + "epoch": 11.32089552238806, + "grad_norm": 0.0, + "learning_rate": 4.99766297346529e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1506 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.13888549804688, + "epoch": 11.328358208955224, + "grad_norm": 0.3348396631959664, + "learning_rate": 4.997634736163127e-07, + "loss": 0.0, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1507 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.5833282470703, + "epoch": 11.335820895522389, + "grad_norm": 0.22394640115049572, + "learning_rate": 4.997606329374879e-07, + "loss": -0.0002, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1508 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.61111450195312, + "epoch": 11.343283582089553, + "grad_norm": 0.2757077125452193, + "learning_rate": 4.99757775310247e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1509 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.5, + "epoch": 11.350746268656717, + "grad_norm": 0.49004896272127124, + "learning_rate": 4.997549007347842e-07, + "loss": -0.0012, + "reward": 0.8611111044883728, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1510 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.0833282470703, + "epoch": 11.35820895522388, + "grad_norm": 0.3993449338863607, + "learning_rate": 4.997520092112943e-07, + "loss": 0.001, + "reward": 0.472222238779068, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1511 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.55555725097656, + "epoch": 11.365671641791044, + "grad_norm": 0.4899049343227502, + "learning_rate": 4.997491007399738e-07, + "loss": -0.0006, + "reward": 0.5277777910232544, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1512 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.3333282470703, + "epoch": 11.373134328358208, + "grad_norm": 0.0, + "learning_rate": 4.997461753210198e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1513 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.0277862548828, + "epoch": 11.380597014925373, + "grad_norm": 0.40870455478575984, + "learning_rate": 4.997432329546311e-07, + "loss": -0.0006, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1514 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.13888549804688, + "epoch": 11.388059701492537, + "grad_norm": 0.30675163097733077, + "learning_rate": 4.99740273641007e-07, + "loss": -0.0, + "reward": 0.3333333432674408, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 1515 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.22222900390625, + "epoch": 11.395522388059701, + "grad_norm": 0.34555449474075056, + "learning_rate": 4.997372973803486e-07, + "loss": -0.0002, + "reward": 0.5, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1516 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.5833282470703, + "epoch": 11.402985074626866, + "grad_norm": 0.3582824602588839, + "learning_rate": 4.997343041728579e-07, + "loss": -0.0001, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1517 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.0277862548828, + "epoch": 11.41044776119403, + "grad_norm": 0.3567971544778994, + "learning_rate": 4.997312940187377e-07, + "loss": 0.0001, + "reward": 0.8055555820465088, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1518 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.05555725097656, + "epoch": 11.417910447761194, + "grad_norm": 0.5390603120934296, + "learning_rate": 4.997282669181926e-07, + "loss": -0.0009, + "reward": 0.5, + "reward_std": 0.23941117525100708, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1519 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.30555725097656, + "epoch": 11.425373134328359, + "grad_norm": 0.8822016049606057, + "learning_rate": 4.997252228714278e-07, + "loss": -0.0004, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1520 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.44444274902344, + "epoch": 11.432835820895523, + "grad_norm": 0.6371045110903532, + "learning_rate": 4.9972216187865e-07, + "loss": 0.0002, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1521 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.5, + "epoch": 11.440298507462687, + "grad_norm": 0.35212741974007067, + "learning_rate": 4.997190839400669e-07, + "loss": 0.0003, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1522 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.3333282470703, + "epoch": 11.447761194029852, + "grad_norm": 0.19906167691170204, + "learning_rate": 4.997159890558872e-07, + "loss": 0.0, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1523 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.0833282470703, + "epoch": 11.455223880597014, + "grad_norm": 0.9163065563677241, + "learning_rate": 4.99712877226321e-07, + "loss": -0.0002, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1524 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.0833282470703, + "epoch": 11.462686567164178, + "grad_norm": 0.0, + "learning_rate": 4.997097484515797e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1525 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.88888549804688, + "epoch": 11.470149253731343, + "grad_norm": 0.0, + "learning_rate": 4.997066027318753e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1526 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.3333282470703, + "epoch": 11.477611940298507, + "grad_norm": 0.3002030415116218, + "learning_rate": 4.997034400674213e-07, + "loss": 0.0007, + "reward": 0.472222238779068, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1527 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.5, + "epoch": 11.485074626865671, + "grad_norm": 0.21351532597502101, + "learning_rate": 4.997002604584326e-07, + "loss": 0.0002, + "reward": 0.8333333134651184, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 1528 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.80555725097656, + "epoch": 11.492537313432836, + "grad_norm": 0.218581849146721, + "learning_rate": 4.996970639051246e-07, + "loss": 0.0005, + "reward": 0.5, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1529 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.88888549804688, + "epoch": 11.5, + "grad_norm": 0.0, + "learning_rate": 4.996938504077144e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1530 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.0833282470703, + "epoch": 11.507462686567164, + "grad_norm": 0.163482640928225, + "learning_rate": 4.9969061996642e-07, + "loss": 0.0002, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1531 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.8333282470703, + "epoch": 11.514925373134329, + "grad_norm": 0.0, + "learning_rate": 4.996873725814608e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1532 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.8333282470703, + "epoch": 11.522388059701493, + "grad_norm": 0.45696992748779564, + "learning_rate": 4.996841082530568e-07, + "loss": -0.0006, + "reward": 0.8333333134651184, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 1533 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.63888549804688, + "epoch": 11.529850746268657, + "grad_norm": 1.755937201971851, + "learning_rate": 4.996808269814298e-07, + "loss": 0.0002, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1534 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.2777862548828, + "epoch": 11.537313432835822, + "grad_norm": 0.0, + "learning_rate": 4.996775287668025e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1535 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.55555725097656, + "epoch": 11.544776119402986, + "grad_norm": 0.0, + "learning_rate": 4.996742136093985e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1536 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.0277862548828, + "epoch": 11.552238805970148, + "grad_norm": 0.33330137445248514, + "learning_rate": 4.996708815094429e-07, + "loss": 0.0002, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1537 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.38888549804688, + "epoch": 11.559701492537313, + "grad_norm": 0.8438713593098728, + "learning_rate": 4.996675324671617e-07, + "loss": 0.0001, + "reward": 0.5833333134651184, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1538 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.88888549804688, + "epoch": 11.567164179104477, + "grad_norm": 0.4760906388322811, + "learning_rate": 4.996641664827823e-07, + "loss": 0.0004, + "reward": 0.75, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1539 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.44444274902344, + "epoch": 11.574626865671641, + "grad_norm": 0.1819189934081577, + "learning_rate": 4.99660783556533e-07, + "loss": -0.0, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1540 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.38888549804688, + "epoch": 11.582089552238806, + "grad_norm": 0.3472516871736872, + "learning_rate": 4.996573836886434e-07, + "loss": -0.0008, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1541 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.19444274902344, + "epoch": 11.58955223880597, + "grad_norm": 0.0, + "learning_rate": 4.996539668793443e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1542 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.97222900390625, + "epoch": 11.597014925373134, + "grad_norm": 0.35648720987763993, + "learning_rate": 4.996505331288674e-07, + "loss": -0.0008, + "reward": 0.6666666865348816, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1543 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.22222900390625, + "epoch": 11.604477611940299, + "grad_norm": 0.870024503950432, + "learning_rate": 4.996470824374458e-07, + "loss": 0.0005, + "reward": 0.8888888955116272, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1544 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.4166717529297, + "epoch": 11.611940298507463, + "grad_norm": 0.22403939056944375, + "learning_rate": 4.996436148053137e-07, + "loss": -0.0002, + "reward": 0.3055555522441864, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 1545 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.80555725097656, + "epoch": 11.619402985074627, + "grad_norm": 1.1091422985433594, + "learning_rate": 4.996401302327062e-07, + "loss": -0.0009, + "reward": 0.4166666567325592, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 1546 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.5277862548828, + "epoch": 11.626865671641792, + "grad_norm": 0.9731206347336017, + "learning_rate": 4.996366287198601e-07, + "loss": 0.0002, + "reward": 0.5833333134651184, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1547 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.11111450195312, + "epoch": 11.634328358208956, + "grad_norm": 0.0, + "learning_rate": 4.996331102670127e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1548 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.3333282470703, + "epoch": 11.64179104477612, + "grad_norm": 1.1910981088049262, + "learning_rate": 4.99629574874403e-07, + "loss": -0.0008, + "reward": 0.694444477558136, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1549 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.44444274902344, + "epoch": 11.649253731343283, + "grad_norm": 0.5592291992443778, + "learning_rate": 4.996260225422707e-07, + "loss": -0.0006, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1550 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.0833282470703, + "epoch": 11.656716417910447, + "grad_norm": 0.0, + "learning_rate": 4.99622453270857e-07, + "loss": 0.0, + "reward": 0.3333333432674408, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 1551 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.97222900390625, + "epoch": 11.664179104477611, + "grad_norm": 0.27910834398813983, + "learning_rate": 4.996188670604039e-07, + "loss": 0.0001, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1552 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.47222900390625, + "epoch": 11.671641791044776, + "grad_norm": 0.4229309850205743, + "learning_rate": 4.996152639111551e-07, + "loss": 0.0005, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1553 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.30555725097656, + "epoch": 11.67910447761194, + "grad_norm": 0.4355249240307474, + "learning_rate": 4.996116438233547e-07, + "loss": 0.0009, + "reward": 0.6666666865348816, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1554 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.9166717529297, + "epoch": 11.686567164179104, + "grad_norm": 0.5462794459074625, + "learning_rate": 4.996080067972487e-07, + "loss": 0.0011, + "reward": 0.8055555820465088, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1555 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.11111450195312, + "epoch": 11.694029850746269, + "grad_norm": 0.833925992259627, + "learning_rate": 4.996043528330838e-07, + "loss": -0.0007, + "reward": 0.472222238779068, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1556 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.25, + "epoch": 11.701492537313433, + "grad_norm": 0.0, + "learning_rate": 4.996006819311078e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 1557 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.5277862548828, + "epoch": 11.708955223880597, + "grad_norm": 0.4542688133607969, + "learning_rate": 4.995969940915699e-07, + "loss": 0.0001, + "reward": 0.694444477558136, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1558 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.11111450195312, + "epoch": 11.716417910447761, + "grad_norm": 0.9770123127857657, + "learning_rate": 4.995932893147206e-07, + "loss": -0.0001, + "reward": 0.694444477558136, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1559 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.11111450195312, + "epoch": 11.723880597014926, + "grad_norm": 0.7753571723995827, + "learning_rate": 4.995895676008109e-07, + "loss": 0.0001, + "reward": 0.8055555820465088, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1560 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.86111450195312, + "epoch": 11.73134328358209, + "grad_norm": 0.0, + "learning_rate": 4.995858289500935e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1561 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.1666717529297, + "epoch": 11.738805970149254, + "grad_norm": 0.2368794226269717, + "learning_rate": 4.995820733628221e-07, + "loss": -0.0001, + "reward": 0.8333333134651184, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 1562 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.38888549804688, + "epoch": 11.746268656716419, + "grad_norm": 0.8671465683632468, + "learning_rate": 4.995783008392517e-07, + "loss": -0.0006, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1563 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.0, + "epoch": 11.753731343283581, + "grad_norm": 0.2984212703098894, + "learning_rate": 4.995745113796381e-07, + "loss": 0.0002, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1564 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.94444274902344, + "epoch": 11.761194029850746, + "grad_norm": 0.0, + "learning_rate": 4.995707049842385e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1565 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.80555725097656, + "epoch": 11.76865671641791, + "grad_norm": 0.0, + "learning_rate": 4.995668816533112e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1566 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.72222900390625, + "epoch": 11.776119402985074, + "grad_norm": 2.618802827461701, + "learning_rate": 4.995630413871158e-07, + "loss": 0.0003, + "reward": 0.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1567 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.55555725097656, + "epoch": 11.783582089552239, + "grad_norm": 0.578304249697782, + "learning_rate": 4.995591841859126e-07, + "loss": -0.0005, + "reward": 0.472222238779068, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1568 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.9166717529297, + "epoch": 11.791044776119403, + "grad_norm": 0.532022708352497, + "learning_rate": 4.995553100499636e-07, + "loss": 0.002, + "reward": 0.5555555820465088, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1569 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.80555725097656, + "epoch": 11.798507462686567, + "grad_norm": 0.23246461651272746, + "learning_rate": 4.995514189795316e-07, + "loss": 0.0002, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1570 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.6666717529297, + "epoch": 11.805970149253731, + "grad_norm": 0.43893735536226147, + "learning_rate": 4.995475109748805e-07, + "loss": -0.0005, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1571 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.05555725097656, + "epoch": 11.813432835820896, + "grad_norm": 0.5532950602576939, + "learning_rate": 4.995435860362757e-07, + "loss": 0.0001, + "reward": 0.75, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1572 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.6388931274414, + "epoch": 11.82089552238806, + "grad_norm": 0.610310283632902, + "learning_rate": 4.995396441639835e-07, + "loss": 0.0003, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1573 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.72222900390625, + "epoch": 11.828358208955224, + "grad_norm": 0.8140537554056148, + "learning_rate": 4.995356853582714e-07, + "loss": 0.0004, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1574 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.25, + "epoch": 11.835820895522389, + "grad_norm": 0.39900851050581176, + "learning_rate": 4.99531709619408e-07, + "loss": -0.0004, + "reward": 0.4444444477558136, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1575 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.0833282470703, + "epoch": 11.843283582089553, + "grad_norm": 1.5371171187131445, + "learning_rate": 4.995277169476629e-07, + "loss": 0.001, + "reward": 0.75, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1576 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.5833282470703, + "epoch": 11.850746268656717, + "grad_norm": 0.9738798794802496, + "learning_rate": 4.995237073433074e-07, + "loss": 0.0003, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1577 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.7777862548828, + "epoch": 11.85820895522388, + "grad_norm": 0.5720993507794343, + "learning_rate": 4.995196808066133e-07, + "loss": -0.0004, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1578 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.9166717529297, + "epoch": 11.865671641791044, + "grad_norm": 1.3170450331237809, + "learning_rate": 4.995156373378541e-07, + "loss": 0.0007, + "reward": 0.694444477558136, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1579 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.86111450195312, + "epoch": 11.873134328358208, + "grad_norm": 0.7381320926180792, + "learning_rate": 4.995115769373039e-07, + "loss": 0.0009, + "reward": 0.472222238779068, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1580 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.4166717529297, + "epoch": 11.880597014925373, + "grad_norm": 0.2904671577564293, + "learning_rate": 4.995074996052384e-07, + "loss": 0.0, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1581 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.86111450195312, + "epoch": 11.888059701492537, + "grad_norm": 0.5094456843920552, + "learning_rate": 4.995034053419344e-07, + "loss": -0.0006, + "reward": 0.6666666865348816, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1582 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.5, + "epoch": 11.895522388059701, + "grad_norm": 0.21433545427818174, + "learning_rate": 4.994992941476693e-07, + "loss": 0.0002, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1583 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.97222900390625, + "epoch": 11.902985074626866, + "grad_norm": 0.0, + "learning_rate": 4.994951660227226e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1584 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.5, + "epoch": 11.91044776119403, + "grad_norm": 0.20915973280719868, + "learning_rate": 4.994910209673741e-07, + "loss": -0.0002, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1585 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.86111450195312, + "epoch": 11.917910447761194, + "grad_norm": 0.9479186545874969, + "learning_rate": 4.994868589819052e-07, + "loss": -0.0003, + "reward": 0.5277777910232544, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1586 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.8333282470703, + "epoch": 11.925373134328359, + "grad_norm": 0.4711938881320438, + "learning_rate": 4.994826800665981e-07, + "loss": 0.0003, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 1587 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.75, + "epoch": 11.932835820895523, + "grad_norm": 0.559486103002806, + "learning_rate": 4.994784842217367e-07, + "loss": 0.0, + "reward": 0.472222238779068, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1588 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.0, + "epoch": 11.940298507462687, + "grad_norm": 0.32886421843253655, + "learning_rate": 4.994742714476057e-07, + "loss": -0.0003, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1589 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.13888549804688, + "epoch": 11.947761194029852, + "grad_norm": 0.0, + "learning_rate": 4.994700417444907e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1590 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.25, + "epoch": 11.955223880597014, + "grad_norm": 0.22407134202157544, + "learning_rate": 4.994657951126789e-07, + "loss": 0.0001, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1591 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.11111450195312, + "epoch": 11.962686567164178, + "grad_norm": 0.6860160639464953, + "learning_rate": 4.994615315524585e-07, + "loss": -0.0006, + "reward": 0.8055555820465088, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1592 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.0833282470703, + "epoch": 11.970149253731343, + "grad_norm": 0.5412769117113858, + "learning_rate": 4.994572510641188e-07, + "loss": 0.0004, + "reward": 0.5277777910232544, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1593 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.22222900390625, + "epoch": 11.977611940298507, + "grad_norm": 0.0, + "learning_rate": 4.994529536479502e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1594 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.3333282470703, + "epoch": 11.985074626865671, + "grad_norm": 0.35890134958668013, + "learning_rate": 4.994486393042444e-07, + "loss": 0.0001, + "reward": 0.944444477558136, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 0.0, + "step": 1595 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.1666717529297, + "epoch": 11.992537313432836, + "grad_norm": 0.0, + "learning_rate": 4.99444308033294e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1596 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.47222900390625, + "epoch": 12.007462686567164, + "grad_norm": 0.0, + "learning_rate": 4.994399598353933e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1597 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.9166717529297, + "epoch": 12.014925373134329, + "grad_norm": 0.4975191003054879, + "learning_rate": 4.994355947108368e-07, + "loss": 0.0006, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1598 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.72222900390625, + "epoch": 12.022388059701493, + "grad_norm": 0.0, + "learning_rate": 4.994312126599213e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1599 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.0277862548828, + "epoch": 12.029850746268657, + "grad_norm": 0.5148158441843094, + "learning_rate": 4.994268136829437e-07, + "loss": -0.0006, + "reward": 0.4166666567325592, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 1600 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.80555725097656, + "epoch": 12.037313432835822, + "grad_norm": 0.0, + "learning_rate": 4.994223977802027e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1601 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.0277862548828, + "epoch": 12.044776119402986, + "grad_norm": 0.431814896108935, + "learning_rate": 4.99417964951998e-07, + "loss": 0.0004, + "reward": 0.6666666865348816, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1602 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.38888549804688, + "epoch": 12.052238805970148, + "grad_norm": 0.0, + "learning_rate": 4.994135151986303e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1603 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.11111450195312, + "epoch": 12.059701492537313, + "grad_norm": 1.4543679748943363, + "learning_rate": 4.994090485204015e-07, + "loss": -0.0007, + "reward": 0.75, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1604 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.63888549804688, + "epoch": 12.067164179104477, + "grad_norm": 0.4240907249278325, + "learning_rate": 4.99404564917615e-07, + "loss": -0.0003, + "reward": 0.4444444477558136, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1605 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.38888549804688, + "epoch": 12.074626865671641, + "grad_norm": 0.3958538764739779, + "learning_rate": 4.994000643905747e-07, + "loss": 0.0001, + "reward": 0.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1606 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.0277862548828, + "epoch": 12.082089552238806, + "grad_norm": 0.46095904714916636, + "learning_rate": 4.993955469395862e-07, + "loss": 0.0005, + "reward": 0.75, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1607 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.5833282470703, + "epoch": 12.08955223880597, + "grad_norm": 0.0, + "learning_rate": 4.99391012564956e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1608 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.69444274902344, + "epoch": 12.097014925373134, + "grad_norm": 0.467293986563075, + "learning_rate": 4.993864612669918e-07, + "loss": 0.0001, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1609 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.08333587646484, + "epoch": 12.104477611940299, + "grad_norm": 0.5374357447787015, + "learning_rate": 4.993818930460026e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1610 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.5277862548828, + "epoch": 12.111940298507463, + "grad_norm": 0.3727088403467002, + "learning_rate": 4.99377307902298e-07, + "loss": -0.0011, + "reward": 0.6666666865348816, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1611 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.75, + "epoch": 12.119402985074627, + "grad_norm": 0.39578778980051804, + "learning_rate": 4.993727058361895e-07, + "loss": 0.0013, + "reward": 0.6666666865348816, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1612 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.38888549804688, + "epoch": 12.126865671641792, + "grad_norm": 0.3419717614843469, + "learning_rate": 4.993680868479892e-07, + "loss": -0.0005, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1613 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.88888549804688, + "epoch": 12.134328358208956, + "grad_norm": 0.5888074482010958, + "learning_rate": 4.993634509380107e-07, + "loss": -0.0001, + "reward": 0.8055555820465088, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1614 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.30555725097656, + "epoch": 12.14179104477612, + "grad_norm": 0.20973993115721867, + "learning_rate": 4.993587981065684e-07, + "loss": 0.0004, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1615 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.16666412353516, + "epoch": 12.149253731343283, + "grad_norm": 0.0, + "learning_rate": 4.993541283539781e-07, + "loss": 0.0, + "reward": 0.3333333432674408, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 1616 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.7777862548828, + "epoch": 12.156716417910447, + "grad_norm": 0.17707736884154343, + "learning_rate": 4.993494416805568e-07, + "loss": 0.0001, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1617 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.97222900390625, + "epoch": 12.164179104477611, + "grad_norm": 0.2563281559815523, + "learning_rate": 4.993447380866224e-07, + "loss": 0.0002, + "reward": 0.944444477558136, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 0.0, + "step": 1618 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.97222900390625, + "epoch": 12.171641791044776, + "grad_norm": 0.0, + "learning_rate": 4.993400175724941e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1619 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.55555725097656, + "epoch": 12.17910447761194, + "grad_norm": 0.18406399351905303, + "learning_rate": 4.993352801384923e-07, + "loss": -0.0002, + "reward": 0.8333333134651184, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 1620 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.80555725097656, + "epoch": 12.186567164179104, + "grad_norm": 0.28059923970585626, + "learning_rate": 4.993305257849383e-07, + "loss": 0.0, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1621 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.5833282470703, + "epoch": 12.194029850746269, + "grad_norm": 0.4142991761751787, + "learning_rate": 4.99325754512155e-07, + "loss": 0.0005, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1622 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.75, + "epoch": 12.201492537313433, + "grad_norm": 0.19538905175779236, + "learning_rate": 4.993209663204661e-07, + "loss": 0.0, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1623 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.3333282470703, + "epoch": 12.208955223880597, + "grad_norm": 0.22870261328179617, + "learning_rate": 4.993161612101963e-07, + "loss": 0.0002, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1624 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.44444274902344, + "epoch": 12.216417910447761, + "grad_norm": 0.48423090777549277, + "learning_rate": 4.993113391816718e-07, + "loss": 0.0012, + "reward": 0.4444444477558136, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1625 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.8333282470703, + "epoch": 12.223880597014926, + "grad_norm": 0.29190839072633706, + "learning_rate": 4.993065002352198e-07, + "loss": 0.0007, + "reward": 0.6388888955116272, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1626 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.86111450195312, + "epoch": 12.23134328358209, + "grad_norm": 0.0, + "learning_rate": 4.993016443711687e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1627 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.83333587646484, + "epoch": 12.238805970149254, + "grad_norm": 0.5676806755866792, + "learning_rate": 4.992967715898481e-07, + "loss": 0.0004, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1628 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.05555725097656, + "epoch": 12.246268656716419, + "grad_norm": 0.8674059909609139, + "learning_rate": 4.992918818915885e-07, + "loss": 0.0036, + "reward": 0.5833333134651184, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1629 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.36111450195312, + "epoch": 12.253731343283581, + "grad_norm": 0.43168489896185636, + "learning_rate": 4.992869752767218e-07, + "loss": -0.0003, + "reward": 0.25, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.0, + "step": 1630 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.80555725097656, + "epoch": 12.261194029850746, + "grad_norm": 0.0, + "learning_rate": 4.992820517455809e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1631 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.55555725097656, + "epoch": 12.26865671641791, + "grad_norm": 0.5329743569738762, + "learning_rate": 4.992771112984998e-07, + "loss": -0.0, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1632 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.91666412353516, + "epoch": 12.276119402985074, + "grad_norm": 0.6339863258632502, + "learning_rate": 4.992721539358141e-07, + "loss": 0.0001, + "reward": 0.8611111044883728, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1633 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.5277862548828, + "epoch": 12.283582089552239, + "grad_norm": 0.1733420344162799, + "learning_rate": 4.992671796578599e-07, + "loss": -0.0006, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1634 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.5, + "epoch": 12.291044776119403, + "grad_norm": 0.6261745868199377, + "learning_rate": 4.992621884649748e-07, + "loss": 0.0005, + "reward": 0.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1635 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.2777862548828, + "epoch": 12.298507462686567, + "grad_norm": 4.277850490308942, + "learning_rate": 4.992571803574975e-07, + "loss": 0.0007, + "reward": 0.8333333134651184, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 1636 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.97222900390625, + "epoch": 12.305970149253731, + "grad_norm": 0.3664843172919929, + "learning_rate": 4.992521553357679e-07, + "loss": 0.0001, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1637 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.13888549804688, + "epoch": 12.313432835820896, + "grad_norm": 0.46161212340613056, + "learning_rate": 4.992471134001271e-07, + "loss": 0.0007, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1638 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.6666717529297, + "epoch": 12.32089552238806, + "grad_norm": 0.4374210352695086, + "learning_rate": 4.992420545509169e-07, + "loss": -0.0003, + "reward": 0.6111111044883728, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1639 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.25, + "epoch": 12.328358208955224, + "grad_norm": 0.0, + "learning_rate": 4.992369787884809e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1640 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.1666717529297, + "epoch": 12.335820895522389, + "grad_norm": 0.0, + "learning_rate": 4.992318861131634e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1641 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.86111450195312, + "epoch": 12.343283582089553, + "grad_norm": 0.8083415149759965, + "learning_rate": 4.992267765253099e-07, + "loss": 0.0005, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1642 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.19444274902344, + "epoch": 12.350746268656717, + "grad_norm": 0.6234323085680842, + "learning_rate": 4.992216500252673e-07, + "loss": 0.0005, + "reward": 0.8611111044883728, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1643 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.4166717529297, + "epoch": 12.35820895522388, + "grad_norm": 0.38201063782823874, + "learning_rate": 4.992165066133834e-07, + "loss": 0.0002, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1644 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.5, + "epoch": 12.365671641791044, + "grad_norm": 0.0, + "learning_rate": 4.992113462900072e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1645 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.30555725097656, + "epoch": 12.373134328358208, + "grad_norm": 0.35950903464230316, + "learning_rate": 4.992061690554891e-07, + "loss": -0.0005, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1646 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.05555725097656, + "epoch": 12.380597014925373, + "grad_norm": 0.5852613465698367, + "learning_rate": 4.992009749101801e-07, + "loss": -0.0003, + "reward": 0.7777777910232544, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1647 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.13888549804688, + "epoch": 12.388059701492537, + "grad_norm": 0.17191733206771376, + "learning_rate": 4.991957638544327e-07, + "loss": 0.0001, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1648 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.13888549804688, + "epoch": 12.395522388059701, + "grad_norm": 0.0, + "learning_rate": 4.991905358886008e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1649 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.5277862548828, + "epoch": 12.402985074626866, + "grad_norm": 0.5720015430177237, + "learning_rate": 4.991852910130387e-07, + "loss": 0.0009, + "reward": 0.8055555820465088, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1650 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.4166717529297, + "epoch": 12.41044776119403, + "grad_norm": 0.48058836515808806, + "learning_rate": 4.991800292281028e-07, + "loss": -0.0014, + "reward": 0.7222222089767456, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1651 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.9166717529297, + "epoch": 12.417910447761194, + "grad_norm": 0.38343771321254716, + "learning_rate": 4.991747505341499e-07, + "loss": 0.0001, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1652 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.72222900390625, + "epoch": 12.425373134328359, + "grad_norm": 0.8886048692892549, + "learning_rate": 4.991694549315382e-07, + "loss": -0.0001, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1653 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.6666717529297, + "epoch": 12.432835820895523, + "grad_norm": 0.0, + "learning_rate": 4.991641424206271e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1654 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.5277862548828, + "epoch": 12.440298507462687, + "grad_norm": 0.8413238213415378, + "learning_rate": 4.991588130017772e-07, + "loss": 0.0001, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1655 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.5, + "epoch": 12.447761194029852, + "grad_norm": 0.4004383501670445, + "learning_rate": 4.9915346667535e-07, + "loss": 0.0009, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1656 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.19444274902344, + "epoch": 12.455223880597014, + "grad_norm": 0.38853633534343573, + "learning_rate": 4.991481034417082e-07, + "loss": -0.0007, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1657 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.80555725097656, + "epoch": 12.462686567164178, + "grad_norm": 0.45318651094269885, + "learning_rate": 4.99142723301216e-07, + "loss": 0.0006, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1658 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.69444274902344, + "epoch": 12.470149253731343, + "grad_norm": 0.0, + "learning_rate": 4.991373262542384e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 1659 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.36111450195312, + "epoch": 12.477611940298507, + "grad_norm": 0.9570049402372474, + "learning_rate": 4.991319123011414e-07, + "loss": 0.0002, + "reward": 0.8611111044883728, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1660 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.8333282470703, + "epoch": 12.485074626865671, + "grad_norm": 0.2417139040607155, + "learning_rate": 4.991264814422929e-07, + "loss": 0.0001, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1661 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.3333282470703, + "epoch": 12.492537313432836, + "grad_norm": 0.5924451928025202, + "learning_rate": 4.991210336780609e-07, + "loss": -0.0012, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1662 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.72222900390625, + "epoch": 12.5, + "grad_norm": 0.372178572980646, + "learning_rate": 4.991155690088153e-07, + "loss": -0.0003, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1663 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.6666717529297, + "epoch": 12.507462686567164, + "grad_norm": 0.18931034220346332, + "learning_rate": 4.99110087434927e-07, + "loss": -0.0004, + "reward": 0.472222238779068, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1664 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.72222900390625, + "epoch": 12.514925373134329, + "grad_norm": 0.6742224190105343, + "learning_rate": 4.991045889567679e-07, + "loss": 0.001, + "reward": 0.75, + "reward_std": 0.18385560810565948, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1665 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.11111450195312, + "epoch": 12.522388059701493, + "grad_norm": 4.3938782810387655, + "learning_rate": 4.990990735747112e-07, + "loss": 0.0002, + "reward": 0.694444477558136, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1666 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.55555725097656, + "epoch": 12.529850746268657, + "grad_norm": 0.0, + "learning_rate": 4.990935412891309e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1667 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.11111450195312, + "epoch": 12.537313432835822, + "grad_norm": 0.19963922051133384, + "learning_rate": 4.990879921004026e-07, + "loss": -0.0001, + "reward": 0.944444477558136, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 0.0, + "step": 1668 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.80555725097656, + "epoch": 12.544776119402986, + "grad_norm": 0.41864157447674394, + "learning_rate": 4.99082426008903e-07, + "loss": -0.0007, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1669 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.61111450195312, + "epoch": 12.552238805970148, + "grad_norm": 0.6376690850811023, + "learning_rate": 4.990768430150095e-07, + "loss": 0.0003, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1670 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.8333282470703, + "epoch": 12.559701492537313, + "grad_norm": 0.0, + "learning_rate": 4.990712431191012e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1671 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.2777862548828, + "epoch": 12.567164179104477, + "grad_norm": 0.5793710595437543, + "learning_rate": 4.99065626321558e-07, + "loss": 0.0005, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1672 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.0833282470703, + "epoch": 12.574626865671641, + "grad_norm": 0.5271336628717449, + "learning_rate": 4.990599926227611e-07, + "loss": 0.0001, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1673 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.9166717529297, + "epoch": 12.582089552238806, + "grad_norm": 0.399076936939573, + "learning_rate": 4.990543420230928e-07, + "loss": -0.0006, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1674 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.80555725097656, + "epoch": 12.58955223880597, + "grad_norm": 1.1523249412341652, + "learning_rate": 4.990486745229364e-07, + "loss": -0.0003, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1675 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.55555725097656, + "epoch": 12.597014925373134, + "grad_norm": 0.5762626195793684, + "learning_rate": 4.990429901226766e-07, + "loss": -0.0003, + "reward": 0.472222238779068, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1676 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.30555725097656, + "epoch": 12.604477611940299, + "grad_norm": 0.0, + "learning_rate": 4.990372888226991e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1677 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.83333587646484, + "epoch": 12.611940298507463, + "grad_norm": 1.6896183866687962, + "learning_rate": 4.990315706233908e-07, + "loss": -0.0005, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1678 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.0, + "epoch": 12.619402985074627, + "grad_norm": 0.4836292047244698, + "learning_rate": 4.990258355251399e-07, + "loss": 0.0002, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1679 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.8888931274414, + "epoch": 12.626865671641792, + "grad_norm": 0.5405888779035337, + "learning_rate": 4.990200835283352e-07, + "loss": 0.0001, + "reward": 0.4166666567325592, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 1680 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.88888549804688, + "epoch": 12.634328358208956, + "grad_norm": 0.36150220654357934, + "learning_rate": 4.990143146333675e-07, + "loss": -0.0004, + "reward": 0.8333333134651184, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 1681 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.94444274902344, + "epoch": 12.64179104477612, + "grad_norm": 0.48479814109868014, + "learning_rate": 4.990085288406278e-07, + "loss": -0.0007, + "reward": 0.8055555820465088, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1682 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.88888549804688, + "epoch": 12.649253731343283, + "grad_norm": 0.17787659569019082, + "learning_rate": 4.99002726150509e-07, + "loss": 0.0, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1683 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.94444274902344, + "epoch": 12.656716417910447, + "grad_norm": 0.0, + "learning_rate": 4.989969065634047e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1684 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.75, + "epoch": 12.664179104477611, + "grad_norm": 0.5106691128458634, + "learning_rate": 4.989910700797099e-07, + "loss": -0.0003, + "reward": 0.75, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1685 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.22222900390625, + "epoch": 12.671641791044776, + "grad_norm": 0.25798653362613433, + "learning_rate": 4.989852166998207e-07, + "loss": -0.0003, + "reward": 0.472222238779068, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1686 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.86111450195312, + "epoch": 12.67910447761194, + "grad_norm": 0.0, + "learning_rate": 4.989793464241343e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1687 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.25, + "epoch": 12.686567164179104, + "grad_norm": 0.6591641891279629, + "learning_rate": 4.98973459253049e-07, + "loss": -0.0003, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1688 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.69444274902344, + "epoch": 12.694029850746269, + "grad_norm": 0.7899256919862518, + "learning_rate": 4.989675551869643e-07, + "loss": 0.0008, + "reward": 0.694444477558136, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1689 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.94444274902344, + "epoch": 12.701492537313433, + "grad_norm": 2.5414922347748963, + "learning_rate": 4.989616342262807e-07, + "loss": -0.001, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1690 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.8333282470703, + "epoch": 12.708955223880597, + "grad_norm": 0.31854771752020267, + "learning_rate": 4.989556963714003e-07, + "loss": 0.0006, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1691 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.1666717529297, + "epoch": 12.716417910447761, + "grad_norm": 0.5406582593744492, + "learning_rate": 4.989497416227256e-07, + "loss": 0.0002, + "reward": 0.4444444477558136, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1692 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.1666717529297, + "epoch": 12.723880597014926, + "grad_norm": 0.0, + "learning_rate": 4.989437699806611e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1693 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.38888549804688, + "epoch": 12.73134328358209, + "grad_norm": 0.3893172824995502, + "learning_rate": 4.989377814456119e-07, + "loss": -0.0002, + "reward": 0.8055555820465088, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1694 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.0277862548828, + "epoch": 12.738805970149254, + "grad_norm": 0.5475114025257878, + "learning_rate": 4.989317760179844e-07, + "loss": 0.0005, + "reward": 0.75, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1695 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.97222900390625, + "epoch": 12.746268656716419, + "grad_norm": 0.0, + "learning_rate": 4.98925753698186e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1696 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.13888549804688, + "epoch": 12.753731343283581, + "grad_norm": 0.3378845658552358, + "learning_rate": 4.989197144866254e-07, + "loss": -0.0004, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1697 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.5277862548828, + "epoch": 12.761194029850746, + "grad_norm": 0.4310855721208014, + "learning_rate": 4.989136583837125e-07, + "loss": -0.0007, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1698 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.63888549804688, + "epoch": 12.76865671641791, + "grad_norm": 0.2705551169916684, + "learning_rate": 4.989075853898581e-07, + "loss": 0.0, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1699 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.3333282470703, + "epoch": 12.776119402985074, + "grad_norm": 0.23892799544770174, + "learning_rate": 4.989014955054745e-07, + "loss": 0.0001, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1700 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.13888549804688, + "epoch": 12.783582089552239, + "grad_norm": 0.6375852959318107, + "learning_rate": 4.988953887309748e-07, + "loss": 0.0004, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1701 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.47222900390625, + "epoch": 12.791044776119403, + "grad_norm": 0.0, + "learning_rate": 4.988892650667735e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1702 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.5277862548828, + "epoch": 12.798507462686567, + "grad_norm": 0.0, + "learning_rate": 4.988831245132861e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1703 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.55555725097656, + "epoch": 12.805970149253731, + "grad_norm": 2.2260340975596424, + "learning_rate": 4.988769670709293e-07, + "loss": -0.0007, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1704 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.11111450195312, + "epoch": 12.813432835820896, + "grad_norm": 0.48915391467795116, + "learning_rate": 4.988707927401209e-07, + "loss": 0.0, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1705 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.61111450195312, + "epoch": 12.82089552238806, + "grad_norm": 0.35196119161724865, + "learning_rate": 4.988646015212799e-07, + "loss": 0.0003, + "reward": 0.5, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1706 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.63888549804688, + "epoch": 12.828358208955224, + "grad_norm": 0.12979603675802642, + "learning_rate": 4.988583934148264e-07, + "loss": 0.0003, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1707 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.44444274902344, + "epoch": 12.835820895522389, + "grad_norm": 2.7593830411449383, + "learning_rate": 4.988521684211818e-07, + "loss": -0.0003, + "reward": 0.8333333134651184, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 1708 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.5, + "epoch": 12.843283582089553, + "grad_norm": 0.0, + "learning_rate": 4.988459265407683e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1709 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.13888549804688, + "epoch": 12.850746268656717, + "grad_norm": 3.5236594625892836, + "learning_rate": 4.988396677740097e-07, + "loss": -0.0006, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1710 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.47222137451172, + "epoch": 12.85820895522388, + "grad_norm": 0.5772504304233204, + "learning_rate": 4.988333921213306e-07, + "loss": 0.0008, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1711 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.86111450195312, + "epoch": 12.865671641791044, + "grad_norm": 0.43042545196620646, + "learning_rate": 4.988270995831567e-07, + "loss": -0.0003, + "reward": 0.8611111044883728, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1712 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.63888549804688, + "epoch": 12.873134328358208, + "grad_norm": 0.0, + "learning_rate": 4.988207901599154e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1713 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.9166717529297, + "epoch": 12.880597014925373, + "grad_norm": 0.15748561042115097, + "learning_rate": 4.988144638520345e-07, + "loss": 0.0004, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1714 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.19444274902344, + "epoch": 12.888059701492537, + "grad_norm": 0.8497358007382485, + "learning_rate": 4.988081206599434e-07, + "loss": -0.0011, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1715 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.47222900390625, + "epoch": 12.895522388059701, + "grad_norm": 0.0, + "learning_rate": 4.988017605840725e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1716 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.19444274902344, + "epoch": 12.902985074626866, + "grad_norm": 1.1715862603801919, + "learning_rate": 4.987953836248535e-07, + "loss": 0.0006, + "reward": 0.7777777910232544, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1717 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.86111450195312, + "epoch": 12.91044776119403, + "grad_norm": 0.0, + "learning_rate": 4.987889897827191e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1718 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.44444274902344, + "epoch": 12.917910447761194, + "grad_norm": 0.8304184190706534, + "learning_rate": 4.987825790581031e-07, + "loss": -0.0, + "reward": 0.7222222089767456, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1719 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.94444274902344, + "epoch": 12.925373134328359, + "grad_norm": 0.8864707640793072, + "learning_rate": 4.987761514514405e-07, + "loss": -0.0011, + "reward": 0.8055555820465088, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1720 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.19444274902344, + "epoch": 12.932835820895523, + "grad_norm": 0.4481540540782714, + "learning_rate": 4.987697069631676e-07, + "loss": 0.0008, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1721 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.6666717529297, + "epoch": 12.940298507462687, + "grad_norm": 0.0, + "learning_rate": 4.987632455937217e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1722 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.0833282470703, + "epoch": 12.947761194029852, + "grad_norm": 0.0, + "learning_rate": 4.987567673435411e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1723 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.47222900390625, + "epoch": 12.955223880597014, + "grad_norm": 0.7277193292026107, + "learning_rate": 4.987502722130655e-07, + "loss": -0.0001, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1724 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.2777862548828, + "epoch": 12.962686567164178, + "grad_norm": 0.26905965125034376, + "learning_rate": 4.987437602027358e-07, + "loss": 0.0003, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1725 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.0277862548828, + "epoch": 12.970149253731343, + "grad_norm": 0.7650265712503038, + "learning_rate": 4.987372313129938e-07, + "loss": 0.0003, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1726 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.5, + "epoch": 12.977611940298507, + "grad_norm": 1.0328283158396718, + "learning_rate": 4.987306855442823e-07, + "loss": 0.0001, + "reward": 0.6388888955116272, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1727 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.7777862548828, + "epoch": 12.985074626865671, + "grad_norm": 3.9844903953153623, + "learning_rate": 4.987241228970458e-07, + "loss": 0.0004, + "reward": 0.8055555820465088, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1728 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.4166717529297, + "epoch": 12.992537313432836, + "grad_norm": 0.26255869356787515, + "learning_rate": 4.987175433717294e-07, + "loss": 0.0002, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1729 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.69444274902344, + "epoch": 13.007462686567164, + "grad_norm": 0.5552214837356789, + "learning_rate": 4.987109469687799e-07, + "loss": -0.0001, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1730 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.72222900390625, + "epoch": 13.014925373134329, + "grad_norm": 0.0, + "learning_rate": 4.987043336886446e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 1731 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.97222900390625, + "epoch": 13.022388059701493, + "grad_norm": 0.3271921384140929, + "learning_rate": 4.986977035317724e-07, + "loss": 0.0001, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1732 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.38888549804688, + "epoch": 13.029850746268657, + "grad_norm": 0.0, + "learning_rate": 4.986910564986132e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1733 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.13888549804688, + "epoch": 13.037313432835822, + "grad_norm": 0.4156381695733007, + "learning_rate": 4.986843925896182e-07, + "loss": -0.0005, + "reward": 0.6666666865348816, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1734 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.36111450195312, + "epoch": 13.044776119402986, + "grad_norm": 0.5725436577068681, + "learning_rate": 4.986777118052393e-07, + "loss": 0.0008, + "reward": 0.6666666865348816, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1735 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.38888549804688, + "epoch": 13.052238805970148, + "grad_norm": 0.495654908466086, + "learning_rate": 4.986710141459301e-07, + "loss": 0.0011, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1736 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.38888549804688, + "epoch": 13.059701492537313, + "grad_norm": 0.0, + "learning_rate": 4.986642996121449e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1737 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.05555725097656, + "epoch": 13.067164179104477, + "grad_norm": 0.0, + "learning_rate": 4.986575682043395e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1738 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.0833282470703, + "epoch": 13.074626865671641, + "grad_norm": 0.262942054043385, + "learning_rate": 4.986508199229706e-07, + "loss": 0.0002, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1739 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.86111450195312, + "epoch": 13.082089552238806, + "grad_norm": 0.2033792682899578, + "learning_rate": 4.986440547684963e-07, + "loss": -0.0, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1740 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.8333282470703, + "epoch": 13.08955223880597, + "grad_norm": 0.0, + "learning_rate": 4.986372727413754e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1741 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.36111450195312, + "epoch": 13.097014925373134, + "grad_norm": 0.9536060216095328, + "learning_rate": 4.986304738420683e-07, + "loss": -0.0001, + "reward": 0.694444477558136, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1742 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.5833282470703, + "epoch": 13.104477611940299, + "grad_norm": 0.4580885218318285, + "learning_rate": 4.986236580710363e-07, + "loss": 0.0, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1743 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.75, + "epoch": 13.111940298507463, + "grad_norm": 0.0, + "learning_rate": 4.98616825428742e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1744 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.25, + "epoch": 13.119402985074627, + "grad_norm": 0.36915924830903674, + "learning_rate": 4.986099759156489e-07, + "loss": -0.0004, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 1745 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.1666717529297, + "epoch": 13.126865671641792, + "grad_norm": 0.44300815943609256, + "learning_rate": 4.986031095322219e-07, + "loss": -0.0004, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1746 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.38888549804688, + "epoch": 13.134328358208956, + "grad_norm": 0.3435140517628868, + "learning_rate": 4.985962262789269e-07, + "loss": 0.0, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1747 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.8333282470703, + "epoch": 13.14179104477612, + "grad_norm": 0.41625420421364406, + "learning_rate": 4.985893261562311e-07, + "loss": 0.0012, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1748 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.38888549804688, + "epoch": 13.149253731343283, + "grad_norm": 0.0, + "learning_rate": 4.985824091646025e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1749 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.2777862548828, + "epoch": 13.156716417910447, + "grad_norm": 0.6431109533092391, + "learning_rate": 4.985754753045107e-07, + "loss": 0.0003, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1750 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.86111450195312, + "epoch": 13.164179104477611, + "grad_norm": 0.9693536373568775, + "learning_rate": 4.985685245764262e-07, + "loss": -0.0, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1751 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.30555725097656, + "epoch": 13.171641791044776, + "grad_norm": 3.0781095996224086, + "learning_rate": 4.985615569808206e-07, + "loss": 0.0015, + "reward": 0.4444444477558136, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1752 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.2777862548828, + "epoch": 13.17910447761194, + "grad_norm": 0.5206028367511477, + "learning_rate": 4.985545725181667e-07, + "loss": 0.0011, + "reward": 0.3611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 1753 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.3333282470703, + "epoch": 13.186567164179104, + "grad_norm": 0.6515860710422121, + "learning_rate": 4.985475711889384e-07, + "loss": 0.0002, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1754 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.9166717529297, + "epoch": 13.194029850746269, + "grad_norm": 0.0, + "learning_rate": 4.985405529936109e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1755 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.5277862548828, + "epoch": 13.201492537313433, + "grad_norm": 0.6862434092866461, + "learning_rate": 4.985335179326606e-07, + "loss": 0.0007, + "reward": 0.5833333134651184, + "reward_std": 0.2949667274951935, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1756 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.47222900390625, + "epoch": 13.208955223880597, + "grad_norm": 0.5534580550285414, + "learning_rate": 4.985264660065646e-07, + "loss": 0.0015, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1757 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.44444274902344, + "epoch": 13.216417910447761, + "grad_norm": 0.7348804269931942, + "learning_rate": 4.985193972158014e-07, + "loss": 0.0001, + "reward": 0.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1758 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.30555725097656, + "epoch": 13.223880597014926, + "grad_norm": 0.4951997902387675, + "learning_rate": 4.98512311560851e-07, + "loss": -0.0001, + "reward": 0.3055555522441864, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 1759 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.88888549804688, + "epoch": 13.23134328358209, + "grad_norm": 0.4810622937835525, + "learning_rate": 4.985052090421939e-07, + "loss": 0.0007, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1760 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.77777862548828, + "epoch": 13.238805970149254, + "grad_norm": 0.0, + "learning_rate": 4.984980896603124e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1761 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.69444274902344, + "epoch": 13.246268656716419, + "grad_norm": 0.548228752718674, + "learning_rate": 4.984909534156893e-07, + "loss": 0.0001, + "reward": 0.75, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1762 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.88888549804688, + "epoch": 13.253731343283581, + "grad_norm": 0.0, + "learning_rate": 4.984838003088091e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1763 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.19444274902344, + "epoch": 13.261194029850746, + "grad_norm": 1.314784202742423, + "learning_rate": 4.98476630340157e-07, + "loss": -0.0002, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1764 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.44444274902344, + "epoch": 13.26865671641791, + "grad_norm": 3.700293063344358, + "learning_rate": 4.984694435102197e-07, + "loss": 0.0003, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1765 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.1666717529297, + "epoch": 13.276119402985074, + "grad_norm": 0.30517455806795935, + "learning_rate": 4.984622398194847e-07, + "loss": -0.0002, + "reward": 0.4166666567325592, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 1766 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.22222900390625, + "epoch": 13.283582089552239, + "grad_norm": 0.490906441530784, + "learning_rate": 4.98455019268441e-07, + "loss": 0.0004, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1767 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.94444274902344, + "epoch": 13.291044776119403, + "grad_norm": 0.26850124978698003, + "learning_rate": 4.984477818575786e-07, + "loss": -0.0001, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1768 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.0833282470703, + "epoch": 13.298507462686567, + "grad_norm": 0.0, + "learning_rate": 4.984405275873885e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1769 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.72222900390625, + "epoch": 13.305970149253731, + "grad_norm": 0.45708442115994696, + "learning_rate": 4.98433256458363e-07, + "loss": 0.0005, + "reward": 0.8333333134651184, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 1770 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.25, + "epoch": 13.313432835820896, + "grad_norm": 0.0, + "learning_rate": 4.984259684709955e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1771 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.19444274902344, + "epoch": 13.32089552238806, + "grad_norm": 1.8842388000021266, + "learning_rate": 4.984186636257805e-07, + "loss": -0.0018, + "reward": 0.5277777910232544, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1772 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.80555725097656, + "epoch": 13.328358208955224, + "grad_norm": 0.3077457827596787, + "learning_rate": 4.98411341923214e-07, + "loss": -0.0005, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1773 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.47222900390625, + "epoch": 13.335820895522389, + "grad_norm": 0.40882801094154536, + "learning_rate": 4.984040033637924e-07, + "loss": -0.0002, + "reward": 0.75, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1774 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.6666717529297, + "epoch": 13.343283582089553, + "grad_norm": 0.5788136700529715, + "learning_rate": 4.98396647948014e-07, + "loss": -0.0002, + "reward": 0.6666666865348816, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1775 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.69444274902344, + "epoch": 13.350746268656717, + "grad_norm": 0.8666108171468522, + "learning_rate": 4.983892756763777e-07, + "loss": 0.0016, + "reward": 0.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1776 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.86111450195312, + "epoch": 13.35820895522388, + "grad_norm": 0.0, + "learning_rate": 4.983818865493841e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1777 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.36111450195312, + "epoch": 13.365671641791044, + "grad_norm": 0.22057510739506844, + "learning_rate": 4.983744805675342e-07, + "loss": 0.0002, + "reward": 0.8333333134651184, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 1778 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.69444274902344, + "epoch": 13.373134328358208, + "grad_norm": 0.0, + "learning_rate": 4.983670577313309e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1779 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.38888549804688, + "epoch": 13.380597014925373, + "grad_norm": 0.0, + "learning_rate": 4.983596180412778e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1780 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.5277862548828, + "epoch": 13.388059701492537, + "grad_norm": 0.0, + "learning_rate": 4.983521614978797e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1781 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.02777862548828, + "epoch": 13.395522388059701, + "grad_norm": 0.4664060496759037, + "learning_rate": 4.983446881016425e-07, + "loss": 0.0003, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1782 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.88888549804688, + "epoch": 13.402985074626866, + "grad_norm": 0.24819569558831123, + "learning_rate": 4.983371978530736e-07, + "loss": -0.0003, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1783 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.94444274902344, + "epoch": 13.41044776119403, + "grad_norm": 0.0, + "learning_rate": 4.983296907526811e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1784 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.36111450195312, + "epoch": 13.417910447761194, + "grad_norm": 0.34957456343428495, + "learning_rate": 4.983221668009744e-07, + "loss": 0.0, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1785 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.44444274902344, + "epoch": 13.425373134328359, + "grad_norm": 0.587329702116452, + "learning_rate": 4.983146259984641e-07, + "loss": 0.0002, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1786 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.94444274902344, + "epoch": 13.432835820895523, + "grad_norm": 0.27865372207252215, + "learning_rate": 4.98307068345662e-07, + "loss": 0.0002, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1787 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.25, + "epoch": 13.440298507462687, + "grad_norm": 0.39348014747497523, + "learning_rate": 4.982994938430808e-07, + "loss": -0.0002, + "reward": 0.472222238779068, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1788 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.30555725097656, + "epoch": 13.447761194029852, + "grad_norm": 0.5092204145048369, + "learning_rate": 4.982919024912347e-07, + "loss": -0.0002, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1789 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.19444274902344, + "epoch": 13.455223880597014, + "grad_norm": 0.0, + "learning_rate": 4.982842942906385e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1790 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.19444274902344, + "epoch": 13.462686567164178, + "grad_norm": 0.0, + "learning_rate": 4.982766692418088e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1791 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.0833282470703, + "epoch": 13.470149253731343, + "grad_norm": 0.9534256034967428, + "learning_rate": 4.98269027345263e-07, + "loss": 0.0001, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1792 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.0, + "epoch": 13.477611940298507, + "grad_norm": 0.5411273810629683, + "learning_rate": 4.982613686015195e-07, + "loss": 0.0004, + "reward": 0.6388888955116272, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1793 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.88888549804688, + "epoch": 13.485074626865671, + "grad_norm": 0.33737671147607406, + "learning_rate": 4.982536930110979e-07, + "loss": -0.0004, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1794 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.44444274902344, + "epoch": 13.492537313432836, + "grad_norm": 0.7472539313374394, + "learning_rate": 4.982460005745196e-07, + "loss": 0.0005, + "reward": 0.6666666865348816, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1795 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.30555725097656, + "epoch": 13.5, + "grad_norm": 0.2149043790435115, + "learning_rate": 4.982382912923061e-07, + "loss": 0.0005, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1796 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.97222900390625, + "epoch": 13.507462686567164, + "grad_norm": 0.35047023091801044, + "learning_rate": 4.982305651649806e-07, + "loss": 0.0003, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1797 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.47222900390625, + "epoch": 13.514925373134329, + "grad_norm": 0.3818576931397794, + "learning_rate": 4.982228221930677e-07, + "loss": -0.0005, + "reward": 0.5, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1798 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.5, + "epoch": 13.522388059701493, + "grad_norm": 0.0, + "learning_rate": 4.982150623770923e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1799 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.22222900390625, + "epoch": 13.529850746268657, + "grad_norm": 0.0, + "learning_rate": 4.982072857175815e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1800 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.75, + "epoch": 13.537313432835822, + "grad_norm": 0.44113746058513503, + "learning_rate": 4.981994922150627e-07, + "loss": -0.0002, + "reward": 0.6666666865348816, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1801 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.11111450195312, + "epoch": 13.544776119402986, + "grad_norm": 0.0, + "learning_rate": 4.981916818700649e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1802 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.5, + "epoch": 13.552238805970148, + "grad_norm": 0.42727948072059224, + "learning_rate": 4.98183854683118e-07, + "loss": 0.0002, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1803 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.69444274902344, + "epoch": 13.559701492537313, + "grad_norm": 0.7041370717293804, + "learning_rate": 4.981760106547532e-07, + "loss": 0.0001, + "reward": 0.5, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1804 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.13888549804688, + "epoch": 13.567164179104477, + "grad_norm": 1.6470007484075637, + "learning_rate": 4.981681497855028e-07, + "loss": -0.0001, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1805 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.25, + "epoch": 13.574626865671641, + "grad_norm": 0.0, + "learning_rate": 4.981602720759001e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1806 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.13888549804688, + "epoch": 13.582089552238806, + "grad_norm": 0.5090519484952412, + "learning_rate": 4.981523775264799e-07, + "loss": 0.0001, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1807 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.22222900390625, + "epoch": 13.58955223880597, + "grad_norm": 0.0, + "learning_rate": 4.981444661377777e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1808 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.47222900390625, + "epoch": 13.597014925373134, + "grad_norm": 0.0, + "learning_rate": 4.981365379103305e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1809 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.5, + "epoch": 13.604477611940299, + "grad_norm": 0.7202046889004133, + "learning_rate": 4.981285928446762e-07, + "loss": 0.0005, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1810 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.19444274902344, + "epoch": 13.611940298507463, + "grad_norm": 0.0, + "learning_rate": 4.981206309413539e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1811 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.6666717529297, + "epoch": 13.619402985074627, + "grad_norm": 0.6248177798546601, + "learning_rate": 4.981126522009041e-07, + "loss": 0.0003, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1812 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.36111450195312, + "epoch": 13.626865671641792, + "grad_norm": 0.6199296618564978, + "learning_rate": 4.98104656623868e-07, + "loss": -0.001, + "reward": 0.944444477558136, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 0.0, + "step": 1813 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.2777862548828, + "epoch": 13.634328358208956, + "grad_norm": 0.0, + "learning_rate": 4.980966442107883e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 1814 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.30555725097656, + "epoch": 13.64179104477612, + "grad_norm": 1.1181000280023135, + "learning_rate": 4.980886149622087e-07, + "loss": 0.0007, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1815 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.72222900390625, + "epoch": 13.649253731343283, + "grad_norm": 0.0, + "learning_rate": 4.980805688786739e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1816 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.7777862548828, + "epoch": 13.656716417910447, + "grad_norm": 0.0, + "learning_rate": 4.980725059607301e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1817 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.13888549804688, + "epoch": 13.664179104477611, + "grad_norm": 0.0, + "learning_rate": 4.980644262089243e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 1818 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.19444274902344, + "epoch": 13.671641791044776, + "grad_norm": 0.5849500895546195, + "learning_rate": 4.98056329623805e-07, + "loss": 0.0013, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1819 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.86111450195312, + "epoch": 13.67910447761194, + "grad_norm": 0.27721068295541873, + "learning_rate": 4.980482162059213e-07, + "loss": 0.0002, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1820 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.8333282470703, + "epoch": 13.686567164179104, + "grad_norm": 0.6522282227883209, + "learning_rate": 4.980400859558239e-07, + "loss": 0.0002, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1821 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.55555725097656, + "epoch": 13.694029850746269, + "grad_norm": 0.9275591451421797, + "learning_rate": 4.980319388740647e-07, + "loss": 0.0009, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1822 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.69444274902344, + "epoch": 13.701492537313433, + "grad_norm": 0.0, + "learning_rate": 4.980237749611962e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 1823 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.5277862548828, + "epoch": 13.708955223880597, + "grad_norm": 0.31067158745255063, + "learning_rate": 4.980155942177728e-07, + "loss": 0.0003, + "reward": 0.5, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1824 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.22222900390625, + "epoch": 13.716417910447761, + "grad_norm": 0.0, + "learning_rate": 4.980073966443493e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1825 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.22222900390625, + "epoch": 13.723880597014926, + "grad_norm": 0.48789054519768205, + "learning_rate": 4.97999182241482e-07, + "loss": 0.0003, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1826 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.0833282470703, + "epoch": 13.73134328358209, + "grad_norm": 0.0, + "learning_rate": 4.979909510097285e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1827 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.0, + "epoch": 13.738805970149254, + "grad_norm": 0.23033548420675704, + "learning_rate": 4.979827029496473e-07, + "loss": -0.0002, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1828 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.0, + "epoch": 13.746268656716419, + "grad_norm": 0.1704395810849778, + "learning_rate": 4.97974438061798e-07, + "loss": -0.0003, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1829 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.11111450195312, + "epoch": 13.753731343283581, + "grad_norm": 0.0, + "learning_rate": 4.979661563467415e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1830 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.0833282470703, + "epoch": 13.761194029850746, + "grad_norm": 5.456244177950386, + "learning_rate": 4.979578578050399e-07, + "loss": -0.0003, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 1831 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.11111450195312, + "epoch": 13.76865671641791, + "grad_norm": 0.0, + "learning_rate": 4.97949542437256e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1832 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.9166717529297, + "epoch": 13.776119402985074, + "grad_norm": 1.3545395134002989, + "learning_rate": 4.979412102439545e-07, + "loss": -0.0006, + "reward": 0.5833333134651184, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1833 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.94444274902344, + "epoch": 13.783582089552239, + "grad_norm": 0.0, + "learning_rate": 4.979328612257005e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1834 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.97222900390625, + "epoch": 13.791044776119403, + "grad_norm": 0.6205270765438848, + "learning_rate": 4.979244953830609e-07, + "loss": -0.0001, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1835 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.97222900390625, + "epoch": 13.798507462686567, + "grad_norm": 0.0, + "learning_rate": 4.979161127166028e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1836 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.44444274902344, + "epoch": 13.805970149253731, + "grad_norm": 0.4246829055404894, + "learning_rate": 4.979077132268956e-07, + "loss": -0.0002, + "reward": 0.3055555522441864, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.3055555522441864, + "rewards/format_reward": 0.0, + "step": 1837 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.80555725097656, + "epoch": 13.813432835820896, + "grad_norm": 0.0, + "learning_rate": 4.97899296914509e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1838 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.22222900390625, + "epoch": 13.82089552238806, + "grad_norm": 0.5941534222200323, + "learning_rate": 4.978908637800142e-07, + "loss": -0.0004, + "reward": 0.944444477558136, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 0.0, + "step": 1839 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.77777862548828, + "epoch": 13.828358208955224, + "grad_norm": 0.3871535442982758, + "learning_rate": 4.978824138239835e-07, + "loss": 0.0, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1840 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.80555725097656, + "epoch": 13.835820895522389, + "grad_norm": 0.5361445013770092, + "learning_rate": 4.978739470469902e-07, + "loss": 0.0003, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1841 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.0, + "epoch": 13.843283582089553, + "grad_norm": 0.0, + "learning_rate": 4.978654634496089e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1842 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.61111450195312, + "epoch": 13.850746268656717, + "grad_norm": 0.0, + "learning_rate": 4.978569630324153e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1843 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.44444274902344, + "epoch": 13.85820895522388, + "grad_norm": 0.0, + "learning_rate": 4.978484457959862e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 1844 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.4166717529297, + "epoch": 13.865671641791044, + "grad_norm": 0.3845433003899264, + "learning_rate": 4.978399117408995e-07, + "loss": -0.0001, + "reward": 0.5, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1845 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.0, + "epoch": 13.873134328358208, + "grad_norm": 0.0, + "learning_rate": 4.978313608677345e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1846 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.8333282470703, + "epoch": 13.880597014925373, + "grad_norm": 0.0, + "learning_rate": 4.978227931770712e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1847 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.63888549804688, + "epoch": 13.888059701492537, + "grad_norm": 0.0, + "learning_rate": 4.978142086694912e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1848 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.7777862548828, + "epoch": 13.895522388059701, + "grad_norm": 0.0, + "learning_rate": 4.97805607345577e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1849 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.86111450195312, + "epoch": 13.902985074626866, + "grad_norm": 0.0, + "learning_rate": 4.977969892059122e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1850 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.1666717529297, + "epoch": 13.91044776119403, + "grad_norm": 0.0, + "learning_rate": 4.977883542510817e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1851 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.3333282470703, + "epoch": 13.917910447761194, + "grad_norm": 1.1913730590006564, + "learning_rate": 4.977797024816713e-07, + "loss": -0.0006, + "reward": 0.8611111044883728, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1852 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.3333282470703, + "epoch": 13.925373134328359, + "grad_norm": 0.2310337642678498, + "learning_rate": 4.977710338982683e-07, + "loss": 0.0003, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1853 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.94444274902344, + "epoch": 13.932835820895523, + "grad_norm": 0.0, + "learning_rate": 4.977623485014608e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1854 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.3333282470703, + "epoch": 13.940298507462687, + "grad_norm": 0.43763095523703904, + "learning_rate": 4.977536462918382e-07, + "loss": -0.0003, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1855 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.61111450195312, + "epoch": 13.947761194029852, + "grad_norm": 0.0, + "learning_rate": 4.97744927269991e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1856 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.55555725097656, + "epoch": 13.955223880597014, + "grad_norm": 0.0, + "learning_rate": 4.97736191436511e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1857 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.36111450195312, + "epoch": 13.962686567164178, + "grad_norm": 0.49706876982165404, + "learning_rate": 4.97727438791991e-07, + "loss": -0.0002, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1858 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.36111450195312, + "epoch": 13.970149253731343, + "grad_norm": 2.7928171776694057, + "learning_rate": 4.977186693370246e-07, + "loss": -0.0003, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1859 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.86111450195312, + "epoch": 13.977611940298507, + "grad_norm": 0.0, + "learning_rate": 4.977098830722073e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1860 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.19444274902344, + "epoch": 13.985074626865671, + "grad_norm": 0.7304214161221042, + "learning_rate": 4.977010799981351e-07, + "loss": -0.0001, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1861 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.25, + "epoch": 13.992537313432836, + "grad_norm": 0.0, + "learning_rate": 4.976922601154054e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1862 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.6666717529297, + "epoch": 14.007462686567164, + "grad_norm": 0.24195240332017856, + "learning_rate": 4.976834234246167e-07, + "loss": 0.0003, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1863 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.0, + "epoch": 14.014925373134329, + "grad_norm": 0.17292872527788525, + "learning_rate": 4.976745699263687e-07, + "loss": -0.0005, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1864 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.55555725097656, + "epoch": 14.022388059701493, + "grad_norm": 0.0, + "learning_rate": 4.976656996212621e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1865 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.44444274902344, + "epoch": 14.029850746268657, + "grad_norm": 0.0, + "learning_rate": 4.97656812509899e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1866 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.0277862548828, + "epoch": 14.037313432835822, + "grad_norm": 0.2214095997494307, + "learning_rate": 4.976479085928822e-07, + "loss": -0.0006, + "reward": 0.8333333134651184, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 1867 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.44444274902344, + "epoch": 14.044776119402986, + "grad_norm": 0.0, + "learning_rate": 4.976389878708162e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1868 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.80555725097656, + "epoch": 14.052238805970148, + "grad_norm": 0.5447249876969911, + "learning_rate": 4.976300503443061e-07, + "loss": -0.0009, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1869 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.3333282470703, + "epoch": 14.059701492537313, + "grad_norm": 0.0, + "learning_rate": 4.976210960139586e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1870 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.4166717529297, + "epoch": 14.067164179104477, + "grad_norm": 0.0, + "learning_rate": 4.976121248803811e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1871 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.13888549804688, + "epoch": 14.074626865671641, + "grad_norm": 0.0, + "learning_rate": 4.976031369441825e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1872 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.86111450195312, + "epoch": 14.082089552238806, + "grad_norm": 0.1790941789326064, + "learning_rate": 4.975941322059728e-07, + "loss": 0.0003, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1873 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.97222137451172, + "epoch": 14.08955223880597, + "grad_norm": 0.0, + "learning_rate": 4.975851106663629e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1874 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.1666717529297, + "epoch": 14.097014925373134, + "grad_norm": 1.150676579865302, + "learning_rate": 4.975760723259651e-07, + "loss": 0.0002, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.75, + "epoch": 14.104477611940299, + "grad_norm": 0.25967796948561633, + "learning_rate": 4.975670171853925e-07, + "loss": -0.0, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1876 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.2777862548828, + "epoch": 14.111940298507463, + "grad_norm": 0.0, + "learning_rate": 4.975579452452599e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1877 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.9166717529297, + "epoch": 14.119402985074627, + "grad_norm": 0.0, + "learning_rate": 4.975488565061826e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1878 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.94444274902344, + "epoch": 14.126865671641792, + "grad_norm": 0.23681984570457462, + "learning_rate": 4.975397509687776e-07, + "loss": 0.0, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1879 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.80555725097656, + "epoch": 14.134328358208956, + "grad_norm": 0.39050667709089165, + "learning_rate": 4.975306286336627e-07, + "loss": 0.0004, + "reward": 0.5, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1880 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.3333282470703, + "epoch": 14.14179104477612, + "grad_norm": 0.0, + "learning_rate": 4.975214895014569e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1881 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.9166717529297, + "epoch": 14.149253731343283, + "grad_norm": 0.0, + "learning_rate": 4.975123335727804e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1882 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.1666717529297, + "epoch": 14.156716417910447, + "grad_norm": 1.6272782914794366, + "learning_rate": 4.975031608482544e-07, + "loss": -0.0005, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1883 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.63888549804688, + "epoch": 14.164179104477611, + "grad_norm": 0.0, + "learning_rate": 4.974939713285016e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1884 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.36111450195312, + "epoch": 14.171641791044776, + "grad_norm": 0.0, + "learning_rate": 4.974847650141452e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1885 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.0833282470703, + "epoch": 14.17910447761194, + "grad_norm": 0.0, + "learning_rate": 4.974755419058104e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1886 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.80555725097656, + "epoch": 14.186567164179104, + "grad_norm": 0.0, + "learning_rate": 4.974663020041227e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1887 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.25, + "epoch": 14.194029850746269, + "grad_norm": 0.3673189744553574, + "learning_rate": 4.974570453097092e-07, + "loss": -0.0, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1888 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.69444274902344, + "epoch": 14.201492537313433, + "grad_norm": 0.0, + "learning_rate": 4.97447771823198e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 1889 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.88888549804688, + "epoch": 14.208955223880597, + "grad_norm": 0.0, + "learning_rate": 4.974384815452186e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1890 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.80555725097656, + "epoch": 14.216417910447761, + "grad_norm": 0.0, + "learning_rate": 4.974291744764013e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1891 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.86111450195312, + "epoch": 14.223880597014926, + "grad_norm": 0.7284836580243275, + "learning_rate": 4.974198506173776e-07, + "loss": 0.001, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1892 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.25, + "epoch": 14.23134328358209, + "grad_norm": 0.27052699720435636, + "learning_rate": 4.974105099687803e-07, + "loss": -0.0001, + "reward": 0.472222238779068, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1893 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.9166717529297, + "epoch": 14.238805970149254, + "grad_norm": 0.0, + "learning_rate": 4.974011525312432e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1894 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.69444274902344, + "epoch": 14.246268656716419, + "grad_norm": 0.0, + "learning_rate": 4.973917783054012e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1895 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.0833282470703, + "epoch": 14.253731343283581, + "grad_norm": 0.6164050065828688, + "learning_rate": 4.973823872918907e-07, + "loss": 0.0007, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 1896 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.11111450195312, + "epoch": 14.261194029850746, + "grad_norm": 0.0, + "learning_rate": 4.973729794913486e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1897 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.19444274902344, + "epoch": 14.26865671641791, + "grad_norm": 0.5609102753840663, + "learning_rate": 4.973635549044135e-07, + "loss": 0.0, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1898 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.38888549804688, + "epoch": 14.276119402985074, + "grad_norm": 0.0, + "learning_rate": 4.973541135317249e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1899 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.25, + "epoch": 14.283582089552239, + "grad_norm": 1.3966107632125986, + "learning_rate": 4.973446553739235e-07, + "loss": -0.0005, + "reward": 0.8333333134651184, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 1900 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.1666717529297, + "epoch": 14.291044776119403, + "grad_norm": 0.42567943463423896, + "learning_rate": 4.973351804316512e-07, + "loss": -0.0005, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1901 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.6666717529297, + "epoch": 14.298507462686567, + "grad_norm": 0.502144165453154, + "learning_rate": 4.973256887055508e-07, + "loss": 0.0, + "reward": 0.8333333134651184, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 1902 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.69444274902344, + "epoch": 14.305970149253731, + "grad_norm": 0.49604511149268393, + "learning_rate": 4.973161801962664e-07, + "loss": 0.0, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 1903 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.72222900390625, + "epoch": 14.313432835820896, + "grad_norm": 0.0, + "learning_rate": 4.973066549044433e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1904 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.30555725097656, + "epoch": 14.32089552238806, + "grad_norm": 0.8588104049645796, + "learning_rate": 4.97297112830728e-07, + "loss": -0.0002, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1905 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.0833282470703, + "epoch": 14.328358208955224, + "grad_norm": 0.0, + "learning_rate": 4.972875539757677e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1906 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.97222900390625, + "epoch": 14.335820895522389, + "grad_norm": 0.7430529870378894, + "learning_rate": 4.972779783402114e-07, + "loss": 0.0007, + "reward": 0.8333333134651184, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 1907 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.7777862548828, + "epoch": 14.343283582089553, + "grad_norm": 0.37612997487079525, + "learning_rate": 4.972683859247086e-07, + "loss": -0.0005, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1908 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.30555725097656, + "epoch": 14.350746268656717, + "grad_norm": 0.0, + "learning_rate": 4.972587767299104e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1909 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.61111450195312, + "epoch": 14.35820895522388, + "grad_norm": 0.0, + "learning_rate": 4.972491507564688e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1910 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.80555725097656, + "epoch": 14.365671641791044, + "grad_norm": 0.0, + "learning_rate": 4.97239508005037e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1911 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.05555725097656, + "epoch": 14.373134328358208, + "grad_norm": 0.0, + "learning_rate": 4.972298484762694e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 1912 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.4166717529297, + "epoch": 14.380597014925373, + "grad_norm": 0.0, + "learning_rate": 4.972201721708213e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1913 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.5833282470703, + "epoch": 14.388059701492537, + "grad_norm": 0.5932079058434281, + "learning_rate": 4.972104790893497e-07, + "loss": 0.0004, + "reward": 0.5, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1914 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.22222137451172, + "epoch": 14.395522388059701, + "grad_norm": 0.5850847249799966, + "learning_rate": 4.972007692325119e-07, + "loss": -0.0004, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1915 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.80555725097656, + "epoch": 14.402985074626866, + "grad_norm": 0.0, + "learning_rate": 4.971910426009672e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1916 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.25, + "epoch": 14.41044776119403, + "grad_norm": 0.0, + "learning_rate": 4.971812991953754e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1917 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.7777862548828, + "epoch": 14.417910447761194, + "grad_norm": 0.8609528868501037, + "learning_rate": 4.971715390163977e-07, + "loss": -0.001, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 1918 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.05555725097656, + "epoch": 14.425373134328359, + "grad_norm": 0.0, + "learning_rate": 4.971617620646966e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1919 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.63888549804688, + "epoch": 14.432835820895523, + "grad_norm": 0.0, + "learning_rate": 4.971519683409351e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 1920 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.22222900390625, + "epoch": 14.440298507462687, + "grad_norm": 1.6633740721186119, + "learning_rate": 4.971421578457783e-07, + "loss": 0.0004, + "reward": 0.8055555820465088, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1921 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.94444274902344, + "epoch": 14.447761194029852, + "grad_norm": 0.8459515266577514, + "learning_rate": 4.971323305798917e-07, + "loss": -0.0002, + "reward": 0.8055555820465088, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1922 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.55555725097656, + "epoch": 14.455223880597014, + "grad_norm": 0.7147209035141382, + "learning_rate": 4.97122486543942e-07, + "loss": -0.0001, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1923 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.63888549804688, + "epoch": 14.462686567164178, + "grad_norm": 0.0, + "learning_rate": 4.971126257385976e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1924 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.88888549804688, + "epoch": 14.470149253731343, + "grad_norm": 0.0, + "learning_rate": 4.971027481645273e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1925 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.80555725097656, + "epoch": 14.477611940298507, + "grad_norm": 0.519319596657952, + "learning_rate": 4.970928538224016e-07, + "loss": -0.0001, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1926 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.6666717529297, + "epoch": 14.485074626865671, + "grad_norm": 0.0, + "learning_rate": 4.970829427128918e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1927 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.86111450195312, + "epoch": 14.492537313432836, + "grad_norm": 0.0, + "learning_rate": 4.970730148366704e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1928 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.86111450195312, + "epoch": 14.5, + "grad_norm": 0.0, + "learning_rate": 4.970630701944112e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1929 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.75, + "epoch": 14.507462686567164, + "grad_norm": 0.0, + "learning_rate": 4.97053108786789e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1930 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.6388931274414, + "epoch": 14.514925373134329, + "grad_norm": 0.0, + "learning_rate": 4.970431306144798e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1931 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.86111450195312, + "epoch": 14.522388059701493, + "grad_norm": 1.6611959216115568, + "learning_rate": 4.970331356781605e-07, + "loss": 0.0, + "reward": 0.472222238779068, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 1932 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.69444274902344, + "epoch": 14.529850746268657, + "grad_norm": 0.0, + "learning_rate": 4.970231239785096e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1933 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.9166717529297, + "epoch": 14.537313432835822, + "grad_norm": 0.335307916032579, + "learning_rate": 4.970130955162064e-07, + "loss": 0.0006, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1934 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.0, + "epoch": 14.544776119402986, + "grad_norm": 0.7164913344686712, + "learning_rate": 4.970030502919315e-07, + "loss": -0.0005, + "reward": 0.8333333134651184, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 1935 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.94444274902344, + "epoch": 14.552238805970148, + "grad_norm": 0.47138433237613037, + "learning_rate": 4.969929883063664e-07, + "loss": -0.0001, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 1936 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.36111450195312, + "epoch": 14.559701492537313, + "grad_norm": 0.0, + "learning_rate": 4.969829095601938e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1937 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.2777862548828, + "epoch": 14.567164179104477, + "grad_norm": 0.0, + "learning_rate": 4.96972814054098e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1938 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.2777862548828, + "epoch": 14.574626865671641, + "grad_norm": 0.0, + "learning_rate": 4.969627017887637e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1939 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.55555725097656, + "epoch": 14.582089552238806, + "grad_norm": 0.4266615700226842, + "learning_rate": 4.969525727648774e-07, + "loss": 0.0001, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1940 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.2777862548828, + "epoch": 14.58955223880597, + "grad_norm": 0.0, + "learning_rate": 4.969424269831262e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1941 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.97222900390625, + "epoch": 14.597014925373134, + "grad_norm": 0.4072472964561656, + "learning_rate": 4.969322644441987e-07, + "loss": 0.0006, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1942 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.63888549804688, + "epoch": 14.604477611940299, + "grad_norm": 0.7103799990974545, + "learning_rate": 4.969220851487844e-07, + "loss": 0.0, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 1943 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.94444274902344, + "epoch": 14.611940298507463, + "grad_norm": 0.0, + "learning_rate": 4.969118890975742e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1944 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.22222900390625, + "epoch": 14.619402985074627, + "grad_norm": 0.0, + "learning_rate": 4.9690167629126e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 1945 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.88888549804688, + "epoch": 14.626865671641792, + "grad_norm": 0.0, + "learning_rate": 4.968914467305347e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1946 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.13888549804688, + "epoch": 14.634328358208956, + "grad_norm": 0.0, + "learning_rate": 4.968812004160926e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1947 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.6666717529297, + "epoch": 14.64179104477612, + "grad_norm": 0.372481531305509, + "learning_rate": 4.968709373486288e-07, + "loss": -0.0006, + "reward": 0.944444477558136, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 0.0, + "step": 1948 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.2777862548828, + "epoch": 14.649253731343283, + "grad_norm": 0.0, + "learning_rate": 4.9686065752884e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1949 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.7777862548828, + "epoch": 14.656716417910447, + "grad_norm": 0.0, + "learning_rate": 4.968503609574236e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1950 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.5, + "epoch": 14.664179104477611, + "grad_norm": 0.7175156205015029, + "learning_rate": 4.968400476350783e-07, + "loss": -0.0001, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1951 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.7777862548828, + "epoch": 14.671641791044776, + "grad_norm": 0.0, + "learning_rate": 4.968297175625041e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1952 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.86111450195312, + "epoch": 14.67910447761194, + "grad_norm": 0.0, + "learning_rate": 4.968193707404018e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1953 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.0, + "epoch": 14.686567164179104, + "grad_norm": 0.0, + "learning_rate": 4.968090071694736e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1954 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.19444274902344, + "epoch": 14.694029850746269, + "grad_norm": 0.0, + "learning_rate": 4.967986268504227e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1955 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.13888549804688, + "epoch": 14.701492537313433, + "grad_norm": 0.0, + "learning_rate": 4.967882297839537e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1956 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.0277862548828, + "epoch": 14.708955223880597, + "grad_norm": 0.0, + "learning_rate": 4.967778159707719e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1957 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.13888549804688, + "epoch": 14.716417910447761, + "grad_norm": 0.0, + "learning_rate": 4.967673854115841e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1958 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.4166717529297, + "epoch": 14.723880597014926, + "grad_norm": 0.0, + "learning_rate": 4.96756938107098e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1959 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.25, + "epoch": 14.73134328358209, + "grad_norm": 0.4697651419868399, + "learning_rate": 4.967464740580227e-07, + "loss": 0.0006, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1960 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.47222900390625, + "epoch": 14.738805970149254, + "grad_norm": 0.5693770609913609, + "learning_rate": 4.967359932650681e-07, + "loss": 0.0007, + "reward": 0.4166666567325592, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 1961 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.72222900390625, + "epoch": 14.746268656716419, + "grad_norm": 0.0, + "learning_rate": 4.967254957289455e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1962 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.5277862548828, + "epoch": 14.753731343283581, + "grad_norm": 1.0924989609469784, + "learning_rate": 4.967149814503672e-07, + "loss": 0.0091, + "reward": 0.5, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1963 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.72222900390625, + "epoch": 14.761194029850746, + "grad_norm": 0.0, + "learning_rate": 4.967044504300468e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1964 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.80555725097656, + "epoch": 14.76865671641791, + "grad_norm": 0.0, + "learning_rate": 4.966939026686988e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1965 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.5277862548828, + "epoch": 14.776119402985074, + "grad_norm": 0.811379371083823, + "learning_rate": 4.96683338167039e-07, + "loss": 0.0001, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 1966 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.80555725097656, + "epoch": 14.783582089552239, + "grad_norm": 0.0, + "learning_rate": 4.966727569257843e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1967 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.88888549804688, + "epoch": 14.791044776119403, + "grad_norm": 0.0, + "learning_rate": 4.966621589456527e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1968 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.75, + "epoch": 14.798507462686567, + "grad_norm": 0.0, + "learning_rate": 4.966515442273634e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1969 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.80555725097656, + "epoch": 14.805970149253731, + "grad_norm": 0.7338091050815445, + "learning_rate": 4.966409127716366e-07, + "loss": 0.0002, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1970 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.4166717529297, + "epoch": 14.813432835820896, + "grad_norm": 0.0, + "learning_rate": 4.966302645791938e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1971 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.80555725097656, + "epoch": 14.82089552238806, + "grad_norm": 0.4465620949627133, + "learning_rate": 4.966195996507578e-07, + "loss": 0.0, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 1972 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.94444274902344, + "epoch": 14.828358208955224, + "grad_norm": 0.0, + "learning_rate": 4.966089179870519e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 1973 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.55555725097656, + "epoch": 14.835820895522389, + "grad_norm": 1.0292567903226941, + "learning_rate": 4.965982195888012e-07, + "loss": 0.0, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1974 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.5833282470703, + "epoch": 14.843283582089553, + "grad_norm": 0.0, + "learning_rate": 4.965875044567317e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1975 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.75, + "epoch": 14.850746268656717, + "grad_norm": 0.0, + "learning_rate": 4.965767725915704e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1976 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.61111450195312, + "epoch": 14.85820895522388, + "grad_norm": 0.0, + "learning_rate": 4.965660239940456e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1977 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.9166717529297, + "epoch": 14.865671641791044, + "grad_norm": 0.974812180844557, + "learning_rate": 4.965552586648866e-07, + "loss": 0.0003, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1978 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.3333282470703, + "epoch": 14.873134328358208, + "grad_norm": 0.0, + "learning_rate": 4.96544476604824e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1979 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.75, + "epoch": 14.880597014925373, + "grad_norm": 0.5674495330824593, + "learning_rate": 4.965336778145895e-07, + "loss": -0.0001, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 1980 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.36111450195312, + "epoch": 14.888059701492537, + "grad_norm": 0.0, + "learning_rate": 4.965228622949158e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1981 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.13888549804688, + "epoch": 14.895522388059701, + "grad_norm": 0.0, + "learning_rate": 4.965120300465368e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1982 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.94444274902344, + "epoch": 14.902985074626866, + "grad_norm": 0.6889156468442975, + "learning_rate": 4.965011810701877e-07, + "loss": 0.0001, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1983 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.61111450195312, + "epoch": 14.91044776119403, + "grad_norm": 0.0, + "learning_rate": 4.964903153666046e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1984 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.13888549804688, + "epoch": 14.917910447761194, + "grad_norm": 0.0, + "learning_rate": 4.964794329365248e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 1985 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.3888931274414, + "epoch": 14.925373134328359, + "grad_norm": 0.0, + "learning_rate": 4.96468533780687e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1986 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.5, + "epoch": 14.932835820895523, + "grad_norm": 0.39052236506939286, + "learning_rate": 4.964576178998304e-07, + "loss": 0.0005, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 1987 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.5277862548828, + "epoch": 14.940298507462687, + "grad_norm": 0.0, + "learning_rate": 4.964466852946962e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1988 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.11111450195312, + "epoch": 14.947761194029852, + "grad_norm": 0.0, + "learning_rate": 4.964357359660259e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 1989 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.61111450195312, + "epoch": 14.955223880597014, + "grad_norm": 1.9306073699292319, + "learning_rate": 4.964247699145625e-07, + "loss": -0.0006, + "reward": 0.944444477558136, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 0.0, + "step": 1990 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.47222900390625, + "epoch": 14.962686567164178, + "grad_norm": 1.6315422044394499, + "learning_rate": 4.964137871410505e-07, + "loss": -0.0001, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 1991 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.25, + "epoch": 14.970149253731343, + "grad_norm": 0.0, + "learning_rate": 4.964027876462349e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1992 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.1666717529297, + "epoch": 14.977611940298507, + "grad_norm": 0.0, + "learning_rate": 4.963917714308621e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 1993 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.47222900390625, + "epoch": 14.985074626865671, + "grad_norm": 1.5722027398803478, + "learning_rate": 4.963807384956797e-07, + "loss": -0.0027, + "reward": 0.6666666865348816, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1994 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.6666717529297, + "epoch": 14.992537313432836, + "grad_norm": 0.8914788580013384, + "learning_rate": 4.963696888414365e-07, + "loss": 0.0001, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1995 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.47222900390625, + "epoch": 15.007462686567164, + "grad_norm": 1.0873818380704774, + "learning_rate": 4.963586224688821e-07, + "loss": -0.0011, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 1996 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.83333587646484, + "epoch": 15.014925373134329, + "grad_norm": 0.4669079458430151, + "learning_rate": 4.963475393787676e-07, + "loss": -0.0001, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 1997 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.05555725097656, + "epoch": 15.022388059701493, + "grad_norm": 0.0, + "learning_rate": 4.96336439571845e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 1998 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.36111450195312, + "epoch": 15.029850746268657, + "grad_norm": 0.0, + "learning_rate": 4.963253230488676e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 1999 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.80555725097656, + "epoch": 15.037313432835822, + "grad_norm": 0.0, + "learning_rate": 4.963141898105897e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2000 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.3333282470703, + "epoch": 15.044776119402986, + "grad_norm": 0.0, + "learning_rate": 4.963030398577668e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2001 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.69444274902344, + "epoch": 15.052238805970148, + "grad_norm": 0.0, + "learning_rate": 4.962918731911555e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2002 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.36111450195312, + "epoch": 15.059701492537313, + "grad_norm": 0.0, + "learning_rate": 4.962806898115136e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2003 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.94444274902344, + "epoch": 15.067164179104477, + "grad_norm": 0.0, + "learning_rate": 4.962694897195999e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2004 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.75, + "epoch": 15.074626865671641, + "grad_norm": 0.0, + "learning_rate": 4.962582729161745e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2005 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.63888549804688, + "epoch": 15.082089552238806, + "grad_norm": 0.0, + "learning_rate": 4.962470394019986e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2006 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.2777862548828, + "epoch": 15.08955223880597, + "grad_norm": 0.0, + "learning_rate": 4.962357891778344e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2007 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.11111450195312, + "epoch": 15.097014925373134, + "grad_norm": 0.0, + "learning_rate": 4.962245222444454e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2008 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.86111450195312, + "epoch": 15.104477611940299, + "grad_norm": 1.1275470321509395, + "learning_rate": 4.962132386025959e-07, + "loss": -0.0006, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 2009 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.61111450195312, + "epoch": 15.111940298507463, + "grad_norm": 3.1013848204736796, + "learning_rate": 4.96201938253052e-07, + "loss": -0.0003, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 2010 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.47222900390625, + "epoch": 15.119402985074627, + "grad_norm": 0.8527560330859243, + "learning_rate": 4.961906211965803e-07, + "loss": 0.0004, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 2011 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.6666717529297, + "epoch": 15.126865671641792, + "grad_norm": 0.4897899605674039, + "learning_rate": 4.961792874339488e-07, + "loss": 0.0001, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 2012 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.0277862548828, + "epoch": 15.134328358208956, + "grad_norm": 0.0, + "learning_rate": 4.961679369659265e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2013 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.5, + "epoch": 15.14179104477612, + "grad_norm": 0.0, + "learning_rate": 4.961565697932838e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2014 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.94444274902344, + "epoch": 15.149253731343283, + "grad_norm": 0.0, + "learning_rate": 4.961451859167919e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2015 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.1666717529297, + "epoch": 15.156716417910447, + "grad_norm": 0.4799716049296357, + "learning_rate": 4.961337853372235e-07, + "loss": -0.0003, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2016 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.44444274902344, + "epoch": 15.164179104477611, + "grad_norm": 0.0, + "learning_rate": 4.96122368055352e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2017 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.36111450195312, + "epoch": 15.171641791044776, + "grad_norm": 0.0, + "learning_rate": 4.961109340719522e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2018 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.38888549804688, + "epoch": 15.17910447761194, + "grad_norm": 1.3578610809537315, + "learning_rate": 4.960994833878002e-07, + "loss": 0.0004, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2019 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.30555725097656, + "epoch": 15.186567164179104, + "grad_norm": 0.0, + "learning_rate": 4.960880160036727e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2020 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.38888549804688, + "epoch": 15.194029850746269, + "grad_norm": 0.0, + "learning_rate": 4.960765319203482e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2021 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.19444274902344, + "epoch": 15.201492537313433, + "grad_norm": 0.0, + "learning_rate": 4.960650311386057e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2022 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.80555725097656, + "epoch": 15.208955223880597, + "grad_norm": 0.0, + "learning_rate": 4.960535136592259e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2023 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.75, + "epoch": 15.216417910447761, + "grad_norm": 0.0, + "learning_rate": 4.960419794829901e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2024 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.36111450195312, + "epoch": 15.223880597014926, + "grad_norm": 0.0, + "learning_rate": 4.960304286106812e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2025 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.80555725097656, + "epoch": 15.23134328358209, + "grad_norm": 1.5360143669090194, + "learning_rate": 4.960188610430829e-07, + "loss": 0.0001, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 2026 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.88888549804688, + "epoch": 15.238805970149254, + "grad_norm": 0.0, + "learning_rate": 4.960072767809802e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2027 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.69444274902344, + "epoch": 15.246268656716419, + "grad_norm": 0.3567079473345202, + "learning_rate": 4.959956758251592e-07, + "loss": -0.0006, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 2028 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.5833282470703, + "epoch": 15.253731343283581, + "grad_norm": 0.0, + "learning_rate": 4.959840581764071e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2029 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.94444274902344, + "epoch": 15.261194029850746, + "grad_norm": 0.0, + "learning_rate": 4.959724238355123e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2030 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.47222900390625, + "epoch": 15.26865671641791, + "grad_norm": 0.0, + "learning_rate": 4.959607728032643e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2031 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.47222900390625, + "epoch": 15.276119402985074, + "grad_norm": 0.0, + "learning_rate": 4.959491050804537e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2032 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.22222900390625, + "epoch": 15.283582089552239, + "grad_norm": 0.39292951954149047, + "learning_rate": 4.959374206678721e-07, + "loss": 0.0, + "reward": 0.4166666567325592, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 2033 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.6666717529297, + "epoch": 15.291044776119403, + "grad_norm": 0.0, + "learning_rate": 4.959257195663127e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2034 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.6666717529297, + "epoch": 15.298507462686567, + "grad_norm": 0.0, + "learning_rate": 4.959140017765693e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2035 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.19444274902344, + "epoch": 15.305970149253731, + "grad_norm": 0.0, + "learning_rate": 4.959022672994371e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2036 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.8333282470703, + "epoch": 15.313432835820896, + "grad_norm": 0.0, + "learning_rate": 4.958905161357124e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2037 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.5833282470703, + "epoch": 15.32089552238806, + "grad_norm": 0.0, + "learning_rate": 4.958787482861926e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2038 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.30555725097656, + "epoch": 15.328358208955224, + "grad_norm": 0.0, + "learning_rate": 4.958669637516762e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2039 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.88888549804688, + "epoch": 15.335820895522389, + "grad_norm": 0.0, + "learning_rate": 4.95855162532963e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2040 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.63888549804688, + "epoch": 15.343283582089553, + "grad_norm": 0.0, + "learning_rate": 4.958433446308538e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2041 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.0, + "epoch": 15.350746268656717, + "grad_norm": 0.0, + "learning_rate": 4.958315100461505e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2042 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.5277862548828, + "epoch": 15.35820895522388, + "grad_norm": 0.923657063069663, + "learning_rate": 4.958196587796562e-07, + "loss": -0.0003, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2043 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.94444274902344, + "epoch": 15.365671641791044, + "grad_norm": 0.0, + "learning_rate": 4.95807790832175e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2044 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.4166717529297, + "epoch": 15.373134328358208, + "grad_norm": 0.0, + "learning_rate": 4.957959062045124e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2045 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.36111450195312, + "epoch": 15.380597014925373, + "grad_norm": 0.7061071138583849, + "learning_rate": 4.957840048974748e-07, + "loss": -0.0046, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2046 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.7777862548828, + "epoch": 15.388059701492537, + "grad_norm": 0.0, + "learning_rate": 4.957720869118699e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2047 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.72222900390625, + "epoch": 15.395522388059701, + "grad_norm": 0.0, + "learning_rate": 4.957601522485062e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2048 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.97222900390625, + "epoch": 15.402985074626866, + "grad_norm": 0.0, + "learning_rate": 4.957482009081938e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2049 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.0277862548828, + "epoch": 15.41044776119403, + "grad_norm": 0.4066786732262992, + "learning_rate": 4.957362328917437e-07, + "loss": -0.0001, + "reward": 0.8333333134651184, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 2050 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.72222900390625, + "epoch": 15.417910447761194, + "grad_norm": 0.0, + "learning_rate": 4.957242481999678e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2051 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.91666412353516, + "epoch": 15.425373134328359, + "grad_norm": 1.1548364555745387, + "learning_rate": 4.957122468336797e-07, + "loss": 0.0, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 2052 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.63888549804688, + "epoch": 15.432835820895523, + "grad_norm": 0.9730245745852779, + "learning_rate": 4.957002287936935e-07, + "loss": -0.0, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 2053 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.97222900390625, + "epoch": 15.440298507462687, + "grad_norm": 0.0, + "learning_rate": 4.95688194080825e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2054 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.63888549804688, + "epoch": 15.447761194029852, + "grad_norm": 1.336751683217508, + "learning_rate": 4.956761426958905e-07, + "loss": -0.0004, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 2055 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.47222900390625, + "epoch": 15.455223880597014, + "grad_norm": 0.0, + "learning_rate": 4.956640746397082e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2056 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.25, + "epoch": 15.462686567164178, + "grad_norm": 0.0, + "learning_rate": 4.956519899130967e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2057 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.72222900390625, + "epoch": 15.470149253731343, + "grad_norm": 0.0, + "learning_rate": 4.956398885168763e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2058 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.19444274902344, + "epoch": 15.477611940298507, + "grad_norm": 0.0, + "learning_rate": 4.956277704518678e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2059 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.4166717529297, + "epoch": 15.485074626865671, + "grad_norm": 1.8558728472588977, + "learning_rate": 4.956156357188939e-07, + "loss": -0.0002, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2060 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.1666717529297, + "epoch": 15.492537313432836, + "grad_norm": 0.0, + "learning_rate": 4.956034843187781e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2061 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.22222900390625, + "epoch": 15.5, + "grad_norm": 0.4650835758766467, + "learning_rate": 4.955913162523446e-07, + "loss": -0.0002, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 2062 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.5277862548828, + "epoch": 15.507462686567164, + "grad_norm": 0.7592704899824002, + "learning_rate": 4.955791315204194e-07, + "loss": 0.0, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 2063 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.9166717529297, + "epoch": 15.514925373134329, + "grad_norm": 0.0, + "learning_rate": 4.955669301238292e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2064 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.0833282470703, + "epoch": 15.522388059701493, + "grad_norm": 0.5441387824791207, + "learning_rate": 4.95554712063402e-07, + "loss": 0.0002, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 2065 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.86111450195312, + "epoch": 15.529850746268657, + "grad_norm": 0.0, + "learning_rate": 4.95542477339967e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2066 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.05555725097656, + "epoch": 15.537313432835822, + "grad_norm": 0.0, + "learning_rate": 4.955302259543544e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2067 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.72222900390625, + "epoch": 15.544776119402986, + "grad_norm": 0.0, + "learning_rate": 4.955179579073954e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2068 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.13888549804688, + "epoch": 15.552238805970148, + "grad_norm": 0.0, + "learning_rate": 4.955056731999226e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2069 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.22222900390625, + "epoch": 15.559701492537313, + "grad_norm": 0.5450243383079808, + "learning_rate": 4.954933718327696e-07, + "loss": -0.0003, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 2070 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.6666717529297, + "epoch": 15.567164179104477, + "grad_norm": 0.0, + "learning_rate": 4.954810538067713e-07, + "loss": 0.0, + "reward": 0.3333333432674408, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 2071 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.0833282470703, + "epoch": 15.574626865671641, + "grad_norm": 0.0, + "learning_rate": 4.954687191227634e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2072 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.3333282470703, + "epoch": 15.582089552238806, + "grad_norm": 0.0, + "learning_rate": 4.95456367781583e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2073 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.13888549804688, + "epoch": 15.58955223880597, + "grad_norm": 0.0, + "learning_rate": 4.954439997840681e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2074 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.63888549804688, + "epoch": 15.597014925373134, + "grad_norm": 0.0, + "learning_rate": 4.954316151310582e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2075 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.1666717529297, + "epoch": 15.604477611940299, + "grad_norm": 0.0, + "learning_rate": 4.954192138233937e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2076 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.11111450195312, + "epoch": 15.611940298507463, + "grad_norm": 0.7416465926904207, + "learning_rate": 4.95406795861916e-07, + "loss": 0.0008, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2077 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.08333587646484, + "epoch": 15.619402985074627, + "grad_norm": 0.0, + "learning_rate": 4.953943612474678e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2078 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.97222900390625, + "epoch": 15.626865671641792, + "grad_norm": 0.4106587285048837, + "learning_rate": 4.953819099808928e-07, + "loss": -0.0004, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2079 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.97222900390625, + "epoch": 15.634328358208956, + "grad_norm": 0.0, + "learning_rate": 4.953694420630361e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2080 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.4166717529297, + "epoch": 15.64179104477612, + "grad_norm": 0.0, + "learning_rate": 4.953569574947437e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2081 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.47222900390625, + "epoch": 15.649253731343283, + "grad_norm": 0.0, + "learning_rate": 4.953444562768628e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2082 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.30555725097656, + "epoch": 15.656716417910447, + "grad_norm": 0.0, + "learning_rate": 4.953319384102416e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2083 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.5, + "epoch": 15.664179104477611, + "grad_norm": 1.8136481972374314, + "learning_rate": 4.953194038957297e-07, + "loss": 0.0009, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 2084 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.5277862548828, + "epoch": 15.671641791044776, + "grad_norm": 0.0, + "learning_rate": 4.953068527341777e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2085 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.36111450195312, + "epoch": 15.67910447761194, + "grad_norm": 0.3074219170114569, + "learning_rate": 4.952942849264371e-07, + "loss": -0.0006, + "reward": 0.5, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2086 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.19444274902344, + "epoch": 15.686567164179104, + "grad_norm": 0.0, + "learning_rate": 4.952817004733608e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2087 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.38888549804688, + "epoch": 15.694029850746269, + "grad_norm": 0.22584878311312415, + "learning_rate": 4.952690993758029e-07, + "loss": 0.0, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2088 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.47222900390625, + "epoch": 15.701492537313433, + "grad_norm": 0.0, + "learning_rate": 4.952564816346184e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2089 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.88888549804688, + "epoch": 15.708955223880597, + "grad_norm": 0.0, + "learning_rate": 4.952438472506635e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2090 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.3333282470703, + "epoch": 15.716417910447761, + "grad_norm": 0.0, + "learning_rate": 4.952311962247956e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2091 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.72222900390625, + "epoch": 15.723880597014926, + "grad_norm": 0.0, + "learning_rate": 4.952185285578732e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2092 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.11111450195312, + "epoch": 15.73134328358209, + "grad_norm": 0.6379925200454806, + "learning_rate": 4.952058442507558e-07, + "loss": 0.0, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 2093 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.7777862548828, + "epoch": 15.738805970149254, + "grad_norm": 0.5412893522451963, + "learning_rate": 4.951931433043043e-07, + "loss": -0.0001, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 2094 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.0277862548828, + "epoch": 15.746268656716419, + "grad_norm": 0.9762100160228598, + "learning_rate": 4.951804257193804e-07, + "loss": 0.0001, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 2095 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.5277862548828, + "epoch": 15.753731343283581, + "grad_norm": 0.0, + "learning_rate": 4.951676914968471e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2096 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.6666717529297, + "epoch": 15.761194029850746, + "grad_norm": 0.0, + "learning_rate": 4.951549406375688e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2097 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.55555725097656, + "epoch": 15.76865671641791, + "grad_norm": 0.0, + "learning_rate": 4.951421731424104e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2098 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.6666717529297, + "epoch": 15.776119402985074, + "grad_norm": 0.0, + "learning_rate": 4.951293890122386e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2099 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.0833282470703, + "epoch": 15.783582089552239, + "grad_norm": 0.0, + "learning_rate": 4.951165882479206e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2100 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.19444274902344, + "epoch": 15.791044776119403, + "grad_norm": 0.0, + "learning_rate": 4.951037708503253e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2101 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.5, + "epoch": 15.798507462686567, + "grad_norm": 0.0, + "learning_rate": 4.950909368203224e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2102 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.38888549804688, + "epoch": 15.805970149253731, + "grad_norm": 0.755557922915481, + "learning_rate": 4.950780861587826e-07, + "loss": 0.0002, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 2103 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.38888549804688, + "epoch": 15.813432835820896, + "grad_norm": 0.0, + "learning_rate": 4.950652188665783e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2104 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.80555725097656, + "epoch": 15.82089552238806, + "grad_norm": 0.0, + "learning_rate": 4.950523349445824e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2105 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.19444274902344, + "epoch": 15.828358208955224, + "grad_norm": 0.0, + "learning_rate": 4.950394343936692e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2106 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.5277862548828, + "epoch": 15.835820895522389, + "grad_norm": 0.6231518179071616, + "learning_rate": 4.950265172147141e-07, + "loss": -0.0006, + "reward": 0.9166666865348816, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 0.0, + "step": 2107 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.6666717529297, + "epoch": 15.843283582089553, + "grad_norm": 2.751606252140921, + "learning_rate": 4.950135834085939e-07, + "loss": 0.0007, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 2108 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.1666717529297, + "epoch": 15.850746268656717, + "grad_norm": 1.3488767901149845, + "learning_rate": 4.950006329761859e-07, + "loss": 0.0004, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 2109 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.94444274902344, + "epoch": 15.85820895522388, + "grad_norm": 0.0, + "learning_rate": 4.949876659183692e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2110 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.80555725097656, + "epoch": 15.865671641791044, + "grad_norm": 0.0, + "learning_rate": 4.949746822360235e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2111 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.4166717529297, + "epoch": 15.873134328358208, + "grad_norm": 0.0, + "learning_rate": 4.949616819300299e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2112 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.72222900390625, + "epoch": 15.880597014925373, + "grad_norm": 0.0, + "learning_rate": 4.949486650012707e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2113 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.5, + "epoch": 15.888059701492537, + "grad_norm": 1.7578471591598186, + "learning_rate": 4.949356314506292e-07, + "loss": -0.0009, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 2114 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.97222900390625, + "epoch": 15.895522388059701, + "grad_norm": 0.0, + "learning_rate": 4.949225812789898e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2115 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.4166717529297, + "epoch": 15.902985074626866, + "grad_norm": 0.0, + "learning_rate": 4.949095144872379e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2116 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.55555725097656, + "epoch": 15.91044776119403, + "grad_norm": 0.0, + "learning_rate": 4.948964310762605e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2117 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.0277862548828, + "epoch": 15.917910447761194, + "grad_norm": 0.0, + "learning_rate": 4.948833310469451e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2118 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.61111450195312, + "epoch": 15.925373134328359, + "grad_norm": 0.0, + "learning_rate": 4.94870214400181e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2119 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.19444274902344, + "epoch": 15.932835820895523, + "grad_norm": 3.8335976154849907, + "learning_rate": 4.948570811368579e-07, + "loss": -0.001, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 2120 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.5277862548828, + "epoch": 15.940298507462687, + "grad_norm": 0.39197065469003584, + "learning_rate": 4.948439312578673e-07, + "loss": 0.0001, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 2121 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.9166717529297, + "epoch": 15.947761194029852, + "grad_norm": 0.0, + "learning_rate": 4.948307647641015e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2122 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.25, + "epoch": 15.955223880597014, + "grad_norm": 0.0, + "learning_rate": 4.94817581656454e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2123 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.66666412353516, + "epoch": 15.962686567164178, + "grad_norm": 0.0, + "learning_rate": 4.948043819358191e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2124 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.25, + "epoch": 15.970149253731343, + "grad_norm": 0.6413821964934306, + "learning_rate": 4.947911656030927e-07, + "loss": 0.0, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2125 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.2777862548828, + "epoch": 15.977611940298507, + "grad_norm": 0.0, + "learning_rate": 4.947779326591716e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2126 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.47222900390625, + "epoch": 15.985074626865671, + "grad_norm": 0.0, + "learning_rate": 4.947646831049539e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2127 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.83334350585938, + "epoch": 15.992537313432836, + "grad_norm": 0.0, + "learning_rate": 4.947514169413386e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2128 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.8333282470703, + "epoch": 16.007462686567163, + "grad_norm": 0.0, + "learning_rate": 4.947381341692259e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2129 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.5833282470703, + "epoch": 16.01492537313433, + "grad_norm": 0.0, + "learning_rate": 4.947248347895172e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2130 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.97222900390625, + "epoch": 16.02238805970149, + "grad_norm": 0.0, + "learning_rate": 4.947115188031149e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2131 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.0, + "epoch": 16.029850746268657, + "grad_norm": 0.0, + "learning_rate": 4.946981862109227e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2132 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.30555725097656, + "epoch": 16.03731343283582, + "grad_norm": 0.0, + "learning_rate": 4.946848370138453e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2133 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.5277862548828, + "epoch": 16.044776119402986, + "grad_norm": 0.0, + "learning_rate": 4.946714712127886e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2134 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.11111450195312, + "epoch": 16.05223880597015, + "grad_norm": 0.0, + "learning_rate": 4.946580888086594e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2135 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.88888549804688, + "epoch": 16.059701492537314, + "grad_norm": 0.0, + "learning_rate": 4.946446898023662e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2136 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.63888549804688, + "epoch": 16.067164179104477, + "grad_norm": 1.4481062619315657, + "learning_rate": 4.946312741948177e-07, + "loss": -0.0002, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2137 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.69444274902344, + "epoch": 16.074626865671643, + "grad_norm": 0.0, + "learning_rate": 4.946178419869247e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2138 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.86111450195312, + "epoch": 16.082089552238806, + "grad_norm": 0.9350360959665837, + "learning_rate": 4.946043931795986e-07, + "loss": -0.0003, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 2139 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.5833282470703, + "epoch": 16.08955223880597, + "grad_norm": 0.0, + "learning_rate": 4.945909277737518e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2140 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.63888549804688, + "epoch": 16.097014925373134, + "grad_norm": 0.0, + "learning_rate": 4.945774457702983e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2141 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.0, + "epoch": 16.104477611940297, + "grad_norm": 0.4749874918576014, + "learning_rate": 4.945639471701529e-07, + "loss": -0.0005, + "reward": 0.8333333134651184, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 2142 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.7777862548828, + "epoch": 16.111940298507463, + "grad_norm": 0.0, + "learning_rate": 4.945504319742315e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2143 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.30555725097656, + "epoch": 16.119402985074625, + "grad_norm": 0.0, + "learning_rate": 4.945369001834514e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2144 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.0833282470703, + "epoch": 16.12686567164179, + "grad_norm": 0.0, + "learning_rate": 4.945233517987307e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2145 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.38888549804688, + "epoch": 16.134328358208954, + "grad_norm": 0.0, + "learning_rate": 4.945097868209887e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2146 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.0277862548828, + "epoch": 16.14179104477612, + "grad_norm": 0.0, + "learning_rate": 4.944962052511462e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2147 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.0833282470703, + "epoch": 16.149253731343283, + "grad_norm": 0.8330770974813168, + "learning_rate": 4.944826070901246e-07, + "loss": 0.0005, + "reward": 0.8055555820465088, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 2148 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.7777862548828, + "epoch": 16.15671641791045, + "grad_norm": 0.0, + "learning_rate": 4.944689923388467e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2149 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.80555725097656, + "epoch": 16.16417910447761, + "grad_norm": 0.0, + "learning_rate": 4.944553609982362e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2150 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.72222900390625, + "epoch": 16.171641791044777, + "grad_norm": 0.5513494763483959, + "learning_rate": 4.944417130692185e-07, + "loss": 0.0, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 2151 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.11111450195312, + "epoch": 16.17910447761194, + "grad_norm": 0.0, + "learning_rate": 4.944280485527195e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2152 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.5, + "epoch": 16.186567164179106, + "grad_norm": 1.1653597969242715, + "learning_rate": 4.944143674496664e-07, + "loss": -0.0001, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 2153 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.63888549804688, + "epoch": 16.19402985074627, + "grad_norm": 0.0, + "learning_rate": 4.944006697609876e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2154 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.47222900390625, + "epoch": 16.20149253731343, + "grad_norm": 0.0, + "learning_rate": 4.943869554876127e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2155 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.80555725097656, + "epoch": 16.208955223880597, + "grad_norm": 0.0, + "learning_rate": 4.943732246304723e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2156 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.38888549804688, + "epoch": 16.21641791044776, + "grad_norm": 0.6442692050381003, + "learning_rate": 4.943594771904982e-07, + "loss": 0.0007, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 2157 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.9166717529297, + "epoch": 16.223880597014926, + "grad_norm": 0.0, + "learning_rate": 4.943457131686231e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2158 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.72222900390625, + "epoch": 16.23134328358209, + "grad_norm": 0.4537615576354159, + "learning_rate": 4.943319325657812e-07, + "loss": 0.0004, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2159 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.22222900390625, + "epoch": 16.238805970149254, + "grad_norm": 0.0, + "learning_rate": 4.943181353829076e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2160 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.47222900390625, + "epoch": 16.246268656716417, + "grad_norm": 1.284570521540753, + "learning_rate": 4.943043216209384e-07, + "loss": 0.0, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 2161 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.38888549804688, + "epoch": 16.253731343283583, + "grad_norm": 0.750229269362183, + "learning_rate": 4.942904912808112e-07, + "loss": 0.0003, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2162 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.47222900390625, + "epoch": 16.261194029850746, + "grad_norm": 0.0, + "learning_rate": 4.942766443634645e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2163 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.88888549804688, + "epoch": 16.26865671641791, + "grad_norm": 0.9232980136545871, + "learning_rate": 4.942627808698378e-07, + "loss": 0.0, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2164 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.3333282470703, + "epoch": 16.276119402985074, + "grad_norm": 0.0, + "learning_rate": 4.942489008008718e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2165 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.75, + "epoch": 16.28358208955224, + "grad_norm": 0.0, + "learning_rate": 4.942350041575085e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2166 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.44444274902344, + "epoch": 16.291044776119403, + "grad_norm": 0.0, + "learning_rate": 4.94221090940691e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2167 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.36111450195312, + "epoch": 16.298507462686565, + "grad_norm": 0.647616695696134, + "learning_rate": 4.942071611513633e-07, + "loss": 0.0009, + "reward": 0.8333333134651184, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 2168 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.22222900390625, + "epoch": 16.30597014925373, + "grad_norm": 0.0, + "learning_rate": 4.941932147904708e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2169 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.86111450195312, + "epoch": 16.313432835820894, + "grad_norm": 0.0, + "learning_rate": 4.941792518589596e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2170 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.63888549804688, + "epoch": 16.32089552238806, + "grad_norm": 0.9871665375781437, + "learning_rate": 4.941652723577774e-07, + "loss": 0.0001, + "reward": 0.9166666865348816, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 0.0, + "step": 2171 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.86111450195312, + "epoch": 16.328358208955223, + "grad_norm": 0.0, + "learning_rate": 4.941512762878728e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2172 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.4166717529297, + "epoch": 16.33582089552239, + "grad_norm": 0.0, + "learning_rate": 4.941372636501955e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2173 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.63888549804688, + "epoch": 16.34328358208955, + "grad_norm": 0.0, + "learning_rate": 4.941232344456965e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2174 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.13888549804688, + "epoch": 16.350746268656717, + "grad_norm": 0.0, + "learning_rate": 4.941091886753278e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2175 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.22222900390625, + "epoch": 16.35820895522388, + "grad_norm": 0.0, + "learning_rate": 4.940951263400424e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2176 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.11111450195312, + "epoch": 16.365671641791046, + "grad_norm": 0.8734102496758931, + "learning_rate": 4.940810474407946e-07, + "loss": 0.0001, + "reward": 0.8888888955116272, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2177 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.75, + "epoch": 16.37313432835821, + "grad_norm": 0.0, + "learning_rate": 4.940669519785398e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2178 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.47222137451172, + "epoch": 16.380597014925375, + "grad_norm": 0.0, + "learning_rate": 4.940528399542345e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2179 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.30555725097656, + "epoch": 16.388059701492537, + "grad_norm": 0.0, + "learning_rate": 4.940387113688362e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2180 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.4166717529297, + "epoch": 16.395522388059703, + "grad_norm": 0.0, + "learning_rate": 4.94024566223304e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2181 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.2777862548828, + "epoch": 16.402985074626866, + "grad_norm": 0.6053879587921328, + "learning_rate": 4.940104045185973e-07, + "loss": 0.0001, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 2182 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.9166717529297, + "epoch": 16.41044776119403, + "grad_norm": 0.0, + "learning_rate": 4.939962262556774e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2183 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.4166717529297, + "epoch": 16.417910447761194, + "grad_norm": 0.0, + "learning_rate": 4.939820314355064e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2184 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.6666717529297, + "epoch": 16.425373134328357, + "grad_norm": 0.0, + "learning_rate": 4.939678200590475e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2185 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.5277862548828, + "epoch": 16.432835820895523, + "grad_norm": 0.0, + "learning_rate": 4.93953592127265e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2186 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.69444274902344, + "epoch": 16.440298507462686, + "grad_norm": 0.0, + "learning_rate": 4.939393476411244e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2187 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.5277862548828, + "epoch": 16.44776119402985, + "grad_norm": 0.0, + "learning_rate": 4.939250866015923e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2188 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.19444274902344, + "epoch": 16.455223880597014, + "grad_norm": 0.0, + "learning_rate": 4.939108090096366e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2189 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.72222900390625, + "epoch": 16.46268656716418, + "grad_norm": 12.975251881325262, + "learning_rate": 4.93896514866226e-07, + "loss": 0.0, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 2190 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.5277862548828, + "epoch": 16.470149253731343, + "grad_norm": 0.0, + "learning_rate": 4.938822041723304e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2191 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.38888549804688, + "epoch": 16.47761194029851, + "grad_norm": 0.4560607108998293, + "learning_rate": 4.938678769289212e-07, + "loss": -0.0001, + "reward": 0.5, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2192 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.5, + "epoch": 16.48507462686567, + "grad_norm": 0.0, + "learning_rate": 4.938535331369704e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2193 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.94444274902344, + "epoch": 16.492537313432837, + "grad_norm": 0.0, + "learning_rate": 4.938391727974513e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2194 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.19444274902344, + "epoch": 16.5, + "grad_norm": 0.0, + "learning_rate": 4.938247959113385e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2195 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.19444274902344, + "epoch": 16.507462686567163, + "grad_norm": 0.0, + "learning_rate": 4.938104024796076e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2196 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.05555725097656, + "epoch": 16.51492537313433, + "grad_norm": 2.0129117862364283, + "learning_rate": 4.937959925032353e-07, + "loss": -0.001, + "reward": 0.4166666567325592, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 2197 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.38888549804688, + "epoch": 16.52238805970149, + "grad_norm": 0.0, + "learning_rate": 4.937815659831994e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2198 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.80555725097656, + "epoch": 16.529850746268657, + "grad_norm": 0.0, + "learning_rate": 4.93767122920479e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2199 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.88888549804688, + "epoch": 16.53731343283582, + "grad_norm": 0.0, + "learning_rate": 4.937526633160539e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2200 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.72222900390625, + "epoch": 16.544776119402986, + "grad_norm": 0.0, + "learning_rate": 4.937381871709057e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2201 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.47222900390625, + "epoch": 16.55223880597015, + "grad_norm": 0.0, + "learning_rate": 4.937236944860164e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2202 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.05555725097656, + "epoch": 16.559701492537314, + "grad_norm": 0.0, + "learning_rate": 4.937091852623695e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2203 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.94444274902344, + "epoch": 16.567164179104477, + "grad_norm": 0.0, + "learning_rate": 4.936946595009497e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2204 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.3333282470703, + "epoch": 16.574626865671643, + "grad_norm": 0.0, + "learning_rate": 4.936801172027427e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2205 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.5277862548828, + "epoch": 16.582089552238806, + "grad_norm": 0.0, + "learning_rate": 4.936655583687353e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2206 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.1666717529297, + "epoch": 16.58955223880597, + "grad_norm": 0.0, + "learning_rate": 4.936509829999153e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2207 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.5833282470703, + "epoch": 16.597014925373134, + "grad_norm": 0.0, + "learning_rate": 4.93636391097272e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2208 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.83333587646484, + "epoch": 16.604477611940297, + "grad_norm": 0.789031811264405, + "learning_rate": 4.936217826617954e-07, + "loss": -0.0008, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 2209 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.0, + "epoch": 16.611940298507463, + "grad_norm": 0.0, + "learning_rate": 4.936071576944769e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2210 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.61111450195312, + "epoch": 16.619402985074625, + "grad_norm": 0.0, + "learning_rate": 4.935925161963089e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2211 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.30555725097656, + "epoch": 16.62686567164179, + "grad_norm": 0.0, + "learning_rate": 4.935778581682848e-07, + "loss": 0.0, + "reward": 0.3333333432674408, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 2212 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.05555725097656, + "epoch": 16.634328358208954, + "grad_norm": 2.966666770225597, + "learning_rate": 4.935631836113996e-07, + "loss": -0.0013, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 2213 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.44444274902344, + "epoch": 16.64179104477612, + "grad_norm": 0.0, + "learning_rate": 4.935484925266488e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2214 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.2777862548828, + "epoch": 16.649253731343283, + "grad_norm": 0.0, + "learning_rate": 4.935337849150295e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2215 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.2777862548828, + "epoch": 16.65671641791045, + "grad_norm": 0.0, + "learning_rate": 4.935190607775397e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2216 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.0277862548828, + "epoch": 16.66417910447761, + "grad_norm": 0.0, + "learning_rate": 4.935043201151786e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2217 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.6666717529297, + "epoch": 16.671641791044777, + "grad_norm": 0.0, + "learning_rate": 4.934895629289463e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2218 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.1388931274414, + "epoch": 16.67910447761194, + "grad_norm": 0.0, + "learning_rate": 4.934747892198444e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2219 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.19444274902344, + "epoch": 16.686567164179106, + "grad_norm": 0.0, + "learning_rate": 4.934599989888753e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2220 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.69444274902344, + "epoch": 16.69402985074627, + "grad_norm": 0.0, + "learning_rate": 4.934451922370427e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2221 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.0277862548828, + "epoch": 16.701492537313435, + "grad_norm": 1.7514122105698016, + "learning_rate": 4.934303689653513e-07, + "loss": -0.0004, + "reward": 0.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2222 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.5, + "epoch": 16.708955223880597, + "grad_norm": 0.0, + "learning_rate": 4.934155291748072e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2223 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.7777862548828, + "epoch": 16.71641791044776, + "grad_norm": 0.0, + "learning_rate": 4.934006728664171e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2224 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.41666412353516, + "epoch": 16.723880597014926, + "grad_norm": 0.0, + "learning_rate": 4.933858000411894e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2225 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.5, + "epoch": 16.73134328358209, + "grad_norm": 0.0, + "learning_rate": 4.933709107001331e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2226 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.86111450195312, + "epoch": 16.738805970149254, + "grad_norm": 0.0, + "learning_rate": 4.933560048442588e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2227 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.94444274902344, + "epoch": 16.746268656716417, + "grad_norm": 0.0, + "learning_rate": 4.933410824745778e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2228 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.5833282470703, + "epoch": 16.753731343283583, + "grad_norm": 0.0, + "learning_rate": 4.933261435921029e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2229 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.80555725097656, + "epoch": 16.761194029850746, + "grad_norm": 0.0, + "learning_rate": 4.933111881978477e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2230 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.7777862548828, + "epoch": 16.76865671641791, + "grad_norm": 0.0, + "learning_rate": 4.932962162928271e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2231 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.13888549804688, + "epoch": 16.776119402985074, + "grad_norm": 0.0, + "learning_rate": 4.93281227878057e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2232 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.7777862548828, + "epoch": 16.78358208955224, + "grad_norm": 0.0, + "learning_rate": 4.932662229545547e-07, + "loss": 0.0, + "reward": 0.3333333432674408, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 2233 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.30555725097656, + "epoch": 16.791044776119403, + "grad_norm": 0.5736889428071528, + "learning_rate": 4.93251201523338e-07, + "loss": 0.0001, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2234 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.9166717529297, + "epoch": 16.798507462686565, + "grad_norm": 0.0, + "learning_rate": 4.932361635854267e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2235 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.36111450195312, + "epoch": 16.80597014925373, + "grad_norm": 0.0, + "learning_rate": 4.932211091418411e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2236 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.36111450195312, + "epoch": 16.813432835820894, + "grad_norm": 0.2769530464827184, + "learning_rate": 4.932060381936026e-07, + "loss": 0.0003, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 2237 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.8333282470703, + "epoch": 16.82089552238806, + "grad_norm": 0.0, + "learning_rate": 4.931909507417341e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2238 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.9166717529297, + "epoch": 16.828358208955223, + "grad_norm": 0.0, + "learning_rate": 4.931758467872594e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2239 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.5, + "epoch": 16.83582089552239, + "grad_norm": 1.0967443430250186, + "learning_rate": 4.931607263312032e-07, + "loss": 0.005, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 2240 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.36111450195312, + "epoch": 16.84328358208955, + "grad_norm": 0.0, + "learning_rate": 4.931455893745918e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2241 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.6666717529297, + "epoch": 16.850746268656717, + "grad_norm": 0.0, + "learning_rate": 4.931304359184524e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2242 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.1666717529297, + "epoch": 16.85820895522388, + "grad_norm": 0.0, + "learning_rate": 4.93115265963813e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2243 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.75, + "epoch": 16.865671641791046, + "grad_norm": 0.0, + "learning_rate": 4.931000795117034e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2244 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.0833282470703, + "epoch": 16.87313432835821, + "grad_norm": 0.0, + "learning_rate": 4.930848765631537e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2245 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.3333282470703, + "epoch": 16.880597014925375, + "grad_norm": 0.0, + "learning_rate": 4.93069657119196e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2246 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.7777862548828, + "epoch": 16.888059701492537, + "grad_norm": 0.0, + "learning_rate": 4.930544211808628e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2247 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.05555725097656, + "epoch": 16.895522388059703, + "grad_norm": 0.9839599117173382, + "learning_rate": 4.93039168749188e-07, + "loss": -0.0001, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 2248 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.38888549804688, + "epoch": 16.902985074626866, + "grad_norm": 0.0, + "learning_rate": 4.930238998252068e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2249 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.72222900390625, + "epoch": 16.91044776119403, + "grad_norm": 0.0, + "learning_rate": 4.93008614409955e-07, + "loss": 0.0, + "reward": 0.3333333432674408, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 2250 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.80555725097656, + "epoch": 16.917910447761194, + "grad_norm": 0.0, + "learning_rate": 4.929933125044701e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2251 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.8333282470703, + "epoch": 16.925373134328357, + "grad_norm": 0.8806111244275229, + "learning_rate": 4.929779941097905e-07, + "loss": 0.0, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2252 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.19444274902344, + "epoch": 16.932835820895523, + "grad_norm": 0.0, + "learning_rate": 4.929626592269554e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2253 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.1666717529297, + "epoch": 16.940298507462686, + "grad_norm": 0.0, + "learning_rate": 4.929473078570058e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2254 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.94444274902344, + "epoch": 16.94776119402985, + "grad_norm": 0.0, + "learning_rate": 4.92931940000983e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2255 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.69444274902344, + "epoch": 16.955223880597014, + "grad_norm": 0.8772952674287533, + "learning_rate": 4.929165556599301e-07, + "loss": -0.0012, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2256 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.8333282470703, + "epoch": 16.96268656716418, + "grad_norm": 0.0, + "learning_rate": 4.929011548348912e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2257 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.5, + "epoch": 16.970149253731343, + "grad_norm": 0.0, + "learning_rate": 4.928857375269109e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2258 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.19444274902344, + "epoch": 16.97761194029851, + "grad_norm": 0.0, + "learning_rate": 4.928703037370359e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2259 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.72222900390625, + "epoch": 16.98507462686567, + "grad_norm": 0.0, + "learning_rate": 4.928548534663132e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2260 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.0, + "epoch": 16.992537313432837, + "grad_norm": 0.5273411563995193, + "learning_rate": 4.928393867157914e-07, + "loss": 0.0002, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 2261 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.2777862548828, + "epoch": 17.007462686567163, + "grad_norm": 0.0, + "learning_rate": 4.9282390348652e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2262 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.1666717529297, + "epoch": 17.01492537313433, + "grad_norm": 0.0, + "learning_rate": 4.928084037795497e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2263 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.8333282470703, + "epoch": 17.02238805970149, + "grad_norm": 0.581828612136818, + "learning_rate": 4.927928875959321e-07, + "loss": -0.0002, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2264 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.69444274902344, + "epoch": 17.029850746268657, + "grad_norm": 0.0, + "learning_rate": 4.927773549367205e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2265 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.02777862548828, + "epoch": 17.03731343283582, + "grad_norm": 0.0, + "learning_rate": 4.927618058029684e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2266 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.7777862548828, + "epoch": 17.044776119402986, + "grad_norm": 2.492595012162253, + "learning_rate": 4.927462401957314e-07, + "loss": -0.0008, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 2267 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.5277862548828, + "epoch": 17.05223880597015, + "grad_norm": 0.0, + "learning_rate": 4.927306581160656e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2268 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.5, + "epoch": 17.059701492537314, + "grad_norm": 0.0, + "learning_rate": 4.927150595650283e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2269 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.08333587646484, + "epoch": 17.067164179104477, + "grad_norm": 0.0, + "learning_rate": 4.926994445436781e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2270 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.55555725097656, + "epoch": 17.074626865671643, + "grad_norm": 0.0, + "learning_rate": 4.926838130530746e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2271 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.69444274902344, + "epoch": 17.082089552238806, + "grad_norm": 0.0, + "learning_rate": 4.926681650942785e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2272 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.2777862548828, + "epoch": 17.08955223880597, + "grad_norm": 0.0, + "learning_rate": 4.926525006683516e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2273 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.0, + "epoch": 17.097014925373134, + "grad_norm": 0.8438330169570732, + "learning_rate": 4.926368197763569e-07, + "loss": -0.0003, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 2274 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.0277862548828, + "epoch": 17.104477611940297, + "grad_norm": 0.0, + "learning_rate": 4.926211224193586e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2275 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.5, + "epoch": 17.111940298507463, + "grad_norm": 0.0, + "learning_rate": 4.926054085984218e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2276 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.9166717529297, + "epoch": 17.119402985074625, + "grad_norm": 0.0, + "learning_rate": 4.925896783146128e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2277 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.22222900390625, + "epoch": 17.12686567164179, + "grad_norm": 1.0659277361213988, + "learning_rate": 4.925739315689991e-07, + "loss": 0.0004, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 2278 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.94444274902344, + "epoch": 17.134328358208954, + "grad_norm": 0.0, + "learning_rate": 4.925581683626491e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2279 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.75, + "epoch": 17.14179104477612, + "grad_norm": 0.0, + "learning_rate": 4.925423886966328e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2280 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.6666717529297, + "epoch": 17.149253731343283, + "grad_norm": 0.0, + "learning_rate": 4.925265925720206e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2281 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.05555725097656, + "epoch": 17.15671641791045, + "grad_norm": 0.0, + "learning_rate": 4.925107799898847e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2282 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.94444274902344, + "epoch": 17.16417910447761, + "grad_norm": 0.0, + "learning_rate": 4.924949509512979e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2283 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.38888549804688, + "epoch": 17.171641791044777, + "grad_norm": 0.0, + "learning_rate": 4.924791054573344e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2284 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.88888549804688, + "epoch": 17.17910447761194, + "grad_norm": 0.0, + "learning_rate": 4.924632435090696e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2285 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.0277862548828, + "epoch": 17.186567164179106, + "grad_norm": 0.0, + "learning_rate": 4.924473651075797e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2286 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.0277862548828, + "epoch": 17.19402985074627, + "grad_norm": 0.0, + "learning_rate": 4.924314702539422e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2287 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.25, + "epoch": 17.20149253731343, + "grad_norm": 0.0, + "learning_rate": 4.924155589492359e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2288 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.30555725097656, + "epoch": 17.208955223880597, + "grad_norm": 0.0, + "learning_rate": 4.923996311945401e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2289 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.13888549804688, + "epoch": 17.21641791044776, + "grad_norm": 0.0, + "learning_rate": 4.923836869909362e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2290 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.3333282470703, + "epoch": 17.223880597014926, + "grad_norm": 0.0, + "learning_rate": 4.923677263395057e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2291 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.22222900390625, + "epoch": 17.23134328358209, + "grad_norm": 0.0, + "learning_rate": 4.923517492413319e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2292 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.11111450195312, + "epoch": 17.238805970149254, + "grad_norm": 0.0, + "learning_rate": 4.923357556974988e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2293 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.61111450195312, + "epoch": 17.246268656716417, + "grad_norm": 0.0, + "learning_rate": 4.923197457090919e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2294 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.30555725097656, + "epoch": 17.253731343283583, + "grad_norm": 0.0, + "learning_rate": 4.923037192771975e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2295 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.88888549804688, + "epoch": 17.261194029850746, + "grad_norm": 1.3630055428226213, + "learning_rate": 4.922876764029031e-07, + "loss": 0.0006, + "reward": 0.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2296 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.47222900390625, + "epoch": 17.26865671641791, + "grad_norm": 0.0, + "learning_rate": 4.922716170872974e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2297 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.9166717529297, + "epoch": 17.276119402985074, + "grad_norm": 0.0, + "learning_rate": 4.922555413314703e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2298 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.94444274902344, + "epoch": 17.28358208955224, + "grad_norm": 0.3427379998399829, + "learning_rate": 4.922394491365124e-07, + "loss": 0.0001, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2299 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.38888549804688, + "epoch": 17.291044776119403, + "grad_norm": 0.43585813022514247, + "learning_rate": 4.922233405035159e-07, + "loss": 0.0001, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2300 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.44444274902344, + "epoch": 17.298507462686565, + "grad_norm": 0.0, + "learning_rate": 4.922072154335739e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2301 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.8333282470703, + "epoch": 17.30597014925373, + "grad_norm": 0.0, + "learning_rate": 4.921910739277804e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2302 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.0833282470703, + "epoch": 17.313432835820894, + "grad_norm": 0.0, + "learning_rate": 4.921749159872311e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2303 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.58333587646484, + "epoch": 17.32089552238806, + "grad_norm": 0.0, + "learning_rate": 4.921587416130222e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2304 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.94444274902344, + "epoch": 17.328358208955223, + "grad_norm": 0.0, + "learning_rate": 4.921425508062514e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2305 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.7777862548828, + "epoch": 17.33582089552239, + "grad_norm": 0.0, + "learning_rate": 4.921263435680172e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2306 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.55555725097656, + "epoch": 17.34328358208955, + "grad_norm": 0.0, + "learning_rate": 4.921101198994196e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2307 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.3333282470703, + "epoch": 17.350746268656717, + "grad_norm": 0.5798043979618165, + "learning_rate": 4.920938798015595e-07, + "loss": -0.0003, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2308 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.69444274902344, + "epoch": 17.35820895522388, + "grad_norm": 0.0, + "learning_rate": 4.920776232755388e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2309 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.72222900390625, + "epoch": 17.365671641791046, + "grad_norm": 0.0, + "learning_rate": 4.920613503224608e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2310 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.63888549804688, + "epoch": 17.37313432835821, + "grad_norm": 0.0, + "learning_rate": 4.920450609434295e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2311 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.9166717529297, + "epoch": 17.380597014925375, + "grad_norm": 0.0, + "learning_rate": 4.920287551395506e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2312 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.63888549804688, + "epoch": 17.388059701492537, + "grad_norm": 0.0, + "learning_rate": 4.920124329119305e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2313 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.36111450195312, + "epoch": 17.395522388059703, + "grad_norm": 1.217351260772324, + "learning_rate": 4.919960942616767e-07, + "loss": 0.0003, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 2314 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.22222900390625, + "epoch": 17.402985074626866, + "grad_norm": 0.38969600408529587, + "learning_rate": 4.919797391898979e-07, + "loss": -0.0003, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 2315 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.3333282470703, + "epoch": 17.41044776119403, + "grad_norm": 0.0, + "learning_rate": 4.91963367697704e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2316 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.88888549804688, + "epoch": 17.417910447761194, + "grad_norm": 0.0, + "learning_rate": 4.91946979786206e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2317 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.5833282470703, + "epoch": 17.425373134328357, + "grad_norm": 1.1355319418469687, + "learning_rate": 4.91930575456516e-07, + "loss": -0.0005, + "reward": 0.2777777910232544, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.2777777910232544, + "rewards/format_reward": 0.0, + "step": 2318 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.44444274902344, + "epoch": 17.432835820895523, + "grad_norm": 0.0, + "learning_rate": 4.919141547097469e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2319 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.5, + "epoch": 17.440298507462686, + "grad_norm": 0.0, + "learning_rate": 4.918977175470132e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2320 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.0, + "epoch": 17.44776119402985, + "grad_norm": 0.0, + "learning_rate": 4.918812639694304e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2321 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.25, + "epoch": 17.455223880597014, + "grad_norm": 0.0, + "learning_rate": 4.918647939781149e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2322 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.97222900390625, + "epoch": 17.46268656716418, + "grad_norm": 0.0, + "learning_rate": 4.918483075741843e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2323 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.8333282470703, + "epoch": 17.470149253731343, + "grad_norm": 0.5458227857305854, + "learning_rate": 4.918318047587572e-07, + "loss": -0.0005, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2324 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.2777862548828, + "epoch": 17.47761194029851, + "grad_norm": 0.7266631544366766, + "learning_rate": 4.918152855329538e-07, + "loss": 0.0, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2325 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.4166717529297, + "epoch": 17.48507462686567, + "grad_norm": 0.0, + "learning_rate": 4.91798749897895e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2326 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.61111450195312, + "epoch": 17.492537313432837, + "grad_norm": 0.0, + "learning_rate": 4.917821978547025e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2327 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.55555725097656, + "epoch": 17.5, + "grad_norm": 0.0, + "learning_rate": 4.917656294045e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2328 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.13888549804688, + "epoch": 17.507462686567163, + "grad_norm": 0.0, + "learning_rate": 4.917490445484116e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2329 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.69444274902344, + "epoch": 17.51492537313433, + "grad_norm": 0.0, + "learning_rate": 4.917324432875626e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2330 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.11111450195312, + "epoch": 17.52238805970149, + "grad_norm": 0.0, + "learning_rate": 4.917158256230798e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2331 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.5277862548828, + "epoch": 17.529850746268657, + "grad_norm": 0.0, + "learning_rate": 4.916991915560906e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2332 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.36111450195312, + "epoch": 17.53731343283582, + "grad_norm": 0.0, + "learning_rate": 4.91682541087724e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2333 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.4166717529297, + "epoch": 17.544776119402986, + "grad_norm": 0.0, + "learning_rate": 4.916658742191096e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2334 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.69444274902344, + "epoch": 17.55223880597015, + "grad_norm": 3.145643960107291, + "learning_rate": 4.916491909513787e-07, + "loss": 0.0, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 2335 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.8333282470703, + "epoch": 17.559701492537314, + "grad_norm": 0.0, + "learning_rate": 4.916324912856632e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2336 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.7777862548828, + "epoch": 17.567164179104477, + "grad_norm": 0.0, + "learning_rate": 4.916157752230963e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2337 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.91666412353516, + "epoch": 17.574626865671643, + "grad_norm": 1.9222075322451808, + "learning_rate": 4.915990427648125e-07, + "loss": 0.0004, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2338 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.5833282470703, + "epoch": 17.582089552238806, + "grad_norm": 0.0, + "learning_rate": 4.915822939119471e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2339 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.0833282470703, + "epoch": 17.58955223880597, + "grad_norm": 0.7822973377262868, + "learning_rate": 4.915655286656368e-07, + "loss": 0.0, + "reward": 0.8333333134651184, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 2340 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.5277862548828, + "epoch": 17.597014925373134, + "grad_norm": 0.0, + "learning_rate": 4.915487470270191e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2341 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.30555725097656, + "epoch": 17.604477611940297, + "grad_norm": 0.0, + "learning_rate": 4.915319489972328e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2342 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.8333282470703, + "epoch": 17.611940298507463, + "grad_norm": 0.0, + "learning_rate": 4.915151345774178e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2343 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.6666717529297, + "epoch": 17.619402985074625, + "grad_norm": 0.0, + "learning_rate": 4.914983037687152e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2344 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.11111450195312, + "epoch": 17.62686567164179, + "grad_norm": 0.0, + "learning_rate": 4.91481456572267e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2345 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.9166717529297, + "epoch": 17.634328358208954, + "grad_norm": 0.4466474892252231, + "learning_rate": 4.914645929892166e-07, + "loss": -0.0007, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 2346 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.94444274902344, + "epoch": 17.64179104477612, + "grad_norm": 0.0, + "learning_rate": 4.914477130207081e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2347 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.4166717529297, + "epoch": 17.649253731343283, + "grad_norm": 0.0, + "learning_rate": 4.914308166678871e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2348 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.86111450195312, + "epoch": 17.65671641791045, + "grad_norm": 0.0, + "learning_rate": 4.914139039319001e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2349 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.5, + "epoch": 17.66417910447761, + "grad_norm": 0.0, + "learning_rate": 4.91396974813895e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2350 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.61111450195312, + "epoch": 17.671641791044777, + "grad_norm": 0.0, + "learning_rate": 4.913800293150202e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2351 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.5833282470703, + "epoch": 17.67910447761194, + "grad_norm": 0.0, + "learning_rate": 4.913630674364258e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2352 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.3333282470703, + "epoch": 17.686567164179106, + "grad_norm": 0.0, + "learning_rate": 4.913460891792629e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2353 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.1666717529297, + "epoch": 17.69402985074627, + "grad_norm": 0.0, + "learning_rate": 4.913290945446835e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2354 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.55555725097656, + "epoch": 17.701492537313435, + "grad_norm": 0.0, + "learning_rate": 4.913120835338408e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2355 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.61111450195312, + "epoch": 17.708955223880597, + "grad_norm": 0.0, + "learning_rate": 4.912950561478893e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2356 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.4166717529297, + "epoch": 17.71641791044776, + "grad_norm": 0.4304317060516609, + "learning_rate": 4.912780123879843e-07, + "loss": -0.0004, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 2357 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.19444274902344, + "epoch": 17.723880597014926, + "grad_norm": 0.8112258782809068, + "learning_rate": 4.912609522552824e-07, + "loss": 0.0005, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2358 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.7777862548828, + "epoch": 17.73134328358209, + "grad_norm": 0.0, + "learning_rate": 4.912438757509414e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2359 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.47222900390625, + "epoch": 17.738805970149254, + "grad_norm": 0.0, + "learning_rate": 4.912267828761199e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2360 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.47222900390625, + "epoch": 17.746268656716417, + "grad_norm": 0.0, + "learning_rate": 4.912096736319779e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2361 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.69444274902344, + "epoch": 17.753731343283583, + "grad_norm": 0.0, + "learning_rate": 4.911925480196765e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2362 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.38888549804688, + "epoch": 17.761194029850746, + "grad_norm": 0.0, + "learning_rate": 4.911754060403775e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2363 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.44444274902344, + "epoch": 17.76865671641791, + "grad_norm": 0.0, + "learning_rate": 4.911582476952446e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2364 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.2777862548828, + "epoch": 17.776119402985074, + "grad_norm": 0.0, + "learning_rate": 4.911410729854418e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2365 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.5833282470703, + "epoch": 17.78358208955224, + "grad_norm": 1.2362340133102945, + "learning_rate": 4.911238819121346e-07, + "loss": 0.0006, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 2366 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.8333282470703, + "epoch": 17.791044776119403, + "grad_norm": 0.0, + "learning_rate": 4.911066744764897e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2367 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.5, + "epoch": 17.798507462686565, + "grad_norm": 0.0, + "learning_rate": 4.910894506796746e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2368 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.94444274902344, + "epoch": 17.80597014925373, + "grad_norm": 0.2977007707009, + "learning_rate": 4.910722105228582e-07, + "loss": 0.0004, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 2369 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.22222900390625, + "epoch": 17.813432835820894, + "grad_norm": 0.0, + "learning_rate": 4.910549540072103e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2370 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.0277862548828, + "epoch": 17.82089552238806, + "grad_norm": 0.9163099593647013, + "learning_rate": 4.910376811339021e-07, + "loss": -0.0046, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 2371 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.61111450195312, + "epoch": 17.828358208955223, + "grad_norm": 0.0, + "learning_rate": 4.910203919041054e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2372 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.63888549804688, + "epoch": 17.83582089552239, + "grad_norm": 0.0, + "learning_rate": 4.910030863189937e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2373 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.86111450195312, + "epoch": 17.84328358208955, + "grad_norm": 0.0, + "learning_rate": 4.909857643797412e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2374 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.36111450195312, + "epoch": 17.850746268656717, + "grad_norm": 0.0, + "learning_rate": 4.909684260875235e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2375 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.13888549804688, + "epoch": 17.85820895522388, + "grad_norm": 0.0, + "learning_rate": 4.909510714435168e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2376 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.30555725097656, + "epoch": 17.865671641791046, + "grad_norm": 0.0, + "learning_rate": 4.909337004488992e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2377 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.2777862548828, + "epoch": 17.87313432835821, + "grad_norm": 0.0, + "learning_rate": 4.909163131048492e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2378 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.55555725097656, + "epoch": 17.880597014925375, + "grad_norm": 0.0, + "learning_rate": 4.908989094125468e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2379 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.4166717529297, + "epoch": 17.888059701492537, + "grad_norm": 0.0, + "learning_rate": 4.908814893731728e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2380 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.69444274902344, + "epoch": 17.895522388059703, + "grad_norm": 0.0, + "learning_rate": 4.908640529879095e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2381 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.13888549804688, + "epoch": 17.902985074626866, + "grad_norm": 0.0, + "learning_rate": 4.9084660025794e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2382 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.80555725097656, + "epoch": 17.91044776119403, + "grad_norm": 0.0, + "learning_rate": 4.908291311844488e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2383 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.55555725097656, + "epoch": 17.917910447761194, + "grad_norm": 0.0, + "learning_rate": 4.908116457686211e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2384 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.97222900390625, + "epoch": 17.925373134328357, + "grad_norm": 0.0, + "learning_rate": 4.907941440116436e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2385 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.36111450195312, + "epoch": 17.932835820895523, + "grad_norm": 0.0, + "learning_rate": 4.907766259147037e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2386 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.4166717529297, + "epoch": 17.940298507462686, + "grad_norm": 0.5140231968793734, + "learning_rate": 4.907590914789904e-07, + "loss": -0.0003, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2387 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.9166717529297, + "epoch": 17.94776119402985, + "grad_norm": 0.0, + "learning_rate": 4.907415407056936e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2388 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.63888549804688, + "epoch": 17.955223880597014, + "grad_norm": 0.0, + "learning_rate": 4.907239735960041e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2389 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.1666717529297, + "epoch": 17.96268656716418, + "grad_norm": 0.0, + "learning_rate": 4.907063901511141e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2390 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.6666717529297, + "epoch": 17.970149253731343, + "grad_norm": 0.0, + "learning_rate": 4.906887903722166e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2391 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.9166717529297, + "epoch": 17.97761194029851, + "grad_norm": 0.0, + "learning_rate": 4.906711742605061e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2392 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.61111450195312, + "epoch": 17.98507462686567, + "grad_norm": 0.0, + "learning_rate": 4.906535418171778e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2393 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.33334350585938, + "epoch": 17.992537313432837, + "grad_norm": 0.0, + "learning_rate": 4.906358930434285e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2394 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.5277862548828, + "epoch": 18.007462686567163, + "grad_norm": 0.0, + "learning_rate": 4.906182279404557e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2395 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.05555725097656, + "epoch": 18.01492537313433, + "grad_norm": 0.0, + "learning_rate": 4.906005465094581e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2396 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.30555725097656, + "epoch": 18.02238805970149, + "grad_norm": 0.0, + "learning_rate": 4.905828487516355e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2397 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.86111450195312, + "epoch": 18.029850746268657, + "grad_norm": 0.6616954828295778, + "learning_rate": 4.905651346681888e-07, + "loss": 0.0003, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2398 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.8333282470703, + "epoch": 18.03731343283582, + "grad_norm": 0.0, + "learning_rate": 4.905474042603202e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2399 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.5833282470703, + "epoch": 18.044776119402986, + "grad_norm": 0.32269206176215093, + "learning_rate": 4.905296575292328e-07, + "loss": 0.0001, + "reward": 0.3611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.3611111044883728, + "rewards/format_reward": 0.0, + "step": 2400 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.1666717529297, + "epoch": 18.05223880597015, + "grad_norm": 1.7126051558294553, + "learning_rate": 4.905118944761308e-07, + "loss": -0.0004, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2401 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.4166717529297, + "epoch": 18.059701492537314, + "grad_norm": 0.0, + "learning_rate": 4.904941151022198e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2402 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.97222900390625, + "epoch": 18.067164179104477, + "grad_norm": 0.0, + "learning_rate": 4.90476319408706e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2403 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.5277862548828, + "epoch": 18.074626865671643, + "grad_norm": 0.0, + "learning_rate": 4.904585073967972e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2404 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.7777862548828, + "epoch": 18.082089552238806, + "grad_norm": 0.0, + "learning_rate": 4.904406790677019e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2405 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.72222900390625, + "epoch": 18.08955223880597, + "grad_norm": 0.0, + "learning_rate": 4.904228344226301e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2406 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.97222900390625, + "epoch": 18.097014925373134, + "grad_norm": 0.0, + "learning_rate": 4.904049734627927e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2407 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.2777862548828, + "epoch": 18.104477611940297, + "grad_norm": 0.6609516571449969, + "learning_rate": 4.903870961894014e-07, + "loss": 0.0005, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2408 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.36111450195312, + "epoch": 18.111940298507463, + "grad_norm": 0.0, + "learning_rate": 4.903692026036699e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2409 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.44444274902344, + "epoch": 18.119402985074625, + "grad_norm": 0.0, + "learning_rate": 4.90351292706812e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2410 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.30555725097656, + "epoch": 18.12686567164179, + "grad_norm": 0.0, + "learning_rate": 4.90333366500043e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2411 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.11111450195312, + "epoch": 18.134328358208954, + "grad_norm": 0.0, + "learning_rate": 4.903154239845797e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2412 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.38888549804688, + "epoch": 18.14179104477612, + "grad_norm": 0.0, + "learning_rate": 4.902974651616394e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2413 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.88888549804688, + "epoch": 18.149253731343283, + "grad_norm": 0.36229564858162816, + "learning_rate": 4.902794900324409e-07, + "loss": -0.0001, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 2414 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.72222900390625, + "epoch": 18.15671641791045, + "grad_norm": 0.0, + "learning_rate": 4.902614985982038e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2415 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.05555725097656, + "epoch": 18.16417910447761, + "grad_norm": 0.0, + "learning_rate": 4.90243490860149e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2416 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.63888549804688, + "epoch": 18.171641791044777, + "grad_norm": 0.0, + "learning_rate": 4.902254668194987e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2417 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.5, + "epoch": 18.17910447761194, + "grad_norm": 0.0, + "learning_rate": 4.902074264774757e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2418 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.2777862548828, + "epoch": 18.186567164179106, + "grad_norm": 0.0, + "learning_rate": 4.901893698353045e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2419 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.30555725097656, + "epoch": 18.19402985074627, + "grad_norm": 0.0, + "learning_rate": 4.9017129689421e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2420 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.02777862548828, + "epoch": 18.20149253731343, + "grad_norm": 0.0, + "learning_rate": 4.90153207655419e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2421 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.30555725097656, + "epoch": 18.208955223880597, + "grad_norm": 0.0, + "learning_rate": 4.901351021201588e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2422 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.25, + "epoch": 18.21641791044776, + "grad_norm": 0.0, + "learning_rate": 4.90116980289658e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2423 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.05555725097656, + "epoch": 18.223880597014926, + "grad_norm": 0.0, + "learning_rate": 4.900988421651464e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2424 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.75, + "epoch": 18.23134328358209, + "grad_norm": 0.9000191153259987, + "learning_rate": 4.900806877478548e-07, + "loss": 0.0008, + "reward": 0.944444477558136, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 0.0, + "step": 2425 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.1666717529297, + "epoch": 18.238805970149254, + "grad_norm": 0.0, + "learning_rate": 4.900625170390152e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2426 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.5277862548828, + "epoch": 18.246268656716417, + "grad_norm": 0.35991995801294785, + "learning_rate": 4.900443300398606e-07, + "loss": -0.0001, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2427 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.88888549804688, + "epoch": 18.253731343283583, + "grad_norm": 0.0, + "learning_rate": 4.90026126751625e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2428 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.61111450195312, + "epoch": 18.261194029850746, + "grad_norm": 0.0, + "learning_rate": 4.900079071755439e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2429 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.05555725097656, + "epoch": 18.26865671641791, + "grad_norm": 0.0, + "learning_rate": 4.899896713128535e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2430 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.61111450195312, + "epoch": 18.276119402985074, + "grad_norm": 0.0, + "learning_rate": 4.899714191647914e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2431 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.75, + "epoch": 18.28358208955224, + "grad_norm": 0.0, + "learning_rate": 4.899531507325959e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2432 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.0, + "epoch": 18.291044776119403, + "grad_norm": 0.4601230601836358, + "learning_rate": 4.899348660175068e-07, + "loss": -0.0001, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 2433 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.1666717529297, + "epoch": 18.298507462686565, + "grad_norm": 0.0, + "learning_rate": 4.89916565020765e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2434 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.75, + "epoch": 18.30597014925373, + "grad_norm": 0.0, + "learning_rate": 4.898982477436123e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2435 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.13888549804688, + "epoch": 18.313432835820894, + "grad_norm": 0.0, + "learning_rate": 4.898799141872916e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2436 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.80555725097656, + "epoch": 18.32089552238806, + "grad_norm": 0.0, + "learning_rate": 4.898615643530471e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2437 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.75, + "epoch": 18.328358208955223, + "grad_norm": 0.0, + "learning_rate": 4.89843198242124e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2438 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.3333282470703, + "epoch": 18.33582089552239, + "grad_norm": 0.0, + "learning_rate": 4.898248158557685e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2439 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.25, + "epoch": 18.34328358208955, + "grad_norm": 0.0, + "learning_rate": 4.898064171952281e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2440 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.97222900390625, + "epoch": 18.350746268656717, + "grad_norm": 0.0, + "learning_rate": 4.897880022617512e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2441 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.19444274902344, + "epoch": 18.35820895522388, + "grad_norm": 0.0, + "learning_rate": 4.897695710565875e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2442 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.38888549804688, + "epoch": 18.365671641791046, + "grad_norm": 0.0, + "learning_rate": 4.897511235809878e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2443 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.97222900390625, + "epoch": 18.37313432835821, + "grad_norm": 0.9391819371115834, + "learning_rate": 4.897326598362038e-07, + "loss": -0.0001, + "reward": 0.9166666865348816, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 0.0, + "step": 2444 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.66666412353516, + "epoch": 18.380597014925375, + "grad_norm": 0.0, + "learning_rate": 4.897141798234884e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2445 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.61111450195312, + "epoch": 18.388059701492537, + "grad_norm": 0.0, + "learning_rate": 4.896956835440956e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2446 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.55555725097656, + "epoch": 18.395522388059703, + "grad_norm": 0.0, + "learning_rate": 4.896771709992807e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2447 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.72222900390625, + "epoch": 18.402985074626866, + "grad_norm": 0.7896884945050722, + "learning_rate": 4.896586421902998e-07, + "loss": 0.0, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2448 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.25, + "epoch": 18.41044776119403, + "grad_norm": 0.0, + "learning_rate": 4.896400971184102e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2449 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.0, + "epoch": 18.417910447761194, + "grad_norm": 0.0, + "learning_rate": 4.896215357848705e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2450 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.5833282470703, + "epoch": 18.425373134328357, + "grad_norm": 0.452290615697219, + "learning_rate": 4.896029581909403e-07, + "loss": -0.0007, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 2451 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.11111450195312, + "epoch": 18.432835820895523, + "grad_norm": 0.0, + "learning_rate": 4.895843643378799e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2452 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.9166717529297, + "epoch": 18.440298507462686, + "grad_norm": 0.0, + "learning_rate": 4.895657542269514e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2453 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.97222900390625, + "epoch": 18.44776119402985, + "grad_norm": 0.0, + "learning_rate": 4.895471278594175e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2454 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.25, + "epoch": 18.455223880597014, + "grad_norm": 0.0, + "learning_rate": 4.895284852365421e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2455 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.97222900390625, + "epoch": 18.46268656716418, + "grad_norm": 0.0, + "learning_rate": 4.895098263595905e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2456 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.61111450195312, + "epoch": 18.470149253731343, + "grad_norm": 0.0, + "learning_rate": 4.894911512298286e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2457 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.94444274902344, + "epoch": 18.47761194029851, + "grad_norm": 0.0, + "learning_rate": 4.894724598485239e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2458 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.97222900390625, + "epoch": 18.48507462686567, + "grad_norm": 0.0, + "learning_rate": 4.894537522169446e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2459 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.94444274902344, + "epoch": 18.492537313432837, + "grad_norm": 0.0, + "learning_rate": 4.894350283363602e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2460 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.25, + "epoch": 18.5, + "grad_norm": 0.0, + "learning_rate": 4.894162882080414e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2461 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.97222900390625, + "epoch": 18.507462686567163, + "grad_norm": 0.8497285288390846, + "learning_rate": 4.893975318332597e-07, + "loss": -0.0008, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2462 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.36111450195312, + "epoch": 18.51492537313433, + "grad_norm": 1.9798262596425007, + "learning_rate": 4.893787592132879e-07, + "loss": 0.0004, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2463 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.6666717529297, + "epoch": 18.52238805970149, + "grad_norm": 0.0, + "learning_rate": 4.893599703494001e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2464 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.05555725097656, + "epoch": 18.529850746268657, + "grad_norm": 0.0, + "learning_rate": 4.893411652428712e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2465 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.97222900390625, + "epoch": 18.53731343283582, + "grad_norm": 0.0, + "learning_rate": 4.893223438949771e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2466 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.6666717529297, + "epoch": 18.544776119402986, + "grad_norm": 0.0, + "learning_rate": 4.893035063069952e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2467 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.3333282470703, + "epoch": 18.55223880597015, + "grad_norm": 1.4966659926740928, + "learning_rate": 4.892846524802036e-07, + "loss": 0.0004, + "reward": 0.8333333134651184, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 2468 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.05555725097656, + "epoch": 18.559701492537314, + "grad_norm": 0.0, + "learning_rate": 4.892657824158819e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2469 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.47222900390625, + "epoch": 18.567164179104477, + "grad_norm": 0.0, + "learning_rate": 4.892468961153105e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2470 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.44444274902344, + "epoch": 18.574626865671643, + "grad_norm": 0.0, + "learning_rate": 4.89227993579771e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2471 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.25, + "epoch": 18.582089552238806, + "grad_norm": 0.0, + "learning_rate": 4.89209074810546e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2472 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.55555725097656, + "epoch": 18.58955223880597, + "grad_norm": 0.9366431155030844, + "learning_rate": 4.891901398089196e-07, + "loss": -0.0, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 2473 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.1666717529297, + "epoch": 18.597014925373134, + "grad_norm": 0.0, + "learning_rate": 4.891711885761764e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2474 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.36111450195312, + "epoch": 18.604477611940297, + "grad_norm": 0.0, + "learning_rate": 4.891522211136026e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2475 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.22222900390625, + "epoch": 18.611940298507463, + "grad_norm": 0.0, + "learning_rate": 4.891332374224851e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2476 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.44444274902344, + "epoch": 18.619402985074625, + "grad_norm": 0.0, + "learning_rate": 4.891142375041123e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2477 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.5833282470703, + "epoch": 18.62686567164179, + "grad_norm": 0.0, + "learning_rate": 4.890952213597734e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2478 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.7777862548828, + "epoch": 18.634328358208954, + "grad_norm": 0.0, + "learning_rate": 4.890761889907589e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2479 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.0833282470703, + "epoch": 18.64179104477612, + "grad_norm": 0.0, + "learning_rate": 4.890571403983602e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2480 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.36111450195312, + "epoch": 18.649253731343283, + "grad_norm": 1.8537719495358453, + "learning_rate": 4.8903807558387e-07, + "loss": -0.0005, + "reward": 0.694444477558136, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 2481 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.6666717529297, + "epoch": 18.65671641791045, + "grad_norm": 0.0, + "learning_rate": 4.89018994548582e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2482 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.11111450195312, + "epoch": 18.66417910447761, + "grad_norm": 0.64365166811782, + "learning_rate": 4.889998972937909e-07, + "loss": 0.0003, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 2483 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.38888549804688, + "epoch": 18.671641791044777, + "grad_norm": 0.0, + "learning_rate": 4.889807838207929e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2484 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.75, + "epoch": 18.67910447761194, + "grad_norm": 0.0, + "learning_rate": 4.889616541308846e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2485 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.9166717529297, + "epoch": 18.686567164179106, + "grad_norm": 0.6412503310770927, + "learning_rate": 4.889425082253645e-07, + "loss": 0.0019, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2486 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.22222137451172, + "epoch": 18.69402985074627, + "grad_norm": 0.0, + "learning_rate": 4.889233461055316e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2487 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.44444274902344, + "epoch": 18.701492537313435, + "grad_norm": 0.0, + "learning_rate": 4.889041677726863e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2488 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.5, + "epoch": 18.708955223880597, + "grad_norm": 0.0, + "learning_rate": 4.888849732281299e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2489 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.7777862548828, + "epoch": 18.71641791044776, + "grad_norm": 0.38210108124231285, + "learning_rate": 4.888657624731651e-07, + "loss": 0.0004, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2490 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.5, + "epoch": 18.723880597014926, + "grad_norm": 0.0, + "learning_rate": 4.888465355090953e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2491 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.86111450195312, + "epoch": 18.73134328358209, + "grad_norm": 0.0, + "learning_rate": 4.888272923372254e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2492 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.4166717529297, + "epoch": 18.738805970149254, + "grad_norm": 0.0, + "learning_rate": 4.888080329588612e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2493 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.75, + "epoch": 18.746268656716417, + "grad_norm": 0.0, + "learning_rate": 4.887887573753095e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2494 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.13888549804688, + "epoch": 18.753731343283583, + "grad_norm": 0.8025105715846457, + "learning_rate": 4.887694655878783e-07, + "loss": 0.0, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2495 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.7777862548828, + "epoch": 18.761194029850746, + "grad_norm": 1.828290525239786, + "learning_rate": 4.88750157597877e-07, + "loss": -0.0016, + "reward": 0.694444477558136, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 2496 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.5, + "epoch": 18.76865671641791, + "grad_norm": 0.0, + "learning_rate": 4.887308334066154e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2497 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.69444274902344, + "epoch": 18.776119402985074, + "grad_norm": 0.0, + "learning_rate": 4.887114930154051e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2498 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.5277862548828, + "epoch": 18.78358208955224, + "grad_norm": 0.0, + "learning_rate": 4.886921364255585e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2499 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.13888549804688, + "epoch": 18.791044776119403, + "grad_norm": 0.5110610092286246, + "learning_rate": 4.886727636383889e-07, + "loss": 0.0004, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 2500 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.52777862548828, + "epoch": 18.798507462686565, + "grad_norm": 0.0, + "learning_rate": 4.886533746552111e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2501 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.3888931274414, + "epoch": 18.80597014925373, + "grad_norm": 0.0, + "learning_rate": 4.886339694773408e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2502 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.2777862548828, + "epoch": 18.813432835820894, + "grad_norm": 0.0, + "learning_rate": 4.886145481060948e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2503 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.69444274902344, + "epoch": 18.82089552238806, + "grad_norm": 0.0, + "learning_rate": 4.885951105427909e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2504 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.8333282470703, + "epoch": 18.828358208955223, + "grad_norm": 0.0, + "learning_rate": 4.885756567887483e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2505 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.88888549804688, + "epoch": 18.83582089552239, + "grad_norm": 0.0, + "learning_rate": 4.885561868452869e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2506 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.30555725097656, + "epoch": 18.84328358208955, + "grad_norm": 0.0, + "learning_rate": 4.88536700713728e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2507 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.69444274902344, + "epoch": 18.850746268656717, + "grad_norm": 1.7398411763927253, + "learning_rate": 4.885171983953938e-07, + "loss": -0.0003, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 2508 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.5, + "epoch": 18.85820895522388, + "grad_norm": 0.0, + "learning_rate": 4.88497679891608e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2509 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.25, + "epoch": 18.865671641791046, + "grad_norm": 0.0, + "learning_rate": 4.884781452036948e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2510 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.11111450195312, + "epoch": 18.87313432835821, + "grad_norm": 0.0, + "learning_rate": 4.884585943329798e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2511 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.9166717529297, + "epoch": 18.880597014925375, + "grad_norm": 0.0, + "learning_rate": 4.884390272807899e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2512 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.5, + "epoch": 18.888059701492537, + "grad_norm": 0.0, + "learning_rate": 4.884194440484528e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2513 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.11111450195312, + "epoch": 18.895522388059703, + "grad_norm": 0.0, + "learning_rate": 4.883998446372973e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2514 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.58333587646484, + "epoch": 18.902985074626866, + "grad_norm": 0.0, + "learning_rate": 4.883802290486535e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2515 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.22222900390625, + "epoch": 18.91044776119403, + "grad_norm": 0.0, + "learning_rate": 4.883605972838524e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2516 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.36111450195312, + "epoch": 18.917910447761194, + "grad_norm": 0.5589923491407405, + "learning_rate": 4.883409493442263e-07, + "loss": 0.0005, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2517 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.72222900390625, + "epoch": 18.925373134328357, + "grad_norm": 0.0, + "learning_rate": 4.883212852311085e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2518 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.8888931274414, + "epoch": 18.932835820895523, + "grad_norm": 0.0, + "learning_rate": 4.883016049458331e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2519 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.9166717529297, + "epoch": 18.940298507462686, + "grad_norm": 0.0, + "learning_rate": 4.88281908489736e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2520 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.0833282470703, + "epoch": 18.94776119402985, + "grad_norm": 0.0, + "learning_rate": 4.882621958641534e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2521 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.44444274902344, + "epoch": 18.955223880597014, + "grad_norm": 0.0, + "learning_rate": 4.882424670704232e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2522 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.13888549804688, + "epoch": 18.96268656716418, + "grad_norm": 0.0, + "learning_rate": 4.88222722109884e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2523 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.9166717529297, + "epoch": 18.970149253731343, + "grad_norm": 0.0, + "learning_rate": 4.882029609838758e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2524 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.08333587646484, + "epoch": 18.97761194029851, + "grad_norm": 0.0, + "learning_rate": 4.881831836937395e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2525 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.75, + "epoch": 18.98507462686567, + "grad_norm": 0.0, + "learning_rate": 4.881633902408172e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2526 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.5, + "epoch": 18.992537313432837, + "grad_norm": 0.0, + "learning_rate": 4.881435806264521e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2527 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.75, + "epoch": 19.007462686567163, + "grad_norm": 0.0, + "learning_rate": 4.881237548519883e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2528 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.2777862548828, + "epoch": 19.01492537313433, + "grad_norm": 0.0, + "learning_rate": 4.881039129187713e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2529 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.75, + "epoch": 19.02238805970149, + "grad_norm": 0.0, + "learning_rate": 4.880840548281474e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2530 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.19444274902344, + "epoch": 19.029850746268657, + "grad_norm": 0.0, + "learning_rate": 4.880641805814643e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2531 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.97222900390625, + "epoch": 19.03731343283582, + "grad_norm": 1.7027890376661794, + "learning_rate": 4.880442901800705e-07, + "loss": 0.0002, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 2532 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.75, + "epoch": 19.044776119402986, + "grad_norm": 0.38553607324651673, + "learning_rate": 4.88024383625316e-07, + "loss": 0.0, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 2533 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.69444274902344, + "epoch": 19.05223880597015, + "grad_norm": 0.0, + "learning_rate": 4.880044609185512e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2534 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.5277862548828, + "epoch": 19.059701492537314, + "grad_norm": 0.0, + "learning_rate": 4.879845220611284e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2535 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.75, + "epoch": 19.067164179104477, + "grad_norm": 0.0, + "learning_rate": 4.879645670544004e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2536 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.7777862548828, + "epoch": 19.074626865671643, + "grad_norm": 0.0, + "learning_rate": 4.879445958997215e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2537 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.94444274902344, + "epoch": 19.082089552238806, + "grad_norm": 0.0, + "learning_rate": 4.879246085984467e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2538 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.30555725097656, + "epoch": 19.08955223880597, + "grad_norm": 0.0, + "learning_rate": 4.879046051519325e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2539 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.5277862548828, + "epoch": 19.097014925373134, + "grad_norm": 0.0, + "learning_rate": 4.878845855615364e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2540 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.13888549804688, + "epoch": 19.104477611940297, + "grad_norm": 0.0, + "learning_rate": 4.878645498286165e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2541 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.3333282470703, + "epoch": 19.111940298507463, + "grad_norm": 0.0, + "learning_rate": 4.878444979545327e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2542 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.94444274902344, + "epoch": 19.119402985074625, + "grad_norm": 0.0, + "learning_rate": 4.878244299406457e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2543 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.80555725097656, + "epoch": 19.12686567164179, + "grad_norm": 0.0, + "learning_rate": 4.878043457883172e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2544 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.97222900390625, + "epoch": 19.134328358208954, + "grad_norm": 0.0, + "learning_rate": 4.877842454989101e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2545 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.19444274902344, + "epoch": 19.14179104477612, + "grad_norm": 0.0, + "learning_rate": 4.877641290737883e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2546 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.38888549804688, + "epoch": 19.149253731343283, + "grad_norm": 0.0, + "learning_rate": 4.877439965143171e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2547 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.86111450195312, + "epoch": 19.15671641791045, + "grad_norm": 0.0, + "learning_rate": 4.877238478218625e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2548 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.0833282470703, + "epoch": 19.16417910447761, + "grad_norm": 0.8456339634647048, + "learning_rate": 4.877036829977917e-07, + "loss": -0.0008, + "reward": 0.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2549 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.22222900390625, + "epoch": 19.171641791044777, + "grad_norm": 0.0, + "learning_rate": 4.876835020434732e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2550 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.86111450195312, + "epoch": 19.17910447761194, + "grad_norm": 1.1065738214742942, + "learning_rate": 4.876633049602764e-07, + "loss": 0.0008, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 2551 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.0, + "epoch": 19.186567164179106, + "grad_norm": 0.0, + "learning_rate": 4.876430917495718e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2552 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.22222137451172, + "epoch": 19.19402985074627, + "grad_norm": 0.0, + "learning_rate": 4.876228624127311e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2553 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.5833282470703, + "epoch": 19.20149253731343, + "grad_norm": 0.0, + "learning_rate": 4.87602616951127e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2554 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.11111450195312, + "epoch": 19.208955223880597, + "grad_norm": 0.0, + "learning_rate": 4.875823553661333e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2555 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.86111450195312, + "epoch": 19.21641791044776, + "grad_norm": 0.5232929697244029, + "learning_rate": 4.87562077659125e-07, + "loss": 0.0001, + "reward": 0.944444477558136, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 0.0, + "step": 2556 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.4166717529297, + "epoch": 19.223880597014926, + "grad_norm": 0.0, + "learning_rate": 4.875417838314782e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2557 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.88888549804688, + "epoch": 19.23134328358209, + "grad_norm": 0.0, + "learning_rate": 4.875214738845699e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2558 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.61111450195312, + "epoch": 19.238805970149254, + "grad_norm": 0.0, + "learning_rate": 4.875011478197781e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2559 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.38888549804688, + "epoch": 19.246268656716417, + "grad_norm": 0.0, + "learning_rate": 4.874808056384825e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2560 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.0833282470703, + "epoch": 19.253731343283583, + "grad_norm": 0.0, + "learning_rate": 4.874604473420632e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2561 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.6666717529297, + "epoch": 19.261194029850746, + "grad_norm": 2.1423557287962662, + "learning_rate": 4.874400729319019e-07, + "loss": 0.0008, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2562 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.5277862548828, + "epoch": 19.26865671641791, + "grad_norm": 0.0, + "learning_rate": 4.874196824093809e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2563 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.25, + "epoch": 19.276119402985074, + "grad_norm": 0.279414964040098, + "learning_rate": 4.873992757758841e-07, + "loss": -0.0003, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2564 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.2777862548828, + "epoch": 19.28358208955224, + "grad_norm": 0.0, + "learning_rate": 4.873788530327962e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2565 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.3333282470703, + "epoch": 19.291044776119403, + "grad_norm": 0.49468852458390167, + "learning_rate": 4.873584141815032e-07, + "loss": 0.0003, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2566 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.72222900390625, + "epoch": 19.298507462686565, + "grad_norm": 0.0, + "learning_rate": 4.873379592233917e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2567 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.55555725097656, + "epoch": 19.30597014925373, + "grad_norm": 0.0, + "learning_rate": 4.8731748815985e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2568 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.9166717529297, + "epoch": 19.313432835820894, + "grad_norm": 0.0, + "learning_rate": 4.872970009922673e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2569 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.9166717529297, + "epoch": 19.32089552238806, + "grad_norm": 0.0, + "learning_rate": 4.872764977220337e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2570 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.86111450195312, + "epoch": 19.328358208955223, + "grad_norm": 0.45911191301380166, + "learning_rate": 4.872559783505405e-07, + "loss": 0.0004, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 2571 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.52777862548828, + "epoch": 19.33582089552239, + "grad_norm": 0.0, + "learning_rate": 4.872354428791803e-07, + "loss": 0.0, + "reward": 0.2222222238779068, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.2222222238779068, + "rewards/format_reward": 0.0, + "step": 2572 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.69444274902344, + "epoch": 19.34328358208955, + "grad_norm": 0.0, + "learning_rate": 4.872148913093464e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2573 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.80555725097656, + "epoch": 19.350746268656717, + "grad_norm": 0.0, + "learning_rate": 4.871943236424334e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2574 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.0, + "epoch": 19.35820895522388, + "grad_norm": 0.541991240955792, + "learning_rate": 4.871737398798372e-07, + "loss": 0.0002, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 2575 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.44444274902344, + "epoch": 19.365671641791046, + "grad_norm": 0.0, + "learning_rate": 4.871531400229544e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2576 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.38888549804688, + "epoch": 19.37313432835821, + "grad_norm": 0.0, + "learning_rate": 4.871325240731829e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2577 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.25, + "epoch": 19.380597014925375, + "grad_norm": 0.0, + "learning_rate": 4.871118920319218e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2578 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.80555725097656, + "epoch": 19.388059701492537, + "grad_norm": 0.0, + "learning_rate": 4.87091243900571e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2579 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.69444274902344, + "epoch": 19.395522388059703, + "grad_norm": 0.0, + "learning_rate": 4.870705796805317e-07, + "loss": 0.0, + "reward": 0.3333333432674408, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 2580 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.44444274902344, + "epoch": 19.402985074626866, + "grad_norm": 0.0, + "learning_rate": 4.870498993732063e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2581 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.30555725097656, + "epoch": 19.41044776119403, + "grad_norm": 0.0, + "learning_rate": 4.870292029799979e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2582 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.72222900390625, + "epoch": 19.417910447761194, + "grad_norm": 0.0, + "learning_rate": 4.870084905023111e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2583 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.1666717529297, + "epoch": 19.425373134328357, + "grad_norm": 0.0, + "learning_rate": 4.869877619415513e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2584 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.63888549804688, + "epoch": 19.432835820895523, + "grad_norm": 0.0, + "learning_rate": 4.869670172991252e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2585 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.30555725097656, + "epoch": 19.440298507462686, + "grad_norm": 0.0, + "learning_rate": 4.869462565764404e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2586 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.69444274902344, + "epoch": 19.44776119402985, + "grad_norm": 1.2510565505418858, + "learning_rate": 4.869254797749058e-07, + "loss": -0.0031, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 2587 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.9166717529297, + "epoch": 19.455223880597014, + "grad_norm": 0.0, + "learning_rate": 4.869046868959313e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2588 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.38888549804688, + "epoch": 19.46268656716418, + "grad_norm": 0.0, + "learning_rate": 4.868838779409279e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2589 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.77777862548828, + "epoch": 19.470149253731343, + "grad_norm": 0.0, + "learning_rate": 4.868630529113075e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2590 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.38888549804688, + "epoch": 19.47761194029851, + "grad_norm": 0.0, + "learning_rate": 4.868422118084834e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2591 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.5, + "epoch": 19.48507462686567, + "grad_norm": 0.0, + "learning_rate": 4.868213546338698e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2592 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.5, + "epoch": 19.492537313432837, + "grad_norm": 0.0, + "learning_rate": 4.868004813888819e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2593 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.05555725097656, + "epoch": 19.5, + "grad_norm": 0.37026887837516265, + "learning_rate": 4.867795920749364e-07, + "loss": 0.0008, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 2594 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.75, + "epoch": 19.507462686567163, + "grad_norm": 0.0, + "learning_rate": 4.867586866934506e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2595 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.25, + "epoch": 19.51492537313433, + "grad_norm": 1.0563650874698627, + "learning_rate": 4.867377652458434e-07, + "loss": -0.0003, + "reward": 0.5277777910232544, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 2596 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.5277862548828, + "epoch": 19.52238805970149, + "grad_norm": 0.5260212947820636, + "learning_rate": 4.867168277335342e-07, + "loss": 0.0005, + "reward": 0.8333333134651184, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 2597 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.55555725097656, + "epoch": 19.529850746268657, + "grad_norm": 0.0, + "learning_rate": 4.866958741579439e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2598 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.5833282470703, + "epoch": 19.53731343283582, + "grad_norm": 0.8945249773365775, + "learning_rate": 4.866749045204943e-07, + "loss": 0.0009, + "reward": 0.4166666567325592, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 2599 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.7777862548828, + "epoch": 19.544776119402986, + "grad_norm": 0.0, + "learning_rate": 4.866539188226085e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2600 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.44444274902344, + "epoch": 19.55223880597015, + "grad_norm": 0.0, + "learning_rate": 4.866329170657105e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2601 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.30555725097656, + "epoch": 19.559701492537314, + "grad_norm": 0.0, + "learning_rate": 4.866118992512255e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2602 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.80555725097656, + "epoch": 19.567164179104477, + "grad_norm": 0.0, + "learning_rate": 4.865908653805797e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2603 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.22222900390625, + "epoch": 19.574626865671643, + "grad_norm": 0.0, + "learning_rate": 4.865698154552004e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2604 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.75, + "epoch": 19.582089552238806, + "grad_norm": 0.45234223006967805, + "learning_rate": 4.86548749476516e-07, + "loss": -0.0005, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 2605 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.0, + "epoch": 19.58955223880597, + "grad_norm": 0.0, + "learning_rate": 4.865276674459562e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2606 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.38888549804688, + "epoch": 19.597014925373134, + "grad_norm": 0.0, + "learning_rate": 4.865065693649514e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2607 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.5833282470703, + "epoch": 19.604477611940297, + "grad_norm": 0.0, + "learning_rate": 4.864854552349333e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2608 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.0277862548828, + "epoch": 19.611940298507463, + "grad_norm": 0.26290321526934285, + "learning_rate": 4.864643250573347e-07, + "loss": 0.0002, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2609 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.44444274902344, + "epoch": 19.619402985074625, + "grad_norm": 0.3281392394775126, + "learning_rate": 4.864431788335895e-07, + "loss": 0.0004, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2610 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.0833282470703, + "epoch": 19.62686567164179, + "grad_norm": 0.0, + "learning_rate": 4.864220165651326e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2611 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.52777862548828, + "epoch": 19.634328358208954, + "grad_norm": 0.0, + "learning_rate": 4.864008382534002e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2612 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.36111450195312, + "epoch": 19.64179104477612, + "grad_norm": 1.5617811005783153, + "learning_rate": 4.863796438998292e-07, + "loss": 0.0005, + "reward": 0.6111111044883728, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 2613 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.7777862548828, + "epoch": 19.649253731343283, + "grad_norm": 0.6936347868696717, + "learning_rate": 4.863584335058579e-07, + "loss": -0.0007, + "reward": 0.5, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2614 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.0, + "epoch": 19.65671641791045, + "grad_norm": 0.0, + "learning_rate": 4.863372070729257e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2615 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.41666412353516, + "epoch": 19.66417910447761, + "grad_norm": 0.0, + "learning_rate": 4.86315964602473e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2616 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.72222900390625, + "epoch": 19.671641791044777, + "grad_norm": 0.0, + "learning_rate": 4.862947060959412e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2617 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.13888549804688, + "epoch": 19.67910447761194, + "grad_norm": 0.0, + "learning_rate": 4.862734315547729e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2618 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.38888549804688, + "epoch": 19.686567164179106, + "grad_norm": 0.0, + "learning_rate": 4.862521409804117e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2619 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.0, + "epoch": 19.69402985074627, + "grad_norm": 0.41225325195071544, + "learning_rate": 4.862308343743023e-07, + "loss": -0.0006, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 2620 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.47222900390625, + "epoch": 19.701492537313435, + "grad_norm": 0.0, + "learning_rate": 4.862095117378908e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2621 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.3333282470703, + "epoch": 19.708955223880597, + "grad_norm": 0.0, + "learning_rate": 4.86188173072624e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2622 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.0, + "epoch": 19.71641791044776, + "grad_norm": 0.0, + "learning_rate": 4.861668183799498e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2623 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.47222900390625, + "epoch": 19.723880597014926, + "grad_norm": 0.0, + "learning_rate": 4.861454476613174e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2624 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.47222900390625, + "epoch": 19.73134328358209, + "grad_norm": 0.0, + "learning_rate": 4.86124060918177e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2625 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.05555725097656, + "epoch": 19.738805970149254, + "grad_norm": 0.0, + "learning_rate": 4.861026581519797e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2626 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.88888549804688, + "epoch": 19.746268656716417, + "grad_norm": 0.0, + "learning_rate": 4.860812393641781e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2627 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.1666717529297, + "epoch": 19.753731343283583, + "grad_norm": 0.0, + "learning_rate": 4.860598045562255e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2628 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.8333282470703, + "epoch": 19.761194029850746, + "grad_norm": 0.0, + "learning_rate": 4.860383537295767e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2629 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.55555725097656, + "epoch": 19.76865671641791, + "grad_norm": 0.0, + "learning_rate": 4.860168868856869e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2630 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.0833282470703, + "epoch": 19.776119402985074, + "grad_norm": 0.0, + "learning_rate": 4.859954040260132e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2631 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.94444274902344, + "epoch": 19.78358208955224, + "grad_norm": 0.0, + "learning_rate": 4.859739051520131e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2632 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.1666717529297, + "epoch": 19.791044776119403, + "grad_norm": 0.0, + "learning_rate": 4.859523902651455e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2633 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.0833282470703, + "epoch": 19.798507462686565, + "grad_norm": 0.0, + "learning_rate": 4.859308593668707e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2634 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.55555725097656, + "epoch": 19.80597014925373, + "grad_norm": 0.0, + "learning_rate": 4.859093124586495e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2635 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.3333282470703, + "epoch": 19.813432835820894, + "grad_norm": 0.0, + "learning_rate": 4.85887749541944e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2636 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.2777862548828, + "epoch": 19.82089552238806, + "grad_norm": 0.0, + "learning_rate": 4.858661706182176e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2637 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.5833282470703, + "epoch": 19.828358208955223, + "grad_norm": 0.0, + "learning_rate": 4.858445756889344e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2638 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.36111450195312, + "epoch": 19.83582089552239, + "grad_norm": 0.47859332622868717, + "learning_rate": 4.8582296475556e-07, + "loss": -0.0019, + "reward": 0.6388888955116272, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2639 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.2777862548828, + "epoch": 19.84328358208955, + "grad_norm": 0.0, + "learning_rate": 4.858013378195609e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2640 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.55555725097656, + "epoch": 19.850746268656717, + "grad_norm": 0.0, + "learning_rate": 4.857796948824044e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2641 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.0833282470703, + "epoch": 19.85820895522388, + "grad_norm": 0.0, + "learning_rate": 4.857580359455595e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2642 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.1666717529297, + "epoch": 19.865671641791046, + "grad_norm": 0.0, + "learning_rate": 4.857363610104957e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2643 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.44444274902344, + "epoch": 19.87313432835821, + "grad_norm": 1.2205149550300662, + "learning_rate": 4.857146700786841e-07, + "loss": 0.0006, + "reward": 0.8888888955116272, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2644 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.0833282470703, + "epoch": 19.880597014925375, + "grad_norm": 0.0, + "learning_rate": 4.856929631515964e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2645 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.25, + "epoch": 19.888059701492537, + "grad_norm": 0.0, + "learning_rate": 4.856712402307054e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2646 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.4166717529297, + "epoch": 19.895522388059703, + "grad_norm": 0.4850026685135545, + "learning_rate": 4.856495013174857e-07, + "loss": -0.0006, + "reward": 0.944444477558136, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 0.0, + "step": 2647 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.80555725097656, + "epoch": 19.902985074626866, + "grad_norm": 0.6041636652591991, + "learning_rate": 4.856277464134122e-07, + "loss": 0.0015, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 2648 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.0833282470703, + "epoch": 19.91044776119403, + "grad_norm": 0.0, + "learning_rate": 4.85605975519961e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2649 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.1666717529297, + "epoch": 19.917910447761194, + "grad_norm": 0.0, + "learning_rate": 4.855841886386099e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2650 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.30555725097656, + "epoch": 19.925373134328357, + "grad_norm": 0.0, + "learning_rate": 4.855623857708368e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2651 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.72222900390625, + "epoch": 19.932835820895523, + "grad_norm": 0.0, + "learning_rate": 4.855405669181215e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2652 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.5, + "epoch": 19.940298507462686, + "grad_norm": 1.5652896190538432, + "learning_rate": 4.855187320819446e-07, + "loss": -0.0031, + "reward": 0.944444477558136, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 0.0, + "step": 2653 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.5833282470703, + "epoch": 19.94776119402985, + "grad_norm": 0.0, + "learning_rate": 4.854968812637877e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2654 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.25, + "epoch": 19.955223880597014, + "grad_norm": 0.0, + "learning_rate": 4.854750144651336e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2655 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.19444274902344, + "epoch": 19.96268656716418, + "grad_norm": 0.0, + "learning_rate": 4.854531316874662e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2656 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.9166717529297, + "epoch": 19.970149253731343, + "grad_norm": 0.0, + "learning_rate": 4.854312329322702e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2657 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.55555725097656, + "epoch": 19.97761194029851, + "grad_norm": 2.881551338069291, + "learning_rate": 4.85409318201032e-07, + "loss": 0.0007, + "reward": 0.5, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2658 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.0833282470703, + "epoch": 19.98507462686567, + "grad_norm": 0.7384495203756511, + "learning_rate": 4.853873874952385e-07, + "loss": 0.0, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2659 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.08334350585938, + "epoch": 19.992537313432837, + "grad_norm": 0.0, + "learning_rate": 4.853654408163778e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2660 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.9166717529297, + "epoch": 20.007462686567163, + "grad_norm": 4.846389663439705, + "learning_rate": 4.853434781659394e-07, + "loss": 0.0066, + "reward": 0.694444477558136, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 2661 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.19444274902344, + "epoch": 20.01492537313433, + "grad_norm": 2.0542006477214336, + "learning_rate": 4.853214995454134e-07, + "loss": -0.0003, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 2662 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.44444274902344, + "epoch": 20.02238805970149, + "grad_norm": 0.0, + "learning_rate": 4.852995049562913e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2663 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.5277862548828, + "epoch": 20.029850746268657, + "grad_norm": 0.0, + "learning_rate": 4.852774944000659e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2664 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.4166717529297, + "epoch": 20.03731343283582, + "grad_norm": 0.9653432368298595, + "learning_rate": 4.852554678782305e-07, + "loss": -0.0011, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2665 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.97222900390625, + "epoch": 20.044776119402986, + "grad_norm": 0.0, + "learning_rate": 4.852334253922799e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2666 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.25, + "epoch": 20.05223880597015, + "grad_norm": 0.0, + "learning_rate": 4.852113669437098e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2667 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.13888549804688, + "epoch": 20.059701492537314, + "grad_norm": 0.0, + "learning_rate": 4.85189292534017e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2668 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.22222900390625, + "epoch": 20.067164179104477, + "grad_norm": 0.0, + "learning_rate": 4.851672021646998e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2669 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.0833282470703, + "epoch": 20.074626865671643, + "grad_norm": 0.0, + "learning_rate": 4.851450958372568e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2670 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.6666717529297, + "epoch": 20.082089552238806, + "grad_norm": 0.0, + "learning_rate": 4.851229735531885e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2671 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.3333282470703, + "epoch": 20.08955223880597, + "grad_norm": 0.30722730619423777, + "learning_rate": 4.851008353139956e-07, + "loss": -0.0, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 2672 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.02777862548828, + "epoch": 20.097014925373134, + "grad_norm": 0.0, + "learning_rate": 4.850786811211807e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2673 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.11111450195312, + "epoch": 20.104477611940297, + "grad_norm": 0.0, + "learning_rate": 4.850565109762472e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2674 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.88888549804688, + "epoch": 20.111940298507463, + "grad_norm": 0.0, + "learning_rate": 4.850343248806993e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2675 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.44444274902344, + "epoch": 20.119402985074625, + "grad_norm": 0.0, + "learning_rate": 4.850121228360427e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2676 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.33333587646484, + "epoch": 20.12686567164179, + "grad_norm": 0.0, + "learning_rate": 4.84989904843784e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2677 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.22222900390625, + "epoch": 20.134328358208954, + "grad_norm": 0.37903828503138315, + "learning_rate": 4.849676709054307e-07, + "loss": 0.0005, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2678 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.19444274902344, + "epoch": 20.14179104477612, + "grad_norm": 0.0, + "learning_rate": 4.849454210224918e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2679 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.19444274902344, + "epoch": 20.149253731343283, + "grad_norm": 0.0, + "learning_rate": 4.849231551964771e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2680 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.75, + "epoch": 20.15671641791045, + "grad_norm": 0.0, + "learning_rate": 4.849008734288974e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2681 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.80555725097656, + "epoch": 20.16417910447761, + "grad_norm": 0.0, + "learning_rate": 4.848785757212648e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2682 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.80555725097656, + "epoch": 20.171641791044777, + "grad_norm": 0.0, + "learning_rate": 4.848562620750922e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2683 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.75, + "epoch": 20.17910447761194, + "grad_norm": 0.0, + "learning_rate": 4.848339324918941e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2684 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.5833282470703, + "epoch": 20.186567164179106, + "grad_norm": 0.0, + "learning_rate": 4.848115869731856e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2685 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.61111450195312, + "epoch": 20.19402985074627, + "grad_norm": 1.8098933487384945, + "learning_rate": 4.84789225520483e-07, + "loss": 0.0001, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 2686 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.30555725097656, + "epoch": 20.20149253731343, + "grad_norm": 0.0, + "learning_rate": 4.847668481353038e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2687 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.69444274902344, + "epoch": 20.208955223880597, + "grad_norm": 0.0, + "learning_rate": 4.847444548191664e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2688 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.25, + "epoch": 20.21641791044776, + "grad_norm": 0.0, + "learning_rate": 4.847220455735904e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2689 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.44444274902344, + "epoch": 20.223880597014926, + "grad_norm": 0.0, + "learning_rate": 4.846996204000966e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2690 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.36111450195312, + "epoch": 20.23134328358209, + "grad_norm": 0.0, + "learning_rate": 4.846771793002066e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2691 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.11111450195312, + "epoch": 20.238805970149254, + "grad_norm": 0.9902253281779434, + "learning_rate": 4.846547222754432e-07, + "loss": 0.0, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2692 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.22222900390625, + "epoch": 20.246268656716417, + "grad_norm": 0.5292241215517733, + "learning_rate": 4.846322493273303e-07, + "loss": -0.0, + "reward": 0.9166666865348816, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 0.0, + "step": 2693 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.72222137451172, + "epoch": 20.253731343283583, + "grad_norm": 0.0, + "learning_rate": 4.84609760457393e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2694 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.1666717529297, + "epoch": 20.261194029850746, + "grad_norm": 0.0, + "learning_rate": 4.845872556671574e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2695 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.38888549804688, + "epoch": 20.26865671641791, + "grad_norm": 0.0, + "learning_rate": 4.845647349581505e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2696 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.5277862548828, + "epoch": 20.276119402985074, + "grad_norm": 0.0, + "learning_rate": 4.845421983319006e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2697 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.5277862548828, + "epoch": 20.28358208955224, + "grad_norm": 0.0, + "learning_rate": 4.84519645789937e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2698 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.88888549804688, + "epoch": 20.291044776119403, + "grad_norm": 0.6943016786465588, + "learning_rate": 4.8449707733379e-07, + "loss": 0.0006, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 2699 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.19444274902344, + "epoch": 20.298507462686565, + "grad_norm": 0.0, + "learning_rate": 4.844744929649911e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2700 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.3333282470703, + "epoch": 20.30597014925373, + "grad_norm": 0.0, + "learning_rate": 4.84451892685073e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2701 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.97222900390625, + "epoch": 20.313432835820894, + "grad_norm": 0.0, + "learning_rate": 4.844292764955691e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2702 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.5277862548828, + "epoch": 20.32089552238806, + "grad_norm": 0.0, + "learning_rate": 4.844066443980143e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2703 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.30555725097656, + "epoch": 20.328358208955223, + "grad_norm": 0.0, + "learning_rate": 4.843839963939442e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2704 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.05555725097656, + "epoch": 20.33582089552239, + "grad_norm": 0.0, + "learning_rate": 4.843613324848956e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2705 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.5, + "epoch": 20.34328358208955, + "grad_norm": 0.0, + "learning_rate": 4.843386526724068e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2706 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.47222900390625, + "epoch": 20.350746268656717, + "grad_norm": 0.0, + "learning_rate": 4.843159569580166e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2707 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.9166717529297, + "epoch": 20.35820895522388, + "grad_norm": 0.9056043016588083, + "learning_rate": 4.842932453432651e-07, + "loss": -0.0, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 2708 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.55555725097656, + "epoch": 20.365671641791046, + "grad_norm": 0.0, + "learning_rate": 4.842705178296935e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2709 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.86111450195312, + "epoch": 20.37313432835821, + "grad_norm": 0.7966073692228552, + "learning_rate": 4.842477744188441e-07, + "loss": -0.0004, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 2710 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.4166717529297, + "epoch": 20.380597014925375, + "grad_norm": 0.429474203392898, + "learning_rate": 4.8422501511226e-07, + "loss": 0.0002, + "reward": 0.4166666567325592, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.4166666567325592, + "rewards/format_reward": 0.0, + "step": 2711 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.72222900390625, + "epoch": 20.388059701492537, + "grad_norm": 0.0, + "learning_rate": 4.842022399114859e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2712 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.13888549804688, + "epoch": 20.395522388059703, + "grad_norm": 0.0, + "learning_rate": 4.841794488180671e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2713 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.5, + "epoch": 20.402985074626866, + "grad_norm": 0.0, + "learning_rate": 4.841566418335504e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2714 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.36111450195312, + "epoch": 20.41044776119403, + "grad_norm": 0.0, + "learning_rate": 4.841338189594834e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2715 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.94444274902344, + "epoch": 20.417910447761194, + "grad_norm": 0.0, + "learning_rate": 4.841109801974145e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2716 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.0, + "epoch": 20.425373134328357, + "grad_norm": 0.0, + "learning_rate": 4.84088125548894e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2717 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.6666717529297, + "epoch": 20.432835820895523, + "grad_norm": 1.230148507354647, + "learning_rate": 4.840652550154724e-07, + "loss": 0.0, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 2718 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.13888549804688, + "epoch": 20.440298507462686, + "grad_norm": 0.0, + "learning_rate": 4.840423685987018e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2719 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.05555725097656, + "epoch": 20.44776119402985, + "grad_norm": 0.0, + "learning_rate": 4.840194663001354e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2720 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.55555725097656, + "epoch": 20.455223880597014, + "grad_norm": 0.0, + "learning_rate": 4.839965481213271e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2721 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.75, + "epoch": 20.46268656716418, + "grad_norm": 0.0, + "learning_rate": 4.83973614063832e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2722 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.2777862548828, + "epoch": 20.470149253731343, + "grad_norm": 0.0, + "learning_rate": 4.839506641292067e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2723 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.19444274902344, + "epoch": 20.47761194029851, + "grad_norm": 0.0, + "learning_rate": 4.839276983190083e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2724 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.6666717529297, + "epoch": 20.48507462686567, + "grad_norm": 0.0, + "learning_rate": 4.839047166347954e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2725 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.13888549804688, + "epoch": 20.492537313432837, + "grad_norm": 0.0, + "learning_rate": 4.838817190781274e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2726 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.7777862548828, + "epoch": 20.5, + "grad_norm": 0.0, + "learning_rate": 4.838587056505648e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2727 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.88888549804688, + "epoch": 20.507462686567163, + "grad_norm": 0.0, + "learning_rate": 4.838356763536694e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2728 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.0277862548828, + "epoch": 20.51492537313433, + "grad_norm": 0.0, + "learning_rate": 4.83812631189004e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2729 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.0833282470703, + "epoch": 20.52238805970149, + "grad_norm": 0.0, + "learning_rate": 4.837895701581323e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2730 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.7777862548828, + "epoch": 20.529850746268657, + "grad_norm": 0.8322272844518943, + "learning_rate": 4.837664932626191e-07, + "loss": -0.0005, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2731 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.47222900390625, + "epoch": 20.53731343283582, + "grad_norm": 0.0, + "learning_rate": 4.837434005040305e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2732 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.72222900390625, + "epoch": 20.544776119402986, + "grad_norm": 0.0, + "learning_rate": 4.837202918839335e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2733 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.8333282470703, + "epoch": 20.55223880597015, + "grad_norm": 0.0, + "learning_rate": 4.836971674038962e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2734 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.61111450195312, + "epoch": 20.559701492537314, + "grad_norm": 8.188337982426557, + "learning_rate": 4.83674027065488e-07, + "loss": 0.0009, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 2735 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.63888549804688, + "epoch": 20.567164179104477, + "grad_norm": 0.0, + "learning_rate": 4.836508708702788e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2736 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.86111450195312, + "epoch": 20.574626865671643, + "grad_norm": 0.0, + "learning_rate": 4.836276988198402e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2737 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.11111450195312, + "epoch": 20.582089552238806, + "grad_norm": 0.0, + "learning_rate": 4.836045109157448e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2738 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.19444274902344, + "epoch": 20.58955223880597, + "grad_norm": 0.0, + "learning_rate": 4.835813071595656e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2739 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.75, + "epoch": 20.597014925373134, + "grad_norm": 0.0, + "learning_rate": 4.835580875528775e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2740 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.7777862548828, + "epoch": 20.604477611940297, + "grad_norm": 0.0, + "learning_rate": 4.835348520972561e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2741 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.25, + "epoch": 20.611940298507463, + "grad_norm": 0.0, + "learning_rate": 4.835116007942782e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2742 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.55555725097656, + "epoch": 20.619402985074625, + "grad_norm": 0.0, + "learning_rate": 4.834883336455214e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2743 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.4166717529297, + "epoch": 20.62686567164179, + "grad_norm": 0.0, + "learning_rate": 4.834650506525648e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2744 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.97222900390625, + "epoch": 20.634328358208954, + "grad_norm": 0.0, + "learning_rate": 4.834417518169882e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2745 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.69444274902344, + "epoch": 20.64179104477612, + "grad_norm": 0.8058526536648815, + "learning_rate": 4.834184371403727e-07, + "loss": 0.0003, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2746 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.36111450195312, + "epoch": 20.649253731343283, + "grad_norm": 0.0, + "learning_rate": 4.833951066243004e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2747 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.5833282470703, + "epoch": 20.65671641791045, + "grad_norm": 0.0, + "learning_rate": 4.833717602703544e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2748 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.11111450195312, + "epoch": 20.66417910447761, + "grad_norm": 0.0, + "learning_rate": 4.833483980801192e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2749 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.72222900390625, + "epoch": 20.671641791044777, + "grad_norm": 0.0, + "learning_rate": 4.833250200551798e-07, + "loss": 0.0, + "reward": 0.3333333432674408, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 2750 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.9166717529297, + "epoch": 20.67910447761194, + "grad_norm": 0.0, + "learning_rate": 4.833016261971226e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2751 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.0277862548828, + "epoch": 20.686567164179106, + "grad_norm": 0.0, + "learning_rate": 4.832782165075354e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2752 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.25, + "epoch": 20.69402985074627, + "grad_norm": 0.0, + "learning_rate": 4.832547909880065e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2753 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.88888549804688, + "epoch": 20.701492537313435, + "grad_norm": 0.0, + "learning_rate": 4.832313496401257e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2754 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.5833282470703, + "epoch": 20.708955223880597, + "grad_norm": 0.0, + "learning_rate": 4.832078924654834e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2755 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.2777862548828, + "epoch": 20.71641791044776, + "grad_norm": 0.0, + "learning_rate": 4.831844194656717e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2756 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.0, + "epoch": 20.723880597014926, + "grad_norm": 0.0, + "learning_rate": 4.831609306422832e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2757 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.3333282470703, + "epoch": 20.73134328358209, + "grad_norm": 0.0, + "learning_rate": 4.831374259969119e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2758 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.2777862548828, + "epoch": 20.738805970149254, + "grad_norm": 0.8130262788058131, + "learning_rate": 4.83113905531153e-07, + "loss": 0.0003, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 2759 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.1666717529297, + "epoch": 20.746268656716417, + "grad_norm": 0.0, + "learning_rate": 4.830903692466023e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2760 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.05555725097656, + "epoch": 20.753731343283583, + "grad_norm": 0.0, + "learning_rate": 4.83066817144857e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2761 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.22222900390625, + "epoch": 20.761194029850746, + "grad_norm": 0.0, + "learning_rate": 4.830432492275153e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2762 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.13888549804688, + "epoch": 20.76865671641791, + "grad_norm": 0.0, + "learning_rate": 4.830196654961766e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2763 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.19444274902344, + "epoch": 20.776119402985074, + "grad_norm": 0.0, + "learning_rate": 4.829960659524411e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2764 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.86111450195312, + "epoch": 20.78358208955224, + "grad_norm": 0.0, + "learning_rate": 4.829724505979104e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2765 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.38888549804688, + "epoch": 20.791044776119403, + "grad_norm": 0.0, + "learning_rate": 4.829488194341869e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2766 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.69444274902344, + "epoch": 20.798507462686565, + "grad_norm": 0.0, + "learning_rate": 4.829251724628741e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2767 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.69444274902344, + "epoch": 20.80597014925373, + "grad_norm": 0.0, + "learning_rate": 4.829015096855769e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2768 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.30555725097656, + "epoch": 20.813432835820894, + "grad_norm": 0.0, + "learning_rate": 4.828778311039008e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2769 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.75, + "epoch": 20.82089552238806, + "grad_norm": 1.9319784520070689, + "learning_rate": 4.828541367194527e-07, + "loss": -0.0051, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 2770 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.5, + "epoch": 20.828358208955223, + "grad_norm": 0.0, + "learning_rate": 4.828304265338403e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2771 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.30555725097656, + "epoch": 20.83582089552239, + "grad_norm": 0.8482168389072957, + "learning_rate": 4.828067005486729e-07, + "loss": 0.0, + "reward": 0.944444477558136, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 0.0, + "step": 2772 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.47222900390625, + "epoch": 20.84328358208955, + "grad_norm": 0.0, + "learning_rate": 4.827829587655602e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2773 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.61111450195312, + "epoch": 20.850746268656717, + "grad_norm": 0.0, + "learning_rate": 4.827592011861133e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2774 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.38888549804688, + "epoch": 20.85820895522388, + "grad_norm": 0.0, + "learning_rate": 4.827354278119445e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2775 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.5, + "epoch": 20.865671641791046, + "grad_norm": 0.0, + "learning_rate": 4.827116386446671e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2776 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.2777862548828, + "epoch": 20.87313432835821, + "grad_norm": 0.0, + "learning_rate": 4.826878336858951e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2777 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.47222900390625, + "epoch": 20.880597014925375, + "grad_norm": 0.0, + "learning_rate": 4.82664012937244e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2778 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.75, + "epoch": 20.888059701492537, + "grad_norm": 0.0, + "learning_rate": 4.826401764003304e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2779 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.11111450195312, + "epoch": 20.895522388059703, + "grad_norm": 0.0, + "learning_rate": 4.826163240767716e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2780 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.69444274902344, + "epoch": 20.902985074626866, + "grad_norm": 0.501887901637355, + "learning_rate": 4.825924559681864e-07, + "loss": 0.0007, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 2781 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.72222900390625, + "epoch": 20.91044776119403, + "grad_norm": 3.0024999034114663, + "learning_rate": 4.825685720761943e-07, + "loss": 0.0, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2782 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.05555725097656, + "epoch": 20.917910447761194, + "grad_norm": 0.0, + "learning_rate": 4.82544672402416e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2783 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.61111450195312, + "epoch": 20.925373134328357, + "grad_norm": 2.330562295185508, + "learning_rate": 4.825207569484733e-07, + "loss": 0.0, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 2784 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.3333282470703, + "epoch": 20.932835820895523, + "grad_norm": 0.0, + "learning_rate": 4.824968257159894e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2785 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.94444274902344, + "epoch": 20.940298507462686, + "grad_norm": 0.0, + "learning_rate": 4.824728787065878e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2786 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.3333282470703, + "epoch": 20.94776119402985, + "grad_norm": 0.5824380190077586, + "learning_rate": 4.824489159218937e-07, + "loss": 0.0006, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 2787 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.72222900390625, + "epoch": 20.955223880597014, + "grad_norm": 0.0, + "learning_rate": 4.824249373635332e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2788 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.36111450195312, + "epoch": 20.96268656716418, + "grad_norm": 0.0, + "learning_rate": 4.824009430331333e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2789 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.2777862548828, + "epoch": 20.970149253731343, + "grad_norm": 0.0, + "learning_rate": 4.823769329323225e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2790 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.75, + "epoch": 20.97761194029851, + "grad_norm": 1.1525910085491673, + "learning_rate": 4.823529070627299e-07, + "loss": -0.0, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 2791 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.2777862548828, + "epoch": 20.98507462686567, + "grad_norm": 0.0, + "learning_rate": 4.823288654259859e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2792 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.08334350585938, + "epoch": 20.992537313432837, + "grad_norm": 0.0, + "learning_rate": 4.823048080237219e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2793 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.7777862548828, + "epoch": 21.007462686567163, + "grad_norm": 0.0, + "learning_rate": 4.822807348575705e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2794 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.25, + "epoch": 21.01492537313433, + "grad_norm": 0.0, + "learning_rate": 4.822566459291652e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2795 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.19444274902344, + "epoch": 21.02238805970149, + "grad_norm": 0.0, + "learning_rate": 4.822325412401405e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2796 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.47222900390625, + "epoch": 21.029850746268657, + "grad_norm": 0.0, + "learning_rate": 4.822084207921325e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2797 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.0833282470703, + "epoch": 21.03731343283582, + "grad_norm": 0.0, + "learning_rate": 4.821842845867776e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2798 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.19444274902344, + "epoch": 21.044776119402986, + "grad_norm": 0.0, + "learning_rate": 4.821601326257139e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2799 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.47222900390625, + "epoch": 21.05223880597015, + "grad_norm": 0.0, + "learning_rate": 4.8213596491058e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2800 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.0, + "epoch": 21.059701492537314, + "grad_norm": 0.0, + "learning_rate": 4.821117814430162e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2801 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.0, + "epoch": 21.067164179104477, + "grad_norm": 0.0, + "learning_rate": 4.820875822246636e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2802 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.94444274902344, + "epoch": 21.074626865671643, + "grad_norm": 0.0, + "learning_rate": 4.82063367257164e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2803 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.97222900390625, + "epoch": 21.082089552238806, + "grad_norm": 0.0, + "learning_rate": 4.820391365421609e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2804 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.69444274902344, + "epoch": 21.08955223880597, + "grad_norm": 0.0, + "learning_rate": 4.820148900812983e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2805 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.11111450195312, + "epoch": 21.097014925373134, + "grad_norm": 0.8956317039659903, + "learning_rate": 4.819906278762218e-07, + "loss": 0.0, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 2806 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.2777862548828, + "epoch": 21.104477611940297, + "grad_norm": 1.0306361182131172, + "learning_rate": 4.819663499285777e-07, + "loss": -0.0, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 2807 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.69444274902344, + "epoch": 21.111940298507463, + "grad_norm": 0.0, + "learning_rate": 4.819420562400133e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2808 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.9166717529297, + "epoch": 21.119402985074625, + "grad_norm": 0.0, + "learning_rate": 4.819177468121773e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2809 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.5833282470703, + "epoch": 21.12686567164179, + "grad_norm": 0.0, + "learning_rate": 4.818934216467195e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2810 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.97222900390625, + "epoch": 21.134328358208954, + "grad_norm": 0.0, + "learning_rate": 4.818690807452901e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2811 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.69444274902344, + "epoch": 21.14179104477612, + "grad_norm": 0.0, + "learning_rate": 4.818447241095412e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2812 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.5833282470703, + "epoch": 21.149253731343283, + "grad_norm": 0.0, + "learning_rate": 4.818203517411255e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2813 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.0833282470703, + "epoch": 21.15671641791045, + "grad_norm": 0.0, + "learning_rate": 4.817959636416969e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2814 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.36111450195312, + "epoch": 21.16417910447761, + "grad_norm": 0.6502572442175, + "learning_rate": 4.817715598129103e-07, + "loss": 0.0001, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2815 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.5, + "epoch": 21.171641791044777, + "grad_norm": 0.0, + "learning_rate": 4.817471402564216e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2816 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.30555725097656, + "epoch": 21.17910447761194, + "grad_norm": 0.0, + "learning_rate": 4.817227049738882e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2817 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.5833282470703, + "epoch": 21.186567164179106, + "grad_norm": 0.0, + "learning_rate": 4.816982539669679e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2818 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.86111450195312, + "epoch": 21.19402985074627, + "grad_norm": 0.0, + "learning_rate": 4.816737872373202e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2819 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.69444274902344, + "epoch": 21.20149253731343, + "grad_norm": 1.3972652794118312, + "learning_rate": 4.816493047866053e-07, + "loss": 0.0, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 2820 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.5277862548828, + "epoch": 21.208955223880597, + "grad_norm": 0.0, + "learning_rate": 4.816248066164843e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2821 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.47222900390625, + "epoch": 21.21641791044776, + "grad_norm": 0.0, + "learning_rate": 4.816002927286199e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2822 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.22222137451172, + "epoch": 21.223880597014926, + "grad_norm": 0.0, + "learning_rate": 4.815757631246756e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2823 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.19444274902344, + "epoch": 21.23134328358209, + "grad_norm": 0.0, + "learning_rate": 4.815512178063156e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2824 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.55555725097656, + "epoch": 21.238805970149254, + "grad_norm": 0.0, + "learning_rate": 4.815266567752059e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2825 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.3333282470703, + "epoch": 21.246268656716417, + "grad_norm": 0.0, + "learning_rate": 4.81502080033013e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2826 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.72222900390625, + "epoch": 21.253731343283583, + "grad_norm": 0.0, + "learning_rate": 4.814774875814045e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2827 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.4166717529297, + "epoch": 21.261194029850746, + "grad_norm": 0.0, + "learning_rate": 4.814528794220495e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2828 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.6666717529297, + "epoch": 21.26865671641791, + "grad_norm": 0.0, + "learning_rate": 4.814282555566178e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2829 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.5, + "epoch": 21.276119402985074, + "grad_norm": 0.0, + "learning_rate": 4.814036159867803e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2830 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.0, + "epoch": 21.28358208955224, + "grad_norm": 0.0, + "learning_rate": 4.813789607142089e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2831 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.19444274902344, + "epoch": 21.291044776119403, + "grad_norm": 0.0, + "learning_rate": 4.813542897405768e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2832 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.11111450195312, + "epoch": 21.298507462686565, + "grad_norm": 0.8207714203174831, + "learning_rate": 4.813296030675582e-07, + "loss": 0.0, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2833 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.30555725097656, + "epoch": 21.30597014925373, + "grad_norm": 0.5198705684400952, + "learning_rate": 4.813049006968281e-07, + "loss": -0.0002, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2834 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.0833282470703, + "epoch": 21.313432835820894, + "grad_norm": 0.0, + "learning_rate": 4.81280182630063e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2835 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.13888549804688, + "epoch": 21.32089552238806, + "grad_norm": 0.7003191857814084, + "learning_rate": 4.8125544886894e-07, + "loss": -0.0, + "reward": 0.472222238779068, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 2836 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.47222900390625, + "epoch": 21.328358208955223, + "grad_norm": 0.0, + "learning_rate": 4.812306994151376e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2837 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.22222900390625, + "epoch": 21.33582089552239, + "grad_norm": 0.0, + "learning_rate": 4.812059342703354e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2838 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.44444274902344, + "epoch": 21.34328358208955, + "grad_norm": 0.0, + "learning_rate": 4.811811534362138e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2839 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.88888549804688, + "epoch": 21.350746268656717, + "grad_norm": 0.0, + "learning_rate": 4.811563569144544e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2840 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.5833282470703, + "epoch": 21.35820895522388, + "grad_norm": 0.0, + "learning_rate": 4.811315447067397e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2841 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.19444274902344, + "epoch": 21.365671641791046, + "grad_norm": 0.6915401308351045, + "learning_rate": 4.811067168147537e-07, + "loss": -0.0169, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2842 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.97222900390625, + "epoch": 21.37313432835821, + "grad_norm": 0.0, + "learning_rate": 4.810818732401811e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2843 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.13888549804688, + "epoch": 21.380597014925375, + "grad_norm": 0.0, + "learning_rate": 4.810570139847077e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2844 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.3333282470703, + "epoch": 21.388059701492537, + "grad_norm": 0.0, + "learning_rate": 4.810321390500205e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2845 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.19444274902344, + "epoch": 21.395522388059703, + "grad_norm": 0.6021657311401807, + "learning_rate": 4.810072484378074e-07, + "loss": 0.0005, + "reward": 0.8333333134651184, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 2846 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.72222900390625, + "epoch": 21.402985074626866, + "grad_norm": 0.8934431212250875, + "learning_rate": 4.809823421497574e-07, + "loss": 0.0, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2847 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.47222900390625, + "epoch": 21.41044776119403, + "grad_norm": 0.0, + "learning_rate": 4.809574201875607e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2848 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.0833282470703, + "epoch": 21.417910447761194, + "grad_norm": 0.0, + "learning_rate": 4.809324825529084e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2849 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.0, + "epoch": 21.425373134328357, + "grad_norm": 0.0, + "learning_rate": 4.809075292474928e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2850 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.0, + "epoch": 21.432835820895523, + "grad_norm": 0.0, + "learning_rate": 4.808825602730072e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2851 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.5833282470703, + "epoch": 21.440298507462686, + "grad_norm": 0.0, + "learning_rate": 4.808575756311458e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2852 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.72222900390625, + "epoch": 21.44776119402985, + "grad_norm": 0.0, + "learning_rate": 4.808325753236043e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2853 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.88888549804688, + "epoch": 21.455223880597014, + "grad_norm": 0.0, + "learning_rate": 4.80807559352079e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2854 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.5277862548828, + "epoch": 21.46268656716418, + "grad_norm": 0.0, + "learning_rate": 4.807825277182675e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2855 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.94444274902344, + "epoch": 21.470149253731343, + "grad_norm": 0.0, + "learning_rate": 4.807574804238684e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2856 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.30555725097656, + "epoch": 21.47761194029851, + "grad_norm": 0.0, + "learning_rate": 4.807324174705812e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2857 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.2777862548828, + "epoch": 21.48507462686567, + "grad_norm": 0.0, + "learning_rate": 4.807073388601071e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2858 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.44444274902344, + "epoch": 21.492537313432837, + "grad_norm": 0.0, + "learning_rate": 4.806822445941474e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2859 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.86111450195312, + "epoch": 21.5, + "grad_norm": 0.0, + "learning_rate": 4.806571346744052e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2860 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.25, + "epoch": 21.507462686567163, + "grad_norm": 0.0, + "learning_rate": 4.806320091025845e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2861 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.61111450195312, + "epoch": 21.51492537313433, + "grad_norm": 0.5239803949103741, + "learning_rate": 4.806068678803902e-07, + "loss": 0.0001, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 2862 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.94444274902344, + "epoch": 21.52238805970149, + "grad_norm": 0.0, + "learning_rate": 4.805817110095283e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2863 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.5277862548828, + "epoch": 21.529850746268657, + "grad_norm": 0.0, + "learning_rate": 4.80556538491706e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2864 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.72222900390625, + "epoch": 21.53731343283582, + "grad_norm": 0.0, + "learning_rate": 4.805313503286313e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2865 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.6666717529297, + "epoch": 21.544776119402986, + "grad_norm": 0.0, + "learning_rate": 4.805061465220136e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2866 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.8333282470703, + "epoch": 21.55223880597015, + "grad_norm": 0.0, + "learning_rate": 4.804809270735632e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2867 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.25, + "epoch": 21.559701492537314, + "grad_norm": 0.0, + "learning_rate": 4.804556919849914e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2868 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.25, + "epoch": 21.567164179104477, + "grad_norm": 0.0, + "learning_rate": 4.804304412580106e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2869 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.86111450195312, + "epoch": 21.574626865671643, + "grad_norm": 0.0, + "learning_rate": 4.804051748943342e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2870 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.5833282470703, + "epoch": 21.582089552238806, + "grad_norm": 0.0, + "learning_rate": 4.803798928956771e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2871 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.86111450195312, + "epoch": 21.58955223880597, + "grad_norm": 0.0, + "learning_rate": 4.803545952637545e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2872 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.47222137451172, + "epoch": 21.597014925373134, + "grad_norm": 0.0, + "learning_rate": 4.803292820002832e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2873 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.30555725097656, + "epoch": 21.604477611940297, + "grad_norm": 0.0, + "learning_rate": 4.80303953106981e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2874 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.22222900390625, + "epoch": 21.611940298507463, + "grad_norm": 0.0, + "learning_rate": 4.802786085855667e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2875 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.72222900390625, + "epoch": 21.619402985074625, + "grad_norm": 3.0465870320887105, + "learning_rate": 4.8025324843776e-07, + "loss": 0.0007, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 2876 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.75, + "epoch": 21.62686567164179, + "grad_norm": 0.0, + "learning_rate": 4.802278726652818e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2877 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.22222900390625, + "epoch": 21.634328358208954, + "grad_norm": 0.0, + "learning_rate": 4.802024812698542e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2878 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.3333282470703, + "epoch": 21.64179104477612, + "grad_norm": 0.0, + "learning_rate": 4.801770742532001e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2879 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.05555725097656, + "epoch": 21.649253731343283, + "grad_norm": 0.0, + "learning_rate": 4.801516516170437e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2880 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.61111450195312, + "epoch": 21.65671641791045, + "grad_norm": 0.0, + "learning_rate": 4.801262133631101e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2881 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.38888549804688, + "epoch": 21.66417910447761, + "grad_norm": 0.0, + "learning_rate": 4.801007594931255e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2882 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.30555725097656, + "epoch": 21.671641791044777, + "grad_norm": 0.0, + "learning_rate": 4.800752900088171e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2883 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.63888549804688, + "epoch": 21.67910447761194, + "grad_norm": 0.0, + "learning_rate": 4.800498049119133e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2884 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.19444274902344, + "epoch": 21.686567164179106, + "grad_norm": 0.0, + "learning_rate": 4.800243042041435e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2885 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.63888549804688, + "epoch": 21.69402985074627, + "grad_norm": 4.224640699883317, + "learning_rate": 4.799987878872381e-07, + "loss": -0.0, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 2886 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.9166717529297, + "epoch": 21.701492537313435, + "grad_norm": 0.0, + "learning_rate": 4.799732559629287e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2887 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.5833282470703, + "epoch": 21.708955223880597, + "grad_norm": 0.0, + "learning_rate": 4.799477084329478e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2888 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.3333282470703, + "epoch": 21.71641791044776, + "grad_norm": 0.0, + "learning_rate": 4.799221452990288e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2889 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.19444274902344, + "epoch": 21.723880597014926, + "grad_norm": 0.0, + "learning_rate": 4.798965665629067e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2890 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.1388931274414, + "epoch": 21.73134328358209, + "grad_norm": 0.0, + "learning_rate": 4.798709722263171e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2891 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.38888549804688, + "epoch": 21.738805970149254, + "grad_norm": 0.0, + "learning_rate": 4.79845362290997e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2892 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.3333282470703, + "epoch": 21.746268656716417, + "grad_norm": 0.0, + "learning_rate": 4.798197367586838e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2893 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.0277862548828, + "epoch": 21.753731343283583, + "grad_norm": 0.0, + "learning_rate": 4.797940956311167e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2894 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.69444274902344, + "epoch": 21.761194029850746, + "grad_norm": 1.051528212664497, + "learning_rate": 4.797684389100358e-07, + "loss": -0.0002, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2895 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.2777862548828, + "epoch": 21.76865671641791, + "grad_norm": 0.0, + "learning_rate": 4.797427665971819e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2896 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.86111450195312, + "epoch": 21.776119402985074, + "grad_norm": 0.0, + "learning_rate": 4.797170786942972e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2897 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.63888549804688, + "epoch": 21.78358208955224, + "grad_norm": 0.0, + "learning_rate": 4.796913752031248e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2898 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.3333282470703, + "epoch": 21.791044776119403, + "grad_norm": 0.0, + "learning_rate": 4.79665656125409e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2899 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.36111450195312, + "epoch": 21.798507462686565, + "grad_norm": 0.0, + "learning_rate": 4.796399214628949e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2900 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.05555725097656, + "epoch": 21.80597014925373, + "grad_norm": 1.5124881314537568, + "learning_rate": 4.796141712173289e-07, + "loss": 0.0271, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 2901 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.19444274902344, + "epoch": 21.813432835820894, + "grad_norm": 0.0, + "learning_rate": 4.795884053904585e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2902 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.47222900390625, + "epoch": 21.82089552238806, + "grad_norm": 0.0, + "learning_rate": 4.795626239840319e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2903 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.94444274902344, + "epoch": 21.828358208955223, + "grad_norm": 0.0, + "learning_rate": 4.795368269997987e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2904 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.11111450195312, + "epoch": 21.83582089552239, + "grad_norm": 0.0, + "learning_rate": 4.795110144395096e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2905 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.80555725097656, + "epoch": 21.84328358208955, + "grad_norm": 0.5786867824350954, + "learning_rate": 4.794851863049158e-07, + "loss": 0.0002, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 2906 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.05555725097656, + "epoch": 21.850746268656717, + "grad_norm": 0.0, + "learning_rate": 4.794593425977704e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2907 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.22222900390625, + "epoch": 21.85820895522388, + "grad_norm": 0.0, + "learning_rate": 4.794334833198269e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2908 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.25, + "epoch": 21.865671641791046, + "grad_norm": 0.3602929352082068, + "learning_rate": 4.794076084728401e-07, + "loss": 0.0, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 2909 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.61111450195312, + "epoch": 21.87313432835821, + "grad_norm": 0.0, + "learning_rate": 4.79381718058566e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2910 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.9166717529297, + "epoch": 21.880597014925375, + "grad_norm": 0.0, + "learning_rate": 4.793558120787612e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2911 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.0, + "epoch": 21.888059701492537, + "grad_norm": 0.0, + "learning_rate": 4.793298905351837e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2912 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.13888549804688, + "epoch": 21.895522388059703, + "grad_norm": 0.0, + "learning_rate": 4.793039534295927e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2913 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.4166717529297, + "epoch": 21.902985074626866, + "grad_norm": 0.0, + "learning_rate": 4.79278000763748e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2914 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.97222137451172, + "epoch": 21.91044776119403, + "grad_norm": 0.0, + "learning_rate": 4.792520325394111e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2915 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.36111450195312, + "epoch": 21.917910447761194, + "grad_norm": 0.0, + "learning_rate": 4.792260487583437e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2916 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.05555725097656, + "epoch": 21.925373134328357, + "grad_norm": 0.0, + "learning_rate": 4.792000494223093e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2917 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.5277862548828, + "epoch": 21.932835820895523, + "grad_norm": 0.0, + "learning_rate": 4.791740345330722e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2918 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.4166717529297, + "epoch": 21.940298507462686, + "grad_norm": 0.0, + "learning_rate": 4.791480040923975e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2919 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.1666717529297, + "epoch": 21.94776119402985, + "grad_norm": 0.0, + "learning_rate": 4.791219581020518e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2920 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.4166717529297, + "epoch": 21.955223880597014, + "grad_norm": 1.2630625828569935, + "learning_rate": 4.790958965638025e-07, + "loss": -0.0008, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 2921 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.38888549804688, + "epoch": 21.96268656716418, + "grad_norm": 0.0, + "learning_rate": 4.790698194794182e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2922 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.0, + "epoch": 21.970149253731343, + "grad_norm": 0.0, + "learning_rate": 4.790437268506682e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2923 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.25, + "epoch": 21.97761194029851, + "grad_norm": 0.0, + "learning_rate": 4.790176186793233e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2924 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.72222900390625, + "epoch": 21.98507462686567, + "grad_norm": 0.0, + "learning_rate": 4.789914949671552e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2925 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.08334350585938, + "epoch": 21.992537313432837, + "grad_norm": 0.0, + "learning_rate": 4.789653557159365e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2926 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.38888549804688, + "epoch": 22.007462686567163, + "grad_norm": 0.0, + "learning_rate": 4.78939200927441e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2927 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.0277862548828, + "epoch": 22.01492537313433, + "grad_norm": 0.0, + "learning_rate": 4.789130306034436e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2928 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.88888549804688, + "epoch": 22.02238805970149, + "grad_norm": 0.0, + "learning_rate": 4.7888684474572e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2929 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.22222900390625, + "epoch": 22.029850746268657, + "grad_norm": 0.0, + "learning_rate": 4.788606433560473e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2930 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.44444274902344, + "epoch": 22.03731343283582, + "grad_norm": 0.0, + "learning_rate": 4.788344264362036e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2931 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.5, + "epoch": 22.044776119402986, + "grad_norm": 0.0, + "learning_rate": 4.788081939879675e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2932 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.0277862548828, + "epoch": 22.05223880597015, + "grad_norm": 0.0, + "learning_rate": 4.787819460131197e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2933 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.4166717529297, + "epoch": 22.059701492537314, + "grad_norm": 0.0, + "learning_rate": 4.787556825134409e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2934 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.47222900390625, + "epoch": 22.067164179104477, + "grad_norm": 0.0, + "learning_rate": 4.787294034907134e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2935 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.9166717529297, + "epoch": 22.074626865671643, + "grad_norm": 0.0, + "learning_rate": 4.787031089467207e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2936 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.72222900390625, + "epoch": 22.082089552238806, + "grad_norm": 0.0, + "learning_rate": 4.786767988832468e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2937 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.55555725097656, + "epoch": 22.08955223880597, + "grad_norm": 0.0, + "learning_rate": 4.786504733020772e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2938 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.6666717529297, + "epoch": 22.097014925373134, + "grad_norm": 0.0, + "learning_rate": 4.786241322049983e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2939 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.8333282470703, + "epoch": 22.104477611940297, + "grad_norm": 2.7773076364641898, + "learning_rate": 4.785977755937976e-07, + "loss": -0.0029, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2940 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.0, + "epoch": 22.111940298507463, + "grad_norm": 0.0, + "learning_rate": 4.785714034702636e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2941 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.3333282470703, + "epoch": 22.119402985074625, + "grad_norm": 0.0, + "learning_rate": 4.785450158361859e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2942 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.22222900390625, + "epoch": 22.12686567164179, + "grad_norm": 0.0, + "learning_rate": 4.78518612693355e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2943 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.3333282470703, + "epoch": 22.134328358208954, + "grad_norm": 0.0, + "learning_rate": 4.784921940435628e-07, + "loss": 0.0, + "reward": 0.3333333432674408, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 2944 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.5833282470703, + "epoch": 22.14179104477612, + "grad_norm": 0.0, + "learning_rate": 4.784657598886019e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2945 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.0277862548828, + "epoch": 22.149253731343283, + "grad_norm": 1.2561265395551573, + "learning_rate": 4.78439310230266e-07, + "loss": 0.0, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 2946 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.5, + "epoch": 22.15671641791045, + "grad_norm": 0.0, + "learning_rate": 4.784128450703503e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2947 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.22222900390625, + "epoch": 22.16417910447761, + "grad_norm": 0.0, + "learning_rate": 4.783863644106502e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2948 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.5277862548828, + "epoch": 22.171641791044777, + "grad_norm": 0.0, + "learning_rate": 4.78359868252963e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2949 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.55555725097656, + "epoch": 22.17910447761194, + "grad_norm": 0.0, + "learning_rate": 4.783333565990865e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2950 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.05555725097656, + "epoch": 22.186567164179106, + "grad_norm": 0.0, + "learning_rate": 4.783068294508198e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2951 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.11111450195312, + "epoch": 22.19402985074627, + "grad_norm": 0.0, + "learning_rate": 4.78280286809963e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2952 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.0833282470703, + "epoch": 22.20149253731343, + "grad_norm": 0.0, + "learning_rate": 4.782537286783173e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2953 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.22222900390625, + "epoch": 22.208955223880597, + "grad_norm": 0.0, + "learning_rate": 4.782271550576849e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2954 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.0277862548828, + "epoch": 22.21641791044776, + "grad_norm": 0.0, + "learning_rate": 4.78200565949869e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2955 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.9166717529297, + "epoch": 22.223880597014926, + "grad_norm": 0.0, + "learning_rate": 4.781739613566738e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2956 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.97222900390625, + "epoch": 22.23134328358209, + "grad_norm": 0.7512549176431792, + "learning_rate": 4.781473412799048e-07, + "loss": 0.0003, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 2957 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.61111450195312, + "epoch": 22.238805970149254, + "grad_norm": 0.0, + "learning_rate": 4.781207057213685e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2958 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.86111450195312, + "epoch": 22.246268656716417, + "grad_norm": 0.0, + "learning_rate": 4.780940546828721e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2959 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.30555725097656, + "epoch": 22.253731343283583, + "grad_norm": 0.0, + "learning_rate": 4.780673881662242e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2960 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.30555725097656, + "epoch": 22.261194029850746, + "grad_norm": 0.0, + "learning_rate": 4.780407061732343e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2961 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.86111450195312, + "epoch": 22.26865671641791, + "grad_norm": 0.0, + "learning_rate": 4.780140087057131e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2962 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.4166717529297, + "epoch": 22.276119402985074, + "grad_norm": 0.0, + "learning_rate": 4.779872957654723e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2963 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.33333587646484, + "epoch": 22.28358208955224, + "grad_norm": 0.0, + "learning_rate": 4.779605673543246e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2964 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.9166717529297, + "epoch": 22.291044776119403, + "grad_norm": 0.0, + "learning_rate": 4.779338234740836e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2965 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.47222900390625, + "epoch": 22.298507462686565, + "grad_norm": 0.3909251928299414, + "learning_rate": 4.779070641265642e-07, + "loss": -0.0002, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2966 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.77777862548828, + "epoch": 22.30597014925373, + "grad_norm": 0.0, + "learning_rate": 4.778802893135822e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2967 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.44444274902344, + "epoch": 22.313432835820894, + "grad_norm": 0.0, + "learning_rate": 4.778534990369546e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2968 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.13888549804688, + "epoch": 22.32089552238806, + "grad_norm": 0.0, + "learning_rate": 4.778266932984992e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2969 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.9166717529297, + "epoch": 22.328358208955223, + "grad_norm": 0.0, + "learning_rate": 4.777998721000352e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2970 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.6666717529297, + "epoch": 22.33582089552239, + "grad_norm": 3.561615393850962, + "learning_rate": 4.777730354433825e-07, + "loss": 0.0004, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 2971 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.38888549804688, + "epoch": 22.34328358208955, + "grad_norm": 0.0, + "learning_rate": 4.777461833303622e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2972 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.75, + "epoch": 22.350746268656717, + "grad_norm": 0.0, + "learning_rate": 4.777193157627965e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2973 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.05555725097656, + "epoch": 22.35820895522388, + "grad_norm": 0.0, + "learning_rate": 4.776924327425087e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2974 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.69444274902344, + "epoch": 22.365671641791046, + "grad_norm": 0.0, + "learning_rate": 4.776655342713229e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2975 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.94444274902344, + "epoch": 22.37313432835821, + "grad_norm": 0.0, + "learning_rate": 4.776386203510644e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2976 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.25, + "epoch": 22.380597014925375, + "grad_norm": 0.0, + "learning_rate": 4.776116909835595e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2977 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.5277862548828, + "epoch": 22.388059701492537, + "grad_norm": 0.5297568147975923, + "learning_rate": 4.775847461706357e-07, + "loss": -0.0007, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 2978 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.1666717529297, + "epoch": 22.395522388059703, + "grad_norm": 0.0, + "learning_rate": 4.775577859141214e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2979 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.88888549804688, + "epoch": 22.402985074626866, + "grad_norm": 1.2376777308805544, + "learning_rate": 4.775308102158461e-07, + "loss": 0.0003, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 2980 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.38888549804688, + "epoch": 22.41044776119403, + "grad_norm": 0.0, + "learning_rate": 4.775038190776403e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2981 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.69444274902344, + "epoch": 22.417910447761194, + "grad_norm": 0.0, + "learning_rate": 4.774768125013355e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2982 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.2777862548828, + "epoch": 22.425373134328357, + "grad_norm": 0.0, + "learning_rate": 4.774497904887646e-07, + "loss": 0.0, + "reward": 0.3333333432674408, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 2983 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.63888549804688, + "epoch": 22.432835820895523, + "grad_norm": 0.35165560578883764, + "learning_rate": 4.774227530417611e-07, + "loss": 0.0002, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 2984 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.11111450195312, + "epoch": 22.440298507462686, + "grad_norm": 0.0, + "learning_rate": 4.773957001621596e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2985 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.6666717529297, + "epoch": 22.44776119402985, + "grad_norm": 0.0, + "learning_rate": 4.773686318517961e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2986 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.11111450195312, + "epoch": 22.455223880597014, + "grad_norm": 0.0, + "learning_rate": 4.773415481125073e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2987 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.86111450195312, + "epoch": 22.46268656716418, + "grad_norm": 0.0, + "learning_rate": 4.77314448946131e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2988 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.8888931274414, + "epoch": 22.470149253731343, + "grad_norm": 0.0, + "learning_rate": 4.772873343545063e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2989 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.0, + "epoch": 22.47761194029851, + "grad_norm": 0.0, + "learning_rate": 4.77260204339473e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 2990 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.0833282470703, + "epoch": 22.48507462686567, + "grad_norm": 0.0, + "learning_rate": 4.772330589028722e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2991 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.4166717529297, + "epoch": 22.492537313432837, + "grad_norm": 0.0, + "learning_rate": 4.772058980465459e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2992 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.25, + "epoch": 22.5, + "grad_norm": 0.0, + "learning_rate": 4.771787217723373e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2993 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.6666717529297, + "epoch": 22.507462686567163, + "grad_norm": 0.0, + "learning_rate": 4.771515300820903e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 2994 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.61111450195312, + "epoch": 22.51492537313433, + "grad_norm": 0.0, + "learning_rate": 4.771243229776504e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2995 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.05555725097656, + "epoch": 22.52238805970149, + "grad_norm": 0.0, + "learning_rate": 4.770971004608636e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2996 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.75, + "epoch": 22.529850746268657, + "grad_norm": 0.0, + "learning_rate": 4.770698625335773e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 2997 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.2777862548828, + "epoch": 22.53731343283582, + "grad_norm": 0.0, + "learning_rate": 4.770426091976397e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 2998 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.5833282470703, + "epoch": 22.544776119402986, + "grad_norm": 0.0, + "learning_rate": 4.770153404549004e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 2999 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.22222900390625, + "epoch": 22.55223880597015, + "grad_norm": 0.0, + "learning_rate": 4.769880563072097e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 3000 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.30555725097656, + "epoch": 22.559701492537314, + "grad_norm": 0.0, + "learning_rate": 4.769607567564189e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3001 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.72222900390625, + "epoch": 22.567164179104477, + "grad_norm": 0.0, + "learning_rate": 4.769334418043808e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3002 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.75, + "epoch": 22.574626865671643, + "grad_norm": 0.0, + "learning_rate": 4.769061114529488e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 3003 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.0277862548828, + "epoch": 22.582089552238806, + "grad_norm": 0.0, + "learning_rate": 4.768787657039775e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 3004 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.69444274902344, + "epoch": 22.58955223880597, + "grad_norm": 0.0, + "learning_rate": 4.768514045593226e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3005 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.44444274902344, + "epoch": 22.597014925373134, + "grad_norm": 0.0, + "learning_rate": 4.7682402802084077e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 3006 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.4166717529297, + "epoch": 22.604477611940297, + "grad_norm": 0.0, + "learning_rate": 4.7679663609038965e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3007 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.5833282470703, + "epoch": 22.611940298507463, + "grad_norm": 0.0, + "learning_rate": 4.767692287698282e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 3008 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.0, + "epoch": 22.619402985074625, + "grad_norm": 0.0, + "learning_rate": 4.7674180606101613e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3009 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.0833282470703, + "epoch": 22.62686567164179, + "grad_norm": 0.0, + "learning_rate": 4.7671436796581423e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3010 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.55555725097656, + "epoch": 22.634328358208954, + "grad_norm": 1.3257554216758358, + "learning_rate": 4.7668691448608457e-07, + "loss": 0.001, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 3011 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.5277862548828, + "epoch": 22.64179104477612, + "grad_norm": 0.0, + "learning_rate": 4.766594456236901e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3012 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.9166717529297, + "epoch": 22.649253731343283, + "grad_norm": 0.0, + "learning_rate": 4.766319613804947e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 3013 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.38888549804688, + "epoch": 22.65671641791045, + "grad_norm": 0.0, + "learning_rate": 4.766044617583634e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 3014 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.38888549804688, + "epoch": 22.66417910447761, + "grad_norm": 0.0, + "learning_rate": 4.7657694675916247e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3015 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.88888549804688, + "epoch": 22.671641791044777, + "grad_norm": 0.0, + "learning_rate": 4.7654941638475885e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3016 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.11111450195312, + "epoch": 22.67910447761194, + "grad_norm": 0.0, + "learning_rate": 4.7652187063702086e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3017 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.47222900390625, + "epoch": 22.686567164179106, + "grad_norm": 0.0, + "learning_rate": 4.7649430951781767e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3018 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.5277862548828, + "epoch": 22.69402985074627, + "grad_norm": 0.0, + "learning_rate": 4.7646673302901944e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3019 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.5277862548828, + "epoch": 22.701492537313435, + "grad_norm": 0.0, + "learning_rate": 4.7643914117249763e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3020 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.9166717529297, + "epoch": 22.708955223880597, + "grad_norm": 0.0, + "learning_rate": 4.7641153395012445e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3021 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.1666717529297, + "epoch": 22.71641791044776, + "grad_norm": 1.1526343327774544, + "learning_rate": 4.7638391136377337e-07, + "loss": 0.001, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 3022 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.61111450195312, + "epoch": 22.723880597014926, + "grad_norm": 0.0, + "learning_rate": 4.7635627341531883e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3023 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.94444274902344, + "epoch": 22.73134328358209, + "grad_norm": 0.0, + "learning_rate": 4.763286201066362e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3024 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.38888549804688, + "epoch": 22.738805970149254, + "grad_norm": 0.0, + "learning_rate": 4.7630095143960216e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3025 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.94444274902344, + "epoch": 22.746268656716417, + "grad_norm": 0.6334750675347012, + "learning_rate": 4.762732674160941e-07, + "loss": 0.0, + "reward": 0.8333333134651184, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 3026 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.30555725097656, + "epoch": 22.753731343283583, + "grad_norm": 0.0, + "learning_rate": 4.7624556803799076e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3027 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.22222900390625, + "epoch": 22.761194029850746, + "grad_norm": 0.0, + "learning_rate": 4.762178533071717e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3028 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.25, + "epoch": 22.76865671641791, + "grad_norm": 0.0, + "learning_rate": 4.7619012322551755e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3029 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.6666717529297, + "epoch": 22.776119402985074, + "grad_norm": 0.0, + "learning_rate": 4.761623777949102e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3030 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.22222900390625, + "epoch": 22.78358208955224, + "grad_norm": 2.505478548912356, + "learning_rate": 4.7613461701723223e-07, + "loss": -0.0005, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 3031 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.1666717529297, + "epoch": 22.791044776119403, + "grad_norm": 0.0, + "learning_rate": 4.761068408943676e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3032 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.36111450195312, + "epoch": 22.798507462686565, + "grad_norm": 1.1189602256075866, + "learning_rate": 4.760790494282011e-07, + "loss": 0.0007, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 3033 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.4166717529297, + "epoch": 22.80597014925373, + "grad_norm": 0.0, + "learning_rate": 4.760512426206187e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3034 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.88888549804688, + "epoch": 22.813432835820894, + "grad_norm": 2.1368111544545463, + "learning_rate": 4.7602342047350716e-07, + "loss": -0.0485, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 3035 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.0277862548828, + "epoch": 22.82089552238806, + "grad_norm": 3.080486522373744, + "learning_rate": 4.759955829887545e-07, + "loss": 0.0, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 3036 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.22222900390625, + "epoch": 22.828358208955223, + "grad_norm": 0.6141747071593037, + "learning_rate": 4.7596773016824993e-07, + "loss": 0.0001, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 3037 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.55555725097656, + "epoch": 22.83582089552239, + "grad_norm": 0.0, + "learning_rate": 4.7593986201388323e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3038 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.0277862548828, + "epoch": 22.84328358208955, + "grad_norm": 0.0, + "learning_rate": 4.7591197852754573e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3039 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.1666717529297, + "epoch": 22.850746268656717, + "grad_norm": 0.0, + "learning_rate": 4.7588407971112945e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3040 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.86111450195312, + "epoch": 22.85820895522388, + "grad_norm": 0.687056581756116, + "learning_rate": 4.758561655665275e-07, + "loss": 0.0008, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 3041 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.44444274902344, + "epoch": 22.865671641791046, + "grad_norm": 0.0, + "learning_rate": 4.7582823609563415e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3042 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.3333282470703, + "epoch": 22.87313432835821, + "grad_norm": 0.0, + "learning_rate": 4.758002913003447e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3043 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.69444274902344, + "epoch": 22.880597014925375, + "grad_norm": 0.5797226012908612, + "learning_rate": 4.757723311825554e-07, + "loss": -0.0002, + "reward": 0.6388888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 0.0, + "step": 3044 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.94444274902344, + "epoch": 22.888059701492537, + "grad_norm": 0.0, + "learning_rate": 4.757443557441637e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3045 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.9166717529297, + "epoch": 22.895522388059703, + "grad_norm": 0.0, + "learning_rate": 4.7571636498706784e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3046 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.19444274902344, + "epoch": 22.902985074626866, + "grad_norm": 0.0, + "learning_rate": 4.756883589131673e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3047 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.25, + "epoch": 22.91044776119403, + "grad_norm": 0.0, + "learning_rate": 4.756603375243625e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3048 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.2777862548828, + "epoch": 22.917910447761194, + "grad_norm": 0.0, + "learning_rate": 4.756323008225549e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3049 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.0833282470703, + "epoch": 22.925373134328357, + "grad_norm": 0.0, + "learning_rate": 4.7560424880964714e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3050 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.3333282470703, + "epoch": 22.932835820895523, + "grad_norm": 0.7399789740511088, + "learning_rate": 4.7557618148754263e-07, + "loss": 0.0005, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 3051 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.75, + "epoch": 22.940298507462686, + "grad_norm": 0.6189454666246569, + "learning_rate": 4.755480988581461e-07, + "loss": -0.0003, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 3052 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.61111450195312, + "epoch": 22.94776119402985, + "grad_norm": 0.0, + "learning_rate": 4.755200009233632e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3053 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.9166717529297, + "epoch": 22.955223880597014, + "grad_norm": 0.0, + "learning_rate": 4.754918876851006e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3054 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.27777862548828, + "epoch": 22.96268656716418, + "grad_norm": 0.0, + "learning_rate": 4.75463759145266e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3055 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.63888549804688, + "epoch": 22.970149253731343, + "grad_norm": 0.0, + "learning_rate": 4.7543561530576826e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 3056 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.69444274902344, + "epoch": 22.97761194029851, + "grad_norm": 0.0, + "learning_rate": 4.75407456168517e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3057 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.38888549804688, + "epoch": 22.98507462686567, + "grad_norm": 0.6472920281544475, + "learning_rate": 4.7537928173542317e-07, + "loss": -0.0002, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 3058 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.25, + "epoch": 22.992537313432837, + "grad_norm": 0.0, + "learning_rate": 4.753510920083987e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3059 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.86111450195312, + "epoch": 23.007462686567163, + "grad_norm": 0.0, + "learning_rate": 4.7532288698935654e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 3060 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.97222900390625, + "epoch": 23.01492537313433, + "grad_norm": 0.0, + "learning_rate": 4.752946666802104e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3061 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.36111450195312, + "epoch": 23.02238805970149, + "grad_norm": 0.0, + "learning_rate": 4.7526643108287547e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3062 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.80555725097656, + "epoch": 23.029850746268657, + "grad_norm": 0.0, + "learning_rate": 4.7523818019926776e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3063 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.05555725097656, + "epoch": 23.03731343283582, + "grad_norm": 0.0, + "learning_rate": 4.7520991403130426e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3064 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.7777862548828, + "epoch": 23.044776119402986, + "grad_norm": 0.0, + "learning_rate": 4.7518163258090323e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3065 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.25, + "epoch": 23.05223880597015, + "grad_norm": 0.0, + "learning_rate": 4.7515333584998363e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3066 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.5833282470703, + "epoch": 23.059701492537314, + "grad_norm": 0.0, + "learning_rate": 4.7512502384046573e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3067 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.1388931274414, + "epoch": 23.067164179104477, + "grad_norm": 0.0, + "learning_rate": 4.7509669655427077e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3068 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.9166717529297, + "epoch": 23.074626865671643, + "grad_norm": 0.0, + "learning_rate": 4.750683539933209e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 3069 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.55555725097656, + "epoch": 23.082089552238806, + "grad_norm": 0.0, + "learning_rate": 4.750399961595395e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3070 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.55555725097656, + "epoch": 23.08955223880597, + "grad_norm": 0.0, + "learning_rate": 4.7501162305485086e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3071 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.05555725097656, + "epoch": 23.097014925373134, + "grad_norm": 0.0, + "learning_rate": 4.7498323468118036e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3072 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.22222900390625, + "epoch": 23.104477611940297, + "grad_norm": 0.0, + "learning_rate": 4.749548310404543e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3073 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.3333282470703, + "epoch": 23.111940298507463, + "grad_norm": 1.1334521178898747, + "learning_rate": 4.749264121346003e-07, + "loss": 0.0004, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 3074 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.25, + "epoch": 23.119402985074625, + "grad_norm": 0.0, + "learning_rate": 4.7489797796554664e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3075 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.22222900390625, + "epoch": 23.12686567164179, + "grad_norm": 0.0, + "learning_rate": 4.7486952853522297e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3076 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.47222900390625, + "epoch": 23.134328358208954, + "grad_norm": 0.0, + "learning_rate": 4.7484106384555967e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3077 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.0833282470703, + "epoch": 23.14179104477612, + "grad_norm": 0.5544287715719578, + "learning_rate": 4.7481258389848845e-07, + "loss": -0.0, + "reward": 0.9166666865348816, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 0.0, + "step": 3078 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.5833282470703, + "epoch": 23.149253731343283, + "grad_norm": 0.0, + "learning_rate": 4.747840886959419e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3079 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.19444274902344, + "epoch": 23.15671641791045, + "grad_norm": 0.0, + "learning_rate": 4.7475557823985363e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3080 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.97222900390625, + "epoch": 23.16417910447761, + "grad_norm": 0.0, + "learning_rate": 4.747270525321583e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3081 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.22222900390625, + "epoch": 23.171641791044777, + "grad_norm": 0.0, + "learning_rate": 4.746985115747917e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3082 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.13888549804688, + "epoch": 23.17910447761194, + "grad_norm": 1.1341788411189047, + "learning_rate": 4.7466995536969054e-07, + "loss": -0.0001, + "reward": 0.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3083 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.2777862548828, + "epoch": 23.186567164179106, + "grad_norm": 0.0, + "learning_rate": 4.746413839187926e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3084 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.9166717529297, + "epoch": 23.19402985074627, + "grad_norm": 0.0, + "learning_rate": 4.746127972240367e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3085 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.8333282470703, + "epoch": 23.20149253731343, + "grad_norm": 0.0, + "learning_rate": 4.7458419528736273e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3086 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.94444274902344, + "epoch": 23.208955223880597, + "grad_norm": 0.0, + "learning_rate": 4.7455557811071153e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3087 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.72222900390625, + "epoch": 23.21641791044776, + "grad_norm": 0.0, + "learning_rate": 4.74526945696025e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3088 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.69444274902344, + "epoch": 23.223880597014926, + "grad_norm": 0.0, + "learning_rate": 4.744982980452462e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 3089 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.75, + "epoch": 23.23134328358209, + "grad_norm": 1.46858965195477, + "learning_rate": 4.74469635160319e-07, + "loss": -0.0023, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 3090 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.0277862548828, + "epoch": 23.238805970149254, + "grad_norm": 0.0, + "learning_rate": 4.7444095704318855e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3091 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.83333587646484, + "epoch": 23.246268656716417, + "grad_norm": 0.0, + "learning_rate": 4.744122636958008e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3092 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.13888549804688, + "epoch": 23.253731343283583, + "grad_norm": 0.0, + "learning_rate": 4.743835551201029e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3093 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.7777862548828, + "epoch": 23.261194029850746, + "grad_norm": 1.0402408047657643, + "learning_rate": 4.743548313180429e-07, + "loss": -0.0009, + "reward": 0.472222238779068, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.472222238779068, + "rewards/format_reward": 0.0, + "step": 3094 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.7777862548828, + "epoch": 23.26865671641791, + "grad_norm": 0.0, + "learning_rate": 4.7432609229157007e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3095 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.47222900390625, + "epoch": 23.276119402985074, + "grad_norm": 0.0, + "learning_rate": 4.742973380426345e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3096 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.11111450195312, + "epoch": 23.28358208955224, + "grad_norm": 0.0, + "learning_rate": 4.742685685731874e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3097 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.72222900390625, + "epoch": 23.291044776119403, + "grad_norm": 0.0, + "learning_rate": 4.7423978388518124e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3098 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.3333282470703, + "epoch": 23.298507462686565, + "grad_norm": 0.0, + "learning_rate": 4.742109839805691e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3099 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.0, + "epoch": 23.30597014925373, + "grad_norm": 0.0, + "learning_rate": 4.741821688613053e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3100 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.5277862548828, + "epoch": 23.313432835820894, + "grad_norm": 0.0, + "learning_rate": 4.741533385293453e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3101 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.7777862548828, + "epoch": 23.32089552238806, + "grad_norm": 0.0, + "learning_rate": 4.741244929866454e-07, + "loss": 0.0, + "reward": 0.4444444477558136, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4444444477558136, + "rewards/format_reward": 0.0, + "step": 3102 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.38888549804688, + "epoch": 23.328358208955223, + "grad_norm": 0.0, + "learning_rate": 4.740956322351631e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3103 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.2777862548828, + "epoch": 23.33582089552239, + "grad_norm": 0.0, + "learning_rate": 4.7406675627685684e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3104 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.94444274902344, + "epoch": 23.34328358208955, + "grad_norm": 0.0, + "learning_rate": 4.74037865113686e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3105 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.3333282470703, + "epoch": 23.350746268656717, + "grad_norm": 1.2337635993040688, + "learning_rate": 4.7400895874761126e-07, + "loss": 0.0, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 3106 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.13888549804688, + "epoch": 23.35820895522388, + "grad_norm": 1.6964196065216106, + "learning_rate": 4.73980037180594e-07, + "loss": -0.0009, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 3107 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.63888549804688, + "epoch": 23.365671641791046, + "grad_norm": 0.0, + "learning_rate": 4.7395110041459684e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 3108 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.11111450195312, + "epoch": 23.37313432835821, + "grad_norm": 0.0, + "learning_rate": 4.739221484515835e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3109 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.80555725097656, + "epoch": 23.380597014925375, + "grad_norm": 0.0, + "learning_rate": 4.7389318129351853e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3110 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.63888549804688, + "epoch": 23.388059701492537, + "grad_norm": 0.0, + "learning_rate": 4.7386419894236763e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3111 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.55555725097656, + "epoch": 23.395522388059703, + "grad_norm": 0.0, + "learning_rate": 4.738352014000974e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3112 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.55555725097656, + "epoch": 23.402985074626866, + "grad_norm": 1.941629015700508, + "learning_rate": 4.7380618866867573e-07, + "loss": -0.0041, + "reward": 0.8333333134651184, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 3113 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.8333282470703, + "epoch": 23.41044776119403, + "grad_norm": 1.3295552274530467, + "learning_rate": 4.7377716075007136e-07, + "loss": -0.0045, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 3114 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.30555725097656, + "epoch": 23.417910447761194, + "grad_norm": 0.9208414499646012, + "learning_rate": 4.7374811764625397e-07, + "loss": 0.0025, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 3115 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.05555725097656, + "epoch": 23.425373134328357, + "grad_norm": 0.0, + "learning_rate": 4.7371905935919444e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3116 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.97222900390625, + "epoch": 23.432835820895523, + "grad_norm": 0.6466894186069014, + "learning_rate": 4.736899858908647e-07, + "loss": 0.0, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 3117 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.11111450195312, + "epoch": 23.440298507462686, + "grad_norm": 3.7829822767627936, + "learning_rate": 4.736608972432375e-07, + "loss": -0.0061, + "reward": 0.5833333134651184, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 3118 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.5277862548828, + "epoch": 23.44776119402985, + "grad_norm": 0.0, + "learning_rate": 4.736317934182869e-07, + "loss": 0.0, + "reward": 0.3333333432674408, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 3119 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.72222900390625, + "epoch": 23.455223880597014, + "grad_norm": 0.0, + "learning_rate": 4.736026744179878e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3120 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.9166717529297, + "epoch": 23.46268656716418, + "grad_norm": 0.0, + "learning_rate": 4.7357354024431607e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3121 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.5277862548828, + "epoch": 23.470149253731343, + "grad_norm": 2.735275889960434, + "learning_rate": 4.735443908992489e-07, + "loss": 0.0417, + "reward": 0.9166666865348816, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 0.0, + "step": 3122 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.69444274902344, + "epoch": 23.47761194029851, + "grad_norm": 1.643786907611429, + "learning_rate": 4.7351522638476415e-07, + "loss": -0.0011, + "reward": 0.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 3123 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.1666717529297, + "epoch": 23.48507462686567, + "grad_norm": 0.0, + "learning_rate": 4.73486046702841e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3124 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.9166717529297, + "epoch": 23.492537313432837, + "grad_norm": 0.7599340224222579, + "learning_rate": 4.7345685185545946e-07, + "loss": -0.0, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.2777862548828, + "epoch": 23.5, + "grad_norm": 1.3455426244513462, + "learning_rate": 4.7342764184460074e-07, + "loss": -0.0117, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 3126 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.22222900390625, + "epoch": 23.507462686567163, + "grad_norm": 0.0, + "learning_rate": 4.7339841667224697e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3127 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.75, + "epoch": 23.51492537313433, + "grad_norm": 0.912511260892812, + "learning_rate": 4.733691763403812e-07, + "loss": 0.0002, + "reward": 0.6111111044883728, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.6111111044883728, + "rewards/format_reward": 0.0, + "step": 3128 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.38888549804688, + "epoch": 23.52238805970149, + "grad_norm": 0.0, + "learning_rate": 4.7333992085098785e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3129 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.4166717529297, + "epoch": 23.529850746268657, + "grad_norm": 0.6687786609735712, + "learning_rate": 4.73310650206052e-07, + "loss": 0.0001, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 3130 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.19444274902344, + "epoch": 23.53731343283582, + "grad_norm": 0.0, + "learning_rate": 4.7328136440756e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3131 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.38888549804688, + "epoch": 23.544776119402986, + "grad_norm": 0.0, + "learning_rate": 4.732520634574991e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3132 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.0833282470703, + "epoch": 23.55223880597015, + "grad_norm": 0.0, + "learning_rate": 4.732227473578576e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3133 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.47222900390625, + "epoch": 23.559701492537314, + "grad_norm": 0.0, + "learning_rate": 4.73193416110625e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3134 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.5277862548828, + "epoch": 23.567164179104477, + "grad_norm": 1.1925391582066835, + "learning_rate": 4.731640697177914e-07, + "loss": 0.0023, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 3135 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.8333282470703, + "epoch": 23.574626865671643, + "grad_norm": 1.0972608830764512, + "learning_rate": 4.731347081813485e-07, + "loss": -0.0105, + "reward": 0.5833333134651184, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.5833333134651184, + "rewards/format_reward": 0.0, + "step": 3136 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.9166717529297, + "epoch": 23.582089552238806, + "grad_norm": 0.6158376006185233, + "learning_rate": 4.731053315032886e-07, + "loss": 0.0006, + "reward": 0.5277777910232544, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.5277777910232544, + "rewards/format_reward": 0.0, + "step": 3137 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.4166717529297, + "epoch": 23.58955223880597, + "grad_norm": 0.0, + "learning_rate": 4.7307593968560513e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3138 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.47222900390625, + "epoch": 23.597014925373134, + "grad_norm": 0.5333736017262873, + "learning_rate": 4.7304653273029263e-07, + "loss": -0.0003, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 3139 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.80555725097656, + "epoch": 23.604477611940297, + "grad_norm": 0.0, + "learning_rate": 4.730171106393466e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3140 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.6944580078125, + "epoch": 23.611940298507463, + "grad_norm": 0.8224811416247553, + "learning_rate": 4.729876734147636e-07, + "loss": 0.0001, + "reward": 0.944444477558136, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 0.0, + "step": 3141 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.9166564941406, + "epoch": 23.619402985074625, + "grad_norm": 0.7065447932553024, + "learning_rate": 4.7295822105854117e-07, + "loss": 0.0069, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 3142 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.94444274902344, + "epoch": 23.62686567164179, + "grad_norm": 0.0, + "learning_rate": 4.7292875357267793e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3143 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.63888549804688, + "epoch": 23.634328358208954, + "grad_norm": 0.0, + "learning_rate": 4.728992709591735e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3144 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.6388854980469, + "epoch": 23.64179104477612, + "grad_norm": 0.7648564582177575, + "learning_rate": 4.728697732200284e-07, + "loss": -0.0001, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 3145 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.8888854980469, + "epoch": 23.649253731343283, + "grad_norm": 0.44531790445032066, + "learning_rate": 4.7284026035724454e-07, + "loss": 0.0003, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 3146 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.02777099609375, + "epoch": 23.65671641791045, + "grad_norm": 57.304008393829456, + "learning_rate": 4.7281073237282453e-07, + "loss": 0.0031, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 3147 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.0277862548828, + "epoch": 23.66417910447761, + "grad_norm": 0.7475668514616278, + "learning_rate": 4.727811892687721e-07, + "loss": -0.0391, + "reward": 0.694444477558136, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.0, + "step": 3148 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.75, + "epoch": 23.671641791044777, + "grad_norm": 0.0, + "learning_rate": 4.7275163104709194e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3149 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.27777099609375, + "epoch": 23.67910447761194, + "grad_norm": 0.849479901154119, + "learning_rate": 4.727220577097899e-07, + "loss": -0.0002, + "reward": 0.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3150 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.19444274902344, + "epoch": 23.686567164179106, + "grad_norm": 1.7948298015411586, + "learning_rate": 4.7269246925887274e-07, + "loss": -0.0225, + "reward": 0.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3151 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.25, + "epoch": 23.69402985074627, + "grad_norm": 0.0, + "learning_rate": 4.7266286569634834e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3152 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.5, + "epoch": 23.701492537313435, + "grad_norm": 0.0, + "learning_rate": 4.726332470242255e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3153 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.6111145019531, + "epoch": 23.708955223880597, + "grad_norm": 0.0, + "learning_rate": 4.726036132445141e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3154 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.0, + "epoch": 23.71641791044776, + "grad_norm": 1.1723504942085654, + "learning_rate": 4.7257396435922517e-07, + "loss": 0.0035, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 3155 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.44444274902344, + "epoch": 23.723880597014926, + "grad_norm": 0.0, + "learning_rate": 4.7254430037037056e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3156 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.5833282470703, + "epoch": 23.73134328358209, + "grad_norm": 0.0, + "learning_rate": 4.725146212799632e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3157 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.30555725097656, + "epoch": 23.738805970149254, + "grad_norm": 0.0, + "learning_rate": 4.724849270900171e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3158 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.3055725097656, + "epoch": 23.746268656716417, + "grad_norm": 0.47578817208872365, + "learning_rate": 4.7245521780254725e-07, + "loss": 0.014, + "reward": 0.8333333134651184, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 3159 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.5277862548828, + "epoch": 23.753731343283583, + "grad_norm": 3.6376866312524117, + "learning_rate": 4.7242549341956974e-07, + "loss": 0.003, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 3160 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.47222900390625, + "epoch": 23.761194029850746, + "grad_norm": 0.5920594047147931, + "learning_rate": 4.723957539431015e-07, + "loss": -0.0027, + "reward": 0.9166666865348816, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 0.0, + "step": 3161 + }, + { + "clip_ratio": 0.0, + "completion_length": 347.3333435058594, + "epoch": 23.76865671641791, + "grad_norm": 0.0, + "learning_rate": 4.7236599937516077e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3162 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.38888549804688, + "epoch": 23.776119402985074, + "grad_norm": 0.0, + "learning_rate": 4.7233622971776655e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3163 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.3888854980469, + "epoch": 23.78358208955224, + "grad_norm": 0.0, + "learning_rate": 4.7230644497293893e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3164 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.7777862548828, + "epoch": 23.791044776119403, + "grad_norm": 0.0, + "learning_rate": 4.722766451426992e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3165 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.25, + "epoch": 23.798507462686565, + "grad_norm": 0.0, + "learning_rate": 4.7224683022906947e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3166 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.02777099609375, + "epoch": 23.80597014925373, + "grad_norm": 0.0, + "learning_rate": 4.722170002340729e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3167 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.0, + "epoch": 23.813432835820894, + "grad_norm": 1.0959003832906327, + "learning_rate": 4.7218715515973374e-07, + "loss": -0.0001, + "reward": 0.8333333134651184, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 3168 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.72222900390625, + "epoch": 23.82089552238806, + "grad_norm": 0.0, + "learning_rate": 4.721572950080773e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3169 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.5833435058594, + "epoch": 23.828358208955223, + "grad_norm": 1.1831229959383962, + "learning_rate": 4.7212741978112975e-07, + "loss": -0.0012, + "reward": 0.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 3170 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.4444580078125, + "epoch": 23.83582089552239, + "grad_norm": 0.0, + "learning_rate": 4.720975294809184e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3171 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.5, + "epoch": 23.84328358208955, + "grad_norm": 0.4780988564137973, + "learning_rate": 4.720676241094717e-07, + "loss": 0.0005, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 3172 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.5833435058594, + "epoch": 23.850746268656717, + "grad_norm": 2.6446733535300626, + "learning_rate": 4.7203770366881866e-07, + "loss": 0.0172, + "reward": 0.8333333134651184, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 3173 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.4166564941406, + "epoch": 23.85820895522388, + "grad_norm": 1.4439760383427398, + "learning_rate": 4.7200776816099005e-07, + "loss": 0.0032, + "reward": 0.944444477558136, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 0.0, + "step": 3174 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.75, + "epoch": 23.865671641791046, + "grad_norm": 0.0, + "learning_rate": 4.71977817588017e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3175 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.4444580078125, + "epoch": 23.87313432835821, + "grad_norm": 1.0841053071614744, + "learning_rate": 4.7194785195193197e-07, + "loss": -0.0055, + "reward": 0.8888888955116272, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3176 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.8888854980469, + "epoch": 23.880597014925375, + "grad_norm": 0.6604144276368449, + "learning_rate": 4.7191787125476836e-07, + "loss": 0.0087, + "reward": 0.8888888955116272, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3177 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.5555725097656, + "epoch": 23.888059701492537, + "grad_norm": 4.284659356978204, + "learning_rate": 4.718878754985607e-07, + "loss": 0.0258, + "reward": 0.8333333134651184, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 3178 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.1388854980469, + "epoch": 23.895522388059703, + "grad_norm": 0.0, + "learning_rate": 4.718578646853444e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3179 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.1388854980469, + "epoch": 23.902985074626866, + "grad_norm": 0.0, + "learning_rate": 4.7182783881715593e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3180 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.1666564941406, + "epoch": 23.91044776119403, + "grad_norm": 0.8143486904291478, + "learning_rate": 4.717977978960329e-07, + "loss": 0.0118, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 3181 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.6111145019531, + "epoch": 23.917910447761194, + "grad_norm": 0.8870662108369759, + "learning_rate": 4.717677419240137e-07, + "loss": -0.0145, + "reward": 0.9166666865348816, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 0.0, + "step": 3182 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.6666564941406, + "epoch": 23.925373134328357, + "grad_norm": 0.0, + "learning_rate": 4.71737670903138e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3183 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.1944580078125, + "epoch": 23.932835820895523, + "grad_norm": 0.7327212268373627, + "learning_rate": 4.717075848354464e-07, + "loss": 0.0002, + "reward": 0.9166666865348816, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 0.0, + "step": 3184 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.0833435058594, + "epoch": 23.940298507462686, + "grad_norm": 0.0, + "learning_rate": 4.716774837229804e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3185 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.47222900390625, + "epoch": 23.94776119402985, + "grad_norm": 0.9765094526595239, + "learning_rate": 4.716473675677827e-07, + "loss": -0.0008, + "reward": 0.944444477558136, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 0.0, + "step": 3186 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.6666564941406, + "epoch": 23.955223880597014, + "grad_norm": 0.0, + "learning_rate": 4.7161723637189687e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3187 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.1388854980469, + "epoch": 23.96268656716418, + "grad_norm": 0.0, + "learning_rate": 4.7158709013736766e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3188 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.8888854980469, + "epoch": 23.970149253731343, + "grad_norm": 2.9951492231296504, + "learning_rate": 4.7155692886624063e-07, + "loss": -0.0022, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 3189 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.0555725097656, + "epoch": 23.97761194029851, + "grad_norm": 0.0, + "learning_rate": 4.7152675256056266e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3190 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.4166564941406, + "epoch": 23.98507462686567, + "grad_norm": 1.1898917050221585, + "learning_rate": 4.714965612223813e-07, + "loss": 0.0311, + "reward": 0.8611111044883728, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 3191 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.25, + "epoch": 23.992537313432837, + "grad_norm": 0.7371845376433384, + "learning_rate": 4.714663548537454e-07, + "loss": 0.0136, + "reward": 0.944444477558136, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 0.0, + "step": 3192 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.3055725097656, + "epoch": 24.007462686567163, + "grad_norm": 0.0, + "learning_rate": 4.7143613345670475e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3193 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.6666717529297, + "epoch": 24.01492537313433, + "grad_norm": 0.966297915680395, + "learning_rate": 4.7140589703330994e-07, + "loss": -0.0003, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 3194 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.27777099609375, + "epoch": 24.02238805970149, + "grad_norm": 0.0, + "learning_rate": 4.7137564558561296e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3195 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.8888854980469, + "epoch": 24.029850746268657, + "grad_norm": 0.8425622751173292, + "learning_rate": 4.7134537911566665e-07, + "loss": 0.0156, + "reward": 0.8333333134651184, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 3196 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.1666564941406, + "epoch": 24.03731343283582, + "grad_norm": 0.6408778991558547, + "learning_rate": 4.713150976255247e-07, + "loss": -0.0003, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 3197 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.3055725097656, + "epoch": 24.044776119402986, + "grad_norm": 0.0, + "learning_rate": 4.71284801117242e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3198 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.3888854980469, + "epoch": 24.05223880597015, + "grad_norm": 0.0, + "learning_rate": 4.712544895928745e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3199 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.9444580078125, + "epoch": 24.059701492537314, + "grad_norm": 0.7063627044628906, + "learning_rate": 4.7122416305447917e-07, + "loss": 0.0005, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 3200 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.7777862548828, + "epoch": 24.067164179104477, + "grad_norm": 0.0, + "learning_rate": 4.7119382150411374e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3201 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.1388854980469, + "epoch": 24.074626865671643, + "grad_norm": 0.0, + "learning_rate": 4.711634649438373e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3202 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.38888549804688, + "epoch": 24.082089552238806, + "grad_norm": 0.0, + "learning_rate": 4.711330933757097e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3203 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.1666717529297, + "epoch": 24.08955223880597, + "grad_norm": 0.0, + "learning_rate": 4.7110270680179196e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3204 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.0277862548828, + "epoch": 24.097014925373134, + "grad_norm": 0.0, + "learning_rate": 4.710723052241461e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3205 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.36111450195312, + "epoch": 24.104477611940297, + "grad_norm": 0.0, + "learning_rate": 4.71041888644835e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3206 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.8888854980469, + "epoch": 24.111940298507463, + "grad_norm": 0.808965509776656, + "learning_rate": 4.710114570659229e-07, + "loss": -0.0125, + "reward": 0.9166666865348816, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 0.0, + "step": 3207 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.38888549804688, + "epoch": 24.119402985074625, + "grad_norm": 0.0, + "learning_rate": 4.709810104894747e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3208 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.8888854980469, + "epoch": 24.12686567164179, + "grad_norm": 0.0, + "learning_rate": 4.709505489175565e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3209 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.72222900390625, + "epoch": 24.134328358208954, + "grad_norm": 0.0, + "learning_rate": 4.709200723522353e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3210 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.0, + "epoch": 24.14179104477612, + "grad_norm": 0.0, + "learning_rate": 4.708895807955793e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3211 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.0833435058594, + "epoch": 24.149253731343283, + "grad_norm": 0.0, + "learning_rate": 4.7085907424965767e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3212 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.3888854980469, + "epoch": 24.15671641791045, + "grad_norm": 0.0, + "learning_rate": 4.708285527165404e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3213 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.4166717529297, + "epoch": 24.16417910447761, + "grad_norm": 0.3374101048832469, + "learning_rate": 4.7079801619829873e-07, + "loss": 0.0191, + "reward": 0.9166666865348816, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 0.0, + "step": 3214 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.3333435058594, + "epoch": 24.171641791044777, + "grad_norm": 0.0, + "learning_rate": 4.707674646970048e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3215 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.0, + "epoch": 24.17910447761194, + "grad_norm": 0.6926332205863206, + "learning_rate": 4.707368982147317e-07, + "loss": -0.0001, + "reward": 0.8333333134651184, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 3216 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.5277862548828, + "epoch": 24.186567164179106, + "grad_norm": 0.0, + "learning_rate": 4.7070631675355374e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3217 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.0, + "epoch": 24.19402985074627, + "grad_norm": 0.0, + "learning_rate": 4.7067572031554626e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3218 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.1944580078125, + "epoch": 24.20149253731343, + "grad_norm": 0.0, + "learning_rate": 4.7064510890278517e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3219 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.0, + "epoch": 24.208955223880597, + "grad_norm": 0.0, + "learning_rate": 4.7061448251734804e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3220 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.22222900390625, + "epoch": 24.21641791044776, + "grad_norm": 0.0, + "learning_rate": 4.705838411613129e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3221 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.61111450195312, + "epoch": 24.223880597014926, + "grad_norm": 0.5367139454044416, + "learning_rate": 4.7055318483675923e-07, + "loss": 0.0, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 3222 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.5833435058594, + "epoch": 24.23134328358209, + "grad_norm": 0.0, + "learning_rate": 4.705225135457671e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3223 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.6388854980469, + "epoch": 24.238805970149254, + "grad_norm": 1.145945973433068, + "learning_rate": 4.70491827290418e-07, + "loss": 0.0084, + "reward": 0.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3224 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.72222900390625, + "epoch": 24.246268656716417, + "grad_norm": 0.0, + "learning_rate": 4.7046112607279417e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3225 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.0555725097656, + "epoch": 24.253731343283583, + "grad_norm": 0.0, + "learning_rate": 4.7043040989497894e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3226 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.2777862548828, + "epoch": 24.261194029850746, + "grad_norm": 0.0, + "learning_rate": 4.7039967875905685e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3227 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.69444274902344, + "epoch": 24.26865671641791, + "grad_norm": 0.0, + "learning_rate": 4.70368932667113e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3228 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.61111450195312, + "epoch": 24.276119402985074, + "grad_norm": 0.0, + "learning_rate": 4.70338171621234e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3229 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.55555725097656, + "epoch": 24.28358208955224, + "grad_norm": 0.3325365086738732, + "learning_rate": 4.703073956235071e-07, + "loss": -0.0015, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 3230 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.8333435058594, + "epoch": 24.291044776119403, + "grad_norm": 1.7033580751201116, + "learning_rate": 4.702766046760208e-07, + "loss": 0.0018, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 3231 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.4166717529297, + "epoch": 24.298507462686565, + "grad_norm": 0.0, + "learning_rate": 4.702457987808645e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3232 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.3055725097656, + "epoch": 24.30597014925373, + "grad_norm": 0.0, + "learning_rate": 4.7021497794012867e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3233 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.36111450195312, + "epoch": 24.313432835820894, + "grad_norm": 0.0, + "learning_rate": 4.701841421559048e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3234 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.5833282470703, + "epoch": 24.32089552238806, + "grad_norm": 0.9209770930767589, + "learning_rate": 4.7015329143028526e-07, + "loss": -0.0005, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 3235 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.47222900390625, + "epoch": 24.328358208955223, + "grad_norm": 0.0, + "learning_rate": 4.7012242576536366e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3236 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.0277862548828, + "epoch": 24.33582089552239, + "grad_norm": 0.8005147991212643, + "learning_rate": 4.7009154516323436e-07, + "loss": -0.0008, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 3237 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.13888549804688, + "epoch": 24.34328358208955, + "grad_norm": 0.0, + "learning_rate": 4.70060649625993e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3238 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.7777862548828, + "epoch": 24.350746268656717, + "grad_norm": 0.0, + "learning_rate": 4.7002973915573605e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3239 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.22222900390625, + "epoch": 24.35820895522388, + "grad_norm": 0.0, + "learning_rate": 4.699988137545611e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3240 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.1666717529297, + "epoch": 24.365671641791046, + "grad_norm": 0.0, + "learning_rate": 4.6996787342456667e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3241 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.86111450195312, + "epoch": 24.37313432835821, + "grad_norm": 0.0, + "learning_rate": 4.6993691816785236e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3242 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.0833282470703, + "epoch": 24.380597014925375, + "grad_norm": 0.0, + "learning_rate": 4.699059479865187e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3243 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.86111450195312, + "epoch": 24.388059701492537, + "grad_norm": 0.0, + "learning_rate": 4.6987496288266726e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3244 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.75, + "epoch": 24.395522388059703, + "grad_norm": 0.0, + "learning_rate": 4.6984396285840077e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3245 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.72222900390625, + "epoch": 24.402985074626866, + "grad_norm": 2.4192820075132278, + "learning_rate": 4.698129479158227e-07, + "loss": 0.0003, + "reward": 0.9166666865348816, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 0.0, + "step": 3246 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.61111450195312, + "epoch": 24.41044776119403, + "grad_norm": 0.0, + "learning_rate": 4.697819180570378e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3247 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.94444274902344, + "epoch": 24.417910447761194, + "grad_norm": 1.0349820937309708, + "learning_rate": 4.6975087328415173e-07, + "loss": 0.0001, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 3248 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.88888549804688, + "epoch": 24.425373134328357, + "grad_norm": 0.0, + "learning_rate": 4.6971981359927106e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3249 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.05555725097656, + "epoch": 24.432835820895523, + "grad_norm": 0.0, + "learning_rate": 4.6968873900450346e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3250 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.11111450195312, + "epoch": 24.440298507462686, + "grad_norm": 0.5802802222200324, + "learning_rate": 4.6965764950195763e-07, + "loss": -0.0257, + "reward": 0.8333333134651184, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 3251 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.5, + "epoch": 24.44776119402985, + "grad_norm": 0.4011080075950445, + "learning_rate": 4.696265450937432e-07, + "loss": 0.0007, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 3252 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.55555725097656, + "epoch": 24.455223880597014, + "grad_norm": 1.7360648980993771, + "learning_rate": 4.6959542578197107e-07, + "loss": 0.0317, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 3253 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.63888549804688, + "epoch": 24.46268656716418, + "grad_norm": 0.5100332714536306, + "learning_rate": 4.6956429156875274e-07, + "loss": -0.0185, + "reward": 0.9166666865348816, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 0.0, + "step": 3254 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.88888549804688, + "epoch": 24.470149253731343, + "grad_norm": 0.0, + "learning_rate": 4.69533142456201e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3255 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.5833282470703, + "epoch": 24.47761194029851, + "grad_norm": 0.0, + "learning_rate": 4.695019784464297e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3256 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.0277862548828, + "epoch": 24.48507462686567, + "grad_norm": 0.0, + "learning_rate": 4.6947079954155345e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3257 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.44444274902344, + "epoch": 24.492537313432837, + "grad_norm": 0.0, + "learning_rate": 4.6943960574368804e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3258 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.1944580078125, + "epoch": 24.5, + "grad_norm": 0.0, + "learning_rate": 4.694083970549502e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3259 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.47222900390625, + "epoch": 24.507462686567163, + "grad_norm": 0.0, + "learning_rate": 4.6937717347745776e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3260 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.13888549804688, + "epoch": 24.51492537313433, + "grad_norm": 0.0, + "learning_rate": 4.693459350133295e-07, + "loss": 0.0, + "reward": 0.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 0.0, + "step": 3261 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.5277862548828, + "epoch": 24.52238805970149, + "grad_norm": 0.0, + "learning_rate": 4.6931468166468533e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3262 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.0, + "epoch": 24.529850746268657, + "grad_norm": 0.0, + "learning_rate": 4.692834134336459e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3263 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.0833282470703, + "epoch": 24.53731343283582, + "grad_norm": 0.0, + "learning_rate": 4.6925213032233314e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3264 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.63888549804688, + "epoch": 24.544776119402986, + "grad_norm": 0.8892229105844511, + "learning_rate": 4.6922083233286967e-07, + "loss": 0.0006, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 3265 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.63888549804688, + "epoch": 24.55223880597015, + "grad_norm": 0.0, + "learning_rate": 4.6918951946737963e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3266 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.88888549804688, + "epoch": 24.559701492537314, + "grad_norm": 0.0, + "learning_rate": 4.6915819172798766e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3267 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.69444274902344, + "epoch": 24.567164179104477, + "grad_norm": 0.0, + "learning_rate": 4.6912684911681974e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3268 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.4166717529297, + "epoch": 24.574626865671643, + "grad_norm": 1.0036993761415747, + "learning_rate": 4.690954916360026e-07, + "loss": -0.0049, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 3269 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.05555725097656, + "epoch": 24.582089552238806, + "grad_norm": 0.5451984702912915, + "learning_rate": 4.690641192876642e-07, + "loss": -0.0007, + "reward": 0.944444477558136, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 0.0, + "step": 3270 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.05555725097656, + "epoch": 24.58955223880597, + "grad_norm": 0.5665336820686634, + "learning_rate": 4.6903273207393345e-07, + "loss": -0.0002, + "reward": 0.9166666865348816, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 0.0, + "step": 3271 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.30555725097656, + "epoch": 24.597014925373134, + "grad_norm": 0.0, + "learning_rate": 4.690013299969402e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3272 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.36111450195312, + "epoch": 24.604477611940297, + "grad_norm": 0.0, + "learning_rate": 4.689699130588153e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3273 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.80555725097656, + "epoch": 24.611940298507463, + "grad_norm": 0.0, + "learning_rate": 4.6893848126169084e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3274 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.5833282470703, + "epoch": 24.619402985074625, + "grad_norm": 0.0, + "learning_rate": 4.689070346076995e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3275 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.55555725097656, + "epoch": 24.62686567164179, + "grad_norm": 1.6389012612970784, + "learning_rate": 4.688755730989754e-07, + "loss": 0.0001, + "reward": 0.8333333134651184, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.0, + "step": 3276 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.44444274902344, + "epoch": 24.634328358208954, + "grad_norm": 0.0, + "learning_rate": 4.6884409673765346e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3277 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.25, + "epoch": 24.64179104477612, + "grad_norm": 1.0488778293894652, + "learning_rate": 4.6881260552586945e-07, + "loss": 0.0002, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 3278 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.0277862548828, + "epoch": 24.649253731343283, + "grad_norm": 1.1083370868625093, + "learning_rate": 4.687810994657605e-07, + "loss": -0.0312, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 3279 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.5, + "epoch": 24.65671641791045, + "grad_norm": 0.0, + "learning_rate": 4.687495785594645e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3280 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.4444580078125, + "epoch": 24.66417910447761, + "grad_norm": 0.0, + "learning_rate": 4.687180428091204e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3281 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.52777099609375, + "epoch": 24.671641791044777, + "grad_norm": 0.43032284629478373, + "learning_rate": 4.6868649221686826e-07, + "loss": 0.0036, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 3282 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.4444580078125, + "epoch": 24.67910447761194, + "grad_norm": 0.7252329775443472, + "learning_rate": 4.686549267848489e-07, + "loss": 0.0034, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 3283 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.1944580078125, + "epoch": 24.686567164179106, + "grad_norm": 0.0, + "learning_rate": 4.6862334651520455e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3284 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.44444274902344, + "epoch": 24.69402985074627, + "grad_norm": 0.0, + "learning_rate": 4.685917514100779e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3285 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.77777099609375, + "epoch": 24.701492537313435, + "grad_norm": 0.6129934895787099, + "learning_rate": 4.6856014147161317e-07, + "loss": 0.0012, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 3286 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.38888549804688, + "epoch": 24.708955223880597, + "grad_norm": 0.0, + "learning_rate": 4.6852851670195525e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3287 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.0833282470703, + "epoch": 24.71641791044776, + "grad_norm": 0.0, + "learning_rate": 4.6849687710325024e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3288 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.0833435058594, + "epoch": 24.723880597014926, + "grad_norm": 0.7345610752566104, + "learning_rate": 4.684652226776452e-07, + "loss": -0.0007, + "reward": 0.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.0, + "step": 3289 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.86111450195312, + "epoch": 24.73134328358209, + "grad_norm": 0.9942988523188596, + "learning_rate": 4.6843355342728804e-07, + "loss": 0.0, + "reward": 0.944444477558136, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 0.0, + "step": 3290 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.52777099609375, + "epoch": 24.738805970149254, + "grad_norm": 1.5705491975364432, + "learning_rate": 4.684018693543278e-07, + "loss": 0.0007, + "reward": 0.944444477558136, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 0.0, + "step": 3291 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.0277862548828, + "epoch": 24.746268656716417, + "grad_norm": 0.0, + "learning_rate": 4.6837017046091455e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3292 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.0833282470703, + "epoch": 24.753731343283583, + "grad_norm": 0.0, + "learning_rate": 4.6833845674919935e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3293 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.4166717529297, + "epoch": 24.761194029850746, + "grad_norm": 0.0, + "learning_rate": 4.6830672822133435e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3294 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.8333282470703, + "epoch": 24.76865671641791, + "grad_norm": 0.0, + "learning_rate": 4.682749848794724e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3295 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.55555725097656, + "epoch": 24.776119402985074, + "grad_norm": 0.0, + "learning_rate": 4.6824322672576755e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3296 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.5277862548828, + "epoch": 24.78358208955224, + "grad_norm": 0.0, + "learning_rate": 4.682114537623751e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3297 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.9166564941406, + "epoch": 24.791044776119403, + "grad_norm": 0.0, + "learning_rate": 4.6817966599145096e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3298 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.30555725097656, + "epoch": 24.798507462686565, + "grad_norm": 0.0, + "learning_rate": 4.6814786341515223e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3299 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.6666717529297, + "epoch": 24.80597014925373, + "grad_norm": 1.0015038595672192, + "learning_rate": 4.6811604603563694e-07, + "loss": -0.004, + "reward": 0.8888888955116272, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3300 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.25, + "epoch": 24.813432835820894, + "grad_norm": 0.38164284080083427, + "learning_rate": 4.680842138550642e-07, + "loss": 0.0, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 3301 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.69444274902344, + "epoch": 24.82089552238806, + "grad_norm": 0.0, + "learning_rate": 4.680523668755943e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3302 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.1388854980469, + "epoch": 24.828358208955223, + "grad_norm": 0.0, + "learning_rate": 4.68020505099388e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3303 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.80555725097656, + "epoch": 24.83582089552239, + "grad_norm": 0.0, + "learning_rate": 4.6798862852860764e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3304 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.1111145019531, + "epoch": 24.84328358208955, + "grad_norm": 0.0, + "learning_rate": 4.6795673716541615e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3305 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.5555725097656, + "epoch": 24.850746268656717, + "grad_norm": 0.0, + "learning_rate": 4.6792483101197776e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3306 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.5833282470703, + "epoch": 24.85820895522388, + "grad_norm": 0.0, + "learning_rate": 4.6789291007045753e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3307 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.0833282470703, + "epoch": 24.865671641791046, + "grad_norm": 0.3424273588333425, + "learning_rate": 4.6786097434302166e-07, + "loss": 0.0002, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 3308 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.8333282470703, + "epoch": 24.87313432835821, + "grad_norm": 0.0, + "learning_rate": 4.6782902383183705e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3309 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.8333435058594, + "epoch": 24.880597014925375, + "grad_norm": 0.0, + "learning_rate": 4.6779705853907195e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3310 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.38888549804688, + "epoch": 24.888059701492537, + "grad_norm": 0.0, + "learning_rate": 4.677650784668956e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3311 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.5, + "epoch": 24.895522388059703, + "grad_norm": 0.6784299445566776, + "learning_rate": 4.677330836174779e-07, + "loss": -0.0002, + "reward": 0.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.0, + "step": 3312 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.69444274902344, + "epoch": 24.902985074626866, + "grad_norm": 0.0, + "learning_rate": 4.6770107399299013e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3313 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.72222900390625, + "epoch": 24.91044776119403, + "grad_norm": 0.0, + "learning_rate": 4.676690495956044e-07, + "loss": 0.0, + "reward": 0.6666666865348816, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 3314 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.1666717529297, + "epoch": 24.917910447761194, + "grad_norm": 0.0, + "learning_rate": 4.676370104274937e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3315 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.1111145019531, + "epoch": 24.925373134328357, + "grad_norm": 0.3649926277246423, + "learning_rate": 4.676049564908324e-07, + "loss": 0.0001, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 3316 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.3611145019531, + "epoch": 24.932835820895523, + "grad_norm": 0.5442305620359662, + "learning_rate": 4.675728877877955e-07, + "loss": -0.0266, + "reward": 0.944444477558136, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 0.0, + "step": 3317 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.72222900390625, + "epoch": 24.940298507462686, + "grad_norm": 0.0, + "learning_rate": 4.675408043205591e-07, + "loss": 0.0, + "reward": 0.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.0, + "step": 3318 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.5277862548828, + "epoch": 24.94776119402985, + "grad_norm": 0.6583702261025948, + "learning_rate": 4.6750870609130046e-07, + "loss": -0.0091, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 3319 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.25, + "epoch": 24.955223880597014, + "grad_norm": 0.6879589727027147, + "learning_rate": 4.6747659310219757e-07, + "loss": -0.0007, + "reward": 0.944444477558136, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 0.0, + "step": 3320 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.6944580078125, + "epoch": 24.96268656716418, + "grad_norm": 0.0, + "learning_rate": 4.674444653554297e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3321 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.86111450195312, + "epoch": 24.970149253731343, + "grad_norm": 0.0, + "learning_rate": 4.6741232285317693e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3322 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.5, + "epoch": 24.97761194029851, + "grad_norm": 0.0, + "learning_rate": 4.673801655976205e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3323 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.0555725097656, + "epoch": 24.98507462686567, + "grad_norm": 2.4127432888145757, + "learning_rate": 4.673479935909424e-07, + "loss": -0.0092, + "reward": 0.944444477558136, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 0.0, + "step": 3324 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.1666717529297, + "epoch": 24.992537313432837, + "grad_norm": 0.564306371786623, + "learning_rate": 4.673158068353259e-07, + "loss": 0.0, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 3325 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.44444274902344, + "epoch": 25.007462686567163, + "grad_norm": 0.7621898471315522, + "learning_rate": 4.6728360533295514e-07, + "loss": 0.0011, + "reward": 0.944444477558136, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 0.0, + "step": 3326 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.8611145019531, + "epoch": 25.01492537313433, + "grad_norm": 0.0, + "learning_rate": 4.6725138908601526e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3327 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.6111145019531, + "epoch": 25.02238805970149, + "grad_norm": 0.0, + "learning_rate": 4.6721915809669235e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3328 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.5277862548828, + "epoch": 25.029850746268657, + "grad_norm": 0.0, + "learning_rate": 4.6718691236717355e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3329 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.1944580078125, + "epoch": 25.03731343283582, + "grad_norm": 1.9124289687103764, + "learning_rate": 4.6715465189964723e-07, + "loss": 0.0006, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 3330 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.4166564941406, + "epoch": 25.044776119402986, + "grad_norm": 0.0, + "learning_rate": 4.671223766963023e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3331 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.27777099609375, + "epoch": 25.05223880597015, + "grad_norm": 0.0, + "learning_rate": 4.670900867593289e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3332 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.3611145019531, + "epoch": 25.059701492537314, + "grad_norm": 1.1272453091746863, + "learning_rate": 4.670577820909184e-07, + "loss": 0.0, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 3333 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.5, + "epoch": 25.067164179104477, + "grad_norm": 0.3429767476929757, + "learning_rate": 4.6702546269326277e-07, + "loss": -0.0001, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.0, + "step": 3334 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.22222900390625, + "epoch": 25.074626865671643, + "grad_norm": 0.5569452291780078, + "learning_rate": 4.6699312856855524e-07, + "loss": 0.0001, + "reward": 0.9166666865348816, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 0.0, + "step": 3335 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.27777099609375, + "epoch": 25.082089552238806, + "grad_norm": 0.0, + "learning_rate": 4.6696077971898985e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3336 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.5833435058594, + "epoch": 25.08955223880597, + "grad_norm": 0.8072459215691965, + "learning_rate": 4.6692841614676194e-07, + "loss": 0.0099, + "reward": 0.9166666865348816, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 0.0, + "step": 3337 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.1388854980469, + "epoch": 25.097014925373134, + "grad_norm": 0.6218259720695846, + "learning_rate": 4.668960378540675e-07, + "loss": 0.0221, + "reward": 0.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.02777777798473835, + "step": 3338 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.5555725097656, + "epoch": 25.104477611940297, + "grad_norm": 0.0, + "learning_rate": 4.668636448431037e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3339 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.1944580078125, + "epoch": 25.111940298507463, + "grad_norm": 0.6852158010836517, + "learning_rate": 4.6683123711606876e-07, + "loss": 0.0, + "reward": 0.944444477558136, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 0.0, + "step": 3340 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.27777099609375, + "epoch": 25.119402985074625, + "grad_norm": 0.0, + "learning_rate": 4.6679881467516177e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3341 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.3611145019531, + "epoch": 25.12686567164179, + "grad_norm": 0.6311121335504503, + "learning_rate": 4.6676637752258283e-07, + "loss": 0.0328, + "reward": 1.0277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.02777777798473835, + "step": 3342 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.77777099609375, + "epoch": 25.134328358208954, + "grad_norm": 0.4209105677967857, + "learning_rate": 4.667339256605332e-07, + "loss": 0.0002, + "reward": 0.9166666865348816, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 0.0, + "step": 3343 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.97222900390625, + "epoch": 25.14179104477612, + "grad_norm": 0.0, + "learning_rate": 4.667014590912149e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3344 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.5, + "epoch": 25.149253731343283, + "grad_norm": 0.3219574387552652, + "learning_rate": 4.666689778168311e-07, + "loss": -0.0004, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 3345 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.94444274902344, + "epoch": 25.15671641791045, + "grad_norm": 0.0, + "learning_rate": 4.6663648183958604e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3346 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.27777099609375, + "epoch": 25.16417910447761, + "grad_norm": 0.0, + "learning_rate": 4.666039711616847e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3347 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.3055725097656, + "epoch": 25.171641791044777, + "grad_norm": 0.9922789652388317, + "learning_rate": 4.6657144578533324e-07, + "loss": -0.0247, + "reward": 0.8611111044883728, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.0, + "step": 3348 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.25, + "epoch": 25.17910447761194, + "grad_norm": 0.0, + "learning_rate": 4.6653890571273883e-07, + "loss": 0.0, + "reward": 0.8888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.0, + "step": 3349 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.1388854980469, + "epoch": 25.186567164179106, + "grad_norm": 0.0, + "learning_rate": 4.6650635094610966e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3350 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.3333282470703, + "epoch": 25.19402985074627, + "grad_norm": 1.2018689290399192, + "learning_rate": 4.664737814876547e-07, + "loss": 0.0204, + "reward": 1.0555555820465088, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0555555559694767, + "step": 3351 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.9166564941406, + "epoch": 25.20149253731343, + "grad_norm": 1.441125435727988, + "learning_rate": 4.664411973395842e-07, + "loss": 0.0002, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.1111111119389534, + "step": 3352 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.55555725097656, + "epoch": 25.208955223880597, + "grad_norm": 1.0006447951696533, + "learning_rate": 4.6640859850410917e-07, + "loss": 0.023, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.0, + "step": 3353 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.11111450195312, + "epoch": 25.21641791044776, + "grad_norm": 5.995163508936301, + "learning_rate": 4.6637598498344185e-07, + "loss": 0.0242, + "reward": 1.138888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.1388888955116272, + "step": 3354 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.72222900390625, + "epoch": 25.223880597014926, + "grad_norm": 1.4795892200335383, + "learning_rate": 4.663433567797952e-07, + "loss": 0.0005, + "reward": 0.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.2222222238779068, + "step": 3355 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.9166564941406, + "epoch": 25.23134328358209, + "grad_norm": 0.6341762357915749, + "learning_rate": 4.6631071389538343e-07, + "loss": 0.0354, + "reward": 0.9722222089767456, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.1388888955116272, + "step": 3356 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.5, + "epoch": 25.238805970149254, + "grad_norm": 1.443449721426318, + "learning_rate": 4.662780563324217e-07, + "loss": 0.0428, + "reward": 1.638888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.6388888955116272, + "step": 3357 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.47222900390625, + "epoch": 25.246268656716417, + "grad_norm": 0.6528418931027126, + "learning_rate": 4.6624538409312584e-07, + "loss": 0.0013, + "reward": 1.5277777910232544, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.6388888955116272, + "step": 3358 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.6666717529297, + "epoch": 25.253731343283583, + "grad_norm": 3.1311442452753075, + "learning_rate": 4.662126971797132e-07, + "loss": -0.0116, + "reward": 1.888888955116272, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3359 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.6666717529297, + "epoch": 25.261194029850746, + "grad_norm": 0.0, + "learning_rate": 4.6617999559440187e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3360 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.0277862548828, + "epoch": 25.26865671641791, + "grad_norm": 2.7037261225737628, + "learning_rate": 4.6614727933941077e-07, + "loss": 0.001, + "reward": 1.9444444179534912, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 3361 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.11111450195312, + "epoch": 25.276119402985074, + "grad_norm": 0.0, + "learning_rate": 4.6611454841696014e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3362 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.9166717529297, + "epoch": 25.28358208955224, + "grad_norm": 1.0522612768607555, + "learning_rate": 4.660818028292709e-07, + "loss": -0.0002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9722222089767456, + "step": 3363 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.05555725097656, + "epoch": 25.291044776119403, + "grad_norm": 1.5706210678182255, + "learning_rate": 4.6604904257856526e-07, + "loss": -0.0004, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3364 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.75, + "epoch": 25.298507462686565, + "grad_norm": 1.9290055794516499, + "learning_rate": 4.660162676670662e-07, + "loss": 0.0011, + "reward": 1.888888955116272, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3365 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.7777862548828, + "epoch": 25.30597014925373, + "grad_norm": 0.6729551947130085, + "learning_rate": 4.6598347809699783e-07, + "loss": -0.0003, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3366 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.63888549804688, + "epoch": 25.313432835820894, + "grad_norm": 1.2522650898638672, + "learning_rate": 4.659506738705852e-07, + "loss": 0.0353, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 3367 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.80555725097656, + "epoch": 25.32089552238806, + "grad_norm": 0.8561030951115619, + "learning_rate": 4.6591785499005433e-07, + "loss": 0.0, + "reward": 1.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3368 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.2777862548828, + "epoch": 25.328358208955223, + "grad_norm": 0.9340499745851076, + "learning_rate": 4.658850214576323e-07, + "loss": 0.0002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3369 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.86111450195312, + "epoch": 25.33582089552239, + "grad_norm": 2.2969288420002023, + "learning_rate": 4.6585217327554707e-07, + "loss": 0.0669, + "reward": 1.8333333730697632, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 3370 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.63888549804688, + "epoch": 25.34328358208955, + "grad_norm": 1.2619110649730003, + "learning_rate": 4.658193104460278e-07, + "loss": -0.022, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3371 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.3333282470703, + "epoch": 25.350746268656717, + "grad_norm": 0.6377266276664889, + "learning_rate": 4.657864329713044e-07, + "loss": 0.003, + "reward": 1.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3372 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.63888549804688, + "epoch": 25.35820895522388, + "grad_norm": 2.3545694770906387, + "learning_rate": 4.6575354085360796e-07, + "loss": 0.0305, + "reward": 1.9166666269302368, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 3373 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.88888549804688, + "epoch": 25.365671641791046, + "grad_norm": 1.3817332100770454, + "learning_rate": 4.657206340951705e-07, + "loss": -0.0387, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 3374 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.80555725097656, + "epoch": 25.37313432835821, + "grad_norm": 3.8700711016710976, + "learning_rate": 4.656877126982249e-07, + "loss": -0.0001, + "reward": 1.9166666269302368, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 3375 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.1666717529297, + "epoch": 25.380597014925375, + "grad_norm": 0.0, + "learning_rate": 4.656547766650054e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3376 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.25, + "epoch": 25.388059701492537, + "grad_norm": 0.7512853034368224, + "learning_rate": 4.656218259977468e-07, + "loss": 0.0008, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 3377 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.69444274902344, + "epoch": 25.395522388059703, + "grad_norm": 1.126371614117674, + "learning_rate": 4.655888606986852e-07, + "loss": 0.0003, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 3378 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.80555725097656, + "epoch": 25.402985074626866, + "grad_norm": 1.9417869006064288, + "learning_rate": 4.655558807700575e-07, + "loss": -0.0014, + "reward": 1.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3379 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.0, + "epoch": 25.41044776119403, + "grad_norm": 2.2260751952454543, + "learning_rate": 4.6552288621410165e-07, + "loss": -0.0347, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 3380 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.44444274902344, + "epoch": 25.417910447761194, + "grad_norm": 2.2386571780716777, + "learning_rate": 4.654898770330566e-07, + "loss": 0.0177, + "reward": 1.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3381 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.94444274902344, + "epoch": 25.425373134328357, + "grad_norm": 0.0, + "learning_rate": 4.654568532291625e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3382 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.11111450195312, + "epoch": 25.432835820895523, + "grad_norm": 2.3983831338344315, + "learning_rate": 4.654238148046601e-07, + "loss": 0.0063, + "reward": 1.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 1.0, + "step": 3383 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.5833282470703, + "epoch": 25.440298507462686, + "grad_norm": 0.0, + "learning_rate": 4.653907617617915e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3384 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.0277862548828, + "epoch": 25.44776119402985, + "grad_norm": 5.622597124697866, + "learning_rate": 4.6535769410279944e-07, + "loss": -0.0227, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 3385 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.61111450195312, + "epoch": 25.455223880597014, + "grad_norm": 0.0, + "learning_rate": 4.65324611829928e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3386 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.91666412353516, + "epoch": 25.46268656716418, + "grad_norm": 0.0, + "learning_rate": 4.6529151494542206e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3387 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.63888549804688, + "epoch": 25.470149253731343, + "grad_norm": 0.0, + "learning_rate": 4.6525840345152754e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3388 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.5, + "epoch": 25.47761194029851, + "grad_norm": 0.8213726680491674, + "learning_rate": 4.6522527735049134e-07, + "loss": -0.0002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3389 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.13888549804688, + "epoch": 25.48507462686567, + "grad_norm": 3.147100226891364, + "learning_rate": 4.6519213664456123e-07, + "loss": -0.0063, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3390 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.19444274902344, + "epoch": 25.492537313432837, + "grad_norm": 1.0192735860336954, + "learning_rate": 4.651589813359863e-07, + "loss": 0.0, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3391 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.27777862548828, + "epoch": 25.5, + "grad_norm": 0.0, + "learning_rate": 4.651258114270163e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3392 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.97222900390625, + "epoch": 25.507462686567163, + "grad_norm": 0.0, + "learning_rate": 4.650926269199021e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3393 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.7777862548828, + "epoch": 25.51492537313433, + "grad_norm": 1.647799377783055, + "learning_rate": 4.6505942781689556e-07, + "loss": 0.0081, + "reward": 1.8611111640930176, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3394 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.0833282470703, + "epoch": 25.52238805970149, + "grad_norm": 2.367657386478141, + "learning_rate": 4.6502621412024957e-07, + "loss": -0.009, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3395 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.75, + "epoch": 25.529850746268657, + "grad_norm": 1.8944251154617728, + "learning_rate": 4.64992985832218e-07, + "loss": -0.0088, + "reward": 1.8055555820465088, + "reward_std": 0.21749190986156464, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.9722222089767456, + "step": 3396 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.3333282470703, + "epoch": 25.53731343283582, + "grad_norm": 1.1386381963719117, + "learning_rate": 4.6495974295505545e-07, + "loss": -0.0012, + "reward": 1.9166666269302368, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 3397 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.44444274902344, + "epoch": 25.544776119402986, + "grad_norm": 1.2736811420183187, + "learning_rate": 4.64926485491018e-07, + "loss": -0.0278, + "reward": 1.9444444179534912, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 3398 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.22222900390625, + "epoch": 25.55223880597015, + "grad_norm": 0.7599719121980154, + "learning_rate": 4.648932134423623e-07, + "loss": -0.0011, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3399 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.9166717529297, + "epoch": 25.559701492537314, + "grad_norm": 131.70212273055708, + "learning_rate": 4.6485992681134633e-07, + "loss": 0.0188, + "reward": 1.8333333730697632, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 0.9722222089767456, + "step": 3400 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.5833282470703, + "epoch": 25.567164179104477, + "grad_norm": 0.9495264156145543, + "learning_rate": 4.648266256002286e-07, + "loss": -0.0002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3401 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.38888549804688, + "epoch": 25.574626865671643, + "grad_norm": 59.36823045314648, + "learning_rate": 4.6479330981126905e-07, + "loss": 0.0095, + "reward": 1.7777777910232544, + "reward_std": 0.20183296501636505, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 0.9722222089767456, + "step": 3402 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.1666717529297, + "epoch": 25.582089552238806, + "grad_norm": 3.426181458514578, + "learning_rate": 4.6475997944672853e-07, + "loss": 0.0002, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 3403 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.13888549804688, + "epoch": 25.58955223880597, + "grad_norm": 12.002704400718818, + "learning_rate": 4.647266345088686e-07, + "loss": -0.0087, + "reward": 1.6944444179534912, + "reward_std": 0.10638079792261124, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.9722222089767456, + "step": 3404 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.36111450195312, + "epoch": 25.597014925373134, + "grad_norm": 2.212337209834646, + "learning_rate": 4.6469327499995207e-07, + "loss": 0.1436, + "reward": 1.638888955116272, + "reward_std": 0.2816419303417206, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.944444477558136, + "step": 3405 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.47222900390625, + "epoch": 25.604477611940297, + "grad_norm": 110.33753593355233, + "learning_rate": 4.6465990092224274e-07, + "loss": 0.0919, + "reward": 1.7777777910232544, + "reward_std": 0.2222222238779068, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 0.944444477558136, + "step": 3406 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.5833282470703, + "epoch": 25.611940298507463, + "grad_norm": 21.79727135367254, + "learning_rate": 4.6462651227800526e-07, + "loss": -0.0398, + "reward": 1.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9722222089767456, + "step": 3407 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.22222137451172, + "epoch": 25.619402985074625, + "grad_norm": 2.922532206856727, + "learning_rate": 4.645931090695053e-07, + "loss": -0.0424, + "reward": 1.888888955116272, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3408 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.25, + "epoch": 25.62686567164179, + "grad_norm": 3.193159618507569, + "learning_rate": 4.6455969129900974e-07, + "loss": -0.0779, + "reward": 1.75, + "reward_std": 0.21749190986156464, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.9722222089767456, + "step": 3409 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.88888549804688, + "epoch": 25.634328358208954, + "grad_norm": 90.68208600925949, + "learning_rate": 4.6452625896878607e-07, + "loss": 0.0956, + "reward": 1.638888955116272, + "reward_std": 0.21749190986156464, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 0.9166666865348816, + "step": 3410 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.55555725097656, + "epoch": 25.64179104477612, + "grad_norm": 1.440777155633454, + "learning_rate": 4.64492812081103e-07, + "loss": 0.0368, + "reward": 1.7222222089767456, + "reward_std": 0.19245009124279022, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.944444477558136, + "step": 3411 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.72222900390625, + "epoch": 25.649253731343283, + "grad_norm": 3.584133274512251, + "learning_rate": 4.6445935063823026e-07, + "loss": -0.1179, + "reward": 1.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3412 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.72222900390625, + "epoch": 25.65671641791045, + "grad_norm": 0.8657380676372246, + "learning_rate": 4.6442587464243844e-07, + "loss": 0.0005, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 3413 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.72222900390625, + "epoch": 25.66417910447761, + "grad_norm": 1.419604550438029, + "learning_rate": 4.643923840959992e-07, + "loss": -0.0022, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3414 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.05555725097656, + "epoch": 25.671641791044777, + "grad_norm": 1.3067682517777774, + "learning_rate": 4.643588790011851e-07, + "loss": -0.0075, + "reward": 1.888888955116272, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 0.9722222089767456, + "step": 3415 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.22222900390625, + "epoch": 25.67910447761194, + "grad_norm": 10.000660646921112, + "learning_rate": 4.643253593602699e-07, + "loss": 0.0007, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 3416 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.19444274902344, + "epoch": 25.686567164179106, + "grad_norm": 1.4210410685056156, + "learning_rate": 4.642918251755281e-07, + "loss": 0.0005, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 3417 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.47222900390625, + "epoch": 25.69402985074627, + "grad_norm": 124.43830420825147, + "learning_rate": 4.642582764492352e-07, + "loss": -0.0157, + "reward": 1.75, + "reward_std": 0.23081667721271515, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.9722222089767456, + "step": 3418 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.47222900390625, + "epoch": 25.701492537313435, + "grad_norm": 1.517010103249219, + "learning_rate": 4.642247131836679e-07, + "loss": 0.0217, + "reward": 1.6666666269302368, + "reward_std": 0.16193635761737823, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 0.9722222089767456, + "step": 3419 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.36111450195312, + "epoch": 25.708955223880597, + "grad_norm": 0.6459973568252176, + "learning_rate": 4.641911353811037e-07, + "loss": 0.0002, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3420 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.86111450195312, + "epoch": 25.71641791044776, + "grad_norm": 0.0, + "learning_rate": 4.641575430438212e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3421 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.5833282470703, + "epoch": 25.723880597014926, + "grad_norm": 0.0, + "learning_rate": 4.6412393617409983e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3422 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.11111450195312, + "epoch": 25.73134328358209, + "grad_norm": 0.0, + "learning_rate": 4.6409031477422024e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3423 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.1666717529297, + "epoch": 25.738805970149254, + "grad_norm": 0.5901756980909958, + "learning_rate": 4.640566788464638e-07, + "loss": 0.0022, + "reward": 1.638888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 1.0, + "step": 3424 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.36111450195312, + "epoch": 25.746268656716417, + "grad_norm": 1.1674058679167691, + "learning_rate": 4.6402302839311315e-07, + "loss": -0.0188, + "reward": 1.6944444179534912, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 1.0, + "step": 3425 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.47222900390625, + "epoch": 25.753731343283583, + "grad_norm": 0.49767282143532177, + "learning_rate": 4.6398936341645156e-07, + "loss": 0.0005, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3426 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.13888549804688, + "epoch": 25.761194029850746, + "grad_norm": 0.0, + "learning_rate": 4.639556839187636e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3427 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.36111450195312, + "epoch": 25.76865671641791, + "grad_norm": 0.0, + "learning_rate": 4.639219899023348e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3428 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.7777862548828, + "epoch": 25.776119402985074, + "grad_norm": 0.0, + "learning_rate": 4.6388828136945144e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3429 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.19444274902344, + "epoch": 25.78358208955224, + "grad_norm": 0.0, + "learning_rate": 4.6385455832240105e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3430 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.97222900390625, + "epoch": 25.791044776119403, + "grad_norm": 0.0, + "learning_rate": 4.638208207634719e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3431 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.61111450195312, + "epoch": 25.798507462686565, + "grad_norm": 0.0, + "learning_rate": 4.6378706869495353e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3432 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.1666717529297, + "epoch": 25.80597014925373, + "grad_norm": 3.9237255025663376, + "learning_rate": 4.637533021191362e-07, + "loss": 0.0, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 3433 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.05555725097656, + "epoch": 25.813432835820894, + "grad_norm": 0.0, + "learning_rate": 4.637195210383113e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3434 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.30555725097656, + "epoch": 25.82089552238806, + "grad_norm": 0.7255281068403271, + "learning_rate": 4.6368572545477114e-07, + "loss": 0.0001, + "reward": 1.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3435 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.63888549804688, + "epoch": 25.828358208955223, + "grad_norm": 1.414202026549768, + "learning_rate": 4.636519153708091e-07, + "loss": -0.0002, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 3436 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.25, + "epoch": 25.83582089552239, + "grad_norm": 0.9035988293723602, + "learning_rate": 4.636180907887195e-07, + "loss": 0.0, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 3437 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.05555725097656, + "epoch": 25.84328358208955, + "grad_norm": 1.3378962797388758, + "learning_rate": 4.6358425171079753e-07, + "loss": -0.0006, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3438 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.5, + "epoch": 25.850746268656717, + "grad_norm": 0.0, + "learning_rate": 4.635503981393395e-07, + "loss": 0.0, + "reward": 1.6666666269302368, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 1.0, + "step": 3439 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.38888549804688, + "epoch": 25.85820895522388, + "grad_norm": 0.8888475979811612, + "learning_rate": 4.635165300766427e-07, + "loss": 0.002, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 3440 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.11111450195312, + "epoch": 25.865671641791046, + "grad_norm": 0.0, + "learning_rate": 4.634826475250053e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3441 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.0833282470703, + "epoch": 25.87313432835821, + "grad_norm": 0.9081535368822242, + "learning_rate": 4.634487504867267e-07, + "loss": 0.0047, + "reward": 1.888888955116272, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3442 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.5833282470703, + "epoch": 25.880597014925375, + "grad_norm": 0.0, + "learning_rate": 4.634148389641069e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3443 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.5277862548828, + "epoch": 25.888059701492537, + "grad_norm": 1.7734930000416889, + "learning_rate": 4.633809129594472e-07, + "loss": 0.0062, + "reward": 1.8333333730697632, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 3444 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.72222900390625, + "epoch": 25.895522388059703, + "grad_norm": 24.56654236389503, + "learning_rate": 4.6334697247504975e-07, + "loss": 0.0003, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 3445 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.5833282470703, + "epoch": 25.902985074626866, + "grad_norm": 0.0, + "learning_rate": 4.633130175132177e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3446 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.38888549804688, + "epoch": 25.91044776119403, + "grad_norm": 0.0, + "learning_rate": 4.632790480762552e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3447 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.86111450195312, + "epoch": 25.917910447761194, + "grad_norm": 0.0, + "learning_rate": 4.632450641664673e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3448 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.38888549804688, + "epoch": 25.925373134328357, + "grad_norm": 0.0, + "learning_rate": 4.632110657861602e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3449 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.13888549804688, + "epoch": 25.932835820895523, + "grad_norm": 1.1257326003973402, + "learning_rate": 4.6317705293764096e-07, + "loss": 0.0, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3450 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.19444274902344, + "epoch": 25.940298507462686, + "grad_norm": 0.0, + "learning_rate": 4.631430256232177e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3451 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.08333587646484, + "epoch": 25.94776119402985, + "grad_norm": 0.616411455156134, + "learning_rate": 4.6310898384519927e-07, + "loss": -0.0116, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3452 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.80555725097656, + "epoch": 25.955223880597014, + "grad_norm": 0.0, + "learning_rate": 4.6307492760589586e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3453 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.0, + "epoch": 25.96268656716418, + "grad_norm": 0.0, + "learning_rate": 4.630408569076185e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3454 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.63888549804688, + "epoch": 25.970149253731343, + "grad_norm": 0.0, + "learning_rate": 4.630067717526791e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3455 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.22222900390625, + "epoch": 25.97761194029851, + "grad_norm": 0.0, + "learning_rate": 4.6297267214339063e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3456 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.63888549804688, + "epoch": 25.98507462686567, + "grad_norm": 0.0, + "learning_rate": 4.629385580820671e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3457 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.08334350585938, + "epoch": 25.992537313432837, + "grad_norm": 0.0, + "learning_rate": 4.629044295710234e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3458 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.38888549804688, + "epoch": 26.007462686567163, + "grad_norm": 3.1572302527340703, + "learning_rate": 4.628702866125755e-07, + "loss": -0.0005, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 3459 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.9166717529297, + "epoch": 26.01492537313433, + "grad_norm": 0.0, + "learning_rate": 4.6283612920904023e-07, + "loss": 0.0, + "reward": 1.6666666269302368, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 1.0, + "step": 3460 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.5833282470703, + "epoch": 26.02238805970149, + "grad_norm": 0.0, + "learning_rate": 4.6280195736273545e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3461 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.2777862548828, + "epoch": 26.029850746268657, + "grad_norm": 0.0, + "learning_rate": 4.627677710759801e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3462 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.11111450195312, + "epoch": 26.03731343283582, + "grad_norm": 0.0, + "learning_rate": 4.62733570351094e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3463 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.75, + "epoch": 26.044776119402986, + "grad_norm": 1.2655584482301927, + "learning_rate": 4.626993551903979e-07, + "loss": 0.0189, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 3464 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.13888549804688, + "epoch": 26.05223880597015, + "grad_norm": 0.0, + "learning_rate": 4.626651255962136e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3465 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.55555725097656, + "epoch": 26.059701492537314, + "grad_norm": 0.0, + "learning_rate": 4.626308815708639e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3466 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.86111450195312, + "epoch": 26.067164179104477, + "grad_norm": 0.0, + "learning_rate": 4.6259662311667265e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3467 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.9166717529297, + "epoch": 26.074626865671643, + "grad_norm": 0.0, + "learning_rate": 4.625623502359644e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3468 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.72222900390625, + "epoch": 26.082089552238806, + "grad_norm": 0.0, + "learning_rate": 4.625280629310649e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3469 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.55555725097656, + "epoch": 26.08955223880597, + "grad_norm": 0.0, + "learning_rate": 4.6249376120430106e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3470 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.80555725097656, + "epoch": 26.097014925373134, + "grad_norm": 0.0, + "learning_rate": 4.624594450580003e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3471 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.19444274902344, + "epoch": 26.104477611940297, + "grad_norm": 1.1630956984968754, + "learning_rate": 4.6242511449449127e-07, + "loss": -0.0094, + "reward": 1.8611111640930176, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3472 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.47222900390625, + "epoch": 26.111940298507463, + "grad_norm": 0.0, + "learning_rate": 4.6239076951610367e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3473 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.75, + "epoch": 26.119402985074625, + "grad_norm": 1.7928037671277417, + "learning_rate": 4.623564101251682e-07, + "loss": 0.0007, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3474 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.13888549804688, + "epoch": 26.12686567164179, + "grad_norm": 0.0, + "learning_rate": 4.6232203632401625e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3475 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.11111450195312, + "epoch": 26.134328358208954, + "grad_norm": 0.9537755668510345, + "learning_rate": 4.622876481149806e-07, + "loss": -0.0009, + "reward": 1.888888955116272, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3476 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.86111450195312, + "epoch": 26.14179104477612, + "grad_norm": 1.7057036494538538, + "learning_rate": 4.6225324550039447e-07, + "loss": 0.0191, + "reward": 1.75, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3477 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.80555725097656, + "epoch": 26.149253731343283, + "grad_norm": 0.0, + "learning_rate": 4.622188284825927e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3478 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.5833282470703, + "epoch": 26.15671641791045, + "grad_norm": 1.0356668040967127, + "learning_rate": 4.6218439706391053e-07, + "loss": 0.0058, + "reward": 1.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3479 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.0277862548828, + "epoch": 26.16417910447761, + "grad_norm": 0.0, + "learning_rate": 4.6214995124668466e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3480 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.0277862548828, + "epoch": 26.171641791044777, + "grad_norm": 0.0, + "learning_rate": 4.621154910332524e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3481 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.5277862548828, + "epoch": 26.17910447761194, + "grad_norm": 0.0, + "learning_rate": 4.620810164259522e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3482 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.22222900390625, + "epoch": 26.186567164179106, + "grad_norm": 0.0, + "learning_rate": 4.620465274271234e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3483 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.94444274902344, + "epoch": 26.19402985074627, + "grad_norm": 0.8904707908726869, + "learning_rate": 4.6201202403910643e-07, + "loss": -0.0448, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 3484 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.94444274902344, + "epoch": 26.20149253731343, + "grad_norm": 0.0, + "learning_rate": 4.619775062642427e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3485 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.27777862548828, + "epoch": 26.208955223880597, + "grad_norm": 5.111516688768857, + "learning_rate": 4.619429741048745e-07, + "loss": -0.0007, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3486 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.61111450195312, + "epoch": 26.21641791044776, + "grad_norm": 0.0, + "learning_rate": 4.6190842756334504e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3487 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.2777862548828, + "epoch": 26.223880597014926, + "grad_norm": 0.0, + "learning_rate": 4.618738666419987e-07, + "loss": 0.0, + "reward": 1.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 1.0, + "step": 3488 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.22222900390625, + "epoch": 26.23134328358209, + "grad_norm": 0.0, + "learning_rate": 4.618392913431808e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3489 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.63888549804688, + "epoch": 26.238805970149254, + "grad_norm": 0.0, + "learning_rate": 4.6180470166923737e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3490 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.19444274902344, + "epoch": 26.246268656716417, + "grad_norm": 1.9293127821663, + "learning_rate": 4.6177009762251577e-07, + "loss": 0.0004, + "reward": 1.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3491 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.61111450195312, + "epoch": 26.253731343283583, + "grad_norm": 0.0, + "learning_rate": 4.6173547920536416e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3492 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.86111450195312, + "epoch": 26.261194029850746, + "grad_norm": 1.2629008363762013, + "learning_rate": 4.6170084642013174e-07, + "loss": -0.0003, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 3493 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.1666717529297, + "epoch": 26.26865671641791, + "grad_norm": 0.0, + "learning_rate": 4.616661992691685e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3494 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.75, + "epoch": 26.276119402985074, + "grad_norm": 0.0, + "learning_rate": 4.616315377548257e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3495 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.72222900390625, + "epoch": 26.28358208955224, + "grad_norm": 0.0, + "learning_rate": 4.6159686187945536e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3496 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.6666717529297, + "epoch": 26.291044776119403, + "grad_norm": 1.6401995676472925, + "learning_rate": 4.615621716454105e-07, + "loss": -0.036, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 3497 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.30555725097656, + "epoch": 26.298507462686565, + "grad_norm": 2.2626140003699717, + "learning_rate": 4.615274670550452e-07, + "loss": 0.0652, + "reward": 1.888888955116272, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3498 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.80555725097656, + "epoch": 26.30597014925373, + "grad_norm": 0.0, + "learning_rate": 4.6149274811071446e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3499 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.30555725097656, + "epoch": 26.313432835820894, + "grad_norm": 0.0, + "learning_rate": 4.6145801481477433e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3500 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.97222900390625, + "epoch": 26.32089552238806, + "grad_norm": 0.0, + "learning_rate": 4.614232671695816e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3501 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.19444274902344, + "epoch": 26.328358208955223, + "grad_norm": 0.0, + "learning_rate": 4.613885051774944e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3502 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.38888549804688, + "epoch": 26.33582089552239, + "grad_norm": 0.0, + "learning_rate": 4.6135372884087143e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3503 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.5277862548828, + "epoch": 26.34328358208955, + "grad_norm": 0.0, + "learning_rate": 4.613189381620727e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3504 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.69444274902344, + "epoch": 26.350746268656717, + "grad_norm": 0.0, + "learning_rate": 4.61284133143459e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3505 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.8333282470703, + "epoch": 26.35820895522388, + "grad_norm": 0.0, + "learning_rate": 4.6124931378739217e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3506 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.5277862548828, + "epoch": 26.365671641791046, + "grad_norm": 0.0, + "learning_rate": 4.61214480096235e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3507 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.69444274902344, + "epoch": 26.37313432835821, + "grad_norm": 1.5721483685783628, + "learning_rate": 4.611796320723512e-07, + "loss": 0.0076, + "reward": 1.9166666269302368, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 3508 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.5833282470703, + "epoch": 26.380597014925375, + "grad_norm": 2.2731630285475837, + "learning_rate": 4.611447697181057e-07, + "loss": -0.0008, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3509 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.8333282470703, + "epoch": 26.388059701492537, + "grad_norm": 1.0711566622963318, + "learning_rate": 4.611098930358639e-07, + "loss": 0.0164, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3510 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.72222900390625, + "epoch": 26.395522388059703, + "grad_norm": 0.0, + "learning_rate": 4.6107500202799277e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3511 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.55555725097656, + "epoch": 26.402985074626866, + "grad_norm": 0.0, + "learning_rate": 4.610400966968599e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3512 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.80555725097656, + "epoch": 26.41044776119403, + "grad_norm": 0.0, + "learning_rate": 4.6100517704483377e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3513 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.63888549804688, + "epoch": 26.417910447761194, + "grad_norm": 0.9845004467637224, + "learning_rate": 4.6097024307428417e-07, + "loss": -0.0461, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3514 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.25, + "epoch": 26.425373134328357, + "grad_norm": 0.0, + "learning_rate": 4.6093529478758153e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3515 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.5833282470703, + "epoch": 26.432835820895523, + "grad_norm": 0.0, + "learning_rate": 4.6090033218709755e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3516 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.3333282470703, + "epoch": 26.440298507462686, + "grad_norm": 0.8407454440593893, + "learning_rate": 4.6086535527520456e-07, + "loss": -0.0011, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3517 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.8333282470703, + "epoch": 26.44776119402985, + "grad_norm": 1.0557372334362554, + "learning_rate": 4.6083036405427615e-07, + "loss": 0.0524, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.9722222089767456, + "step": 3518 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.3333282470703, + "epoch": 26.455223880597014, + "grad_norm": 0.0, + "learning_rate": 4.607953585266868e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3519 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.0, + "epoch": 26.46268656716418, + "grad_norm": 0.0, + "learning_rate": 4.6076033869481177e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3520 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.4166717529297, + "epoch": 26.470149253731343, + "grad_norm": 0.0, + "learning_rate": 4.607253045610277e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3521 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.88888549804688, + "epoch": 26.47761194029851, + "grad_norm": 0.5495758941650122, + "learning_rate": 4.606902561277118e-07, + "loss": 0.0009, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 3522 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.25, + "epoch": 26.48507462686567, + "grad_norm": 0.6279287787736831, + "learning_rate": 4.6065519339724244e-07, + "loss": -0.0015, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3523 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.61111450195312, + "epoch": 26.492537313432837, + "grad_norm": 0.7951186818147807, + "learning_rate": 4.6062011637199903e-07, + "loss": 0.0008, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3524 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.55555725097656, + "epoch": 26.5, + "grad_norm": 5.075854346711657, + "learning_rate": 4.605850250543617e-07, + "loss": -0.0406, + "reward": 1.888888955116272, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3525 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.88888549804688, + "epoch": 26.507462686567163, + "grad_norm": 4.046037791571268, + "learning_rate": 4.6054991944671173e-07, + "loss": -0.0, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 3526 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.63888549804688, + "epoch": 26.51492537313433, + "grad_norm": 2.239164519824037, + "learning_rate": 4.6051479955143134e-07, + "loss": -0.0475, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9722222089767456, + "step": 3527 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.38888549804688, + "epoch": 26.52238805970149, + "grad_norm": 0.0, + "learning_rate": 4.604796653709038e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3528 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.5, + "epoch": 26.529850746268657, + "grad_norm": 1.2123585966997557, + "learning_rate": 4.6044451690751325e-07, + "loss": -0.0067, + "reward": 1.9444444179534912, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 3529 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.80555725097656, + "epoch": 26.53731343283582, + "grad_norm": 0.9749109050950239, + "learning_rate": 4.6040935416364466e-07, + "loss": -0.0001, + "reward": 1.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3530 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.5277862548828, + "epoch": 26.544776119402986, + "grad_norm": 0.9746919138672564, + "learning_rate": 4.6037417714168436e-07, + "loss": -0.0449, + "reward": 1.9166666269302368, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 3531 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.36111450195312, + "epoch": 26.55223880597015, + "grad_norm": 1.2964403637321642, + "learning_rate": 4.6033898584401923e-07, + "loss": -0.0007, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 3532 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.86111450195312, + "epoch": 26.559701492537314, + "grad_norm": 0.0, + "learning_rate": 4.6030378027303743e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3533 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.75, + "epoch": 26.567164179104477, + "grad_norm": 0.7301335405175868, + "learning_rate": 4.6026856043112796e-07, + "loss": -0.0002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3534 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.1666717529297, + "epoch": 26.574626865671643, + "grad_norm": 0.0, + "learning_rate": 4.6023332632068057e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3535 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.11111450195312, + "epoch": 26.582089552238806, + "grad_norm": 1.840372249022666, + "learning_rate": 4.601980779440865e-07, + "loss": -0.0446, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3536 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.19444274902344, + "epoch": 26.58955223880597, + "grad_norm": 0.0, + "learning_rate": 4.601628153037374e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3537 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.25, + "epoch": 26.597014925373134, + "grad_norm": 0.45820455718406683, + "learning_rate": 4.6012753840202634e-07, + "loss": 0.0003, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 3538 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.0277862548828, + "epoch": 26.604477611940297, + "grad_norm": 0.4736559165957897, + "learning_rate": 4.600922472413471e-07, + "loss": 0.0001, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3539 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.22222900390625, + "epoch": 26.611940298507463, + "grad_norm": 0.7272819949870096, + "learning_rate": 4.600569418240945e-07, + "loss": -0.0036, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 3540 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.88888549804688, + "epoch": 26.619402985074625, + "grad_norm": 0.0, + "learning_rate": 4.6002162215266426e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3541 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.0833282470703, + "epoch": 26.62686567164179, + "grad_norm": 0.0, + "learning_rate": 4.599862882294531e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3542 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.55555725097656, + "epoch": 26.634328358208954, + "grad_norm": 0.0, + "learning_rate": 4.5995094005685884e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3543 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.19444274902344, + "epoch": 26.64179104477612, + "grad_norm": 0.47815040186285546, + "learning_rate": 4.5991557763728017e-07, + "loss": 0.0032, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3544 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.47222900390625, + "epoch": 26.649253731343283, + "grad_norm": 17.218543245067174, + "learning_rate": 4.5988020097311666e-07, + "loss": 0.0, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3545 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.3333282470703, + "epoch": 26.65671641791045, + "grad_norm": 0.9027742718650275, + "learning_rate": 4.598448100667689e-07, + "loss": 0.0035, + "reward": 1.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3546 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.8333282470703, + "epoch": 26.66417910447761, + "grad_norm": 0.0, + "learning_rate": 4.5980940492063846e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3547 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.6666717529297, + "epoch": 26.671641791044777, + "grad_norm": 0.0, + "learning_rate": 4.59773985537128e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3548 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.25, + "epoch": 26.67910447761194, + "grad_norm": 0.0, + "learning_rate": 4.5973855191864095e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3549 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.5277862548828, + "epoch": 26.686567164179106, + "grad_norm": 0.0, + "learning_rate": 4.5970310406758185e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3550 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.80555725097656, + "epoch": 26.69402985074627, + "grad_norm": 0.40518392500635886, + "learning_rate": 4.59667641986356e-07, + "loss": -0.0006, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3551 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.22222900390625, + "epoch": 26.701492537313435, + "grad_norm": 1.0694680864188764, + "learning_rate": 4.5963216567736993e-07, + "loss": -0.0005, + "reward": 1.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3552 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.61111450195312, + "epoch": 26.708955223880597, + "grad_norm": 0.0, + "learning_rate": 4.59596675143031e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3553 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.69444274902344, + "epoch": 26.71641791044776, + "grad_norm": 23.54258112790217, + "learning_rate": 4.5956117038574757e-07, + "loss": 0.0003, + "reward": 1.8333333730697632, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 3554 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.2777862548828, + "epoch": 26.723880597014926, + "grad_norm": 0.7307622771067007, + "learning_rate": 4.595256514079289e-07, + "loss": 0.0019, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 3555 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.72222900390625, + "epoch": 26.73134328358209, + "grad_norm": 0.7968985076829048, + "learning_rate": 4.594901182119852e-07, + "loss": -0.0007, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3556 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.1388931274414, + "epoch": 26.738805970149254, + "grad_norm": 0.6694773653542169, + "learning_rate": 4.5945457080032777e-07, + "loss": 0.0003, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 3557 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.30555725097656, + "epoch": 26.746268656716417, + "grad_norm": 0.0, + "learning_rate": 4.5941900917536893e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3558 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.72222900390625, + "epoch": 26.753731343283583, + "grad_norm": 0.0, + "learning_rate": 4.593834333395216e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3559 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.4166717529297, + "epoch": 26.761194029850746, + "grad_norm": 0.0, + "learning_rate": 4.5934784329520015e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3560 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.86111450195312, + "epoch": 26.76865671641791, + "grad_norm": 0.0, + "learning_rate": 4.5931223904481945e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3561 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.5277862548828, + "epoch": 26.776119402985074, + "grad_norm": 0.0, + "learning_rate": 4.5927662059079573e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3562 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.9166717529297, + "epoch": 26.78358208955224, + "grad_norm": 0.7640241046973852, + "learning_rate": 4.592409879355459e-07, + "loss": 0.0, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3563 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.61111450195312, + "epoch": 26.791044776119403, + "grad_norm": 0.0, + "learning_rate": 4.59205341081488e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3564 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.5, + "epoch": 26.798507462686565, + "grad_norm": 0.0, + "learning_rate": 4.5916968003104096e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3565 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.44444274902344, + "epoch": 26.80597014925373, + "grad_norm": 0.7391096578138782, + "learning_rate": 4.5913400478662465e-07, + "loss": 0.0019, + "reward": 1.8055555820465088, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 3566 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.97222900390625, + "epoch": 26.813432835820894, + "grad_norm": 0.0, + "learning_rate": 4.590983153506601e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3567 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.3333282470703, + "epoch": 26.82089552238806, + "grad_norm": 0.0, + "learning_rate": 4.5906261172556893e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3568 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.97222900390625, + "epoch": 26.828358208955223, + "grad_norm": 0.8845784913382204, + "learning_rate": 4.5902689391377407e-07, + "loss": 0.0005, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 3569 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.94444274902344, + "epoch": 26.83582089552239, + "grad_norm": 0.3493957530156368, + "learning_rate": 4.5899116191769925e-07, + "loss": 0.0037, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3570 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.3333282470703, + "epoch": 26.84328358208955, + "grad_norm": 0.0, + "learning_rate": 4.589554157397692e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3571 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.2777862548828, + "epoch": 26.850746268656717, + "grad_norm": 0.0, + "learning_rate": 4.589196553824096e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3572 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.94444274902344, + "epoch": 26.85820895522388, + "grad_norm": 0.0, + "learning_rate": 4.588838808480471e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3573 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.69444274902344, + "epoch": 26.865671641791046, + "grad_norm": 0.9339890405309914, + "learning_rate": 4.5884809213910936e-07, + "loss": 0.0054, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3574 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.69444274902344, + "epoch": 26.87313432835821, + "grad_norm": 1.158589906541659, + "learning_rate": 4.588122892580248e-07, + "loss": 0.0188, + "reward": 1.8055555820465088, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 3575 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.61111450195312, + "epoch": 26.880597014925375, + "grad_norm": 1.5107857798910644, + "learning_rate": 4.5877647220722317e-07, + "loss": 0.0181, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3576 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.25, + "epoch": 26.888059701492537, + "grad_norm": 1.3971398848743648, + "learning_rate": 4.587406409891348e-07, + "loss": 0.0003, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3577 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.69444274902344, + "epoch": 26.895522388059703, + "grad_norm": 0.0, + "learning_rate": 4.5870479560619123e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3578 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.6666717529297, + "epoch": 26.902985074626866, + "grad_norm": 3.658868074060769, + "learning_rate": 4.586689360608249e-07, + "loss": 0.0, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3579 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.47222900390625, + "epoch": 26.91044776119403, + "grad_norm": 0.0, + "learning_rate": 4.5863306235546904e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3580 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.5, + "epoch": 26.917910447761194, + "grad_norm": 0.0, + "learning_rate": 4.5859717449255814e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3581 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.38888549804688, + "epoch": 26.925373134328357, + "grad_norm": 0.0, + "learning_rate": 4.5856127247452745e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3582 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.63888549804688, + "epoch": 26.932835820895523, + "grad_norm": 2.9968937578654566, + "learning_rate": 4.585253563038133e-07, + "loss": 0.0001, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3583 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.80555725097656, + "epoch": 26.940298507462686, + "grad_norm": 0.0, + "learning_rate": 4.584894259828528e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3584 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.25, + "epoch": 26.94776119402985, + "grad_norm": 0.0, + "learning_rate": 4.5845348151408415e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3585 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.2777862548828, + "epoch": 26.955223880597014, + "grad_norm": 2.2594571091959192, + "learning_rate": 4.584175228999465e-07, + "loss": 0.002, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 3586 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.13888549804688, + "epoch": 26.96268656716418, + "grad_norm": 0.961703507718267, + "learning_rate": 4.5838155014288e-07, + "loss": -0.0007, + "reward": 1.8333333730697632, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 3587 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.6666717529297, + "epoch": 26.970149253731343, + "grad_norm": 0.0, + "learning_rate": 4.583455632453258e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3588 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.8333282470703, + "epoch": 26.97761194029851, + "grad_norm": 0.0, + "learning_rate": 4.583095622097257e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3589 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.63888549804688, + "epoch": 26.98507462686567, + "grad_norm": 0.35233272991500986, + "learning_rate": 4.5827354703852284e-07, + "loss": -0.0003, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3590 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.0, + "epoch": 26.992537313432837, + "grad_norm": 1.3664017211899353, + "learning_rate": 4.5823751773416115e-07, + "loss": -0.0009, + "reward": 1.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3591 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.3333282470703, + "epoch": 27.007462686567163, + "grad_norm": 0.0, + "learning_rate": 4.582014742990854e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3592 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.8333282470703, + "epoch": 27.01492537313433, + "grad_norm": 0.9165485687581617, + "learning_rate": 4.581654167357416e-07, + "loss": 0.0006, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 3593 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.47222137451172, + "epoch": 27.02238805970149, + "grad_norm": 1.3457562338619198, + "learning_rate": 4.5812934504657654e-07, + "loss": 0.0002, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3594 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.63888549804688, + "epoch": 27.029850746268657, + "grad_norm": 0.6321410508897418, + "learning_rate": 4.580932592340379e-07, + "loss": 0.0003, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3595 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.5277862548828, + "epoch": 27.03731343283582, + "grad_norm": 0.0, + "learning_rate": 4.5805715930057454e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3596 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.5277862548828, + "epoch": 27.044776119402986, + "grad_norm": 1.2061033467484241, + "learning_rate": 4.5802104524863604e-07, + "loss": 0.0012, + "reward": 1.6944444179534912, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 1.0, + "step": 3597 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.61111450195312, + "epoch": 27.05223880597015, + "grad_norm": 0.0, + "learning_rate": 4.579849170806731e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3598 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.5833282470703, + "epoch": 27.059701492537314, + "grad_norm": 0.0, + "learning_rate": 4.579487747991373e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3599 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.9166717529297, + "epoch": 27.067164179104477, + "grad_norm": 2.0242901288605992, + "learning_rate": 4.5791261840648134e-07, + "loss": -0.0002, + "reward": 1.8055555820465088, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 3600 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.0277862548828, + "epoch": 27.074626865671643, + "grad_norm": 0.0, + "learning_rate": 4.5787644790515857e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3601 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.2777862548828, + "epoch": 27.082089552238806, + "grad_norm": 0.0, + "learning_rate": 4.578402632976235e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3602 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.86111450195312, + "epoch": 27.08955223880597, + "grad_norm": 0.0, + "learning_rate": 4.5780406458633167e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3603 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.86111450195312, + "epoch": 27.097014925373134, + "grad_norm": 0.0, + "learning_rate": 4.5776785177373944e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3604 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.69444274902344, + "epoch": 27.104477611940297, + "grad_norm": 0.0, + "learning_rate": 4.57731624862304e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3605 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.97222900390625, + "epoch": 27.111940298507463, + "grad_norm": 0.7162385721878715, + "learning_rate": 4.5769538385448394e-07, + "loss": 0.0046, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3606 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.36111450195312, + "epoch": 27.119402985074625, + "grad_norm": 0.7286646969637753, + "learning_rate": 4.576591287527383e-07, + "loss": -0.0005, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3607 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.5833282470703, + "epoch": 27.12686567164179, + "grad_norm": 0.0, + "learning_rate": 4.5762285955952745e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3608 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.5833282470703, + "epoch": 27.134328358208954, + "grad_norm": 3.0202799655961585, + "learning_rate": 4.575865762773125e-07, + "loss": 0.0012, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3609 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.1666717529297, + "epoch": 27.14179104477612, + "grad_norm": 0.0, + "learning_rate": 4.5755027890855546e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3610 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.61111450195312, + "epoch": 27.149253731343283, + "grad_norm": 0.0, + "learning_rate": 4.575139674557196e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3611 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.47222900390625, + "epoch": 27.15671641791045, + "grad_norm": 1.720316919176683, + "learning_rate": 4.5747764192126894e-07, + "loss": -0.0001, + "reward": 1.75, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3612 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.0, + "epoch": 27.16417910447761, + "grad_norm": 0.0, + "learning_rate": 4.574413023076685e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3613 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.5, + "epoch": 27.171641791044777, + "grad_norm": 1.5059256429802779, + "learning_rate": 4.574049486173841e-07, + "loss": -0.0005, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3614 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.94444274902344, + "epoch": 27.17910447761194, + "grad_norm": 0.6018841263002344, + "learning_rate": 4.573685808528828e-07, + "loss": 0.0031, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3615 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.88888549804688, + "epoch": 27.186567164179106, + "grad_norm": 1.0083174472295786, + "learning_rate": 4.5733219901663235e-07, + "loss": 0.0002, + "reward": 1.9444444179534912, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 3616 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.72222900390625, + "epoch": 27.19402985074627, + "grad_norm": 0.0, + "learning_rate": 4.5729580311110163e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3617 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.38888549804688, + "epoch": 27.20149253731343, + "grad_norm": 1.3333123390116608, + "learning_rate": 4.572593931387604e-07, + "loss": -0.0011, + "reward": 1.8055555820465088, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 3618 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.44444274902344, + "epoch": 27.208955223880597, + "grad_norm": 0.0, + "learning_rate": 4.5722296910207947e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3619 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.88888549804688, + "epoch": 27.21641791044776, + "grad_norm": 2.1208195386765976, + "learning_rate": 4.571865310035304e-07, + "loss": -0.0004, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 3620 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.36111450195312, + "epoch": 27.223880597014926, + "grad_norm": 0.0, + "learning_rate": 4.5715007884558585e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3621 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.9166717529297, + "epoch": 27.23134328358209, + "grad_norm": 1.22258242809931, + "learning_rate": 4.5711361263071955e-07, + "loss": -0.0002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3622 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.38888549804688, + "epoch": 27.238805970149254, + "grad_norm": 0.0, + "learning_rate": 4.570771323614059e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3623 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.61111450195312, + "epoch": 27.246268656716417, + "grad_norm": 1.6820105967366044, + "learning_rate": 4.570406380401205e-07, + "loss": -0.0235, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 3624 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.94444274902344, + "epoch": 27.253731343283583, + "grad_norm": 0.0, + "learning_rate": 4.570041296693397e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3625 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.44444274902344, + "epoch": 27.261194029850746, + "grad_norm": 0.6115879639906568, + "learning_rate": 4.5696760725154104e-07, + "loss": -0.0001, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 3626 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.97222900390625, + "epoch": 27.26865671641791, + "grad_norm": 1.0576172941439974, + "learning_rate": 4.5693107078920275e-07, + "loss": 0.0008, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3627 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.63888549804688, + "epoch": 27.276119402985074, + "grad_norm": 1.6248824322476503, + "learning_rate": 4.5689452028480424e-07, + "loss": 0.0005, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3628 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.9166717529297, + "epoch": 27.28358208955224, + "grad_norm": 0.0, + "learning_rate": 4.568579557408256e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3629 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.4166717529297, + "epoch": 27.291044776119403, + "grad_norm": 4.04850425309478, + "learning_rate": 4.5682137715974835e-07, + "loss": 0.0029, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 3630 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.9166717529297, + "epoch": 27.298507462686565, + "grad_norm": 0.0, + "learning_rate": 4.5678478454405443e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3631 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.0, + "epoch": 27.30597014925373, + "grad_norm": 0.0, + "learning_rate": 4.567481778962271e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3632 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.30555725097656, + "epoch": 27.313432835820894, + "grad_norm": 0.0, + "learning_rate": 4.5671155721875034e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3633 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.0833282470703, + "epoch": 27.32089552238806, + "grad_norm": 1.0610299613328358, + "learning_rate": 4.566749225141092e-07, + "loss": -0.0008, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 3634 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.7777862548828, + "epoch": 27.328358208955223, + "grad_norm": 0.0, + "learning_rate": 4.566382737847897e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3635 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.72222900390625, + "epoch": 27.33582089552239, + "grad_norm": 0.0, + "learning_rate": 4.5660161103327867e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3636 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.69444274902344, + "epoch": 27.34328358208955, + "grad_norm": 0.0, + "learning_rate": 4.565649342620641e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3637 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.5833282470703, + "epoch": 27.350746268656717, + "grad_norm": 0.0, + "learning_rate": 4.5652824347363484e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3638 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.61111450195312, + "epoch": 27.35820895522388, + "grad_norm": 0.0, + "learning_rate": 4.5649153867048063e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3639 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.25, + "epoch": 27.365671641791046, + "grad_norm": 0.7415033862212432, + "learning_rate": 4.5645481985509215e-07, + "loss": 0.0005, + "reward": 1.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3640 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.30555725097656, + "epoch": 27.37313432835821, + "grad_norm": 0.0, + "learning_rate": 4.564180870299612e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3641 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.36111450195312, + "epoch": 27.380597014925375, + "grad_norm": 1.593973408731406, + "learning_rate": 4.563813401975804e-07, + "loss": 0.0156, + "reward": 1.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3642 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.05555725097656, + "epoch": 27.388059701492537, + "grad_norm": 0.0, + "learning_rate": 4.5634457936044323e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3643 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.19444274902344, + "epoch": 27.395522388059703, + "grad_norm": 2.2946415400937163, + "learning_rate": 4.563078045210444e-07, + "loss": 0.0526, + "reward": 1.7222222089767456, + "reward_std": 0.22608637809753418, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9722222089767456, + "step": 3644 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.75, + "epoch": 27.402985074626866, + "grad_norm": 0.0, + "learning_rate": 4.562710156818793e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3645 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.4166717529297, + "epoch": 27.41044776119403, + "grad_norm": 0.0, + "learning_rate": 4.5623421284544427e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3646 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.0, + "epoch": 27.417910447761194, + "grad_norm": 0.0, + "learning_rate": 4.561973960142369e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3647 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.86111450195312, + "epoch": 27.425373134328357, + "grad_norm": 0.0, + "learning_rate": 4.5616056519075543e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3648 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.44444274902344, + "epoch": 27.432835820895523, + "grad_norm": 0.0, + "learning_rate": 4.561237203774991e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3649 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.55555725097656, + "epoch": 27.440298507462686, + "grad_norm": 0.0, + "learning_rate": 4.5608686157696835e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3650 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.63888549804688, + "epoch": 27.44776119402985, + "grad_norm": 0.0, + "learning_rate": 4.560499887916641e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3651 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.72222900390625, + "epoch": 27.455223880597014, + "grad_norm": 1.0355452882241365, + "learning_rate": 4.560131020240887e-07, + "loss": -0.0073, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 3652 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.72222900390625, + "epoch": 27.46268656716418, + "grad_norm": 0.0, + "learning_rate": 4.5597620127674506e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3653 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.0833282470703, + "epoch": 27.470149253731343, + "grad_norm": 1.6402424541910785, + "learning_rate": 4.559392865521374e-07, + "loss": -0.0001, + "reward": 1.5555555820465088, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 1.0, + "step": 3654 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.36111450195312, + "epoch": 27.47761194029851, + "grad_norm": 0.0, + "learning_rate": 4.559023578527705e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3655 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.9166717529297, + "epoch": 27.48507462686567, + "grad_norm": 0.0, + "learning_rate": 4.558654151811505e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3656 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.2777862548828, + "epoch": 27.492537313432837, + "grad_norm": 0.0, + "learning_rate": 4.558284585397841e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3657 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.47222900390625, + "epoch": 27.5, + "grad_norm": 0.0, + "learning_rate": 4.557914879311793e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3658 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.38888549804688, + "epoch": 27.507462686567163, + "grad_norm": 0.0, + "learning_rate": 4.557545033578447e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3659 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.0833282470703, + "epoch": 27.51492537313433, + "grad_norm": 0.0, + "learning_rate": 4.557175048222901e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3660 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.88888549804688, + "epoch": 27.52238805970149, + "grad_norm": 0.300373539270168, + "learning_rate": 4.556804923270262e-07, + "loss": 0.0002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3661 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.94444274902344, + "epoch": 27.529850746268657, + "grad_norm": 0.0, + "learning_rate": 4.556434658745646e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3662 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.94444274902344, + "epoch": 27.53731343283582, + "grad_norm": 0.0, + "learning_rate": 4.556064254674179e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3663 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.86111450195312, + "epoch": 27.544776119402986, + "grad_norm": 0.3768991174703974, + "learning_rate": 4.5556937110809945e-07, + "loss": -0.0004, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 3664 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.75, + "epoch": 27.55223880597015, + "grad_norm": 0.6793672429603429, + "learning_rate": 4.555323027991239e-07, + "loss": -0.0019, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 3665 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.97222900390625, + "epoch": 27.559701492537314, + "grad_norm": 0.0, + "learning_rate": 4.554952205430066e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3666 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.4166717529297, + "epoch": 27.567164179104477, + "grad_norm": 0.0, + "learning_rate": 4.5545812434226383e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3667 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.80555725097656, + "epoch": 27.574626865671643, + "grad_norm": 0.0, + "learning_rate": 4.554210141994129e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3668 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.72222900390625, + "epoch": 27.582089552238806, + "grad_norm": 0.0, + "learning_rate": 4.553838901169722e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3669 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.94444274902344, + "epoch": 27.58955223880597, + "grad_norm": 2.013803195742981, + "learning_rate": 4.553467520974607e-07, + "loss": -0.0013, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 3670 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.47222900390625, + "epoch": 27.597014925373134, + "grad_norm": 1.1408175681274078, + "learning_rate": 4.553096001433987e-07, + "loss": -0.0015, + "reward": 1.888888955116272, + "reward_std": 0.12830005586147308, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3671 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.13888549804688, + "epoch": 27.604477611940297, + "grad_norm": 0.0, + "learning_rate": 4.5527243425730724e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3672 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.05555725097656, + "epoch": 27.611940298507463, + "grad_norm": 0.0, + "learning_rate": 4.5523525444170834e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3673 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.7777862548828, + "epoch": 27.619402985074625, + "grad_norm": 0.0, + "learning_rate": 4.5519806069912496e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3674 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.9166717529297, + "epoch": 27.62686567164179, + "grad_norm": 0.8609923192146719, + "learning_rate": 4.5516085303208103e-07, + "loss": 0.0007, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3675 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.11111450195312, + "epoch": 27.634328358208954, + "grad_norm": 0.0, + "learning_rate": 4.551236314431014e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3676 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.69444274902344, + "epoch": 27.64179104477612, + "grad_norm": 0.0, + "learning_rate": 4.550863959347118e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3677 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.80555725097656, + "epoch": 27.649253731343283, + "grad_norm": 1.503085164304876, + "learning_rate": 4.5504914650943914e-07, + "loss": -0.0048, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3678 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.22222900390625, + "epoch": 27.65671641791045, + "grad_norm": 2.782446123787061, + "learning_rate": 4.550118831698111e-07, + "loss": -0.0009, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 3679 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.30555725097656, + "epoch": 27.66417910447761, + "grad_norm": 0.0, + "learning_rate": 4.549746059183561e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3680 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.94444274902344, + "epoch": 27.671641791044777, + "grad_norm": 0.8534223816156798, + "learning_rate": 4.5493731475760397e-07, + "loss": -0.0002, + "reward": 1.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 1.0, + "step": 3681 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.7777862548828, + "epoch": 27.67910447761194, + "grad_norm": 0.0, + "learning_rate": 4.549000096900851e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3682 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.3333282470703, + "epoch": 27.686567164179106, + "grad_norm": 0.0, + "learning_rate": 4.548626907183311e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3683 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.0, + "epoch": 27.69402985074627, + "grad_norm": 0.9330649262950291, + "learning_rate": 4.5482535784487423e-07, + "loss": 0.0007, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 3684 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.80555725097656, + "epoch": 27.701492537313435, + "grad_norm": 0.8355527165650988, + "learning_rate": 4.5478801107224794e-07, + "loss": -0.0007, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3685 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.0277862548828, + "epoch": 27.708955223880597, + "grad_norm": 0.0, + "learning_rate": 4.5475065040298645e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3686 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.36111450195312, + "epoch": 27.71641791044776, + "grad_norm": 0.0, + "learning_rate": 4.547132758396251e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3687 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.0833282470703, + "epoch": 27.723880597014926, + "grad_norm": 0.0, + "learning_rate": 4.546758873847e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3688 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.75, + "epoch": 27.73134328358209, + "grad_norm": 1.360440522529014, + "learning_rate": 4.5463848504074833e-07, + "loss": 0.0, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 3689 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.44444274902344, + "epoch": 27.738805970149254, + "grad_norm": 1.09616207778175, + "learning_rate": 4.546010688103081e-07, + "loss": 0.0062, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3690 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.94444274902344, + "epoch": 27.746268656716417, + "grad_norm": 1.633413181276668, + "learning_rate": 4.5456363869591854e-07, + "loss": 0.0017, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3691 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.63888549804688, + "epoch": 27.753731343283583, + "grad_norm": 2.309439340990318, + "learning_rate": 4.545261947001192e-07, + "loss": -0.0005, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3692 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.5, + "epoch": 27.761194029850746, + "grad_norm": 1.2574217074715899, + "learning_rate": 4.5448873682545135e-07, + "loss": 0.0, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3693 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.0833282470703, + "epoch": 27.76865671641791, + "grad_norm": 1.201546114494263, + "learning_rate": 4.544512650744566e-07, + "loss": 0.0002, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3694 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.94444274902344, + "epoch": 27.776119402985074, + "grad_norm": 0.0, + "learning_rate": 4.5441377944967793e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3695 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.05555725097656, + "epoch": 27.78358208955224, + "grad_norm": 0.0, + "learning_rate": 4.543762799536589e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3696 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.25, + "epoch": 27.791044776119403, + "grad_norm": 0.0, + "learning_rate": 4.5433876658894434e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3697 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.38888549804688, + "epoch": 27.798507462686565, + "grad_norm": 0.0, + "learning_rate": 4.543012393580796e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3698 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.44444274902344, + "epoch": 27.80597014925373, + "grad_norm": 0.8037806441688625, + "learning_rate": 4.5426369826361144e-07, + "loss": 0.0001, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3699 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.11111450195312, + "epoch": 27.813432835820894, + "grad_norm": 0.0, + "learning_rate": 4.5422614330808727e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3700 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.11111450195312, + "epoch": 27.82089552238806, + "grad_norm": 1.4513970095704474, + "learning_rate": 4.5418857449405556e-07, + "loss": 0.0013, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 3701 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.22222900390625, + "epoch": 27.828358208955223, + "grad_norm": 0.0, + "learning_rate": 4.5415099182406556e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3702 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.69444274902344, + "epoch": 27.83582089552239, + "grad_norm": 1.2131595018563548, + "learning_rate": 4.5411339530066775e-07, + "loss": -0.0002, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 3703 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.4166717529297, + "epoch": 27.84328358208955, + "grad_norm": 0.0, + "learning_rate": 4.540757849264133e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3704 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.47222900390625, + "epoch": 27.850746268656717, + "grad_norm": 0.0, + "learning_rate": 4.5403816070385437e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3705 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.22222900390625, + "epoch": 27.85820895522388, + "grad_norm": 1.7424619250250435, + "learning_rate": 4.540005226355441e-07, + "loss": 0.0118, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3706 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.2777862548828, + "epoch": 27.865671641791046, + "grad_norm": 0.7531408312474905, + "learning_rate": 4.539628707240366e-07, + "loss": 0.0022, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3707 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.38888549804688, + "epoch": 27.87313432835821, + "grad_norm": 0.5943474923496505, + "learning_rate": 4.5392520497188694e-07, + "loss": 0.0002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3708 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.55555725097656, + "epoch": 27.880597014925375, + "grad_norm": 0.8486938647344353, + "learning_rate": 4.5388752538165083e-07, + "loss": 0.0011, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 3709 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.22222900390625, + "epoch": 27.888059701492537, + "grad_norm": 0.0, + "learning_rate": 4.5384983195588534e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3710 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.44444274902344, + "epoch": 27.895522388059703, + "grad_norm": 0.0, + "learning_rate": 4.5381212469714823e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3711 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.5277862548828, + "epoch": 27.902985074626866, + "grad_norm": 1.539493235250217, + "learning_rate": 4.537744036079984e-07, + "loss": 0.0044, + "reward": 1.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 1.0, + "step": 3712 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.5277862548828, + "epoch": 27.91044776119403, + "grad_norm": 0.0, + "learning_rate": 4.5373666869099537e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3713 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.5833282470703, + "epoch": 27.917910447761194, + "grad_norm": 1.3473629465963521, + "learning_rate": 4.536989199486998e-07, + "loss": -0.0011, + "reward": 1.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 1.0, + "step": 3714 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.7777862548828, + "epoch": 27.925373134328357, + "grad_norm": 0.0, + "learning_rate": 4.536611573836734e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3715 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.88888549804688, + "epoch": 27.932835820895523, + "grad_norm": 0.0, + "learning_rate": 4.5362338099847856e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3716 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.0277862548828, + "epoch": 27.940298507462686, + "grad_norm": 0.0, + "learning_rate": 4.5358559079567884e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3717 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.75, + "epoch": 27.94776119402985, + "grad_norm": 0.0, + "learning_rate": 4.535477867778385e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3718 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.61111450195312, + "epoch": 27.955223880597014, + "grad_norm": 0.0, + "learning_rate": 4.53509968947523e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3719 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.1666717529297, + "epoch": 27.96268656716418, + "grad_norm": 0.852384460579703, + "learning_rate": 4.534721373072985e-07, + "loss": 0.0003, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 3720 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.6666717529297, + "epoch": 27.970149253731343, + "grad_norm": 0.0, + "learning_rate": 4.5343429185973225e-07, + "loss": 0.0, + "reward": 1.6666666269302368, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 1.0, + "step": 3721 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.2777862548828, + "epoch": 27.97761194029851, + "grad_norm": 0.0, + "learning_rate": 4.5339643260739245e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3722 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.36111450195312, + "epoch": 27.98507462686567, + "grad_norm": 0.724912258737734, + "learning_rate": 4.5335855955284807e-07, + "loss": -0.0021, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 3723 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.25, + "epoch": 27.992537313432837, + "grad_norm": 0.0, + "learning_rate": 4.5332067269866913e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3724 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.3333282470703, + "epoch": 28.007462686567163, + "grad_norm": 0.0, + "learning_rate": 4.532827720474267e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3725 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.5, + "epoch": 28.01492537313433, + "grad_norm": 0.0, + "learning_rate": 4.5324485760169254e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3726 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.5, + "epoch": 28.02238805970149, + "grad_norm": 0.0, + "learning_rate": 4.532069293640396e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3727 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.0277862548828, + "epoch": 28.029850746268657, + "grad_norm": 0.6510705666695125, + "learning_rate": 4.5316898733704146e-07, + "loss": -0.003, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 3728 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.69444274902344, + "epoch": 28.03731343283582, + "grad_norm": 1.0570311188217976, + "learning_rate": 4.5313103152327294e-07, + "loss": 0.0021, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3729 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.30555725097656, + "epoch": 28.044776119402986, + "grad_norm": 0.0, + "learning_rate": 4.5309306192530967e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3730 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.5, + "epoch": 28.05223880597015, + "grad_norm": 0.8654151599294282, + "learning_rate": 4.530550785457282e-07, + "loss": -0.0044, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3731 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.3333282470703, + "epoch": 28.059701492537314, + "grad_norm": 0.0, + "learning_rate": 4.53017081387106e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3732 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.36111450195312, + "epoch": 28.067164179104477, + "grad_norm": 0.9369697867818513, + "learning_rate": 4.529790704520215e-07, + "loss": -0.0004, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 3733 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.69444274902344, + "epoch": 28.074626865671643, + "grad_norm": 0.0, + "learning_rate": 4.5294104574305416e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3734 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.86111450195312, + "epoch": 28.082089552238806, + "grad_norm": 1.0798398148897015, + "learning_rate": 4.5290300726278407e-07, + "loss": -0.0005, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3735 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.30555725097656, + "epoch": 28.08955223880597, + "grad_norm": 0.0, + "learning_rate": 4.5286495501379273e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3736 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.86111450195312, + "epoch": 28.097014925373134, + "grad_norm": 0.0, + "learning_rate": 4.528268889986622e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3737 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.9166717529297, + "epoch": 28.104477611940297, + "grad_norm": 0.0, + "learning_rate": 4.5278880921997566e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3738 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.30555725097656, + "epoch": 28.111940298507463, + "grad_norm": 0.0, + "learning_rate": 4.52750715680317e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3739 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.94444274902344, + "epoch": 28.119402985074625, + "grad_norm": 0.6175650640426698, + "learning_rate": 4.5271260838227123e-07, + "loss": 0.0004, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 3740 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.0277862548828, + "epoch": 28.12686567164179, + "grad_norm": 0.0, + "learning_rate": 4.526744873284244e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3741 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.0, + "epoch": 28.134328358208954, + "grad_norm": 0.0, + "learning_rate": 4.5263635252136324e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3742 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.5, + "epoch": 28.14179104477612, + "grad_norm": 0.5996882493995221, + "learning_rate": 4.525982039636755e-07, + "loss": 0.0005, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 3743 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.47222900390625, + "epoch": 28.149253731343283, + "grad_norm": 0.9414442883317773, + "learning_rate": 4.525600416579499e-07, + "loss": 0.0012, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3744 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.5277862548828, + "epoch": 28.15671641791045, + "grad_norm": 0.0, + "learning_rate": 4.525218656067762e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3745 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.0, + "epoch": 28.16417910447761, + "grad_norm": 35.68838523910523, + "learning_rate": 4.524836758127449e-07, + "loss": 0.0036, + "reward": 1.8055555820465088, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 3746 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.2777862548828, + "epoch": 28.171641791044777, + "grad_norm": 0.0, + "learning_rate": 4.5244547227844746e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3747 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.7777862548828, + "epoch": 28.17910447761194, + "grad_norm": 0.0, + "learning_rate": 4.5240725500647637e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3748 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.6666717529297, + "epoch": 28.186567164179106, + "grad_norm": 0.0, + "learning_rate": 4.52369023999425e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3749 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.22222900390625, + "epoch": 28.19402985074627, + "grad_norm": 0.0, + "learning_rate": 4.523307792598876e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3750 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.5, + "epoch": 28.20149253731343, + "grad_norm": 0.0, + "learning_rate": 4.5229252079045953e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3751 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.72222900390625, + "epoch": 28.208955223880597, + "grad_norm": 0.0, + "learning_rate": 4.5225424859373684e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3752 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.44444274902344, + "epoch": 28.21641791044776, + "grad_norm": 0.0, + "learning_rate": 4.5221596267231665e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3753 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.8333282470703, + "epoch": 28.223880597014926, + "grad_norm": 0.2920558046100685, + "learning_rate": 4.52177663028797e-07, + "loss": 0.0, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.9722222089767456, + "step": 3754 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.0833282470703, + "epoch": 28.23134328358209, + "grad_norm": 3.072182341058146, + "learning_rate": 4.5213934966577694e-07, + "loss": -0.0105, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3755 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.5277862548828, + "epoch": 28.238805970149254, + "grad_norm": 0.0, + "learning_rate": 4.521010225858562e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3756 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.5833282470703, + "epoch": 28.246268656716417, + "grad_norm": 1.691092302516338, + "learning_rate": 4.520626817916357e-07, + "loss": 0.0002, + "reward": 1.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3757 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.6666717529297, + "epoch": 28.253731343283583, + "grad_norm": 0.0, + "learning_rate": 4.520243272857172e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3758 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.94444274902344, + "epoch": 28.261194029850746, + "grad_norm": 0.0, + "learning_rate": 4.5198595907070334e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3759 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.8333282470703, + "epoch": 28.26865671641791, + "grad_norm": 0.0, + "learning_rate": 4.519475771491977e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3760 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.97222900390625, + "epoch": 28.276119402985074, + "grad_norm": 0.531179952103952, + "learning_rate": 4.5190918152380504e-07, + "loss": -0.0013, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3761 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.4166717529297, + "epoch": 28.28358208955224, + "grad_norm": 0.0, + "learning_rate": 4.5187077219713054e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3762 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.4166717529297, + "epoch": 28.291044776119403, + "grad_norm": 0.0, + "learning_rate": 4.518323491717808e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3763 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.0, + "epoch": 28.298507462686565, + "grad_norm": 0.0, + "learning_rate": 4.5179391245036313e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3764 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.30555725097656, + "epoch": 28.30597014925373, + "grad_norm": 0.0, + "learning_rate": 4.517554620354857e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3765 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.9166717529297, + "epoch": 28.313432835820894, + "grad_norm": 0.0, + "learning_rate": 4.517169979297578e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3766 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.72222900390625, + "epoch": 28.32089552238806, + "grad_norm": 0.0, + "learning_rate": 4.516785201357895e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3767 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.2777862548828, + "epoch": 28.328358208955223, + "grad_norm": 0.0, + "learning_rate": 4.516400286561919e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3768 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.4166717529297, + "epoch": 28.33582089552239, + "grad_norm": 0.0, + "learning_rate": 4.516015234935769e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3769 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.22222900390625, + "epoch": 28.34328358208955, + "grad_norm": 0.0, + "learning_rate": 4.515630046505574e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3770 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.8333282470703, + "epoch": 28.350746268656717, + "grad_norm": 0.0, + "learning_rate": 4.5152447212974743e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3771 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.30555725097656, + "epoch": 28.35820895522388, + "grad_norm": 0.0, + "learning_rate": 4.514859259337615e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3772 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.61111450195312, + "epoch": 28.365671641791046, + "grad_norm": 1.4222067459834131, + "learning_rate": 4.5144736606521546e-07, + "loss": -0.0006, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3773 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.63888549804688, + "epoch": 28.37313432835821, + "grad_norm": 0.0, + "learning_rate": 4.5140879252672585e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3774 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.5833282470703, + "epoch": 28.380597014925375, + "grad_norm": 0.0, + "learning_rate": 4.513702053209103e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3775 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.88888549804688, + "epoch": 28.388059701492537, + "grad_norm": 0.0, + "learning_rate": 4.513316044503872e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3776 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.1666717529297, + "epoch": 28.395522388059703, + "grad_norm": 0.0, + "learning_rate": 4.51292989917776e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3777 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.5277862548828, + "epoch": 28.402985074626866, + "grad_norm": 0.0, + "learning_rate": 4.512543617256971e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3778 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.0833282470703, + "epoch": 28.41044776119403, + "grad_norm": 0.0, + "learning_rate": 4.5121571987677155e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3779 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.0, + "epoch": 28.417910447761194, + "grad_norm": 1.2979905107191454, + "learning_rate": 4.5117706437362166e-07, + "loss": -0.0001, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3780 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.38888549804688, + "epoch": 28.425373134328357, + "grad_norm": 0.0, + "learning_rate": 4.5113839521887067e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3781 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.61111450195312, + "epoch": 28.432835820895523, + "grad_norm": 0.0, + "learning_rate": 4.510997124151424e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3782 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.80555725097656, + "epoch": 28.440298507462686, + "grad_norm": 0.0, + "learning_rate": 4.510610159650619e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3783 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.94444274902344, + "epoch": 28.44776119402985, + "grad_norm": 0.0, + "learning_rate": 4.5102230587125504e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3784 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.3333282470703, + "epoch": 28.455223880597014, + "grad_norm": 0.0, + "learning_rate": 4.509835821363487e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3785 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.80555725097656, + "epoch": 28.46268656716418, + "grad_norm": 0.0, + "learning_rate": 4.509448447629706e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3786 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.38888549804688, + "epoch": 28.470149253731343, + "grad_norm": 0.6471997405078757, + "learning_rate": 4.5090609375374923e-07, + "loss": 0.0008, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 3787 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.3333282470703, + "epoch": 28.47761194029851, + "grad_norm": 0.0, + "learning_rate": 4.5086732911131444e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3788 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.94444274902344, + "epoch": 28.48507462686567, + "grad_norm": 0.0, + "learning_rate": 4.5082855083829665e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3789 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.25, + "epoch": 28.492537313432837, + "grad_norm": 0.0, + "learning_rate": 4.507897589373272e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3790 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.61111450195312, + "epoch": 28.5, + "grad_norm": 0.6506336847397336, + "learning_rate": 4.5075095341103857e-07, + "loss": 0.0006, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 3791 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.5277862548828, + "epoch": 28.507462686567163, + "grad_norm": 0.6979172771574907, + "learning_rate": 4.50712134262064e-07, + "loss": 0.0, + "reward": 1.8055555820465088, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 3792 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.0833282470703, + "epoch": 28.51492537313433, + "grad_norm": 0.0, + "learning_rate": 4.5067330149303777e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3793 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.22222900390625, + "epoch": 28.52238805970149, + "grad_norm": 0.0, + "learning_rate": 4.50634455106595e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3794 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.13888549804688, + "epoch": 28.529850746268657, + "grad_norm": 0.0, + "learning_rate": 4.5059559510537167e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3795 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.5, + "epoch": 28.53731343283582, + "grad_norm": 0.0, + "learning_rate": 4.5055672149200485e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3796 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.72222137451172, + "epoch": 28.544776119402986, + "grad_norm": 0.0, + "learning_rate": 4.5051783426913235e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3797 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.69444274902344, + "epoch": 28.55223880597015, + "grad_norm": 0.0, + "learning_rate": 4.5047893343939314e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3798 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.63888549804688, + "epoch": 28.559701492537314, + "grad_norm": 0.0, + "learning_rate": 4.504400190054269e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3799 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.8888931274414, + "epoch": 28.567164179104477, + "grad_norm": 0.0, + "learning_rate": 4.504010909698743e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3800 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.05555725097656, + "epoch": 28.574626865671643, + "grad_norm": 0.0, + "learning_rate": 4.5036214933537696e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3801 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.22222900390625, + "epoch": 28.582089552238806, + "grad_norm": 0.636554941940687, + "learning_rate": 4.503231941045775e-07, + "loss": -0.0005, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3802 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.36111450195312, + "epoch": 28.58955223880597, + "grad_norm": 0.0, + "learning_rate": 4.502842252801192e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3803 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.9166717529297, + "epoch": 28.597014925373134, + "grad_norm": 0.0, + "learning_rate": 4.5024524286464657e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3804 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.75, + "epoch": 28.604477611940297, + "grad_norm": 0.4779317664780177, + "learning_rate": 4.502062468608048e-07, + "loss": 0.0, + "reward": 1.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 3805 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.44444274902344, + "epoch": 28.611940298507463, + "grad_norm": 0.0, + "learning_rate": 4.5016723727124017e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3806 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.8333282470703, + "epoch": 28.619402985074625, + "grad_norm": 0.0, + "learning_rate": 4.501282140985998e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3807 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.69444274902344, + "epoch": 28.62686567164179, + "grad_norm": 0.0, + "learning_rate": 4.5008917734553175e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3808 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.2777862548828, + "epoch": 28.634328358208954, + "grad_norm": 0.0, + "learning_rate": 4.500501270146849e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3809 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.61111450195312, + "epoch": 28.64179104477612, + "grad_norm": 0.0, + "learning_rate": 4.500110631087094e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3810 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.61111450195312, + "epoch": 28.649253731343283, + "grad_norm": 0.0, + "learning_rate": 4.4997198563025583e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3811 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.2777862548828, + "epoch": 28.65671641791045, + "grad_norm": 0.7390070928789343, + "learning_rate": 4.4993289458197614e-07, + "loss": 0.0012, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 3812 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.0, + "epoch": 28.66417910447761, + "grad_norm": 0.0, + "learning_rate": 4.498937899665228e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3813 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.05555725097656, + "epoch": 28.671641791044777, + "grad_norm": 0.0, + "learning_rate": 4.4985467178654947e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3814 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.36111450195312, + "epoch": 28.67910447761194, + "grad_norm": 0.0, + "learning_rate": 4.498155400447107e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3815 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.19444274902344, + "epoch": 28.686567164179106, + "grad_norm": 0.0, + "learning_rate": 4.4977639474366194e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3816 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.5833282470703, + "epoch": 28.69402985074627, + "grad_norm": 0.0, + "learning_rate": 4.497372358860594e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3817 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.13888549804688, + "epoch": 28.701492537313435, + "grad_norm": 0.0, + "learning_rate": 4.496980634745604e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3818 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.47222900390625, + "epoch": 28.708955223880597, + "grad_norm": 0.0, + "learning_rate": 4.4965887751182317e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3819 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.72222900390625, + "epoch": 28.71641791044776, + "grad_norm": 0.0, + "learning_rate": 4.496196780005068e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3820 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.4166717529297, + "epoch": 28.723880597014926, + "grad_norm": 0.0, + "learning_rate": 4.495804649432714e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3821 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.5277862548828, + "epoch": 28.73134328358209, + "grad_norm": 0.0, + "learning_rate": 4.495412383427778e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3822 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.6666717529297, + "epoch": 28.738805970149254, + "grad_norm": 1.6613812971579205, + "learning_rate": 4.4950199820168785e-07, + "loss": 0.0003, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 3823 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.6666717529297, + "epoch": 28.746268656716417, + "grad_norm": 0.0, + "learning_rate": 4.4946274452266444e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3824 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.5, + "epoch": 28.753731343283583, + "grad_norm": 1.4030324938155794, + "learning_rate": 4.4942347730837105e-07, + "loss": 0.0003, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3825 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.11111450195312, + "epoch": 28.761194029850746, + "grad_norm": 1.204544923515471, + "learning_rate": 4.4938419656147256e-07, + "loss": -0.0, + "reward": 1.888888955116272, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3826 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.13888549804688, + "epoch": 28.76865671641791, + "grad_norm": 0.0, + "learning_rate": 4.493449022846344e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3827 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.13888549804688, + "epoch": 28.776119402985074, + "grad_norm": 0.0, + "learning_rate": 4.4930559448052306e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3828 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.2777862548828, + "epoch": 28.78358208955224, + "grad_norm": 0.0, + "learning_rate": 4.4926627315180586e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3829 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.0, + "epoch": 28.791044776119403, + "grad_norm": 1.824591681142385, + "learning_rate": 4.4922693830115115e-07, + "loss": -0.0001, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 3830 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.9166717529297, + "epoch": 28.798507462686565, + "grad_norm": 0.0, + "learning_rate": 4.491875899312281e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3831 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.8333282470703, + "epoch": 28.80597014925373, + "grad_norm": 0.0, + "learning_rate": 4.4914822804470677e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3832 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.8333282470703, + "epoch": 28.813432835820894, + "grad_norm": 0.0, + "learning_rate": 4.4910885264425834e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3833 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.44444274902344, + "epoch": 28.82089552238806, + "grad_norm": 1.6899693233026536, + "learning_rate": 4.4906946373255464e-07, + "loss": 0.0169, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3834 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.86111450195312, + "epoch": 28.828358208955223, + "grad_norm": 0.0, + "learning_rate": 4.490300613122687e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3835 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.44444274902344, + "epoch": 28.83582089552239, + "grad_norm": 0.0, + "learning_rate": 4.4899064538607424e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3836 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.44444274902344, + "epoch": 28.84328358208955, + "grad_norm": 0.0, + "learning_rate": 4.4895121595664586e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3837 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.36111450195312, + "epoch": 28.850746268656717, + "grad_norm": 0.4121046247511898, + "learning_rate": 4.489117730266594e-07, + "loss": -0.0001, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3838 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.80555725097656, + "epoch": 28.85820895522388, + "grad_norm": 0.0, + "learning_rate": 4.488723165987912e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3839 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.6666717529297, + "epoch": 28.865671641791046, + "grad_norm": 0.0, + "learning_rate": 4.4883284667571887e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3840 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.22222900390625, + "epoch": 28.87313432835821, + "grad_norm": 0.8371698138968526, + "learning_rate": 4.4879336326012065e-07, + "loss": 0.0069, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3841 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.38888549804688, + "epoch": 28.880597014925375, + "grad_norm": 0.0, + "learning_rate": 4.487538663546759e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3842 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.0, + "epoch": 28.888059701492537, + "grad_norm": 0.0, + "learning_rate": 4.4871435596206497e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3843 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.30555725097656, + "epoch": 28.895522388059703, + "grad_norm": 0.0, + "learning_rate": 4.486748320849687e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3844 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.88888549804688, + "epoch": 28.902985074626866, + "grad_norm": 0.0, + "learning_rate": 4.486352947260694e-07, + "loss": 0.0, + "reward": 1.6666666269302368, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 1.0, + "step": 3845 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.22222900390625, + "epoch": 28.91044776119403, + "grad_norm": 0.0, + "learning_rate": 4.4859574388804977e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3846 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.19444274902344, + "epoch": 28.917910447761194, + "grad_norm": 0.0, + "learning_rate": 4.485561795735939e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3847 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.44444274902344, + "epoch": 28.925373134328357, + "grad_norm": 0.0, + "learning_rate": 4.485166017853864e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3848 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.13888549804688, + "epoch": 28.932835820895523, + "grad_norm": 0.5815708431374121, + "learning_rate": 4.4847701052611303e-07, + "loss": 0.0, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 3849 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.55555725097656, + "epoch": 28.940298507462686, + "grad_norm": 0.0, + "learning_rate": 4.4843740579846045e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3850 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.38888549804688, + "epoch": 28.94776119402985, + "grad_norm": 0.0, + "learning_rate": 4.483977876051161e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3851 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.44444274902344, + "epoch": 28.955223880597014, + "grad_norm": 1.9828130143751337, + "learning_rate": 4.483581559487685e-07, + "loss": -0.0009, + "reward": 1.9444444179534912, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.9722222089767456, + "step": 3852 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.19444274902344, + "epoch": 28.96268656716418, + "grad_norm": 0.0, + "learning_rate": 4.483185108321069e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3853 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.0, + "epoch": 28.970149253731343, + "grad_norm": 0.0, + "learning_rate": 4.482788522578216e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3854 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.47222900390625, + "epoch": 28.97761194029851, + "grad_norm": 0.0, + "learning_rate": 4.482391802286038e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3855 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.61111450195312, + "epoch": 28.98507462686567, + "grad_norm": 0.0, + "learning_rate": 4.481994947471456e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3856 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.0, + "epoch": 28.992537313432837, + "grad_norm": 0.0, + "learning_rate": 4.481597958161399e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3857 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.47222900390625, + "epoch": 29.007462686567163, + "grad_norm": 0.0, + "learning_rate": 4.4812008343828084e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3858 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.25, + "epoch": 29.01492537313433, + "grad_norm": 0.0, + "learning_rate": 4.4808035761626305e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3859 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.52777862548828, + "epoch": 29.02238805970149, + "grad_norm": 0.0, + "learning_rate": 4.480406183527823e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3860 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.75, + "epoch": 29.029850746268657, + "grad_norm": 2.086916288588168, + "learning_rate": 4.480008656505353e-07, + "loss": 0.0056, + "reward": 1.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3861 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.88888549804688, + "epoch": 29.03731343283582, + "grad_norm": 0.0, + "learning_rate": 4.4796109951221963e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3862 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.5277862548828, + "epoch": 29.044776119402986, + "grad_norm": 0.0, + "learning_rate": 4.479213199405336e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3863 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.61111450195312, + "epoch": 29.05223880597015, + "grad_norm": 0.0, + "learning_rate": 4.478815269381768e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3864 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.94444274902344, + "epoch": 29.059701492537314, + "grad_norm": 0.0, + "learning_rate": 4.478417205078494e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3865 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.75, + "epoch": 29.067164179104477, + "grad_norm": 0.0, + "learning_rate": 4.4780190065225273e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3866 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.5, + "epoch": 29.074626865671643, + "grad_norm": 0.0, + "learning_rate": 4.477620673740888e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3867 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.0277862548828, + "epoch": 29.082089552238806, + "grad_norm": 0.0, + "learning_rate": 4.477222206760606e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3868 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.6666717529297, + "epoch": 29.08955223880597, + "grad_norm": 0.0, + "learning_rate": 4.4768236056087227e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3869 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.80555725097656, + "epoch": 29.097014925373134, + "grad_norm": 0.0, + "learning_rate": 4.476424870312285e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3870 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.05555725097656, + "epoch": 29.104477611940297, + "grad_norm": 0.26080818317649235, + "learning_rate": 4.4760260008983506e-07, + "loss": -0.0001, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 3871 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.63888549804688, + "epoch": 29.111940298507463, + "grad_norm": 0.7969966897984829, + "learning_rate": 4.475626997393987e-07, + "loss": 0.0008, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 3872 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.75, + "epoch": 29.119402985074625, + "grad_norm": 1.1450075476911017, + "learning_rate": 4.4752278598262695e-07, + "loss": -0.0001, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3873 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.25, + "epoch": 29.12686567164179, + "grad_norm": 0.0, + "learning_rate": 4.4748285882222835e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3874 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.05555725097656, + "epoch": 29.134328358208954, + "grad_norm": 1.3784758394305692, + "learning_rate": 4.4744291826091223e-07, + "loss": -0.0001, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3875 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.63888549804688, + "epoch": 29.14179104477612, + "grad_norm": 0.0, + "learning_rate": 4.474029643013889e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3876 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.44444274902344, + "epoch": 29.149253731343283, + "grad_norm": 0.0, + "learning_rate": 4.473629969463697e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3877 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.9166717529297, + "epoch": 29.15671641791045, + "grad_norm": 0.0, + "learning_rate": 4.473230161985667e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3878 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.0833282470703, + "epoch": 29.16417910447761, + "grad_norm": 0.0, + "learning_rate": 4.472830220606929e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3879 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.0277862548828, + "epoch": 29.171641791044777, + "grad_norm": 0.0, + "learning_rate": 4.472430145354622e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3880 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.19444274902344, + "epoch": 29.17910447761194, + "grad_norm": 0.0, + "learning_rate": 4.472029936255896e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3881 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.44444274902344, + "epoch": 29.186567164179106, + "grad_norm": 0.0, + "learning_rate": 4.471629593337908e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3882 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.30555725097656, + "epoch": 29.19402985074627, + "grad_norm": 1.7051868312327092, + "learning_rate": 4.4712291166278244e-07, + "loss": 0.0004, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 3883 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.0, + "epoch": 29.20149253731343, + "grad_norm": 0.0, + "learning_rate": 4.470828506152821e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3884 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.05555725097656, + "epoch": 29.208955223880597, + "grad_norm": 0.5305060903888371, + "learning_rate": 4.470427761940083e-07, + "loss": 0.0008, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3885 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.8333282470703, + "epoch": 29.21641791044776, + "grad_norm": 0.0, + "learning_rate": 4.470026884016804e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3886 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.6666717529297, + "epoch": 29.223880597014926, + "grad_norm": 0.0, + "learning_rate": 4.4696258724101887e-07, + "loss": 0.0, + "reward": 1.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 1.0, + "step": 3887 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.38888549804688, + "epoch": 29.23134328358209, + "grad_norm": 0.0, + "learning_rate": 4.4692247271474467e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3888 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.44444274902344, + "epoch": 29.238805970149254, + "grad_norm": 0.0, + "learning_rate": 4.4688234482558006e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3889 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.69444274902344, + "epoch": 29.246268656716417, + "grad_norm": 1.0585616393624597, + "learning_rate": 4.4684220357624794e-07, + "loss": 0.0018, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3890 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.3333282470703, + "epoch": 29.253731343283583, + "grad_norm": 0.0, + "learning_rate": 4.4680204896947243e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3891 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.25, + "epoch": 29.261194029850746, + "grad_norm": 0.0, + "learning_rate": 4.4676188100797826e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3892 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.30555725097656, + "epoch": 29.26865671641791, + "grad_norm": 0.0, + "learning_rate": 4.4672169969449114e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3893 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.2777862548828, + "epoch": 29.276119402985074, + "grad_norm": 0.0, + "learning_rate": 4.466815050317378e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3894 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.0277862548828, + "epoch": 29.28358208955224, + "grad_norm": 0.0, + "learning_rate": 4.4664129702244566e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3895 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.69444274902344, + "epoch": 29.291044776119403, + "grad_norm": 0.0, + "learning_rate": 4.4660107566934346e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3896 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.55555725097656, + "epoch": 29.298507462686565, + "grad_norm": 0.0, + "learning_rate": 4.465608409751602e-07, + "loss": 0.0, + "reward": 1.6666666269302368, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 1.0, + "step": 3897 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.05555725097656, + "epoch": 29.30597014925373, + "grad_norm": 0.0, + "learning_rate": 4.4652059294262647e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3898 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.5833282470703, + "epoch": 29.313432835820894, + "grad_norm": 0.0, + "learning_rate": 4.4648033157447325e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3899 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.5277862548828, + "epoch": 29.32089552238806, + "grad_norm": 0.7873151840239329, + "learning_rate": 4.4644005687343265e-07, + "loss": 0.0016, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3900 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.05555725097656, + "epoch": 29.328358208955223, + "grad_norm": 0.0, + "learning_rate": 4.463997688422377e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3901 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.5, + "epoch": 29.33582089552239, + "grad_norm": 0.0, + "learning_rate": 4.463594674836223e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3902 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.61111450195312, + "epoch": 29.34328358208955, + "grad_norm": 0.0, + "learning_rate": 4.463191528003213e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3903 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.97222900390625, + "epoch": 29.350746268656717, + "grad_norm": 0.0, + "learning_rate": 4.462788247950702e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3904 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.9166717529297, + "epoch": 29.35820895522388, + "grad_norm": 0.0, + "learning_rate": 4.462384834706058e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3905 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.19444274902344, + "epoch": 29.365671641791046, + "grad_norm": 0.0, + "learning_rate": 4.461981288296655e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3906 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.55555725097656, + "epoch": 29.37313432835821, + "grad_norm": 0.0, + "learning_rate": 4.4615776087498773e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3907 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.6666717529297, + "epoch": 29.380597014925375, + "grad_norm": 0.0, + "learning_rate": 4.461173796093118e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3908 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.0, + "epoch": 29.388059701492537, + "grad_norm": 0.0, + "learning_rate": 4.4607698503537803e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3909 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.44444274902344, + "epoch": 29.395522388059703, + "grad_norm": 0.0, + "learning_rate": 4.4603657715592745e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3910 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.4166717529297, + "epoch": 29.402985074626866, + "grad_norm": 0.0, + "learning_rate": 4.459961559737021e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3911 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.0833282470703, + "epoch": 29.41044776119403, + "grad_norm": 1.3669591178381335, + "learning_rate": 4.4595572149144486e-07, + "loss": -0.0002, + "reward": 1.638888955116272, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 1.0, + "step": 3912 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.36111450195312, + "epoch": 29.417910447761194, + "grad_norm": 0.0, + "learning_rate": 4.4591527371189955e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3913 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.25, + "epoch": 29.425373134328357, + "grad_norm": 0.48763322110871743, + "learning_rate": 4.4587481263781104e-07, + "loss": -0.0002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3914 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.88888549804688, + "epoch": 29.432835820895523, + "grad_norm": 0.0, + "learning_rate": 4.458343382719248e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3915 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.3333282470703, + "epoch": 29.440298507462686, + "grad_norm": 0.0, + "learning_rate": 4.457938506169875e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3916 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.86111450195312, + "epoch": 29.44776119402985, + "grad_norm": 0.0, + "learning_rate": 4.457533496757465e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3917 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.2777862548828, + "epoch": 29.455223880597014, + "grad_norm": 0.0, + "learning_rate": 4.4571283545095007e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3918 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.5833282470703, + "epoch": 29.46268656716418, + "grad_norm": 0.0, + "learning_rate": 4.456723079453476e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3919 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.69444274902344, + "epoch": 29.470149253731343, + "grad_norm": 0.0, + "learning_rate": 4.4563176716168917e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3920 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.88888549804688, + "epoch": 29.47761194029851, + "grad_norm": 0.6823798944512571, + "learning_rate": 4.455912131027258e-07, + "loss": 0.0074, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 3921 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.36111450195312, + "epoch": 29.48507462686567, + "grad_norm": 0.0, + "learning_rate": 4.455506457712095e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3922 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.44444274902344, + "epoch": 29.492537313432837, + "grad_norm": 0.0, + "learning_rate": 4.45510065169893e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3923 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.2777862548828, + "epoch": 29.5, + "grad_norm": 0.0, + "learning_rate": 4.4546947130153014e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3924 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.7777862548828, + "epoch": 29.507462686567163, + "grad_norm": 0.0, + "learning_rate": 4.454288641688755e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3925 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.47222900390625, + "epoch": 29.51492537313433, + "grad_norm": 0.0, + "learning_rate": 4.4538824377468466e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3926 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.22222900390625, + "epoch": 29.52238805970149, + "grad_norm": 0.0, + "learning_rate": 4.4534761012171404e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3927 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.8333282470703, + "epoch": 29.529850746268657, + "grad_norm": 0.0, + "learning_rate": 4.4530696321272105e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3928 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.1666717529297, + "epoch": 29.53731343283582, + "grad_norm": 0.0, + "learning_rate": 4.4526630305046384e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3929 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.22222900390625, + "epoch": 29.544776119402986, + "grad_norm": 0.0, + "learning_rate": 4.4522562963770167e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3930 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.36111450195312, + "epoch": 29.55223880597015, + "grad_norm": 0.0, + "learning_rate": 4.4518494297719444e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3931 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.97222900390625, + "epoch": 29.559701492537314, + "grad_norm": 0.0, + "learning_rate": 4.451442430717032e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3932 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.1666717529297, + "epoch": 29.567164179104477, + "grad_norm": 0.0, + "learning_rate": 4.4510352992398983e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3933 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.61111450195312, + "epoch": 29.574626865671643, + "grad_norm": 0.5783019920072611, + "learning_rate": 4.450628035368169e-07, + "loss": -0.0002, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3934 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.55555725097656, + "epoch": 29.582089552238806, + "grad_norm": 1.1266649368367692, + "learning_rate": 4.4502206391294817e-07, + "loss": 0.0001, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 3935 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.44444274902344, + "epoch": 29.58955223880597, + "grad_norm": 0.0, + "learning_rate": 4.4498131105514814e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3936 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.1388931274414, + "epoch": 29.597014925373134, + "grad_norm": 0.0, + "learning_rate": 4.449405449661823e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3937 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.69444274902344, + "epoch": 29.604477611940297, + "grad_norm": 0.0, + "learning_rate": 4.448997656488168e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3938 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.1666717529297, + "epoch": 29.611940298507463, + "grad_norm": 0.0, + "learning_rate": 4.4485897310581915e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3939 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.44444274902344, + "epoch": 29.619402985074625, + "grad_norm": 0.8541045652556736, + "learning_rate": 4.4481816733995726e-07, + "loss": 0.0, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3940 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.22222900390625, + "epoch": 29.62686567164179, + "grad_norm": 0.5288563438694481, + "learning_rate": 4.447773483540002e-07, + "loss": 0.0, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3941 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.4166717529297, + "epoch": 29.634328358208954, + "grad_norm": 0.0, + "learning_rate": 4.4473651615071793e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3942 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.4166717529297, + "epoch": 29.64179104477612, + "grad_norm": 0.0, + "learning_rate": 4.446956707328813e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3943 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.80555725097656, + "epoch": 29.649253731343283, + "grad_norm": 0.0, + "learning_rate": 4.4465481210326193e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3944 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.5, + "epoch": 29.65671641791045, + "grad_norm": 0.0, + "learning_rate": 4.4461394026463253e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3945 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.7777862548828, + "epoch": 29.66417910447761, + "grad_norm": 0.0, + "learning_rate": 4.445730552197664e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3946 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.0277862548828, + "epoch": 29.671641791044777, + "grad_norm": 0.0, + "learning_rate": 4.445321569714382e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3947 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.9166717529297, + "epoch": 29.67910447761194, + "grad_norm": 0.0, + "learning_rate": 4.4449124552242313e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3948 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.63888549804688, + "epoch": 29.686567164179106, + "grad_norm": 0.0, + "learning_rate": 4.4445032087549737e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3949 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.9166717529297, + "epoch": 29.69402985074627, + "grad_norm": 0.0, + "learning_rate": 4.44409383033438e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3950 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.11111450195312, + "epoch": 29.701492537313435, + "grad_norm": 0.0, + "learning_rate": 4.4436843199902303e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3951 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.3333282470703, + "epoch": 29.708955223880597, + "grad_norm": 1.0156601710987838, + "learning_rate": 4.4432746777503137e-07, + "loss": -0.0001, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 3952 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.88888549804688, + "epoch": 29.71641791044776, + "grad_norm": 1.1784707993373906, + "learning_rate": 4.442864903642427e-07, + "loss": 0.0006, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 3953 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.5, + "epoch": 29.723880597014926, + "grad_norm": 0.0, + "learning_rate": 4.442454997694378e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3954 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.0833282470703, + "epoch": 29.73134328358209, + "grad_norm": 0.0, + "learning_rate": 4.4420449599339814e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3955 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.36111450195312, + "epoch": 29.738805970149254, + "grad_norm": 0.0, + "learning_rate": 4.441634790389063e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3956 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.47222900390625, + "epoch": 29.746268656716417, + "grad_norm": 0.9201953681128237, + "learning_rate": 4.4412244890874557e-07, + "loss": -0.0073, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 3957 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.1666717529297, + "epoch": 29.753731343283583, + "grad_norm": 0.0, + "learning_rate": 4.440814056057002e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3958 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.05555725097656, + "epoch": 29.761194029850746, + "grad_norm": 0.0, + "learning_rate": 4.4404034913255525e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3959 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.9166717529297, + "epoch": 29.76865671641791, + "grad_norm": 0.0, + "learning_rate": 4.439992794920968e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3960 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.22222900390625, + "epoch": 29.776119402985074, + "grad_norm": 0.0, + "learning_rate": 4.4395819668711194e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3961 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.8333282470703, + "epoch": 29.78358208955224, + "grad_norm": 0.0, + "learning_rate": 4.439171007203883e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 3962 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.13888549804688, + "epoch": 29.791044776119403, + "grad_norm": 0.0, + "learning_rate": 4.438759915947147e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3963 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.5277862548828, + "epoch": 29.798507462686565, + "grad_norm": 3.386940300996623, + "learning_rate": 4.438348693128807e-07, + "loss": -0.0008, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 3964 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.5833282470703, + "epoch": 29.80597014925373, + "grad_norm": 0.34221316181027334, + "learning_rate": 4.4379373387767677e-07, + "loss": 0.0001, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 3965 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.61111450195312, + "epoch": 29.813432835820894, + "grad_norm": 0.0, + "learning_rate": 4.4375258529189443e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3966 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.19444274902344, + "epoch": 29.82089552238806, + "grad_norm": 0.0, + "learning_rate": 4.437114235583258e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3967 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.0833282470703, + "epoch": 29.828358208955223, + "grad_norm": 0.0, + "learning_rate": 4.4367024867976416e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3968 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.5, + "epoch": 29.83582089552239, + "grad_norm": 1.1493319116521645, + "learning_rate": 4.436290606590036e-07, + "loss": -0.0091, + "reward": 1.9444444179534912, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 3969 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.38888549804688, + "epoch": 29.84328358208955, + "grad_norm": 0.0, + "learning_rate": 4.4358785949883896e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3970 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.11111450195312, + "epoch": 29.850746268656717, + "grad_norm": 0.0, + "learning_rate": 4.435466452020663e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3971 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.97222900390625, + "epoch": 29.85820895522388, + "grad_norm": 0.0, + "learning_rate": 4.435054177714822e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3972 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.0833282470703, + "epoch": 29.865671641791046, + "grad_norm": 1.0216579698034896, + "learning_rate": 4.434641772098843e-07, + "loss": -0.0003, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 3973 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.55555725097656, + "epoch": 29.87313432835821, + "grad_norm": 0.0, + "learning_rate": 4.434229235200713e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3974 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.86111450195312, + "epoch": 29.880597014925375, + "grad_norm": 0.0, + "learning_rate": 4.4338165670484234e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3975 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.0833282470703, + "epoch": 29.888059701492537, + "grad_norm": 0.0, + "learning_rate": 4.43340376766998e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3976 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.86111450195312, + "epoch": 29.895522388059703, + "grad_norm": 2.251356389851654, + "learning_rate": 4.4329908370933934e-07, + "loss": 0.0003, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3977 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.55555725097656, + "epoch": 29.902985074626866, + "grad_norm": 0.0, + "learning_rate": 4.432577775346684e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3978 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.61111450195312, + "epoch": 29.91044776119403, + "grad_norm": 0.0, + "learning_rate": 4.4321645824578835e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3979 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.75, + "epoch": 29.917910447761194, + "grad_norm": 2.0214438391191965, + "learning_rate": 4.4317512584550286e-07, + "loss": -0.0003, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 3980 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.55555725097656, + "epoch": 29.925373134328357, + "grad_norm": 0.0, + "learning_rate": 4.4313378033661674e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3981 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.2777862548828, + "epoch": 29.932835820895523, + "grad_norm": 0.49524882835482287, + "learning_rate": 4.4309242172193574e-07, + "loss": 0.0001, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3982 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.0, + "epoch": 29.940298507462686, + "grad_norm": 0.0, + "learning_rate": 4.4305105000426633e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3983 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.5277862548828, + "epoch": 29.94776119402985, + "grad_norm": 0.0, + "learning_rate": 4.430096651864159e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3984 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.0277862548828, + "epoch": 29.955223880597014, + "grad_norm": 1.0836487553924943, + "learning_rate": 4.429682672711929e-07, + "loss": -0.0022, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.944444477558136, + "step": 3985 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.05555725097656, + "epoch": 29.96268656716418, + "grad_norm": 0.7357679128077154, + "learning_rate": 4.429268562614064e-07, + "loss": -0.0003, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 3986 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.30555725097656, + "epoch": 29.970149253731343, + "grad_norm": 0.0, + "learning_rate": 4.4288543215986653e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3987 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.63888549804688, + "epoch": 29.97761194029851, + "grad_norm": 0.0, + "learning_rate": 4.428439949693843e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3988 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.94444274902344, + "epoch": 29.98507462686567, + "grad_norm": 0.0, + "learning_rate": 4.4280254469277156e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3989 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.1666717529297, + "epoch": 29.992537313432837, + "grad_norm": 0.651865478663432, + "learning_rate": 4.4276108133284105e-07, + "loss": -0.0005, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 3990 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.94444274902344, + "epoch": 30.007462686567163, + "grad_norm": 0.0, + "learning_rate": 4.4271960489240647e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3991 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.94444274902344, + "epoch": 30.01492537313433, + "grad_norm": 0.7641174766671486, + "learning_rate": 4.4267811537428235e-07, + "loss": -0.0004, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 3992 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.6666717529297, + "epoch": 30.02238805970149, + "grad_norm": 0.9211689224905112, + "learning_rate": 4.4263661278128407e-07, + "loss": 0.0002, + "reward": 1.9444444179534912, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 3993 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.1666717529297, + "epoch": 30.029850746268657, + "grad_norm": 0.0, + "learning_rate": 4.425950971162279e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3994 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.30555725097656, + "epoch": 30.03731343283582, + "grad_norm": 0.0, + "learning_rate": 4.4255356838193115e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3995 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.44444274902344, + "epoch": 30.044776119402986, + "grad_norm": 0.0, + "learning_rate": 4.425120265812118e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 3996 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.6666717529297, + "epoch": 30.05223880597015, + "grad_norm": 0.0, + "learning_rate": 4.424704717168889e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3997 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.1666717529297, + "epoch": 30.059701492537314, + "grad_norm": 0.0, + "learning_rate": 4.4242890379178233e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3998 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.0277862548828, + "epoch": 30.067164179104477, + "grad_norm": 0.0, + "learning_rate": 4.423873228087127e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 3999 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.11111450195312, + "epoch": 30.074626865671643, + "grad_norm": 1.2804969570916511, + "learning_rate": 4.4234572877050167e-07, + "loss": 0.001, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4000 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.1666717529297, + "epoch": 30.082089552238806, + "grad_norm": 0.46837634255949473, + "learning_rate": 4.423041216799719e-07, + "loss": -0.0004, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 4001 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.05555725097656, + "epoch": 30.08955223880597, + "grad_norm": 0.5739607008718397, + "learning_rate": 4.422625015399467e-07, + "loss": 0.0, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4002 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.5833282470703, + "epoch": 30.097014925373134, + "grad_norm": 3.102978867026784, + "learning_rate": 4.422208683532503e-07, + "loss": 0.0, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4003 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.80555725097656, + "epoch": 30.104477611940297, + "grad_norm": 0.0, + "learning_rate": 4.4217922212270785e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4004 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.36111450195312, + "epoch": 30.111940298507463, + "grad_norm": 0.0, + "learning_rate": 4.4213756285114557e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4005 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.9166717529297, + "epoch": 30.119402985074625, + "grad_norm": 0.0, + "learning_rate": 4.420958905413903e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4006 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.5, + "epoch": 30.12686567164179, + "grad_norm": 0.0, + "learning_rate": 4.4205420519626984e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4007 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.97222900390625, + "epoch": 30.134328358208954, + "grad_norm": 0.0, + "learning_rate": 4.420125068186129e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4008 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.9166717529297, + "epoch": 30.14179104477612, + "grad_norm": 0.0, + "learning_rate": 4.4197079541124917e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4009 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.5277862548828, + "epoch": 30.149253731343283, + "grad_norm": 0.0, + "learning_rate": 4.4192907097700904e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4010 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.47222900390625, + "epoch": 30.15671641791045, + "grad_norm": 0.0, + "learning_rate": 4.4188733351872394e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4011 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.80555725097656, + "epoch": 30.16417910447761, + "grad_norm": 0.0, + "learning_rate": 4.41845583039226e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4012 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.0, + "epoch": 30.171641791044777, + "grad_norm": 0.0, + "learning_rate": 4.418038195413485e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4013 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.30555725097656, + "epoch": 30.17910447761194, + "grad_norm": 0.0, + "learning_rate": 4.4176204302792536e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4014 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.0, + "epoch": 30.186567164179106, + "grad_norm": 0.0, + "learning_rate": 4.417202535017915e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4015 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.9166717529297, + "epoch": 30.19402985074627, + "grad_norm": 0.8252497863725698, + "learning_rate": 4.4167845096578264e-07, + "loss": 0.0001, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4016 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.88888549804688, + "epoch": 30.20149253731343, + "grad_norm": 0.0, + "learning_rate": 4.4163663542273556e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4017 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.30555725097656, + "epoch": 30.208955223880597, + "grad_norm": 0.0, + "learning_rate": 4.415948068754878e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4018 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.1666717529297, + "epoch": 30.21641791044776, + "grad_norm": 0.0, + "learning_rate": 4.4155296532687763e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4019 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.80555725097656, + "epoch": 30.223880597014926, + "grad_norm": 0.0, + "learning_rate": 4.415111107797445e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4020 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.63888549804688, + "epoch": 30.23134328358209, + "grad_norm": 0.0, + "learning_rate": 4.4146924323692856e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4021 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.47222900390625, + "epoch": 30.238805970149254, + "grad_norm": 0.0, + "learning_rate": 4.414273627012709e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4022 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.6666717529297, + "epoch": 30.246268656716417, + "grad_norm": 0.0, + "learning_rate": 4.413854691756135e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4023 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.5277862548828, + "epoch": 30.253731343283583, + "grad_norm": 0.0, + "learning_rate": 4.4134356266279916e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4024 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.80555725097656, + "epoch": 30.261194029850746, + "grad_norm": 0.0, + "learning_rate": 4.4130164316567154e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4025 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.7777862548828, + "epoch": 30.26865671641791, + "grad_norm": 0.0, + "learning_rate": 4.4125971068707536e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4026 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.55555725097656, + "epoch": 30.276119402985074, + "grad_norm": 0.0, + "learning_rate": 4.41217765229856e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4027 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.47222900390625, + "epoch": 30.28358208955224, + "grad_norm": 0.0, + "learning_rate": 4.4117580679685997e-07, + "loss": 0.0, + "reward": 1.6666666269302368, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 1.0, + "step": 4028 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.3333282470703, + "epoch": 30.291044776119403, + "grad_norm": 1.0319931569571772, + "learning_rate": 4.4113383539093437e-07, + "loss": 0.0003, + "reward": 1.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4029 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.6666717529297, + "epoch": 30.298507462686565, + "grad_norm": 0.0, + "learning_rate": 4.410918510149273e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4030 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.86111450195312, + "epoch": 30.30597014925373, + "grad_norm": 0.8140956990485748, + "learning_rate": 4.4104985367168785e-07, + "loss": 0.0003, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 4031 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.47222900390625, + "epoch": 30.313432835820894, + "grad_norm": 0.0, + "learning_rate": 4.4100784336406594e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4032 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.88888549804688, + "epoch": 30.32089552238806, + "grad_norm": 0.0, + "learning_rate": 4.409658200949122e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4033 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.0277862548828, + "epoch": 30.328358208955223, + "grad_norm": 0.0, + "learning_rate": 4.4092378386707837e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4034 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.0833282470703, + "epoch": 30.33582089552239, + "grad_norm": 0.0, + "learning_rate": 4.408817346834169e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4035 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.05555725097656, + "epoch": 30.34328358208955, + "grad_norm": 0.0, + "learning_rate": 4.408396725467812e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4036 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.7777862548828, + "epoch": 30.350746268656717, + "grad_norm": 0.0, + "learning_rate": 4.407975974600257e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4037 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.25, + "epoch": 30.35820895522388, + "grad_norm": 2.0843114294931073, + "learning_rate": 4.4075550942600537e-07, + "loss": -0.0044, + "reward": 1.9444444179534912, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4038 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.72222900390625, + "epoch": 30.365671641791046, + "grad_norm": 0.42937947523011927, + "learning_rate": 4.407134084475763e-07, + "loss": 0.0002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4039 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.6666717529297, + "epoch": 30.37313432835821, + "grad_norm": 0.0, + "learning_rate": 4.4067129452759546e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4040 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.36111450195312, + "epoch": 30.380597014925375, + "grad_norm": 0.7697373621713238, + "learning_rate": 4.406291676689206e-07, + "loss": 0.0013, + "reward": 1.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4041 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.1666717529297, + "epoch": 30.388059701492537, + "grad_norm": 0.0, + "learning_rate": 4.405870278744104e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4042 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.22222900390625, + "epoch": 30.395522388059703, + "grad_norm": 0.7509666668924302, + "learning_rate": 4.405448751469244e-07, + "loss": 0.0001, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4043 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.7777862548828, + "epoch": 30.402985074626866, + "grad_norm": 0.0, + "learning_rate": 4.40502709489323e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4044 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.75, + "epoch": 30.41044776119403, + "grad_norm": 0.0, + "learning_rate": 4.4046053090446753e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4045 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.1666717529297, + "epoch": 30.417910447761194, + "grad_norm": 0.6447704154635626, + "learning_rate": 4.404183393952202e-07, + "loss": -0.0004, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4046 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.7777862548828, + "epoch": 30.425373134328357, + "grad_norm": 0.0, + "learning_rate": 4.403761349644441e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4047 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.30555725097656, + "epoch": 30.432835820895523, + "grad_norm": 0.0, + "learning_rate": 4.4033391761500304e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4048 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.63888549804688, + "epoch": 30.440298507462686, + "grad_norm": 0.0, + "learning_rate": 4.4029168734976194e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4049 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.11111450195312, + "epoch": 30.44776119402985, + "grad_norm": 0.0, + "learning_rate": 4.4024944417158636e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4050 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.38888549804688, + "epoch": 30.455223880597014, + "grad_norm": 0.0, + "learning_rate": 4.4020718808334313e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4051 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.5833282470703, + "epoch": 30.46268656716418, + "grad_norm": 0.0, + "learning_rate": 4.401649190878993e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4052 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.88888549804688, + "epoch": 30.470149253731343, + "grad_norm": 1.163257259605616, + "learning_rate": 4.401226371881235e-07, + "loss": 0.0031, + "reward": 1.75, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4053 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.80555725097656, + "epoch": 30.47761194029851, + "grad_norm": 2.1111618145807047, + "learning_rate": 4.4008034238688487e-07, + "loss": -0.0008, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 4054 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.0277862548828, + "epoch": 30.48507462686567, + "grad_norm": 0.9563322706622507, + "learning_rate": 4.4003803468705334e-07, + "loss": 0.0045, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4055 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.05555725097656, + "epoch": 30.492537313432837, + "grad_norm": 1.285748958867723, + "learning_rate": 4.3999571409150003e-07, + "loss": -0.0002, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 4056 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.25, + "epoch": 30.5, + "grad_norm": 0.0, + "learning_rate": 4.3995338060309667e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4057 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.30555725097656, + "epoch": 30.507462686567163, + "grad_norm": 0.5492154045342339, + "learning_rate": 4.3991103422471585e-07, + "loss": -0.0034, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4058 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.9166717529297, + "epoch": 30.51492537313433, + "grad_norm": 1.174703809421238, + "learning_rate": 4.3986867495923134e-07, + "loss": -0.0003, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 4059 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.9166717529297, + "epoch": 30.52238805970149, + "grad_norm": 0.7891828342148364, + "learning_rate": 4.398263028095175e-07, + "loss": -0.0008, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 4060 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.88888549804688, + "epoch": 30.529850746268657, + "grad_norm": 0.46869265363581525, + "learning_rate": 4.397839177784496e-07, + "loss": 0.0007, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4061 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.47222900390625, + "epoch": 30.53731343283582, + "grad_norm": 0.0, + "learning_rate": 4.397415198689038e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4062 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.94444274902344, + "epoch": 30.544776119402986, + "grad_norm": 0.4319680056491725, + "learning_rate": 4.396991090837573e-07, + "loss": -0.0006, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 4063 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.97222900390625, + "epoch": 30.55223880597015, + "grad_norm": 0.0, + "learning_rate": 4.3965668542588785e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4064 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.36111450195312, + "epoch": 30.559701492537314, + "grad_norm": 0.0, + "learning_rate": 4.396142488981745e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4065 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.8333282470703, + "epoch": 30.567164179104477, + "grad_norm": 0.2825147135898701, + "learning_rate": 4.395717995034968e-07, + "loss": 0.0, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4066 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.0, + "epoch": 30.574626865671643, + "grad_norm": 0.0, + "learning_rate": 4.3952933724473524e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4067 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.86111450195312, + "epoch": 30.582089552238806, + "grad_norm": 0.0, + "learning_rate": 4.3948686212477137e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4068 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.86111450195312, + "epoch": 30.58955223880597, + "grad_norm": 0.0, + "learning_rate": 4.394443741464874e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4069 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.30555725097656, + "epoch": 30.597014925373134, + "grad_norm": 0.0, + "learning_rate": 4.3940187331276665e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4070 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.22222900390625, + "epoch": 30.604477611940297, + "grad_norm": 0.0, + "learning_rate": 4.3935935962649303e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4071 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.97222900390625, + "epoch": 30.611940298507463, + "grad_norm": 0.0, + "learning_rate": 4.393168330905515e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4072 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.7777862548828, + "epoch": 30.619402985074625, + "grad_norm": 0.0, + "learning_rate": 4.3927429370782784e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4073 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.94444274902344, + "epoch": 30.62686567164179, + "grad_norm": 0.0, + "learning_rate": 4.392317414812088e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4074 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.22222900390625, + "epoch": 30.634328358208954, + "grad_norm": 0.0, + "learning_rate": 4.3918917641358176e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4075 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.9166717529297, + "epoch": 30.64179104477612, + "grad_norm": 0.0, + "learning_rate": 4.3914659850783533e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4076 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.25, + "epoch": 30.649253731343283, + "grad_norm": 0.0, + "learning_rate": 4.391040077668586e-07, + "loss": 0.0, + "reward": 1.6666666269302368, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 1.0, + "step": 4077 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.63888549804688, + "epoch": 30.65671641791045, + "grad_norm": 1.488711772723827, + "learning_rate": 4.390614041935418e-07, + "loss": -0.0235, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 4078 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.4166717529297, + "epoch": 30.66417910447761, + "grad_norm": 0.0, + "learning_rate": 4.3901878779077595e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4079 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.61111450195312, + "epoch": 30.671641791044777, + "grad_norm": 0.0, + "learning_rate": 4.38976158561453e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4080 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.5833282470703, + "epoch": 30.67910447761194, + "grad_norm": 0.0, + "learning_rate": 4.3893351650846554e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4081 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.63888549804688, + "epoch": 30.686567164179106, + "grad_norm": 0.0, + "learning_rate": 4.388908616347074e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4082 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.8333282470703, + "epoch": 30.69402985074627, + "grad_norm": 0.0, + "learning_rate": 4.3884819394307293e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4083 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.55555725097656, + "epoch": 30.701492537313435, + "grad_norm": 0.7044010477250383, + "learning_rate": 4.3880551343645763e-07, + "loss": -0.0011, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4084 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.5, + "epoch": 30.708955223880597, + "grad_norm": 0.0, + "learning_rate": 4.3876282011775765e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4085 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.47222900390625, + "epoch": 30.71641791044776, + "grad_norm": 0.0, + "learning_rate": 4.3872011398987006e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4086 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.75, + "epoch": 30.723880597014926, + "grad_norm": 0.0, + "learning_rate": 4.3867739505569303e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4087 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.5277862548828, + "epoch": 30.73134328358209, + "grad_norm": 0.0, + "learning_rate": 4.3863466331812517e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4088 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.5833282470703, + "epoch": 30.738805970149254, + "grad_norm": 0.0, + "learning_rate": 4.3859191878006634e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4089 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.3333282470703, + "epoch": 30.746268656716417, + "grad_norm": 0.0, + "learning_rate": 4.385491614444171e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4090 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.11111450195312, + "epoch": 30.753731343283583, + "grad_norm": 0.701555161179819, + "learning_rate": 4.38506391314079e-07, + "loss": -0.0034, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 4091 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.47222900390625, + "epoch": 30.761194029850746, + "grad_norm": 3.107756197779335, + "learning_rate": 4.3846360839195405e-07, + "loss": -0.0026, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 4092 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.75, + "epoch": 30.76865671641791, + "grad_norm": 0.0, + "learning_rate": 4.384208126809458e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4093 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.47222900390625, + "epoch": 30.776119402985074, + "grad_norm": 0.0, + "learning_rate": 4.3837800418395814e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4094 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.7777862548828, + "epoch": 30.78358208955224, + "grad_norm": 0.0, + "learning_rate": 4.3833518290389605e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4095 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.7777862548828, + "epoch": 30.791044776119403, + "grad_norm": 0.0, + "learning_rate": 4.3829234884366523e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4096 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.86111450195312, + "epoch": 30.798507462686565, + "grad_norm": 2.0772295819797417, + "learning_rate": 4.382495020061724e-07, + "loss": -0.0025, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 4097 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.80555725097656, + "epoch": 30.80597014925373, + "grad_norm": 0.0, + "learning_rate": 4.382066423943251e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4098 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.38888549804688, + "epoch": 30.813432835820894, + "grad_norm": 0.0, + "learning_rate": 4.381637700110318e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4099 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.94444274902344, + "epoch": 30.82089552238806, + "grad_norm": 0.0, + "learning_rate": 4.381208848592016e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4100 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.86111450195312, + "epoch": 30.828358208955223, + "grad_norm": 0.0, + "learning_rate": 4.3807798694174475e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4101 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.0, + "epoch": 30.83582089552239, + "grad_norm": 0.8980041406825675, + "learning_rate": 4.380350762615722e-07, + "loss": -0.0003, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 4102 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.3888931274414, + "epoch": 30.84328358208955, + "grad_norm": 0.0, + "learning_rate": 4.3799215282159574e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4103 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.36111450195312, + "epoch": 30.850746268656717, + "grad_norm": 0.7369677519228731, + "learning_rate": 4.3794921662472824e-07, + "loss": 0.0013, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4104 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.30555725097656, + "epoch": 30.85820895522388, + "grad_norm": 0.813370741748878, + "learning_rate": 4.3790626767388315e-07, + "loss": 0.0007, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 4105 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.55555725097656, + "epoch": 30.865671641791046, + "grad_norm": 0.0, + "learning_rate": 4.378633059719751e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4106 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.61111450195312, + "epoch": 30.87313432835821, + "grad_norm": 0.0, + "learning_rate": 4.378203315219192e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4107 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.5277862548828, + "epoch": 30.880597014925375, + "grad_norm": 0.5139284397591656, + "learning_rate": 4.3777734432663183e-07, + "loss": -0.0067, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4108 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.5833282470703, + "epoch": 30.888059701492537, + "grad_norm": 0.6939118912543591, + "learning_rate": 4.377343443890299e-07, + "loss": 0.0006, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4109 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.5833282470703, + "epoch": 30.895522388059703, + "grad_norm": 0.0, + "learning_rate": 4.3769133171203136e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4110 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.30555725097656, + "epoch": 30.902985074626866, + "grad_norm": 0.0, + "learning_rate": 4.376483062985551e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4111 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.72222900390625, + "epoch": 30.91044776119403, + "grad_norm": 0.0, + "learning_rate": 4.3760526815152064e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4112 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.88888549804688, + "epoch": 30.917910447761194, + "grad_norm": 0.0, + "learning_rate": 4.3756221727384856e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4113 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.5, + "epoch": 30.925373134328357, + "grad_norm": 0.0, + "learning_rate": 4.3751915366846015e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4114 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.63888549804688, + "epoch": 30.932835820895523, + "grad_norm": 0.9479311161582852, + "learning_rate": 4.3747607733827773e-07, + "loss": -0.0006, + "reward": 1.9444444179534912, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4115 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.0277862548828, + "epoch": 30.940298507462686, + "grad_norm": 0.0, + "learning_rate": 4.374329882862244e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4116 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.6666717529297, + "epoch": 30.94776119402985, + "grad_norm": 0.0, + "learning_rate": 4.3738988651522403e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4117 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.36111450195312, + "epoch": 30.955223880597014, + "grad_norm": 1.3151205703386966, + "learning_rate": 4.3734677202820156e-07, + "loss": -0.0335, + "reward": 1.9166666269302368, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4118 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.61111450195312, + "epoch": 30.96268656716418, + "grad_norm": 0.0, + "learning_rate": 4.3730364482808266e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4119 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.86111450195312, + "epoch": 30.970149253731343, + "grad_norm": 0.7818766270143247, + "learning_rate": 4.372605049177939e-07, + "loss": 0.0023, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4120 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.5, + "epoch": 30.97761194029851, + "grad_norm": 0.0, + "learning_rate": 4.3721735230026256e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4121 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.88888549804688, + "epoch": 30.98507462686567, + "grad_norm": 0.0, + "learning_rate": 4.37174186978417e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4122 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.83334350585938, + "epoch": 30.992537313432837, + "grad_norm": 2.368405487684066, + "learning_rate": 4.371310089551865e-07, + "loss": -0.0307, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4123 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.8333282470703, + "epoch": 31.007462686567163, + "grad_norm": 0.0, + "learning_rate": 4.370878182335008e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4124 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.80555725097656, + "epoch": 31.01492537313433, + "grad_norm": 0.0, + "learning_rate": 4.3704461481629095e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4125 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.19444274902344, + "epoch": 31.02238805970149, + "grad_norm": 0.0, + "learning_rate": 4.370013987064887e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4126 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.69444274902344, + "epoch": 31.029850746268657, + "grad_norm": 0.0, + "learning_rate": 4.3695816990702643e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4127 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.25, + "epoch": 31.03731343283582, + "grad_norm": 0.0, + "learning_rate": 4.369149284208378e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4128 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.5, + "epoch": 31.044776119402986, + "grad_norm": 0.0, + "learning_rate": 4.36871674250857e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4129 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.0, + "epoch": 31.05223880597015, + "grad_norm": 0.0, + "learning_rate": 4.3682840740001927e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4130 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.80555725097656, + "epoch": 31.059701492537314, + "grad_norm": 0.0, + "learning_rate": 4.3678512787126057e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4131 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.97222900390625, + "epoch": 31.067164179104477, + "grad_norm": 0.0, + "learning_rate": 4.3674183566751783e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4132 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.8333282470703, + "epoch": 31.074626865671643, + "grad_norm": 0.0, + "learning_rate": 4.366985307917288e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4133 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.1666717529297, + "epoch": 31.082089552238806, + "grad_norm": 0.0, + "learning_rate": 4.3665521324683206e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4134 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.11111450195312, + "epoch": 31.08955223880597, + "grad_norm": 0.0, + "learning_rate": 4.366118830357671e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4135 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.0833282470703, + "epoch": 31.097014925373134, + "grad_norm": 0.0, + "learning_rate": 4.3656854016147433e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4136 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.30555725097656, + "epoch": 31.104477611940297, + "grad_norm": 0.0, + "learning_rate": 4.365251846268948e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4137 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.47222900390625, + "epoch": 31.111940298507463, + "grad_norm": 0.0, + "learning_rate": 4.364818164349706e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4138 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.7777862548828, + "epoch": 31.119402985074625, + "grad_norm": 0.0, + "learning_rate": 4.364384355886447e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4139 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.19444274902344, + "epoch": 31.12686567164179, + "grad_norm": 0.5556425783471036, + "learning_rate": 4.363950420908608e-07, + "loss": 0.0032, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4140 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.72222900390625, + "epoch": 31.134328358208954, + "grad_norm": 0.0, + "learning_rate": 4.3635163594456346e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4141 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.7777862548828, + "epoch": 31.14179104477612, + "grad_norm": 0.4083510152900414, + "learning_rate": 4.363082171526984e-07, + "loss": 0.0034, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4142 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.38888549804688, + "epoch": 31.149253731343283, + "grad_norm": 0.0, + "learning_rate": 4.362647857182117e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4143 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.1666717529297, + "epoch": 31.15671641791045, + "grad_norm": 0.0, + "learning_rate": 4.362213416440507e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4144 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.0, + "epoch": 31.16417910447761, + "grad_norm": 0.0, + "learning_rate": 4.361778849331634e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4145 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.8333282470703, + "epoch": 31.171641791044777, + "grad_norm": 0.0, + "learning_rate": 4.3613441558849875e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4146 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.88888549804688, + "epoch": 31.17910447761194, + "grad_norm": 0.0, + "learning_rate": 4.3609093361300646e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4147 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.5277862548828, + "epoch": 31.186567164179106, + "grad_norm": 0.0, + "learning_rate": 4.3604743900963724e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4148 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.97222900390625, + "epoch": 31.19402985074627, + "grad_norm": 0.7739707411673217, + "learning_rate": 4.3600393178134253e-07, + "loss": -0.0063, + "reward": 1.9166666269302368, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4149 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.1666717529297, + "epoch": 31.20149253731343, + "grad_norm": 0.6695217252105204, + "learning_rate": 4.359604119310747e-07, + "loss": -0.0002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4150 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.69444274902344, + "epoch": 31.208955223880597, + "grad_norm": 0.0, + "learning_rate": 4.3591687946178686e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4151 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.72222900390625, + "epoch": 31.21641791044776, + "grad_norm": 0.0, + "learning_rate": 4.3587333437643316e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4152 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.55555725097656, + "epoch": 31.223880597014926, + "grad_norm": 0.0, + "learning_rate": 4.3582977667796844e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4153 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.7777862548828, + "epoch": 31.23134328358209, + "grad_norm": 0.0, + "learning_rate": 4.357862063693485e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4154 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.4166717529297, + "epoch": 31.238805970149254, + "grad_norm": 0.0, + "learning_rate": 4.357426234535301e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4155 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.0, + "epoch": 31.246268656716417, + "grad_norm": 0.0, + "learning_rate": 4.356990279334704e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4156 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.5277862548828, + "epoch": 31.253731343283583, + "grad_norm": 0.0, + "learning_rate": 4.3565541981212807e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4157 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.88888549804688, + "epoch": 31.261194029850746, + "grad_norm": 0.0, + "learning_rate": 4.3561179909246205e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4158 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.0833282470703, + "epoch": 31.26865671641791, + "grad_norm": 0.5606198048760258, + "learning_rate": 4.3556816577743245e-07, + "loss": 0.0007, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4159 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.30555725097656, + "epoch": 31.276119402985074, + "grad_norm": 0.908174307196965, + "learning_rate": 4.3552451987000025e-07, + "loss": 0.001, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4160 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.1666717529297, + "epoch": 31.28358208955224, + "grad_norm": 0.38055974237934304, + "learning_rate": 4.354808613731271e-07, + "loss": -0.0006, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4161 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.7777862548828, + "epoch": 31.291044776119403, + "grad_norm": 0.0, + "learning_rate": 4.3543719028977565e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4162 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.75, + "epoch": 31.298507462686565, + "grad_norm": 7.543973178088745, + "learning_rate": 4.353935066229094e-07, + "loss": 0.0, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4163 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.8333282470703, + "epoch": 31.30597014925373, + "grad_norm": 0.0, + "learning_rate": 4.353498103754926e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4164 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.2777862548828, + "epoch": 31.313432835820894, + "grad_norm": 0.0, + "learning_rate": 4.3530610155049046e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4165 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.8333282470703, + "epoch": 31.32089552238806, + "grad_norm": 0.0, + "learning_rate": 4.3526238015086895e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4166 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.7777862548828, + "epoch": 31.328358208955223, + "grad_norm": 0.0, + "learning_rate": 4.3521864617959503e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4167 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.22222900390625, + "epoch": 31.33582089552239, + "grad_norm": 0.0, + "learning_rate": 4.3517489963963633e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4168 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.80555725097656, + "epoch": 31.34328358208955, + "grad_norm": 1.5384442055228662, + "learning_rate": 4.351311405339615e-07, + "loss": -0.0381, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4169 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.30555725097656, + "epoch": 31.350746268656717, + "grad_norm": 0.0, + "learning_rate": 4.350873688655399e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4170 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.63888549804688, + "epoch": 31.35820895522388, + "grad_norm": 0.0, + "learning_rate": 4.3504358463734193e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4171 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.0833282470703, + "epoch": 31.365671641791046, + "grad_norm": 0.0, + "learning_rate": 4.3499978785233864e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4172 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.63888549804688, + "epoch": 31.37313432835821, + "grad_norm": 0.0, + "learning_rate": 4.34955978513502e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4173 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.0833282470703, + "epoch": 31.380597014925375, + "grad_norm": 0.9744279291001218, + "learning_rate": 4.34912156623805e-07, + "loss": 0.0157, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 4174 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.44444274902344, + "epoch": 31.388059701492537, + "grad_norm": 2.953195022612593, + "learning_rate": 4.348683221862212e-07, + "loss": 0.0005, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 4175 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.88888549804688, + "epoch": 31.395522388059703, + "grad_norm": 0.0, + "learning_rate": 4.34824475203725e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4176 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.11111450195312, + "epoch": 31.402985074626866, + "grad_norm": 2.832561652823272, + "learning_rate": 4.3478061567929214e-07, + "loss": -0.0293, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4177 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.80555725097656, + "epoch": 31.41044776119403, + "grad_norm": 0.0, + "learning_rate": 4.3473674361589865e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4178 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.0277862548828, + "epoch": 31.417910447761194, + "grad_norm": 0.0, + "learning_rate": 4.3469285901652173e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4179 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.36111450195312, + "epoch": 31.425373134328357, + "grad_norm": 0.0, + "learning_rate": 4.346489618841392e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4180 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.1666717529297, + "epoch": 31.432835820895523, + "grad_norm": 0.0, + "learning_rate": 4.3460505222172993e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4181 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.0, + "epoch": 31.440298507462686, + "grad_norm": 0.0, + "learning_rate": 4.3456113003227367e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4182 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.80555725097656, + "epoch": 31.44776119402985, + "grad_norm": 0.0, + "learning_rate": 4.3451719531875077e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4183 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.25, + "epoch": 31.455223880597014, + "grad_norm": 0.0, + "learning_rate": 4.344732480841426e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4184 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.13888549804688, + "epoch": 31.46268656716418, + "grad_norm": 0.0, + "learning_rate": 4.344292883314314e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4185 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.5833282470703, + "epoch": 31.470149253731343, + "grad_norm": 0.0, + "learning_rate": 4.3438531606360017e-07, + "loss": 0.0, + "reward": 1.6666666269302368, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 1.0, + "step": 4186 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.38888549804688, + "epoch": 31.47761194029851, + "grad_norm": 0.0, + "learning_rate": 4.343413312836329e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4187 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.30555725097656, + "epoch": 31.48507462686567, + "grad_norm": 0.0, + "learning_rate": 4.3429733399451433e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4188 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.55555725097656, + "epoch": 31.492537313432837, + "grad_norm": 0.0, + "learning_rate": 4.3425332419923e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4189 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.9166717529297, + "epoch": 31.5, + "grad_norm": 0.0, + "learning_rate": 4.342093019007663e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4190 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.5833282470703, + "epoch": 31.507462686567163, + "grad_norm": 1.0130188343950663, + "learning_rate": 4.341652671021107e-07, + "loss": -0.0024, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4191 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.1666717529297, + "epoch": 31.51492537313433, + "grad_norm": 0.0, + "learning_rate": 4.3412121980625117e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4192 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.9166717529297, + "epoch": 31.52238805970149, + "grad_norm": 0.0, + "learning_rate": 4.340771600161768e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4193 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.11111450195312, + "epoch": 31.529850746268657, + "grad_norm": 0.0, + "learning_rate": 4.3403308773487727e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4194 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.47222900390625, + "epoch": 31.53731343283582, + "grad_norm": 0.0, + "learning_rate": 4.3398900296534346e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4195 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.55555725097656, + "epoch": 31.544776119402986, + "grad_norm": 0.7381612083214457, + "learning_rate": 4.3394490571056687e-07, + "loss": 0.0012, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4196 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.69444274902344, + "epoch": 31.55223880597015, + "grad_norm": 0.0, + "learning_rate": 4.3390079597353977e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4197 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.13888549804688, + "epoch": 31.559701492537314, + "grad_norm": 0.0, + "learning_rate": 4.338566737572555e-07, + "loss": 0.0, + "reward": 1.6666666269302368, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 1.0, + "step": 4198 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.13888549804688, + "epoch": 31.567164179104477, + "grad_norm": 0.0, + "learning_rate": 4.3381253906470807e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4199 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.75, + "epoch": 31.574626865671643, + "grad_norm": 0.0, + "learning_rate": 4.3376839189889237e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4200 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.41666412353516, + "epoch": 31.582089552238806, + "grad_norm": 0.0, + "learning_rate": 4.337242322628043e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4201 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.7777862548828, + "epoch": 31.58955223880597, + "grad_norm": 1.3711194033266811, + "learning_rate": 4.3368006015944036e-07, + "loss": -0.0021, + "reward": 1.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4202 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.7777862548828, + "epoch": 31.597014925373134, + "grad_norm": 0.0, + "learning_rate": 4.3363587559179797e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4203 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.0277862548828, + "epoch": 31.604477611940297, + "grad_norm": 0.0, + "learning_rate": 4.3359167856287553e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4204 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.44444274902344, + "epoch": 31.611940298507463, + "grad_norm": 0.0, + "learning_rate": 4.3354746907567217e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4205 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.63888549804688, + "epoch": 31.619402985074625, + "grad_norm": 0.0, + "learning_rate": 4.3350324713318787e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4206 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.3333282470703, + "epoch": 31.62686567164179, + "grad_norm": 0.7430217420403289, + "learning_rate": 4.334590127384235e-07, + "loss": 0.0029, + "reward": 1.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4207 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.63888549804688, + "epoch": 31.634328358208954, + "grad_norm": 0.0, + "learning_rate": 4.334147658943807e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4208 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.25, + "epoch": 31.64179104477612, + "grad_norm": 0.0, + "learning_rate": 4.33370506604062e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4209 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.97222900390625, + "epoch": 31.649253731343283, + "grad_norm": 0.0, + "learning_rate": 4.333262348704708e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4210 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.36111450195312, + "epoch": 31.65671641791045, + "grad_norm": 0.0, + "learning_rate": 4.3328195069661134e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4211 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.47222900390625, + "epoch": 31.66417910447761, + "grad_norm": 0.0, + "learning_rate": 4.3323765408548865e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4212 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.63888549804688, + "epoch": 31.671641791044777, + "grad_norm": 0.8632045738400486, + "learning_rate": 4.331933450401087e-07, + "loss": 0.0065, + "reward": 1.7222222089767456, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 1.0, + "step": 4213 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.88888549804688, + "epoch": 31.67910447761194, + "grad_norm": 0.0, + "learning_rate": 4.3314902356347807e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4214 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.72222900390625, + "epoch": 31.686567164179106, + "grad_norm": 0.0, + "learning_rate": 4.331046896586046e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4215 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.2777862548828, + "epoch": 31.69402985074627, + "grad_norm": 0.7039816974708721, + "learning_rate": 4.330603433284965e-07, + "loss": 0.0026, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 4216 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.8333282470703, + "epoch": 31.701492537313435, + "grad_norm": 0.0, + "learning_rate": 4.330159845761632e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4217 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.25, + "epoch": 31.708955223880597, + "grad_norm": 0.0, + "learning_rate": 4.3297161340461474e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4218 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.8333282470703, + "epoch": 31.71641791044776, + "grad_norm": 0.0, + "learning_rate": 4.329272298168621e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4219 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.9166717529297, + "epoch": 31.723880597014926, + "grad_norm": 0.941468079176926, + "learning_rate": 4.3288283381591725e-07, + "loss": -0.0007, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4220 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.0833282470703, + "epoch": 31.73134328358209, + "grad_norm": 1.1955299213434003, + "learning_rate": 4.328384254047926e-07, + "loss": -0.0002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4221 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.36111450195312, + "epoch": 31.738805970149254, + "grad_norm": 1.6956327468033283, + "learning_rate": 4.327940045865018e-07, + "loss": -0.0002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4222 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.0277862548828, + "epoch": 31.746268656716417, + "grad_norm": 1.8871891573494004, + "learning_rate": 4.327495713640591e-07, + "loss": 0.0, + "reward": 1.9444444179534912, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4223 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.30555725097656, + "epoch": 31.753731343283583, + "grad_norm": 0.0, + "learning_rate": 4.3270512574047966e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4224 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.61111450195312, + "epoch": 31.761194029850746, + "grad_norm": 0.0, + "learning_rate": 4.3266066771877966e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4225 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.5, + "epoch": 31.76865671641791, + "grad_norm": 0.0, + "learning_rate": 4.3261619730197586e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4226 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.47222900390625, + "epoch": 31.776119402985074, + "grad_norm": 1.312748898912208, + "learning_rate": 4.3257171449308587e-07, + "loss": 0.001, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4227 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.80555725097656, + "epoch": 31.78358208955224, + "grad_norm": 0.8130356592697384, + "learning_rate": 4.3252721929512835e-07, + "loss": -0.0005, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 4228 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.05555725097656, + "epoch": 31.791044776119403, + "grad_norm": 0.0, + "learning_rate": 4.324827117111227e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4229 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.5833282470703, + "epoch": 31.798507462686565, + "grad_norm": 0.0, + "learning_rate": 4.3243819174408904e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4230 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.9166717529297, + "epoch": 31.80597014925373, + "grad_norm": 0.0, + "learning_rate": 4.323936593970485e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4231 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.94444274902344, + "epoch": 31.813432835820894, + "grad_norm": 0.0, + "learning_rate": 4.32349114673023e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4232 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.0, + "epoch": 31.82089552238806, + "grad_norm": 0.0, + "learning_rate": 4.3230455757503525e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4233 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.22222900390625, + "epoch": 31.828358208955223, + "grad_norm": 0.0, + "learning_rate": 4.322599881061088e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4234 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.3333282470703, + "epoch": 31.83582089552239, + "grad_norm": 0.0, + "learning_rate": 4.322154062692682e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4235 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.05555725097656, + "epoch": 31.84328358208955, + "grad_norm": 0.0, + "learning_rate": 4.321708120675385e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4236 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.0833282470703, + "epoch": 31.850746268656717, + "grad_norm": 0.0, + "learning_rate": 4.3212620550394605e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4237 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.61111450195312, + "epoch": 31.85820895522388, + "grad_norm": 0.0, + "learning_rate": 4.3208158658151763e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4238 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.72222900390625, + "epoch": 31.865671641791046, + "grad_norm": 3.308342417518474, + "learning_rate": 4.320369553032811e-07, + "loss": 0.0003, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4239 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.05555725097656, + "epoch": 31.87313432835821, + "grad_norm": 0.0, + "learning_rate": 4.31992311672265e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4240 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.0, + "epoch": 31.880597014925375, + "grad_norm": 0.0, + "learning_rate": 4.319476556914988e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4241 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.9166717529297, + "epoch": 31.888059701492537, + "grad_norm": 0.0, + "learning_rate": 4.3190298736401286e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4242 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.44444274902344, + "epoch": 31.895522388059703, + "grad_norm": 0.0, + "learning_rate": 4.318583066928383e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4243 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.1666717529297, + "epoch": 31.902985074626866, + "grad_norm": 0.0, + "learning_rate": 4.31813613681007e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4244 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.11111450195312, + "epoch": 31.91044776119403, + "grad_norm": 0.0, + "learning_rate": 4.3176890833155185e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4245 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.5833282470703, + "epoch": 31.917910447761194, + "grad_norm": 0.0, + "learning_rate": 4.317241906475064e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4246 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.72222900390625, + "epoch": 31.925373134328357, + "grad_norm": 0.0, + "learning_rate": 4.316794606319053e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4247 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.72222900390625, + "epoch": 31.932835820895523, + "grad_norm": 0.0, + "learning_rate": 4.3163471828778375e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4248 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.0, + "epoch": 31.940298507462686, + "grad_norm": 0.0, + "learning_rate": 4.315899636181779e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4249 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.97222900390625, + "epoch": 31.94776119402985, + "grad_norm": 0.8979512554802311, + "learning_rate": 4.3154519662612476e-07, + "loss": -0.0305, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4250 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.6666717529297, + "epoch": 31.955223880597014, + "grad_norm": 1.7661136378801976, + "learning_rate": 4.315004173146622e-07, + "loss": -0.0139, + "reward": 1.9444444179534912, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4251 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.77777862548828, + "epoch": 31.96268656716418, + "grad_norm": 0.0, + "learning_rate": 4.3145562568682883e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4252 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.36111450195312, + "epoch": 31.970149253731343, + "grad_norm": 0.0, + "learning_rate": 4.314108217456641e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4253 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.11111450195312, + "epoch": 31.97761194029851, + "grad_norm": 0.0, + "learning_rate": 4.313660054942085e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4254 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.05555725097656, + "epoch": 31.98507462686567, + "grad_norm": 0.0, + "learning_rate": 4.3132117693550306e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4255 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.4166717529297, + "epoch": 31.992537313432837, + "grad_norm": 0.0, + "learning_rate": 4.3127633607258985e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4256 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.75, + "epoch": 32.007462686567166, + "grad_norm": 0.9971856703332548, + "learning_rate": 4.3123148290851166e-07, + "loss": 0.0011, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4257 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.22222900390625, + "epoch": 32.014925373134325, + "grad_norm": 0.0, + "learning_rate": 4.3118661744631224e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4258 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.30555725097656, + "epoch": 32.02238805970149, + "grad_norm": 0.7235207464426999, + "learning_rate": 4.31141739689036e-07, + "loss": -0.0039, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4259 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.0277862548828, + "epoch": 32.02985074626866, + "grad_norm": 0.6257166774916624, + "learning_rate": 4.3109684963972835e-07, + "loss": -0.0, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4260 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.5277862548828, + "epoch": 32.03731343283582, + "grad_norm": 0.0, + "learning_rate": 4.310519473014356e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4261 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.8333282470703, + "epoch": 32.04477611940298, + "grad_norm": 0.0, + "learning_rate": 4.3100703267720446e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4262 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.88888549804688, + "epoch": 32.05223880597015, + "grad_norm": 0.0, + "learning_rate": 4.3096210577008297e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4263 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.61111450195312, + "epoch": 32.059701492537314, + "grad_norm": 0.0, + "learning_rate": 4.3091716658311976e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4264 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.25, + "epoch": 32.06716417910448, + "grad_norm": 0.0, + "learning_rate": 4.3087221511936434e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4265 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.86111450195312, + "epoch": 32.07462686567164, + "grad_norm": 0.0, + "learning_rate": 4.3082725138186716e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4266 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.7777862548828, + "epoch": 32.082089552238806, + "grad_norm": 2.616788515393949, + "learning_rate": 4.3078227537367916e-07, + "loss": -0.0013, + "reward": 1.9166666269302368, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4267 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.7777862548828, + "epoch": 32.08955223880597, + "grad_norm": 0.0, + "learning_rate": 4.3073728709785264e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4268 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.4166717529297, + "epoch": 32.09701492537314, + "grad_norm": 0.0, + "learning_rate": 4.3069228655744025e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4269 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.88888549804688, + "epoch": 32.1044776119403, + "grad_norm": 0.0, + "learning_rate": 4.306472737554957e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4270 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.5277862548828, + "epoch": 32.11194029850746, + "grad_norm": 0.0, + "learning_rate": 4.3060224869507355e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4271 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.5, + "epoch": 32.11940298507463, + "grad_norm": 0.7760857783889168, + "learning_rate": 4.3055721137922904e-07, + "loss": 0.0002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4272 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.9166717529297, + "epoch": 32.12686567164179, + "grad_norm": 0.0, + "learning_rate": 4.3051216181101844e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4273 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.88888549804688, + "epoch": 32.134328358208954, + "grad_norm": 0.0, + "learning_rate": 4.3046709999349874e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4274 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.25, + "epoch": 32.14179104477612, + "grad_norm": 0.0, + "learning_rate": 4.304220259297277e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4275 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.05555725097656, + "epoch": 32.149253731343286, + "grad_norm": 0.0, + "learning_rate": 4.3037693962276407e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4276 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.86111450195312, + "epoch": 32.156716417910445, + "grad_norm": 0.0, + "learning_rate": 4.303318410756673e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4277 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.5833282470703, + "epoch": 32.16417910447761, + "grad_norm": 0.5436561925863027, + "learning_rate": 4.3028673029149773e-07, + "loss": 0.0002, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4278 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.2777862548828, + "epoch": 32.17164179104478, + "grad_norm": 2.2194098800756503, + "learning_rate": 4.302416072733165e-07, + "loss": 0.0019, + "reward": 1.75, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4279 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.55555725097656, + "epoch": 32.17910447761194, + "grad_norm": 0.0, + "learning_rate": 4.3019647202418564e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4280 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.5277862548828, + "epoch": 32.1865671641791, + "grad_norm": 1.4082887036928968, + "learning_rate": 4.3015132454716783e-07, + "loss": -0.0078, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 4281 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.88888549804688, + "epoch": 32.19402985074627, + "grad_norm": 1.2245140912864567, + "learning_rate": 4.301061648453269e-07, + "loss": -0.0071, + "reward": 1.8333333730697632, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 4282 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.75, + "epoch": 32.201492537313435, + "grad_norm": 0.0, + "learning_rate": 4.3006099292172725e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4283 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.3333282470703, + "epoch": 32.208955223880594, + "grad_norm": 0.0, + "learning_rate": 4.300158087794342e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4284 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.6666717529297, + "epoch": 32.21641791044776, + "grad_norm": 0.506228364231538, + "learning_rate": 4.299706124215138e-07, + "loss": 0.0009, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4285 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.47222900390625, + "epoch": 32.223880597014926, + "grad_norm": 0.0, + "learning_rate": 4.2992540385103315e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4286 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.63888549804688, + "epoch": 32.23134328358209, + "grad_norm": 0.0, + "learning_rate": 4.298801830710599e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4287 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.38888549804688, + "epoch": 32.23880597014925, + "grad_norm": 0.0, + "learning_rate": 4.2983495008466273e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4288 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.11111450195312, + "epoch": 32.24626865671642, + "grad_norm": 1.571088822126878, + "learning_rate": 4.297897048949112e-07, + "loss": 0.001, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 4289 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.38888549804688, + "epoch": 32.25373134328358, + "grad_norm": 0.0, + "learning_rate": 4.2974444750487544e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4290 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.72222900390625, + "epoch": 32.26119402985075, + "grad_norm": 0.0, + "learning_rate": 4.2969917791762655e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4291 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.4166717529297, + "epoch": 32.26865671641791, + "grad_norm": 0.0, + "learning_rate": 4.296538961362366e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4292 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.80555725097656, + "epoch": 32.276119402985074, + "grad_norm": 0.0, + "learning_rate": 4.296086021637782e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4293 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.25, + "epoch": 32.28358208955224, + "grad_norm": 0.0, + "learning_rate": 4.2956329600332496e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4294 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.8333282470703, + "epoch": 32.291044776119406, + "grad_norm": 0.8298888866882652, + "learning_rate": 4.2951797765795144e-07, + "loss": 0.0003, + "reward": 1.9444444179534912, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4295 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.25, + "epoch": 32.298507462686565, + "grad_norm": 0.0, + "learning_rate": 4.294726471307327e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4296 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.8333282470703, + "epoch": 32.30597014925373, + "grad_norm": 0.0, + "learning_rate": 4.2942730442474495e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4297 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.0, + "epoch": 32.3134328358209, + "grad_norm": 0.0, + "learning_rate": 4.2938194954306493e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4298 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.47222900390625, + "epoch": 32.32089552238806, + "grad_norm": 0.0, + "learning_rate": 4.2933658248877056e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4299 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.80555725097656, + "epoch": 32.32835820895522, + "grad_norm": 0.0, + "learning_rate": 4.2929120326494023e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4300 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.80555725097656, + "epoch": 32.33582089552239, + "grad_norm": 0.0, + "learning_rate": 4.292458118746534e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4301 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.75, + "epoch": 32.343283582089555, + "grad_norm": 0.0, + "learning_rate": 4.292004083209901e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4302 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.9166717529297, + "epoch": 32.350746268656714, + "grad_norm": 0.825305355432171, + "learning_rate": 4.2915499260703157e-07, + "loss": -0.0004, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4303 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.8333282470703, + "epoch": 32.35820895522388, + "grad_norm": 1.6392030260486792, + "learning_rate": 4.291095647358595e-07, + "loss": -0.0, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4304 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.55555725097656, + "epoch": 32.365671641791046, + "grad_norm": 0.5860054021099618, + "learning_rate": 4.290641247105567e-07, + "loss": 0.0006, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4305 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.61111450195312, + "epoch": 32.37313432835821, + "grad_norm": 1.0091778023627367, + "learning_rate": 4.2901867253420654e-07, + "loss": -0.0012, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4306 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.0833282470703, + "epoch": 32.38059701492537, + "grad_norm": 0.0, + "learning_rate": 4.289732082098935e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4307 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.86111450195312, + "epoch": 32.38805970149254, + "grad_norm": 0.0, + "learning_rate": 4.289277317407025e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4308 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.7777862548828, + "epoch": 32.3955223880597, + "grad_norm": 0.0, + "learning_rate": 4.288822431297197e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4309 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.25, + "epoch": 32.40298507462686, + "grad_norm": 0.645189733781749, + "learning_rate": 4.288367423800319e-07, + "loss": -0.0215, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4310 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.5, + "epoch": 32.41044776119403, + "grad_norm": 0.6142423599323745, + "learning_rate": 4.2879122949472656e-07, + "loss": 0.0002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4311 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.22222900390625, + "epoch": 32.417910447761194, + "grad_norm": 0.0, + "learning_rate": 4.2874570447689226e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4312 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.0277862548828, + "epoch": 32.42537313432836, + "grad_norm": 0.0, + "learning_rate": 4.2870016732961815e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4313 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.97222900390625, + "epoch": 32.43283582089552, + "grad_norm": 0.0, + "learning_rate": 4.286546180559945e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4314 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.6666717529297, + "epoch": 32.440298507462686, + "grad_norm": 0.0, + "learning_rate": 4.2860905665911206e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4315 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.94444274902344, + "epoch": 32.44776119402985, + "grad_norm": 0.0, + "learning_rate": 4.2856348314206257e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4316 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.7777862548828, + "epoch": 32.45522388059702, + "grad_norm": 0.0, + "learning_rate": 4.285178975079387e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4317 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.0277862548828, + "epoch": 32.46268656716418, + "grad_norm": 1.110354423388445, + "learning_rate": 4.284722997598337e-07, + "loss": 0.0001, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4318 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.0833282470703, + "epoch": 32.47014925373134, + "grad_norm": 0.0, + "learning_rate": 4.2842668990084187e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4319 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.0833282470703, + "epoch": 32.47761194029851, + "grad_norm": 0.0, + "learning_rate": 4.283810679340582e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4320 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.7777862548828, + "epoch": 32.485074626865675, + "grad_norm": 0.7932969668746247, + "learning_rate": 4.2833543386257853e-07, + "loss": 0.0, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4321 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.0833282470703, + "epoch": 32.492537313432834, + "grad_norm": 0.0, + "learning_rate": 4.2828978768949954e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4322 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.61111450195312, + "epoch": 32.5, + "grad_norm": 0.0, + "learning_rate": 4.2824412941791867e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4323 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.5, + "epoch": 32.507462686567166, + "grad_norm": 0.0, + "learning_rate": 4.281984590509343e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4324 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.36111450195312, + "epoch": 32.514925373134325, + "grad_norm": 0.34109810339012525, + "learning_rate": 4.2815277659164546e-07, + "loss": 0.0001, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 4325 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.4166717529297, + "epoch": 32.52238805970149, + "grad_norm": 0.0, + "learning_rate": 4.2810708204315214e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4326 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.2777862548828, + "epoch": 32.52985074626866, + "grad_norm": 0.0, + "learning_rate": 4.2806137540855527e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4327 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.11111450195312, + "epoch": 32.53731343283582, + "grad_norm": 0.0, + "learning_rate": 4.280156566909562e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4328 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.5, + "epoch": 32.54477611940298, + "grad_norm": 0.0, + "learning_rate": 4.279699258934574e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4329 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.97222900390625, + "epoch": 32.55223880597015, + "grad_norm": 0.0, + "learning_rate": 4.2792418301916223e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4330 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.5833282470703, + "epoch": 32.559701492537314, + "grad_norm": 0.0, + "learning_rate": 4.278784280711746e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4331 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.97222900390625, + "epoch": 32.56716417910448, + "grad_norm": 0.0, + "learning_rate": 4.2783266105259943e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4332 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.8333282470703, + "epoch": 32.57462686567164, + "grad_norm": 0.0, + "learning_rate": 4.277868819665423e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4333 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.6666717529297, + "epoch": 32.582089552238806, + "grad_norm": 0.4112845819879238, + "learning_rate": 4.2774109081610997e-07, + "loss": 0.0002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4334 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.5833282470703, + "epoch": 32.58955223880597, + "grad_norm": 0.8616386949462089, + "learning_rate": 4.276952876044095e-07, + "loss": -0.0015, + "reward": 1.9166666269302368, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4335 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.19444274902344, + "epoch": 32.59701492537313, + "grad_norm": 0.0, + "learning_rate": 4.276494723345493e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4336 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.47222900390625, + "epoch": 32.6044776119403, + "grad_norm": 0.643344413635867, + "learning_rate": 4.276036450096381e-07, + "loss": -0.0002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4337 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.9166717529297, + "epoch": 32.61194029850746, + "grad_norm": 0.0, + "learning_rate": 4.275578056327857e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4338 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.1666717529297, + "epoch": 32.61940298507463, + "grad_norm": 0.0, + "learning_rate": 4.2751195420710283e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4339 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.5833282470703, + "epoch": 32.62686567164179, + "grad_norm": 0.5601875980616329, + "learning_rate": 4.2746609073570085e-07, + "loss": 0.002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4340 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.0277862548828, + "epoch": 32.634328358208954, + "grad_norm": 0.0, + "learning_rate": 4.274202152216919e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4341 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.13888549804688, + "epoch": 32.64179104477612, + "grad_norm": 0.0, + "learning_rate": 4.273743276681891e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4342 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.25, + "epoch": 32.649253731343286, + "grad_norm": 0.0, + "learning_rate": 4.2732842807830636e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4343 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.3333282470703, + "epoch": 32.656716417910445, + "grad_norm": 0.9429320953917231, + "learning_rate": 4.2728251645515836e-07, + "loss": 0.0017, + "reward": 1.6944444179534912, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.694444477558136, + "rewards/format_reward": 1.0, + "step": 4344 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.6666717529297, + "epoch": 32.66417910447761, + "grad_norm": 0.0, + "learning_rate": 4.2723659280186055e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4345 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.47222900390625, + "epoch": 32.67164179104478, + "grad_norm": 0.0, + "learning_rate": 4.2719065712152917e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4346 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.5277862548828, + "epoch": 32.67910447761194, + "grad_norm": 0.0, + "learning_rate": 4.2714470941728155e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4347 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.69444274902344, + "epoch": 32.6865671641791, + "grad_norm": 0.0, + "learning_rate": 4.270987496922355e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4348 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.05555725097656, + "epoch": 32.69402985074627, + "grad_norm": 0.0, + "learning_rate": 4.2705277794950976e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4349 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.94444274902344, + "epoch": 32.701492537313435, + "grad_norm": 0.0, + "learning_rate": 4.270067941922241e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4350 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.8333282470703, + "epoch": 32.708955223880594, + "grad_norm": 1.9008636524071114, + "learning_rate": 4.269607984234986e-07, + "loss": 0.0009, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 4351 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.38888549804688, + "epoch": 32.71641791044776, + "grad_norm": 0.5191883066665324, + "learning_rate": 4.2691479064645477e-07, + "loss": 0.0001, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4352 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.13888549804688, + "epoch": 32.723880597014926, + "grad_norm": 2.8004200623173725, + "learning_rate": 4.268687708642145e-07, + "loss": 0.0, + "reward": 1.9444444179534912, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.9722222089767456, + "step": 4353 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.30555725097656, + "epoch": 32.73134328358209, + "grad_norm": 0.6302212861430119, + "learning_rate": 4.268227390799006e-07, + "loss": 0.0002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4354 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.61111450195312, + "epoch": 32.73880597014925, + "grad_norm": 0.0, + "learning_rate": 4.2677669529663686e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4355 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.1666717529297, + "epoch": 32.74626865671642, + "grad_norm": 0.0, + "learning_rate": 4.267306395175476e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4356 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.30555725097656, + "epoch": 32.75373134328358, + "grad_norm": 0.9311911760157396, + "learning_rate": 4.266845717457582e-07, + "loss": -0.0005, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 4357 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.9166717529297, + "epoch": 32.76119402985075, + "grad_norm": 1.6086950464901204, + "learning_rate": 4.266384919843947e-07, + "loss": 0.0234, + "reward": 1.9166666269302368, + "reward_std": 0.1666666716337204, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4358 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.7777862548828, + "epoch": 32.76865671641791, + "grad_norm": 0.0, + "learning_rate": 4.26592400236584e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4359 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.8333282470703, + "epoch": 32.776119402985074, + "grad_norm": 0.0, + "learning_rate": 4.265462965054538e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4360 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.61111450195312, + "epoch": 32.78358208955224, + "grad_norm": 0.0, + "learning_rate": 4.2650018079413286e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4361 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.8333282470703, + "epoch": 32.791044776119406, + "grad_norm": 0.0, + "learning_rate": 4.264540531057502e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4362 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.88888549804688, + "epoch": 32.798507462686565, + "grad_norm": 0.0, + "learning_rate": 4.2640791344343617e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4363 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.4166717529297, + "epoch": 32.80597014925373, + "grad_norm": 0.0, + "learning_rate": 4.263617618103218e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4364 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.5, + "epoch": 32.8134328358209, + "grad_norm": 0.0, + "learning_rate": 4.263155982095387e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4365 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.69444274902344, + "epoch": 32.82089552238806, + "grad_norm": 0.0, + "learning_rate": 4.262694226442196e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4366 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.88888549804688, + "epoch": 32.82835820895522, + "grad_norm": 0.0, + "learning_rate": 4.262232351174978e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4367 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.6666717529297, + "epoch": 32.83582089552239, + "grad_norm": 0.0, + "learning_rate": 4.261770356325077e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4368 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.11111450195312, + "epoch": 32.843283582089555, + "grad_norm": 1.161887151127392, + "learning_rate": 4.261308241923841e-07, + "loss": -0.0022, + "reward": 1.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4369 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.36111450195312, + "epoch": 32.850746268656714, + "grad_norm": 0.0, + "learning_rate": 4.2608460080026304e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4370 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.55555725097656, + "epoch": 32.85820895522388, + "grad_norm": 1.604762804509365, + "learning_rate": 4.2603836545928114e-07, + "loss": -0.0008, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 4371 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.22222900390625, + "epoch": 32.865671641791046, + "grad_norm": 0.0, + "learning_rate": 4.2599211817257574e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4372 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.0833282470703, + "epoch": 32.87313432835821, + "grad_norm": 0.0, + "learning_rate": 4.2594585894328525e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4373 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.0833282470703, + "epoch": 32.88059701492537, + "grad_norm": 0.0, + "learning_rate": 4.258995877745488e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4374 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.2777862548828, + "epoch": 32.88805970149254, + "grad_norm": 0.0, + "learning_rate": 4.2585330466950616e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.5277862548828, + "epoch": 32.8955223880597, + "grad_norm": 0.0, + "learning_rate": 4.258070096312981e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4376 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.13888549804688, + "epoch": 32.90298507462687, + "grad_norm": 0.0, + "learning_rate": 4.2576070266306605e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4377 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.22222900390625, + "epoch": 32.91044776119403, + "grad_norm": 0.0, + "learning_rate": 4.2571438376795256e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4378 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.5833282470703, + "epoch": 32.917910447761194, + "grad_norm": 0.0, + "learning_rate": 4.2566805294910047e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4379 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.94444274902344, + "epoch": 32.92537313432836, + "grad_norm": 0.0, + "learning_rate": 4.256217102096539e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4380 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.94444274902344, + "epoch": 32.93283582089552, + "grad_norm": 0.5562176448623602, + "learning_rate": 4.255753555527576e-07, + "loss": 0.0002, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4381 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.0277862548828, + "epoch": 32.940298507462686, + "grad_norm": 0.0, + "learning_rate": 4.255289889815572e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4382 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.5277862548828, + "epoch": 32.94776119402985, + "grad_norm": 0.0, + "learning_rate": 4.254826104991989e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4383 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.5833282470703, + "epoch": 32.95522388059702, + "grad_norm": 0.0, + "learning_rate": 4.2543622010883e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4384 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.1666717529297, + "epoch": 32.96268656716418, + "grad_norm": 0.0, + "learning_rate": 4.2538981781359844e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4385 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.8333282470703, + "epoch": 32.97014925373134, + "grad_norm": 0.0, + "learning_rate": 4.25343403616653e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4386 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.38888549804688, + "epoch": 32.97761194029851, + "grad_norm": 1.3973626591092223, + "learning_rate": 4.2529697752114334e-07, + "loss": -0.0221, + "reward": 1.9444444179534912, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4387 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.6666717529297, + "epoch": 32.985074626865675, + "grad_norm": 0.0, + "learning_rate": 4.2525053953021994e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4388 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.6666717529297, + "epoch": 32.992537313432834, + "grad_norm": 0.0, + "learning_rate": 4.252040896470338e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4389 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.75, + "epoch": 33.007462686567166, + "grad_norm": 0.0, + "learning_rate": 4.251576278747371e-07, + "loss": 0.0, + "reward": 1.6666666269302368, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 1.0, + "step": 4390 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.25, + "epoch": 33.014925373134325, + "grad_norm": 0.0, + "learning_rate": 4.251111542164827e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4391 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.5277862548828, + "epoch": 33.02238805970149, + "grad_norm": 0.0, + "learning_rate": 4.250646686754242e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4392 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.94444274902344, + "epoch": 33.02985074626866, + "grad_norm": 0.0, + "learning_rate": 4.2501817125471595e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4393 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.72222900390625, + "epoch": 33.03731343283582, + "grad_norm": 0.0, + "learning_rate": 4.249716619575133e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4394 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.8333282470703, + "epoch": 33.04477611940298, + "grad_norm": 0.8276019651310628, + "learning_rate": 4.249251407869724e-07, + "loss": 0.0001, + "reward": 1.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4395 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.02777862548828, + "epoch": 33.05223880597015, + "grad_norm": 0.0, + "learning_rate": 4.2487860774624985e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4396 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.36111450195312, + "epoch": 33.059701492537314, + "grad_norm": 0.0, + "learning_rate": 4.248320628385035e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4397 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.69444274902344, + "epoch": 33.06716417910448, + "grad_norm": 0.0, + "learning_rate": 4.2478550606689187e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4398 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.75, + "epoch": 33.07462686567164, + "grad_norm": 0.0, + "learning_rate": 4.247389374345741e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4399 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.47222900390625, + "epoch": 33.082089552238806, + "grad_norm": 0.0, + "learning_rate": 4.246923569447104e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4400 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.97222900390625, + "epoch": 33.08955223880597, + "grad_norm": 0.0, + "learning_rate": 4.246457646004615e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4401 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.44444274902344, + "epoch": 33.09701492537314, + "grad_norm": 0.0, + "learning_rate": 4.245991604049893e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4402 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.5, + "epoch": 33.1044776119403, + "grad_norm": 0.0, + "learning_rate": 4.2455254436145616e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4403 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.94444274902344, + "epoch": 33.11194029850746, + "grad_norm": 0.0, + "learning_rate": 4.245059164730254e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4404 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.38888549804688, + "epoch": 33.11940298507463, + "grad_norm": 1.2490311759839778, + "learning_rate": 4.244592767428611e-07, + "loss": -0.0002, + "reward": 1.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4405 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.5833282470703, + "epoch": 33.12686567164179, + "grad_norm": 0.0, + "learning_rate": 4.2441262517412824e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4406 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.97222137451172, + "epoch": 33.134328358208954, + "grad_norm": 0.0, + "learning_rate": 4.243659617699924e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4407 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.69444274902344, + "epoch": 33.14179104477612, + "grad_norm": 0.0, + "learning_rate": 4.243192865336203e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4408 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.5277862548828, + "epoch": 33.149253731343286, + "grad_norm": 0.7394680344014802, + "learning_rate": 4.242725994681791e-07, + "loss": -0.0007, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4409 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.63888549804688, + "epoch": 33.156716417910445, + "grad_norm": 0.0, + "learning_rate": 4.2422590057683694e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4410 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.30555725097656, + "epoch": 33.16417910447761, + "grad_norm": 0.0, + "learning_rate": 4.2417918986276277e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4411 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.69444274902344, + "epoch": 33.17164179104478, + "grad_norm": 0.6123170691974259, + "learning_rate": 4.2413246732912643e-07, + "loss": 0.0008, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4412 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.2777862548828, + "epoch": 33.17910447761194, + "grad_norm": 0.0, + "learning_rate": 4.240857329790982e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4413 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.44444274902344, + "epoch": 33.1865671641791, + "grad_norm": 0.0, + "learning_rate": 4.240389868158496e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4414 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.5, + "epoch": 33.19402985074627, + "grad_norm": 0.0, + "learning_rate": 4.239922288425527e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4415 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.44444274902344, + "epoch": 33.201492537313435, + "grad_norm": 1.198249786501864, + "learning_rate": 4.2394545906238046e-07, + "loss": 0.0002, + "reward": 1.7777777910232544, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4416 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.44444274902344, + "epoch": 33.208955223880594, + "grad_norm": 0.0, + "learning_rate": 4.2389867747850654e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4417 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.6666717529297, + "epoch": 33.21641791044776, + "grad_norm": 0.0, + "learning_rate": 4.2385188409410555e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4418 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.5277862548828, + "epoch": 33.223880597014926, + "grad_norm": 0.0, + "learning_rate": 4.238050789123528e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4419 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.5833282470703, + "epoch": 33.23134328358209, + "grad_norm": 0.7011326071383946, + "learning_rate": 4.237582619364244e-07, + "loss": 0.0002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4420 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.0277862548828, + "epoch": 33.23880597014925, + "grad_norm": 0.0, + "learning_rate": 4.237114331694973e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4421 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.38888549804688, + "epoch": 33.24626865671642, + "grad_norm": 0.8584391945068184, + "learning_rate": 4.236645926147493e-07, + "loss": -0.0005, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4422 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.19444274902344, + "epoch": 33.25373134328358, + "grad_norm": 1.6944389629341332, + "learning_rate": 4.236177402753589e-07, + "loss": 0.0, + "reward": 1.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4423 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.44444274902344, + "epoch": 33.26119402985075, + "grad_norm": 0.0, + "learning_rate": 4.2357087615450535e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4424 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.94444274902344, + "epoch": 33.26865671641791, + "grad_norm": 0.0, + "learning_rate": 4.2352400025536887e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4425 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.11111450195312, + "epoch": 33.276119402985074, + "grad_norm": 0.0, + "learning_rate": 4.234771125811304e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4426 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.05555725097656, + "epoch": 33.28358208955224, + "grad_norm": 0.0, + "learning_rate": 4.2343021313497164e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4427 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.3888931274414, + "epoch": 33.291044776119406, + "grad_norm": 0.0, + "learning_rate": 4.233833019200751e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4428 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.88888549804688, + "epoch": 33.298507462686565, + "grad_norm": 0.0, + "learning_rate": 4.233363789396242e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4429 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.69444274902344, + "epoch": 33.30597014925373, + "grad_norm": 0.0, + "learning_rate": 4.2328944419680294e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4430 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.72222900390625, + "epoch": 33.3134328358209, + "grad_norm": 0.0, + "learning_rate": 4.232424976947964e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4431 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.9166717529297, + "epoch": 33.32089552238806, + "grad_norm": 0.8951065445932648, + "learning_rate": 4.231955394367902e-07, + "loss": 0.0021, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 4432 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.69444274902344, + "epoch": 33.32835820895522, + "grad_norm": 0.0, + "learning_rate": 4.231485694259709e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4433 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.72222900390625, + "epoch": 33.33582089552239, + "grad_norm": 1.3089479274124685, + "learning_rate": 4.231015876655258e-07, + "loss": 0.0017, + "reward": 1.9444444179534912, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4434 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.88888549804688, + "epoch": 33.343283582089555, + "grad_norm": 0.0, + "learning_rate": 4.2305459415864303e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4435 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.72222900390625, + "epoch": 33.350746268656714, + "grad_norm": 2.3704130218576824, + "learning_rate": 4.230075889085115e-07, + "loss": -0.0, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4436 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.0277862548828, + "epoch": 33.35820895522388, + "grad_norm": 0.0, + "learning_rate": 4.2296057191832096e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4437 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.6666717529297, + "epoch": 33.365671641791046, + "grad_norm": 0.0, + "learning_rate": 4.229135431912619e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4438 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.8333282470703, + "epoch": 33.37313432835821, + "grad_norm": 0.0, + "learning_rate": 4.228665027305256e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4439 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.86111450195312, + "epoch": 33.38059701492537, + "grad_norm": 0.0, + "learning_rate": 4.228194505393041e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4440 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.30555725097656, + "epoch": 33.38805970149254, + "grad_norm": 0.0, + "learning_rate": 4.2277238662079044e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4441 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.94444274902344, + "epoch": 33.3955223880597, + "grad_norm": 0.0, + "learning_rate": 4.2272531097817827e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4442 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.38888549804688, + "epoch": 33.40298507462686, + "grad_norm": 0.0, + "learning_rate": 4.22678223614662e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4443 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.6666717529297, + "epoch": 33.41044776119403, + "grad_norm": 0.0, + "learning_rate": 4.2263112453343697e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4444 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.38888549804688, + "epoch": 33.417910447761194, + "grad_norm": 0.0, + "learning_rate": 4.2258401373769927e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4445 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.69444274902344, + "epoch": 33.42537313432836, + "grad_norm": 0.0, + "learning_rate": 4.225368912306457e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4446 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.72222900390625, + "epoch": 33.43283582089552, + "grad_norm": 0.0, + "learning_rate": 4.2248975701547406e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4447 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.9166717529297, + "epoch": 33.440298507462686, + "grad_norm": 0.0, + "learning_rate": 4.2244261109538275e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4448 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.38888549804688, + "epoch": 33.44776119402985, + "grad_norm": 0.0, + "learning_rate": 4.223954534735711e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4449 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.13888549804688, + "epoch": 33.45522388059702, + "grad_norm": 0.0, + "learning_rate": 4.22348284153239e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4450 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.80555725097656, + "epoch": 33.46268656716418, + "grad_norm": 0.0, + "learning_rate": 4.2230110313758726e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4451 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.19444274902344, + "epoch": 33.47014925373134, + "grad_norm": 1.2858761973162847, + "learning_rate": 4.2225391042981777e-07, + "loss": -0.0023, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4452 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.22222900390625, + "epoch": 33.47761194029851, + "grad_norm": 0.542692522376199, + "learning_rate": 4.2220670603313284e-07, + "loss": 0.0006, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4453 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.97222900390625, + "epoch": 33.485074626865675, + "grad_norm": 0.0, + "learning_rate": 4.221594899507357e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4454 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.47222900390625, + "epoch": 33.492537313432834, + "grad_norm": 1.784296578599657, + "learning_rate": 4.221122621858303e-07, + "loss": 0.0003, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4455 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.86111450195312, + "epoch": 33.5, + "grad_norm": 0.0, + "learning_rate": 4.220650227416215e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4456 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.22222900390625, + "epoch": 33.507462686567166, + "grad_norm": 0.0, + "learning_rate": 4.2201777162131495e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4457 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.9166717529297, + "epoch": 33.514925373134325, + "grad_norm": 0.0, + "learning_rate": 4.21970508828117e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4458 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.4166717529297, + "epoch": 33.52238805970149, + "grad_norm": 0.0, + "learning_rate": 4.219232343652348e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4459 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.97222900390625, + "epoch": 33.52985074626866, + "grad_norm": 0.0, + "learning_rate": 4.218759482358765e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4460 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.30555725097656, + "epoch": 33.53731343283582, + "grad_norm": 0.0, + "learning_rate": 4.218286504432507e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4461 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.30555725097656, + "epoch": 33.54477611940298, + "grad_norm": 0.0, + "learning_rate": 4.2178134099056697e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4462 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.13888549804688, + "epoch": 33.55223880597015, + "grad_norm": 0.0, + "learning_rate": 4.217340198810357e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4463 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.05555725097656, + "epoch": 33.559701492537314, + "grad_norm": 0.47604492112178576, + "learning_rate": 4.2168668711786803e-07, + "loss": 0.0009, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4464 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.2777862548828, + "epoch": 33.56716417910448, + "grad_norm": 0.35705846463758445, + "learning_rate": 4.21639342704276e-07, + "loss": 0.0021, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4465 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.30555725097656, + "epoch": 33.57462686567164, + "grad_norm": 0.0, + "learning_rate": 4.2159198664347225e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4466 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.5277862548828, + "epoch": 33.582089552238806, + "grad_norm": 0.0, + "learning_rate": 4.2154461893867024e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4467 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.7777862548828, + "epoch": 33.58955223880597, + "grad_norm": 0.0, + "learning_rate": 4.214972395930843e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4468 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.72222900390625, + "epoch": 33.59701492537313, + "grad_norm": 0.7115725152001906, + "learning_rate": 4.214498486099296e-07, + "loss": -0.0085, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4469 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.22222900390625, + "epoch": 33.6044776119403, + "grad_norm": 0.0, + "learning_rate": 4.2140244599242205e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4470 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.22222900390625, + "epoch": 33.61194029850746, + "grad_norm": 0.0, + "learning_rate": 4.213550317437782e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4471 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.5, + "epoch": 33.61940298507463, + "grad_norm": 0.0, + "learning_rate": 4.2130760586721557e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4472 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.8888931274414, + "epoch": 33.62686567164179, + "grad_norm": 0.0, + "learning_rate": 4.212601683659525e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4473 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.61111450195312, + "epoch": 33.634328358208954, + "grad_norm": 0.0, + "learning_rate": 4.2121271924320795e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4474 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.97222900390625, + "epoch": 33.64179104477612, + "grad_norm": 0.0, + "learning_rate": 4.211652585022017e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4475 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.0277862548828, + "epoch": 33.649253731343286, + "grad_norm": 0.0, + "learning_rate": 4.2111778614615444e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4476 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.0, + "epoch": 33.656716417910445, + "grad_norm": 0.0, + "learning_rate": 4.2107030217828766e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4477 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.38888549804688, + "epoch": 33.66417910447761, + "grad_norm": 0.0, + "learning_rate": 4.2102280660182343e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4478 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.0277862548828, + "epoch": 33.67164179104478, + "grad_norm": 0.0, + "learning_rate": 4.2097529941998477e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4479 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.63888549804688, + "epoch": 33.67910447761194, + "grad_norm": 0.0, + "learning_rate": 4.209277806359955e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4480 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.6666717529297, + "epoch": 33.6865671641791, + "grad_norm": 0.8497121734361469, + "learning_rate": 4.208802502530801e-07, + "loss": -0.0002, + "reward": 1.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4481 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.55555725097656, + "epoch": 33.69402985074627, + "grad_norm": 0.9565452393125524, + "learning_rate": 4.2083270827446395e-07, + "loss": 0.0003, + "reward": 1.888888955116272, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4482 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.0, + "epoch": 33.701492537313435, + "grad_norm": 0.0, + "learning_rate": 4.207851547033733e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4483 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.36111450195312, + "epoch": 33.708955223880594, + "grad_norm": 0.9566903064956553, + "learning_rate": 4.207375895430348e-07, + "loss": 0.0007, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4484 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.7777862548828, + "epoch": 33.71641791044776, + "grad_norm": 0.6633287899578526, + "learning_rate": 4.206900127966764e-07, + "loss": -0.0003, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4485 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.0833282470703, + "epoch": 33.723880597014926, + "grad_norm": 0.0, + "learning_rate": 4.206424244675265e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4486 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.7777862548828, + "epoch": 33.73134328358209, + "grad_norm": 0.7061547630682532, + "learning_rate": 4.205948245588144e-07, + "loss": 0.0006, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4487 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.47222900390625, + "epoch": 33.73880597014925, + "grad_norm": 0.0, + "learning_rate": 4.2054721307377016e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4488 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.11111450195312, + "epoch": 33.74626865671642, + "grad_norm": 0.0, + "learning_rate": 4.204995900156246e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4489 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.86111450195312, + "epoch": 33.75373134328358, + "grad_norm": 0.0, + "learning_rate": 4.2045195538760946e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4490 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.0277862548828, + "epoch": 33.76119402985075, + "grad_norm": 0.0, + "learning_rate": 4.20404309192957e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4491 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.61111450195312, + "epoch": 33.76865671641791, + "grad_norm": 0.47714986541487325, + "learning_rate": 4.203566514349005e-07, + "loss": 0.0003, + "reward": 1.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4492 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.47222900390625, + "epoch": 33.776119402985074, + "grad_norm": 0.0, + "learning_rate": 4.2030898211667397e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4493 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.86111450195312, + "epoch": 33.78358208955224, + "grad_norm": 0.0, + "learning_rate": 4.2026130124151227e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4494 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.11111450195312, + "epoch": 33.791044776119406, + "grad_norm": 0.0, + "learning_rate": 4.2021360881265076e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4495 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.0277862548828, + "epoch": 33.798507462686565, + "grad_norm": 1.0486770324956582, + "learning_rate": 4.2016590483332594e-07, + "loss": 0.0013, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4496 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.19444274902344, + "epoch": 33.80597014925373, + "grad_norm": 0.0, + "learning_rate": 4.2011818930677487e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4497 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.80555725097656, + "epoch": 33.8134328358209, + "grad_norm": 0.0, + "learning_rate": 4.2007046223623543e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4498 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.5277862548828, + "epoch": 33.82089552238806, + "grad_norm": 1.1020790870871109, + "learning_rate": 4.200227236249464e-07, + "loss": -0.0011, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4499 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.72222900390625, + "epoch": 33.82835820895522, + "grad_norm": 0.0, + "learning_rate": 4.199749734761473e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4500 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.0, + "epoch": 33.83582089552239, + "grad_norm": 0.0, + "learning_rate": 4.1992721179307814e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4501 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.63888549804688, + "epoch": 33.843283582089555, + "grad_norm": 0.0, + "learning_rate": 4.198794385789802e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4502 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.13888549804688, + "epoch": 33.850746268656714, + "grad_norm": 0.0, + "learning_rate": 4.198316538370953e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4503 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.47222900390625, + "epoch": 33.85820895522388, + "grad_norm": 1.0541576810884656, + "learning_rate": 4.1978385757066593e-07, + "loss": 0.0, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 4504 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.0, + "epoch": 33.865671641791046, + "grad_norm": 0.0, + "learning_rate": 4.1973604978293546e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4505 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.44444274902344, + "epoch": 33.87313432835821, + "grad_norm": 0.5878809851286388, + "learning_rate": 4.1968823047714817e-07, + "loss": -0.0005, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4506 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.11111450195312, + "epoch": 33.88059701492537, + "grad_norm": 0.38128149506940423, + "learning_rate": 4.196403996565491e-07, + "loss": 0.0, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 0.9722222089767456, + "step": 4507 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.30555725097656, + "epoch": 33.88805970149254, + "grad_norm": 0.0, + "learning_rate": 4.195925573243837e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4508 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.6666717529297, + "epoch": 33.8955223880597, + "grad_norm": 0.9376827604836628, + "learning_rate": 4.195447034838987e-07, + "loss": 0.0, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4509 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.30555725097656, + "epoch": 33.90298507462687, + "grad_norm": 0.0, + "learning_rate": 4.194968381383414e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4510 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.9166717529297, + "epoch": 33.91044776119403, + "grad_norm": 0.0, + "learning_rate": 4.1944896129095974e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4511 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.5833282470703, + "epoch": 33.917910447761194, + "grad_norm": 0.0, + "learning_rate": 4.1940107294500266e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4512 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.38888549804688, + "epoch": 33.92537313432836, + "grad_norm": 0.0, + "learning_rate": 4.1935317310371974e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4513 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.6666717529297, + "epoch": 33.93283582089552, + "grad_norm": 0.0, + "learning_rate": 4.1930526177036157e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4514 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.1666717529297, + "epoch": 33.940298507462686, + "grad_norm": 0.0, + "learning_rate": 4.1925733894817915e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4515 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.22222900390625, + "epoch": 33.94776119402985, + "grad_norm": 1.0031109039692216, + "learning_rate": 4.192094046404246e-07, + "loss": 0.0142, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4516 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.0, + "epoch": 33.95522388059702, + "grad_norm": 1.139036011135441, + "learning_rate": 4.191614588503506e-07, + "loss": -0.0004, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 4517 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.25, + "epoch": 33.96268656716418, + "grad_norm": 0.0, + "learning_rate": 4.1911350158121066e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4518 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.3333282470703, + "epoch": 33.97014925373134, + "grad_norm": 0.0, + "learning_rate": 4.190655328362591e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4519 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.30555725097656, + "epoch": 33.97761194029851, + "grad_norm": 1.0851044872181843, + "learning_rate": 4.190175526187511e-07, + "loss": 0.0024, + "reward": 1.9166666269302368, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4520 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.94444274902344, + "epoch": 33.985074626865675, + "grad_norm": 0.0, + "learning_rate": 4.1896956093194254e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4521 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.4166717529297, + "epoch": 33.992537313432834, + "grad_norm": 0.0, + "learning_rate": 4.1892155777908987e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4522 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.05555725097656, + "epoch": 34.007462686567166, + "grad_norm": 0.0, + "learning_rate": 4.188735431634508e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4523 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.72222900390625, + "epoch": 34.014925373134325, + "grad_norm": 0.0, + "learning_rate": 4.1882551708828327e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4524 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.69444274902344, + "epoch": 34.02238805970149, + "grad_norm": 0.0, + "learning_rate": 4.187774795568465e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4525 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.30555725097656, + "epoch": 34.02985074626866, + "grad_norm": 0.0, + "learning_rate": 4.187294305724001e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4526 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.94444274902344, + "epoch": 34.03731343283582, + "grad_norm": 0.0, + "learning_rate": 4.186813701382046e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4527 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.38888549804688, + "epoch": 34.04477611940298, + "grad_norm": 0.0, + "learning_rate": 4.186332982575214e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4528 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.3333282470703, + "epoch": 34.05223880597015, + "grad_norm": 0.0, + "learning_rate": 4.185852149336125e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4529 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.5277862548828, + "epoch": 34.059701492537314, + "grad_norm": 0.0, + "learning_rate": 4.18537120169741e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4530 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.0833282470703, + "epoch": 34.06716417910448, + "grad_norm": 0.0, + "learning_rate": 4.184890139691702e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4531 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.5, + "epoch": 34.07462686567164, + "grad_norm": 0.0, + "learning_rate": 4.184408963351648e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4532 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.36111450195312, + "epoch": 34.082089552238806, + "grad_norm": 0.0, + "learning_rate": 4.183927672709898e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4533 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.94444274902344, + "epoch": 34.08955223880597, + "grad_norm": 0.0, + "learning_rate": 4.183446267799112e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4534 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.80555725097656, + "epoch": 34.09701492537314, + "grad_norm": 0.0, + "learning_rate": 4.1829647486519593e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4535 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.5833282470703, + "epoch": 34.1044776119403, + "grad_norm": 0.0, + "learning_rate": 4.182483115301114e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4536 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.0833282470703, + "epoch": 34.11194029850746, + "grad_norm": 0.0, + "learning_rate": 4.1820013677792585e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4537 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.6666717529297, + "epoch": 34.11940298507463, + "grad_norm": 0.0, + "learning_rate": 4.1815195061190846e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4538 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.61111450195312, + "epoch": 34.12686567164179, + "grad_norm": 0.0, + "learning_rate": 4.18103753035329e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4539 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.36111450195312, + "epoch": 34.134328358208954, + "grad_norm": 0.0, + "learning_rate": 4.1805554405145805e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4540 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.19444274902344, + "epoch": 34.14179104477612, + "grad_norm": 0.7885290504127156, + "learning_rate": 4.1800732366356704e-07, + "loss": 0.0001, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4541 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.2777862548828, + "epoch": 34.149253731343286, + "grad_norm": 0.0, + "learning_rate": 4.179590918749283e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4542 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.5277862548828, + "epoch": 34.156716417910445, + "grad_norm": 0.0, + "learning_rate": 4.179108486888145e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4543 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.25, + "epoch": 34.16417910447761, + "grad_norm": 0.0, + "learning_rate": 4.178625941084996e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4544 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.94444274902344, + "epoch": 34.17164179104478, + "grad_norm": 0.0, + "learning_rate": 4.17814328137258e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4545 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.72222900390625, + "epoch": 34.17910447761194, + "grad_norm": 0.0, + "learning_rate": 4.177660507783649e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4546 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.61111450195312, + "epoch": 34.1865671641791, + "grad_norm": 0.5276080394046138, + "learning_rate": 4.1771776203509643e-07, + "loss": -0.0005, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4547 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.5277862548828, + "epoch": 34.19402985074627, + "grad_norm": 0.0, + "learning_rate": 4.1766946191072936e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4548 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.5, + "epoch": 34.201492537313435, + "grad_norm": 4.935277986388956, + "learning_rate": 4.176211504085412e-07, + "loss": -0.0005, + "reward": 1.888888955116272, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4549 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.77777862548828, + "epoch": 34.208955223880594, + "grad_norm": 0.0, + "learning_rate": 4.1757282753181047e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4550 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.75, + "epoch": 34.21641791044776, + "grad_norm": 0.0, + "learning_rate": 4.1752449328381606e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4551 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.69444274902344, + "epoch": 34.223880597014926, + "grad_norm": 0.0, + "learning_rate": 4.174761476678381e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4552 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.9166717529297, + "epoch": 34.23134328358209, + "grad_norm": 0.0, + "learning_rate": 4.174277906871571e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4553 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.88888549804688, + "epoch": 34.23880597014925, + "grad_norm": 1.0090318655338504, + "learning_rate": 4.1737942234505456e-07, + "loss": -0.0007, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 4554 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.36111450195312, + "epoch": 34.24626865671642, + "grad_norm": 0.0, + "learning_rate": 4.173310426448128e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4555 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.0833282470703, + "epoch": 34.25373134328358, + "grad_norm": 0.4292329513179847, + "learning_rate": 4.172826515897145e-07, + "loss": 0.0021, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4556 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.2777862548828, + "epoch": 34.26119402985075, + "grad_norm": 0.0, + "learning_rate": 4.172342491830437e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4557 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.13888549804688, + "epoch": 34.26865671641791, + "grad_norm": 0.0, + "learning_rate": 4.171858354280848e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4558 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.38888549804688, + "epoch": 34.276119402985074, + "grad_norm": 0.0, + "learning_rate": 4.1713741032812316e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4559 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.13888549804688, + "epoch": 34.28358208955224, + "grad_norm": 0.0, + "learning_rate": 4.1708897388644477e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4560 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.1666717529297, + "epoch": 34.291044776119406, + "grad_norm": 0.0, + "learning_rate": 4.1704052610633645e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4561 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.19444274902344, + "epoch": 34.298507462686565, + "grad_norm": 0.0, + "learning_rate": 4.1699206699108584e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4562 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.97222900390625, + "epoch": 34.30597014925373, + "grad_norm": 0.0, + "learning_rate": 4.1694359654398134e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4563 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.86111450195312, + "epoch": 34.3134328358209, + "grad_norm": 0.0, + "learning_rate": 4.1689511476831197e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4564 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.44444274902344, + "epoch": 34.32089552238806, + "grad_norm": 0.0, + "learning_rate": 4.1684662166736784e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4565 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.36111450195312, + "epoch": 34.32835820895522, + "grad_norm": 0.0, + "learning_rate": 4.167981172444394e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4566 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.9166717529297, + "epoch": 34.33582089552239, + "grad_norm": 0.0, + "learning_rate": 4.167496015028182e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4567 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.11111450195312, + "epoch": 34.343283582089555, + "grad_norm": 0.0, + "learning_rate": 4.167010744457965e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4568 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.5833282470703, + "epoch": 34.350746268656714, + "grad_norm": 0.0, + "learning_rate": 4.166525360766672e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4569 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.69444274902344, + "epoch": 34.35820895522388, + "grad_norm": 0.0, + "learning_rate": 4.1660398639872405e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4570 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.72222900390625, + "epoch": 34.365671641791046, + "grad_norm": 1.0068248405761966, + "learning_rate": 4.165554254152616e-07, + "loss": 0.0004, + "reward": 1.9444444179534912, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4571 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.30555725097656, + "epoch": 34.37313432835821, + "grad_norm": 0.0, + "learning_rate": 4.1650685312957516e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4572 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.11111450195312, + "epoch": 34.38059701492537, + "grad_norm": 0.0, + "learning_rate": 4.1645826954496075e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4573 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.11111450195312, + "epoch": 34.38805970149254, + "grad_norm": 0.0, + "learning_rate": 4.1640967466471513e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4574 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.94444274902344, + "epoch": 34.3955223880597, + "grad_norm": 0.0, + "learning_rate": 4.16361068492136e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4575 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.1666717529297, + "epoch": 34.40298507462686, + "grad_norm": 0.0, + "learning_rate": 4.163124510305216e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4576 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.5277862548828, + "epoch": 34.41044776119403, + "grad_norm": 0.0, + "learning_rate": 4.1626382228317105e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4577 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.55555725097656, + "epoch": 34.417910447761194, + "grad_norm": 0.0, + "learning_rate": 4.1621518225338424e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4578 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.55555725097656, + "epoch": 34.42537313432836, + "grad_norm": 0.0, + "learning_rate": 4.1616653094446197e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4579 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.86111450195312, + "epoch": 34.43283582089552, + "grad_norm": 0.6134026777032776, + "learning_rate": 4.161178683597054e-07, + "loss": -0.0033, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 4580 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.55555725097656, + "epoch": 34.440298507462686, + "grad_norm": 0.0, + "learning_rate": 4.1606919450241683e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4581 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.22222900390625, + "epoch": 34.44776119402985, + "grad_norm": 0.0, + "learning_rate": 4.1602050937589933e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4582 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.36111450195312, + "epoch": 34.45522388059702, + "grad_norm": 0.0, + "learning_rate": 4.159718129834564e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4583 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.0833282470703, + "epoch": 34.46268656716418, + "grad_norm": 0.0, + "learning_rate": 4.159231053283926e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4584 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.1666717529297, + "epoch": 34.47014925373134, + "grad_norm": 0.0, + "learning_rate": 4.158743864140131e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4585 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.55555725097656, + "epoch": 34.47761194029851, + "grad_norm": 1.1988452798216929, + "learning_rate": 4.15825656243624e-07, + "loss": 0.0, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4586 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.13888549804688, + "epoch": 34.485074626865675, + "grad_norm": 0.0, + "learning_rate": 4.1577691482053194e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4587 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.2777862548828, + "epoch": 34.492537313432834, + "grad_norm": 0.0, + "learning_rate": 4.157281621480446e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4588 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.4166717529297, + "epoch": 34.5, + "grad_norm": 0.0, + "learning_rate": 4.156793982294702e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4589 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.9166717529297, + "epoch": 34.507462686567166, + "grad_norm": 0.0, + "learning_rate": 4.1563062306811776e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4590 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.55555725097656, + "epoch": 34.514925373134325, + "grad_norm": 0.0, + "learning_rate": 4.1558183666729706e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4591 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.5833282470703, + "epoch": 34.52238805970149, + "grad_norm": 0.0, + "learning_rate": 4.155330390303188e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4592 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.80555725097656, + "epoch": 34.52985074626866, + "grad_norm": 0.9227306678475476, + "learning_rate": 4.1548423016049427e-07, + "loss": 0.0048, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4593 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.5833282470703, + "epoch": 34.53731343283582, + "grad_norm": 0.9508995556945018, + "learning_rate": 4.1543541006113555e-07, + "loss": 0.0028, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4594 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.75, + "epoch": 34.54477611940298, + "grad_norm": 1.1376094815946076, + "learning_rate": 4.153865787355555e-07, + "loss": -0.0023, + "reward": 1.888888955116272, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4595 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.44444274902344, + "epoch": 34.55223880597015, + "grad_norm": 0.0, + "learning_rate": 4.153377361870678e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4596 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.86111450195312, + "epoch": 34.559701492537314, + "grad_norm": 0.0, + "learning_rate": 4.1528888241898673e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4597 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.13888549804688, + "epoch": 34.56716417910448, + "grad_norm": 0.0, + "learning_rate": 4.1524001743462755e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4598 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.5, + "epoch": 34.57462686567164, + "grad_norm": 1.283378301554102, + "learning_rate": 4.151911412373062e-07, + "loss": 0.0, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4599 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.94444274902344, + "epoch": 34.582089552238806, + "grad_norm": 0.0, + "learning_rate": 4.151422538303392e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4600 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.9166717529297, + "epoch": 34.58955223880597, + "grad_norm": 0.0, + "learning_rate": 4.1509335521704413e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4601 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.47222900390625, + "epoch": 34.59701492537313, + "grad_norm": 0.0, + "learning_rate": 4.1504444540073914e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4602 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.5833282470703, + "epoch": 34.6044776119403, + "grad_norm": 0.0, + "learning_rate": 4.149955243847431e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4603 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.44444274902344, + "epoch": 34.61194029850746, + "grad_norm": 0.0, + "learning_rate": 4.1494659217237583e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4604 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.11111450195312, + "epoch": 34.61940298507463, + "grad_norm": 0.0, + "learning_rate": 4.148976487669577e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4605 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.30555725097656, + "epoch": 34.62686567164179, + "grad_norm": 0.6422769426391868, + "learning_rate": 4.148486941718101e-07, + "loss": 0.0, + "reward": 1.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 1.0, + "step": 4606 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.1666717529297, + "epoch": 34.634328358208954, + "grad_norm": 1.0327872852539624, + "learning_rate": 4.147997283902549e-07, + "loss": 0.0, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4607 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.38888549804688, + "epoch": 34.64179104477612, + "grad_norm": 1.767100088441337, + "learning_rate": 4.1475075142561484e-07, + "loss": -0.001, + "reward": 1.888888955116272, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4608 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.5833282470703, + "epoch": 34.649253731343286, + "grad_norm": 0.46196634783513846, + "learning_rate": 4.147017632812134e-07, + "loss": 0.0016, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 4609 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.55555725097656, + "epoch": 34.656716417910445, + "grad_norm": 0.0, + "learning_rate": 4.1465276396037505e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4610 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.30555725097656, + "epoch": 34.66417910447761, + "grad_norm": 0.0, + "learning_rate": 4.1460375346642476e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4611 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.94444274902344, + "epoch": 34.67164179104478, + "grad_norm": 0.0, + "learning_rate": 4.145547318026881e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4612 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.05555725097656, + "epoch": 34.67910447761194, + "grad_norm": 0.0, + "learning_rate": 4.1450569897249174e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4613 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.22222900390625, + "epoch": 34.6865671641791, + "grad_norm": 0.0, + "learning_rate": 4.1445665497916305e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4614 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.3333282470703, + "epoch": 34.69402985074627, + "grad_norm": 0.0, + "learning_rate": 4.1440759982603e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4615 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.94444274902344, + "epoch": 34.701492537313435, + "grad_norm": 0.0, + "learning_rate": 4.1435853351642147e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4616 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.19444274902344, + "epoch": 34.708955223880594, + "grad_norm": 0.0, + "learning_rate": 4.1430945605366694e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4617 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.1666717529297, + "epoch": 34.71641791044776, + "grad_norm": 0.0, + "learning_rate": 4.142603674410969e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4618 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.5277862548828, + "epoch": 34.723880597014926, + "grad_norm": 0.0, + "learning_rate": 4.142112676820423e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4619 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.38888549804688, + "epoch": 34.73134328358209, + "grad_norm": 0.5377018952195836, + "learning_rate": 4.14162156779835e-07, + "loss": 0.0003, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 4620 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.0277862548828, + "epoch": 34.73880597014925, + "grad_norm": 0.0, + "learning_rate": 4.141130347378076e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4621 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.5833282470703, + "epoch": 34.74626865671642, + "grad_norm": 0.0, + "learning_rate": 4.140639015592935e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4622 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.55555725097656, + "epoch": 34.75373134328358, + "grad_norm": 0.6116861915012651, + "learning_rate": 4.140147572476268e-07, + "loss": 0.0, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4623 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.4166717529297, + "epoch": 34.76119402985075, + "grad_norm": 0.3889896710222061, + "learning_rate": 4.1396560180614235e-07, + "loss": 0.0002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4624 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.94444274902344, + "epoch": 34.76865671641791, + "grad_norm": 1.6742461711736694, + "learning_rate": 4.1391643523817574e-07, + "loss": 0.0028, + "reward": 1.888888955116272, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4625 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.5277862548828, + "epoch": 34.776119402985074, + "grad_norm": 0.0, + "learning_rate": 4.138672575470634e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4626 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.9166717529297, + "epoch": 34.78358208955224, + "grad_norm": 0.0, + "learning_rate": 4.1381806873614243e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4627 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.25, + "epoch": 34.791044776119406, + "grad_norm": 0.0, + "learning_rate": 4.137688688087507e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4628 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.9166717529297, + "epoch": 34.798507462686565, + "grad_norm": 0.756976663270941, + "learning_rate": 4.137196577682269e-07, + "loss": -0.0003, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 4629 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.30555725097656, + "epoch": 34.80597014925373, + "grad_norm": 0.0, + "learning_rate": 4.136704356179105e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4630 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.2777862548828, + "epoch": 34.8134328358209, + "grad_norm": 0.0, + "learning_rate": 4.136212023611414e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4631 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.97222900390625, + "epoch": 34.82089552238806, + "grad_norm": 0.0, + "learning_rate": 4.1357195800126073e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4632 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.5833282470703, + "epoch": 34.82835820895522, + "grad_norm": 0.821837233997268, + "learning_rate": 4.1352270254161004e-07, + "loss": 0.0014, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4633 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.4166717529297, + "epoch": 34.83582089552239, + "grad_norm": 0.0, + "learning_rate": 4.1347343598553176e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4634 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.97222900390625, + "epoch": 34.843283582089555, + "grad_norm": 0.0, + "learning_rate": 4.13424158336369e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4635 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.72222900390625, + "epoch": 34.850746268656714, + "grad_norm": 0.0, + "learning_rate": 4.133748695974658e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4636 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.88888549804688, + "epoch": 34.85820895522388, + "grad_norm": 0.0, + "learning_rate": 4.133255697721667e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4637 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.05555725097656, + "epoch": 34.865671641791046, + "grad_norm": 0.0, + "learning_rate": 4.132762588638172e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4638 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.80555725097656, + "epoch": 34.87313432835821, + "grad_norm": 1.4570671527102215, + "learning_rate": 4.1322693687576346e-07, + "loss": -0.0013, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4639 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.4166717529297, + "epoch": 34.88059701492537, + "grad_norm": 0.0, + "learning_rate": 4.1317760381135236e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4640 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.8333282470703, + "epoch": 34.88805970149254, + "grad_norm": 0.5474186094856565, + "learning_rate": 4.1312825967393163e-07, + "loss": -0.0392, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 4641 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.7777862548828, + "epoch": 34.8955223880597, + "grad_norm": 0.0, + "learning_rate": 4.1307890446684966e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4642 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.8333282470703, + "epoch": 34.90298507462687, + "grad_norm": 0.0, + "learning_rate": 4.1302953819345566e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4643 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.3333282470703, + "epoch": 34.91044776119403, + "grad_norm": 0.0, + "learning_rate": 4.129801608570995e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4644 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.97222900390625, + "epoch": 34.917910447761194, + "grad_norm": 0.0, + "learning_rate": 4.1293077246113186e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4645 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.38888549804688, + "epoch": 34.92537313432836, + "grad_norm": 0.0, + "learning_rate": 4.1288137300890425e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4646 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.72222900390625, + "epoch": 34.93283582089552, + "grad_norm": 0.0, + "learning_rate": 4.128319625037687e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4647 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.47222900390625, + "epoch": 34.940298507462686, + "grad_norm": 0.0, + "learning_rate": 4.1278254094907836e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4648 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.19444274902344, + "epoch": 34.94776119402985, + "grad_norm": 0.0, + "learning_rate": 4.127331083481868e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4649 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.8333282470703, + "epoch": 34.95522388059702, + "grad_norm": 0.0, + "learning_rate": 4.126836647044483e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4650 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.0, + "epoch": 34.96268656716418, + "grad_norm": 0.0, + "learning_rate": 4.1263421002121837e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4651 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.86111450195312, + "epoch": 34.97014925373134, + "grad_norm": 2.9249309812926336, + "learning_rate": 4.1258474430185254e-07, + "loss": -0.0001, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 4652 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.3333282470703, + "epoch": 34.97761194029851, + "grad_norm": 0.0, + "learning_rate": 4.125352675497078e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4653 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.8333282470703, + "epoch": 34.985074626865675, + "grad_norm": 1.5243774783604385, + "learning_rate": 4.1248577976814146e-07, + "loss": 0.001, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4654 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.0, + "epoch": 34.992537313432834, + "grad_norm": 0.0, + "learning_rate": 4.1243628096051166e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4655 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.11111450195312, + "epoch": 35.007462686567166, + "grad_norm": 0.0, + "learning_rate": 4.123867711301774e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4656 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.11111450195312, + "epoch": 35.014925373134325, + "grad_norm": 0.0, + "learning_rate": 4.1233725028049827e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4657 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.1666717529297, + "epoch": 35.02238805970149, + "grad_norm": 0.0, + "learning_rate": 4.1228771841483473e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4658 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.2777862548828, + "epoch": 35.02985074626866, + "grad_norm": 0.0, + "learning_rate": 4.122381755365479e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4659 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.44444274902344, + "epoch": 35.03731343283582, + "grad_norm": 0.0, + "learning_rate": 4.1218862164899983e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4660 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.11111450195312, + "epoch": 35.04477611940298, + "grad_norm": 0.7426414619322931, + "learning_rate": 4.1213905675555307e-07, + "loss": -0.0025, + "reward": 1.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 0.944444477558136, + "step": 4661 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.19444274902344, + "epoch": 35.05223880597015, + "grad_norm": 0.0, + "learning_rate": 4.1208948085957094e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4662 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.55555725097656, + "epoch": 35.059701492537314, + "grad_norm": 0.7603424687103473, + "learning_rate": 4.1203989396441775e-07, + "loss": 0.0007, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4663 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.13888549804688, + "epoch": 35.06716417910448, + "grad_norm": 0.0, + "learning_rate": 4.119902960734584e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4664 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.2777862548828, + "epoch": 35.07462686567164, + "grad_norm": 0.8720415333329341, + "learning_rate": 4.119406871900584e-07, + "loss": 0.0006, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4665 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.5, + "epoch": 35.082089552238806, + "grad_norm": 0.0, + "learning_rate": 4.1189106731758423e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4666 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.61111450195312, + "epoch": 35.08955223880597, + "grad_norm": 1.0763789494089309, + "learning_rate": 4.11841436459403e-07, + "loss": 0.0097, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4667 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.36111450195312, + "epoch": 35.09701492537314, + "grad_norm": 0.3701827659721961, + "learning_rate": 4.1179179461888267e-07, + "loss": 0.0004, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4668 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.47222900390625, + "epoch": 35.1044776119403, + "grad_norm": 0.0, + "learning_rate": 4.1174214179939173e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4669 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.36111450195312, + "epoch": 35.11194029850746, + "grad_norm": 0.0, + "learning_rate": 4.1169247800429964e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4670 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.44444274902344, + "epoch": 35.11940298507463, + "grad_norm": 13.936360714017127, + "learning_rate": 4.1164280323697653e-07, + "loss": -0.0007, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 4671 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.80555725097656, + "epoch": 35.12686567164179, + "grad_norm": 0.0, + "learning_rate": 4.115931175007932e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4672 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.61111450195312, + "epoch": 35.134328358208954, + "grad_norm": 0.0, + "learning_rate": 4.115434207991213e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4673 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.05555725097656, + "epoch": 35.14179104477612, + "grad_norm": 0.6159574612732687, + "learning_rate": 4.114937131353332e-07, + "loss": -0.001, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 4674 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.69444274902344, + "epoch": 35.149253731343286, + "grad_norm": 0.0, + "learning_rate": 4.1144399451280197e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4675 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.88888549804688, + "epoch": 35.156716417910445, + "grad_norm": 0.8491168692752942, + "learning_rate": 4.1139426493490143e-07, + "loss": -0.0003, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4676 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.05555725097656, + "epoch": 35.16417910447761, + "grad_norm": 0.0, + "learning_rate": 4.1134452440500613e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4677 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.61111450195312, + "epoch": 35.17164179104478, + "grad_norm": 0.0, + "learning_rate": 4.112947729264915e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4678 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.5833282470703, + "epoch": 35.17910447761194, + "grad_norm": 0.0, + "learning_rate": 4.112450105027335e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4679 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.5, + "epoch": 35.1865671641791, + "grad_norm": 0.0, + "learning_rate": 4.11195237137109e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4680 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.19444274902344, + "epoch": 35.19402985074627, + "grad_norm": 0.0, + "learning_rate": 4.1114545283299564e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4681 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.55555725097656, + "epoch": 35.201492537313435, + "grad_norm": 0.0, + "learning_rate": 4.110956575937715e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4682 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.5277862548828, + "epoch": 35.208955223880594, + "grad_norm": 0.0, + "learning_rate": 4.1104585142281577e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4683 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.9166717529297, + "epoch": 35.21641791044776, + "grad_norm": 0.0, + "learning_rate": 4.1099603432350817e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4684 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.9166717529297, + "epoch": 35.223880597014926, + "grad_norm": 1.0942753817572874, + "learning_rate": 4.109462062992293e-07, + "loss": 0.0, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4685 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.8333282470703, + "epoch": 35.23134328358209, + "grad_norm": 0.0, + "learning_rate": 4.1089636735336024e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4686 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.69444274902344, + "epoch": 35.23880597014925, + "grad_norm": 0.0, + "learning_rate": 4.108465174892832e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4687 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.80555725097656, + "epoch": 35.24626865671642, + "grad_norm": 0.3668543998660397, + "learning_rate": 4.107966567103809e-07, + "loss": -0.0002, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4688 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.6666717529297, + "epoch": 35.25373134328358, + "grad_norm": 0.8046286272353775, + "learning_rate": 4.107467850200367e-07, + "loss": 0.0002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4689 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.7777862548828, + "epoch": 35.26119402985075, + "grad_norm": 0.0, + "learning_rate": 4.106969024216348e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4690 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.05555725097656, + "epoch": 35.26865671641791, + "grad_norm": 0.0, + "learning_rate": 4.1064700891856037e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4691 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.86111450195312, + "epoch": 35.276119402985074, + "grad_norm": 0.0, + "learning_rate": 4.1059710451419893e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4692 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.7777862548828, + "epoch": 35.28358208955224, + "grad_norm": 0.0, + "learning_rate": 4.10547189211937e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4693 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.13888549804688, + "epoch": 35.291044776119406, + "grad_norm": 0.0, + "learning_rate": 4.1049726301516184e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4694 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.1666717529297, + "epoch": 35.298507462686565, + "grad_norm": 0.0, + "learning_rate": 4.1044732592726127e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4695 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.30555725097656, + "epoch": 35.30597014925373, + "grad_norm": 0.6053892425321296, + "learning_rate": 4.103973779516239e-07, + "loss": 0.0022, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 4696 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.0833282470703, + "epoch": 35.3134328358209, + "grad_norm": 0.7927915935940446, + "learning_rate": 4.103474190916393e-07, + "loss": -0.0008, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 4697 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.4166717529297, + "epoch": 35.32089552238806, + "grad_norm": 0.0, + "learning_rate": 4.1029744935069745e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4698 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.9166717529297, + "epoch": 35.32835820895522, + "grad_norm": 0.0, + "learning_rate": 4.102474687321893e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4699 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.13888549804688, + "epoch": 35.33582089552239, + "grad_norm": 0.0, + "learning_rate": 4.1019747723950647e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4700 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.3333282470703, + "epoch": 35.343283582089555, + "grad_norm": 0.0, + "learning_rate": 4.1014747487604134e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4701 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.61111450195312, + "epoch": 35.350746268656714, + "grad_norm": 0.5130809566927167, + "learning_rate": 4.1009746164518696e-07, + "loss": 0.0002, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 4702 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.0, + "epoch": 35.35820895522388, + "grad_norm": 0.0, + "learning_rate": 4.100474375503371e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4703 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.4166717529297, + "epoch": 35.365671641791046, + "grad_norm": 0.0, + "learning_rate": 4.099974025948865e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4704 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.63888549804688, + "epoch": 35.37313432835821, + "grad_norm": 0.0, + "learning_rate": 4.099473567822303e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4705 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.38888549804688, + "epoch": 35.38059701492537, + "grad_norm": 0.0, + "learning_rate": 4.098973001157646e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4706 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.25, + "epoch": 35.38805970149254, + "grad_norm": 0.5815102896058398, + "learning_rate": 4.098472325988862e-07, + "loss": 0.0009, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 4707 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.94444274902344, + "epoch": 35.3955223880597, + "grad_norm": 1.4199915709084308, + "learning_rate": 4.097971542349925e-07, + "loss": 0.0001, + "reward": 1.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4708 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.13888549804688, + "epoch": 35.40298507462686, + "grad_norm": 0.0, + "learning_rate": 4.097470650274819e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4709 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.72222900390625, + "epoch": 35.41044776119403, + "grad_norm": 0.0, + "learning_rate": 4.0969696497975334e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4710 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.75, + "epoch": 35.417910447761194, + "grad_norm": 0.0, + "learning_rate": 4.0964685409520654e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4711 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.0833282470703, + "epoch": 35.42537313432836, + "grad_norm": 0.0, + "learning_rate": 4.0959673237724197e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4712 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.1666717529297, + "epoch": 35.43283582089552, + "grad_norm": 0.0, + "learning_rate": 4.095465998292607e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4713 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.69444274902344, + "epoch": 35.440298507462686, + "grad_norm": 0.0, + "learning_rate": 4.094964564546648e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4714 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.19444274902344, + "epoch": 35.44776119402985, + "grad_norm": 0.4620040488485422, + "learning_rate": 4.094463022568569e-07, + "loss": 0.0002, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4715 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.9166717529297, + "epoch": 35.45522388059702, + "grad_norm": 0.0, + "learning_rate": 4.0939613723924034e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4716 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.9166717529297, + "epoch": 35.46268656716418, + "grad_norm": 0.0, + "learning_rate": 4.0934596140521925e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4717 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.72222900390625, + "epoch": 35.47014925373134, + "grad_norm": 0.0, + "learning_rate": 4.0929577475819853e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4718 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.13888549804688, + "epoch": 35.47761194029851, + "grad_norm": 0.0, + "learning_rate": 4.092455773015839e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4719 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.63888549804688, + "epoch": 35.485074626865675, + "grad_norm": 0.0, + "learning_rate": 4.091953690387815e-07, + "loss": 0.0, + "reward": 1.6666666269302368, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 1.0, + "step": 4720 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.2777862548828, + "epoch": 35.492537313432834, + "grad_norm": 0.0, + "learning_rate": 4.0914514997319836e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4721 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.4166717529297, + "epoch": 35.5, + "grad_norm": 0.0, + "learning_rate": 4.090949201082425e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4722 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.97222900390625, + "epoch": 35.507462686567166, + "grad_norm": 0.0, + "learning_rate": 4.0904467944732233e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4723 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.7777862548828, + "epoch": 35.514925373134325, + "grad_norm": 0.0, + "learning_rate": 4.089944279938471e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4724 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.7777862548828, + "epoch": 35.52238805970149, + "grad_norm": 0.6452709002232948, + "learning_rate": 4.0894416575122677e-07, + "loss": -0.0001, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4725 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.63888549804688, + "epoch": 35.52985074626866, + "grad_norm": 0.0, + "learning_rate": 4.088938927228721e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4726 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.47222900390625, + "epoch": 35.53731343283582, + "grad_norm": 0.0, + "learning_rate": 4.088436089121947e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4727 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.0833282470703, + "epoch": 35.54477611940298, + "grad_norm": 0.0, + "learning_rate": 4.087933143226064e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4728 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.5, + "epoch": 35.55223880597015, + "grad_norm": 0.30670393520571765, + "learning_rate": 4.0874300895752056e-07, + "loss": 0.0002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4729 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.0277862548828, + "epoch": 35.559701492537314, + "grad_norm": 0.0, + "learning_rate": 4.086926928203505e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4730 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.94444274902344, + "epoch": 35.56716417910448, + "grad_norm": 0.3674856462684637, + "learning_rate": 4.086423659145108e-07, + "loss": -0.0, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 4731 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.80555725097656, + "epoch": 35.57462686567164, + "grad_norm": 0.0, + "learning_rate": 4.0859202824341644e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4732 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.30555725097656, + "epoch": 35.582089552238806, + "grad_norm": 0.0, + "learning_rate": 4.0854167981048335e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4733 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.33333587646484, + "epoch": 35.58955223880597, + "grad_norm": 0.0, + "learning_rate": 4.0849132061912804e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4734 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.44444274902344, + "epoch": 35.59701492537313, + "grad_norm": 0.0, + "learning_rate": 4.0844095067276796e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4735 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.05555725097656, + "epoch": 35.6044776119403, + "grad_norm": 0.0, + "learning_rate": 4.0839056997482096e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4736 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.61111450195312, + "epoch": 35.61194029850746, + "grad_norm": 0.0, + "learning_rate": 4.0834017852870587e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4737 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.11111450195312, + "epoch": 35.61940298507463, + "grad_norm": 0.0, + "learning_rate": 4.082897763378422e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4738 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.30555725097656, + "epoch": 35.62686567164179, + "grad_norm": 1.2544230714093392, + "learning_rate": 4.0823936340565035e-07, + "loss": 0.0001, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4739 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.30555725097656, + "epoch": 35.634328358208954, + "grad_norm": 0.0, + "learning_rate": 4.081889397355509e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4740 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.0277862548828, + "epoch": 35.64179104477612, + "grad_norm": 0.0, + "learning_rate": 4.081385053309658e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4741 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.0277862548828, + "epoch": 35.649253731343286, + "grad_norm": 0.0, + "learning_rate": 4.080880601953174e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4742 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.30555725097656, + "epoch": 35.656716417910445, + "grad_norm": 0.0, + "learning_rate": 4.0803760433202885e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4743 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.97222900390625, + "epoch": 35.66417910447761, + "grad_norm": 0.0, + "learning_rate": 4.07987137744524e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4744 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.0277862548828, + "epoch": 35.67164179104478, + "grad_norm": 0.0, + "learning_rate": 4.079366604362273e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4745 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.0277862548828, + "epoch": 35.67910447761194, + "grad_norm": 0.0, + "learning_rate": 4.078861724105644e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4746 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.69444274902344, + "epoch": 35.6865671641791, + "grad_norm": 0.0, + "learning_rate": 4.0783567367096106e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4747 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.5833282470703, + "epoch": 35.69402985074627, + "grad_norm": 0.6418944831257472, + "learning_rate": 4.077851642208442e-07, + "loss": -0.0002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4748 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.5, + "epoch": 35.701492537313435, + "grad_norm": 0.9038193271398609, + "learning_rate": 4.077346440636412e-07, + "loss": 0.0009, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4749 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.5, + "epoch": 35.708955223880594, + "grad_norm": 0.0, + "learning_rate": 4.076841132027805e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4750 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.8333282470703, + "epoch": 35.71641791044776, + "grad_norm": 0.0, + "learning_rate": 4.0763357164169076e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4751 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.6666717529297, + "epoch": 35.723880597014926, + "grad_norm": 0.0, + "learning_rate": 4.075830193838019e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4752 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.44444274902344, + "epoch": 35.73134328358209, + "grad_norm": 0.0, + "learning_rate": 4.075324564325443e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4753 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.80555725097656, + "epoch": 35.73880597014925, + "grad_norm": 0.0, + "learning_rate": 4.07481882791349e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4754 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.0277862548828, + "epoch": 35.74626865671642, + "grad_norm": 0.6588332758919475, + "learning_rate": 4.074312984636479e-07, + "loss": -0.0001, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4755 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.25, + "epoch": 35.75373134328358, + "grad_norm": 0.32151011267834156, + "learning_rate": 4.0738070345287357e-07, + "loss": 0.0, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4756 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.5277862548828, + "epoch": 35.76119402985075, + "grad_norm": 0.0, + "learning_rate": 4.0733009776245937e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4757 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.61111450195312, + "epoch": 35.76865671641791, + "grad_norm": 0.0, + "learning_rate": 4.0727948139583925e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4758 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.5833282470703, + "epoch": 35.776119402985074, + "grad_norm": 0.0, + "learning_rate": 4.0722885435644803e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4759 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.5, + "epoch": 35.78358208955224, + "grad_norm": 0.0, + "learning_rate": 4.0717821664772124e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4760 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.5833282470703, + "epoch": 35.791044776119406, + "grad_norm": 0.9818920969108135, + "learning_rate": 4.071275682730949e-07, + "loss": -0.0001, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4761 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.22222900390625, + "epoch": 35.798507462686565, + "grad_norm": 0.0, + "learning_rate": 4.070769092360061e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4762 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.72222900390625, + "epoch": 35.80597014925373, + "grad_norm": 0.0, + "learning_rate": 4.070262395398926e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4763 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.44444274902344, + "epoch": 35.8134328358209, + "grad_norm": 0.0, + "learning_rate": 4.069755591881925e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4764 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.5, + "epoch": 35.82089552238806, + "grad_norm": 0.507592272714584, + "learning_rate": 4.06924868184345e-07, + "loss": -0.0046, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4765 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.11111450195312, + "epoch": 35.82835820895522, + "grad_norm": 0.0, + "learning_rate": 4.0687416653179006e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4766 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.30555725097656, + "epoch": 35.83582089552239, + "grad_norm": 0.0, + "learning_rate": 4.0682345423396804e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4767 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.19444274902344, + "epoch": 35.843283582089555, + "grad_norm": 0.0, + "learning_rate": 4.0677273129432033e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4768 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.11111450195312, + "epoch": 35.850746268656714, + "grad_norm": 0.0, + "learning_rate": 4.067219977162888e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4769 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.22222900390625, + "epoch": 35.85820895522388, + "grad_norm": 0.0, + "learning_rate": 4.066712535033163e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4770 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.3333282470703, + "epoch": 35.865671641791046, + "grad_norm": 0.0, + "learning_rate": 4.066204986588462e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4771 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.1666717529297, + "epoch": 35.87313432835821, + "grad_norm": 0.0, + "learning_rate": 4.065697331863226e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4772 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.94444274902344, + "epoch": 35.88059701492537, + "grad_norm": 0.0, + "learning_rate": 4.0651895708919055e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4773 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.72222900390625, + "epoch": 35.88805970149254, + "grad_norm": 0.0, + "learning_rate": 4.0646817037089544e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4774 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.47222900390625, + "epoch": 35.8955223880597, + "grad_norm": 0.0, + "learning_rate": 4.064173730348837e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4775 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.0833282470703, + "epoch": 35.90298507462687, + "grad_norm": 0.8473229271583764, + "learning_rate": 4.0636656508460227e-07, + "loss": -0.0013, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 4776 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.61111450195312, + "epoch": 35.91044776119403, + "grad_norm": 0.0, + "learning_rate": 4.063157465234991e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4777 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.86111450195312, + "epoch": 35.917910447761194, + "grad_norm": 0.0, + "learning_rate": 4.0626491735502246e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4778 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.5833282470703, + "epoch": 35.92537313432836, + "grad_norm": 0.0, + "learning_rate": 4.062140775826216e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4779 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.88888549804688, + "epoch": 35.93283582089552, + "grad_norm": 0.0, + "learning_rate": 4.061632272097466e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4780 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.69444274902344, + "epoch": 35.940298507462686, + "grad_norm": 0.0, + "learning_rate": 4.061123662398479e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4781 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.47222900390625, + "epoch": 35.94776119402985, + "grad_norm": 0.0, + "learning_rate": 4.060614946763769e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4782 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.55555725097656, + "epoch": 35.95522388059702, + "grad_norm": 0.0, + "learning_rate": 4.0601061252278573e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4783 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.30555725097656, + "epoch": 35.96268656716418, + "grad_norm": 1.2663125369459927, + "learning_rate": 4.059597197825272e-07, + "loss": -0.0026, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4784 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.69444274902344, + "epoch": 35.97014925373134, + "grad_norm": 0.6908728239809849, + "learning_rate": 4.059088164590547e-07, + "loss": 0.0059, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 4785 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.05555725097656, + "epoch": 35.97761194029851, + "grad_norm": 0.0, + "learning_rate": 4.058579025558226e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4786 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.25, + "epoch": 35.985074626865675, + "grad_norm": 0.0, + "learning_rate": 4.058069780762858e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4787 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.75, + "epoch": 35.992537313432834, + "grad_norm": 0.0, + "learning_rate": 4.057560430238999e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4788 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.5, + "epoch": 36.007462686567166, + "grad_norm": 0.0, + "learning_rate": 4.057050974021213e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4789 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.94444274902344, + "epoch": 36.014925373134325, + "grad_norm": 0.0, + "learning_rate": 4.056541412144072e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4790 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.0, + "epoch": 36.02238805970149, + "grad_norm": 0.0, + "learning_rate": 4.056031744642153e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4791 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.9166717529297, + "epoch": 36.02985074626866, + "grad_norm": 0.6868965823560984, + "learning_rate": 4.055521971550043e-07, + "loss": 0.0016, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4792 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.13888549804688, + "epoch": 36.03731343283582, + "grad_norm": 0.0, + "learning_rate": 4.0550120929023326e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4793 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.38888549804688, + "epoch": 36.04477611940298, + "grad_norm": 0.0, + "learning_rate": 4.054502108733622e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4794 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.47222900390625, + "epoch": 36.05223880597015, + "grad_norm": 0.0, + "learning_rate": 4.0539920190785195e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4795 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.5, + "epoch": 36.059701492537314, + "grad_norm": 0.0, + "learning_rate": 4.053481823971636e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4796 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.5277862548828, + "epoch": 36.06716417910448, + "grad_norm": 1.4303374663293433, + "learning_rate": 4.0529715234475956e-07, + "loss": 0.0, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4797 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.5, + "epoch": 36.07462686567164, + "grad_norm": 0.0, + "learning_rate": 4.052461117541025e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4798 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.38888549804688, + "epoch": 36.082089552238806, + "grad_norm": 0.0, + "learning_rate": 4.05195060628656e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4799 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.05555725097656, + "epoch": 36.08955223880597, + "grad_norm": 0.0, + "learning_rate": 4.051439989718844e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4800 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.86111450195312, + "epoch": 36.09701492537314, + "grad_norm": 0.0, + "learning_rate": 4.050929267872527e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4801 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.5277862548828, + "epoch": 36.1044776119403, + "grad_norm": 0.0, + "learning_rate": 4.050418440782264e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4802 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.36111450195312, + "epoch": 36.11194029850746, + "grad_norm": 0.6311058161704722, + "learning_rate": 4.0499075084827194e-07, + "loss": -0.0008, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4803 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.11111450195312, + "epoch": 36.11940298507463, + "grad_norm": 0.0, + "learning_rate": 4.049396471008566e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4804 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.9166717529297, + "epoch": 36.12686567164179, + "grad_norm": 0.7909208220389734, + "learning_rate": 4.04888532839448e-07, + "loss": 0.0004, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4805 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.86111450195312, + "epoch": 36.134328358208954, + "grad_norm": 0.0, + "learning_rate": 4.0483740806751486e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4806 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.61111450195312, + "epoch": 36.14179104477612, + "grad_norm": 0.0, + "learning_rate": 4.047862727885264e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4807 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.25, + "epoch": 36.149253731343286, + "grad_norm": 0.0, + "learning_rate": 4.047351270059525e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4808 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.63888549804688, + "epoch": 36.156716417910445, + "grad_norm": 0.0, + "learning_rate": 4.0468397072326396e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4809 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.0833282470703, + "epoch": 36.16417910447761, + "grad_norm": 0.0, + "learning_rate": 4.046328039439321e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4810 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.19444274902344, + "epoch": 36.17164179104478, + "grad_norm": 0.0, + "learning_rate": 4.0458162667142905e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4811 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.94444274902344, + "epoch": 36.17910447761194, + "grad_norm": 3.5171400781950553, + "learning_rate": 4.0453043890922754e-07, + "loss": 0.0086, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4812 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.19444274902344, + "epoch": 36.1865671641791, + "grad_norm": 0.0, + "learning_rate": 4.044792406608013e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4813 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.9166717529297, + "epoch": 36.19402985074627, + "grad_norm": 0.0, + "learning_rate": 4.0442803192962436e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4814 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.9166717529297, + "epoch": 36.201492537313435, + "grad_norm": 0.0, + "learning_rate": 4.043768127191719e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4815 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.55555725097656, + "epoch": 36.208955223880594, + "grad_norm": 0.0, + "learning_rate": 4.0432558303291934e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4816 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.75, + "epoch": 36.21641791044776, + "grad_norm": 0.0, + "learning_rate": 4.0427434287434323e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4817 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.36111450195312, + "epoch": 36.223880597014926, + "grad_norm": 0.0, + "learning_rate": 4.042230922469207e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4818 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.8333282470703, + "epoch": 36.23134328358209, + "grad_norm": 0.0, + "learning_rate": 4.041718311541293e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4819 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.6666717529297, + "epoch": 36.23880597014925, + "grad_norm": 7.54334889432081, + "learning_rate": 4.041205595994478e-07, + "loss": -0.0031, + "reward": 1.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 4820 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.69444274902344, + "epoch": 36.24626865671642, + "grad_norm": 0.0, + "learning_rate": 4.040692775863553e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4821 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.0, + "epoch": 36.25373134328358, + "grad_norm": 0.0, + "learning_rate": 4.040179851183317e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4822 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.5277862548828, + "epoch": 36.26119402985075, + "grad_norm": 0.0, + "learning_rate": 4.039666821988577e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4823 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.9166717529297, + "epoch": 36.26865671641791, + "grad_norm": 0.0, + "learning_rate": 4.039153688314145e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4824 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.0277862548828, + "epoch": 36.276119402985074, + "grad_norm": 0.0, + "learning_rate": 4.0386404501948443e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4825 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.5, + "epoch": 36.28358208955224, + "grad_norm": 0.0, + "learning_rate": 4.0381271076655004e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4826 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.94444274902344, + "epoch": 36.291044776119406, + "grad_norm": 0.0, + "learning_rate": 4.037613660760949e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4827 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.25, + "epoch": 36.298507462686565, + "grad_norm": 0.0, + "learning_rate": 4.037100109516031e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4828 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.5, + "epoch": 36.30597014925373, + "grad_norm": 0.0, + "learning_rate": 4.036586453965596e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4829 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.1388931274414, + "epoch": 36.3134328358209, + "grad_norm": 0.0, + "learning_rate": 4.0360726941445e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4830 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.38888549804688, + "epoch": 36.32089552238806, + "grad_norm": 0.0, + "learning_rate": 4.0355588300876053e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4831 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.88888549804688, + "epoch": 36.32835820895522, + "grad_norm": 0.0, + "learning_rate": 4.035044861829783e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4832 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.5277862548828, + "epoch": 36.33582089552239, + "grad_norm": 0.0, + "learning_rate": 4.03453078940591e-07, + "loss": 0.0, + "reward": 1.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 1.0, + "step": 4833 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.80555725097656, + "epoch": 36.343283582089555, + "grad_norm": 0.0, + "learning_rate": 4.0340166128508695e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4834 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.30555725097656, + "epoch": 36.350746268656714, + "grad_norm": 0.0, + "learning_rate": 4.033502332199554e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4835 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.72222900390625, + "epoch": 36.35820895522388, + "grad_norm": 0.0, + "learning_rate": 4.032987947486862e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4836 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.55555725097656, + "epoch": 36.365671641791046, + "grad_norm": 0.0, + "learning_rate": 4.0324734587476985e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4837 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.38888549804688, + "epoch": 36.37313432835821, + "grad_norm": 0.0, + "learning_rate": 4.0319588660169755e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4838 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.5, + "epoch": 36.38059701492537, + "grad_norm": 0.0, + "learning_rate": 4.0314441693296134e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4839 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.36111450195312, + "epoch": 36.38805970149254, + "grad_norm": 0.0, + "learning_rate": 4.030929368720539e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4840 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.63888549804688, + "epoch": 36.3955223880597, + "grad_norm": 0.0, + "learning_rate": 4.030414464224685e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4841 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.2777862548828, + "epoch": 36.40298507462686, + "grad_norm": 0.0, + "learning_rate": 4.029899455876992e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4842 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.55555725097656, + "epoch": 36.41044776119403, + "grad_norm": 0.0, + "learning_rate": 4.029384343712409e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4843 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.4166717529297, + "epoch": 36.417910447761194, + "grad_norm": 0.4226501046490397, + "learning_rate": 4.0288691277658903e-07, + "loss": -0.0006, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4844 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.05555725097656, + "epoch": 36.42537313432836, + "grad_norm": 0.33969310854765256, + "learning_rate": 4.0283538080723976e-07, + "loss": -0.0001, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4845 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.61111450195312, + "epoch": 36.43283582089552, + "grad_norm": 0.6101657830403441, + "learning_rate": 4.0278383846668996e-07, + "loss": 0.0002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4846 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.0277862548828, + "epoch": 36.440298507462686, + "grad_norm": 0.0, + "learning_rate": 4.027322857584372e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4847 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.75, + "epoch": 36.44776119402985, + "grad_norm": 0.0, + "learning_rate": 4.026807226859799e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4848 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.19444274902344, + "epoch": 36.45522388059702, + "grad_norm": 0.0, + "learning_rate": 4.0262914925281687e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4849 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.0, + "epoch": 36.46268656716418, + "grad_norm": 0.0, + "learning_rate": 4.02577565462448e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4850 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.80555725097656, + "epoch": 36.47014925373134, + "grad_norm": 0.0, + "learning_rate": 4.0252597131837363e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4851 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.5277862548828, + "epoch": 36.47761194029851, + "grad_norm": 0.0, + "learning_rate": 4.0247436682409486e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4852 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.63888549804688, + "epoch": 36.485074626865675, + "grad_norm": 0.0, + "learning_rate": 4.0242275198311345e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4853 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.05555725097656, + "epoch": 36.492537313432834, + "grad_norm": 0.0, + "learning_rate": 4.0237112679893194e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4854 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.2777862548828, + "epoch": 36.5, + "grad_norm": 1.5919424282295607, + "learning_rate": 4.023194912750536e-07, + "loss": 0.0012, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4855 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.36111450195312, + "epoch": 36.507462686567166, + "grad_norm": 0.0, + "learning_rate": 4.0226784541498233e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4856 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.47222900390625, + "epoch": 36.514925373134325, + "grad_norm": 0.0, + "learning_rate": 4.0221618922222264e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4857 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.86111450195312, + "epoch": 36.52238805970149, + "grad_norm": 0.0, + "learning_rate": 4.021645227002801e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4858 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.75, + "epoch": 36.52985074626866, + "grad_norm": 0.0, + "learning_rate": 4.0211284585266046e-07, + "loss": 0.0, + "reward": 1.6666666269302368, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 1.0, + "step": 4859 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.72222900390625, + "epoch": 36.53731343283582, + "grad_norm": 0.0, + "learning_rate": 4.020611586828705e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4860 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.86111450195312, + "epoch": 36.54477611940298, + "grad_norm": 0.0, + "learning_rate": 4.0200946119441777e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4861 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.8333282470703, + "epoch": 36.55223880597015, + "grad_norm": 0.0, + "learning_rate": 4.019577533908103e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4862 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.30555725097656, + "epoch": 36.559701492537314, + "grad_norm": 0.0, + "learning_rate": 4.019060352755569e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4863 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.63888549804688, + "epoch": 36.56716417910448, + "grad_norm": 0.0, + "learning_rate": 4.018543068521671e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4864 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.19444274902344, + "epoch": 36.57462686567164, + "grad_norm": 0.0, + "learning_rate": 4.0180256812415116e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4865 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.55555725097656, + "epoch": 36.582089552238806, + "grad_norm": 0.8018422608279523, + "learning_rate": 4.0175081909501996e-07, + "loss": -0.0028, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 4866 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.97222900390625, + "epoch": 36.58955223880597, + "grad_norm": 0.0, + "learning_rate": 4.016990597682851e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4867 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.8333282470703, + "epoch": 36.59701492537313, + "grad_norm": 0.0, + "learning_rate": 4.016472901474589e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4868 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.47222900390625, + "epoch": 36.6044776119403, + "grad_norm": 0.0, + "learning_rate": 4.015955102360545e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4869 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.47222900390625, + "epoch": 36.61194029850746, + "grad_norm": 0.0, + "learning_rate": 4.015437200375854e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4870 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.19444274902344, + "epoch": 36.61940298507463, + "grad_norm": 0.32622571062965167, + "learning_rate": 4.014919195555662e-07, + "loss": -0.0007, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4871 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.5, + "epoch": 36.62686567164179, + "grad_norm": 0.0, + "learning_rate": 4.01440108793512e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4872 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.55555725097656, + "epoch": 36.634328358208954, + "grad_norm": 0.0, + "learning_rate": 4.013882877549385e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4873 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.80555725097656, + "epoch": 36.64179104477612, + "grad_norm": 0.0, + "learning_rate": 4.0133645644336217e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4874 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.1666717529297, + "epoch": 36.649253731343286, + "grad_norm": 0.0, + "learning_rate": 4.0128461486230036e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4875 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.55555725097656, + "epoch": 36.656716417910445, + "grad_norm": 0.7484878825398523, + "learning_rate": 4.0123276301527086e-07, + "loss": -0.0064, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4876 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.25, + "epoch": 36.66417910447761, + "grad_norm": 0.0, + "learning_rate": 4.011809009057924e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4877 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.25, + "epoch": 36.67164179104478, + "grad_norm": 0.0, + "learning_rate": 4.011290285373841e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4878 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.75, + "epoch": 36.67910447761194, + "grad_norm": 0.0, + "learning_rate": 4.01077145913566e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4879 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.80555725097656, + "epoch": 36.6865671641791, + "grad_norm": 0.0, + "learning_rate": 4.010252530378589e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4880 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.6666717529297, + "epoch": 36.69402985074627, + "grad_norm": 0.0, + "learning_rate": 4.00973349913784e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4881 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.2777862548828, + "epoch": 36.701492537313435, + "grad_norm": 0.0, + "learning_rate": 4.0092143654486357e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4882 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.11111450195312, + "epoch": 36.708955223880594, + "grad_norm": 0.0, + "learning_rate": 4.008695129346202e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4883 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.36111450195312, + "epoch": 36.71641791044776, + "grad_norm": 0.0, + "learning_rate": 4.008175790865774e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4884 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.0277862548828, + "epoch": 36.723880597014926, + "grad_norm": 0.0, + "learning_rate": 4.0076563500425947e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4885 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.25, + "epoch": 36.73134328358209, + "grad_norm": 0.0, + "learning_rate": 4.0071368069119117e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4886 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.55555725097656, + "epoch": 36.73880597014925, + "grad_norm": 0.0, + "learning_rate": 4.0066171615089793e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4887 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.5833282470703, + "epoch": 36.74626865671642, + "grad_norm": 0.0, + "learning_rate": 4.006097413869061e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4888 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.72222137451172, + "epoch": 36.75373134328358, + "grad_norm": 0.0, + "learning_rate": 4.005577564027427e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4889 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.5277862548828, + "epoch": 36.76119402985075, + "grad_norm": 0.0, + "learning_rate": 4.0050576120193525e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4890 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.88888549804688, + "epoch": 36.76865671641791, + "grad_norm": 0.0, + "learning_rate": 4.004537557880121e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4891 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.05555725097656, + "epoch": 36.776119402985074, + "grad_norm": 0.0, + "learning_rate": 4.004017401645022e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4892 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.7777862548828, + "epoch": 36.78358208955224, + "grad_norm": 0.0, + "learning_rate": 4.003497143349354e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4893 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.1666717529297, + "epoch": 36.791044776119406, + "grad_norm": 0.0, + "learning_rate": 4.00297678302842e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4894 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.72222900390625, + "epoch": 36.798507462686565, + "grad_norm": 0.0, + "learning_rate": 4.002456320717531e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4895 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.0277862548828, + "epoch": 36.80597014925373, + "grad_norm": 0.0, + "learning_rate": 4.001935756452005e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4896 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.55555725097656, + "epoch": 36.8134328358209, + "grad_norm": 0.0, + "learning_rate": 4.0014150902671675e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4897 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.44444274902344, + "epoch": 36.82089552238806, + "grad_norm": 0.0, + "learning_rate": 4.0008943221983494e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4898 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.19444274902344, + "epoch": 36.82835820895522, + "grad_norm": 0.0, + "learning_rate": 4.000373452280889e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4899 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.61111450195312, + "epoch": 36.83582089552239, + "grad_norm": 0.0, + "learning_rate": 3.9998524805501325e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4900 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.0, + "epoch": 36.843283582089555, + "grad_norm": 0.0, + "learning_rate": 3.9993314070414316e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4901 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.0, + "epoch": 36.850746268656714, + "grad_norm": 0.0, + "learning_rate": 3.998810231790147e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4902 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.0277862548828, + "epoch": 36.85820895522388, + "grad_norm": 0.0, + "learning_rate": 3.998288954831644e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4903 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.0833282470703, + "epoch": 36.865671641791046, + "grad_norm": 0.0, + "learning_rate": 3.9977675762012954e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4904 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.61111450195312, + "epoch": 36.87313432835821, + "grad_norm": 0.0, + "learning_rate": 3.9972460959344825e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4905 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.44444274902344, + "epoch": 36.88059701492537, + "grad_norm": 0.0, + "learning_rate": 3.9967245140665906e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4906 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.86111450195312, + "epoch": 36.88805970149254, + "grad_norm": 0.0, + "learning_rate": 3.996202830633014e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4907 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.97222900390625, + "epoch": 36.8955223880597, + "grad_norm": 0.0, + "learning_rate": 3.995681045669155e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4908 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.5833282470703, + "epoch": 36.90298507462687, + "grad_norm": 0.0, + "learning_rate": 3.995159159210419e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4909 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.63888549804688, + "epoch": 36.91044776119403, + "grad_norm": 0.940506898476477, + "learning_rate": 3.994637171292222e-07, + "loss": -0.007, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4910 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.8333282470703, + "epoch": 36.917910447761194, + "grad_norm": 0.0, + "learning_rate": 3.994115081949985e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4911 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.2777862548828, + "epoch": 36.92537313432836, + "grad_norm": 0.0, + "learning_rate": 3.993592891219135e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4912 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.22222900390625, + "epoch": 36.93283582089552, + "grad_norm": 0.0, + "learning_rate": 3.9930705991351097e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4913 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.38888549804688, + "epoch": 36.940298507462686, + "grad_norm": 0.0, + "learning_rate": 3.9925482057333494e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4914 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.72222900390625, + "epoch": 36.94776119402985, + "grad_norm": 0.0, + "learning_rate": 3.9920257110493027e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4915 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.7777862548828, + "epoch": 36.95522388059702, + "grad_norm": 0.0, + "learning_rate": 3.991503115118426e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4916 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.5833282470703, + "epoch": 36.96268656716418, + "grad_norm": 0.0, + "learning_rate": 3.990980417976184e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4917 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.97222900390625, + "epoch": 36.97014925373134, + "grad_norm": 0.0, + "learning_rate": 3.990457619658042e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4918 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.0, + "epoch": 36.97761194029851, + "grad_norm": 0.0, + "learning_rate": 3.9899347201994794e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4919 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.8333282470703, + "epoch": 36.985074626865675, + "grad_norm": 0.0, + "learning_rate": 3.989411719635978e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4920 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.0, + "epoch": 36.992537313432834, + "grad_norm": 1.4078760718402945, + "learning_rate": 3.988888618003029e-07, + "loss": 0.0005, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4921 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.75, + "epoch": 37.007462686567166, + "grad_norm": 0.0, + "learning_rate": 3.988365415336129e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4922 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.63888549804688, + "epoch": 37.014925373134325, + "grad_norm": 0.0, + "learning_rate": 3.9878421116707815e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4923 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.0277862548828, + "epoch": 37.02238805970149, + "grad_norm": 0.0, + "learning_rate": 3.9873187070424975e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4924 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.75, + "epoch": 37.02985074626866, + "grad_norm": 0.0, + "learning_rate": 3.9867952014867945e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4925 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.1666717529297, + "epoch": 37.03731343283582, + "grad_norm": 0.0, + "learning_rate": 3.986271595039196e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4926 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.47222900390625, + "epoch": 37.04477611940298, + "grad_norm": 0.0, + "learning_rate": 3.9857478877352346e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4927 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.6666717529297, + "epoch": 37.05223880597015, + "grad_norm": 0.0, + "learning_rate": 3.9852240796104474e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4928 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.9166717529297, + "epoch": 37.059701492537314, + "grad_norm": 0.0, + "learning_rate": 3.9847001707003803e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4929 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.30555725097656, + "epoch": 37.06716417910448, + "grad_norm": 0.0, + "learning_rate": 3.9841761610405845e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4930 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.5833282470703, + "epoch": 37.07462686567164, + "grad_norm": 0.0, + "learning_rate": 3.983652050666617e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4931 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.25, + "epoch": 37.082089552238806, + "grad_norm": 0.0, + "learning_rate": 3.983127839614046e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4932 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.69444274902344, + "epoch": 37.08955223880597, + "grad_norm": 0.0, + "learning_rate": 3.982603527918442e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4933 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.0, + "epoch": 37.09701492537314, + "grad_norm": 0.0, + "learning_rate": 3.9820791156153854e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4934 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.30555725097656, + "epoch": 37.1044776119403, + "grad_norm": 0.0, + "learning_rate": 3.9815546027404603e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4935 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.61111450195312, + "epoch": 37.11194029850746, + "grad_norm": 0.0, + "learning_rate": 3.9810299893292597e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4936 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.61111450195312, + "epoch": 37.11940298507463, + "grad_norm": 0.0, + "learning_rate": 3.980505275417385e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4937 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.0, + "epoch": 37.12686567164179, + "grad_norm": 0.0, + "learning_rate": 3.9799804610404407e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4938 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.86111450195312, + "epoch": 37.134328358208954, + "grad_norm": 0.0, + "learning_rate": 3.9794555462340413e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4939 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.7777862548828, + "epoch": 37.14179104477612, + "grad_norm": 0.0, + "learning_rate": 3.978930531033806e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4940 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.38888549804688, + "epoch": 37.149253731343286, + "grad_norm": 0.0, + "learning_rate": 3.978405415475361e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4941 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.69444274902344, + "epoch": 37.156716417910445, + "grad_norm": 0.0, + "learning_rate": 3.9778801995943414e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4942 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.5277862548828, + "epoch": 37.16417910447761, + "grad_norm": 0.3264013046749512, + "learning_rate": 3.977354883426387e-07, + "loss": 0.0, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4943 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.5277862548828, + "epoch": 37.17164179104478, + "grad_norm": 0.0, + "learning_rate": 3.976829467007145e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4944 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.25, + "epoch": 37.17910447761194, + "grad_norm": 0.0, + "learning_rate": 3.9763039503722694e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4945 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.5833282470703, + "epoch": 37.1865671641791, + "grad_norm": 0.0, + "learning_rate": 3.97577833355742e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4946 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.13888549804688, + "epoch": 37.19402985074627, + "grad_norm": 0.0, + "learning_rate": 3.975252616598267e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4947 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.11111450195312, + "epoch": 37.201492537313435, + "grad_norm": 0.0, + "learning_rate": 3.9747267995304823e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4948 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.5, + "epoch": 37.208955223880594, + "grad_norm": 0.0, + "learning_rate": 3.9742008823897486e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4949 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.5, + "epoch": 37.21641791044776, + "grad_norm": 0.0, + "learning_rate": 3.9736748652117535e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4950 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.0, + "epoch": 37.223880597014926, + "grad_norm": 0.0, + "learning_rate": 3.973148748032192e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4951 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.55555725097656, + "epoch": 37.23134328358209, + "grad_norm": 0.37727688977680857, + "learning_rate": 3.972622530886765e-07, + "loss": -0.0002, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 4952 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.5833282470703, + "epoch": 37.23880597014925, + "grad_norm": 0.0, + "learning_rate": 3.9720962138111813e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4953 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.69444274902344, + "epoch": 37.24626865671642, + "grad_norm": 0.0, + "learning_rate": 3.971569796841157e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4954 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.2777862548828, + "epoch": 37.25373134328358, + "grad_norm": 0.0, + "learning_rate": 3.971043280012413e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4955 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.94444274902344, + "epoch": 37.26119402985075, + "grad_norm": 0.0, + "learning_rate": 3.9705166633606766e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4956 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.6666717529297, + "epoch": 37.26865671641791, + "grad_norm": 0.0, + "learning_rate": 3.9699899469216867e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4957 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.88888549804688, + "epoch": 37.276119402985074, + "grad_norm": 0.0, + "learning_rate": 3.9694631307311825e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4958 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.88888549804688, + "epoch": 37.28358208955224, + "grad_norm": 0.0, + "learning_rate": 3.968936214824915e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4959 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.75, + "epoch": 37.291044776119406, + "grad_norm": 0.0, + "learning_rate": 3.9684091992386386e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4960 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.97222900390625, + "epoch": 37.298507462686565, + "grad_norm": 0.0, + "learning_rate": 3.967882084008117e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4961 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.2777862548828, + "epoch": 37.30597014925373, + "grad_norm": 0.0, + "learning_rate": 3.967354869169119e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4962 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.69444274902344, + "epoch": 37.3134328358209, + "grad_norm": 0.0, + "learning_rate": 3.966827554757421e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4963 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.97222900390625, + "epoch": 37.32089552238806, + "grad_norm": 0.0, + "learning_rate": 3.966300140808805e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4964 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.1666717529297, + "epoch": 37.32835820895522, + "grad_norm": 0.0, + "learning_rate": 3.9657726273590613e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4965 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.9166717529297, + "epoch": 37.33582089552239, + "grad_norm": 0.9342877092684388, + "learning_rate": 3.965245014443986e-07, + "loss": 0.0003, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4966 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.88888549804688, + "epoch": 37.343283582089555, + "grad_norm": 0.5102135191326692, + "learning_rate": 3.9647173020993826e-07, + "loss": -0.0004, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4967 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.61111450195312, + "epoch": 37.350746268656714, + "grad_norm": 0.0, + "learning_rate": 3.96418949036106e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4968 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.0833282470703, + "epoch": 37.35820895522388, + "grad_norm": 0.0, + "learning_rate": 3.963661579264836e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4969 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.47222900390625, + "epoch": 37.365671641791046, + "grad_norm": 0.0, + "learning_rate": 3.9631335688465326e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4970 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.22222900390625, + "epoch": 37.37313432835821, + "grad_norm": 0.5608936439738702, + "learning_rate": 3.9626054591419815e-07, + "loss": -0.0002, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 4971 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.3333282470703, + "epoch": 37.38059701492537, + "grad_norm": 0.0, + "learning_rate": 3.9620772501870183e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4972 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.63888549804688, + "epoch": 37.38805970149254, + "grad_norm": 0.0, + "learning_rate": 3.961548942017486e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4973 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.2777862548828, + "epoch": 37.3955223880597, + "grad_norm": 0.0, + "learning_rate": 3.961020534669237e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4974 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.44444274902344, + "epoch": 37.40298507462686, + "grad_norm": 0.0, + "learning_rate": 3.960492028178126e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4975 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.4166717529297, + "epoch": 37.41044776119403, + "grad_norm": 0.0, + "learning_rate": 3.959963422580018e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4976 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.5277862548828, + "epoch": 37.417910447761194, + "grad_norm": 0.6702344243304811, + "learning_rate": 3.9594347179107835e-07, + "loss": -0.0015, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4977 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.55555725097656, + "epoch": 37.42537313432836, + "grad_norm": 0.0, + "learning_rate": 3.9589059142062995e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4978 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.69444274902344, + "epoch": 37.43283582089552, + "grad_norm": 0.0, + "learning_rate": 3.9583770115024496e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4979 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.44444274902344, + "epoch": 37.440298507462686, + "grad_norm": 0.0, + "learning_rate": 3.957848009835124e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 4980 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.0277862548828, + "epoch": 37.44776119402985, + "grad_norm": 0.0, + "learning_rate": 3.9573189092402214e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4981 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.25, + "epoch": 37.45522388059702, + "grad_norm": 0.0, + "learning_rate": 3.9567897097536445e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4982 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.5277862548828, + "epoch": 37.46268656716418, + "grad_norm": 0.0, + "learning_rate": 3.9562604114113045e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4983 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.97222900390625, + "epoch": 37.47014925373134, + "grad_norm": 0.0, + "learning_rate": 3.95573101424912e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4984 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.5, + "epoch": 37.47761194029851, + "grad_norm": 0.9227835827486263, + "learning_rate": 3.955201518303013e-07, + "loss": -0.0005, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4985 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.9166717529297, + "epoch": 37.485074626865675, + "grad_norm": 0.0, + "learning_rate": 3.9546719236089153e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4986 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.94444274902344, + "epoch": 37.492537313432834, + "grad_norm": 0.0, + "learning_rate": 3.9541422302027663e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4987 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.8333282470703, + "epoch": 37.5, + "grad_norm": 0.0, + "learning_rate": 3.953612438120507e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4988 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.55555725097656, + "epoch": 37.507462686567166, + "grad_norm": 0.666005609258619, + "learning_rate": 3.953082547398091e-07, + "loss": 0.0005, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 4989 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.19444274902344, + "epoch": 37.514925373134325, + "grad_norm": 0.8367631863781039, + "learning_rate": 3.952552558071475e-07, + "loss": 0.0004, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 4990 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.5, + "epoch": 37.52238805970149, + "grad_norm": 0.0, + "learning_rate": 3.9520224701766227e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4991 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.47222900390625, + "epoch": 37.52985074626866, + "grad_norm": 0.0, + "learning_rate": 3.9514922837495067e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 4992 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.75, + "epoch": 37.53731343283582, + "grad_norm": 0.0, + "learning_rate": 3.950961998826103e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4993 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.0, + "epoch": 37.54477611940298, + "grad_norm": 0.0, + "learning_rate": 3.950431615442397e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4994 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.63888549804688, + "epoch": 37.55223880597015, + "grad_norm": 0.0, + "learning_rate": 3.94990113363438e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4995 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.69444274902344, + "epoch": 37.559701492537314, + "grad_norm": 0.0, + "learning_rate": 3.9493705534380493e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4996 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.25, + "epoch": 37.56716417910448, + "grad_norm": 0.0, + "learning_rate": 3.9488398748894094e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4997 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.5833282470703, + "epoch": 37.57462686567164, + "grad_norm": 0.0, + "learning_rate": 3.948309098024471e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4998 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.7777862548828, + "epoch": 37.582089552238806, + "grad_norm": 0.0, + "learning_rate": 3.9477782228792527e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 4999 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.05555725097656, + "epoch": 37.58955223880597, + "grad_norm": 0.0, + "learning_rate": 3.947247249489779e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5000 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.69444274902344, + "epoch": 37.59701492537313, + "grad_norm": 0.0, + "learning_rate": 3.94671617789208e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5001 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.30555725097656, + "epoch": 37.6044776119403, + "grad_norm": 0.0, + "learning_rate": 3.946185008122194e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5002 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.75, + "epoch": 37.61194029850746, + "grad_norm": 0.0, + "learning_rate": 3.945653740216166e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5003 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.75, + "epoch": 37.61940298507463, + "grad_norm": 0.0, + "learning_rate": 3.945122374210047e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5004 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.36111450195312, + "epoch": 37.62686567164179, + "grad_norm": 3.585473889827824, + "learning_rate": 3.944590910139894e-07, + "loss": -0.0137, + "reward": 1.9444444179534912, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 5005 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.38888549804688, + "epoch": 37.634328358208954, + "grad_norm": 0.0, + "learning_rate": 3.944059348041773e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5006 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.38888549804688, + "epoch": 37.64179104477612, + "grad_norm": 0.0, + "learning_rate": 3.943527687951753e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5007 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.47222900390625, + "epoch": 37.649253731343286, + "grad_norm": 0.0, + "learning_rate": 3.9429959299059125e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5008 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.7777862548828, + "epoch": 37.656716417910445, + "grad_norm": 0.0, + "learning_rate": 3.9424640739403365e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5009 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.5833282470703, + "epoch": 37.66417910447761, + "grad_norm": 0.0, + "learning_rate": 3.9419321200911155e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5010 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.75, + "epoch": 37.67164179104478, + "grad_norm": 0.0, + "learning_rate": 3.941400068394347e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5011 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.19444274902344, + "epoch": 37.67910447761194, + "grad_norm": 0.7856657222074477, + "learning_rate": 3.9408679188861365e-07, + "loss": 0.0, + "reward": 1.7222222089767456, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.7222222089767456, + "rewards/format_reward": 1.0, + "step": 5012 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.1666717529297, + "epoch": 37.6865671641791, + "grad_norm": 0.0, + "learning_rate": 3.940335671602594e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5013 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.4166717529297, + "epoch": 37.69402985074627, + "grad_norm": 0.0, + "learning_rate": 3.939803326579836e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5014 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.5277862548828, + "epoch": 37.701492537313435, + "grad_norm": 0.0, + "learning_rate": 3.939270883853989e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5015 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.33333587646484, + "epoch": 37.708955223880594, + "grad_norm": 0.0, + "learning_rate": 3.938738343461182e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5016 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.9166717529297, + "epoch": 37.71641791044776, + "grad_norm": 0.0, + "learning_rate": 3.938205705437553e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5017 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.86111450195312, + "epoch": 37.723880597014926, + "grad_norm": 7.574282000440789, + "learning_rate": 3.937672969819246e-07, + "loss": 0.0016, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5018 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.36111450195312, + "epoch": 37.73134328358209, + "grad_norm": 0.0, + "learning_rate": 3.937140136642413e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5019 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.4166717529297, + "epoch": 37.73880597014925, + "grad_norm": 0.0, + "learning_rate": 3.936607205943209e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5020 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.22222900390625, + "epoch": 37.74626865671642, + "grad_norm": 0.0, + "learning_rate": 3.9360741777578e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5021 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.61111450195312, + "epoch": 37.75373134328358, + "grad_norm": 0.0, + "learning_rate": 3.9355410521223553e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5022 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.6666717529297, + "epoch": 37.76119402985075, + "grad_norm": 0.8164786133397702, + "learning_rate": 3.9350078290730527e-07, + "loss": 0.0001, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5023 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.44444274902344, + "epoch": 37.76865671641791, + "grad_norm": 0.0, + "learning_rate": 3.9344745086460764e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5024 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.88888549804688, + "epoch": 37.776119402985074, + "grad_norm": 0.0, + "learning_rate": 3.933941090877615e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5025 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.13888549804688, + "epoch": 37.78358208955224, + "grad_norm": 0.0, + "learning_rate": 3.9334075758038676e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5026 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.11111450195312, + "epoch": 37.791044776119406, + "grad_norm": 0.0, + "learning_rate": 3.932873963461036e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5027 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.63888549804688, + "epoch": 37.798507462686565, + "grad_norm": 0.0, + "learning_rate": 3.932340253885333e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5028 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.13888549804688, + "epoch": 37.80597014925373, + "grad_norm": 0.0, + "learning_rate": 3.931806447112973e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5029 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.80555725097656, + "epoch": 37.8134328358209, + "grad_norm": 1.7912971032579397, + "learning_rate": 3.9312725431801796e-07, + "loss": -0.0002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5030 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.19444274902344, + "epoch": 37.82089552238806, + "grad_norm": 0.0, + "learning_rate": 3.9307385421231835e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5031 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.30555725097656, + "epoch": 37.82835820895522, + "grad_norm": 0.0, + "learning_rate": 3.9302044439782213e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5032 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.44444274902344, + "epoch": 37.83582089552239, + "grad_norm": 0.0, + "learning_rate": 3.929670248781537e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5033 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.72222900390625, + "epoch": 37.843283582089555, + "grad_norm": 0.0, + "learning_rate": 3.9291359565693785e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5034 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.80555725097656, + "epoch": 37.850746268656714, + "grad_norm": 0.0, + "learning_rate": 3.928601567378003e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5035 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.72222900390625, + "epoch": 37.85820895522388, + "grad_norm": 0.355140069369442, + "learning_rate": 3.928067081243672e-07, + "loss": -0.005, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 5036 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.4166717529297, + "epoch": 37.865671641791046, + "grad_norm": 0.0, + "learning_rate": 3.927532498202658e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5037 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.86111450195312, + "epoch": 37.87313432835821, + "grad_norm": 0.0, + "learning_rate": 3.9269978182912356e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5038 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.47222900390625, + "epoch": 37.88059701492537, + "grad_norm": 0.0, + "learning_rate": 3.926463041545687e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5039 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.05555725097656, + "epoch": 37.88805970149254, + "grad_norm": 1.2426387908802632, + "learning_rate": 3.9259281680023014e-07, + "loss": 0.0005, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5040 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.88888549804688, + "epoch": 37.8955223880597, + "grad_norm": 0.0, + "learning_rate": 3.9253931976973755e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5041 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.86111450195312, + "epoch": 37.90298507462687, + "grad_norm": 0.0, + "learning_rate": 3.92485813066721e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5042 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.9166717529297, + "epoch": 37.91044776119403, + "grad_norm": 1.6131553451947847, + "learning_rate": 3.924322966948117e-07, + "loss": 0.0, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 5043 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.5277862548828, + "epoch": 37.917910447761194, + "grad_norm": 0.0, + "learning_rate": 3.923787706576408e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5044 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.9166717529297, + "epoch": 37.92537313432836, + "grad_norm": 0.0, + "learning_rate": 3.923252349588408e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5045 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.6666717529297, + "epoch": 37.93283582089552, + "grad_norm": 0.0, + "learning_rate": 3.922716896020444e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5046 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.05555725097656, + "epoch": 37.940298507462686, + "grad_norm": 0.0, + "learning_rate": 3.9221813459088516e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5047 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.94444274902344, + "epoch": 37.94776119402985, + "grad_norm": 0.0, + "learning_rate": 3.9216456992899724e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5048 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.36111450195312, + "epoch": 37.95522388059702, + "grad_norm": 0.0, + "learning_rate": 3.921109956200156e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5049 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.61111450195312, + "epoch": 37.96268656716418, + "grad_norm": 0.0, + "learning_rate": 3.920574116675756e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5050 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.72222900390625, + "epoch": 37.97014925373134, + "grad_norm": 0.0, + "learning_rate": 3.9200381807531324e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5051 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.5, + "epoch": 37.97761194029851, + "grad_norm": 0.4781918891666664, + "learning_rate": 3.9195021484686555e-07, + "loss": -0.0002, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 5052 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.1666717529297, + "epoch": 37.985074626865675, + "grad_norm": 0.0, + "learning_rate": 3.918966019858698e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5053 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.08334350585938, + "epoch": 37.992537313432834, + "grad_norm": 0.0, + "learning_rate": 3.9184297949596415e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5054 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.5277862548828, + "epoch": 38.007462686567166, + "grad_norm": 0.0, + "learning_rate": 3.9178934738078737e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5055 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.2777862548828, + "epoch": 38.014925373134325, + "grad_norm": 0.0, + "learning_rate": 3.9173570564397895e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5056 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.0277862548828, + "epoch": 38.02238805970149, + "grad_norm": 0.0, + "learning_rate": 3.9168205428917865e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5057 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.72222900390625, + "epoch": 38.02985074626866, + "grad_norm": 0.0, + "learning_rate": 3.916283933200275e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5058 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.0277862548828, + "epoch": 38.03731343283582, + "grad_norm": 0.0, + "learning_rate": 3.915747227401667e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5059 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.7777862548828, + "epoch": 38.04477611940298, + "grad_norm": 0.0, + "learning_rate": 3.9152104255323827e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5060 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.0, + "epoch": 38.05223880597015, + "grad_norm": 0.0, + "learning_rate": 3.914673527628849e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5061 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.0833282470703, + "epoch": 38.059701492537314, + "grad_norm": 0.0, + "learning_rate": 3.914136533727499e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5062 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.36111450195312, + "epoch": 38.06716417910448, + "grad_norm": 0.0, + "learning_rate": 3.9135994438647725e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5063 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.2777862548828, + "epoch": 38.07462686567164, + "grad_norm": 0.0, + "learning_rate": 3.9130622580771155e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5064 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.3333282470703, + "epoch": 38.082089552238806, + "grad_norm": 0.5894682298306742, + "learning_rate": 3.9125249764009805e-07, + "loss": -0.0011, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5065 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.13888549804688, + "epoch": 38.08955223880597, + "grad_norm": 0.0, + "learning_rate": 3.911987598872827e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5066 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.8333282470703, + "epoch": 38.09701492537314, + "grad_norm": 0.5753436339485685, + "learning_rate": 3.911450125529121e-07, + "loss": 0.0045, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5067 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.80555725097656, + "epoch": 38.1044776119403, + "grad_norm": 0.0, + "learning_rate": 3.9109125564063345e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5068 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.55555725097656, + "epoch": 38.11194029850746, + "grad_norm": 0.0, + "learning_rate": 3.9103748915409455e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5069 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.44444274902344, + "epoch": 38.11940298507463, + "grad_norm": 0.0, + "learning_rate": 3.90983713096944e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5070 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.63888549804688, + "epoch": 38.12686567164179, + "grad_norm": 0.0, + "learning_rate": 3.909299274728309e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5071 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.6666717529297, + "epoch": 38.134328358208954, + "grad_norm": 0.0, + "learning_rate": 3.908761322854052e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5072 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.69444274902344, + "epoch": 38.14179104477612, + "grad_norm": 0.0, + "learning_rate": 3.908223275383173e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5073 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.75, + "epoch": 38.149253731343286, + "grad_norm": 0.0, + "learning_rate": 3.9076851323521825e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5074 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.5277862548828, + "epoch": 38.156716417910445, + "grad_norm": 0.0, + "learning_rate": 3.907146893797598e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5075 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.38888549804688, + "epoch": 38.16417910447761, + "grad_norm": 0.0, + "learning_rate": 3.9066085597559456e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5076 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.36111450195312, + "epoch": 38.17164179104478, + "grad_norm": 0.0, + "learning_rate": 3.9060701302637543e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5077 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.47222900390625, + "epoch": 38.17910447761194, + "grad_norm": 0.0, + "learning_rate": 3.9055316053575616e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5078 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.6666717529297, + "epoch": 38.1865671641791, + "grad_norm": 0.0, + "learning_rate": 3.90499298507391e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5079 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.30555725097656, + "epoch": 38.19402985074627, + "grad_norm": 1.8925927196585868, + "learning_rate": 3.904454269449351e-07, + "loss": 0.0004, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5080 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.11111450195312, + "epoch": 38.201492537313435, + "grad_norm": 0.0, + "learning_rate": 3.903915458520441e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5081 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.3333282470703, + "epoch": 38.208955223880594, + "grad_norm": 0.0, + "learning_rate": 3.9033765523237423e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5082 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.94444274902344, + "epoch": 38.21641791044776, + "grad_norm": 1.0806783946647702, + "learning_rate": 3.9028375508958246e-07, + "loss": -0.0006, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 5083 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.75, + "epoch": 38.223880597014926, + "grad_norm": 2.1396169074512734, + "learning_rate": 3.902298454273264e-07, + "loss": -0.0001, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 5084 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.55555725097656, + "epoch": 38.23134328358209, + "grad_norm": 0.0, + "learning_rate": 3.901759262492643e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5085 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.0, + "epoch": 38.23880597014925, + "grad_norm": 0.6661871013113627, + "learning_rate": 3.9012199755905497e-07, + "loss": -0.0, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 5086 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.25, + "epoch": 38.24626865671642, + "grad_norm": 0.0, + "learning_rate": 3.90068059360358e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5087 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.22222900390625, + "epoch": 38.25373134328358, + "grad_norm": 0.0, + "learning_rate": 3.900141116568335e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5088 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.0833282470703, + "epoch": 38.26119402985075, + "grad_norm": 0.0, + "learning_rate": 3.8996015445214233e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5089 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.5, + "epoch": 38.26865671641791, + "grad_norm": 0.0, + "learning_rate": 3.89906187749946e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5090 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.0, + "epoch": 38.276119402985074, + "grad_norm": 0.0, + "learning_rate": 3.8985221155390654e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5091 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.36111450195312, + "epoch": 38.28358208955224, + "grad_norm": 0.0, + "learning_rate": 3.8979822586768666e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5092 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.5833282470703, + "epoch": 38.291044776119406, + "grad_norm": 0.0, + "learning_rate": 3.8974423069494986e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5093 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.55555725097656, + "epoch": 38.298507462686565, + "grad_norm": 0.0, + "learning_rate": 3.896902260393603e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5094 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.6666717529297, + "epoch": 38.30597014925373, + "grad_norm": 0.0, + "learning_rate": 3.8963621190458233e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5095 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.97222900390625, + "epoch": 38.3134328358209, + "grad_norm": 3.1421133284327167, + "learning_rate": 3.895821882942815e-07, + "loss": 0.0001, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 5096 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.63888549804688, + "epoch": 38.32089552238806, + "grad_norm": 0.0, + "learning_rate": 3.895281552121238e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5097 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.72222900390625, + "epoch": 38.32835820895522, + "grad_norm": 0.0, + "learning_rate": 3.8947411266177566e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5098 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.8333282470703, + "epoch": 38.33582089552239, + "grad_norm": 0.0, + "learning_rate": 3.894200606469045e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5099 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.47222900390625, + "epoch": 38.343283582089555, + "grad_norm": 0.0, + "learning_rate": 3.8936599917117815e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5100 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.7777862548828, + "epoch": 38.350746268656714, + "grad_norm": 0.0, + "learning_rate": 3.893119282382652e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5101 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.4166717529297, + "epoch": 38.35820895522388, + "grad_norm": 0.0, + "learning_rate": 3.892578478518347e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5102 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.4166717529297, + "epoch": 38.365671641791046, + "grad_norm": 0.0, + "learning_rate": 3.892037580155566e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5103 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.47222900390625, + "epoch": 38.37313432835821, + "grad_norm": 0.0, + "learning_rate": 3.8914965873310134e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5104 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.63888549804688, + "epoch": 38.38059701492537, + "grad_norm": 0.0, + "learning_rate": 3.8909555000814e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5105 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.7777862548828, + "epoch": 38.38805970149254, + "grad_norm": 0.0, + "learning_rate": 3.890414318443443e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5106 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.19444274902344, + "epoch": 38.3955223880597, + "grad_norm": 0.0, + "learning_rate": 3.8898730424538667e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5107 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.44444274902344, + "epoch": 38.40298507462686, + "grad_norm": 0.0, + "learning_rate": 3.889331672149402e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5108 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.0, + "epoch": 38.41044776119403, + "grad_norm": 0.0, + "learning_rate": 3.888790207566783e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5109 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.5, + "epoch": 38.417910447761194, + "grad_norm": 0.0, + "learning_rate": 3.8882486487427556e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5110 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.86111450195312, + "epoch": 38.42537313432836, + "grad_norm": 0.0, + "learning_rate": 3.8877069957140684e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5111 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.5833282470703, + "epoch": 38.43283582089552, + "grad_norm": 0.0, + "learning_rate": 3.887165248517477e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5112 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.22222900390625, + "epoch": 38.440298507462686, + "grad_norm": 1.041559445770278, + "learning_rate": 3.886623407189743e-07, + "loss": -0.0032, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5113 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.52777862548828, + "epoch": 38.44776119402985, + "grad_norm": 0.0, + "learning_rate": 3.886081471767636e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5114 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.9166717529297, + "epoch": 38.45522388059702, + "grad_norm": 0.0, + "learning_rate": 3.8855394422879306e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5115 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.05555725097656, + "epoch": 38.46268656716418, + "grad_norm": 0.0, + "learning_rate": 3.884997318787407e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5116 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.72222900390625, + "epoch": 38.47014925373134, + "grad_norm": 0.0, + "learning_rate": 3.884455101302856e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5117 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.19444274902344, + "epoch": 38.47761194029851, + "grad_norm": 0.0, + "learning_rate": 3.8839127898710687e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5118 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.11111450195312, + "epoch": 38.485074626865675, + "grad_norm": 0.0, + "learning_rate": 3.883370384528848e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5119 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.6666717529297, + "epoch": 38.492537313432834, + "grad_norm": 0.0, + "learning_rate": 3.882827885312998e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5120 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.0833282470703, + "epoch": 38.5, + "grad_norm": 0.0, + "learning_rate": 3.882285292260335e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5121 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.55555725097656, + "epoch": 38.507462686567166, + "grad_norm": 0.0, + "learning_rate": 3.8817426054076765e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5122 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.88888549804688, + "epoch": 38.514925373134325, + "grad_norm": 0.0, + "learning_rate": 3.881199824791849e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5123 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.6666717529297, + "epoch": 38.52238805970149, + "grad_norm": 0.0, + "learning_rate": 3.8806569504496856e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5124 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.75, + "epoch": 38.52985074626866, + "grad_norm": 0.0, + "learning_rate": 3.880113982418024e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5125 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.1666717529297, + "epoch": 38.53731343283582, + "grad_norm": 0.0, + "learning_rate": 3.87957092073371e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5126 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.36111450195312, + "epoch": 38.54477611940298, + "grad_norm": 0.833862595053386, + "learning_rate": 3.879027765433594e-07, + "loss": -0.0012, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 5127 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.72222900390625, + "epoch": 38.55223880597015, + "grad_norm": 0.0, + "learning_rate": 3.8784845165545354e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5128 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.9166717529297, + "epoch": 38.559701492537314, + "grad_norm": 0.0, + "learning_rate": 3.8779411741333965e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5129 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.0833282470703, + "epoch": 38.56716417910448, + "grad_norm": 0.0, + "learning_rate": 3.87739773820705e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5130 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.61111450195312, + "epoch": 38.57462686567164, + "grad_norm": 0.0, + "learning_rate": 3.87685420881237e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5131 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.38888549804688, + "epoch": 38.582089552238806, + "grad_norm": 0.0, + "learning_rate": 3.8763105859862424e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5132 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.5277862548828, + "epoch": 38.58955223880597, + "grad_norm": 0.0, + "learning_rate": 3.8757668697655543e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5133 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.8333282470703, + "epoch": 38.59701492537313, + "grad_norm": 2.765890755958293, + "learning_rate": 3.8752230601872035e-07, + "loss": -0.001, + "reward": 1.8611111640930176, + "reward_std": 0.11970558762550354, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 5134 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.0277862548828, + "epoch": 38.6044776119403, + "grad_norm": 1.2733878159881042, + "learning_rate": 3.874679157288091e-07, + "loss": -0.0004, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5135 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.8333282470703, + "epoch": 38.61194029850746, + "grad_norm": 0.0, + "learning_rate": 3.8741351611051254e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5136 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.5833282470703, + "epoch": 38.61940298507463, + "grad_norm": 1.1332962390104424, + "learning_rate": 3.873591071675222e-07, + "loss": 0.0028, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5137 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.7777862548828, + "epoch": 38.62686567164179, + "grad_norm": 0.0, + "learning_rate": 3.873046889035302e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5138 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.44444274902344, + "epoch": 38.634328358208954, + "grad_norm": 0.8888648480412237, + "learning_rate": 3.8725026132222914e-07, + "loss": 0.0006, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5139 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.41666412353516, + "epoch": 38.64179104477612, + "grad_norm": 0.0, + "learning_rate": 3.871958244273127e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5140 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.1666717529297, + "epoch": 38.649253731343286, + "grad_norm": 0.0, + "learning_rate": 3.871413782224746e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5141 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.69444274902344, + "epoch": 38.656716417910445, + "grad_norm": 0.0, + "learning_rate": 3.8708692271140964e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5142 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.6666717529297, + "epoch": 38.66417910447761, + "grad_norm": 0.0, + "learning_rate": 3.87032457897813e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5143 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.22222900390625, + "epoch": 38.67164179104478, + "grad_norm": 0.0, + "learning_rate": 3.8697798378538073e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5144 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.13888549804688, + "epoch": 38.67910447761194, + "grad_norm": 0.0, + "learning_rate": 3.869235003778093e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5145 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.13888549804688, + "epoch": 38.6865671641791, + "grad_norm": 0.0, + "learning_rate": 3.8686900767879576e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5146 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.6666717529297, + "epoch": 38.69402985074627, + "grad_norm": 0.0, + "learning_rate": 3.8681450569203803e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5147 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.94444274902344, + "epoch": 38.701492537313435, + "grad_norm": 0.0, + "learning_rate": 3.8675999442123465e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5148 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.5277862548828, + "epoch": 38.708955223880594, + "grad_norm": 0.0, + "learning_rate": 3.8670547387008446e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5149 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.8333282470703, + "epoch": 38.71641791044776, + "grad_norm": 0.0, + "learning_rate": 3.8665094404228724e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5150 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.22222900390625, + "epoch": 38.723880597014926, + "grad_norm": 0.0, + "learning_rate": 3.865964049415433e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5151 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.44444274902344, + "epoch": 38.73134328358209, + "grad_norm": 0.0, + "learning_rate": 3.8654185657155367e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5152 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.3333282470703, + "epoch": 38.73880597014925, + "grad_norm": 0.0, + "learning_rate": 3.8648729893601985e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5153 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.13888549804688, + "epoch": 38.74626865671642, + "grad_norm": 0.0, + "learning_rate": 3.864327320386441e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5154 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.36111450195312, + "epoch": 38.75373134328358, + "grad_norm": 0.0, + "learning_rate": 3.863781558831292e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5155 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.86111450195312, + "epoch": 38.76119402985075, + "grad_norm": 0.0, + "learning_rate": 3.8632357047317856e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5156 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.22222900390625, + "epoch": 38.76865671641791, + "grad_norm": 0.0, + "learning_rate": 3.862689758124964e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5157 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.9166717529297, + "epoch": 38.776119402985074, + "grad_norm": 0.0, + "learning_rate": 3.8621437190478735e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5158 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.47222900390625, + "epoch": 38.78358208955224, + "grad_norm": 0.0, + "learning_rate": 3.8615975875375676e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5159 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.13888549804688, + "epoch": 38.791044776119406, + "grad_norm": 0.0, + "learning_rate": 3.861051363631107e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5160 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.36111450195312, + "epoch": 38.798507462686565, + "grad_norm": 0.0, + "learning_rate": 3.860505047365556e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5161 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.47222900390625, + "epoch": 38.80597014925373, + "grad_norm": 0.0, + "learning_rate": 3.8599586387779895e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5162 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.72222900390625, + "epoch": 38.8134328358209, + "grad_norm": 0.0, + "learning_rate": 3.8594121379054827e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5163 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.30555725097656, + "epoch": 38.82089552238806, + "grad_norm": 0.0, + "learning_rate": 3.858865544785123e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5164 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.05555725097656, + "epoch": 38.82835820895522, + "grad_norm": 0.0, + "learning_rate": 3.8583188594540006e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5165 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.88888549804688, + "epoch": 38.83582089552239, + "grad_norm": 0.0, + "learning_rate": 3.8577720819492133e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5166 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.80555725097656, + "epoch": 38.843283582089555, + "grad_norm": 0.0, + "learning_rate": 3.857225212307863e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5167 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.1666717529297, + "epoch": 38.850746268656714, + "grad_norm": 0.0, + "learning_rate": 3.8566782505670614e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5168 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.69444274902344, + "epoch": 38.85820895522388, + "grad_norm": 0.0, + "learning_rate": 3.8561311967639245e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5169 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.22222900390625, + "epoch": 38.865671641791046, + "grad_norm": 0.0, + "learning_rate": 3.855584050935573e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5170 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.44444274902344, + "epoch": 38.87313432835821, + "grad_norm": 0.0, + "learning_rate": 3.855036813119137e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5171 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.11111450195312, + "epoch": 38.88059701492537, + "grad_norm": 0.0, + "learning_rate": 3.854489483351751e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5172 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.22222900390625, + "epoch": 38.88805970149254, + "grad_norm": 0.0, + "learning_rate": 3.8539420616705554e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5173 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.5, + "epoch": 38.8955223880597, + "grad_norm": 0.0, + "learning_rate": 3.8533945481126985e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5174 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.19444274902344, + "epoch": 38.90298507462687, + "grad_norm": 0.0, + "learning_rate": 3.8528469427153325e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5175 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.38888549804688, + "epoch": 38.91044776119403, + "grad_norm": 0.0, + "learning_rate": 3.8522992455156187e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5176 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.3333282470703, + "epoch": 38.917910447761194, + "grad_norm": 0.0, + "learning_rate": 3.851751456550721e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5177 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.5277862548828, + "epoch": 38.92537313432836, + "grad_norm": 0.0, + "learning_rate": 3.8512035758578143e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5178 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.22222900390625, + "epoch": 38.93283582089552, + "grad_norm": 0.0, + "learning_rate": 3.850655603474076e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5179 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.6666717529297, + "epoch": 38.940298507462686, + "grad_norm": 0.0, + "learning_rate": 3.850107539436689e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5180 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.0, + "epoch": 38.94776119402985, + "grad_norm": 1.49305780247714, + "learning_rate": 3.8495593837828466e-07, + "loss": -0.0003, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5181 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.22222900390625, + "epoch": 38.95522388059702, + "grad_norm": 0.0, + "learning_rate": 3.8490111365497445e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5182 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.61111450195312, + "epoch": 38.96268656716418, + "grad_norm": 0.0, + "learning_rate": 3.848462797774586e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5183 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.11111450195312, + "epoch": 38.97014925373134, + "grad_norm": 0.0, + "learning_rate": 3.8479143674945826e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5184 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.7777862548828, + "epoch": 38.97761194029851, + "grad_norm": 0.0, + "learning_rate": 3.8473658457469464e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5185 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.4166717529297, + "epoch": 38.985074626865675, + "grad_norm": 0.0, + "learning_rate": 3.846817232568903e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5186 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.1666717529297, + "epoch": 38.992537313432834, + "grad_norm": 0.0, + "learning_rate": 3.846268527997678e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5187 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.38888549804688, + "epoch": 39.007462686567166, + "grad_norm": 0.0, + "learning_rate": 3.845719732070507e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5188 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.5277862548828, + "epoch": 39.014925373134325, + "grad_norm": 0.0, + "learning_rate": 3.8451708448246305e-07, + "loss": 0.0, + "reward": 1.6666666269302368, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 1.0, + "step": 5189 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.97222900390625, + "epoch": 39.02238805970149, + "grad_norm": 0.0, + "learning_rate": 3.844621866297295e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5190 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.61111450195312, + "epoch": 39.02985074626866, + "grad_norm": 0.0, + "learning_rate": 3.8440727965257527e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5191 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.11111450195312, + "epoch": 39.03731343283582, + "grad_norm": 0.0, + "learning_rate": 3.8435236355472634e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5192 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.05555725097656, + "epoch": 39.04477611940298, + "grad_norm": 0.8031340790789427, + "learning_rate": 3.8429743833990934e-07, + "loss": -0.0066, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 5193 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.80555725097656, + "epoch": 39.05223880597015, + "grad_norm": 0.0, + "learning_rate": 3.8424250401185124e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5194 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.13888549804688, + "epoch": 39.059701492537314, + "grad_norm": 0.0, + "learning_rate": 3.841875605742799e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5195 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.3888931274414, + "epoch": 39.06716417910448, + "grad_norm": 0.0, + "learning_rate": 3.8413260803092377e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5196 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.75, + "epoch": 39.07462686567164, + "grad_norm": 0.0, + "learning_rate": 3.8407764638551177e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5197 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.05555725097656, + "epoch": 39.082089552238806, + "grad_norm": 0.0, + "learning_rate": 3.840226756417735e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5198 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.0277862548828, + "epoch": 39.08955223880597, + "grad_norm": 0.0, + "learning_rate": 3.839676958034392e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5199 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.2777862548828, + "epoch": 39.09701492537314, + "grad_norm": 0.0, + "learning_rate": 3.8391270687423983e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5200 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.5, + "epoch": 39.1044776119403, + "grad_norm": 0.0, + "learning_rate": 3.8385770885790673e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5201 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.25, + "epoch": 39.11194029850746, + "grad_norm": 0.0, + "learning_rate": 3.838027017581721e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5202 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.13888549804688, + "epoch": 39.11940298507463, + "grad_norm": 0.0, + "learning_rate": 3.8374768557876866e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5203 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.97222900390625, + "epoch": 39.12686567164179, + "grad_norm": 0.0, + "learning_rate": 3.836926603234296e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5204 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.6666717529297, + "epoch": 39.134328358208954, + "grad_norm": 0.0, + "learning_rate": 3.8363762599588896e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5205 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.97222900390625, + "epoch": 39.14179104477612, + "grad_norm": 0.0, + "learning_rate": 3.8358258259988127e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5206 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.22222900390625, + "epoch": 39.149253731343286, + "grad_norm": 2.8307732116847917, + "learning_rate": 3.835275301391416e-07, + "loss": -0.0204, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 5207 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.19444274902344, + "epoch": 39.156716417910445, + "grad_norm": 0.0, + "learning_rate": 3.83472468617406e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5208 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.13888549804688, + "epoch": 39.16417910447761, + "grad_norm": 0.0, + "learning_rate": 3.8341739803841063e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5209 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.05555725097656, + "epoch": 39.17164179104478, + "grad_norm": 0.0, + "learning_rate": 3.833623184058926e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5210 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.97222900390625, + "epoch": 39.17910447761194, + "grad_norm": 0.0, + "learning_rate": 3.8330722972358945e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5211 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.0277862548828, + "epoch": 39.1865671641791, + "grad_norm": 0.0, + "learning_rate": 3.832521319952395e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5212 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.22222900390625, + "epoch": 39.19402985074627, + "grad_norm": 0.0, + "learning_rate": 3.8319702522458173e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5213 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.27777862548828, + "epoch": 39.201492537313435, + "grad_norm": 129.96172707209425, + "learning_rate": 3.8314190941535533e-07, + "loss": 0.0029, + "reward": 1.9444444179534912, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 0.9722222089767456, + "step": 5214 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.19444274902344, + "epoch": 39.208955223880594, + "grad_norm": 0.0, + "learning_rate": 3.830867845713006e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5215 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.5, + "epoch": 39.21641791044776, + "grad_norm": 0.0, + "learning_rate": 3.830316506961582e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5216 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.19444274902344, + "epoch": 39.223880597014926, + "grad_norm": 0.0, + "learning_rate": 3.829765077936693e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5217 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.38888549804688, + "epoch": 39.23134328358209, + "grad_norm": 0.0, + "learning_rate": 3.82921355867576e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5218 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.94444274902344, + "epoch": 39.23880597014925, + "grad_norm": 0.7252725963801312, + "learning_rate": 3.8286619492162076e-07, + "loss": -0.0006, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5219 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.97222900390625, + "epoch": 39.24626865671642, + "grad_norm": 0.0, + "learning_rate": 3.8281102495954675e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5220 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.0, + "epoch": 39.25373134328358, + "grad_norm": 0.0, + "learning_rate": 3.827558459850977e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5221 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.61111450195312, + "epoch": 39.26119402985075, + "grad_norm": 0.0, + "learning_rate": 3.827006580020179e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5222 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.80555725097656, + "epoch": 39.26865671641791, + "grad_norm": 0.0, + "learning_rate": 3.8264546101405254e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5223 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.75, + "epoch": 39.276119402985074, + "grad_norm": 0.3660123310509677, + "learning_rate": 3.8259025502494704e-07, + "loss": -0.0003, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 5224 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.6666717529297, + "epoch": 39.28358208955224, + "grad_norm": 0.0, + "learning_rate": 3.825350400384477e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5225 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.19444274902344, + "epoch": 39.291044776119406, + "grad_norm": 0.0, + "learning_rate": 3.824798160583012e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5226 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.0833282470703, + "epoch": 39.298507462686565, + "grad_norm": 0.0, + "learning_rate": 3.8242458308825507e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5227 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.6666717529297, + "epoch": 39.30597014925373, + "grad_norm": 0.0, + "learning_rate": 3.823693411320574e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5228 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.75, + "epoch": 39.3134328358209, + "grad_norm": 0.0, + "learning_rate": 3.823140901934567e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5229 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.6666717529297, + "epoch": 39.32089552238806, + "grad_norm": 0.0, + "learning_rate": 3.8225883027620244e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5230 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.5, + "epoch": 39.32835820895522, + "grad_norm": 0.0, + "learning_rate": 3.8220356138404407e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5231 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.5833282470703, + "epoch": 39.33582089552239, + "grad_norm": 0.0, + "learning_rate": 3.821482835207325e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5232 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.9166717529297, + "epoch": 39.343283582089555, + "grad_norm": 0.0, + "learning_rate": 3.820929966900185e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5233 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.19444274902344, + "epoch": 39.350746268656714, + "grad_norm": 0.0, + "learning_rate": 3.8203770089565403e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5234 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.61111450195312, + "epoch": 39.35820895522388, + "grad_norm": 0.0, + "learning_rate": 3.819823961413912e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5235 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.30555725097656, + "epoch": 39.365671641791046, + "grad_norm": 0.0, + "learning_rate": 3.8192708243098295e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5236 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.7777862548828, + "epoch": 39.37313432835821, + "grad_norm": 0.0, + "learning_rate": 3.8187175976818275e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5237 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.69444274902344, + "epoch": 39.38059701492537, + "grad_norm": 0.0, + "learning_rate": 3.8181642815674485e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5238 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.7777862548828, + "epoch": 39.38805970149254, + "grad_norm": 0.0, + "learning_rate": 3.8176108760042383e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5239 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.61111450195312, + "epoch": 39.3955223880597, + "grad_norm": 0.0, + "learning_rate": 3.817057381029751e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5240 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.3333282470703, + "epoch": 39.40298507462686, + "grad_norm": 0.0, + "learning_rate": 3.816503796681547e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5241 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.94444274902344, + "epoch": 39.41044776119403, + "grad_norm": 0.0, + "learning_rate": 3.81595012299719e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5242 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.2777862548828, + "epoch": 39.417910447761194, + "grad_norm": 0.0, + "learning_rate": 3.8153963600142526e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5243 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.9166717529297, + "epoch": 39.42537313432836, + "grad_norm": 0.0, + "learning_rate": 3.814842507770312e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5244 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.19444274902344, + "epoch": 39.43283582089552, + "grad_norm": 0.0, + "learning_rate": 3.814288566302952e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5245 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.72222137451172, + "epoch": 39.440298507462686, + "grad_norm": 0.0, + "learning_rate": 3.813734535649761e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5246 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.3333282470703, + "epoch": 39.44776119402985, + "grad_norm": 1.7651921282722411, + "learning_rate": 3.813180415848338e-07, + "loss": -0.0022, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5247 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.4166717529297, + "epoch": 39.45522388059702, + "grad_norm": 0.0, + "learning_rate": 3.812626206936282e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5248 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.0, + "epoch": 39.46268656716418, + "grad_norm": 0.5329308559189958, + "learning_rate": 3.8120719089512015e-07, + "loss": 0.0001, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 5249 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.63888549804688, + "epoch": 39.47014925373134, + "grad_norm": 0.0, + "learning_rate": 3.8115175219307105e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5250 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.38888549804688, + "epoch": 39.47761194029851, + "grad_norm": 0.516335122786937, + "learning_rate": 3.81096304591243e-07, + "loss": -0.0039, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5251 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.05555725097656, + "epoch": 39.485074626865675, + "grad_norm": 0.0, + "learning_rate": 3.8104084809339844e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5252 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.38888549804688, + "epoch": 39.492537313432834, + "grad_norm": 0.0, + "learning_rate": 3.809853827033006e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5253 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.13888549804688, + "epoch": 39.5, + "grad_norm": 0.0, + "learning_rate": 3.809299084247134e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5254 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.69444274902344, + "epoch": 39.507462686567166, + "grad_norm": 0.0, + "learning_rate": 3.808744252614011e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5255 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.75, + "epoch": 39.514925373134325, + "grad_norm": 0.0, + "learning_rate": 3.808189332171288e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5256 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.2777862548828, + "epoch": 39.52238805970149, + "grad_norm": 1.8712629062240078, + "learning_rate": 3.807634322956621e-07, + "loss": -0.0004, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 5257 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.11111450195312, + "epoch": 39.52985074626866, + "grad_norm": 0.0, + "learning_rate": 3.8070792250076726e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5258 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.38888549804688, + "epoch": 39.53731343283582, + "grad_norm": 0.0, + "learning_rate": 3.8065240383621103e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5259 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.05555725097656, + "epoch": 39.54477611940298, + "grad_norm": 0.0, + "learning_rate": 3.805968763057609e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5260 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.44444274902344, + "epoch": 39.55223880597015, + "grad_norm": 0.0, + "learning_rate": 3.8054133991318477e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5261 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.88888549804688, + "epoch": 39.559701492537314, + "grad_norm": 0.0, + "learning_rate": 3.804857946622514e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5262 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.13888549804688, + "epoch": 39.56716417910448, + "grad_norm": 0.0, + "learning_rate": 3.804302405567299e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5263 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.7777862548828, + "epoch": 39.57462686567164, + "grad_norm": 0.0, + "learning_rate": 3.803746776003902e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5264 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.13888549804688, + "epoch": 39.582089552238806, + "grad_norm": 0.0, + "learning_rate": 3.8031910579700267e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5265 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.11111450195312, + "epoch": 39.58955223880597, + "grad_norm": 0.0, + "learning_rate": 3.802635251503382e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5266 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.80555725097656, + "epoch": 39.59701492537313, + "grad_norm": 0.0, + "learning_rate": 3.8020793566416877e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5267 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.5, + "epoch": 39.6044776119403, + "grad_norm": 0.0, + "learning_rate": 3.801523373422663e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5268 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.36111450195312, + "epoch": 39.61194029850746, + "grad_norm": 0.0, + "learning_rate": 3.8009673018840376e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5269 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.97222900390625, + "epoch": 39.61940298507463, + "grad_norm": 0.0, + "learning_rate": 3.800411142063545e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5270 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.4166717529297, + "epoch": 39.62686567164179, + "grad_norm": 0.0, + "learning_rate": 3.799854893998926e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5271 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.1666717529297, + "epoch": 39.634328358208954, + "grad_norm": 0.0, + "learning_rate": 3.799298557727926e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5272 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.63888549804688, + "epoch": 39.64179104477612, + "grad_norm": 0.0, + "learning_rate": 3.7987421332882985e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5273 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.61111450195312, + "epoch": 39.649253731343286, + "grad_norm": 0.0, + "learning_rate": 3.7981856207178005e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5274 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.5, + "epoch": 39.656716417910445, + "grad_norm": 0.0, + "learning_rate": 3.7976290200541974e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5275 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.94444274902344, + "epoch": 39.66417910447761, + "grad_norm": 0.0, + "learning_rate": 3.797072331335258e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5276 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.80555725097656, + "epoch": 39.67164179104478, + "grad_norm": 0.0, + "learning_rate": 3.796515554598759e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5277 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.69444274902344, + "epoch": 39.67910447761194, + "grad_norm": 0.0, + "learning_rate": 3.795958689882483e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5278 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.8333282470703, + "epoch": 39.6865671641791, + "grad_norm": 0.5550722507308083, + "learning_rate": 3.7954017372242176e-07, + "loss": -0.0037, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 5279 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.80555725097656, + "epoch": 39.69402985074627, + "grad_norm": 0.0, + "learning_rate": 3.7948446966617566e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5280 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.1666717529297, + "epoch": 39.701492537313435, + "grad_norm": 0.0, + "learning_rate": 3.7942875682329013e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5281 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.11111450195312, + "epoch": 39.708955223880594, + "grad_norm": 0.0, + "learning_rate": 3.793730351975456e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5282 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.1666717529297, + "epoch": 39.71641791044776, + "grad_norm": 0.0, + "learning_rate": 3.7931730479272326e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5283 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.6666717529297, + "epoch": 39.723880597014926, + "grad_norm": 0.0, + "learning_rate": 3.7926156561260507e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5284 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.38888549804688, + "epoch": 39.73134328358209, + "grad_norm": 0.0, + "learning_rate": 3.7920581766097336e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5285 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.22222900390625, + "epoch": 39.73880597014925, + "grad_norm": 0.0, + "learning_rate": 3.79150060941611e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5286 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.80555725097656, + "epoch": 39.74626865671642, + "grad_norm": 0.0, + "learning_rate": 3.7909429545830163e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5287 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.5833282470703, + "epoch": 39.75373134328358, + "grad_norm": 0.0, + "learning_rate": 3.790385212148295e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5288 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.8333282470703, + "epoch": 39.76119402985075, + "grad_norm": 1.8634668036443471, + "learning_rate": 3.7898273821497924e-07, + "loss": 0.0, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 5289 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.61111450195312, + "epoch": 39.76865671641791, + "grad_norm": 0.0, + "learning_rate": 3.789269464625362e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5290 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.36111450195312, + "epoch": 39.776119402985074, + "grad_norm": 0.0, + "learning_rate": 3.788711459612865e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5291 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.38888549804688, + "epoch": 39.78358208955224, + "grad_norm": 0.0, + "learning_rate": 3.7881533671501653e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5292 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.7777862548828, + "epoch": 39.791044776119406, + "grad_norm": 1.4818151237598516, + "learning_rate": 3.7875951872751356e-07, + "loss": 0.0, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5293 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.11111450195312, + "epoch": 39.798507462686565, + "grad_norm": 0.0, + "learning_rate": 3.7870369200256516e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5294 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.8333282470703, + "epoch": 39.80597014925373, + "grad_norm": 0.0, + "learning_rate": 3.786478565439598e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5295 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.2777862548828, + "epoch": 39.8134328358209, + "grad_norm": 0.0, + "learning_rate": 3.7859201235548633e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5296 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.86111450195312, + "epoch": 39.82089552238806, + "grad_norm": 0.0, + "learning_rate": 3.785361594409342e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5297 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.97222900390625, + "epoch": 39.82835820895522, + "grad_norm": 0.0, + "learning_rate": 3.784802978040937e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5298 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.25, + "epoch": 39.83582089552239, + "grad_norm": 0.0, + "learning_rate": 3.7842442744875534e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5299 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.8333282470703, + "epoch": 39.843283582089555, + "grad_norm": 0.0, + "learning_rate": 3.7836854837871044e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5300 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.47222900390625, + "epoch": 39.850746268656714, + "grad_norm": 0.0, + "learning_rate": 3.78312660597751e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5301 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.19444274902344, + "epoch": 39.85820895522388, + "grad_norm": 0.0, + "learning_rate": 3.7825676410966934e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5302 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.1666717529297, + "epoch": 39.865671641791046, + "grad_norm": 0.0, + "learning_rate": 3.782008589182586e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5303 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.5277862548828, + "epoch": 39.87313432835821, + "grad_norm": 0.0, + "learning_rate": 3.7814494502731243e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5304 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.88888549804688, + "epoch": 39.88059701492537, + "grad_norm": 0.0, + "learning_rate": 3.78089022440625e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5305 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.75, + "epoch": 39.88805970149254, + "grad_norm": 0.0, + "learning_rate": 3.7803309116199123e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5306 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.8333282470703, + "epoch": 39.8955223880597, + "grad_norm": 0.0, + "learning_rate": 3.7797715119520646e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5307 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.0, + "epoch": 39.90298507462687, + "grad_norm": 0.0, + "learning_rate": 3.779212025440668e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5308 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.38888549804688, + "epoch": 39.91044776119403, + "grad_norm": 0.0, + "learning_rate": 3.7786524521236873e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5309 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.5, + "epoch": 39.917910447761194, + "grad_norm": 0.0, + "learning_rate": 3.778092792039096e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5310 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.97222900390625, + "epoch": 39.92537313432836, + "grad_norm": 0.0, + "learning_rate": 3.7775330452248695e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5311 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.0833282470703, + "epoch": 39.93283582089552, + "grad_norm": 0.0, + "learning_rate": 3.776973211718993e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5312 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.86111450195312, + "epoch": 39.940298507462686, + "grad_norm": 0.0, + "learning_rate": 3.776413291559457e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5313 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.0833282470703, + "epoch": 39.94776119402985, + "grad_norm": 0.0, + "learning_rate": 3.7758532847842547e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5314 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.05555725097656, + "epoch": 39.95522388059702, + "grad_norm": 0.0, + "learning_rate": 3.7752931914313886e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5315 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.69444274902344, + "epoch": 39.96268656716418, + "grad_norm": 0.0, + "learning_rate": 3.774733011538866e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5316 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.72222900390625, + "epoch": 39.97014925373134, + "grad_norm": 0.0, + "learning_rate": 3.7741727451446993e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5317 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.0833282470703, + "epoch": 39.97761194029851, + "grad_norm": 0.0, + "learning_rate": 3.773612392286908e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5318 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.3333282470703, + "epoch": 39.985074626865675, + "grad_norm": 0.0, + "learning_rate": 3.7730519530035166e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5319 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.58334350585938, + "epoch": 39.992537313432834, + "grad_norm": 12.004686875192856, + "learning_rate": 3.7724914273325565e-07, + "loss": -0.0008, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 5320 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.94444274902344, + "epoch": 40.007462686567166, + "grad_norm": 0.0, + "learning_rate": 3.771930815312062e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5321 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.3333282470703, + "epoch": 40.014925373134325, + "grad_norm": 0.0, + "learning_rate": 3.7713701169800783e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5322 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.6666717529297, + "epoch": 40.02238805970149, + "grad_norm": 1.1301224773647012, + "learning_rate": 3.7708093323746525e-07, + "loss": -0.002, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 5323 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.61111450195312, + "epoch": 40.02985074626866, + "grad_norm": 1.2098074028922594, + "learning_rate": 3.770248461533837e-07, + "loss": -0.0, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 5324 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.8333282470703, + "epoch": 40.03731343283582, + "grad_norm": 0.0, + "learning_rate": 3.7696875044956945e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5325 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.11111450195312, + "epoch": 40.04477611940298, + "grad_norm": 0.0, + "learning_rate": 3.76912646129829e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5326 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.5, + "epoch": 40.05223880597015, + "grad_norm": 0.0, + "learning_rate": 3.768565331979693e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5327 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.94444274902344, + "epoch": 40.059701492537314, + "grad_norm": 0.0, + "learning_rate": 3.768004116577983e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5328 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.9166717529297, + "epoch": 40.06716417910448, + "grad_norm": 0.0, + "learning_rate": 3.767442815131244e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5329 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.13888549804688, + "epoch": 40.07462686567164, + "grad_norm": 0.0, + "learning_rate": 3.766881427677563e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5330 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.0, + "epoch": 40.082089552238806, + "grad_norm": 0.0, + "learning_rate": 3.766319954255036e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5331 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.4166717529297, + "epoch": 40.08955223880597, + "grad_norm": 0.0, + "learning_rate": 3.765758394901765e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5332 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.11111450195312, + "epoch": 40.09701492537314, + "grad_norm": 0.0, + "learning_rate": 3.7651967496558545e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5333 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.05555725097656, + "epoch": 40.1044776119403, + "grad_norm": 0.0, + "learning_rate": 3.764635018555418e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5334 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.22222900390625, + "epoch": 40.11194029850746, + "grad_norm": 0.6108728492300011, + "learning_rate": 3.764073201638574e-07, + "loss": -0.0007, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5335 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.6666717529297, + "epoch": 40.11940298507463, + "grad_norm": 0.0, + "learning_rate": 3.763511298943447e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5336 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.2777862548828, + "epoch": 40.12686567164179, + "grad_norm": 1.316815655668104, + "learning_rate": 3.7629493105081656e-07, + "loss": -0.001, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 5337 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.5833282470703, + "epoch": 40.134328358208954, + "grad_norm": 0.0, + "learning_rate": 3.7623872363708663e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5338 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.3333282470703, + "epoch": 40.14179104477612, + "grad_norm": 0.0, + "learning_rate": 3.7618250765696914e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5339 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.88888549804688, + "epoch": 40.149253731343286, + "grad_norm": 0.0, + "learning_rate": 3.7612628311427876e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5340 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.69444274902344, + "epoch": 40.156716417910445, + "grad_norm": 0.0, + "learning_rate": 3.760700500128308e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5341 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.47222900390625, + "epoch": 40.16417910447761, + "grad_norm": 0.0, + "learning_rate": 3.760138083564411e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5342 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.0277862548828, + "epoch": 40.17164179104478, + "grad_norm": 0.0, + "learning_rate": 3.7595755814892625e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5343 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.47222900390625, + "epoch": 40.17910447761194, + "grad_norm": 0.0, + "learning_rate": 3.7590129939410333e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5344 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.6666717529297, + "epoch": 40.1865671641791, + "grad_norm": 0.0, + "learning_rate": 3.758450320957899e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5345 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.3333282470703, + "epoch": 40.19402985074627, + "grad_norm": 0.0, + "learning_rate": 3.7578875625780426e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5346 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.94444274902344, + "epoch": 40.201492537313435, + "grad_norm": 0.0, + "learning_rate": 3.757324718839652e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5347 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.72222900390625, + "epoch": 40.208955223880594, + "grad_norm": 0.0, + "learning_rate": 3.756761789780919e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5348 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.30555725097656, + "epoch": 40.21641791044776, + "grad_norm": 0.0, + "learning_rate": 3.756198775440046e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5349 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.36111450195312, + "epoch": 40.223880597014926, + "grad_norm": 0.0, + "learning_rate": 3.755635675855238e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5350 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.97222137451172, + "epoch": 40.23134328358209, + "grad_norm": 0.6325610898527327, + "learning_rate": 3.755072491064704e-07, + "loss": -0.0003, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5351 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.22222900390625, + "epoch": 40.23880597014925, + "grad_norm": 0.0, + "learning_rate": 3.754509221106663e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5352 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.2777862548828, + "epoch": 40.24626865671642, + "grad_norm": 0.0, + "learning_rate": 3.753945866019337e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5353 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.3333282470703, + "epoch": 40.25373134328358, + "grad_norm": 0.0, + "learning_rate": 3.7533824258409544e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5354 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.19444274902344, + "epoch": 40.26119402985075, + "grad_norm": 0.0, + "learning_rate": 3.7528189006097497e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5355 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.86111450195312, + "epoch": 40.26865671641791, + "grad_norm": 0.0, + "learning_rate": 3.7522552903639625e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5356 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.30555725097656, + "epoch": 40.276119402985074, + "grad_norm": 0.0, + "learning_rate": 3.751691595141841e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5357 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.8333282470703, + "epoch": 40.28358208955224, + "grad_norm": 0.0, + "learning_rate": 3.751127814981633e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5358 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.4166717529297, + "epoch": 40.291044776119406, + "grad_norm": 0.0, + "learning_rate": 3.750563949921599e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5359 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.8333282470703, + "epoch": 40.298507462686565, + "grad_norm": 0.0, + "learning_rate": 3.75e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5360 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.7777862548828, + "epoch": 40.30597014925373, + "grad_norm": 0.0, + "learning_rate": 3.7494359652551053e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5361 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.4166717529297, + "epoch": 40.3134328358209, + "grad_norm": 0.0, + "learning_rate": 3.748871845725191e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5362 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.7777862548828, + "epoch": 40.32089552238806, + "grad_norm": 0.6638393330696827, + "learning_rate": 3.748307641448536e-07, + "loss": 0.0138, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 5363 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.94444274902344, + "epoch": 40.32835820895522, + "grad_norm": 0.0, + "learning_rate": 3.747743352463427e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5364 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.0833282470703, + "epoch": 40.33582089552239, + "grad_norm": 0.0, + "learning_rate": 3.7471789788081554e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5365 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.61111450195312, + "epoch": 40.343283582089555, + "grad_norm": 0.0, + "learning_rate": 3.746614520521019e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5366 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.55555725097656, + "epoch": 40.350746268656714, + "grad_norm": 0.0, + "learning_rate": 3.746049977640322e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5367 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.05555725097656, + "epoch": 40.35820895522388, + "grad_norm": 1.3434995526403857, + "learning_rate": 3.745485350204372e-07, + "loss": -0.0015, + "reward": 1.9444444179534912, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 5368 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.2777862548828, + "epoch": 40.365671641791046, + "grad_norm": 0.0, + "learning_rate": 3.744920638251485e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5369 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.72222900390625, + "epoch": 40.37313432835821, + "grad_norm": 0.0, + "learning_rate": 3.7443558418199824e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5370 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.75, + "epoch": 40.38059701492537, + "grad_norm": 0.0, + "learning_rate": 3.7437909609481877e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5371 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.30555725097656, + "epoch": 40.38805970149254, + "grad_norm": 0.0, + "learning_rate": 3.7432259956744356e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5372 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.4166717529297, + "epoch": 40.3955223880597, + "grad_norm": 0.0, + "learning_rate": 3.742660946037062e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5373 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.44444274902344, + "epoch": 40.40298507462686, + "grad_norm": 0.0, + "learning_rate": 3.7420958120744126e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5374 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.5833282470703, + "epoch": 40.41044776119403, + "grad_norm": 0.0, + "learning_rate": 3.741530593824834e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5375 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.77777862548828, + "epoch": 40.417910447761194, + "grad_norm": 0.0, + "learning_rate": 3.740965291326684e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5376 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.88888549804688, + "epoch": 40.42537313432836, + "grad_norm": 0.0, + "learning_rate": 3.7403999046183206e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5377 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.30555725097656, + "epoch": 40.43283582089552, + "grad_norm": 0.0, + "learning_rate": 3.739834433738111e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5378 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.25, + "epoch": 40.440298507462686, + "grad_norm": 0.0, + "learning_rate": 3.739268878724428e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5379 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.5277862548828, + "epoch": 40.44776119402985, + "grad_norm": 2.181384830560122, + "learning_rate": 3.7387032396156494e-07, + "loss": -0.0, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 5380 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.94444274902344, + "epoch": 40.45522388059702, + "grad_norm": 0.9647515172942618, + "learning_rate": 3.7381375164501584e-07, + "loss": -0.0002, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 5381 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.80555725097656, + "epoch": 40.46268656716418, + "grad_norm": 0.0, + "learning_rate": 3.737571709266343e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5382 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.63888549804688, + "epoch": 40.47014925373134, + "grad_norm": 0.0, + "learning_rate": 3.7370058181025997e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5383 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.9166717529297, + "epoch": 40.47761194029851, + "grad_norm": 0.0, + "learning_rate": 3.736439842997329e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5384 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.72222900390625, + "epoch": 40.485074626865675, + "grad_norm": 0.0, + "learning_rate": 3.7358737839889356e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5385 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.1666717529297, + "epoch": 40.492537313432834, + "grad_norm": 0.0, + "learning_rate": 3.7353076411158336e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5386 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.94444274902344, + "epoch": 40.5, + "grad_norm": 0.0, + "learning_rate": 3.7347414144164394e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5387 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.22222900390625, + "epoch": 40.507462686567166, + "grad_norm": 2.385122868685392, + "learning_rate": 3.734175103929177e-07, + "loss": -0.0002, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 5388 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.55555725097656, + "epoch": 40.514925373134325, + "grad_norm": 0.0, + "learning_rate": 3.733608709692475e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5389 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.38888549804688, + "epoch": 40.52238805970149, + "grad_norm": 0.0, + "learning_rate": 3.733042231744768e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5390 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.61111450195312, + "epoch": 40.52985074626866, + "grad_norm": 0.0, + "learning_rate": 3.7324756701244974e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5391 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.55555725097656, + "epoch": 40.53731343283582, + "grad_norm": 0.0, + "learning_rate": 3.731909024870108e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5392 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.55555725097656, + "epoch": 40.54477611940298, + "grad_norm": 0.0, + "learning_rate": 3.7313422960200526e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5393 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.7777862548828, + "epoch": 40.55223880597015, + "grad_norm": 0.0, + "learning_rate": 3.730775483612789e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5394 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.97222900390625, + "epoch": 40.559701492537314, + "grad_norm": 0.0, + "learning_rate": 3.7302085876867777e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5395 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.36111450195312, + "epoch": 40.56716417910448, + "grad_norm": 0.0, + "learning_rate": 3.7296416082804906e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5396 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.0, + "epoch": 40.57462686567164, + "grad_norm": 0.0, + "learning_rate": 3.729074545432401e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5397 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.0833282470703, + "epoch": 40.582089552238806, + "grad_norm": 0.0, + "learning_rate": 3.7285073991809884e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5398 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.7777862548828, + "epoch": 40.58955223880597, + "grad_norm": 0.0, + "learning_rate": 3.727940169564739e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5399 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.7777862548828, + "epoch": 40.59701492537313, + "grad_norm": 0.0, + "learning_rate": 3.727372856622144e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5400 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.05555725097656, + "epoch": 40.6044776119403, + "grad_norm": 1.0101852006256908, + "learning_rate": 3.726805460391702e-07, + "loss": 0.0004, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 5401 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.7777862548828, + "epoch": 40.61194029850746, + "grad_norm": 0.0, + "learning_rate": 3.726237980911914e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5402 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.22222900390625, + "epoch": 40.61940298507463, + "grad_norm": 0.0, + "learning_rate": 3.725670418221289e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5403 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.9166717529297, + "epoch": 40.62686567164179, + "grad_norm": 0.0, + "learning_rate": 3.725102772358341e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5404 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.72222900390625, + "epoch": 40.634328358208954, + "grad_norm": 0.0, + "learning_rate": 3.724535043361589e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5405 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.61111450195312, + "epoch": 40.64179104477612, + "grad_norm": 0.0, + "learning_rate": 3.723967231269559e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5406 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.69444274902344, + "epoch": 40.649253731343286, + "grad_norm": 0.0, + "learning_rate": 3.723399336120782e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5407 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.63888549804688, + "epoch": 40.656716417910445, + "grad_norm": 0.0, + "learning_rate": 3.722831357953795e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5408 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.05555725097656, + "epoch": 40.66417910447761, + "grad_norm": 0.6904395750035431, + "learning_rate": 3.722263296807139e-07, + "loss": 0.0007, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5409 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.44444274902344, + "epoch": 40.67164179104478, + "grad_norm": 0.0, + "learning_rate": 3.7216951527193635e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5410 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.8333282470703, + "epoch": 40.67910447761194, + "grad_norm": 0.0, + "learning_rate": 3.7211269257290205e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5411 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.4166717529297, + "epoch": 40.6865671641791, + "grad_norm": 0.0, + "learning_rate": 3.7205586158746693e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5412 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.44444274902344, + "epoch": 40.69402985074627, + "grad_norm": 0.0, + "learning_rate": 3.7199902231948754e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5413 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.5277862548828, + "epoch": 40.701492537313435, + "grad_norm": 0.0, + "learning_rate": 3.719421747728209e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5414 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.4166717529297, + "epoch": 40.708955223880594, + "grad_norm": 0.0, + "learning_rate": 3.7188531895132455e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5415 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.0, + "epoch": 40.71641791044776, + "grad_norm": 0.0, + "learning_rate": 3.7182845485885665e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5416 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.30555725097656, + "epoch": 40.723880597014926, + "grad_norm": 0.0, + "learning_rate": 3.717715824992759e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5417 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.19444274902344, + "epoch": 40.73134328358209, + "grad_norm": 0.0, + "learning_rate": 3.717147018764418e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5418 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.63888549804688, + "epoch": 40.73880597014925, + "grad_norm": 0.0, + "learning_rate": 3.716578129942139e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5419 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.0, + "epoch": 40.74626865671642, + "grad_norm": 0.0, + "learning_rate": 3.7160091585645275e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5420 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.8333282470703, + "epoch": 40.75373134328358, + "grad_norm": 0.0, + "learning_rate": 3.7154401046701934e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5421 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.44444274902344, + "epoch": 40.76119402985075, + "grad_norm": 0.0, + "learning_rate": 3.71487096829775e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5422 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.25, + "epoch": 40.76865671641791, + "grad_norm": 0.0, + "learning_rate": 3.71430174948582e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5423 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.8333282470703, + "epoch": 40.776119402985074, + "grad_norm": 0.0, + "learning_rate": 3.71373244827303e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5424 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.1666717529297, + "epoch": 40.78358208955224, + "grad_norm": 0.0, + "learning_rate": 3.713163064698011e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5425 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.0, + "epoch": 40.791044776119406, + "grad_norm": 0.0, + "learning_rate": 3.712593598799401e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5426 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.5, + "epoch": 40.798507462686565, + "grad_norm": 0.0, + "learning_rate": 3.712024050615843e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5427 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.38888549804688, + "epoch": 40.80597014925373, + "grad_norm": 0.0, + "learning_rate": 3.7114544201859854e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5428 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.6666717529297, + "epoch": 40.8134328358209, + "grad_norm": 0.0, + "learning_rate": 3.7108847075484827e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5429 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.36111450195312, + "epoch": 40.82089552238806, + "grad_norm": 0.0, + "learning_rate": 3.710314912741996e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5430 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.44444274902344, + "epoch": 40.82835820895522, + "grad_norm": 0.0, + "learning_rate": 3.70974503580519e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5431 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.2777862548828, + "epoch": 40.83582089552239, + "grad_norm": 0.0, + "learning_rate": 3.7091750767767347e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5432 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.38888549804688, + "epoch": 40.843283582089555, + "grad_norm": 0.0, + "learning_rate": 3.7086050356953087e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5433 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.94444274902344, + "epoch": 40.850746268656714, + "grad_norm": 0.0, + "learning_rate": 3.7080349125995925e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5434 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.94444274902344, + "epoch": 40.85820895522388, + "grad_norm": 1.0516238158145619, + "learning_rate": 3.707464707528275e-07, + "loss": -0.0004, + "reward": 1.75, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 5435 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.0833282470703, + "epoch": 40.865671641791046, + "grad_norm": 0.0, + "learning_rate": 3.7068944205200494e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5436 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.05555725097656, + "epoch": 40.87313432835821, + "grad_norm": 0.0, + "learning_rate": 3.7063240516136133e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5437 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.0833282470703, + "epoch": 40.88059701492537, + "grad_norm": 0.0, + "learning_rate": 3.7057536008476733e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5438 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.0833282470703, + "epoch": 40.88805970149254, + "grad_norm": 0.0, + "learning_rate": 3.7051830682609377e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5439 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.80555725097656, + "epoch": 40.8955223880597, + "grad_norm": 0.0, + "learning_rate": 3.704612453892123e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5440 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.1666717529297, + "epoch": 40.90298507462687, + "grad_norm": 0.0, + "learning_rate": 3.7040417577799493e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5441 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.05555725097656, + "epoch": 40.91044776119403, + "grad_norm": 0.3362759089996338, + "learning_rate": 3.7034709799631455e-07, + "loss": 0.0003, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5442 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.30555725097656, + "epoch": 40.917910447761194, + "grad_norm": 0.0, + "learning_rate": 3.702900120480441e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5443 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.9166717529297, + "epoch": 40.92537313432836, + "grad_norm": 0.7385425531839496, + "learning_rate": 3.7023291793705746e-07, + "loss": 0.0005, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5444 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.19444274902344, + "epoch": 40.93283582089552, + "grad_norm": 0.0, + "learning_rate": 3.7017581566722904e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5445 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.69444274902344, + "epoch": 40.940298507462686, + "grad_norm": 0.0, + "learning_rate": 3.7011870524243364e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5446 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.9166717529297, + "epoch": 40.94776119402985, + "grad_norm": 0.0, + "learning_rate": 3.7006158666654675e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5447 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.0277862548828, + "epoch": 40.95522388059702, + "grad_norm": 0.0, + "learning_rate": 3.700044599434443e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5448 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.72222900390625, + "epoch": 40.96268656716418, + "grad_norm": 0.0, + "learning_rate": 3.6994732507700285e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5449 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.86111450195312, + "epoch": 40.97014925373134, + "grad_norm": 0.0, + "learning_rate": 3.698901820710995e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5450 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.0, + "epoch": 40.97761194029851, + "grad_norm": 0.0, + "learning_rate": 3.6983303092961183e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5451 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.72222900390625, + "epoch": 40.985074626865675, + "grad_norm": 0.0, + "learning_rate": 3.697758716564182e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5452 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.0, + "epoch": 40.992537313432834, + "grad_norm": 0.0, + "learning_rate": 3.6971870425539723e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5453 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.4166717529297, + "epoch": 41.007462686567166, + "grad_norm": 1.3107989304274796, + "learning_rate": 3.6966152873042825e-07, + "loss": -0.0005, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5454 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.5277862548828, + "epoch": 41.014925373134325, + "grad_norm": 0.0, + "learning_rate": 3.696043450853912e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5455 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.38888549804688, + "epoch": 41.02238805970149, + "grad_norm": 0.0, + "learning_rate": 3.695471533241663e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5456 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.86111450195312, + "epoch": 41.02985074626866, + "grad_norm": 0.0, + "learning_rate": 3.6948995345063465e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5457 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.22222900390625, + "epoch": 41.03731343283582, + "grad_norm": 0.0, + "learning_rate": 3.694327454686777e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5458 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.63888549804688, + "epoch": 41.04477611940298, + "grad_norm": 0.0, + "learning_rate": 3.693755293821776e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5459 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.88888549804688, + "epoch": 41.05223880597015, + "grad_norm": 0.0, + "learning_rate": 3.6931830519501676e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5460 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.88888549804688, + "epoch": 41.059701492537314, + "grad_norm": 0.0, + "learning_rate": 3.692610729110784e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5461 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.44444274902344, + "epoch": 41.06716417910448, + "grad_norm": 0.0, + "learning_rate": 3.692038325342465e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5462 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.22222900390625, + "epoch": 41.07462686567164, + "grad_norm": 0.0, + "learning_rate": 3.6914658406840494e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5463 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.4166717529297, + "epoch": 41.082089552238806, + "grad_norm": 0.0, + "learning_rate": 3.690893275174387e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5464 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.5, + "epoch": 41.08955223880597, + "grad_norm": 0.0, + "learning_rate": 3.690320628852331e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5465 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.13888549804688, + "epoch": 41.09701492537314, + "grad_norm": 0.0, + "learning_rate": 3.6897479017567406e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5466 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.86111450195312, + "epoch": 41.1044776119403, + "grad_norm": 0.0, + "learning_rate": 3.6891750939264796e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5467 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.0833282470703, + "epoch": 41.11194029850746, + "grad_norm": 0.0, + "learning_rate": 3.688602205400419e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5468 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.9166717529297, + "epoch": 41.11940298507463, + "grad_norm": 0.0, + "learning_rate": 3.6880292362174345e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5469 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.94444274902344, + "epoch": 41.12686567164179, + "grad_norm": 0.0, + "learning_rate": 3.687456186416405e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5470 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.47222900390625, + "epoch": 41.134328358208954, + "grad_norm": 0.0, + "learning_rate": 3.686883056036219e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5471 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.6666717529297, + "epoch": 41.14179104477612, + "grad_norm": 0.79326148070755, + "learning_rate": 3.686309845115767e-07, + "loss": 0.0, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5472 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.13888549804688, + "epoch": 41.149253731343286, + "grad_norm": 0.0, + "learning_rate": 3.6857365536939456e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5473 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.0, + "epoch": 41.156716417910445, + "grad_norm": 0.0, + "learning_rate": 3.68516318180966e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5474 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.25, + "epoch": 41.16417910447761, + "grad_norm": 0.0, + "learning_rate": 3.684589729501817e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5475 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.5, + "epoch": 41.17164179104478, + "grad_norm": 0.0, + "learning_rate": 3.6840161968093306e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5476 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.8333282470703, + "epoch": 41.17910447761194, + "grad_norm": 0.0, + "learning_rate": 3.683442583771119e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5477 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.47222900390625, + "epoch": 41.1865671641791, + "grad_norm": 0.0, + "learning_rate": 3.682868890426108e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5478 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.9166717529297, + "epoch": 41.19402985074627, + "grad_norm": 0.0, + "learning_rate": 3.6822951168132284e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5479 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.72222900390625, + "epoch": 41.201492537313435, + "grad_norm": 0.0, + "learning_rate": 3.681721262971413e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5480 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.88888549804688, + "epoch": 41.208955223880594, + "grad_norm": 0.9312585954759461, + "learning_rate": 3.681147328939605e-07, + "loss": 0.0006, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 5481 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.13888549804688, + "epoch": 41.21641791044776, + "grad_norm": 0.0, + "learning_rate": 3.6805733147567494e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5482 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.5277862548828, + "epoch": 41.223880597014926, + "grad_norm": 0.0, + "learning_rate": 3.6799992204617997e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5483 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.30555725097656, + "epoch": 41.23134328358209, + "grad_norm": 0.0, + "learning_rate": 3.6794250460937115e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5484 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.05555725097656, + "epoch": 41.23880597014925, + "grad_norm": 0.0, + "learning_rate": 3.678850791691448e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5485 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.3333282470703, + "epoch": 41.24626865671642, + "grad_norm": 0.0, + "learning_rate": 3.678276457293978e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5486 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.0277862548828, + "epoch": 41.25373134328358, + "grad_norm": 0.0, + "learning_rate": 3.6777020429402744e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5487 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.13888549804688, + "epoch": 41.26119402985075, + "grad_norm": 0.0, + "learning_rate": 3.6771275486693164e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5488 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.86111450195312, + "epoch": 41.26865671641791, + "grad_norm": 0.0, + "learning_rate": 3.6765529745200874e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5489 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.88888549804688, + "epoch": 41.276119402985074, + "grad_norm": 0.0, + "learning_rate": 3.675978320531579e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5490 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.6666717529297, + "epoch": 41.28358208955224, + "grad_norm": 0.0, + "learning_rate": 3.675403586742785e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5491 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.5, + "epoch": 41.291044776119406, + "grad_norm": 1.1171248119166157, + "learning_rate": 3.6748287731927066e-07, + "loss": 0.003, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 5492 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.80555725097656, + "epoch": 41.298507462686565, + "grad_norm": 0.0, + "learning_rate": 3.6742538799203503e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5493 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.2777862548828, + "epoch": 41.30597014925373, + "grad_norm": 0.0, + "learning_rate": 3.673678906964727e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5494 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.1666717529297, + "epoch": 41.3134328358209, + "grad_norm": 0.0, + "learning_rate": 3.673103854364853e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5495 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.6666717529297, + "epoch": 41.32089552238806, + "grad_norm": 0.0, + "learning_rate": 3.6725287221597524e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5496 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.88888549804688, + "epoch": 41.32835820895522, + "grad_norm": 0.648546105393355, + "learning_rate": 3.6719535103884507e-07, + "loss": 0.0005, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 5497 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.0, + "epoch": 41.33582089552239, + "grad_norm": 0.0, + "learning_rate": 3.671378219089983e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5498 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.36111450195312, + "epoch": 41.343283582089555, + "grad_norm": 0.0, + "learning_rate": 3.670802848303386e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5499 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.47222900390625, + "epoch": 41.350746268656714, + "grad_norm": 0.0, + "learning_rate": 3.6702273980677045e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5500 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.72222900390625, + "epoch": 41.35820895522388, + "grad_norm": 0.0, + "learning_rate": 3.6696518684219884e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5501 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.0277862548828, + "epoch": 41.365671641791046, + "grad_norm": 0.0, + "learning_rate": 3.669076259405291e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5502 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.38888549804688, + "epoch": 41.37313432835821, + "grad_norm": 0.0, + "learning_rate": 3.6685005710566733e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5503 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.1666717529297, + "epoch": 41.38059701492537, + "grad_norm": 0.0, + "learning_rate": 3.6679248034152005e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5504 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.44444274902344, + "epoch": 41.38805970149254, + "grad_norm": 1.6128355572723938, + "learning_rate": 3.667348956519942e-07, + "loss": 0.0074, + "reward": 1.8333333730697632, + "reward_std": 0.17526113986968994, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 5505 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.05555725097656, + "epoch": 41.3955223880597, + "grad_norm": 0.0, + "learning_rate": 3.666773030409977e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5506 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.1666717529297, + "epoch": 41.40298507462686, + "grad_norm": 0.0, + "learning_rate": 3.6661970251243854e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5507 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.75, + "epoch": 41.41044776119403, + "grad_norm": 0.0, + "learning_rate": 3.665620940702253e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5508 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.63888549804688, + "epoch": 41.417910447761194, + "grad_norm": 0.0, + "learning_rate": 3.665044777182673e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5509 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.3333282470703, + "epoch": 41.42537313432836, + "grad_norm": 0.0, + "learning_rate": 3.6644685346047445e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5510 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.86111450195312, + "epoch": 41.43283582089552, + "grad_norm": 0.0, + "learning_rate": 3.663892213007569e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5511 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.44444274902344, + "epoch": 41.440298507462686, + "grad_norm": 0.0, + "learning_rate": 3.6633158124302544e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5512 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.1666717529297, + "epoch": 41.44776119402985, + "grad_norm": 0.0, + "learning_rate": 3.662739332911916e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5513 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.55555725097656, + "epoch": 41.45522388059702, + "grad_norm": 0.0, + "learning_rate": 3.6621627744916717e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5514 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.22222900390625, + "epoch": 41.46268656716418, + "grad_norm": 0.0, + "learning_rate": 3.6615861372086473e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5515 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.19444274902344, + "epoch": 41.47014925373134, + "grad_norm": 0.0, + "learning_rate": 3.6610094211019703e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5516 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.94444274902344, + "epoch": 41.47761194029851, + "grad_norm": 0.0, + "learning_rate": 3.660432626210779e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5517 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.94444274902344, + "epoch": 41.485074626865675, + "grad_norm": 0.0, + "learning_rate": 3.6598557525742115e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5518 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.47222900390625, + "epoch": 41.492537313432834, + "grad_norm": 0.0, + "learning_rate": 3.6592788002314134e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5519 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.8333282470703, + "epoch": 41.5, + "grad_norm": 0.0, + "learning_rate": 3.6587017692215385e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5520 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.11111450195312, + "epoch": 41.507462686567166, + "grad_norm": 0.0, + "learning_rate": 3.6581246595837406e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5521 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.0, + "epoch": 41.514925373134325, + "grad_norm": 0.0, + "learning_rate": 3.657547471357183e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5522 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.36111450195312, + "epoch": 41.52238805970149, + "grad_norm": 0.0, + "learning_rate": 3.656970204581033e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5523 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.80555725097656, + "epoch": 41.52985074626866, + "grad_norm": 0.0, + "learning_rate": 3.6563928592944627e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5524 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.94444274902344, + "epoch": 41.53731343283582, + "grad_norm": 0.0, + "learning_rate": 3.65581543553665e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5525 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.8333282470703, + "epoch": 41.54477611940298, + "grad_norm": 0.0, + "learning_rate": 3.6552379333467775e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5526 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.75, + "epoch": 41.55223880597015, + "grad_norm": 0.0, + "learning_rate": 3.6546603527640344e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5527 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.72222900390625, + "epoch": 41.559701492537314, + "grad_norm": 0.0, + "learning_rate": 3.6540826938276154e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5528 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.36111450195312, + "epoch": 41.56716417910448, + "grad_norm": 0.0, + "learning_rate": 3.653504956576717e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5529 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.13888549804688, + "epoch": 41.57462686567164, + "grad_norm": 0.0, + "learning_rate": 3.652927141050548e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5530 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.94444274902344, + "epoch": 41.582089552238806, + "grad_norm": 0.0, + "learning_rate": 3.652349247288313e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5531 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.7777862548828, + "epoch": 41.58955223880597, + "grad_norm": 0.0, + "learning_rate": 3.6517712753292306e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5532 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.7777862548828, + "epoch": 41.59701492537313, + "grad_norm": 0.0, + "learning_rate": 3.65119322521252e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5533 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.72222900390625, + "epoch": 41.6044776119403, + "grad_norm": 0.0, + "learning_rate": 3.650615096977407e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5534 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.13888549804688, + "epoch": 41.61194029850746, + "grad_norm": 0.7780748345451299, + "learning_rate": 3.650036890663124e-07, + "loss": 0.0002, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5535 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.22222900390625, + "epoch": 41.61940298507463, + "grad_norm": 0.0, + "learning_rate": 3.6494586063089043e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5536 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.63888549804688, + "epoch": 41.62686567164179, + "grad_norm": 0.0, + "learning_rate": 3.648880243953992e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5537 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.3333282470703, + "epoch": 41.634328358208954, + "grad_norm": 9.304323708446239, + "learning_rate": 3.6483018036376326e-07, + "loss": 0.0, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5538 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.72222900390625, + "epoch": 41.64179104477612, + "grad_norm": 0.0, + "learning_rate": 3.6477232853990794e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5539 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.94444274902344, + "epoch": 41.649253731343286, + "grad_norm": 0.0, + "learning_rate": 3.6471446892775894e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5540 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.86111450195312, + "epoch": 41.656716417910445, + "grad_norm": 0.0, + "learning_rate": 3.6465660153124243e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5541 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.0277862548828, + "epoch": 41.66417910447761, + "grad_norm": 0.0, + "learning_rate": 3.645987263542854e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5542 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.30555725097656, + "epoch": 41.67164179104478, + "grad_norm": 0.0, + "learning_rate": 3.6454084340081503e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5543 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.47222900390625, + "epoch": 41.67910447761194, + "grad_norm": 0.0, + "learning_rate": 3.6448295267475926e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5544 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.47222900390625, + "epoch": 41.6865671641791, + "grad_norm": 0.0, + "learning_rate": 3.6442505418004643e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5545 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.38888549804688, + "epoch": 41.69402985074627, + "grad_norm": 0.0, + "learning_rate": 3.643671479206055e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5546 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.63888549804688, + "epoch": 41.701492537313435, + "grad_norm": 0.0, + "learning_rate": 3.6430923390036583e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5547 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.19444274902344, + "epoch": 41.708955223880594, + "grad_norm": 0.0, + "learning_rate": 3.6425131212325747e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5548 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.86111450195312, + "epoch": 41.71641791044776, + "grad_norm": 0.0, + "learning_rate": 3.6419338259321086e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5549 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.13888549804688, + "epoch": 41.723880597014926, + "grad_norm": 0.0, + "learning_rate": 3.641354453141571e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5550 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.5, + "epoch": 41.73134328358209, + "grad_norm": 0.0, + "learning_rate": 3.6407750029002763e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5551 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.13888549804688, + "epoch": 41.73880597014925, + "grad_norm": 0.0, + "learning_rate": 3.640195475247546e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5552 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.2777862548828, + "epoch": 41.74626865671642, + "grad_norm": 0.0, + "learning_rate": 3.639615870222705e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5553 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.11111450195312, + "epoch": 41.75373134328358, + "grad_norm": 0.0, + "learning_rate": 3.6390361878650854e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5554 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.36111450195312, + "epoch": 41.76119402985075, + "grad_norm": 0.0, + "learning_rate": 3.638456428214024e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5555 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.11111450195312, + "epoch": 41.76865671641791, + "grad_norm": 0.0, + "learning_rate": 3.6378765913088616e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5556 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.88888549804688, + "epoch": 41.776119402985074, + "grad_norm": 0.0, + "learning_rate": 3.637296677188946e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5557 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.63888549804688, + "epoch": 41.78358208955224, + "grad_norm": 0.0, + "learning_rate": 3.636716685893628e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5558 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.5277862548828, + "epoch": 41.791044776119406, + "grad_norm": 0.7001440079515605, + "learning_rate": 3.6361366174622666e-07, + "loss": -0.0, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 5559 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.72222900390625, + "epoch": 41.798507462686565, + "grad_norm": 0.0, + "learning_rate": 3.6355564719342235e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5560 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.88888549804688, + "epoch": 41.80597014925373, + "grad_norm": 0.0, + "learning_rate": 3.634976249348867e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5561 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.9166717529297, + "epoch": 41.8134328358209, + "grad_norm": 0.0, + "learning_rate": 3.6343959497455703e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5562 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.75, + "epoch": 41.82089552238806, + "grad_norm": 0.0, + "learning_rate": 3.6338155731637104e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5563 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.2777862548828, + "epoch": 41.82835820895522, + "grad_norm": 0.0, + "learning_rate": 3.633235119642673e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5564 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.38888549804688, + "epoch": 41.83582089552239, + "grad_norm": 0.0, + "learning_rate": 3.6326545892218455e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5565 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.75, + "epoch": 41.843283582089555, + "grad_norm": 0.0, + "learning_rate": 3.632073981940622e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5566 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.8333282470703, + "epoch": 41.850746268656714, + "grad_norm": 0.0, + "learning_rate": 3.631493297838403e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5567 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.97222900390625, + "epoch": 41.85820895522388, + "grad_norm": 0.0, + "learning_rate": 3.6309125369545904e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5568 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.63888549804688, + "epoch": 41.865671641791046, + "grad_norm": 0.0, + "learning_rate": 3.6303316993285967e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5569 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.13888549804688, + "epoch": 41.87313432835821, + "grad_norm": 0.0, + "learning_rate": 3.6297507849998346e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5570 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.3333282470703, + "epoch": 41.88059701492537, + "grad_norm": 0.0, + "learning_rate": 3.629169794007725e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5571 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.94444274902344, + "epoch": 41.88805970149254, + "grad_norm": 0.0, + "learning_rate": 3.6285887263916936e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5572 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.5833282470703, + "epoch": 41.8955223880597, + "grad_norm": 1.738805411793712, + "learning_rate": 3.6280075821911693e-07, + "loss": 0.0003, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 5573 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.97222900390625, + "epoch": 41.90298507462687, + "grad_norm": 0.0, + "learning_rate": 3.6274263614455894e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5574 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.0277862548828, + "epoch": 41.91044776119403, + "grad_norm": 0.0, + "learning_rate": 3.626845064194395e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5575 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.5, + "epoch": 41.917910447761194, + "grad_norm": 0.0, + "learning_rate": 3.6262636904770307e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5576 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.9166717529297, + "epoch": 41.92537313432836, + "grad_norm": 0.0, + "learning_rate": 3.625682240332948e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5577 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.5833282470703, + "epoch": 41.93283582089552, + "grad_norm": 0.0, + "learning_rate": 3.625100713801604e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5578 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.30555725097656, + "epoch": 41.940298507462686, + "grad_norm": 0.0, + "learning_rate": 3.6245191109224593e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5579 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.3333282470703, + "epoch": 41.94776119402985, + "grad_norm": 0.0, + "learning_rate": 3.6239374317349816e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5580 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.8333282470703, + "epoch": 41.95522388059702, + "grad_norm": 0.0, + "learning_rate": 3.6233556762786424e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5581 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.2777862548828, + "epoch": 41.96268656716418, + "grad_norm": 0.0, + "learning_rate": 3.62277384459292e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5582 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.5277862548828, + "epoch": 41.97014925373134, + "grad_norm": 0.0, + "learning_rate": 3.622191936717295e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5583 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.86111450195312, + "epoch": 41.97761194029851, + "grad_norm": 0.0, + "learning_rate": 3.621609952691255e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5584 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.44444274902344, + "epoch": 41.985074626865675, + "grad_norm": 0.0, + "learning_rate": 3.621027892554294e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5585 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.25, + "epoch": 41.992537313432834, + "grad_norm": 0.0, + "learning_rate": 3.620445756345909e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5586 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.7777862548828, + "epoch": 42.007462686567166, + "grad_norm": 1.7534476658728397, + "learning_rate": 3.6198635441056024e-07, + "loss": -0.0003, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 5587 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.13888549804688, + "epoch": 42.014925373134325, + "grad_norm": 0.0, + "learning_rate": 3.619281255872884e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5588 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.6666717529297, + "epoch": 42.02238805970149, + "grad_norm": 0.0, + "learning_rate": 3.618698891687265e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5589 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.11111450195312, + "epoch": 42.02985074626866, + "grad_norm": 0.0, + "learning_rate": 3.618116451588266e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5590 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.94444274902344, + "epoch": 42.03731343283582, + "grad_norm": 0.0, + "learning_rate": 3.6175339356154086e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5591 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.5833282470703, + "epoch": 42.04477611940298, + "grad_norm": 0.0, + "learning_rate": 3.616951343808222e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5592 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.1666717529297, + "epoch": 42.05223880597015, + "grad_norm": 0.0, + "learning_rate": 3.6163686762062413e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5593 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.2777862548828, + "epoch": 42.059701492537314, + "grad_norm": 0.0, + "learning_rate": 3.6157859328490034e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5594 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.5277862548828, + "epoch": 42.06716417910448, + "grad_norm": 0.0, + "learning_rate": 3.6152031137760553e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5595 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.5, + "epoch": 42.07462686567164, + "grad_norm": 0.0, + "learning_rate": 3.6146202190269443e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5596 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.72222900390625, + "epoch": 42.082089552238806, + "grad_norm": 0.0, + "learning_rate": 3.6140372486412253e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5597 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.11111450195312, + "epoch": 42.08955223880597, + "grad_norm": 0.6386734519953275, + "learning_rate": 3.6134542026584574e-07, + "loss": 0.0, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5598 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.86111450195312, + "epoch": 42.09701492537314, + "grad_norm": 0.0, + "learning_rate": 3.612871081118206e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5599 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.47222900390625, + "epoch": 42.1044776119403, + "grad_norm": 0.0, + "learning_rate": 3.6122878840600413e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5600 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.05555725097656, + "epoch": 42.11194029850746, + "grad_norm": 0.0, + "learning_rate": 3.6117046115235373e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5601 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.75, + "epoch": 42.11940298507463, + "grad_norm": 0.0, + "learning_rate": 3.6111212635482747e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5602 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.11111450195312, + "epoch": 42.12686567164179, + "grad_norm": 0.0, + "learning_rate": 3.610537840173838e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5603 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.8333282470703, + "epoch": 42.134328358208954, + "grad_norm": 0.9651461824278035, + "learning_rate": 3.609954341439818e-07, + "loss": -0.0258, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 5604 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.0277862548828, + "epoch": 42.14179104477612, + "grad_norm": 0.0, + "learning_rate": 3.6093707673858105e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5605 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.5, + "epoch": 42.149253731343286, + "grad_norm": 0.0, + "learning_rate": 3.608787118051415e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5606 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.9166717529297, + "epoch": 42.156716417910445, + "grad_norm": 0.0, + "learning_rate": 3.608203393476238e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5607 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.19444274902344, + "epoch": 42.16417910447761, + "grad_norm": 0.0, + "learning_rate": 3.607619593699891e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5608 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.86111450195312, + "epoch": 42.17164179104478, + "grad_norm": 0.0, + "learning_rate": 3.607035718761988e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5609 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.3333282470703, + "epoch": 42.17910447761194, + "grad_norm": 0.0, + "learning_rate": 3.606451768702151e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5610 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.8333282470703, + "epoch": 42.1865671641791, + "grad_norm": 0.0, + "learning_rate": 3.605867743560006e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5611 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.47222900390625, + "epoch": 42.19402985074627, + "grad_norm": 0.0, + "learning_rate": 3.605283643375184e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5612 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.86111450195312, + "epoch": 42.201492537313435, + "grad_norm": 0.0, + "learning_rate": 3.6046994681873216e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5613 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.72222900390625, + "epoch": 42.208955223880594, + "grad_norm": 0.0, + "learning_rate": 3.604115218036059e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5614 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.55555725097656, + "epoch": 42.21641791044776, + "grad_norm": 0.0, + "learning_rate": 3.603530892961044e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5615 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.9166717529297, + "epoch": 42.223880597014926, + "grad_norm": 0.0, + "learning_rate": 3.602946493001928e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5616 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.2777862548828, + "epoch": 42.23134328358209, + "grad_norm": 0.0, + "learning_rate": 3.602362018198366e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5617 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.25, + "epoch": 42.23880597014925, + "grad_norm": 0.0, + "learning_rate": 3.6017774685900223e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5618 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.6666717529297, + "epoch": 42.24626865671642, + "grad_norm": 0.0, + "learning_rate": 3.6011928442165615e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5619 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.5833282470703, + "epoch": 42.25373134328358, + "grad_norm": 0.0, + "learning_rate": 3.6006081451176555e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5620 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.0, + "epoch": 42.26119402985075, + "grad_norm": 0.0, + "learning_rate": 3.600023371332982e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5621 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.1666717529297, + "epoch": 42.26865671641791, + "grad_norm": 0.0, + "learning_rate": 3.5994385229022226e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5622 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.94444274902344, + "epoch": 42.276119402985074, + "grad_norm": 0.0, + "learning_rate": 3.5988535998650653e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5623 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.61111450195312, + "epoch": 42.28358208955224, + "grad_norm": 0.0, + "learning_rate": 3.5982686022612e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5624 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.2777862548828, + "epoch": 42.291044776119406, + "grad_norm": 0.0, + "learning_rate": 3.5976835301303267e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.19444274902344, + "epoch": 42.298507462686565, + "grad_norm": 0.0, + "learning_rate": 3.597098383512145e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5626 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.0833282470703, + "epoch": 42.30597014925373, + "grad_norm": 0.0, + "learning_rate": 3.596513162446363e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5627 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.13888549804688, + "epoch": 42.3134328358209, + "grad_norm": 0.0, + "learning_rate": 3.595927866972693e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5628 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.55555725097656, + "epoch": 42.32089552238806, + "grad_norm": 0.0, + "learning_rate": 3.595342497130853e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5629 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.8333282470703, + "epoch": 42.32835820895522, + "grad_norm": 0.0, + "learning_rate": 3.5947570529605657e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5630 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.4166717529297, + "epoch": 42.33582089552239, + "grad_norm": 0.0, + "learning_rate": 3.594171534501557e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5631 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.0277862548828, + "epoch": 42.343283582089555, + "grad_norm": 0.0, + "learning_rate": 3.59358594179356e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5632 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.86111450195312, + "epoch": 42.350746268656714, + "grad_norm": 0.0, + "learning_rate": 3.593000274876313e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5633 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.36111450195312, + "epoch": 42.35820895522388, + "grad_norm": 0.0, + "learning_rate": 3.5924145337895567e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5634 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.8333282470703, + "epoch": 42.365671641791046, + "grad_norm": 0.0, + "learning_rate": 3.591828718573041e-07, + "loss": 0.0, + "reward": 1.6666666269302368, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 1.0, + "step": 5635 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.97222900390625, + "epoch": 42.37313432835821, + "grad_norm": 0.0, + "learning_rate": 3.5912428292665174e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5636 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.7777862548828, + "epoch": 42.38059701492537, + "grad_norm": 0.0, + "learning_rate": 3.590656865909744e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5637 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.0833282470703, + "epoch": 42.38805970149254, + "grad_norm": 0.0, + "learning_rate": 3.5900708285424817e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5638 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.5833282470703, + "epoch": 42.3955223880597, + "grad_norm": 0.0, + "learning_rate": 3.5894847172045e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5639 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.25, + "epoch": 42.40298507462686, + "grad_norm": 0.0, + "learning_rate": 3.5888985319355725e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5640 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.9166717529297, + "epoch": 42.41044776119403, + "grad_norm": 0.0, + "learning_rate": 3.588312272775474e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5641 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.5833282470703, + "epoch": 42.417910447761194, + "grad_norm": 0.0, + "learning_rate": 3.58772593976399e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5642 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.61111450195312, + "epoch": 42.42537313432836, + "grad_norm": 0.0, + "learning_rate": 3.5871395329409064e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5643 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.5, + "epoch": 42.43283582089552, + "grad_norm": 0.0, + "learning_rate": 3.5865530523460163e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5644 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.63888549804688, + "epoch": 42.440298507462686, + "grad_norm": 0.0, + "learning_rate": 3.585966498019119e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5645 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.9166717529297, + "epoch": 42.44776119402985, + "grad_norm": 0.0, + "learning_rate": 3.5853798700000157e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5646 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.94444274902344, + "epoch": 42.45522388059702, + "grad_norm": 0.0, + "learning_rate": 3.5847931683285147e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5647 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.4166717529297, + "epoch": 42.46268656716418, + "grad_norm": 0.0, + "learning_rate": 3.5842063930444285e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5648 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.25, + "epoch": 42.47014925373134, + "grad_norm": 0.0, + "learning_rate": 3.583619544187575e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5649 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.36111450195312, + "epoch": 42.47761194029851, + "grad_norm": 0.0, + "learning_rate": 3.5830326217977777e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5650 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.11111450195312, + "epoch": 42.485074626865675, + "grad_norm": 0.0, + "learning_rate": 3.5824456259148627e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5651 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.63888549804688, + "epoch": 42.492537313432834, + "grad_norm": 0.7807948733545151, + "learning_rate": 3.581858556578664e-07, + "loss": 0.0004, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 5652 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.11111450195312, + "epoch": 42.5, + "grad_norm": 0.566985555996816, + "learning_rate": 3.5812714138290203e-07, + "loss": -0.0079, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 5653 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.8333282470703, + "epoch": 42.507462686567166, + "grad_norm": 0.0, + "learning_rate": 3.580684197705771e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5654 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.5833282470703, + "epoch": 42.514925373134325, + "grad_norm": 0.8389580894979289, + "learning_rate": 3.5800969082487674e-07, + "loss": -0.0002, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 5655 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.75, + "epoch": 42.52238805970149, + "grad_norm": 0.0, + "learning_rate": 3.5795095454978597e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5656 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.0, + "epoch": 42.52985074626866, + "grad_norm": 0.0, + "learning_rate": 3.5789221094929074e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5657 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.80555725097656, + "epoch": 42.53731343283582, + "grad_norm": 0.0, + "learning_rate": 3.5783346002737714e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5658 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.80555725097656, + "epoch": 42.54477611940298, + "grad_norm": 0.0, + "learning_rate": 3.57774701788032e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5659 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.8333282470703, + "epoch": 42.55223880597015, + "grad_norm": 0.0, + "learning_rate": 3.577159362352426e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5660 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.5833282470703, + "epoch": 42.559701492537314, + "grad_norm": 1.1289069432335312, + "learning_rate": 3.5765716337299664e-07, + "loss": 0.0013, + "reward": 1.8333333730697632, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 5661 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.25, + "epoch": 42.56716417910448, + "grad_norm": 0.873918131412606, + "learning_rate": 3.575983832052824e-07, + "loss": 0.0, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 5662 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.5, + "epoch": 42.57462686567164, + "grad_norm": 0.0, + "learning_rate": 3.5753959573608865e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5663 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.7777862548828, + "epoch": 42.582089552238806, + "grad_norm": 0.45899535141914605, + "learning_rate": 3.574808009694046e-07, + "loss": -0.0004, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 5664 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.5833282470703, + "epoch": 42.58955223880597, + "grad_norm": 0.0, + "learning_rate": 3.5742199890921986e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5665 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.19444274902344, + "epoch": 42.59701492537313, + "grad_norm": 0.0, + "learning_rate": 3.573631895595249e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5666 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.05555725097656, + "epoch": 42.6044776119403, + "grad_norm": 0.0, + "learning_rate": 3.573043729243103e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5667 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.97222900390625, + "epoch": 42.61194029850746, + "grad_norm": 0.0, + "learning_rate": 3.572455490075672e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5668 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.0, + "epoch": 42.61940298507463, + "grad_norm": 0.0, + "learning_rate": 3.5718671781328746e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5669 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.8333282470703, + "epoch": 42.62686567164179, + "grad_norm": 0.0, + "learning_rate": 3.571278793454633e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5670 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.5277862548828, + "epoch": 42.634328358208954, + "grad_norm": 0.0, + "learning_rate": 3.570690336080872e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5671 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.3333282470703, + "epoch": 42.64179104477612, + "grad_norm": 0.0, + "learning_rate": 3.570101806051526e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5672 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.88888549804688, + "epoch": 42.649253731343286, + "grad_norm": 0.0, + "learning_rate": 3.56951320340653e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5673 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.80555725097656, + "epoch": 42.656716417910445, + "grad_norm": 0.0, + "learning_rate": 3.5689245281858277e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5674 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.2777862548828, + "epoch": 42.66417910447761, + "grad_norm": 0.0, + "learning_rate": 3.568335780429364e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5675 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.36111450195312, + "epoch": 42.67164179104478, + "grad_norm": 0.0, + "learning_rate": 3.5677469601770916e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5676 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.05555725097656, + "epoch": 42.67910447761194, + "grad_norm": 0.0, + "learning_rate": 3.567158067468966e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5677 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.55555725097656, + "epoch": 42.6865671641791, + "grad_norm": 0.0, + "learning_rate": 3.5665691023449496e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5678 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.0, + "epoch": 42.69402985074627, + "grad_norm": 0.0, + "learning_rate": 3.5659800648450086e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5679 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.0277862548828, + "epoch": 42.701492537313435, + "grad_norm": 0.0, + "learning_rate": 3.5653909550091134e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5680 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.47222900390625, + "epoch": 42.708955223880594, + "grad_norm": 0.0, + "learning_rate": 3.564801772877242e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5681 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.47222900390625, + "epoch": 42.71641791044776, + "grad_norm": 0.0, + "learning_rate": 3.5642125184893734e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5682 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.05555725097656, + "epoch": 42.723880597014926, + "grad_norm": 0.0, + "learning_rate": 3.563623191885495e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5683 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.7777862548828, + "epoch": 42.73134328358209, + "grad_norm": 0.0, + "learning_rate": 3.563033793105599e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5684 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.5277862548828, + "epoch": 42.73880597014925, + "grad_norm": 0.0, + "learning_rate": 3.562444322189677e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5685 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.61111450195312, + "epoch": 42.74626865671642, + "grad_norm": 0.0, + "learning_rate": 3.5618547791777335e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5686 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.25, + "epoch": 42.75373134328358, + "grad_norm": 0.0, + "learning_rate": 3.561265164109772e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5687 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.22222900390625, + "epoch": 42.76119402985075, + "grad_norm": 0.0, + "learning_rate": 3.5606754770258046e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5688 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.38888549804688, + "epoch": 42.76865671641791, + "grad_norm": 0.0, + "learning_rate": 3.560085717965846e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5689 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.86111450195312, + "epoch": 42.776119402985074, + "grad_norm": 1.960337810131778, + "learning_rate": 3.5594958869699154e-07, + "loss": 0.0006, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5690 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.47222900390625, + "epoch": 42.78358208955224, + "grad_norm": 0.0, + "learning_rate": 3.55890598407804e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5691 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.1666717529297, + "epoch": 42.791044776119406, + "grad_norm": 0.0, + "learning_rate": 3.558316009330248e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5692 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.97222900390625, + "epoch": 42.798507462686565, + "grad_norm": 0.0, + "learning_rate": 3.5577259627665743e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5693 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.86111450195312, + "epoch": 42.80597014925373, + "grad_norm": 0.0, + "learning_rate": 3.55713584442706e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5694 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.2777862548828, + "epoch": 42.8134328358209, + "grad_norm": 0.0, + "learning_rate": 3.5565456543517485e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5695 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.25, + "epoch": 42.82089552238806, + "grad_norm": 0.0, + "learning_rate": 3.55595539258069e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5696 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.0833282470703, + "epoch": 42.82835820895522, + "grad_norm": 0.0, + "learning_rate": 3.5553650591539385e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5697 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.88888549804688, + "epoch": 42.83582089552239, + "grad_norm": 0.0, + "learning_rate": 3.554774654111554e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5698 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.2777862548828, + "epoch": 42.843283582089555, + "grad_norm": 0.0, + "learning_rate": 3.554184177493599e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5699 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.72222900390625, + "epoch": 42.850746268656714, + "grad_norm": 0.0, + "learning_rate": 3.553593629340144e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5700 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.38888549804688, + "epoch": 42.85820895522388, + "grad_norm": 0.0, + "learning_rate": 3.553003009691262e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5701 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.11111450195312, + "epoch": 42.865671641791046, + "grad_norm": 0.0, + "learning_rate": 3.552412318587031e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5702 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.38888549804688, + "epoch": 42.87313432835821, + "grad_norm": 0.0, + "learning_rate": 3.551821556067537e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5703 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.11111450195312, + "epoch": 42.88059701492537, + "grad_norm": 0.0, + "learning_rate": 3.5512307221728655e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5704 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.63888549804688, + "epoch": 42.88805970149254, + "grad_norm": 0.0, + "learning_rate": 3.5506398169431105e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5705 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.7777862548828, + "epoch": 42.8955223880597, + "grad_norm": 0.0, + "learning_rate": 3.5500488404183715e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5706 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.30555725097656, + "epoch": 42.90298507462687, + "grad_norm": 0.0, + "learning_rate": 3.5494577926387493e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5707 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.88888549804688, + "epoch": 42.91044776119403, + "grad_norm": 16.54448641388775, + "learning_rate": 3.5488666736443535e-07, + "loss": -0.0002, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 5708 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.38888549804688, + "epoch": 42.917910447761194, + "grad_norm": 0.0, + "learning_rate": 3.5482754834752947e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5709 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.97222900390625, + "epoch": 42.92537313432836, + "grad_norm": 0.0, + "learning_rate": 3.547684222171691e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5710 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.97222900390625, + "epoch": 42.93283582089552, + "grad_norm": 0.0, + "learning_rate": 3.547092889773666e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5711 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.97222900390625, + "epoch": 42.940298507462686, + "grad_norm": 0.0, + "learning_rate": 3.5465014863213436e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5712 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.30555725097656, + "epoch": 42.94776119402985, + "grad_norm": 0.0, + "learning_rate": 3.545910011854859e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5713 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.47222900390625, + "epoch": 42.95522388059702, + "grad_norm": 0.0, + "learning_rate": 3.545318466414347e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5714 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.75, + "epoch": 42.96268656716418, + "grad_norm": 0.0, + "learning_rate": 3.5447268500399495e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5715 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.0, + "epoch": 42.97014925373134, + "grad_norm": 0.0, + "learning_rate": 3.544135162771813e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5716 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.94444274902344, + "epoch": 42.97761194029851, + "grad_norm": 0.0, + "learning_rate": 3.543543404650087e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5717 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.88888549804688, + "epoch": 42.985074626865675, + "grad_norm": 0.0, + "learning_rate": 3.542951575714931e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5718 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.9166717529297, + "epoch": 42.992537313432834, + "grad_norm": 0.0, + "learning_rate": 3.542359676006502e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5719 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.75, + "epoch": 43.007462686567166, + "grad_norm": 0.0, + "learning_rate": 3.541767705564966e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5720 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.36111450195312, + "epoch": 43.014925373134325, + "grad_norm": 0.0, + "learning_rate": 3.541175664430496e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5721 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.61111450195312, + "epoch": 43.02238805970149, + "grad_norm": 0.0, + "learning_rate": 3.540583552643264e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5722 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.9166717529297, + "epoch": 43.02985074626866, + "grad_norm": 0.0, + "learning_rate": 3.539991370243452e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5723 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.55555725097656, + "epoch": 43.03731343283582, + "grad_norm": 0.0, + "learning_rate": 3.539399117271243e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5724 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.5, + "epoch": 43.04477611940298, + "grad_norm": 0.0, + "learning_rate": 3.5388067937668293e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5725 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.22222900390625, + "epoch": 43.05223880597015, + "grad_norm": 0.0, + "learning_rate": 3.538214399770402e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5726 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.0833282470703, + "epoch": 43.059701492537314, + "grad_norm": 0.0, + "learning_rate": 3.5376219353221613e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5727 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.5277862548828, + "epoch": 43.06716417910448, + "grad_norm": 0.0, + "learning_rate": 3.5370294004623113e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5728 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.61111450195312, + "epoch": 43.07462686567164, + "grad_norm": 0.0, + "learning_rate": 3.5364367952310604e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5729 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.88888549804688, + "epoch": 43.082089552238806, + "grad_norm": 0.0, + "learning_rate": 3.5358441196686215e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5730 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.69444274902344, + "epoch": 43.08955223880597, + "grad_norm": 0.949728582685212, + "learning_rate": 3.535251373815214e-07, + "loss": 0.0005, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 5731 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.97222900390625, + "epoch": 43.09701492537314, + "grad_norm": 0.0, + "learning_rate": 3.5346585577110597e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5732 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.7777862548828, + "epoch": 43.1044776119403, + "grad_norm": 0.0, + "learning_rate": 3.534065671396388e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5733 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.69444274902344, + "epoch": 43.11194029850746, + "grad_norm": 0.0, + "learning_rate": 3.5334727149114287e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5734 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.38888549804688, + "epoch": 43.11940298507463, + "grad_norm": 1.3412095432745677, + "learning_rate": 3.532879688296421e-07, + "loss": -0.0, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 5735 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.86111450195312, + "epoch": 43.12686567164179, + "grad_norm": 0.0, + "learning_rate": 3.5322865915916056e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5736 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.44444274902344, + "epoch": 43.134328358208954, + "grad_norm": 0.0, + "learning_rate": 3.53169342483723e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5737 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.2777862548828, + "epoch": 43.14179104477612, + "grad_norm": 0.0, + "learning_rate": 3.5311001880735456e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5738 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.5, + "epoch": 43.149253731343286, + "grad_norm": 0.0, + "learning_rate": 3.530506881340809e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5739 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.13888549804688, + "epoch": 43.156716417910445, + "grad_norm": 0.0, + "learning_rate": 3.529913504679281e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5740 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.13888549804688, + "epoch": 43.16417910447761, + "grad_norm": 0.0, + "learning_rate": 3.5293200581292266e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5741 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.25, + "epoch": 43.17164179104478, + "grad_norm": 0.7934333278989775, + "learning_rate": 3.5287265417309177e-07, + "loss": 0.0004, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5742 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.19444274902344, + "epoch": 43.17910447761194, + "grad_norm": 0.0, + "learning_rate": 3.5281329555246277e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5743 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.47222900390625, + "epoch": 43.1865671641791, + "grad_norm": 0.0, + "learning_rate": 3.527539299550637e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5744 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.47222900390625, + "epoch": 43.19402985074627, + "grad_norm": 0.0, + "learning_rate": 3.526945573849232e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5745 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.1666717529297, + "epoch": 43.201492537313435, + "grad_norm": 0.0, + "learning_rate": 3.5263517784607e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5746 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.94444274902344, + "epoch": 43.208955223880594, + "grad_norm": 0.0, + "learning_rate": 3.525757913425337e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5747 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.05555725097656, + "epoch": 43.21641791044776, + "grad_norm": 0.0, + "learning_rate": 3.52516397878344e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5748 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.22222900390625, + "epoch": 43.223880597014926, + "grad_norm": 0.0, + "learning_rate": 3.5245699745753134e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5749 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.7777862548828, + "epoch": 43.23134328358209, + "grad_norm": 0.0, + "learning_rate": 3.523975900841266e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5750 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.61111450195312, + "epoch": 43.23880597014925, + "grad_norm": 0.0, + "learning_rate": 3.52338175762161e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5751 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.13888549804688, + "epoch": 43.24626865671642, + "grad_norm": 0.0, + "learning_rate": 3.522787544956665e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5752 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.2777862548828, + "epoch": 43.25373134328358, + "grad_norm": 0.0, + "learning_rate": 3.52219326288675e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5753 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.8333282470703, + "epoch": 43.26119402985075, + "grad_norm": 0.0, + "learning_rate": 3.5215989114521954e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5754 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.2777862548828, + "epoch": 43.26865671641791, + "grad_norm": 0.0, + "learning_rate": 3.521004490693331e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5755 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.1666717529297, + "epoch": 43.276119402985074, + "grad_norm": 0.71102772385262, + "learning_rate": 3.5204100006504953e-07, + "loss": -0.0003, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 5756 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.63888549804688, + "epoch": 43.28358208955224, + "grad_norm": 0.0, + "learning_rate": 3.519815441364028e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5757 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.13888549804688, + "epoch": 43.291044776119406, + "grad_norm": 0.0, + "learning_rate": 3.519220812874276e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5758 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.8333282470703, + "epoch": 43.298507462686565, + "grad_norm": 0.0, + "learning_rate": 3.518626115221589e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5759 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.88888549804688, + "epoch": 43.30597014925373, + "grad_norm": 0.0, + "learning_rate": 3.5180313484463235e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5760 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.72222900390625, + "epoch": 43.3134328358209, + "grad_norm": 0.0, + "learning_rate": 3.517436512588839e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5761 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.7777862548828, + "epoch": 43.32089552238806, + "grad_norm": 0.0, + "learning_rate": 3.516841607689501e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5762 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.7777862548828, + "epoch": 43.32835820895522, + "grad_norm": 0.0, + "learning_rate": 3.516246633788677e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5763 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.8333282470703, + "epoch": 43.33582089552239, + "grad_norm": 0.0, + "learning_rate": 3.515651590926743e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5764 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.5277862548828, + "epoch": 43.343283582089555, + "grad_norm": 0.0, + "learning_rate": 3.5150564791440763e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5765 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.97222900390625, + "epoch": 43.350746268656714, + "grad_norm": 0.0, + "learning_rate": 3.514461298481062e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5766 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.6666717529297, + "epoch": 43.35820895522388, + "grad_norm": 0.0, + "learning_rate": 3.513866048978088e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5767 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.44444274902344, + "epoch": 43.365671641791046, + "grad_norm": 0.0, + "learning_rate": 3.5132707306755456e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5768 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.1666717529297, + "epoch": 43.37313432835821, + "grad_norm": 0.0, + "learning_rate": 3.512675343613834e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5769 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.8888931274414, + "epoch": 43.38059701492537, + "grad_norm": 0.0, + "learning_rate": 3.512079887833354e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5770 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.1666717529297, + "epoch": 43.38805970149254, + "grad_norm": 2.8125284632923875, + "learning_rate": 3.511484363374513e-07, + "loss": 0.0035, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5771 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.36111450195312, + "epoch": 43.3955223880597, + "grad_norm": 0.0, + "learning_rate": 3.510888770277723e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5772 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.8333282470703, + "epoch": 43.40298507462686, + "grad_norm": 0.0, + "learning_rate": 3.5102931085833996e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5773 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.55555725097656, + "epoch": 43.41044776119403, + "grad_norm": 0.0, + "learning_rate": 3.5096973783319637e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5774 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.30555725097656, + "epoch": 43.417910447761194, + "grad_norm": 0.0, + "learning_rate": 3.50910157956384e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5775 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.11111450195312, + "epoch": 43.42537313432836, + "grad_norm": 0.0, + "learning_rate": 3.5085057123194605e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5776 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.5, + "epoch": 43.43283582089552, + "grad_norm": 0.0, + "learning_rate": 3.507909776639258e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5777 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.69444274902344, + "epoch": 43.440298507462686, + "grad_norm": 0.0, + "learning_rate": 3.5073137725636726e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5778 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.86111450195312, + "epoch": 43.44776119402985, + "grad_norm": 0.0, + "learning_rate": 3.506717700133149e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5779 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.7777862548828, + "epoch": 43.45522388059702, + "grad_norm": 0.0, + "learning_rate": 3.5061215593881344e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5780 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.05555725097656, + "epoch": 43.46268656716418, + "grad_norm": 0.0, + "learning_rate": 3.505525350369083e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5781 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.5833282470703, + "epoch": 43.47014925373134, + "grad_norm": 0.0, + "learning_rate": 3.504929073116452e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5782 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.11111450195312, + "epoch": 43.47761194029851, + "grad_norm": 1.2248295560565483, + "learning_rate": 3.5043327276707055e-07, + "loss": 0.0002, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 5783 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.9166717529297, + "epoch": 43.485074626865675, + "grad_norm": 0.479830687085821, + "learning_rate": 3.5037363140723096e-07, + "loss": -0.0001, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5784 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.0, + "epoch": 43.492537313432834, + "grad_norm": 0.0, + "learning_rate": 3.503139832361736e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5785 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.19444274902344, + "epoch": 43.5, + "grad_norm": 0.0, + "learning_rate": 3.502543282579463e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5786 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.72222900390625, + "epoch": 43.507462686567166, + "grad_norm": 0.1917180532426601, + "learning_rate": 3.5019466647659677e-07, + "loss": 0.0018, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5787 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.47222900390625, + "epoch": 43.514925373134325, + "grad_norm": 0.0, + "learning_rate": 3.501349978961739e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5788 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.11111450195312, + "epoch": 43.52238805970149, + "grad_norm": 0.0, + "learning_rate": 3.500753225207267e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5789 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.19444274902344, + "epoch": 43.52985074626866, + "grad_norm": 0.0, + "learning_rate": 3.5001564035430453e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5790 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.19444274902344, + "epoch": 43.53731343283582, + "grad_norm": 0.0, + "learning_rate": 3.4995595140095747e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5791 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.5, + "epoch": 43.54477611940298, + "grad_norm": 0.0, + "learning_rate": 3.498962556647358e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5792 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.7777862548828, + "epoch": 43.55223880597015, + "grad_norm": 0.0, + "learning_rate": 3.4983655314969047e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5793 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.9166717529297, + "epoch": 43.559701492537314, + "grad_norm": 0.0, + "learning_rate": 3.497768438598728e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5794 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.05555725097656, + "epoch": 43.56716417910448, + "grad_norm": 0.0, + "learning_rate": 3.4971712779933455e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5795 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.61111450195312, + "epoch": 43.57462686567164, + "grad_norm": 0.0, + "learning_rate": 3.4965740497212804e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5796 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.1666717529297, + "epoch": 43.582089552238806, + "grad_norm": 0.0, + "learning_rate": 3.495976753823059e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5797 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.0, + "epoch": 43.58955223880597, + "grad_norm": 0.0, + "learning_rate": 3.4953793903392137e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5798 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.11111450195312, + "epoch": 43.59701492537313, + "grad_norm": 0.0, + "learning_rate": 3.49478195931028e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5799 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.97222900390625, + "epoch": 43.6044776119403, + "grad_norm": 0.0, + "learning_rate": 3.4941844607768e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5800 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.9166717529297, + "epoch": 43.61194029850746, + "grad_norm": 0.0, + "learning_rate": 3.493586894779318e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5801 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.13888549804688, + "epoch": 43.61940298507463, + "grad_norm": 0.0, + "learning_rate": 3.492989261358383e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5802 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.22222900390625, + "epoch": 43.62686567164179, + "grad_norm": 0.0, + "learning_rate": 3.4923915605545533e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5803 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.22222900390625, + "epoch": 43.634328358208954, + "grad_norm": 0.0, + "learning_rate": 3.491793792408384e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5804 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.22222900390625, + "epoch": 43.64179104477612, + "grad_norm": 0.0, + "learning_rate": 3.491195956960441e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5805 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.1666717529297, + "epoch": 43.649253731343286, + "grad_norm": 0.0, + "learning_rate": 3.490598054251292e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5806 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.0, + "epoch": 43.656716417910445, + "grad_norm": 0.0, + "learning_rate": 3.4900000843215107e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5807 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.6666717529297, + "epoch": 43.66417910447761, + "grad_norm": 0.0, + "learning_rate": 3.4894020472116724e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5808 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.2777862548828, + "epoch": 43.67164179104478, + "grad_norm": 0.0, + "learning_rate": 3.488803942962361e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5809 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.38888549804688, + "epoch": 43.67910447761194, + "grad_norm": 0.0, + "learning_rate": 3.488205771614163e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5810 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.61111450195312, + "epoch": 43.6865671641791, + "grad_norm": 0.0, + "learning_rate": 3.487607533207669e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5811 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.38888549804688, + "epoch": 43.69402985074627, + "grad_norm": 0.0, + "learning_rate": 3.4870092277834747e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5812 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.8333282470703, + "epoch": 43.701492537313435, + "grad_norm": 0.0, + "learning_rate": 3.4864108553821797e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5813 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.7777862548828, + "epoch": 43.708955223880594, + "grad_norm": 0.5415874177877509, + "learning_rate": 3.48581241604439e-07, + "loss": -0.0004, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5814 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.80555725097656, + "epoch": 43.71641791044776, + "grad_norm": 0.0, + "learning_rate": 3.4852139098107137e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5815 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.13888549804688, + "epoch": 43.723880597014926, + "grad_norm": 0.0, + "learning_rate": 3.4846153367217654e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5816 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.11111450195312, + "epoch": 43.73134328358209, + "grad_norm": 0.0, + "learning_rate": 3.484016696818163e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5817 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.75, + "epoch": 43.73880597014925, + "grad_norm": 0.0, + "learning_rate": 3.4834179901405293e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5818 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.55555725097656, + "epoch": 43.74626865671642, + "grad_norm": 0.0, + "learning_rate": 3.482819216729492e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5819 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.0833282470703, + "epoch": 43.75373134328358, + "grad_norm": 0.0, + "learning_rate": 3.482220376625683e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5820 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.83333587646484, + "epoch": 43.76119402985075, + "grad_norm": 0.0, + "learning_rate": 3.4816214698697387e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5821 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.5, + "epoch": 43.76865671641791, + "grad_norm": 1.2952206564320443, + "learning_rate": 3.4810224965023e-07, + "loss": 0.0006, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 5822 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.1666717529297, + "epoch": 43.776119402985074, + "grad_norm": 0.0, + "learning_rate": 3.4804234565640127e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5823 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.9166717529297, + "epoch": 43.78358208955224, + "grad_norm": 0.0, + "learning_rate": 3.479824350095527e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5824 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.13888549804688, + "epoch": 43.791044776119406, + "grad_norm": 0.0, + "learning_rate": 3.479225177137497e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5825 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.47222900390625, + "epoch": 43.798507462686565, + "grad_norm": 0.0, + "learning_rate": 3.4786259377305815e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5826 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.55555725097656, + "epoch": 43.80597014925373, + "grad_norm": 0.9389041902887768, + "learning_rate": 3.4780266319154454e-07, + "loss": -0.0022, + "reward": 1.9444444179534912, + "reward_std": 0.1111111119389534, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 5827 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.19444274902344, + "epoch": 43.8134328358209, + "grad_norm": 0.0, + "learning_rate": 3.4774272597327553e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5828 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.86111450195312, + "epoch": 43.82089552238806, + "grad_norm": 0.0, + "learning_rate": 3.4768278212231837e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5829 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.72222900390625, + "epoch": 43.82835820895522, + "grad_norm": 0.0, + "learning_rate": 3.4762283164274097e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5830 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.0277862548828, + "epoch": 43.83582089552239, + "grad_norm": 0.0, + "learning_rate": 3.475628745386113e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5831 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.5, + "epoch": 43.843283582089555, + "grad_norm": 0.0, + "learning_rate": 3.4750291081399795e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5832 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.47222900390625, + "epoch": 43.850746268656714, + "grad_norm": 0.0, + "learning_rate": 3.474429404729701e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5833 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.5, + "epoch": 43.85820895522388, + "grad_norm": 0.0, + "learning_rate": 3.4738296351959715e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5834 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.11111450195312, + "epoch": 43.865671641791046, + "grad_norm": 0.0, + "learning_rate": 3.473229799579492e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5835 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.75, + "epoch": 43.87313432835821, + "grad_norm": 0.0, + "learning_rate": 3.4726298979209645e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5836 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.05555725097656, + "epoch": 43.88059701492537, + "grad_norm": 0.8516581687136117, + "learning_rate": 3.472029930261099e-07, + "loss": 0.003, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 5837 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.2777862548828, + "epoch": 43.88805970149254, + "grad_norm": 0.0, + "learning_rate": 3.471429896640609e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5838 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.5833282470703, + "epoch": 43.8955223880597, + "grad_norm": 2.6292908194510485, + "learning_rate": 3.470829797100209e-07, + "loss": -0.0001, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5839 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.5833282470703, + "epoch": 43.90298507462687, + "grad_norm": 0.0, + "learning_rate": 3.470229631680624e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5840 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.3333282470703, + "epoch": 43.91044776119403, + "grad_norm": 0.0, + "learning_rate": 3.4696294004225795e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5841 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.7777862548828, + "epoch": 43.917910447761194, + "grad_norm": 0.0, + "learning_rate": 3.469029103366806e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5842 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.7777862548828, + "epoch": 43.92537313432836, + "grad_norm": 0.0, + "learning_rate": 3.468428740554039e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5843 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.5277862548828, + "epoch": 43.93283582089552, + "grad_norm": 0.0, + "learning_rate": 3.467828312025019e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5844 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.97222900390625, + "epoch": 43.940298507462686, + "grad_norm": 0.0, + "learning_rate": 3.4672278178204895e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5845 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.80555725097656, + "epoch": 43.94776119402985, + "grad_norm": 0.0, + "learning_rate": 3.4666272579811986e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5846 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.4166717529297, + "epoch": 43.95522388059702, + "grad_norm": 0.0, + "learning_rate": 3.4660266325479e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5847 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.1666717529297, + "epoch": 43.96268656716418, + "grad_norm": 0.0, + "learning_rate": 3.465425941561353e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5848 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.6666717529297, + "epoch": 43.97014925373134, + "grad_norm": 0.0, + "learning_rate": 3.464825185062317e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5849 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.4166717529297, + "epoch": 43.97761194029851, + "grad_norm": 0.0, + "learning_rate": 3.46422436309156e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5850 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.25, + "epoch": 43.985074626865675, + "grad_norm": 0.6764975718877118, + "learning_rate": 3.4636234756898536e-07, + "loss": 0.0, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 5851 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.33334350585938, + "epoch": 43.992537313432834, + "grad_norm": 0.0, + "learning_rate": 3.4630225228979717e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5852 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.63888549804688, + "epoch": 44.007462686567166, + "grad_norm": 0.0, + "learning_rate": 3.4624215047566943e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5853 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.0, + "epoch": 44.014925373134325, + "grad_norm": 0.0, + "learning_rate": 3.4618204213068066e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5854 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.2777862548828, + "epoch": 44.02238805970149, + "grad_norm": 0.0, + "learning_rate": 3.4612192725890963e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5855 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.2777862548828, + "epoch": 44.02985074626866, + "grad_norm": 0.0, + "learning_rate": 3.460618058644357e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5856 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.0, + "epoch": 44.03731343283582, + "grad_norm": 0.0, + "learning_rate": 3.460016779513387e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5857 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.1666717529297, + "epoch": 44.04477611940298, + "grad_norm": 0.0, + "learning_rate": 3.4594154352369874e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5858 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.36111450195312, + "epoch": 44.05223880597015, + "grad_norm": 0.0, + "learning_rate": 3.4588140258559647e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5859 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.8333282470703, + "epoch": 44.059701492537314, + "grad_norm": 0.0, + "learning_rate": 3.4582125514111294e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5860 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.75, + "epoch": 44.06716417910448, + "grad_norm": 0.0, + "learning_rate": 3.4576110119432976e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5861 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.8333282470703, + "epoch": 44.07462686567164, + "grad_norm": 0.0, + "learning_rate": 3.457009407493288e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5862 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.5277862548828, + "epoch": 44.082089552238806, + "grad_norm": 0.0, + "learning_rate": 3.4564077381019253e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5863 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.4166717529297, + "epoch": 44.08955223880597, + "grad_norm": 0.0, + "learning_rate": 3.4558060038100384e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5864 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.61111450195312, + "epoch": 44.09701492537314, + "grad_norm": 0.0, + "learning_rate": 3.455204204658459e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5865 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.47222900390625, + "epoch": 44.1044776119403, + "grad_norm": 0.0, + "learning_rate": 3.454602340688025e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5866 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.44444274902344, + "epoch": 44.11194029850746, + "grad_norm": 0.0, + "learning_rate": 3.4540004119395784e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5867 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.88888549804688, + "epoch": 44.11940298507463, + "grad_norm": 0.0, + "learning_rate": 3.453398418453964e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5868 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.55555725097656, + "epoch": 44.12686567164179, + "grad_norm": 0.0, + "learning_rate": 3.4527963602720345e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5869 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.7777862548828, + "epoch": 44.134328358208954, + "grad_norm": 0.0, + "learning_rate": 3.452194237434642e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5870 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.94444274902344, + "epoch": 44.14179104477612, + "grad_norm": 0.0, + "learning_rate": 3.451592049982648e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5871 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.0833282470703, + "epoch": 44.149253731343286, + "grad_norm": 0.0, + "learning_rate": 3.4509897979569146e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5872 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.9166717529297, + "epoch": 44.156716417910445, + "grad_norm": 0.0, + "learning_rate": 3.450387481398311e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5873 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.5277862548828, + "epoch": 44.16417910447761, + "grad_norm": 0.0, + "learning_rate": 3.4497851003477087e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5874 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.30555725097656, + "epoch": 44.17164179104478, + "grad_norm": 0.0, + "learning_rate": 3.449182654845985e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5875 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.6666717529297, + "epoch": 44.17910447761194, + "grad_norm": 0.0, + "learning_rate": 3.4485801449340207e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5876 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.0277862548828, + "epoch": 44.1865671641791, + "grad_norm": 0.0, + "learning_rate": 3.4479775706527017e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5877 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.0, + "epoch": 44.19402985074627, + "grad_norm": 0.0, + "learning_rate": 3.447374932042917e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5878 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.30555725097656, + "epoch": 44.201492537313435, + "grad_norm": 0.0, + "learning_rate": 3.4467722291455616e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5879 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.4166717529297, + "epoch": 44.208955223880594, + "grad_norm": 0.0, + "learning_rate": 3.4461694620015337e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5880 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.19444274902344, + "epoch": 44.21641791044776, + "grad_norm": 0.0, + "learning_rate": 3.4455666306517365e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5881 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.0277862548828, + "epoch": 44.223880597014926, + "grad_norm": 0.0, + "learning_rate": 3.444963735137078e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5882 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.55555725097656, + "epoch": 44.23134328358209, + "grad_norm": 0.0, + "learning_rate": 3.444360775498468e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5883 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.13888549804688, + "epoch": 44.23880597014925, + "grad_norm": 0.0, + "learning_rate": 3.4437577517768246e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5884 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.2777862548828, + "epoch": 44.24626865671642, + "grad_norm": 0.0, + "learning_rate": 3.443154664013067e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5885 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.6666717529297, + "epoch": 44.25373134328358, + "grad_norm": 0.0, + "learning_rate": 3.4425515122481205e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5886 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.69444274902344, + "epoch": 44.26119402985075, + "grad_norm": 0.0, + "learning_rate": 3.441948296522913e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5887 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.75, + "epoch": 44.26865671641791, + "grad_norm": 0.0, + "learning_rate": 3.4413450168783794e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5888 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.94444274902344, + "epoch": 44.276119402985074, + "grad_norm": 0.0, + "learning_rate": 3.440741673355456e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5889 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.0, + "epoch": 44.28358208955224, + "grad_norm": 0.0, + "learning_rate": 3.440138265995086e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5890 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.83333587646484, + "epoch": 44.291044776119406, + "grad_norm": 0.0, + "learning_rate": 3.4395347948382154e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5891 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.36111450195312, + "epoch": 44.298507462686565, + "grad_norm": 0.0, + "learning_rate": 3.4389312599257946e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5892 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.1666717529297, + "epoch": 44.30597014925373, + "grad_norm": 0.0, + "learning_rate": 3.4383276612987795e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5893 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.13888549804688, + "epoch": 44.3134328358209, + "grad_norm": 0.0, + "learning_rate": 3.437723998998129e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5894 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.2777862548828, + "epoch": 44.32089552238806, + "grad_norm": 0.0, + "learning_rate": 3.4371202730648065e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5895 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.0833282470703, + "epoch": 44.32835820895522, + "grad_norm": 0.0, + "learning_rate": 3.43651648353978e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5896 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.0833282470703, + "epoch": 44.33582089552239, + "grad_norm": 0.0, + "learning_rate": 3.435912630464022e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5897 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.11111450195312, + "epoch": 44.343283582089555, + "grad_norm": 0.0, + "learning_rate": 3.4353087138785103e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5898 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.22222900390625, + "epoch": 44.350746268656714, + "grad_norm": 0.0, + "learning_rate": 3.434704733824224e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5899 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.55555725097656, + "epoch": 44.35820895522388, + "grad_norm": 0.0, + "learning_rate": 3.4341006903421485e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5900 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.30555725097656, + "epoch": 44.365671641791046, + "grad_norm": 0.0, + "learning_rate": 3.4334965834732755e-07, + "loss": 0.0, + "reward": 1.5555555820465088, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5555555820465088, + "rewards/format_reward": 1.0, + "step": 5901 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.88888549804688, + "epoch": 44.37313432835821, + "grad_norm": 1.4365693847633094, + "learning_rate": 3.4328924132585966e-07, + "loss": -0.0003, + "reward": 1.9166666269302368, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/format_reward": 1.0, + "step": 5902 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.94444274902344, + "epoch": 44.38059701492537, + "grad_norm": 0.0, + "learning_rate": 3.4322881797391116e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5903 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.0833282470703, + "epoch": 44.38805970149254, + "grad_norm": 1.1737023007763823, + "learning_rate": 3.4316838829558215e-07, + "loss": -0.0002, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 5904 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.97222900390625, + "epoch": 44.3955223880597, + "grad_norm": 0.0, + "learning_rate": 3.4310795229497334e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5905 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.88888549804688, + "epoch": 44.40298507462686, + "grad_norm": 0.0, + "learning_rate": 3.4304750997618595e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5906 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.5, + "epoch": 44.41044776119403, + "grad_norm": 0.46172175113590486, + "learning_rate": 3.4298706134332136e-07, + "loss": 0.0014, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5907 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.13888549804688, + "epoch": 44.417910447761194, + "grad_norm": 0.0, + "learning_rate": 3.429266064004816e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5908 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.36111450195312, + "epoch": 44.42537313432836, + "grad_norm": 0.0, + "learning_rate": 3.428661451517691e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5909 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.2777862548828, + "epoch": 44.43283582089552, + "grad_norm": 1.0286922777684517, + "learning_rate": 3.4280567760128653e-07, + "loss": 0.0, + "reward": 1.8611111640930176, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8611111044883728, + "rewards/format_reward": 1.0, + "step": 5910 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.94444274902344, + "epoch": 44.440298507462686, + "grad_norm": 0.0, + "learning_rate": 3.4274520375313737e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5911 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.11111450195312, + "epoch": 44.44776119402985, + "grad_norm": 0.0, + "learning_rate": 3.4268472361142515e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5912 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.5, + "epoch": 44.45522388059702, + "grad_norm": 0.0, + "learning_rate": 3.4262423718025396e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5913 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.69444274902344, + "epoch": 44.46268656716418, + "grad_norm": 0.0, + "learning_rate": 3.425637444637283e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5914 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.80555725097656, + "epoch": 44.47014925373134, + "grad_norm": 0.0, + "learning_rate": 3.4250324546595327e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5915 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.22222900390625, + "epoch": 44.47761194029851, + "grad_norm": 0.0, + "learning_rate": 3.424427401910341e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5916 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.88888549804688, + "epoch": 44.485074626865675, + "grad_norm": 0.7037819902712319, + "learning_rate": 3.4238222864307663e-07, + "loss": 0.0007, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5917 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.8333282470703, + "epoch": 44.492537313432834, + "grad_norm": 0.0, + "learning_rate": 3.4232171082618717e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5918 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.69444274902344, + "epoch": 44.5, + "grad_norm": 0.0, + "learning_rate": 3.422611867444723e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5919 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.0833282470703, + "epoch": 44.507462686567166, + "grad_norm": 0.0, + "learning_rate": 3.422006564020391e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5920 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.63888549804688, + "epoch": 44.514925373134325, + "grad_norm": 0.0, + "learning_rate": 3.421401198029951e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5921 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.72222900390625, + "epoch": 44.52238805970149, + "grad_norm": 0.0, + "learning_rate": 3.420795769514482e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5922 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.55555725097656, + "epoch": 44.52985074626866, + "grad_norm": 0.0, + "learning_rate": 3.420190278515069e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5923 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.13888549804688, + "epoch": 44.53731343283582, + "grad_norm": 0.0, + "learning_rate": 3.419584725072797e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5924 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.1666717529297, + "epoch": 44.54477611940298, + "grad_norm": 0.0, + "learning_rate": 3.418979109228761e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5925 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.5277862548828, + "epoch": 44.55223880597015, + "grad_norm": 0.0, + "learning_rate": 3.4183734310240553e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5926 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.61111450195312, + "epoch": 44.559701492537314, + "grad_norm": 0.0, + "learning_rate": 3.417767690499781e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5927 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.80555725097656, + "epoch": 44.56716417910448, + "grad_norm": 0.0, + "learning_rate": 3.417161887697043e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5928 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.05555725097656, + "epoch": 44.57462686567164, + "grad_norm": 0.0, + "learning_rate": 3.4165560226569495e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5929 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.75, + "epoch": 44.582089552238806, + "grad_norm": 0.0, + "learning_rate": 3.4159500954206153e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5930 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.9166717529297, + "epoch": 44.58955223880597, + "grad_norm": 0.0, + "learning_rate": 3.415344106029156e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5931 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.4166717529297, + "epoch": 44.59701492537313, + "grad_norm": 0.0, + "learning_rate": 3.4147380545236947e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5932 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.1666717529297, + "epoch": 44.6044776119403, + "grad_norm": 0.0, + "learning_rate": 3.4141319409453563e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5933 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.2777862548828, + "epoch": 44.61194029850746, + "grad_norm": 0.0, + "learning_rate": 3.41352576533527e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5934 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.1666717529297, + "epoch": 44.61940298507463, + "grad_norm": 0.0, + "learning_rate": 3.412919527734572e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5935 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.05555725097656, + "epoch": 44.62686567164179, + "grad_norm": 0.0, + "learning_rate": 3.4123132281844005e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5936 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.9166717529297, + "epoch": 44.634328358208954, + "grad_norm": 0.0, + "learning_rate": 3.411706866725896e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5937 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.1666717529297, + "epoch": 44.64179104477612, + "grad_norm": 0.0, + "learning_rate": 3.411100443400208e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5938 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.05555725097656, + "epoch": 44.649253731343286, + "grad_norm": 0.0, + "learning_rate": 3.4104939582484855e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5939 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.72222900390625, + "epoch": 44.656716417910445, + "grad_norm": 0.0, + "learning_rate": 3.409887411311886e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5940 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.63888549804688, + "epoch": 44.66417910447761, + "grad_norm": 0.0, + "learning_rate": 3.4092808026315667e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5941 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.5833282470703, + "epoch": 44.67164179104478, + "grad_norm": 0.0, + "learning_rate": 3.4086741322486924e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5942 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.72222900390625, + "epoch": 44.67910447761194, + "grad_norm": 0.0, + "learning_rate": 3.4080674002044306e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5943 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.05555725097656, + "epoch": 44.6865671641791, + "grad_norm": 0.0, + "learning_rate": 3.4074606065399533e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5944 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.9166717529297, + "epoch": 44.69402985074627, + "grad_norm": 0.0, + "learning_rate": 3.4068537512964376e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5945 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.30555725097656, + "epoch": 44.701492537313435, + "grad_norm": 0.0, + "learning_rate": 3.406246834515062e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5946 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.44444274902344, + "epoch": 44.708955223880594, + "grad_norm": 0.0, + "learning_rate": 3.405639856237013e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5947 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.80555725097656, + "epoch": 44.71641791044776, + "grad_norm": 0.6188170793161842, + "learning_rate": 3.4050328165034777e-07, + "loss": 0.0056, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5948 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.1666717529297, + "epoch": 44.723880597014926, + "grad_norm": 0.0, + "learning_rate": 3.40442571535565e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5949 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.3333282470703, + "epoch": 44.73134328358209, + "grad_norm": 1.2717931336775135, + "learning_rate": 3.403818552834727e-07, + "loss": 0.0, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5950 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.38888549804688, + "epoch": 44.73880597014925, + "grad_norm": 0.0, + "learning_rate": 3.403211328981909e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5951 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.9166717529297, + "epoch": 44.74626865671642, + "grad_norm": 0.0, + "learning_rate": 3.402604043838402e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5952 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.30555725097656, + "epoch": 44.75373134328358, + "grad_norm": 0.0, + "learning_rate": 3.401996697445416e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5953 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.0, + "epoch": 44.76119402985075, + "grad_norm": 0.0, + "learning_rate": 3.4013892898441645e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5954 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.0833282470703, + "epoch": 44.76865671641791, + "grad_norm": 0.0, + "learning_rate": 3.4007818210758645e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5955 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.5833282470703, + "epoch": 44.776119402985074, + "grad_norm": 0.0, + "learning_rate": 3.4001742911817384e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5956 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.1666717529297, + "epoch": 44.78358208955224, + "grad_norm": 0.0, + "learning_rate": 3.3995667002030134e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5957 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.72222900390625, + "epoch": 44.791044776119406, + "grad_norm": 0.0, + "learning_rate": 3.398959048180918e-07, + "loss": 0.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.7777777910232544, + "rewards/format_reward": 1.0, + "step": 5958 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.47222900390625, + "epoch": 44.798507462686565, + "grad_norm": 0.0, + "learning_rate": 3.3983513351566877e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5959 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.5277862548828, + "epoch": 44.80597014925373, + "grad_norm": 0.0, + "learning_rate": 3.3977435611715615e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5960 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.0833282470703, + "epoch": 44.8134328358209, + "grad_norm": 0.0, + "learning_rate": 3.397135726266781e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5961 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.80555725097656, + "epoch": 44.82089552238806, + "grad_norm": 0.0, + "learning_rate": 3.3965278304835937e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5962 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.8333282470703, + "epoch": 44.82835820895522, + "grad_norm": 0.9642527558801013, + "learning_rate": 3.39591987386325e-07, + "loss": 0.0, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5963 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.61111450195312, + "epoch": 44.83582089552239, + "grad_norm": 0.7710885837331052, + "learning_rate": 3.3953118564470074e-07, + "loss": 0.0161, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5964 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.61111450195312, + "epoch": 44.843283582089555, + "grad_norm": 0.5945321679277024, + "learning_rate": 3.394703778276121e-07, + "loss": -0.0002, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 5965 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.86111450195312, + "epoch": 44.850746268656714, + "grad_norm": 0.0, + "learning_rate": 3.3940956393918573e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5966 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.38888549804688, + "epoch": 44.85820895522388, + "grad_norm": 0.0, + "learning_rate": 3.3934874398354827e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5967 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.0277862548828, + "epoch": 44.865671641791046, + "grad_norm": 0.0, + "learning_rate": 3.3928791796482694e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5968 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.5277862548828, + "epoch": 44.87313432835821, + "grad_norm": 0.0, + "learning_rate": 3.3922708588714924e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5969 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.69444274902344, + "epoch": 44.88059701492537, + "grad_norm": 0.0, + "learning_rate": 3.391662477546431e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5970 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.5833282470703, + "epoch": 44.88805970149254, + "grad_norm": 0.0, + "learning_rate": 3.391054035714371e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5971 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.63888549804688, + "epoch": 44.8955223880597, + "grad_norm": 0.0, + "learning_rate": 3.390445533416599e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5972 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.55555725097656, + "epoch": 44.90298507462687, + "grad_norm": 0.6933582640949676, + "learning_rate": 3.389836970694406e-07, + "loss": -0.0004, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5973 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.4166717529297, + "epoch": 44.91044776119403, + "grad_norm": 0.0, + "learning_rate": 3.389228347589092e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5974 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.2777862548828, + "epoch": 44.917910447761194, + "grad_norm": 0.0, + "learning_rate": 3.388619664141954e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5975 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.0277862548828, + "epoch": 44.92537313432836, + "grad_norm": 0.0, + "learning_rate": 3.3880109203942973e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5976 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.6388931274414, + "epoch": 44.93283582089552, + "grad_norm": 0.0, + "learning_rate": 3.387402116387431e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5977 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.13888549804688, + "epoch": 44.940298507462686, + "grad_norm": 0.0, + "learning_rate": 3.386793252162667e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5978 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.36111450195312, + "epoch": 44.94776119402985, + "grad_norm": 0.7834520292490254, + "learning_rate": 3.3861843277613224e-07, + "loss": -0.0001, + "reward": 1.9722222089767456, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.9722222089767456, + "rewards/format_reward": 1.0, + "step": 5979 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.69444274902344, + "epoch": 44.95522388059702, + "grad_norm": 0.0, + "learning_rate": 3.3855753432247173e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5980 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.30555725097656, + "epoch": 44.96268656716418, + "grad_norm": 0.0, + "learning_rate": 3.3849662985941786e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5981 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.05555725097656, + "epoch": 44.97014925373134, + "grad_norm": 0.0, + "learning_rate": 3.384357193911032e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5982 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.55555725097656, + "epoch": 44.97761194029851, + "grad_norm": 0.0, + "learning_rate": 3.383748029216613e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5983 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.61111450195312, + "epoch": 44.985074626865675, + "grad_norm": 0.0, + "learning_rate": 3.3831388045522585e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5984 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.6666717529297, + "epoch": 44.992537313432834, + "grad_norm": 0.0, + "learning_rate": 3.3825295199593087e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5985 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.05555725097656, + "epoch": 45.007462686567166, + "grad_norm": 0.0, + "learning_rate": 3.381920175479109e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5986 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.19444274902344, + "epoch": 45.014925373134325, + "grad_norm": 0.0, + "learning_rate": 3.3813107711530094e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5987 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.05555725097656, + "epoch": 45.02238805970149, + "grad_norm": 0.0, + "learning_rate": 3.380701307022362e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5988 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.63888549804688, + "epoch": 45.02985074626866, + "grad_norm": 0.3911137836839867, + "learning_rate": 3.380091783128526e-07, + "loss": -0.0001, + "reward": 1.8055555820465088, + "reward_std": 0.0555555559694767, + "rewards/accuracy_reward": 0.8055555820465088, + "rewards/format_reward": 1.0, + "step": 5989 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.05555725097656, + "epoch": 45.03731343283582, + "grad_norm": 0.0, + "learning_rate": 3.37948219951286e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5990 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.44444274902344, + "epoch": 45.04477611940298, + "grad_norm": 0.0, + "learning_rate": 3.378872556216734e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5991 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.88888549804688, + "epoch": 45.05223880597015, + "grad_norm": 0.4950277325010451, + "learning_rate": 3.378262853281513e-07, + "loss": -0.0, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 5992 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.2777862548828, + "epoch": 45.059701492537314, + "grad_norm": 0.0, + "learning_rate": 3.3776530907485735e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5993 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.97222900390625, + "epoch": 45.06716417910448, + "grad_norm": 0.0, + "learning_rate": 3.377043268659291e-07, + "loss": 0.0, + "reward": 1.888888955116272, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8888888955116272, + "rewards/format_reward": 1.0, + "step": 5994 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.0277862548828, + "epoch": 45.07462686567164, + "grad_norm": 0.0, + "learning_rate": 3.3764333870550495e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5995 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.38888549804688, + "epoch": 45.082089552238806, + "grad_norm": 0.8137909254608976, + "learning_rate": 3.3758234459772327e-07, + "loss": -0.0001, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 5996 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.0277862548828, + "epoch": 45.08955223880597, + "grad_norm": 0.0, + "learning_rate": 3.375213445467232e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5997 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.63888549804688, + "epoch": 45.09701492537314, + "grad_norm": 1.279822630620786, + "learning_rate": 3.37460338556644e-07, + "loss": -0.0008, + "reward": 1.9444444179534912, + "reward_std": 0.06415002793073654, + "rewards/accuracy_reward": 0.944444477558136, + "rewards/format_reward": 1.0, + "step": 5998 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.5833282470703, + "epoch": 45.1044776119403, + "grad_norm": 0.0, + "learning_rate": 3.3739932663162557e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 5999 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.72222900390625, + "epoch": 45.11194029850746, + "grad_norm": 0.0, + "learning_rate": 3.3733830877580794e-07, + "loss": 0.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 6000 + } + ], + "logging_steps": 1, + "max_steps": 13400, + "num_input_tokens_seen": 0, + "num_train_epochs": 100, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 12, + "trial_name": null, + "trial_params": null +}