diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,4 +1,5 @@ { + "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.05596753882748006, @@ -10,7017 +11,11517 @@ "log_history": [ { "clip_ratio": 0.0, - "completion_length": 624.28125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.75, + "completions/max_terminated_length": 706.75, + "completions/mean_length": 561.5, + "completions/mean_terminated_length": 561.5, + "completions/min_length": 304.25, + "completions/min_terminated_length": 304.25, "epoch": 0.00011193507765496012, - "grad_norm": 0.0, + "grad_norm": 0.6556430486006678, "kl": 0.0, - "learning_rate": 3.3333333333333335e-07, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.0, + "learning_rate": 0.0, + "loss": -0.0315, + "num_tokens": 39768.0, + "reward": 0.02219326765043661, + "reward_std": 0.04227059497497976, + "rewards/code_reward/mean": 0.02219326765043661, + "rewards/code_reward/std": 0.0422705952078104, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 1 }, { "clip_ratio": 0.0, - "completion_length": 458.90625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.25, + "completions/max_terminated_length": 609.25, + "completions/mean_length": 333.3125, + "completions/mean_terminated_length": 333.3125, + "completions/min_length": 168.75, + "completions/min_terminated_length": 168.75, "epoch": 0.00022387015530992023, - "grad_norm": 0.7040169045969362, + "grad_norm": 0.9104023477920656, "kl": 0.0, - "learning_rate": 6.666666666666667e-07, - "loss": 0.0601, - "reward": 0.007812500116415322, - "reward_std": 0.01743034040555358, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.078125, + "learning_rate": 3.3333333333333335e-07, + "loss": -0.0684, + "num_tokens": 62690.0, + "reward": 0.09372148709371686, + "reward_std": 0.05263180285692215, + "rewards/code_reward/mean": 0.09372148709371686, + "rewards/code_reward/std": 0.052631803788244724, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 2 }, { "clip_ratio": 0.0, - "completion_length": 444.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.25, + "completions/max_terminated_length": 613.25, + "completions/mean_length": 431.03125, + "completions/mean_terminated_length": 431.03125, + "completions/min_length": 231.5, + "completions/min_terminated_length": 231.5, "epoch": 0.00033580523296488035, - "grad_norm": 0.8629215889276903, - "kl": 0.00027060508728027344, - "learning_rate": 1.0000000000000002e-06, - "loss": 0.0038, - "reward": 0.004687500069849193, - "reward_std": 0.018750000279396772, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.046875, + "grad_norm": 0.7467116220042507, + "kl": 6.216764450073242e-05, + "learning_rate": 6.666666666666667e-07, + "loss": 0.0176, + "num_tokens": 93451.0, + "reward": 0.02661502081900835, + "reward_std": 0.03690493572503328, + "rewards/code_reward/mean": 0.02661502081900835, + "rewards/code_reward/std": 0.03690493851900101, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 3 }, { "clip_ratio": 0.0, - "completion_length": 443.078125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.0, + "completions/max_terminated_length": 572.0, + "completions/mean_length": 379.625, + "completions/mean_terminated_length": 379.625, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, "epoch": 0.00044774031061984047, - "grad_norm": 0.7016431850799195, - "kl": 0.00026345252990722656, - "learning_rate": 1.3333333333333334e-06, - "loss": 0.0435, - "reward": 0.006250000209547579, - "reward_std": 0.016327823046594858, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.0625, + "grad_norm": 0.9821405105801766, + "kl": 7.545948028564453e-05, + "learning_rate": 1.0000000000000002e-06, + "loss": -0.0724, + "num_tokens": 120351.0, + "reward": 0.07474911026656628, + "reward_std": 0.10846673045307398, + "rewards/code_reward/mean": 0.07474911026656628, + "rewards/code_reward/std": 0.1084667295217514, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 4 }, { "clip_ratio": 0.0, - "completion_length": 466.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 699.25, + "completions/max_terminated_length": 699.25, + "completions/mean_length": 458.15625, + "completions/mean_terminated_length": 458.15625, + "completions/min_length": 172.75, + "completions/min_terminated_length": 172.75, "epoch": 0.0005596753882748006, - "grad_norm": 0.4574220548610273, - "kl": 0.0002818107604980469, - "learning_rate": 1.6666666666666667e-06, - "loss": 0.007, - "reward": 0.0031250000465661287, - "reward_std": 0.008539125323295593, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.03125, + "grad_norm": 0.7621492143305323, + "kl": 4.2319297790527344e-05, + "learning_rate": 1.3333333333333334e-06, + "loss": -0.022, + "num_tokens": 155684.0, + "reward": 0.021726191509515047, + "reward_std": 0.02874244563281536, + "rewards/code_reward/mean": 0.021726191509515047, + "rewards/code_reward/std": 0.02874244749546051, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 5 }, { "clip_ratio": 0.0, - "completion_length": 495.09375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.5, + "completions/max_terminated_length": 764.5, + "completions/mean_length": 439.15625, + "completions/mean_terminated_length": 439.15625, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, "epoch": 0.0006716104659297607, - "grad_norm": 0.5434000185158782, - "kl": 0.0002865791320800781, - "learning_rate": 2.0000000000000003e-06, - "loss": 0.0427, - "reward": 0.0031250000465661287, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.03125, + "grad_norm": 1.0850029834641424, + "kl": 4.9442052841186523e-05, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.0318, + "num_tokens": 189689.0, + "reward": 0.36408869456499815, + "reward_std": 0.2700020968914032, + "rewards/code_reward/mean": 0.36408869456499815, + "rewards/code_reward/std": 0.2700021122582257, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 6 }, { "clip_ratio": 0.0, - "completion_length": 509.765625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 783.75, + "completions/max_terminated_length": 783.75, + "completions/mean_length": 459.84375, + "completions/mean_terminated_length": 459.84375, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, "epoch": 0.0007835455435847208, - "grad_norm": 7.737527191229261, - "kl": 0.0010457038879394531, - "learning_rate": 2.3333333333333336e-06, - "loss": -0.0273, - "reward": 0.0031250000465661287, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.03125, + "grad_norm": 1.1358452129000465, + "kl": 9.28044319152832e-05, + "learning_rate": 2.0000000000000003e-06, + "loss": -0.0229, + "num_tokens": 223804.0, + "reward": 0.07010709377937019, + "reward_std": 0.09961615316569805, + "rewards/code_reward/mean": 0.07010709377937019, + "rewards/code_reward/std": 0.09961615689098835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 7 }, { "clip_ratio": 0.0, - "completion_length": 401.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.25, + "completions/max_terminated_length": 560.25, + "completions/mean_length": 390.28125, + "completions/mean_terminated_length": 390.28125, + "completions/min_length": 230.25, + "completions/min_terminated_length": 230.25, "epoch": 0.0008954806212396809, - "grad_norm": 0.9652984425394716, - "kl": 0.000743865966796875, - "learning_rate": 2.666666666666667e-06, - "loss": 0.0031, - "reward": 0.0062500000931322575, - "reward_std": 0.021039125509560108, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.0625, + "grad_norm": 1.0894200859707648, + "kl": 6.967782974243164e-05, + "learning_rate": 2.3333333333333336e-06, + "loss": -0.0832, + "num_tokens": 250717.0, + "reward": 0.17307570209959522, + "reward_std": 0.17698997142724693, + "rewards/code_reward/mean": 0.17307570209959522, + "rewards/code_reward/std": 0.17698997911065817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 8 }, { "clip_ratio": 0.0, - "completion_length": 631.203125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.5, + "completions/max_terminated_length": 759.5, + "completions/mean_length": 530.4375, + "completions/mean_terminated_length": 530.4375, + "completions/min_length": 260.25, + "completions/min_terminated_length": 260.25, "epoch": 0.0010074156988946412, - "grad_norm": 0.565508681156182, - "kl": 0.001064300537109375, - "learning_rate": 3e-06, - "loss": 0.0314, - "reward": 0.004687500069849193, - "reward_std": 0.01478912541642785, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.046875, + "grad_norm": 0.6609406173759865, + "kl": 0.00013196468353271484, + "learning_rate": 2.666666666666667e-06, + "loss": 0.0563, + "num_tokens": 290563.0, + "reward": 0.0257048096973449, + "reward_std": 0.036920994287356734, + "rewards/code_reward/mean": 0.0257048096973449, + "rewards/code_reward/std": 0.03692099452018738, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 9 }, { "clip_ratio": 0.0, - "completion_length": 509.109375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 749.25, + "completions/max_terminated_length": 749.25, + "completions/mean_length": 468.71875, + "completions/mean_terminated_length": 468.71875, + "completions/min_length": 277.25, + "completions/min_terminated_length": 277.25, "epoch": 0.0011193507765496012, - "grad_norm": 29.074260549966585, - "kl": 0.7770309448242188, - "learning_rate": 3.3333333333333333e-06, - "loss": 0.0532, - "reward": 0.02031250041909516, - "reward_std": 0.04097762983292341, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.203125, + "grad_norm": 0.816974823658789, + "kl": 0.00015497207641601562, + "learning_rate": 3e-06, + "loss": -0.0226, + "num_tokens": 325418.0, + "reward": 0.09049492585472763, + "reward_std": 0.18241150537505746, + "rewards/code_reward/mean": 0.09049492585472763, + "rewards/code_reward/std": 0.18241151235997677, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 10 }, { "clip_ratio": 0.0, - "completion_length": 306.96875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.5, + "completions/max_terminated_length": 523.5, + "completions/mean_length": 360.09375, + "completions/mean_terminated_length": 360.09375, + "completions/min_length": 172.75, + "completions/min_terminated_length": 172.75, "epoch": 0.0012312858542045614, - "grad_norm": 1.335155005624375, - "kl": 0.03216552734375, - "learning_rate": 3.6666666666666666e-06, - "loss": 0.0838, - "reward": 0.025000000605359674, - "reward_std": 0.043084788136184216, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.25, + "grad_norm": 1.3579676028203136, + "kl": 0.0004100799560546875, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.0149, + "num_tokens": 348085.0, + "reward": 0.07968997955322266, + "reward_std": 0.19473078846931458, + "rewards/code_reward/mean": 0.07968997955322266, + "rewards/code_reward/std": 0.19473078846931458, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 11 }, { "clip_ratio": 0.0, - "completion_length": 469.265625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.25, + "completions/max_terminated_length": 719.25, + "completions/mean_length": 476.03125, + "completions/mean_terminated_length": 476.03125, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, "epoch": 0.0013432209318595214, - "grad_norm": 0.9728533904128814, - "kl": 0.11004638671875, - "learning_rate": 4.000000000000001e-06, - "loss": 0.0547, - "reward": 0.02187500020954758, - "reward_std": 0.04057852132245898, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.21875, + "grad_norm": 1.0254632341024847, + "kl": 0.00038051605224609375, + "learning_rate": 3.6666666666666666e-06, + "loss": -0.0456, + "num_tokens": 379078.0, + "reward": 0.06544117676094174, + "reward_std": 0.11873381165787578, + "rewards/code_reward/mean": 0.06544117676094174, + "rewards/code_reward/std": 0.11873381165787578, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 12 }, { "clip_ratio": 0.0, - "completion_length": 411.71875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.75, + "completions/max_terminated_length": 643.75, + "completions/mean_length": 456.40625, + "completions/mean_terminated_length": 456.40625, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, "epoch": 0.0014551560095144816, - "grad_norm": 3.1109091298399707, - "kl": 0.34326171875, - "learning_rate": 4.333333333333334e-06, - "loss": 0.1651, - "reward": 0.05625000037252903, - "reward_std": 0.04977653082460165, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.5625, + "grad_norm": 0.7844627176055754, + "kl": 0.0006098747253417969, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0275, + "num_tokens": 410643.0, + "reward": 0.08870427880901843, + "reward_std": 0.1394934863783419, + "rewards/code_reward/mean": 0.08870427880901843, + "rewards/code_reward/std": 0.13949348265305161, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 13 }, { "clip_ratio": 0.0, - "completion_length": 437.4375, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 946.5, + "completions/max_terminated_length": 779.25, + "completions/mean_length": 474.78125, + "completions/mean_terminated_length": 433.4776916503906, + "completions/min_length": 212.5, + "completions/min_terminated_length": 212.5, "epoch": 0.0015670910871694416, - "grad_norm": 1.4333106697160516, - "kl": 0.2135009765625, - "learning_rate": 4.666666666666667e-06, - "loss": 0.0872, - "reward": 0.06718750111758709, - "reward_std": 0.04840352013707161, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.671875, + "grad_norm": 1.0068663809227436, + "kl": 0.001148223876953125, + "learning_rate": 4.333333333333334e-06, + "loss": -0.0596, + "num_tokens": 444292.0, + "reward": 0.09596806723857298, + "reward_std": 0.1574952198425308, + "rewards/code_reward/mean": 0.09596806723857298, + "rewards/code_reward/std": 0.15749522170517594, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 14 }, { "clip_ratio": 0.0, - "completion_length": 459.671875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 754.75, + "completions/max_terminated_length": 754.75, + "completions/mean_length": 491.375, + "completions/mean_terminated_length": 491.375, + "completions/min_length": 320.5, + "completions/min_terminated_length": 320.5, "epoch": 0.0016790261648244019, - "grad_norm": 1.1323714811109955, - "kl": 0.0590972900390625, - "learning_rate": 5e-06, - "loss": 0.0902, - "reward": 0.07343750260770321, - "reward_std": 0.04493850376456976, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.734375, + "grad_norm": 0.6616890996356674, + "kl": 0.0012340545654296875, + "learning_rate": 4.666666666666667e-06, + "loss": 0.0086, + "num_tokens": 480536.0, + "reward": 0.04718137255986221, + "reward_std": 0.06556019705021754, + "rewards/code_reward/mean": 0.04718137255986221, + "rewards/code_reward/std": 0.0655602045590058, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 15 }, { "clip_ratio": 0.0, - "completion_length": 430.921875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 434.03125, + "completions/mean_terminated_length": 434.03125, + "completions/min_length": 225.25, + "completions/min_terminated_length": 225.25, "epoch": 0.0017909612424793619, - "grad_norm": 1.1229606735516884, - "kl": 0.0349578857421875, - "learning_rate": 4.999952797253148e-06, - "loss": 0.0231, - "reward": 0.07031250186264515, - "reward_std": 0.04625816363841295, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.703125, + "grad_norm": 0.8195686424785313, + "kl": 0.0021371841430664062, + "learning_rate": 5e-06, + "loss": -0.0293, + "num_tokens": 517233.0, + "reward": 0.14742368459701538, + "reward_std": 0.13962780684232712, + "rewards/code_reward/mean": 0.14742368459701538, + "rewards/code_reward/std": 0.13962781056761742, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 16 }, { "clip_ratio": 0.0, - "completion_length": 361.84375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 670.25, + "completions/max_terminated_length": 670.25, + "completions/mean_length": 466.125, + "completions/mean_terminated_length": 466.125, + "completions/min_length": 264.25, + "completions/min_terminated_length": 264.25, "epoch": 0.001902896320134322, - "grad_norm": 1.3958687154501264, - "kl": 0.040618896484375, - "learning_rate": 4.9998111909931225e-06, - "loss": 0.0841, - "reward": 0.07656250149011612, - "reward_std": 0.04255262762308121, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.765625, + "grad_norm": 0.7604607536894525, + "kl": 0.00327301025390625, + "learning_rate": 4.999952797253148e-06, + "loss": -0.0188, + "num_tokens": 552989.0, + "reward": 0.03991336654871702, + "reward_std": 0.05816664919257164, + "rewards/code_reward/mean": 0.03991336654871702, + "rewards/code_reward/std": 0.05816664732992649, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 17 }, { "clip_ratio": 0.0, - "completion_length": 366.265625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 413.21875, + "completions/mean_terminated_length": 413.21875, + "completions/min_length": 243.25, + "completions/min_terminated_length": 243.25, "epoch": 0.0020148313977892823, - "grad_norm": 1.2810159881634564, - "kl": 0.058685302734375, - "learning_rate": 4.999575187161439e-06, - "loss": -0.0583, - "reward": 0.08125000260770321, - "reward_std": 0.038373483810573816, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.8125, + "grad_norm": 0.8446607287984066, + "kl": 0.00414276123046875, + "learning_rate": 4.9998111909931225e-06, + "loss": -0.0223, + "num_tokens": 582628.0, + "reward": 0.027369487448595464, + "reward_std": 0.05444430746138096, + "rewards/code_reward/mean": 0.027369487448595464, + "rewards/code_reward/std": 0.05444430839270353, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 18 }, { "clip_ratio": 0.0, - "completion_length": 544.984375, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1086.75, + "completions/max_terminated_length": 799.25, + "completions/mean_length": 567.1875, + "completions/mean_terminated_length": 521.8258972167969, + "completions/min_length": 304.5, + "completions/min_terminated_length": 304.5, "epoch": 0.0021267664754442426, - "grad_norm": 0.6496120974985506, - "kl": 0.0307464599609375, - "learning_rate": 4.9992447956603455e-06, - "loss": 0.0475, - "reward": 0.09218750149011612, - "reward_std": 0.01743034040555358, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.921875, + "grad_norm": 0.6815859244268851, + "kl": 0.00553131103515625, + "learning_rate": 4.999575187161439e-06, + "loss": 0.0078, + "num_tokens": 620538.0, + "reward": 0.03672935510985553, + "reward_std": 0.029647217132151127, + "rewards/code_reward/mean": 0.03672935510985553, + "rewards/code_reward/std": 0.029647217132151127, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 19 }, { "clip_ratio": 0.0, - "completion_length": 470.90625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 805.25, + "completions/max_terminated_length": 805.25, + "completions/mean_length": 507.21875, + "completions/mean_terminated_length": 507.21875, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, "epoch": 0.0022387015530992023, - "grad_norm": 1.1640288172191966, - "kl": 0.016815185546875, - "learning_rate": 4.998820030352409e-06, - "loss": 0.12, - "reward": 0.09375000186264515, - "reward_std": 0.021039125509560108, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.9375, + "grad_norm": 0.4364759611369158, + "kl": 0.00673675537109375, + "learning_rate": 4.9992447956603455e-06, + "loss": 0.0085, + "num_tokens": 659145.0, + "reward": 0.00214460794813931, + "reward_std": 0.006065867375582457, + "rewards/code_reward/mean": 0.00214460794813931, + "rewards/code_reward/std": 0.006065867375582457, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 20 }, { "clip_ratio": 0.0, - "completion_length": 295.90625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.25, + "completions/max_terminated_length": 488.25, + "completions/mean_length": 313.0, + "completions/mean_terminated_length": 313.0, + "completions/min_length": 191.5, + "completions/min_terminated_length": 191.5, "epoch": 0.0023506366307541626, - "grad_norm": 2.092394327511984, - "kl": 0.4071197509765625, - "learning_rate": 4.998300909059929e-06, - "loss": 0.077, - "reward": 0.08906250260770321, - "reward_std": 0.025969465728849173, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.890625, + "grad_norm": 1.1196346766853063, + "kl": 0.009761810302734375, + "learning_rate": 4.998820030352409e-06, + "loss": -0.0401, + "num_tokens": 681841.0, + "reward": 0.21865647949744016, + "reward_std": 0.16933009633794427, + "rewards/code_reward/mean": 0.21865647949744016, + "rewards/code_reward/std": 0.16933008842170238, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 21 }, { "clip_ratio": 0.0, - "completion_length": 466.609375, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1162.75, + "completions/max_terminated_length": 852.25, + "completions/mean_length": 618.125, + "completions/mean_terminated_length": 572.4866180419922, + "completions/min_length": 325.75, + "completions/min_terminated_length": 325.75, "epoch": 0.002462571708409123, - "grad_norm": 1.7353937284994216, - "kl": 0.130462646484375, - "learning_rate": 4.997687453564198e-06, - "loss": 0.0013, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.8324675842567315, + "kl": 0.00992584228515625, + "learning_rate": 4.998300909059929e-06, + "loss": 0.0359, + "num_tokens": 722261.0, + "reward": 0.03132503107190132, + "reward_std": 0.04826245130971074, + "rewards/code_reward/mean": 0.03132503107190132, + "rewards/code_reward/std": 0.048262451542541385, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 22 }, { "clip_ratio": 0.0, - "completion_length": 364.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.5, + "completions/max_terminated_length": 668.5, + "completions/mean_length": 449.5625, + "completions/mean_terminated_length": 449.5625, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, "epoch": 0.002574506786064083, - "grad_norm": 0.6464548918875196, - "kl": 0.0576171875, - "learning_rate": 4.9969796896045775e-06, - "loss": -0.0217, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.8188216899529877, + "kl": 0.012542724609375, + "learning_rate": 4.997687453564198e-06, + "loss": -0.0757, + "num_tokens": 756751.0, + "reward": 0.0606757253408432, + "reward_std": 0.08386795781552792, + "rewards/code_reward/mean": 0.0606757253408432, + "rewards/code_reward/std": 0.08386795967817307, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 23 }, { "clip_ratio": 0.0, - "completion_length": 342.421875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 554.0, + "completions/max_terminated_length": 554.0, + "completions/mean_length": 397.21875, + "completions/mean_terminated_length": 397.21875, + "completions/min_length": 221.5, + "completions/min_terminated_length": 221.5, "epoch": 0.002686441863719043, - "grad_norm": 116.68805529495698, - "kl": 6.03173828125, - "learning_rate": 4.996177646877426e-06, - "loss": 0.0415, - "reward": 0.09218750149011612, - "reward_std": 0.017430341336876154, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.921875, + "grad_norm": 0.9882922552317236, + "kl": 0.020965576171875, + "learning_rate": 4.9969796896045775e-06, + "loss": -0.0241, + "num_tokens": 784286.0, + "reward": 0.03399203496519476, + "reward_std": 0.06442949129268527, + "rewards/code_reward/mean": 0.03399203496519476, + "rewards/code_reward/std": 0.06442949641495943, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 24 }, { "clip_ratio": 0.0, - "completion_length": 380.5625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.75, + "completions/max_terminated_length": 692.75, + "completions/mean_length": 500.4375, + "completions/mean_terminated_length": 500.4375, + "completions/min_length": 286.75, + "completions/min_terminated_length": 286.75, "epoch": 0.002798376941374003, - "grad_norm": 3.181683689923044, - "kl": 0.4976806640625, - "learning_rate": 4.995281359034851e-06, - "loss": -0.0548, - "reward": 0.09218750335276127, - "reward_std": 0.023328250739723444, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.921875, + "grad_norm": 1.0525741278330438, + "kl": 0.0211029052734375, + "learning_rate": 4.996177646877426e-06, + "loss": 0.0871, + "num_tokens": 818260.0, + "reward": 0.1732453762087971, + "reward_std": 0.19011690141633153, + "rewards/code_reward/mean": 0.1732453762087971, + "rewards/code_reward/std": 0.19011691492050886, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 25 }, { "clip_ratio": 0.0, - "completion_length": 395.578125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.5, + "completions/max_terminated_length": 651.5, + "completions/mean_length": 481.875, + "completions/mean_terminated_length": 481.875, + "completions/min_length": 325.25, + "completions/min_terminated_length": 325.25, "epoch": 0.0029103120190289633, - "grad_norm": 0.9693325036669225, - "kl": 0.07562255859375, - "learning_rate": 4.994290863683296e-06, - "loss": 0.0118, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 0.8323803477726922, + "kl": 0.02197265625, + "learning_rate": 4.995281359034851e-06, + "loss": -0.0176, + "num_tokens": 858344.0, + "reward": 0.07887662292341702, + "reward_std": 0.06681955535896122, + "rewards/code_reward/mean": 0.07887662292341702, + "rewards/code_reward/std": 0.06681956280954182, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 26 }, { "clip_ratio": 0.0, - "completion_length": 505.640625, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1061.75, + "completions/max_terminated_length": 923.25, + "completions/mean_length": 631.0, + "completions/mean_terminated_length": 599.9107360839844, + "completions/min_length": 343.5, + "completions/min_terminated_length": 343.5, "epoch": 0.0030222470966839235, - "grad_norm": 0.762738499506801, - "kl": 0.1248779296875, - "learning_rate": 4.99320620238196e-06, - "loss": 0.0572, - "reward": 0.09375000186264515, - "reward_std": 0.016327822115272284, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.9375, + "grad_norm": 0.758606194839006, + "kl": 0.01599884033203125, + "learning_rate": 4.994290863683296e-06, + "loss": -0.0909, + "num_tokens": 900352.0, + "reward": 0.019329323433339596, + "reward_std": 0.05079583264887333, + "rewards/code_reward/mean": 0.019329323433339596, + "rewards/code_reward/std": 0.05079583264887333, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 27 }, { "clip_ratio": 0.0, - "completion_length": 356.65625, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1012.75, + "completions/max_terminated_length": 680.75, + "completions/mean_length": 559.90625, + "completions/mean_terminated_length": 515.2723236083984, + "completions/min_length": 366.5, + "completions/min_terminated_length": 366.5, "epoch": 0.0031341821743388833, - "grad_norm": 0.08544860970511584, - "kl": 0.04632568359375, - "learning_rate": 4.99202742064106e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.6052468908705184, + "kl": 0.019195556640625, + "learning_rate": 4.99320620238196e-06, + "loss": -0.0222, + "num_tokens": 934653.0, + "reward": 0.3267045458778739, + "reward_std": 0.13263714499771595, + "rewards/code_reward/mean": 0.3267045458778739, + "rewards/code_reward/std": 0.1326371468603611, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 28 }, { "clip_ratio": 0.0, - "completion_length": 345.015625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 755.25, + "completions/max_terminated_length": 755.25, + "completions/mean_length": 560.125, + "completions/mean_terminated_length": 560.125, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, "epoch": 0.0032461172519938435, - "grad_norm": 1.1227892135326387, - "kl": 0.03948974609375, - "learning_rate": 4.990754567919917e-06, - "loss": -0.0086, - "reward": 0.09531250223517418, - "reward_std": 0.01478912541642785, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.953125, + "grad_norm": 0.7871054475315462, + "kl": 0.026947021484375, + "learning_rate": 4.99202742064106e-06, + "loss": -0.0025, + "num_tokens": 963673.0, + "reward": 0.1587616038741544, + "reward_std": 0.16337263770401478, + "rewards/code_reward/mean": 0.1587616038741544, + "rewards/code_reward/std": 0.16337264538742602, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 29 }, { "clip_ratio": 0.0, - "completion_length": 413.34375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 521.5625, + "completions/mean_terminated_length": 521.5625, + "completions/min_length": 357.5, + "completions/min_terminated_length": 357.5, "epoch": 0.0033580523296488037, - "grad_norm": 0.8427884883196753, - "kl": 0.03790283203125, - "learning_rate": 4.989387697624881e-06, - "loss": 0.0001, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 0.7847358134959852, + "kl": 0.0301513671875, + "learning_rate": 4.990754567919917e-06, + "loss": -0.0139, + "num_tokens": 1001035.0, + "reward": 0.06875000009313226, + "reward_std": 0.08176466450095177, + "rewards/code_reward/mean": 0.06875000009313226, + "rewards/code_reward/std": 0.08176466636359692, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 30 }, { "clip_ratio": 0.0, - "completion_length": 290.09375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 795.75, + "completions/max_terminated_length": 795.75, + "completions/mean_length": 541.53125, + "completions/mean_terminated_length": 541.53125, + "completions/min_length": 275.75, + "completions/min_terminated_length": 275.75, "epoch": 0.003469987407303764, - "grad_norm": 0.916292516898012, - "kl": 0.03955078125, - "learning_rate": 4.987926867107095e-06, - "loss": -0.0193, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 0.7341146031808772, + "kl": 0.038360595703125, + "learning_rate": 4.989387697624881e-06, + "loss": -0.0057, + "num_tokens": 1032172.0, + "reward": 0.12754360469989479, + "reward_std": 0.13973576435819268, + "rewards/code_reward/mean": 0.12754360469989479, + "rewards/code_reward/std": 0.13973576435819268, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 31 }, { "clip_ratio": 0.0, - "completion_length": 403.28125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 855.25, + "completions/max_terminated_length": 855.25, + "completions/mean_length": 615.25, + "completions/mean_terminated_length": 615.25, + "completions/min_length": 476.5, + "completions/min_terminated_length": 476.5, "epoch": 0.0035819224849587238, - "grad_norm": 0.5282928671535011, - "kl": 0.043212890625, - "learning_rate": 4.986372137660078e-06, - "loss": -0.0401, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.6710232004425948, + "kl": 0.03857421875, + "learning_rate": 4.987926867107095e-06, + "loss": 0.0172, + "num_tokens": 1071724.0, + "reward": 0.04409082653000951, + "reward_std": 0.08446959964931011, + "rewards/code_reward/mean": 0.04409082653000951, + "rewards/code_reward/std": 0.08446960058063269, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 32 }, { "clip_ratio": 0.0, - "completion_length": 376.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.25, + "completions/max_terminated_length": 600.25, + "completions/mean_length": 480.375, + "completions/mean_terminated_length": 480.375, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, "epoch": 0.003693857562613684, - "grad_norm": 0.8076561222873071, - "kl": 0.06463623046875, - "learning_rate": 4.984723574517165e-06, - "loss": 0.0103, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 0.9880874575660351, + "kl": 0.045196533203125, + "learning_rate": 4.986372137660078e-06, + "loss": 0.0123, + "num_tokens": 1105880.0, + "reward": 0.2375063351355493, + "reward_std": 0.21191274048760533, + "rewards/code_reward/mean": 0.2375063351355493, + "rewards/code_reward/std": 0.21191274095326662, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 33 }, { "clip_ratio": 0.0, - "completion_length": 352.015625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.25, + "completions/max_terminated_length": 810.25, + "completions/mean_length": 586.9375, + "completions/mean_terminated_length": 586.9375, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, "epoch": 0.003805792640268644, - "grad_norm": 0.5820201891858064, - "kl": 0.04254150390625, - "learning_rate": 4.9829812468487655e-06, - "loss": -0.0118, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.7360918797059441, + "kl": 0.036865234375, + "learning_rate": 4.984723574517165e-06, + "loss": 0.0507, + "num_tokens": 1137102.0, + "reward": 0.1048327736207284, + "reward_std": 0.12198017432820052, + "rewards/code_reward/mean": 0.1048327736207284, + "rewards/code_reward/std": 0.12198018177878112, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 34 }, { "clip_ratio": 0.0, - "completion_length": 321.765625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.25, + "completions/max_terminated_length": 671.25, + "completions/mean_length": 481.6875, + "completions/mean_terminated_length": 481.6875, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, "epoch": 0.0039177277179236044, - "grad_norm": 1.30271756609257, - "kl": 0.07342529296875, - "learning_rate": 4.981145227759457e-06, - "loss": 0.0033, - "reward": 0.09375000186264515, - "reward_std": 0.021039125509560108, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.9375, + "grad_norm": 0.8321841584943117, + "kl": 0.04937744140625, + "learning_rate": 4.9829812468487655e-06, + "loss": 0.0045, + "num_tokens": 1169516.0, + "reward": 0.13488754630088806, + "reward_std": 0.15473865950480103, + "rewards/code_reward/mean": 0.13488754630088806, + "rewards/code_reward/std": 0.15473866136744618, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 35 }, { "clip_ratio": 0.0, - "completion_length": 325.328125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 758.25, + "completions/max_terminated_length": 758.25, + "completions/mean_length": 503.5625, + "completions/mean_terminated_length": 503.5625, + "completions/min_length": 332.75, + "completions/min_terminated_length": 332.75, "epoch": 0.004029662795578565, - "grad_norm": 0.5253064587253158, - "kl": 0.0435791015625, - "learning_rate": 4.979215594284924e-06, - "loss": -0.004, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.6486333735221288, + "kl": 0.04339599609375, + "learning_rate": 4.981145227759457e-06, + "loss": 0.032, + "num_tokens": 1200070.0, + "reward": 0.1878063678741455, + "reward_std": 0.2491721287369728, + "rewards/code_reward/mean": 0.1878063678741455, + "rewards/code_reward/std": 0.2491721287369728, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 36 }, { "clip_ratio": 0.0, - "completion_length": 285.046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 699.0, + "completions/max_terminated_length": 699.0, + "completions/mean_length": 447.4375, + "completions/mean_terminated_length": 447.4375, + "completions/min_length": 265.75, + "completions/min_terminated_length": 265.75, "epoch": 0.004141597873233525, - "grad_norm": 0.03440107214885444, - "kl": 0.03900146484375, - "learning_rate": 4.977192427388722e-06, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.7244692818471474, + "kl": 0.06103515625, + "learning_rate": 4.979215594284924e-06, + "loss": 0.0021, + "num_tokens": 1226348.0, + "reward": 0.20452898740768433, + "reward_std": 0.11701454967260361, + "rewards/code_reward/mean": 0.20452898740768433, + "rewards/code_reward/std": 0.1170145571231842, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 37 }, { "clip_ratio": 0.0, - "completion_length": 446.390625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1067.0, + "completions/max_terminated_length": 1067.0, + "completions/mean_length": 693.0625, + "completions/mean_terminated_length": 693.0625, + "completions/min_length": 482.0, + "completions/min_terminated_length": 482.0, "epoch": 0.004253532950888485, - "grad_norm": 0.03871583904571488, - "kl": 0.033447265625, - "learning_rate": 4.9750758119588824e-06, - "loss": 0.0003, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.849851302505875, + "kl": 0.04730224609375, + "learning_rate": 4.977192427388722e-06, + "loss": 0.0179, + "num_tokens": 1269150.0, + "reward": 0.27298991987481713, + "reward_std": 0.22980366041883826, + "rewards/code_reward/mean": 0.27298991987481713, + "rewards/code_reward/std": 0.22980366088449955, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 38 }, { "clip_ratio": 0.0, - "completion_length": 407.453125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.75, + "completions/max_terminated_length": 683.75, + "completions/mean_length": 558.6875, + "completions/mean_terminated_length": 558.6875, + "completions/min_length": 429.5, + "completions/min_terminated_length": 429.5, "epoch": 0.0043654680285434445, - "grad_norm": 0.14679688464116405, - "kl": 0.05303955078125, - "learning_rate": 4.972865836804349e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.7007952533946012, + "kl": 0.0660400390625, + "learning_rate": 4.9750758119588824e-06, + "loss": -0.0113, + "num_tokens": 1303044.0, + "reward": 0.09943000599741936, + "reward_std": 0.06831434741616249, + "rewards/code_reward/mean": 0.09943000599741936, + "rewards/code_reward/std": 0.06831434927880764, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 39 }, { "clip_ratio": 0.0, - "completion_length": 386.578125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.25, + "completions/max_terminated_length": 757.25, + "completions/mean_length": 580.4375, + "completions/mean_terminated_length": 580.4375, + "completions/min_length": 434.5, + "completions/min_terminated_length": 434.5, "epoch": 0.004477403106198405, - "grad_norm": 1.1845417806658105, - "kl": 0.06744384765625, - "learning_rate": 4.970562594651254e-06, - "loss": 0.0676, - "reward": 0.09531250037252903, - "reward_std": 0.018750000279396772, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.953125, + "grad_norm": 0.8837854421885906, + "kl": 0.0677490234375, + "learning_rate": 4.972865836804349e-06, + "loss": 0.036, + "num_tokens": 1341114.0, + "reward": 0.06429215706884861, + "reward_std": 0.10811880882829428, + "rewards/code_reward/mean": 0.06429215706884861, + "rewards/code_reward/std": 0.10811881255358458, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 40 }, { "clip_ratio": 0.0, - "completion_length": 411.359375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.25, + "completions/max_terminated_length": 751.25, + "completions/mean_length": 564.15625, + "completions/mean_terminated_length": 564.15625, + "completions/min_length": 286.25, + "completions/min_terminated_length": 286.25, "epoch": 0.004589338183853365, - "grad_norm": 0.6033536519344826, - "kl": 0.05682373046875, - "learning_rate": 4.968166182139026e-06, - "loss": 0.1634, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.8908962386984877, + "kl": 0.05224609375, + "learning_rate": 4.970562594651254e-06, + "loss": 0.0452, + "num_tokens": 1374855.0, + "reward": 0.0228785730432719, + "reward_std": 0.06361539242789149, + "rewards/code_reward/mean": 0.0228785730432719, + "rewards/code_reward/std": 0.06361539429053664, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 41 }, { "clip_ratio": 0.0, - "completion_length": 419.03125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.25, + "completions/max_terminated_length": 771.25, + "completions/mean_length": 548.25, + "completions/mean_terminated_length": 548.25, + "completions/min_length": 359.5, + "completions/min_terminated_length": 359.5, "epoch": 0.004701273261508325, - "grad_norm": 0.03504335311347305, - "kl": 0.042327880859375, - "learning_rate": 4.9656766998163306e-06, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.6625726727295415, + "kl": 0.06036376953125, + "learning_rate": 4.968166182139026e-06, + "loss": 0.0256, + "num_tokens": 1408383.0, + "reward": 0.12928921589627862, + "reward_std": 0.1418905109167099, + "rewards/code_reward/mean": 0.12928921589627862, + "rewards/code_reward/std": 0.1418905109167099, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 42 }, { "clip_ratio": 0.0, - "completion_length": 414.453125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 734.5, + "completions/max_terminated_length": 734.5, + "completions/mean_length": 542.28125, + "completions/mean_terminated_length": 542.28125, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, "epoch": 0.004813208339163285, - "grad_norm": 0.11226404372767065, - "kl": 0.0537109375, - "learning_rate": 4.963094252136865e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.6465061355449234, + "kl": 0.06768798828125, + "learning_rate": 4.9656766998163306e-06, + "loss": 0.0207, + "num_tokens": 1446992.0, + "reward": 0.10072244703769684, + "reward_std": 0.11095328629016876, + "rewards/code_reward/mean": 0.10072244703769684, + "rewards/code_reward/std": 0.11095329001545906, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 43 }, { "clip_ratio": 0.0, - "completion_length": 470.296875, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1169.75, + "completions/max_terminated_length": 960.0, + "completions/mean_length": 709.5625, + "completions/mean_terminated_length": 673.1160888671875, + "completions/min_length": 441.75, + "completions/min_terminated_length": 441.75, "epoch": 0.004925143416818246, - "grad_norm": 1.682711207148907, - "kl": 0.2684326171875, - "learning_rate": 4.960418947454958e-06, - "loss": 0.0222, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 0.5674013554100908, + "kl": 0.05987548828125, + "learning_rate": 4.963094252136865e-06, + "loss": 0.0051, + "num_tokens": 1489002.0, + "reward": 0.026442307978868484, + "reward_std": 0.02707473188638687, + "rewards/code_reward/mean": 0.026442307978868484, + "rewards/code_reward/std": 0.027074730023741722, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 44 }, { "clip_ratio": 0.0, - "completion_length": 405.625, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1275.0, + "completions/max_terminated_length": 1007.25, + "completions/mean_length": 751.75, + "completions/mean_terminated_length": 665.2291717529297, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, "epoch": 0.005037078494473206, - "grad_norm": 0.04103769366987546, - "kl": 0.0435791015625, - "learning_rate": 4.957650898021038e-06, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.485271480797118, + "kl": 0.04827880859375, + "learning_rate": 4.960418947454958e-06, + "loss": 0.0172, + "num_tokens": 1535994.0, + "reward": 0.0928819477558136, + "reward_std": 0.059048041701316833, + "rewards/code_reward/mean": 0.0928819477558136, + "rewards/code_reward/std": 0.059048037976026535, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 45 }, { "clip_ratio": 0.0, - "completion_length": 422.765625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 862.25, + "completions/max_terminated_length": 862.25, + "completions/mean_length": 557.3125, + "completions/mean_terminated_length": 557.3125, + "completions/min_length": 322.75, + "completions/min_terminated_length": 322.75, "epoch": 0.005149013572128166, - "grad_norm": 0.524235834468468, - "kl": 0.065673828125, - "learning_rate": 4.954790219976915e-06, - "loss": -0.0335, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.5321904564182419, + "kl": 0.0755615234375, + "learning_rate": 4.957650898021038e-06, + "loss": 0.0296, + "num_tokens": 1574756.0, + "reward": 0.001838235417380929, + "reward_std": 0.0021725620608776808, + "rewards/code_reward/mean": 0.001838235417380929, + "rewards/code_reward/std": 0.0021725620608776808, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 46 }, { "clip_ratio": 0.0, - "completion_length": 443.984375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.75, + "completions/max_terminated_length": 822.75, + "completions/mean_length": 576.875, + "completions/mean_terminated_length": 576.875, + "completions/min_length": 400.25, + "completions/min_terminated_length": 400.25, "epoch": 0.005260948649783125, - "grad_norm": 105.38307149631456, - "kl": 0.32183837890625, - "learning_rate": 4.95183703335091e-06, - "loss": 0.0819, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.936192416992597, + "kl": 0.07989501953125, + "learning_rate": 4.954790219976915e-06, + "loss": 0.0045, + "num_tokens": 1610288.0, + "reward": 0.1439773216843605, + "reward_std": 0.18783001974225044, + "rewards/code_reward/mean": 0.1439773216843605, + "rewards/code_reward/std": 0.1878300216048956, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 47 }, { "clip_ratio": 0.0, - "completion_length": 385.65625, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1394.5, + "completions/max_terminated_length": 718.5, + "completions/mean_length": 634.0, + "completions/mean_terminated_length": 540.5089416503906, + "completions/min_length": 364.25, + "completions/min_terminated_length": 364.25, "epoch": 0.005372883727438086, - "grad_norm": 0.6435525752943102, - "kl": 0.1368408203125, - "learning_rate": 4.948791462052819e-06, - "loss": 0.0042, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.7287543485940715, + "kl": 0.0684814453125, + "learning_rate": 4.95183703335091e-06, + "loss": -0.0097, + "num_tokens": 1643344.0, + "reward": 0.06508574914187193, + "reward_std": 0.14080872386693954, + "rewards/code_reward/mean": 0.06508574914187193, + "rewards/code_reward/std": 0.14080872386693954, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 48 }, { "clip_ratio": 0.0, - "completion_length": 630.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1348.0, + "completions/max_terminated_length": 1062.25, + "completions/mean_length": 779.1875, + "completions/mean_terminated_length": 739.1562652587891, + "completions/min_length": 460.5, + "completions/min_terminated_length": 460.5, "epoch": 0.005484818805093046, - "grad_norm": 0.5344456298032659, - "kl": 0.060546875, - "learning_rate": 4.945653633868716e-06, - "loss": 0.0254, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.5005111196440745, + "kl": 0.0516357421875, + "learning_rate": 4.948791462052819e-06, + "loss": 0.0943, + "num_tokens": 1697622.0, + "reward": 0.03162594046443701, + "reward_std": 0.04561105836182833, + "rewards/code_reward/mean": 0.03162594046443701, + "rewards/code_reward/std": 0.0456110592931509, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 49 }, { "clip_ratio": 0.0, - "completion_length": 422.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.0, + "completions/max_terminated_length": 698.0, + "completions/mean_length": 550.5, + "completions/mean_terminated_length": 550.5, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, "epoch": 0.005596753882748006, - "grad_norm": 0.7155667213796403, - "kl": 0.08978271484375, - "learning_rate": 4.942423680455584e-06, - "loss": 0.0132, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.6600509702450322, + "kl": 0.080078125, + "learning_rate": 4.945653633868716e-06, + "loss": -0.0276, + "num_tokens": 1736334.0, + "reward": 0.020256503019481897, + "reward_std": 0.05186595255509019, + "rewards/code_reward/mean": 0.020256503019481897, + "rewards/code_reward/std": 0.05186595278792083, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 50 }, { "clip_ratio": 0.0, - "completion_length": 596.296875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 974.0, + "completions/max_terminated_length": 974.0, + "completions/mean_length": 716.1875, + "completions/mean_terminated_length": 716.1875, + "completions/min_length": 532.75, + "completions/min_terminated_length": 532.75, "epoch": 0.005708688960402966, - "grad_norm": 0.6239904143882911, - "kl": 0.04815673828125, - "learning_rate": 4.939101737335802e-06, - "loss": 0.0135, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 0.7327612072681561, + "kl": 0.07684326171875, + "learning_rate": 4.942423680455584e-06, + "loss": 0.0245, + "num_tokens": 1782508.0, + "reward": 0.12191444495692849, + "reward_std": 0.18840329442173243, + "rewards/code_reward/mean": 0.12191444495692849, + "rewards/code_reward/std": 0.18840329255908728, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 51 }, { "clip_ratio": 0.0, - "completion_length": 402.71875, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1145.75, + "completions/max_terminated_length": 794.5, + "completions/mean_length": 594.21875, + "completions/mean_terminated_length": 546.3482208251953, + "completions/min_length": 401.5, + "completions/min_terminated_length": 401.5, "epoch": 0.0058206240380579265, - "grad_norm": 1.104725268564954, - "kl": 0.145263671875, - "learning_rate": 4.935687943891447e-06, - "loss": 0.0015, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.8852205460832024, + "kl": 0.0771484375, + "learning_rate": 4.939101737335802e-06, + "loss": 0.0285, + "num_tokens": 1817019.0, + "reward": 0.2491304986178875, + "reward_std": 0.283006114885211, + "rewards/code_reward/mean": 0.2491304986178875, + "rewards/code_reward/std": 0.2830061223357916, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 52 }, { "clip_ratio": 0.0, - "completion_length": 477.328125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 842.0, + "completions/max_terminated_length": 842.0, + "completions/mean_length": 592.90625, + "completions/mean_terminated_length": 592.90625, + "completions/min_length": 387.5, + "completions/min_terminated_length": 387.5, "epoch": 0.005932559115712887, - "grad_norm": 0.5835454079488682, - "kl": 0.075927734375, - "learning_rate": 4.932182443358458e-06, - "loss": 0.1512, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.9735177234540268, + "kl": 0.090576171875, + "learning_rate": 4.935687943891447e-06, + "loss": 0.0151, + "num_tokens": 1857712.0, + "reward": 0.05411792593076825, + "reward_std": 0.11774230282753706, + "rewards/code_reward/mean": 0.05411792593076825, + "rewards/code_reward/std": 0.11774230748414993, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 53 }, { "clip_ratio": 0.0, - "completion_length": 580.578125, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1032.25, + "completions/max_terminated_length": 899.0, + "completions/mean_length": 757.0, + "completions/mean_terminated_length": 704.8645935058594, + "completions/min_length": 557.75, + "completions/min_terminated_length": 557.75, "epoch": 0.006044494193367847, - "grad_norm": 0.49184303981459476, - "kl": 0.05926513671875, - "learning_rate": 4.928585382820616e-06, - "loss": 0.0194, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.528475149602509, + "kl": 0.081939697265625, + "learning_rate": 4.932182443358458e-06, + "loss": 0.0384, + "num_tokens": 1906368.0, + "reward": 0.08750000596046448, + "reward_std": 0.10493762046098709, + "rewards/code_reward/mean": 0.08750000596046448, + "rewards/code_reward/std": 0.10493762046098709, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 54 }, { "clip_ratio": 0.0, - "completion_length": 392.046875, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1305.0, + "completions/max_terminated_length": 659.0, + "completions/mean_length": 582.5, + "completions/mean_terminated_length": 488.51341247558594, + "completions/min_length": 289.25, + "completions/min_terminated_length": 289.25, "epoch": 0.006156429271022806, - "grad_norm": 1.0291704424132635, - "kl": 0.1390380859375, - "learning_rate": 4.924896913203376e-06, - "loss": 0.0102, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.9161325800813456, + "kl": 0.06402587890625, + "learning_rate": 4.928585382820616e-06, + "loss": 0.0907, + "num_tokens": 1939632.0, + "reward": 0.17478298512287438, + "reward_std": 0.1567421266809106, + "rewards/code_reward/mean": 0.17478298512287438, + "rewards/code_reward/std": 0.15674212691374123, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 55 }, { "clip_ratio": 0.0, - "completion_length": 465.0625, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1085.75, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 603.28125, + "completions/mean_terminated_length": 558.6964416503906, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, "epoch": 0.006268364348677767, - "grad_norm": 0.5613069680179144, - "kl": 0.08880615234375, - "learning_rate": 4.921117189267535e-06, - "loss": 0.0121, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.8910583501574744, + "kl": 0.07037353515625, + "learning_rate": 4.924896913203376e-06, + "loss": 0.0682, + "num_tokens": 1979449.0, + "reward": 0.10180415771901608, + "reward_std": 0.13188505358994007, + "rewards/code_reward/mean": 0.10180415771901608, + "rewards/code_reward/std": 0.1318850601091981, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 56 }, { "clip_ratio": 0.0, - "completion_length": 480.359375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 806.5, + "completions/max_terminated_length": 806.5, + "completions/mean_length": 581.03125, + "completions/mean_terminated_length": 581.03125, + "completions/min_length": 407.5, + "completions/min_terminated_length": 407.5, "epoch": 0.006380299426332727, - "grad_norm": 0.7905247960482367, - "kl": 0.06884765625, - "learning_rate": 4.917246369602742e-06, - "loss": 0.0134, - "reward": 0.09531250223517418, - "reward_std": 0.01478912541642785, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.953125, + "grad_norm": 1.03891552766019, + "kl": 0.08428955078125, + "learning_rate": 4.921117189267535e-06, + "loss": -0.054, + "num_tokens": 2018810.0, + "reward": 0.1397020157892257, + "reward_std": 0.1752215747255832, + "rewards/code_reward/mean": 0.1397020157892257, + "rewards/code_reward/std": 0.17522158951032907, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 57 }, { "clip_ratio": 0.0, - "completion_length": 393.28125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 548.53125, + "completions/mean_terminated_length": 548.53125, + "completions/min_length": 340.25, + "completions/min_terminated_length": 340.25, "epoch": 0.006492234503987687, - "grad_norm": 18.99182061239259, - "kl": 1.30133056640625, - "learning_rate": 4.9132846166208355e-06, - "loss": -0.0248, - "reward": 0.09375000186264515, - "reward_std": 0.021039125509560108, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.9375, + "grad_norm": 0.5842843696982555, + "kl": 0.0770263671875, + "learning_rate": 4.917246369602742e-06, + "loss": 0.0329, + "num_tokens": 2054963.0, + "reward": 0.10270743072032928, + "reward_std": 0.14296946674585342, + "rewards/code_reward/mean": 0.10270743072032928, + "rewards/code_reward/std": 0.14296947047114372, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 58 }, { "clip_ratio": 0.0, - "completion_length": 509.59375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 741.75, + "completions/max_terminated_length": 741.75, + "completions/mean_length": 581.90625, + "completions/mean_terminated_length": 581.90625, + "completions/min_length": 426.0, + "completions/min_terminated_length": 426.0, "epoch": 0.006604169581642647, - "grad_norm": 1.0573356685875819, - "kl": 0.06280517578125, - "learning_rate": 4.9092320965490365e-06, - "loss": 0.0153, - "reward": 0.09375, - "reward_std": 0.02500000037252903, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.9375, + "grad_norm": 0.8753670558307941, + "kl": 0.07373046875, + "learning_rate": 4.9132846166208355e-06, + "loss": 0.0316, + "num_tokens": 2089368.0, + "reward": 0.008248626545537263, + "reward_std": 0.008944235043600202, + "rewards/code_reward/mean": 0.008248626545537263, + "rewards/code_reward/std": 0.008944234810769558, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 59 }, { "clip_ratio": 0.0, - "completion_length": 415.171875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.5, + "completions/max_terminated_length": 706.5, + "completions/mean_length": 519.03125, + "completions/mean_terminated_length": 519.03125, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, "epoch": 0.0067161046592976075, - "grad_norm": 0.5498420295992223, - "kl": 0.070556640625, - "learning_rate": 4.905088979422971e-06, - "loss": -0.0324, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.8925995523113797, + "kl": 0.0953369140625, + "learning_rate": 4.9092320965490365e-06, + "loss": 0.0961, + "num_tokens": 2119905.0, + "reward": 0.11875520087778568, + "reward_std": 0.2019189279526472, + "rewards/code_reward/mean": 0.11875520087778568, + "rewards/code_reward/std": 0.20191893354058266, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 60 }, { "clip_ratio": 0.0, - "completion_length": 374.109375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.75, + "completions/max_terminated_length": 580.75, + "completions/mean_length": 387.21875, + "completions/mean_terminated_length": 387.21875, + "completions/min_length": 223.5, + "completions/min_terminated_length": 223.5, "epoch": 0.006828039736952568, - "grad_norm": 0.4563712143777335, - "kl": 0.065673828125, - "learning_rate": 4.900855439079536e-06, - "loss": -0.0263, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.7379134154137567, + "kl": 0.0865478515625, + "learning_rate": 4.905088979422971e-06, + "loss": 0.0293, + "num_tokens": 2148904.0, + "reward": 0.021313310135155916, + "reward_std": 0.018601362127810717, + "rewards/code_reward/mean": 0.021313310135155916, + "rewards/code_reward/std": 0.018601362593472004, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 61 }, { "clip_ratio": 0.0, - "completion_length": 470.03125, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1017.5, + "completions/max_terminated_length": 684.25, + "completions/mean_length": 573.875, + "completions/mean_terminated_length": 528.5982208251953, + "completions/min_length": 326.25, + "completions/min_terminated_length": 326.25, "epoch": 0.006939974814607528, - "grad_norm": 0.022676570314052416, - "kl": 0.0540771484375, - "learning_rate": 4.8965316531496055e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.8052241518254897, + "kl": 0.0723876953125, + "learning_rate": 4.900855439079536e-06, + "loss": 0.0677, + "num_tokens": 2190676.0, + "reward": 0.055714288260787725, + "reward_std": 0.04886896489188075, + "rewards/code_reward/mean": 0.055714288260787725, + "rewards/code_reward/std": 0.048868965124711394, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 62 }, { "clip_ratio": 0.0, - "completion_length": 442.109375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.75, + "completions/max_terminated_length": 643.75, + "completions/mean_length": 476.25, + "completions/mean_terminated_length": 476.25, + "completions/min_length": 310.5, + "completions/min_terminated_length": 310.5, "epoch": 0.007051909892262488, - "grad_norm": 0.37277836211463244, - "kl": 0.0537109375, - "learning_rate": 4.892117803050578e-06, - "loss": -0.0112, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.4680852161414822, + "kl": 0.08984375, + "learning_rate": 4.8965316531496055e-06, + "loss": 0.0257, + "num_tokens": 2221412.0, + "reward": 0.0011574074160307646, + "reward_std": 0.0021431019995361567, + "rewards/code_reward/mean": 0.0011574074160307646, + "rewards/code_reward/std": 0.0021431019995361567, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 63 }, { "clip_ratio": 0.0, - "completion_length": 407.3125, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1011.5, + "completions/max_terminated_length": 646.75, + "completions/mean_length": 617.40625, + "completions/mean_terminated_length": 520.71875, + "completions/min_length": 379.75, + "completions/min_terminated_length": 379.75, "epoch": 0.0071638449699174475, - "grad_norm": 0.6045549634649411, - "kl": 0.06939697265625, - "learning_rate": 4.887614073978761e-06, - "loss": 0.0378, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.8146582859250777, + "kl": 0.0728759765625, + "learning_rate": 4.892117803050578e-06, + "loss": -0.0054, + "num_tokens": 2258025.0, + "reward": 0.2782451882958412, + "reward_std": 0.2655297741293907, + "rewards/code_reward/mean": 0.2782451882958412, + "rewards/code_reward/std": 0.2655297741293907, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 64 }, { "clip_ratio": 0.0, - "completion_length": 490.53125, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1142.0, + "completions/max_terminated_length": 750.5, + "completions/mean_length": 635.8125, + "completions/mean_terminated_length": 583.9464340209961, + "completions/min_length": 439.5, + "completions/min_terminated_length": 439.5, "epoch": 0.007275780047572408, - "grad_norm": 0.4804494114616987, - "kl": 0.0440673828125, - "learning_rate": 4.883020654901609e-06, - "loss": 0.0326, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.6310220375924765, + "kl": 0.07208251953125, + "learning_rate": 4.887614073978761e-06, + "loss": 0.0001, + "num_tokens": 2296595.0, + "reward": 0.08172532171010971, + "reward_std": 0.08684739097952843, + "rewards/code_reward/mean": 0.08172532171010971, + "rewards/code_reward/std": 0.08684739866293967, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 65 }, { "clip_ratio": 0.0, - "completion_length": 414.625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.25, + "completions/max_terminated_length": 660.25, + "completions/mean_length": 459.875, + "completions/mean_terminated_length": 459.875, + "completions/min_length": 291.5, + "completions/min_terminated_length": 291.5, "epoch": 0.007387715125227368, - "grad_norm": 0.02779607160337418, - "kl": 0.04901123046875, - "learning_rate": 4.878337738549785e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.7464732599550883, + "kl": 0.10107421875, + "learning_rate": 4.883020654901609e-06, + "loss": -0.0305, + "num_tokens": 2331455.0, + "reward": 0.15218659490346909, + "reward_std": 0.10367358289659023, + "rewards/code_reward/mean": 0.15218659490346909, + "rewards/code_reward/std": 0.10367358289659023, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 66 }, { "clip_ratio": 0.0, - "completion_length": 391.34375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.5, + "completions/max_terminated_length": 657.5, + "completions/mean_length": 434.59375, + "completions/mean_terminated_length": 434.59375, + "completions/min_length": 252.75, + "completions/min_terminated_length": 252.75, "epoch": 0.007499650202882328, - "grad_norm": 0.019327395350080524, - "kl": 0.05145263671875, - "learning_rate": 4.873565521409082e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.9414797694529051, + "kl": 0.112060546875, + "learning_rate": 4.878337738549785e-06, + "loss": 0.0316, + "num_tokens": 2356986.0, + "reward": 0.10024832468479872, + "reward_std": 0.18917413474991918, + "rewards/code_reward/mean": 0.10024832468479872, + "rewards/code_reward/std": 0.18917413474991918, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 67 }, { "clip_ratio": 0.0, - "completion_length": 408.140625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 927.25, + "completions/max_terminated_length": 927.25, + "completions/mean_length": 541.59375, + "completions/mean_terminated_length": 541.59375, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, "epoch": 0.007611585280537288, - "grad_norm": 0.7217823002699405, - "kl": 0.07281494140625, - "learning_rate": 4.868704203712173e-06, - "loss": 0.0201, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.9428500397053137, + "kl": 0.09423828125, + "learning_rate": 4.873565521409082e-06, + "loss": 0.0535, + "num_tokens": 2388693.0, + "reward": 0.13753276504576206, + "reward_std": 0.11505712405778468, + "rewards/code_reward/mean": 0.13753276504576206, + "rewards/code_reward/std": 0.11505712429061532, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 68 }, { "clip_ratio": 0.0, - "completion_length": 488.171875, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1034.75, + "completions/max_terminated_length": 878.0, + "completions/mean_length": 647.96875, + "completions/mean_terminated_length": 614.2187652587891, + "completions/min_length": 344.75, + "completions/min_terminated_length": 344.75, "epoch": 0.007723520358192249, - "grad_norm": 0.01773957052813651, - "kl": 0.04571533203125, - "learning_rate": 4.86375398943021e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.6413922701473375, + "kl": 0.0836181640625, + "learning_rate": 4.868704203712173e-06, + "loss": 0.0367, + "num_tokens": 2430900.0, + "reward": 0.02671811357140541, + "reward_std": 0.036580079700797796, + "rewards/code_reward/mean": 0.02671811357140541, + "rewards/code_reward/std": 0.03658008202910423, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 69 }, { "clip_ratio": 0.0, - "completion_length": 405.109375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 733.25, + "completions/max_terminated_length": 733.25, + "completions/mean_length": 464.53125, + "completions/mean_terminated_length": 464.53125, + "completions/min_length": 264.25, + "completions/min_terminated_length": 264.25, "epoch": 0.007835455435847209, - "grad_norm": 0.5495053845625697, - "kl": 0.05712890625, - "learning_rate": 4.858715086264274e-06, - "loss": 0.0313, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.9506305895466796, + "kl": 0.12646484375, + "learning_rate": 4.86375398943021e-06, + "loss": 0.0209, + "num_tokens": 2466109.0, + "reward": 0.10709889512509108, + "reward_std": 0.14360995404422283, + "rewards/code_reward/mean": 0.10709889512509108, + "rewards/code_reward/std": 0.14360996149480343, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 70 }, { "clip_ratio": 0.0, - "completion_length": 340.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.5, + "completions/max_terminated_length": 498.5, + "completions/mean_length": 333.0, + "completions/mean_terminated_length": 333.0, + "completions/min_length": 150.75, + "completions/min_terminated_length": 150.75, "epoch": 0.007947390513502168, - "grad_norm": 221.95709261316097, - "kl": 29.68072509765625, - "learning_rate": 4.853587705636646e-06, - "loss": 0.4784, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.9998509369749687, + "kl": 0.118896484375, + "learning_rate": 4.858715086264274e-06, + "loss": 0.078, + "num_tokens": 2489565.0, + "reward": 0.08216492831707001, + "reward_std": 0.06686047837138176, + "rewards/code_reward/mean": 0.08216492831707001, + "rewards/code_reward/std": 0.06686047837138176, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 71 }, { "clip_ratio": 0.0, - "completion_length": 328.046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.5, + "completions/max_terminated_length": 539.5, + "completions/mean_length": 371.84375, + "completions/mean_terminated_length": 371.84375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, "epoch": 0.00805932559115713, - "grad_norm": 0.6777938636001378, - "kl": 0.0565185546875, - "learning_rate": 4.84837206268195e-06, - "loss": -0.034, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 0.8636085966435983, + "kl": 0.1112060546875, + "learning_rate": 4.853587705636646e-06, + "loss": 0.0659, + "num_tokens": 2518496.0, + "reward": 0.3326955884695053, + "reward_std": 0.1749400496482849, + "rewards/code_reward/mean": 0.3326955884695053, + "rewards/code_reward/std": 0.17494005151093006, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 72 }, { "clip_ratio": 0.0, - "completion_length": 283.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.25, + "completions/max_terminated_length": 421.25, + "completions/mean_length": 293.46875, + "completions/mean_terminated_length": 293.46875, + "completions/min_length": 154.5, + "completions/min_terminated_length": 154.5, "epoch": 0.008171260668812089, - "grad_norm": 0.03603965081169223, - "kl": 0.07452392578125, - "learning_rate": 4.8430683762381195e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.8623456501670774, + "kl": 0.1112060546875, + "learning_rate": 4.84837206268195e-06, + "loss": -0.0657, + "num_tokens": 2541951.0, + "reward": 0.41282894741743803, + "reward_std": 0.14345367066562176, + "rewards/code_reward/mean": 0.41282894741743803, + "rewards/code_reward/std": 0.14345368463546038, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 73 }, { "clip_ratio": 0.0, - "completion_length": 357.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.75, + "completions/max_terminated_length": 546.75, + "completions/mean_length": 413.25, + "completions/mean_terminated_length": 413.25, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, "epoch": 0.00828319574646705, - "grad_norm": 0.024934251266286088, - "kl": 0.06591796875, - "learning_rate": 4.837676868837213e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.6709675033187479, + "kl": 0.1011962890625, + "learning_rate": 4.8430683762381195e-06, + "loss": 0.0792, + "num_tokens": 2571935.0, + "reward": 0.09283980540931225, + "reward_std": 0.18030504882335663, + "rewards/code_reward/mean": 0.09283980540931225, + "rewards/code_reward/std": 0.18030504882335663, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 74 }, { "clip_ratio": 0.0, - "completion_length": 395.796875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 701.5, + "completions/max_terminated_length": 701.5, + "completions/mean_length": 471.4375, + "completions/mean_terminated_length": 471.4375, + "completions/min_length": 277.75, + "completions/min_terminated_length": 277.75, "epoch": 0.008395130824122009, - "grad_norm": 0.025440481130695015, - "kl": 0.05645751953125, - "learning_rate": 4.832197766696085e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.9648853970330805, + "kl": 0.1002197265625, + "learning_rate": 4.837676868837213e-06, + "loss": 0.0101, + "num_tokens": 2603869.0, + "reward": 0.09730057418346405, + "reward_std": 0.08919950015842915, + "rewards/code_reward/mean": 0.09730057418346405, + "rewards/code_reward/std": 0.08919950760900974, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 75 }, { "clip_ratio": 0.0, - "completion_length": 448.171875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.5, + "completions/max_terminated_length": 705.5, + "completions/mean_length": 496.84375, + "completions/mean_terminated_length": 496.84375, + "completions/min_length": 335.25, + "completions/min_terminated_length": 335.25, "epoch": 0.00850706590177697, - "grad_norm": 0.053611239059879696, - "kl": 0.056640625, - "learning_rate": 4.826631299706887e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.0521374237347803, + "kl": 0.1251220703125, + "learning_rate": 4.832197766696085e-06, + "loss": 0.0548, + "num_tokens": 2632288.0, + "reward": 0.18549207970499992, + "reward_std": 0.23195349983870983, + "rewards/code_reward/mean": 0.18549207970499992, + "rewards/code_reward/std": 0.23195349797606468, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 76 }, { "clip_ratio": 0.0, - "completion_length": 498.859375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 525.625, + "completions/mean_terminated_length": 525.625, + "completions/min_length": 347.5, + "completions/min_terminated_length": 347.5, "epoch": 0.00861900097943193, - "grad_norm": 0.6813369559573024, - "kl": 0.05133056640625, - "learning_rate": 4.820977701427424e-06, - "loss": 0.1191, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.025045228642194366, + "kl": 0.1104736328125, + "learning_rate": 4.826631299706887e-06, + "loss": 0.0011, + "num_tokens": 2669812.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 77 }, { "clip_ratio": 0.0, - "completion_length": 345.421875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.25, + "completions/max_terminated_length": 583.25, + "completions/mean_length": 442.46875, + "completions/mean_terminated_length": 442.46875, + "completions/min_length": 208.5, + "completions/min_terminated_length": 208.5, "epoch": 0.008730936057086889, - "grad_norm": 0.7460088285353335, - "kl": 0.0655517578125, - "learning_rate": 4.81523720907136e-06, - "loss": 0.0598, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.1367736845257632, + "kl": 0.1265869140625, + "learning_rate": 4.820977701427424e-06, + "loss": 0.0331, + "num_tokens": 2700491.0, + "reward": 0.047237071208655834, + "reward_std": 0.10418419446796179, + "rewards/code_reward/mean": 0.047237071208655834, + "rewards/code_reward/std": 0.10418420331552625, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 78 }, { "clip_ratio": 0.0, - "completion_length": 374.640625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 689.5, + "completions/max_terminated_length": 689.5, + "completions/mean_length": 505.6875, + "completions/mean_terminated_length": 505.6875, + "completions/min_length": 330.25, + "completions/min_terminated_length": 330.25, "epoch": 0.00884287113474185, - "grad_norm": 0.6085630747158598, - "kl": 0.06109619140625, - "learning_rate": 4.809410063498254e-06, - "loss": 0.0091, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.5193528707901873, + "kl": 0.103515625, + "learning_rate": 4.81523720907136e-06, + "loss": 0.0032, + "num_tokens": 2734425.0, + "reward": 0.04880136996507645, + "reward_std": 0.053204361349344254, + "rewards/code_reward/mean": 0.04880136996507645, + "rewards/code_reward/std": 0.053204361349344254, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 79 }, { "clip_ratio": 0.0, - "completion_length": 373.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.5, + "completions/max_terminated_length": 586.5, + "completions/mean_length": 443.75, + "completions/mean_terminated_length": 443.75, + "completions/min_length": 276.25, + "completions/min_terminated_length": 276.25, "epoch": 0.00895480621239681, - "grad_norm": 0.05060284374443874, - "kl": 0.06451416015625, - "learning_rate": 4.8034965092034656e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.0427847884848562, + "kl": 0.1318359375, + "learning_rate": 4.809410063498254e-06, + "loss": 0.0817, + "num_tokens": 2765545.0, + "reward": 0.2281582325231284, + "reward_std": 0.21775340917520225, + "rewards/code_reward/mean": 0.2281582325231284, + "rewards/code_reward/std": 0.21775341662578285, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 80 }, { "clip_ratio": 0.0, - "completion_length": 479.15625, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1028.0, + "completions/max_terminated_length": 725.0, + "completions/mean_length": 588.875, + "completions/mean_terminated_length": 541.5669708251953, + "completions/min_length": 409.75, + "completions/min_terminated_length": 409.75, "epoch": 0.00906674129005177, - "grad_norm": 0.020553083133118145, - "kl": 0.05169677734375, - "learning_rate": 4.797496794307889e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.4518948869491964, + "kl": 0.1085205078125, + "learning_rate": 4.8034965092034656e-06, + "loss": -0.0053, + "num_tokens": 2810309.0, + "reward": 0.06289062649011612, + "reward_std": 0.0699656680226326, + "rewards/code_reward/mean": 0.06289062649011612, + "rewards/code_reward/std": 0.0699656680226326, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 81 }, { "clip_ratio": 0.0, - "completion_length": 383.921875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.0, + "completions/max_terminated_length": 656.0, + "completions/mean_length": 440.5, + "completions/mean_terminated_length": 440.5, + "completions/min_length": 277.5, + "completions/min_terminated_length": 277.5, "epoch": 0.00917867636770673, - "grad_norm": 0.02331309838539552, - "kl": 0.05242919921875, - "learning_rate": 4.791411170547545e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.8869624498321675, + "kl": 0.1461181640625, + "learning_rate": 4.797496794307889e-06, + "loss": 0.0534, + "num_tokens": 2842813.0, + "reward": 0.010840552393347025, + "reward_std": 0.02040189504623413, + "rewards/code_reward/mean": 0.010840552393347025, + "rewards/code_reward/std": 0.02040189504623413, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 82 }, { "clip_ratio": 0.0, - "completion_length": 470.296875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.5, + "completions/max_terminated_length": 715.5, + "completions/mean_length": 515.71875, + "completions/mean_terminated_length": 515.71875, + "completions/min_length": 315.5, + "completions/min_terminated_length": 315.5, "epoch": 0.009290611445361691, - "grad_norm": 0.8366878780204882, - "kl": 0.0616455078125, - "learning_rate": 4.785239893263017e-06, - "loss": 0.019, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 0.8955728645359476, + "kl": 0.1201171875, + "learning_rate": 4.791411170547545e-06, + "loss": -0.0048, + "num_tokens": 2878988.0, + "reward": 0.0949601458851248, + "reward_std": 0.043580688536167145, + "rewards/code_reward/mean": 0.0949601458851248, + "rewards/code_reward/std": 0.043580688536167145, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 83 }, { "clip_ratio": 0.0, - "completion_length": 379.609375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 712.75, + "completions/max_terminated_length": 712.75, + "completions/mean_length": 503.875, + "completions/mean_terminated_length": 503.875, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, "epoch": 0.00940254652301665, - "grad_norm": 0.4621690087605406, - "kl": 0.07269287109375, - "learning_rate": 4.778983221388742e-06, - "loss": -0.0043, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.5731316506045098, + "kl": 0.10693359375, + "learning_rate": 4.785239893263017e-06, + "loss": 0.0657, + "num_tokens": 2916960.0, + "reward": 0.055458965012803674, + "reward_std": 0.06723660067655146, + "rewards/code_reward/mean": 0.055458965012803674, + "rewards/code_reward/std": 0.06723660079296678, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 84 }, { "clip_ratio": 0.0, - "completion_length": 323.453125, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 815.25, + "completions/max_terminated_length": 511.75, + "completions/mean_length": 416.78125, + "completions/mean_terminated_length": 372.52679443359375, + "completions/min_length": 259.5, + "completions/min_terminated_length": 259.5, "epoch": 0.00951448160067161, - "grad_norm": 0.7196439160838366, - "kl": 0.07952880859375, - "learning_rate": 4.77264141744214e-06, - "loss": 0.0222, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.1451584135613004, + "kl": 0.138671875, + "learning_rate": 4.778983221388742e-06, + "loss": 0.0337, + "num_tokens": 2944785.0, + "reward": 0.020502878935076296, + "reward_std": 0.0165937872370705, + "rewards/code_reward/mean": 0.020502878935076296, + "rewards/code_reward/std": 0.0165937872370705, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 85 }, { "clip_ratio": 0.0, - "completion_length": 363.40625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 617.25, + "completions/max_terminated_length": 617.25, + "completions/mean_length": 446.9375, + "completions/mean_terminated_length": 446.9375, + "completions/min_length": 280.5, + "completions/min_terminated_length": 280.5, "epoch": 0.00962641667832657, - "grad_norm": 0.42630497015032043, - "kl": 0.10479736328125, - "learning_rate": 4.766214747512603e-06, - "loss": -0.0067, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.6695429277037601, + "kl": 0.143798828125, + "learning_rate": 4.77264141744214e-06, + "loss": 0.0076, + "num_tokens": 2976815.0, + "reward": 0.012867647223174572, + "reward_std": 0.027845492586493492, + "rewards/code_reward/mean": 0.012867647223174572, + "rewards/code_reward/std": 0.02784549444913864, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 86 }, { "clip_ratio": 0.0, - "completion_length": 301.90625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.75, + "completions/max_terminated_length": 511.75, + "completions/mean_length": 362.0625, + "completions/mean_terminated_length": 362.0625, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, "epoch": 0.00973835175598153, - "grad_norm": 0.0717989075808753, - "kl": 0.06146240234375, - "learning_rate": 4.759703481250331e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.0671674919636422, + "kl": 0.11083984375, + "learning_rate": 4.766214747512603e-06, + "loss": 0.0937, + "num_tokens": 2998881.0, + "reward": 0.26682692021131516, + "reward_std": 0.16957121342420578, + "rewards/code_reward/mean": 0.26682692021131516, + "rewards/code_reward/std": 0.16957121714949608, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 87 }, { "clip_ratio": 0.0, - "completion_length": 514.953125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.0, + "completions/max_terminated_length": 641.0, + "completions/mean_length": 465.0, + "completions/mean_terminated_length": 465.0, + "completions/min_length": 320.25, + "completions/min_terminated_length": 320.25, "epoch": 0.009850286833636491, - "grad_norm": 0.5199995504833264, - "kl": 0.0433349609375, - "learning_rate": 4.753107891855015e-06, - "loss": -0.0066, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.0133211477792508, + "kl": 0.1400146484375, + "learning_rate": 4.759703481250331e-06, + "loss": 0.0317, + "num_tokens": 3034097.0, + "reward": 0.07334506892948411, + "reward_std": 0.11912691406905651, + "rewards/code_reward/mean": 0.07334506892948411, + "rewards/code_reward/std": 0.11912691593170166, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 88 }, { "clip_ratio": 0.0, - "completion_length": 270.765625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 289.90625, + "completions/mean_terminated_length": 289.90625, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, "epoch": 0.00996222191129145, - "grad_norm": 0.04715436027273259, - "kl": 0.07342529296875, - "learning_rate": 4.746428256064375e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.0515861072357229, + "kl": 0.1114501953125, + "learning_rate": 4.753107891855015e-06, + "loss": -0.0234, + "num_tokens": 3056046.0, + "reward": 0.28452102770097554, + "reward_std": 0.07553892768919468, + "rewards/code_reward/mean": 0.28452102770097554, + "rewards/code_reward/std": 0.07553892862051725, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 89 }, { "clip_ratio": 0.0, - "completion_length": 313.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.75, + "completions/max_terminated_length": 519.75, + "completions/mean_length": 333.6875, + "completions/mean_terminated_length": 333.6875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, "epoch": 0.010074156988946412, - "grad_norm": 0.03668445511393964, - "kl": 0.05902099609375, - "learning_rate": 4.7396648541425534e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.8537837657218319, + "kl": 0.1558837890625, + "learning_rate": 4.746428256064375e-06, + "loss": 0.0327, + "num_tokens": 3079588.0, + "reward": 0.14791105315089226, + "reward_std": 0.09374275244772434, + "rewards/code_reward/mean": 0.14791105315089226, + "rewards/code_reward/std": 0.09374275989830494, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 90 }, { "clip_ratio": 0.0, - "completion_length": 343.015625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.5, + "completions/max_terminated_length": 498.5, + "completions/mean_length": 349.25, + "completions/mean_terminated_length": 349.25, + "completions/min_length": 185.5, + "completions/min_terminated_length": 185.5, "epoch": 0.010186092066601371, - "grad_norm": 0.025197578254458317, - "kl": 0.080322265625, - "learning_rate": 4.732817969868348e-06, - "loss": 0.0008, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.1628188809572875, + "kl": 0.16552734375, + "learning_rate": 4.7396648541425534e-06, + "loss": 0.071, + "num_tokens": 3108756.0, + "reward": 0.08873509289696813, + "reward_std": 0.11791448388248682, + "rewards/code_reward/mean": 0.08873509289696813, + "rewards/code_reward/std": 0.11791448295116425, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 91 }, { "clip_ratio": 0.0, - "completion_length": 337.984375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.75, + "completions/max_terminated_length": 591.75, + "completions/mean_length": 385.59375, + "completions/mean_terminated_length": 385.59375, + "completions/min_length": 168.5, + "completions/min_terminated_length": 168.5, "epoch": 0.010298027144256332, - "grad_norm": 0.027850466760737828, - "kl": 0.057373046875, - "learning_rate": 4.7258878905233095e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.0836904121924078, + "kl": 0.140380859375, + "learning_rate": 4.732817969868348e-06, + "loss": -0.0549, + "num_tokens": 3141759.0, + "reward": 0.1353156054392457, + "reward_std": 0.0778092760592699, + "rewards/code_reward/mean": 0.1353156054392457, + "rewards/code_reward/std": 0.07780927885323763, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 92 }, { "clip_ratio": 0.0, - "completion_length": 340.03125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 406.0, + "completions/mean_terminated_length": 406.0, + "completions/min_length": 236.75, + "completions/min_terminated_length": 236.75, "epoch": 0.010409962221911291, - "grad_norm": 0.8302898332353863, - "kl": 0.06591796875, - "learning_rate": 4.718874906879688e-06, - "loss": 0.2943, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.9641885111816164, + "kl": 0.142333984375, + "learning_rate": 4.7258878905233095e-06, + "loss": -0.0439, + "num_tokens": 3167615.0, + "reward": 0.22870281734503806, + "reward_std": 0.08901303343009204, + "rewards/code_reward/mean": 0.22870281734503806, + "rewards/code_reward/std": 0.08901303354650736, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 93 }, { "clip_ratio": 0.0, - "completion_length": 412.859375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.25, + "completions/max_terminated_length": 550.25, + "completions/mean_length": 400.0, + "completions/mean_terminated_length": 400.0, + "completions/min_length": 247.5, + "completions/min_terminated_length": 247.5, "epoch": 0.01052189729956625, - "grad_norm": 0.030863660874464384, - "kl": 0.052978515625, - "learning_rate": 4.711779313188231e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.6901557941958711, + "kl": 0.139892578125, + "learning_rate": 4.718874906879688e-06, + "loss": 0.0551, + "num_tokens": 3198319.0, + "reward": 0.05615717824548483, + "reward_std": 0.09094760753214359, + "rewards/code_reward/mean": 0.05615717824548483, + "rewards/code_reward/std": 0.09094761684536934, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 94 }, { "clip_ratio": 0.0, - "completion_length": 377.265625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 405.3125, + "completions/mean_terminated_length": 405.3125, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, "epoch": 0.010633832377221212, - "grad_norm": 0.04564269693547498, - "kl": 0.06561279296875, - "learning_rate": 4.70460140716584e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.1099151469644146, + "kl": 0.1552734375, + "learning_rate": 4.711779313188231e-06, + "loss": 0.0123, + "num_tokens": 3232577.0, + "reward": 0.1259501683525741, + "reward_std": 0.10068924725055695, + "rewards/code_reward/mean": 0.1259501683525741, + "rewards/code_reward/std": 0.10068925376981497, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 95 }, { "clip_ratio": 0.0, - "completion_length": 372.734375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.75, + "completions/max_terminated_length": 447.75, + "completions/mean_length": 318.53125, + "completions/mean_terminated_length": 318.53125, + "completions/min_length": 214.25, + "completions/min_terminated_length": 214.25, "epoch": 0.010745767454876171, - "grad_norm": 1.0393244090787366, - "kl": 0.06671142578125, - "learning_rate": 4.697341489983076e-06, - "loss": 0.0811, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 0.9797800882072801, + "kl": 0.160888671875, + "learning_rate": 4.70460140716584e-06, + "loss": 0.0019, + "num_tokens": 3260770.0, + "reward": 0.15165849681943655, + "reward_std": 0.020956640131771564, + "rewards/code_reward/mean": 0.15165849681943655, + "rewards/code_reward/std": 0.020956639666110277, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 96 }, { "clip_ratio": 0.0, - "completion_length": 362.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 615.25, + "completions/max_terminated_length": 615.25, + "completions/mean_length": 405.21875, + "completions/mean_terminated_length": 405.21875, + "completions/min_length": 257.5, + "completions/min_terminated_length": 257.5, "epoch": 0.010857702532531132, - "grad_norm": 0.023478872872016283, - "kl": 0.0535888671875, - "learning_rate": 4.6899998662515215e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.8843772526949033, + "kl": 0.1278076171875, + "learning_rate": 4.697341489983076e-06, + "loss": -0.0262, + "num_tokens": 3292233.0, + "reward": 0.22608212963677943, + "reward_std": 0.17254789546132088, + "rewards/code_reward/mean": 0.22608212963677943, + "rewards/code_reward/std": 0.17254790337756276, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 97 }, { "clip_ratio": 0.0, - "completion_length": 373.90625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.0, + "completions/max_terminated_length": 630.0, + "completions/mean_length": 432.28125, + "completions/mean_terminated_length": 432.28125, + "completions/min_length": 277.75, + "completions/min_terminated_length": 277.75, "epoch": 0.010969637610186092, - "grad_norm": 0.019758461356285777, - "kl": 0.0531005859375, - "learning_rate": 4.682576844011007e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.7844902602771369, + "kl": 0.14306640625, + "learning_rate": 4.6899998662515215e-06, + "loss": -0.0089, + "num_tokens": 3325538.0, + "reward": 0.034402412828058004, + "reward_std": 0.08192514721304178, + "rewards/code_reward/mean": 0.034402412828058004, + "rewards/code_reward/std": 0.08192514767870307, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 98 }, { "clip_ratio": 0.0, - "completion_length": 395.90625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.25, + "completions/max_terminated_length": 498.25, + "completions/mean_length": 350.1875, + "completions/mean_terminated_length": 350.1875, + "completions/min_length": 214.5, + "completions/min_terminated_length": 214.5, "epoch": 0.011081572687841053, - "grad_norm": 0.01976638212707949, - "kl": 0.0645751953125, - "learning_rate": 4.675072734716678e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.053730221138067, + "kl": 0.1533203125, + "learning_rate": 4.682576844011007e-06, + "loss": 0.011, + "num_tokens": 3360072.0, + "reward": 0.03388687747064978, + "reward_std": 0.09327089437283576, + "rewards/code_reward/mean": 0.03388687747064978, + "rewards/code_reward/std": 0.09327089437283576, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 99 }, { "clip_ratio": 0.0, - "completion_length": 366.859375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.5, + "completions/max_terminated_length": 522.5, + "completions/mean_length": 355.53125, + "completions/mean_terminated_length": 355.53125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, "epoch": 0.011193507765496012, - "grad_norm": 2.002878750503418, - "kl": 0.0760498046875, - "learning_rate": 4.667487853225931e-06, - "loss": 0.3922, - "reward": 0.09531250037252903, - "reward_std": 0.018750000279396772, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.953125, + "grad_norm": 1.1649821726930594, + "kl": 0.138671875, + "learning_rate": 4.675072734716678e-06, + "loss": 0.0495, + "num_tokens": 3386449.0, + "reward": 0.2362132353009656, + "reward_std": 0.2431453033350408, + "rewards/code_reward/mean": 0.2362132353009656, + "rewards/code_reward/std": 0.2431453038007021, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 100 }, { "clip_ratio": 0.0, - "completion_length": 477.859375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.25, + "completions/max_terminated_length": 550.25, + "completions/mean_length": 408.90625, + "completions/mean_terminated_length": 408.90625, + "completions/min_length": 278.25, + "completions/min_terminated_length": 278.25, "epoch": 0.011305442843150973, - "grad_norm": 0.6672407190007655, - "kl": 0.05157470703125, - "learning_rate": 4.659822517785203e-06, - "loss": 0.1641, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.0129951041942997, + "kl": 0.1279296875, + "learning_rate": 4.667487853225931e-06, + "loss": 0.0108, + "num_tokens": 3415838.0, + "reward": 0.11401251330971718, + "reward_std": 0.17430034466087818, + "rewards/code_reward/mean": 0.11401251330971718, + "rewards/code_reward/std": 0.17430034838616848, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 101 }, { "clip_ratio": 0.0, - "completion_length": 345.8125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.5, + "completions/max_terminated_length": 497.5, + "completions/mean_length": 321.125, + "completions/mean_terminated_length": 321.125, + "completions/min_length": 112.75, + "completions/min_terminated_length": 112.75, "epoch": 0.011417377920805933, - "grad_norm": 0.9439619070630538, - "kl": 0.061767578125, - "learning_rate": 4.6520770500166165e-06, - "loss": 0.3783, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 0.046761590657522806, + "kl": 0.1375732421875, + "learning_rate": 4.659822517785203e-06, + "loss": 0.0014, + "num_tokens": 3437298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 102 }, { "clip_ratio": 0.0, - "completion_length": 539.90625, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1112.5, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 582.5, + "completions/mean_terminated_length": 539.5535736083984, + "completions/min_length": 330.75, + "completions/min_terminated_length": 330.75, "epoch": 0.011529312998460892, - "grad_norm": 0.7519670307168462, - "kl": 0.05535888671875, - "learning_rate": 4.644251774904487e-06, - "loss": 0.0952, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 0.936453543752835, + "kl": 0.1123046875, + "learning_rate": 4.6520770500166165e-06, + "loss": -0.0705, + "num_tokens": 3476618.0, + "reward": 0.11124547757208347, + "reward_std": 0.17285121232271194, + "rewards/code_reward/mean": 0.11124547757208347, + "rewards/code_reward/std": 0.17285121977329254, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 103 }, { "clip_ratio": 0.0, - "completion_length": 396.53125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 329.78125, + "completions/mean_terminated_length": 329.78125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, "epoch": 0.011641248076115853, - "grad_norm": 1.2637256595006654, - "kl": 0.160400390625, - "learning_rate": 4.636347020781684e-06, - "loss": -0.0378, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.9390473948263904, + "kl": 0.153564453125, + "learning_rate": 4.644251774904487e-06, + "loss": -0.1439, + "num_tokens": 3504803.0, + "reward": 0.15121639240533113, + "reward_std": 0.10038218321278691, + "rewards/code_reward/mean": 0.15121639240533113, + "rewards/code_reward/std": 0.10038218321278691, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 104 }, { "clip_ratio": 0.0, - "completion_length": 403.40625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.0, + "completions/max_terminated_length": 631.0, + "completions/mean_length": 424.125, + "completions/mean_terminated_length": 424.125, + "completions/min_length": 292.75, + "completions/min_terminated_length": 292.75, "epoch": 0.011753183153770812, - "grad_norm": 0.9414284114713096, - "kl": 0.0699462890625, - "learning_rate": 4.6283631193158605e-06, - "loss": -0.0089, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 1.2049832685071231, + "kl": 0.167236328125, + "learning_rate": 4.636347020781684e-06, + "loss": -0.0759, + "num_tokens": 3539471.0, + "reward": 0.07833968009799719, + "reward_std": 0.13395367993507534, + "rewards/code_reward/mean": 0.07833968009799719, + "rewards/code_reward/std": 0.13395368051715195, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 105 }, { "clip_ratio": 0.0, - "completion_length": 404.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.0, + "completions/max_terminated_length": 706.0, + "completions/mean_length": 419.71875, + "completions/mean_terminated_length": 419.71875, + "completions/min_length": 227.5, + "completions/min_terminated_length": 227.5, "epoch": 0.011865118231425774, - "grad_norm": 0.15424717287769374, - "kl": 0.0926513671875, - "learning_rate": 4.620300405495532e-06, - "loss": 0.0009, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.8526628663219907, + "kl": 0.146240234375, + "learning_rate": 4.6283631193158605e-06, + "loss": 0.039, + "num_tokens": 3576414.0, + "reward": 0.12546709179878235, + "reward_std": 0.13694094121456146, + "rewards/code_reward/mean": 0.12546709179878235, + "rewards/code_reward/std": 0.13694094866514206, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 106 }, { "clip_ratio": 0.0, - "completion_length": 376.765625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.5, + "completions/max_terminated_length": 574.5, + "completions/mean_length": 430.21875, + "completions/mean_terminated_length": 430.21875, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, "epoch": 0.011977053309080733, - "grad_norm": 0.6366716085256641, - "kl": 0.0911865234375, - "learning_rate": 4.612159217616022e-06, - "loss": 0.0133, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.1258481541663952, + "kl": 0.1580810546875, + "learning_rate": 4.620300405495532e-06, + "loss": -0.0443, + "num_tokens": 3613485.0, + "reward": 0.11382943368516862, + "reward_std": 0.1571302842348814, + "rewards/code_reward/mean": 0.11382943368516862, + "rewards/code_reward/std": 0.1571302842348814, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 107 }, { "clip_ratio": 0.0, - "completion_length": 385.484375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 879.0, + "completions/max_terminated_length": 879.0, + "completions/mean_length": 514.90625, + "completions/mean_terminated_length": 514.90625, + "completions/min_length": 229.75, + "completions/min_terminated_length": 229.75, "epoch": 0.012088988386735694, - "grad_norm": 0.020907253695270293, - "kl": 0.06060791015625, - "learning_rate": 4.603939897265268e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.8215610976673118, + "kl": 0.1373291015625, + "learning_rate": 4.612159217616022e-06, + "loss": -0.0201, + "num_tokens": 3648370.0, + "reward": 0.2085580751299858, + "reward_std": 0.1858917400240898, + "rewards/code_reward/mean": 0.2085580751299858, + "rewards/code_reward/std": 0.185891754925251, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 108 }, { "clip_ratio": 0.0, - "completion_length": 381.453125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.25, + "completions/max_terminated_length": 700.25, + "completions/mean_length": 498.0625, + "completions/mean_terminated_length": 498.0625, + "completions/min_length": 314.25, + "completions/min_terminated_length": 314.25, "epoch": 0.012200923464390653, - "grad_norm": 0.024537257651566672, - "kl": 0.06396484375, - "learning_rate": 4.595642789309492e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.0580602071827545, + "kl": 0.15966796875, + "learning_rate": 4.603939897265268e-06, + "loss": 0.0414, + "num_tokens": 3684604.0, + "reward": 0.08671755698742345, + "reward_std": 0.06942056433763355, + "rewards/code_reward/mean": 0.08671755698742345, + "rewards/code_reward/std": 0.06942057458218187, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 109 }, { "clip_ratio": 0.0, - "completion_length": 344.421875, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1063.25, + "completions/max_terminated_length": 766.75, + "completions/mean_length": 572.75, + "completions/mean_terminated_length": 522.6160736083984, + "completions/min_length": 313.75, + "completions/min_terminated_length": 313.75, "epoch": 0.012312858542045613, - "grad_norm": 0.02438549022281726, - "kl": 0.06048583984375, - "learning_rate": 4.587268241878724e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.6888409929879039, + "kl": 0.118896484375, + "learning_rate": 4.595642789309492e-06, + "loss": -0.246, + "num_tokens": 3720668.0, + "reward": 0.10784313827753067, + "reward_std": 0.164918415248394, + "rewards/code_reward/mean": 0.10784313827753067, + "rewards/code_reward/std": 0.164918415248394, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 110 }, { "clip_ratio": 0.0, - "completion_length": 338.796875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.5, + "completions/max_terminated_length": 660.5, + "completions/mean_length": 428.15625, + "completions/mean_terminated_length": 428.15625, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, "epoch": 0.012424793619700574, - "grad_norm": 0.5942167965684412, - "kl": 0.0833740234375, - "learning_rate": 4.578816606352205e-06, - "loss": 0.0065, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.6151737403026374, + "kl": 0.153564453125, + "learning_rate": 4.587268241878724e-06, + "loss": 0.0941, + "num_tokens": 3757241.0, + "reward": 0.0126953125, + "reward_std": 0.03590776585042477, + "rewards/code_reward/mean": 0.0126953125, + "rewards/code_reward/std": 0.03590776678174734, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 111 }, { "clip_ratio": 0.0, - "completion_length": 379.953125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 644.25, + "completions/max_terminated_length": 644.25, + "completions/mean_length": 468.65625, + "completions/mean_terminated_length": 468.65625, + "completions/min_length": 265.25, + "completions/min_terminated_length": 265.25, "epoch": 0.012536728697355533, - "grad_norm": 0.023256524966985128, - "kl": 0.05865478515625, - "learning_rate": 4.570288237343632e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.8581479811610659, + "kl": 0.162841796875, + "learning_rate": 4.578816606352205e-06, + "loss": -0.0116, + "num_tokens": 3786094.0, + "reward": 0.10341486724792048, + "reward_std": 0.09727730182930827, + "rewards/code_reward/mean": 0.10341486724792048, + "rewards/code_reward/std": 0.09727730927988887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 112 }, { "clip_ratio": 0.0, - "completion_length": 344.203125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.75, + "completions/max_terminated_length": 602.75, + "completions/mean_length": 443.21875, + "completions/mean_terminated_length": 443.21875, + "completions/min_length": 278.25, + "completions/min_terminated_length": 278.25, "epoch": 0.012648663775010494, - "grad_norm": 0.4917785489739923, - "kl": 0.0616455078125, - "learning_rate": 4.561683492686289e-06, - "loss": 0.0131, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.032518001055115, + "kl": 0.150634765625, + "learning_rate": 4.570288237343632e-06, + "loss": 0.0057, + "num_tokens": 3815197.0, + "reward": 0.19150842766975984, + "reward_std": 0.17536822147667408, + "rewards/code_reward/mean": 0.19150842766975984, + "rewards/code_reward/std": 0.17536823637783527, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 113 }, { "clip_ratio": 0.0, - "completion_length": 323.609375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 471.78125, + "completions/mean_terminated_length": 471.78125, + "completions/min_length": 229.75, + "completions/min_terminated_length": 229.75, "epoch": 0.012760598852665454, - "grad_norm": 0.4189078075428464, - "kl": 0.06109619140625, - "learning_rate": 4.5530027334180285e-06, - "loss": -0.0367, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.7883185431993377, + "kl": 0.154296875, + "learning_rate": 4.561683492686289e-06, + "loss": -0.0316, + "num_tokens": 3849462.0, + "reward": 0.15483782812952995, + "reward_std": 0.09077820833772421, + "rewards/code_reward/mean": 0.15483782812952995, + "rewards/code_reward/std": 0.09077820833772421, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 114 }, { "clip_ratio": 0.0, - "completion_length": 415.71875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.5, + "completions/max_terminated_length": 707.5, + "completions/mean_length": 486.5, + "completions/mean_terminated_length": 486.5, + "completions/min_length": 265.5, + "completions/min_terminated_length": 265.5, "epoch": 0.012872533930320415, - "grad_norm": 0.03244601571119012, - "kl": 0.0718994140625, - "learning_rate": 4.544246323766122e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.8556740089529604, + "kl": 0.157958984375, + "learning_rate": 4.5530027334180285e-06, + "loss": 0.117, + "num_tokens": 3887790.0, + "reward": 0.127931407361757, + "reward_std": 0.11615415895357728, + "rewards/code_reward/mean": 0.127931407361757, + "rewards/code_reward/std": 0.1161541665205732, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 115 }, { "clip_ratio": 0.0, - "completion_length": 294.453125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 393.375, + "completions/mean_terminated_length": 393.375, + "completions/min_length": 257.25, + "completions/min_terminated_length": 257.25, "epoch": 0.012984469007975374, - "grad_norm": 0.04077561657409853, - "kl": 0.0740966796875, - "learning_rate": 4.535414631131983e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.7897406376014194, + "kl": 0.19970703125, + "learning_rate": 4.544246323766122e-06, + "loss": 0.0248, + "num_tokens": 3919554.0, + "reward": 0.3160191457718611, + "reward_std": 0.041524797677993774, + "rewards/code_reward/mean": 0.3160191457718611, + "rewards/code_reward/std": 0.041524799540638924, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 116 }, { "clip_ratio": 0.0, - "completion_length": 259.078125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.25, + "completions/max_terminated_length": 539.25, + "completions/mean_length": 380.90625, + "completions/mean_terminated_length": 380.90625, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, "epoch": 0.013096404085630335, - "grad_norm": 0.04561395129255693, - "kl": 0.0780029296875, - "learning_rate": 4.526508026075746e-06, - "loss": 0.0008, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.1134080298302058, + "kl": 0.165771484375, + "learning_rate": 4.535414631131983e-06, + "loss": -0.0078, + "num_tokens": 3944911.0, + "reward": 0.24838980130152777, + "reward_std": 0.09577816107776016, + "rewards/code_reward/mean": 0.24838980130152777, + "rewards/code_reward/std": 0.09577816678211093, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 117 }, { "clip_ratio": 0.0, - "completion_length": 353.875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 570.5, + "completions/max_terminated_length": 570.5, + "completions/mean_length": 429.4375, + "completions/mean_terminated_length": 429.4375, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, "epoch": 0.013208339163285295, - "grad_norm": 0.032666442461546756, - "kl": 0.071533203125, - "learning_rate": 4.517526882300721e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.9495608914425865, + "kl": 0.1414794921875, + "learning_rate": 4.526508026075746e-06, + "loss": 0.0082, + "num_tokens": 3972925.0, + "reward": 0.14121240563690662, + "reward_std": 0.18964817747473717, + "rewards/code_reward/mean": 0.14121240563690662, + "rewards/code_reward/std": 0.18964817561209202, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 118 }, { "clip_ratio": 0.0, - "completion_length": 445.484375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 786.5, + "completions/max_terminated_length": 786.5, + "completions/mean_length": 586.34375, + "completions/mean_terminated_length": 586.34375, + "completions/min_length": 383.0, + "completions/min_terminated_length": 383.0, "epoch": 0.013320274240940254, - "grad_norm": 0.030745081226969093, - "kl": 0.0496826171875, - "learning_rate": 4.508471576637713e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, + "grad_norm": 0.02608214090069118, + "kl": 0.1209716796875, + "learning_rate": 4.517526882300721e-06, + "loss": 0.0012, + "num_tokens": 4010480.0, + "reward": 0.0, "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 119 }, { "clip_ratio": 0.0, - "completion_length": 410.46875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 434.34375, + "completions/mean_terminated_length": 434.34375, + "completions/min_length": 315.25, + "completions/min_terminated_length": 315.25, "epoch": 0.013432209318595215, - "grad_norm": 0.025825744807750583, - "kl": 0.0751953125, - "learning_rate": 4.499342489029211e-06, - "loss": 0.0008, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.1103364222586907, + "kl": 0.16455078125, + "learning_rate": 4.508471576637713e-06, + "loss": 0.0019, + "num_tokens": 4047539.0, + "reward": 0.11785737407626584, + "reward_std": 0.09593676403164864, + "rewards/code_reward/mean": 0.11785737407626584, + "rewards/code_reward/std": 0.09593676414806396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 120 }, { "clip_ratio": 0.0, - "completion_length": 261.15625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 587.5, + "completions/max_terminated_length": 587.5, + "completions/mean_length": 426.9375, + "completions/mean_terminated_length": 426.9375, + "completions/min_length": 296.25, + "completions/min_terminated_length": 296.25, "epoch": 0.013544144396250174, - "grad_norm": 0.027055040661668105, - "kl": 0.071533203125, - "learning_rate": 4.490140002513449e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.9365073082574228, + "kl": 0.1627197265625, + "learning_rate": 4.499342489029211e-06, + "loss": -0.0644, + "num_tokens": 4073449.0, + "reward": 0.25551173387793824, + "reward_std": 0.1488891058252193, + "rewards/code_reward/mean": 0.25551173387793824, + "rewards/code_reward/std": 0.1488891058252193, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 121 }, { "clip_ratio": 0.0, - "completion_length": 503.46875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 777.5, + "completions/max_terminated_length": 777.5, + "completions/mean_length": 561.3125, + "completions/mean_terminated_length": 561.3125, + "completions/min_length": 308.5, + "completions/min_terminated_length": 308.5, "epoch": 0.013656079473905135, - "grad_norm": 0.9837824917503732, - "kl": 0.05841064453125, - "learning_rate": 4.48086450320833e-06, - "loss": 0.156, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.9171469588767179, + "kl": 0.16162109375, + "learning_rate": 4.490140002513449e-06, + "loss": 0.0833, + "num_tokens": 4117531.0, + "reward": 0.06771073397248983, + "reward_std": 0.10771879553794861, + "rewards/code_reward/mean": 0.06771073397248983, + "rewards/code_reward/std": 0.1077187992632389, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 122 }, { "clip_ratio": 0.0, - "completion_length": 551.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 825.0, + "completions/max_terminated_length": 825.0, + "completions/mean_length": 602.0, + "completions/mean_terminated_length": 602.0, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, "epoch": 0.013768014551560095, - "grad_norm": 0.36909382007632124, - "kl": 0.0616455078125, - "learning_rate": 4.4715163802952266e-06, - "loss": 0.034, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.5430581678238743, + "kl": 0.1229248046875, + "learning_rate": 4.48086450320833e-06, + "loss": 0.0193, + "num_tokens": 4164115.0, + "reward": 0.06402191519737244, + "reward_std": 0.10564571619033813, + "rewards/code_reward/mean": 0.06402191519737244, + "rewards/code_reward/std": 0.10564571805298328, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 123 }, { "clip_ratio": 0.0, - "completion_length": 311.875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.75, + "completions/max_terminated_length": 700.75, + "completions/mean_length": 464.5, + "completions/mean_terminated_length": 464.5, + "completions/min_length": 276.25, + "completions/min_terminated_length": 276.25, "epoch": 0.013879949629215056, - "grad_norm": 0.02289685144542213, - "kl": 0.0621337890625, - "learning_rate": 4.462096026002655e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.9177689772341072, + "kl": 0.154296875, + "learning_rate": 4.4715163802952266e-06, + "loss": 0.0239, + "num_tokens": 4192843.0, + "reward": 0.1863182729575783, + "reward_std": 0.09033735934644938, + "rewards/code_reward/mean": 0.1863182729575783, + "rewards/code_reward/std": 0.09033736307173967, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 124 }, { "clip_ratio": 0.0, - "completion_length": 349.609375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 752.5, + "completions/max_terminated_length": 752.5, + "completions/mean_length": 489.78125, + "completions/mean_terminated_length": 489.78125, + "completions/min_length": 206.25, + "completions/min_terminated_length": 206.25, "epoch": 0.013991884706870015, - "grad_norm": 0.5288400884276316, - "kl": 0.06207275390625, - "learning_rate": 4.4526038355898144e-06, - "loss": -0.0128, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.8322506450321084, + "kl": 0.202392578125, + "learning_rate": 4.462096026002655e-06, + "loss": 0.0184, + "num_tokens": 4227268.0, + "reward": 0.21419981867074966, + "reward_std": 0.21183521673083305, + "rewards/code_reward/mean": 0.21419981867074966, + "rewards/code_reward/std": 0.21183520928025246, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 125 }, { "clip_ratio": 0.0, - "completion_length": 376.234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 540.03125, + "completions/mean_terminated_length": 540.03125, + "completions/min_length": 284.25, + "completions/min_terminated_length": 284.25, "epoch": 0.014103819784524976, - "grad_norm": 0.019666228568501372, - "kl": 0.056396484375, - "learning_rate": 4.4430402073300035e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.9689144701432464, + "kl": 0.146728515625, + "learning_rate": 4.4526038355898144e-06, + "loss": -0.0308, + "num_tokens": 4261797.0, + "reward": 0.261167012155056, + "reward_std": 0.22630748711526394, + "rewards/code_reward/mean": 0.261167012155056, + "rewards/code_reward/std": 0.22630748711526394, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 126 }, { "clip_ratio": 0.0, - "completion_length": 338.671875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.75, + "completions/max_terminated_length": 673.75, + "completions/mean_length": 450.34375, + "completions/mean_terminated_length": 450.34375, + "completions/min_length": 210.25, + "completions/min_terminated_length": 210.25, "epoch": 0.014215754862179936, - "grad_norm": 0.022162440604770836, - "kl": 0.0638427734375, - "learning_rate": 4.433405542493909e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.4213790833134132, + "kl": 0.144287109375, + "learning_rate": 4.4430402073300035e-06, + "loss": 0.0292, + "num_tokens": 4290992.0, + "reward": 0.012987012974917889, + "reward_std": 0.013883699662983418, + "rewards/code_reward/mean": 0.012987012974917889, + "rewards/code_reward/std": 0.013883701525628567, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 127 }, { "clip_ratio": 0.0, - "completion_length": 356.328125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 664.25, + "completions/max_terminated_length": 664.25, + "completions/mean_length": 453.6875, + "completions/mean_terminated_length": 453.6875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, "epoch": 0.014327689939834895, - "grad_norm": 0.5613568768967913, - "kl": 0.06121826171875, - "learning_rate": 4.4237002453327734e-06, - "loss": -0.0001, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.8712327296844586, + "kl": 0.145263671875, + "learning_rate": 4.433405542493909e-06, + "loss": -0.0154, + "num_tokens": 4323358.0, + "reward": 0.12838431354612112, + "reward_std": 0.14957262016832829, + "rewards/code_reward/mean": 0.12838431354612112, + "rewards/code_reward/std": 0.14957262203097343, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 128 }, { "clip_ratio": 0.0, - "completion_length": 375.328125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 840.75, + "completions/max_terminated_length": 840.75, + "completions/mean_length": 554.40625, + "completions/mean_terminated_length": 554.40625, + "completions/min_length": 317.5, + "completions/min_terminated_length": 317.5, "epoch": 0.014439625017489856, - "grad_norm": 0.025068232916270288, - "kl": 0.06744384765625, - "learning_rate": 4.4139247230614245e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.8824605681531097, + "kl": 0.15185546875, + "learning_rate": 4.4237002453327734e-06, + "loss": 0.096, + "num_tokens": 4357363.0, + "reward": 0.22759733814746141, + "reward_std": 0.2646235190331936, + "rewards/code_reward/mean": 0.22759733814746141, + "rewards/code_reward/std": 0.26462352089583874, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 129 }, { "clip_ratio": 0.0, - "completion_length": 407.34375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.75, + "completions/max_terminated_length": 601.75, + "completions/mean_length": 434.46875, + "completions/mean_terminated_length": 434.46875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, "epoch": 0.014551560095144815, - "grad_norm": 0.018723191179922716, - "kl": 0.05828857421875, - "learning_rate": 4.404079385841201e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.011223461250626, + "kl": 0.16943359375, + "learning_rate": 4.4139247230614245e-06, + "loss": 0.0213, + "num_tokens": 4390298.0, + "reward": 0.17828914523124695, + "reward_std": 0.2471884172409773, + "rewards/code_reward/mean": 0.17828914523124695, + "rewards/code_reward/std": 0.24718842469155788, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 130 }, { "clip_ratio": 0.0, - "completion_length": 265.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.25, + "completions/max_terminated_length": 502.25, + "completions/mean_length": 342.0625, + "completions/mean_terminated_length": 342.0625, + "completions/min_length": 229.75, + "completions/min_terminated_length": 229.75, "epoch": 0.014663495172799777, - "grad_norm": 0.16972024878699354, - "kl": 0.09710693359375, - "learning_rate": 4.394164646762734e-06, - "loss": 0.001, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.7287897396776124, + "kl": 0.223876953125, + "learning_rate": 4.404079385841201e-06, + "loss": -0.0213, + "num_tokens": 4411124.0, + "reward": 0.599999999627471, + "reward_std": 0.13620114093646407, + "rewards/code_reward/mean": 0.599999999627471, + "rewards/code_reward/std": 0.13620115583762527, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 131 }, { "clip_ratio": 0.0, - "completion_length": 321.984375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 406.1875, + "completions/mean_terminated_length": 406.1875, + "completions/min_length": 241.5, + "completions/min_terminated_length": 241.5, "epoch": 0.014775430250454736, - "grad_norm": 0.0237436227671808, - "kl": 0.06884765625, - "learning_rate": 4.384180921828618e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.1828069225002862, + "kl": 0.21142578125, + "learning_rate": 4.394164646762734e-06, + "loss": 0.0079, + "num_tokens": 4436370.0, + "reward": 0.070248453237582, + "reward_std": 0.07654083496890962, + "rewards/code_reward/mean": 0.070248453237582, + "rewards/code_reward/std": 0.07654083543457091, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 132 }, { "clip_ratio": 0.0, - "completion_length": 419.296875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 777.75, + "completions/max_terminated_length": 777.75, + "completions/mean_length": 456.53125, + "completions/mean_terminated_length": 456.53125, + "completions/min_length": 206.25, + "completions/min_terminated_length": 206.25, "epoch": 0.014887365328109697, - "grad_norm": 0.6392088951882646, - "kl": 0.05865478515625, - "learning_rate": 4.374128629935955e-06, - "loss": 0.1335, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.8073699112875448, + "kl": 0.1446533203125, + "learning_rate": 4.384180921828618e-06, + "loss": 0.0692, + "num_tokens": 4466595.0, + "reward": 0.17728960141539574, + "reward_std": 0.20221376791596413, + "rewards/code_reward/mean": 0.17728960141539574, + "rewards/code_reward/std": 0.20221376977860928, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 133 }, { "clip_ratio": 0.0, - "completion_length": 371.859375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.5, + "completions/max_terminated_length": 632.5, + "completions/mean_length": 455.34375, + "completions/mean_terminated_length": 455.34375, + "completions/min_length": 254.75, + "completions/min_terminated_length": 254.75, "epoch": 0.014999300405764656, - "grad_norm": 0.0258451541588282, - "kl": 0.0711669921875, - "learning_rate": 4.364008192858781e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.8462919317415648, + "kl": 0.156494140625, + "learning_rate": 4.374128629935955e-06, + "loss": 0.0137, + "num_tokens": 4500494.0, + "reward": 0.1631067901616916, + "reward_std": 0.13719243195373565, + "rewards/code_reward/mean": 0.1631067901616916, + "rewards/code_reward/std": 0.13719243567902595, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 134 }, { "clip_ratio": 0.0, - "completion_length": 386.5625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 654.75, + "completions/max_terminated_length": 654.75, + "completions/mean_length": 447.09375, + "completions/mean_terminated_length": 447.09375, + "completions/min_length": 267.75, + "completions/min_terminated_length": 267.75, "epoch": 0.015111235483419617, - "grad_norm": 0.023614977303173818, - "kl": 0.0672607421875, - "learning_rate": 4.353820035230366e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.0654308580054503, + "kl": 0.18505859375, + "learning_rate": 4.364008192858781e-06, + "loss": -0.0584, + "num_tokens": 4531953.0, + "reward": 0.30278054997324944, + "reward_std": 0.2559507302939892, + "rewards/code_reward/mean": 0.30278054997324944, + "rewards/code_reward/std": 0.2559507489204407, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 135 }, { "clip_ratio": 0.0, - "completion_length": 466.734375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 524.3125, + "completions/mean_terminated_length": 524.3125, + "completions/min_length": 310.25, + "completions/min_terminated_length": 310.25, "epoch": 0.015223170561074577, - "grad_norm": 0.019070024346192215, - "kl": 0.066162109375, - "learning_rate": 4.3435645845254e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.7375182423747296, + "kl": 0.1689453125, + "learning_rate": 4.353820035230366e-06, + "loss": -0.0053, + "num_tokens": 4570779.0, + "reward": 0.27923886105418205, + "reward_std": 0.0807168073952198, + "rewards/code_reward/mean": 0.27923886105418205, + "rewards/code_reward/std": 0.0807168073952198, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 136 }, { "clip_ratio": 0.0, - "completion_length": 504.890625, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 954.0, + "completions/max_terminated_length": 635.25, + "completions/mean_length": 518.625, + "completions/mean_terminated_length": 471.20982360839844, + "completions/min_length": 182.75, + "completions/min_terminated_length": 182.75, "epoch": 0.015335105638729536, - "grad_norm": 0.37696582370006476, - "kl": 0.06463623046875, - "learning_rate": 4.333242271042054e-06, - "loss": -0.0004, - "reward": 0.09687500260770321, - "reward_std": 0.008539125323295593, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 0.7408425833924634, + "kl": 0.128662109375, + "learning_rate": 4.3435645845254e-06, + "loss": -0.0565, + "num_tokens": 4603031.0, + "reward": 0.08707524091005325, + "reward_std": 0.1465706154704094, + "rewards/code_reward/mean": 0.08707524091005325, + "rewards/code_reward/std": 0.1465706117451191, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 137 }, { "clip_ratio": 0.0, - "completion_length": 567.234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.75, + "completions/max_terminated_length": 649.75, + "completions/mean_length": 465.34375, + "completions/mean_terminated_length": 465.34375, + "completions/min_length": 238.25, + "completions/min_terminated_length": 238.25, "epoch": 0.015447040716384497, - "grad_norm": 3.0390479634412357, - "kl": 0.071533203125, - "learning_rate": 4.32285352788393e-06, - "loss": 0.2188, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 0.9226530191775734, + "kl": 0.196533203125, + "learning_rate": 4.333242271042054e-06, + "loss": 0.0199, + "num_tokens": 4640226.0, + "reward": 0.12062139442423359, + "reward_std": 0.12237106915563345, + "rewards/code_reward/mean": 0.12062139442423359, + "rewards/code_reward/std": 0.1223710693884641, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 138 }, { "clip_ratio": 0.0, - "completion_length": 457.21875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 463.03125, + "completions/mean_terminated_length": 463.03125, + "completions/min_length": 170.5, + "completions/min_terminated_length": 170.5, "epoch": 0.015558975794039457, - "grad_norm": 1.8150343506782198, - "kl": 0.106689453125, - "learning_rate": 4.312398790941882e-06, - "loss": 0.3003, - "reward": 0.09531250223517418, - "reward_std": 0.01478912541642785, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.953125, + "grad_norm": 0.5630743662662061, + "kl": 0.133544921875, + "learning_rate": 4.32285352788393e-06, + "loss": -0.0273, + "num_tokens": 4672011.0, + "reward": 0.0625, + "reward_std": 0.06681530922651291, + "rewards/code_reward/mean": 0.0625, + "rewards/code_reward/std": 0.06681530922651291, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 139 }, { "clip_ratio": 0.0, - "completion_length": 609.28125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 667.25, + "completions/max_terminated_length": 667.25, + "completions/mean_length": 473.21875, + "completions/mean_terminated_length": 473.21875, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, "epoch": 0.015670910871694418, - "grad_norm": 0.03462843424405208, - "kl": 0.05792236328125, - "learning_rate": 4.301878498875735e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.8457340493749413, + "kl": 0.203125, + "learning_rate": 4.312398790941882e-06, + "loss": 0.0252, + "num_tokens": 4707650.0, + "reward": 0.01744219067040831, + "reward_std": 0.03082139673642814, + "rewards/code_reward/mean": 0.01744219067040831, + "rewards/code_reward/std": 0.030821396969258785, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 140 }, { "clip_ratio": 0.0, - "completion_length": 473.421875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 774.0, + "completions/max_terminated_length": 774.0, + "completions/mean_length": 459.03125, + "completions/mean_terminated_length": 459.03125, + "completions/min_length": 282.75, + "completions/min_terminated_length": 282.75, "epoch": 0.015782845949349377, - "grad_norm": 1.7729469082974438, - "kl": 0.06402587890625, - "learning_rate": 4.291293093095873e-06, - "loss": 0.1156, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.0215167669033631, + "kl": 0.1568603515625, + "learning_rate": 4.301878498875735e-06, + "loss": -0.0223, + "num_tokens": 4738659.0, + "reward": 0.14855818077921867, + "reward_std": 0.18304241634905338, + "rewards/code_reward/mean": 0.14855818077921867, + "rewards/code_reward/std": 0.18304241262376308, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 141 }, { "clip_ratio": 0.0, - "completion_length": 440.90625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.75, + "completions/max_terminated_length": 573.75, + "completions/mean_length": 438.84375, + "completions/mean_terminated_length": 438.84375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, "epoch": 0.015894781027004336, - "grad_norm": 0.678889245969432, - "kl": 0.0787353515625, - "learning_rate": 4.280643017744723e-06, - "loss": 0.0363, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.958766664227721, + "kl": 0.20068359375, + "learning_rate": 4.291293093095873e-06, + "loss": 0.0597, + "num_tokens": 4769838.0, + "reward": 0.0944940485060215, + "reward_std": 0.07186714326962829, + "rewards/code_reward/mean": 0.0944940485060215, + "rewards/code_reward/std": 0.07186715072020888, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 142 }, { "clip_ratio": 0.0, - "completion_length": 615.828125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.25, + "completions/max_terminated_length": 705.25, + "completions/mean_length": 510.0625, + "completions/mean_terminated_length": 510.0625, + "completions/min_length": 270.25, + "completions/min_terminated_length": 270.25, "epoch": 0.0160067161046593, - "grad_norm": 3.1600108504474265, - "kl": 0.103759765625, - "learning_rate": 4.269928719678117e-06, - "loss": 0.2578, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 0.8340511277589133, + "kl": 0.191650390625, + "learning_rate": 4.280643017744723e-06, + "loss": -0.0546, + "num_tokens": 4813416.0, + "reward": 0.017440817784518003, + "reward_std": 0.015970090869814157, + "rewards/code_reward/mean": 0.017440817784518003, + "rewards/code_reward/std": 0.015970090869814157, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 143 }, { "clip_ratio": 0.0, - "completion_length": 383.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.5, + "completions/max_terminated_length": 676.5, + "completions/mean_length": 436.625, + "completions/mean_terminated_length": 436.625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, "epoch": 0.01611865118231426, - "grad_norm": 0.13214155840655448, - "kl": 0.101318359375, - "learning_rate": 4.2591506484465426e-06, - "loss": 0.001, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.9354512924987337, + "kl": 0.1859130859375, + "learning_rate": 4.269928719678117e-06, + "loss": 0.0158, + "num_tokens": 4850540.0, + "reward": 0.18274498358368874, + "reward_std": 0.1233069859445095, + "rewards/code_reward/mean": 0.18274498358368874, + "rewards/code_reward/std": 0.1233069896697998, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 144 }, { "clip_ratio": 0.0, - "completion_length": 414.90625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 417.8125, + "completions/mean_terminated_length": 417.8125, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, "epoch": 0.016230586259969218, - "grad_norm": 0.7194283098737337, - "kl": 0.08380126953125, - "learning_rate": 4.248309256276283e-06, - "loss": -0.0069, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.8775436859559402, + "kl": 0.200927734375, + "learning_rate": 4.2591506484465426e-06, + "loss": 0.06, + "num_tokens": 4880958.0, + "reward": 0.1889239656738937, + "reward_std": 0.06604543374851346, + "rewards/code_reward/mean": 0.1889239656738937, + "rewards/code_reward/std": 0.06604543328285217, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 145 }, { "clip_ratio": 0.0, - "completion_length": 480.046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.5, + "completions/max_terminated_length": 642.5, + "completions/mean_length": 453.6875, + "completions/mean_terminated_length": 453.6875, + "completions/min_length": 225.75, + "completions/min_terminated_length": 225.75, "epoch": 0.016342521337624177, - "grad_norm": 0.10784588867048772, - "kl": 0.0704345703125, - "learning_rate": 4.23740499805044e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.0579923330835856, + "kl": 0.190673828125, + "learning_rate": 4.248309256276283e-06, + "loss": 0.0058, + "num_tokens": 4908772.0, + "reward": 0.22657467075623572, + "reward_std": 0.27265046804677695, + "rewards/code_reward/mean": 0.22657467075623572, + "rewards/code_reward/std": 0.27265046804677695, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 146 }, { "clip_ratio": 0.0, - "completion_length": 386.234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 373.4375, + "completions/mean_terminated_length": 373.4375, + "completions/min_length": 167.75, + "completions/min_terminated_length": 167.75, "epoch": 0.016454456415279137, - "grad_norm": 1.2992200606773725, - "kl": 0.1204833984375, - "learning_rate": 4.22643833128985e-06, - "loss": 0.0012, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.2602004582489772, + "kl": 0.2349853515625, + "learning_rate": 4.23740499805044e-06, + "loss": 0.0749, + "num_tokens": 4935178.0, + "reward": 0.3513445816934109, + "reward_std": 0.20541435480117798, + "rewards/code_reward/mean": 0.3513445816934109, + "rewards/code_reward/std": 0.20541436225175858, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 147 }, { "clip_ratio": 0.0, - "completion_length": 430.171875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.25, + "completions/max_terminated_length": 619.25, + "completions/mean_length": 427.78125, + "completions/mean_terminated_length": 427.78125, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, "epoch": 0.0165663914929341, - "grad_norm": 16.491496062528398, - "kl": 0.4547119140625, - "learning_rate": 4.215409716133885e-06, - "loss": 0.131, - "reward": 0.09531250223517418, - "reward_std": 0.01478912541642785, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.953125, + "grad_norm": 1.1160890350070682, + "kl": 0.17919921875, + "learning_rate": 4.22643833128985e-06, + "loss": 0.0269, + "num_tokens": 4966539.0, + "reward": 0.279205069411546, + "reward_std": 0.04902365058660507, + "rewards/code_reward/mean": 0.279205069411546, + "rewards/code_reward/std": 0.04902365151792765, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 148 }, { "clip_ratio": 0.0, - "completion_length": 471.171875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 665.25, + "completions/max_terminated_length": 665.25, + "completions/mean_length": 371.625, + "completions/mean_terminated_length": 371.625, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, "epoch": 0.01667832657058906, - "grad_norm": 4.096997318208475, - "kl": 0.2073974609375, - "learning_rate": 4.204319615321151e-06, - "loss": 0.0021, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.1750545317228818, + "kl": 0.23681640625, + "learning_rate": 4.215409716133885e-06, + "loss": 0.015, + "num_tokens": 5001903.0, + "reward": 0.17107138480059803, + "reward_std": 0.16521674406249076, + "rewards/code_reward/mean": 0.17107138480059803, + "rewards/code_reward/std": 0.16521676117554307, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 149 }, { "clip_ratio": 0.0, - "completion_length": 423.59375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.25, + "completions/max_terminated_length": 576.25, + "completions/mean_length": 350.125, + "completions/mean_terminated_length": 350.125, + "completions/min_length": 144.5, + "completions/min_terminated_length": 144.5, "epoch": 0.016790261648244018, - "grad_norm": 3.910798929312304, - "kl": 0.2662353515625, - "learning_rate": 4.193168494170065e-06, - "loss": 0.2077, - "reward": 0.09218750335276127, - "reward_std": 0.01861694734543562, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.921875, + "grad_norm": 1.0413783228416158, + "kl": 0.249755859375, + "learning_rate": 4.204319615321151e-06, + "loss": 0.0077, + "num_tokens": 5030091.0, + "reward": 0.09492883179336786, + "reward_std": 0.12909611221402884, + "rewards/code_reward/mean": 0.09492883179336786, + "rewards/code_reward/std": 0.12909611966460943, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 150 }, { "clip_ratio": 0.0, - "completion_length": 388.78125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.75, + "completions/max_terminated_length": 585.75, + "completions/mean_length": 353.1875, + "completions/mean_terminated_length": 353.1875, + "completions/min_length": 91.25, + "completions/min_terminated_length": 91.25, "epoch": 0.016902196725898978, - "grad_norm": 13.951120691377671, - "kl": 0.37646484375, - "learning_rate": 4.181956820559339e-06, - "loss": 0.5985, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 1.579544739950842, + "kl": 0.50390625, + "learning_rate": 4.193168494170065e-06, + "loss": 0.0444, + "num_tokens": 5057441.0, + "reward": 0.600965291261673, + "reward_std": 0.2557707913219929, + "rewards/code_reward/mean": 0.600965291261673, + "rewards/code_reward/std": 0.2557708006352186, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 151 }, { "clip_ratio": 0.0, - "completion_length": 348.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.25, + "completions/max_terminated_length": 577.25, + "completions/mean_length": 340.625, + "completions/mean_terminated_length": 340.625, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, "epoch": 0.01701413180355394, - "grad_norm": 5.2519598128920455, - "kl": 0.1241455078125, - "learning_rate": 4.170685064908342e-06, - "loss": 0.2291, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 1.3003579285788192, + "kl": 0.190673828125, + "learning_rate": 4.181956820559339e-06, + "loss": 0.132, + "num_tokens": 5082069.0, + "reward": 0.32964441180229187, + "reward_std": 0.2922050729393959, + "rewards/code_reward/mean": 0.32964441180229187, + "rewards/code_reward/std": 0.2922050729393959, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 152 }, { "clip_ratio": 0.0, - "completion_length": 459.453125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.75, + "completions/max_terminated_length": 463.75, + "completions/mean_length": 249.5625, + "completions/mean_terminated_length": 249.5625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, "epoch": 0.0171260668812089, - "grad_norm": 10.132405471868221, - "kl": 0.1990966796875, - "learning_rate": 4.159353700157365e-06, - "loss": 0.1752, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 1.2964116512514992, + "kl": 0.23046875, + "learning_rate": 4.170685064908342e-06, + "loss": 0.0824, + "num_tokens": 5110151.0, + "reward": 0.128064907155931, + "reward_std": 0.0706186261959374, + "rewards/code_reward/mean": 0.128064907155931, + "rewards/code_reward/std": 0.07061862386763096, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 153 }, { "clip_ratio": 0.0, - "completion_length": 386.171875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.75, + "completions/max_terminated_length": 499.75, + "completions/mean_length": 290.53125, + "completions/mean_terminated_length": 290.53125, + "completions/min_length": 119.25, + "completions/min_terminated_length": 119.25, "epoch": 0.01723800195886386, - "grad_norm": 0.784730424677537, - "kl": 0.0968017578125, - "learning_rate": 4.14796320174778e-06, - "loss": 0.001, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.8703199526169038, + "kl": 0.276611328125, + "learning_rate": 4.159353700157365e-06, + "loss": -0.0831, + "num_tokens": 5137592.0, + "reward": 0.11129332333803177, + "reward_std": 0.10705379582941532, + "rewards/code_reward/mean": 0.11129332333803177, + "rewards/code_reward/std": 0.1070537967607379, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 154 }, { "clip_ratio": 0.0, - "completion_length": 368.65625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 322.75, + "completions/mean_terminated_length": 322.75, + "completions/min_length": 119.75, + "completions/min_terminated_length": 119.75, "epoch": 0.01734993703651882, - "grad_norm": 27.02725934627477, - "kl": 0.109619140625, - "learning_rate": 4.136514047602087e-06, - "loss": 0.1772, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.224937538572504, + "kl": 0.201416015625, + "learning_rate": 4.14796320174778e-06, + "loss": -0.0439, + "num_tokens": 5162960.0, + "reward": 0.1461925357580185, + "reward_std": 0.23236336186528206, + "rewards/code_reward/mean": 0.1461925357580185, + "rewards/code_reward/std": 0.2323633674532175, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 155 }, { "clip_ratio": 0.0, - "completion_length": 477.296875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 587.0, + "completions/max_terminated_length": 587.0, + "completions/mean_length": 347.59375, + "completions/mean_terminated_length": 347.59375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, "epoch": 0.017461872114173778, - "grad_norm": 14.370055706061692, - "kl": 0.1593017578125, - "learning_rate": 4.1250067181038635e-06, - "loss": 0.2029, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.6766785249872507, + "kl": 0.1558837890625, + "learning_rate": 4.136514047602087e-06, + "loss": 0.0103, + "num_tokens": 5192755.0, + "reward": 0.0625, + "reward_std": 0.1157275140285492, + "rewards/code_reward/mean": 0.0625, + "rewards/code_reward/std": 0.1157275140285492, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 156 }, { "clip_ratio": 0.0, - "completion_length": 480.53125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.5, + "completions/max_terminated_length": 517.5, + "completions/mean_length": 301.0625, + "completions/mean_terminated_length": 301.0625, + "completions/min_length": 112.25, + "completions/min_terminated_length": 112.25, "epoch": 0.01757380719182874, - "grad_norm": 13.084686861766809, - "kl": 0.1204833984375, - "learning_rate": 4.113441696077608e-06, - "loss": 0.1918, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 1.2551889406028631, + "kl": 0.197998046875, + "learning_rate": 4.1250067181038635e-06, + "loss": -0.0209, + "num_tokens": 5216477.0, + "reward": 0.17783564236015081, + "reward_std": 0.24008767772465944, + "rewards/code_reward/mean": 0.17783564236015081, + "rewards/code_reward/std": 0.24008767493069172, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 157 }, { "clip_ratio": 0.0, - "completion_length": 379.359375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 232.6875, + "completions/mean_terminated_length": 232.6875, + "completions/min_length": 86.25, + "completions/min_terminated_length": 86.25, "epoch": 0.0176857422694837, - "grad_norm": 0.6843422318132463, - "kl": 0.07861328125, - "learning_rate": 4.101819466768484e-06, - "loss": 0.017, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.4488039878157586, + "kl": 0.1767578125, + "learning_rate": 4.113441696077608e-06, + "loss": -0.0524, + "num_tokens": 5237427.0, + "reward": 0.03743714070878923, + "reward_std": 0.10588822257705033, + "rewards/code_reward/mean": 0.03743714070878923, + "rewards/code_reward/std": 0.10588822374120355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 158 }, { "clip_ratio": 0.0, - "completion_length": 366.34375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.25, + "completions/max_terminated_length": 477.25, + "completions/mean_length": 279.34375, + "completions/mean_terminated_length": 279.34375, + "completions/min_length": 133.25, + "completions/min_terminated_length": 133.25, "epoch": 0.01779767734713866, - "grad_norm": 6.203392924309637, - "kl": 0.2252197265625, - "learning_rate": 4.0901405178219535e-06, - "loss": -0.0466, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 1.5189669762242, + "kl": 0.238037109375, + "learning_rate": 4.101819466768484e-06, + "loss": -0.1518, + "num_tokens": 5268406.0, + "reward": 0.08647377614397556, + "reward_std": 0.06177530816057697, + "rewards/code_reward/mean": 0.08647377614397556, + "rewards/code_reward/std": 0.06177531188586727, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 159 }, { "clip_ratio": 0.0, - "completion_length": 404.71875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.5, + "completions/max_terminated_length": 491.5, + "completions/mean_length": 260.4375, + "completions/mean_terminated_length": 260.4375, + "completions/min_length": 71.75, + "completions/min_terminated_length": 71.75, "epoch": 0.01790961242479362, - "grad_norm": 0.7482548079571155, - "kl": 0.15234375, - "learning_rate": 4.078405339263326e-06, - "loss": 0.0015, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.0554249659877584, + "kl": 0.147705078125, + "learning_rate": 4.0901405178219535e-06, + "loss": 0.0005, + "num_tokens": 5291300.0, + "reward": 0.04570723883807659, + "reward_std": 0.07690948667004704, + "rewards/code_reward/mean": 0.04570723883807659, + "rewards/code_reward/std": 0.07690948317758739, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 160 }, { "clip_ratio": 0.0, - "completion_length": 392.734375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 341.28125, + "completions/mean_terminated_length": 341.28125, + "completions/min_length": 155.75, + "completions/min_terminated_length": 155.75, "epoch": 0.018021547502448578, - "grad_norm": 0.821487756754832, - "kl": 0.095458984375, - "learning_rate": 4.06661442347719e-06, - "loss": 0.008, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.9717735557494784, + "kl": 0.212890625, + "learning_rate": 4.078405339263326e-06, + "loss": -0.0304, + "num_tokens": 5321093.0, + "reward": 0.053125000558793545, + "reward_std": 0.07709404267370701, + "rewards/code_reward/mean": 0.053125000558793545, + "rewards/code_reward/std": 0.07709404919296503, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 161 }, { "clip_ratio": 0.0, - "completion_length": 370.734375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.5, + "completions/max_terminated_length": 480.5, + "completions/mean_length": 237.9375, + "completions/mean_terminated_length": 237.9375, + "completions/min_length": 116.75, + "completions/min_terminated_length": 116.75, "epoch": 0.01813348258010354, - "grad_norm": 0.25693644980051783, - "kl": 0.1165771484375, - "learning_rate": 4.054768265186758e-06, - "loss": 0.0012, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.173075355333341, + "kl": 0.188232421875, + "learning_rate": 4.06661442347719e-06, + "loss": -0.0205, + "num_tokens": 5348659.0, + "reward": 0.2592630833387375, + "reward_std": 0.15858712047338486, + "rewards/code_reward/mean": 0.2592630833387375, + "rewards/code_reward/std": 0.15858712792396545, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 162 }, { "clip_ratio": 0.0, - "completion_length": 336.859375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.75, + "completions/max_terminated_length": 530.75, + "completions/mean_length": 297.1875, + "completions/mean_terminated_length": 297.1875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, "epoch": 0.0182454176577585, - "grad_norm": 0.3151740457382109, - "kl": 0.0853271484375, - "learning_rate": 4.0428673614331036e-06, - "loss": 0.0009, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.287995519073581, + "kl": 0.18896484375, + "learning_rate": 4.054768265186758e-06, + "loss": -0.0652, + "num_tokens": 5372217.0, + "reward": 0.33238982781767845, + "reward_std": 0.27272730600088835, + "rewards/code_reward/mean": 0.33238982781767845, + "rewards/code_reward/std": 0.2727273255586624, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 163 }, { "clip_ratio": 0.0, - "completion_length": 354.203125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.5, + "completions/max_terminated_length": 636.5, + "completions/mean_length": 292.53125, + "completions/mean_terminated_length": 292.53125, + "completions/min_length": 117.25, + "completions/min_terminated_length": 117.25, "epoch": 0.01835735273541346, - "grad_norm": 0.2872706706321094, - "kl": 0.090087890625, - "learning_rate": 4.030912211554316e-06, - "loss": 0.0009, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.1060543366676017, + "kl": 0.165283203125, + "learning_rate": 4.0428673614331036e-06, + "loss": 0.0064, + "num_tokens": 5397890.0, + "reward": 0.20836169831454754, + "reward_std": 0.17235604114830494, + "rewards/code_reward/mean": 0.20836169831454754, + "rewards/code_reward/std": 0.17235605791211128, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 164 }, { "clip_ratio": 0.0, - "completion_length": 337.90625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 814.5, + "completions/max_terminated_length": 814.5, + "completions/mean_length": 351.71875, + "completions/mean_terminated_length": 351.71875, + "completions/min_length": 145.75, + "completions/min_terminated_length": 145.75, "epoch": 0.01846928781306842, - "grad_norm": 0.11020779139062825, - "kl": 0.0782470703125, - "learning_rate": 4.018903317164539e-06, - "loss": 0.0008, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.7382591236061559, + "kl": 0.1884765625, + "learning_rate": 4.030912211554316e-06, + "loss": 0.0313, + "num_tokens": 5423913.0, + "reward": 0.13007790176197886, + "reward_std": 0.05658754054456949, + "rewards/code_reward/mean": 0.13007790176197886, + "rewards/code_reward/std": 0.056587545201182365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 165 }, { "clip_ratio": 0.0, - "completion_length": 346.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.25, + "completions/max_terminated_length": 539.25, + "completions/mean_length": 299.21875, + "completions/mean_terminated_length": 299.21875, + "completions/min_length": 131.25, + "completions/min_terminated_length": 131.25, "epoch": 0.018581222890723382, - "grad_norm": 0.045653419133126164, - "kl": 0.0740966796875, - "learning_rate": 4.006841182132932e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.373915385466877, + "kl": 0.193359375, + "learning_rate": 4.018903317164539e-06, + "loss": -0.1003, + "num_tokens": 5448488.0, + "reward": 0.08751785231288522, + "reward_std": 0.11654674645978957, + "rewards/code_reward/mean": 0.08751785231288522, + "rewards/code_reward/std": 0.11654674645978957, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 166 }, { "clip_ratio": 0.0, - "completion_length": 343.359375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.0, + "completions/max_terminated_length": 620.0, + "completions/mean_length": 359.65625, + "completions/mean_terminated_length": 359.65625, + "completions/min_length": 132.5, + "completions/min_terminated_length": 132.5, "epoch": 0.01869315796837834, - "grad_norm": 0.021075436862415513, - "kl": 0.06011962890625, - "learning_rate": 3.9947263125625195e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.0340989098627629, + "kl": 0.17822265625, + "learning_rate": 4.006841182132932e-06, + "loss": -0.0343, + "num_tokens": 5474285.0, + "reward": 0.1759367436170578, + "reward_std": 0.2240792140364647, + "rewards/code_reward/mean": 0.1759367436170578, + "rewards/code_reward/std": 0.224079217761755, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 167 }, { "clip_ratio": 0.0, - "completion_length": 263.21875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.75, + "completions/max_terminated_length": 461.75, + "completions/mean_length": 281.5625, + "completions/mean_terminated_length": 281.5625, + "completions/min_length": 102.25, + "completions/min_terminated_length": 102.25, "epoch": 0.0188050930460333, - "grad_norm": 0.04494777486555804, - "kl": 0.07147216796875, - "learning_rate": 3.982559216768967e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.4134315706278993, + "kl": 0.210205078125, + "learning_rate": 3.9947263125625195e-06, + "loss": 0.013, + "num_tokens": 5498599.0, + "reward": 0.3900106647051871, + "reward_std": 0.2551127364858985, + "rewards/code_reward/mean": 0.3900106647051871, + "rewards/code_reward/std": 0.25511275534518063, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 168 }, { "clip_ratio": 0.0, - "completion_length": 339.40625, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1069.5, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 497.46875, + "completions/mean_terminated_length": 441.75, + "completions/min_length": 222.75, + "completions/min_terminated_length": 222.75, "epoch": 0.01891702812368826, - "grad_norm": 0.018822857218736996, - "kl": 0.0582275390625, - "learning_rate": 3.970340405259245e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.0149191598734515, + "kl": 0.197998046875, + "learning_rate": 3.982559216768967e-06, + "loss": 0.0765, + "num_tokens": 5530310.0, + "reward": 0.1429782696068287, + "reward_std": 0.16231020726263523, + "rewards/code_reward/mean": 0.1429782696068287, + "rewards/code_reward/std": 0.16231020539999008, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 169 }, { "clip_ratio": 0.0, - "completion_length": 350.8125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.5, + "completions/max_terminated_length": 601.5, + "completions/mean_length": 391.875, + "completions/mean_terminated_length": 391.875, + "completions/min_length": 166.5, + "completions/min_terminated_length": 166.5, "epoch": 0.01902896320134322, - "grad_norm": 0.023628578386486077, - "kl": 0.07000732421875, - "learning_rate": 3.958070390710214e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.8227739015604961, + "kl": 0.18212890625, + "learning_rate": 3.970340405259245e-06, + "loss": 0.1136, + "num_tokens": 5562970.0, + "reward": 0.1685887835919857, + "reward_std": 0.17748497053980827, + "rewards/code_reward/mean": 0.1685887835919857, + "rewards/code_reward/std": 0.17748496308922768, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 170 }, { "clip_ratio": 0.0, - "completion_length": 367.921875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 595.25, + "completions/max_terminated_length": 595.25, + "completions/mean_length": 413.125, + "completions/mean_terminated_length": 413.125, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, "epoch": 0.019140898278998182, - "grad_norm": 0.023811978335883294, - "kl": 0.0609130859375, - "learning_rate": 3.945749687947109e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.0311180144750687, + "kl": 0.2353515625, + "learning_rate": 3.958070390710214e-06, + "loss": -0.0245, + "num_tokens": 5591150.0, + "reward": 0.1419127695262432, + "reward_std": 0.12009143829345703, + "rewards/code_reward/mean": 0.1419127695262432, + "rewards/code_reward/std": 0.12009144574403763, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 171 }, { "clip_ratio": 0.0, - "completion_length": 300.046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.0, + "completions/max_terminated_length": 715.0, + "completions/mean_length": 454.03125, + "completions/mean_terminated_length": 454.03125, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, "epoch": 0.01925283335665314, - "grad_norm": 0.8750934577120364, - "kl": 0.07550048828125, - "learning_rate": 3.933378813921942e-06, - "loss": 0.013, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.9357441509233236, + "kl": 0.19580078125, + "learning_rate": 3.945749687947109e-06, + "loss": -0.0136, + "num_tokens": 5620991.0, + "reward": 0.06574675627052784, + "reward_std": 0.07077404530718923, + "rewards/code_reward/mean": 0.06574675627052784, + "rewards/code_reward/std": 0.07077404530718923, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 172 }, { "clip_ratio": 0.0, - "completion_length": 335.546875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 678.75, + "completions/max_terminated_length": 678.75, + "completions/mean_length": 487.78125, + "completions/mean_terminated_length": 487.78125, + "completions/min_length": 288.25, + "completions/min_terminated_length": 288.25, "epoch": 0.0193647684343081, - "grad_norm": 0.6477727504447495, - "kl": 0.071044921875, - "learning_rate": 3.920958287691811e-06, - "loss": -0.0026, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.0872281099064747, + "kl": 0.210205078125, + "learning_rate": 3.933378813921942e-06, + "loss": -0.0373, + "num_tokens": 5656416.0, + "reward": 0.1226367698982358, + "reward_std": 0.20265722228214145, + "rewards/code_reward/mean": 0.1226367698982358, + "rewards/code_reward/std": 0.20265722228214145, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 173 }, { "clip_ratio": 0.0, - "completion_length": 268.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.75, + "completions/max_terminated_length": 523.75, + "completions/mean_length": 321.375, + "completions/mean_terminated_length": 321.375, + "completions/min_length": 95.75, + "completions/min_terminated_length": 95.75, "epoch": 0.01947670351196306, - "grad_norm": 27.871539574475218, - "kl": 0.44085693359375, - "learning_rate": 3.908488630397121e-06, - "loss": -0.0071, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 1.0504957433338074, + "kl": 0.21142578125, + "learning_rate": 3.920958287691811e-06, + "loss": -0.0152, + "num_tokens": 5680844.0, + "reward": 0.4488864839076996, + "reward_std": 0.3014371059834957, + "rewards/code_reward/mean": 0.4488864839076996, + "rewards/code_reward/std": 0.3014371246099472, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 174 }, { "clip_ratio": 0.0, - "completion_length": 328.78125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 430.875, + "completions/mean_terminated_length": 430.875, + "completions/min_length": 249.5, + "completions/min_terminated_length": 249.5, "epoch": 0.019588638589618023, - "grad_norm": 0.057425424775596354, - "kl": 0.06646728515625, - "learning_rate": 3.8959703652397175e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.7795786053104797, + "kl": 0.229736328125, + "learning_rate": 3.908488630397121e-06, + "loss": 0.0764, + "num_tokens": 5713200.0, + "reward": 0.04957035928964615, + "reward_std": 0.06563462410122156, + "rewards/code_reward/mean": 0.04957035928964615, + "rewards/code_reward/std": 0.06563462410122156, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 175 }, { "clip_ratio": 0.0, - "completion_length": 359.21875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.75, + "completions/max_terminated_length": 609.75, + "completions/mean_length": 444.75, + "completions/mean_terminated_length": 444.75, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, "epoch": 0.019700573667272982, - "grad_norm": 0.02368246574314423, - "kl": 0.055419921875, - "learning_rate": 3.883404017460935e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.8790829345446359, + "kl": 0.202392578125, + "learning_rate": 3.8959703652397175e-06, + "loss": 0.0125, + "num_tokens": 5742760.0, + "reward": 0.06789090437814593, + "reward_std": 0.10605220403522253, + "rewards/code_reward/mean": 0.06789090437814593, + "rewards/code_reward/std": 0.10605220403522253, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 176 }, { "clip_ratio": 0.0, - "completion_length": 320.234375, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 921.75, + "completions/max_terminated_length": 651.0, + "completions/mean_length": 502.5625, + "completions/mean_terminated_length": 457.6607208251953, + "completions/min_length": 233.75, + "completions/min_terminated_length": 233.75, "epoch": 0.019812508744927942, - "grad_norm": 0.03358527901376269, - "kl": 0.05792236328125, - "learning_rate": 3.870790114319559e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.6856544409285574, + "kl": 0.212890625, + "learning_rate": 3.883404017460935e-06, + "loss": 0.0149, + "num_tokens": 5776802.0, + "reward": 0.125, + "reward_std": 0.13363061845302582, + "rewards/code_reward/mean": 0.125, + "rewards/code_reward/std": 0.13363061845302582, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 177 }, { "clip_ratio": 0.0, - "completion_length": 249.78125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.25, + "completions/max_terminated_length": 573.25, + "completions/mean_length": 400.78125, + "completions/mean_terminated_length": 400.78125, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, "epoch": 0.0199244438225829, - "grad_norm": 0.5392466815020299, - "kl": 0.06982421875, - "learning_rate": 3.858129185069701e-06, - "loss": -0.0209, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.7758589030560867, + "kl": 0.256591796875, + "learning_rate": 3.870790114319559e-06, + "loss": -0.0786, + "num_tokens": 5804987.0, + "reward": 0.2419273192062974, + "reward_std": 0.05694087781012058, + "rewards/code_reward/mean": 0.2419273192062974, + "rewards/code_reward/std": 0.05694088339805603, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 178 }, { "clip_ratio": 0.0, - "completion_length": 322.09375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 721.25, + "completions/max_terminated_length": 721.25, + "completions/mean_length": 484.3125, + "completions/mean_terminated_length": 484.3125, + "completions/min_length": 227.75, + "completions/min_terminated_length": 227.75, "epoch": 0.02003637890023786, - "grad_norm": 0.17046312215041995, - "kl": 0.08056640625, - "learning_rate": 3.845421760938597e-06, - "loss": 0.0008, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.8831663988161996, + "kl": 0.20166015625, + "learning_rate": 3.858129185069701e-06, + "loss": -0.0142, + "num_tokens": 5838165.0, + "reward": 0.15460877772420645, + "reward_std": 0.1456797532737255, + "rewards/code_reward/mean": 0.15460877772420645, + "rewards/code_reward/std": 0.1456797607243061, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 179 }, { "clip_ratio": 0.0, - "completion_length": 343.234375, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 943.5, + "completions/max_terminated_length": 603.5, + "completions/mean_length": 490.1875, + "completions/mean_terminated_length": 443.3526916503906, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, "epoch": 0.020148313977892823, - "grad_norm": 0.05778936711697761, - "kl": 0.05181884765625, - "learning_rate": 3.832668375104312e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.051749696084055, + "kl": 0.194091796875, + "learning_rate": 3.845421760938597e-06, + "loss": 0.0364, + "num_tokens": 5868107.0, + "reward": 0.052815594244748354, + "reward_std": 0.11768656317144632, + "rewards/code_reward/mean": 0.052815594244748354, + "rewards/code_reward/std": 0.11768656317144632, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 180 }, { "clip_ratio": 0.0, - "completion_length": 328.75, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1064.75, + "completions/max_terminated_length": 742.5, + "completions/mean_length": 527.46875, + "completions/mean_terminated_length": 478.9464340209961, + "completions/min_length": 230.5, + "completions/min_terminated_length": 230.5, "epoch": 0.020260249055547783, - "grad_norm": 0.9368586862857481, - "kl": 0.0615234375, - "learning_rate": 3.8198695626733725e-06, - "loss": -0.0006, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.9254371836384011, + "kl": 0.224365234375, + "learning_rate": 3.832668375104312e-06, + "loss": 0.0786, + "num_tokens": 5900842.0, + "reward": 0.11509167775511742, + "reward_std": 0.2528133289888501, + "rewards/code_reward/mean": 0.11509167775511742, + "rewards/code_reward/std": 0.2528133289888501, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 181 }, { "clip_ratio": 0.0, - "completion_length": 303.6875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.75, + "completions/max_terminated_length": 607.75, + "completions/mean_length": 416.75, + "completions/mean_terminated_length": 416.75, + "completions/min_length": 192.75, + "completions/min_terminated_length": 192.75, "epoch": 0.020372184133202742, - "grad_norm": 0.1463135701740714, - "kl": 0.05364990234375, - "learning_rate": 3.8070258606583156e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.1539914541007663, + "kl": 0.260009765625, + "learning_rate": 3.8198695626733725e-06, + "loss": -0.0358, + "num_tokens": 5926258.0, + "reward": 0.2809056378901005, + "reward_std": 0.25853854790329933, + "rewards/code_reward/mean": 0.2809056378901005, + "rewards/code_reward/std": 0.2585385534912348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 182 }, { "clip_ratio": 0.0, - "completion_length": 268.671875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 621.0, + "completions/max_terminated_length": 621.0, + "completions/mean_length": 439.6875, + "completions/mean_terminated_length": 439.6875, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, "epoch": 0.0204841192108577, - "grad_norm": 24.196429667374314, - "kl": 0.20501708984375, - "learning_rate": 3.7941378079551544e-06, - "loss": 0.0021, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.7671721848701089, + "kl": 0.207763671875, + "learning_rate": 3.8070258606583156e-06, + "loss": -0.0237, + "num_tokens": 5955016.0, + "reward": 0.118256576359272, + "reward_std": 0.10753975436091423, + "rewards/code_reward/mean": 0.118256576359272, + "rewards/code_reward/std": 0.10753976181149483, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 183 }, { "clip_ratio": 0.0, - "completion_length": 350.625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 727.75, + "completions/max_terminated_length": 727.75, + "completions/mean_length": 514.6875, + "completions/mean_terminated_length": 514.6875, + "completions/min_length": 295.75, + "completions/min_terminated_length": 295.75, "epoch": 0.020596054288512664, - "grad_norm": 0.7575728537579188, - "kl": 0.067138671875, - "learning_rate": 3.7812059453207677e-06, - "loss": -0.0088, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 0.9937661413851208, + "kl": 0.22119140625, + "learning_rate": 3.7941378079551544e-06, + "loss": -0.0669, + "num_tokens": 5988158.0, + "reward": 0.017067496781237423, + "reward_std": 0.02749600470997393, + "rewards/code_reward/mean": 0.017067496781237423, + "rewards/code_reward/std": 0.027496004942804575, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 184 }, { "clip_ratio": 0.0, - "completion_length": 381.71875, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1087.5, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 524.875, + "completions/mean_terminated_length": 471.5446472167969, + "completions/min_length": 291.25, + "completions/min_terminated_length": 291.25, "epoch": 0.020707989366167624, - "grad_norm": 1.5781775679619348, - "kl": 0.129150390625, - "learning_rate": 3.768230815350213e-06, - "loss": -0.0091, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.6638578813804005, + "kl": 0.237060546875, + "learning_rate": 3.7812059453207677e-06, + "loss": 0.1742, + "num_tokens": 6023682.0, + "reward": 0.1103343702852726, + "reward_std": 0.11875982582569122, + "rewards/code_reward/mean": 0.1103343702852726, + "rewards/code_reward/std": 0.11875982582569122, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 185 }, { "clip_ratio": 0.0, - "completion_length": 384.6875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 968.75, + "completions/max_terminated_length": 968.75, + "completions/mean_length": 510.34375, + "completions/mean_terminated_length": 510.34375, + "completions/min_length": 203.5, + "completions/min_terminated_length": 203.5, "epoch": 0.020819924443822583, - "grad_norm": 3.7258182746280184, - "kl": 0.70550537109375, - "learning_rate": 3.7552129624539557e-06, - "loss": 0.2283, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.111764046709291, + "kl": 0.22021484375, + "learning_rate": 3.768230815350213e-06, + "loss": -0.2216, + "num_tokens": 6058277.0, + "reward": 0.08250047732144594, + "reward_std": 0.18678564205765724, + "rewards/code_reward/mean": 0.08250047732144594, + "rewards/code_reward/std": 0.18678564997389913, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 186 }, { "clip_ratio": 0.0, - "completion_length": 449.1875, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1082.75, + "completions/max_terminated_length": 816.5, + "completions/mean_length": 579.78125, + "completions/mean_terminated_length": 536.25, + "completions/min_length": 290.75, + "completions/min_terminated_length": 290.75, "epoch": 0.020931859521477542, - "grad_norm": 23.600980161646945, - "kl": 2.8865966796875, - "learning_rate": 3.7421529328350316e-06, - "loss": 0.2557, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.6213788285878431, + "kl": 0.198974609375, + "learning_rate": 3.7552129624539557e-06, + "loss": 0.0099, + "num_tokens": 6096662.0, + "reward": 0.15393732488155365, + "reward_std": 0.158139206469059, + "rewards/code_reward/mean": 0.15393732488155365, + "rewards/code_reward/std": 0.158139206469059, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 187 }, { "clip_ratio": 0.0, - "completion_length": 352.265625, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1410.25, + "completions/max_terminated_length": 706.5, + "completions/mean_length": 560.1875, + "completions/mean_terminated_length": 458.20983123779297, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, "epoch": 0.0210437945991325, - "grad_norm": 36.37207086101098, - "kl": 1.830078125, - "learning_rate": 3.7290512744661274e-06, - "loss": 0.3201, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.7369665680166173, + "kl": 0.207275390625, + "learning_rate": 3.7421529328350316e-06, + "loss": 0.0253, + "num_tokens": 6130348.0, + "reward": 0.016329039994161576, + "reward_std": 0.015793586208019406, + "rewards/code_reward/mean": 0.016329039994161576, + "rewards/code_reward/std": 0.01579358527669683, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 188 }, { "clip_ratio": 0.0, - "completion_length": 396.90625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 808.75, + "completions/max_terminated_length": 808.75, + "completions/mean_length": 575.625, + "completions/mean_terminated_length": 575.625, + "completions/min_length": 396.0, + "completions/min_terminated_length": 396.0, "epoch": 0.021155729676787464, - "grad_norm": 20.154618516530974, - "kl": 0.8580322265625, - "learning_rate": 3.715908537066589e-06, - "loss": 0.236, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.8246271999593108, + "kl": 0.204345703125, + "learning_rate": 3.7290512744661274e-06, + "loss": 0.0457, + "num_tokens": 6171304.0, + "reward": 0.07762476638890803, + "reward_std": 0.16000637132674456, + "rewards/code_reward/mean": 0.07762476638890803, + "rewards/code_reward/std": 0.16000637412071228, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 189 }, { "clip_ratio": 0.0, - "completion_length": 371.265625, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1693.5, + "completions/max_terminated_length": 887.75, + "completions/mean_length": 880.3125, + "completions/mean_terminated_length": 621.9541931152344, + "completions/min_length": 358.75, + "completions/min_terminated_length": 358.75, "epoch": 0.021267664754442424, - "grad_norm": 1.0506630322756636, - "kl": 0.18212890625, - "learning_rate": 3.7027252720793538e-06, - "loss": -0.0025, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.7247939676119982, + "kl": 0.166748046875, + "learning_rate": 3.715908537066589e-06, + "loss": -0.0747, + "num_tokens": 6218770.0, + "reward": 0.18021205358672887, + "reward_std": 0.20904676476493478, + "rewards/code_reward/mean": 0.18021205358672887, + "rewards/code_reward/std": 0.2090467723319307, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 190 }, { "clip_ratio": 0.0, - "completion_length": 290.59375, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1439.5, + "completions/max_terminated_length": 634.25, + "completions/mean_length": 787.9375, + "completions/mean_terminated_length": 455.0029830932617, + "completions/min_length": 289.5, + "completions/min_terminated_length": 289.5, "epoch": 0.021379599832097383, - "grad_norm": 0.10894834856423846, - "kl": 0.0850830078125, - "learning_rate": 3.689502032647817e-06, - "loss": 0.0009, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.5466790904917126, + "kl": 0.152587890625, + "learning_rate": 3.7027252720793538e-06, + "loss": 0.1295, + "num_tokens": 6262056.0, + "reward": 0.193359375, + "reward_std": 0.15328529477119446, + "rewards/code_reward/mean": 0.193359375, + "rewards/code_reward/std": 0.15328530967235565, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 191 }, { "clip_ratio": 0.0, - "completion_length": 406.421875, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1206.75, + "completions/max_terminated_length": 850.0, + "completions/mean_length": 588.03125, + "completions/mean_terminated_length": 537.1607208251953, + "completions/min_length": 300.25, + "completions/min_terminated_length": 300.25, "epoch": 0.021491534909752343, - "grad_norm": 0.070012885417872, - "kl": 0.0955810546875, - "learning_rate": 3.6762393735926245e-06, - "loss": 0.001, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.8256952276538428, + "kl": 0.239501953125, + "learning_rate": 3.689502032647817e-06, + "loss": -0.1993, + "num_tokens": 6310129.0, + "reward": 0.11067206133157015, + "reward_std": 0.11477606277912855, + "rewards/code_reward/mean": 0.11067206133157015, + "rewards/code_reward/std": 0.11477606697008014, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 192 }, { "clip_ratio": 0.0, - "completion_length": 288.953125, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1253.5, + "completions/max_terminated_length": 952.0, + "completions/mean_length": 544.9375, + "completions/mean_terminated_length": 496.7276840209961, + "completions/min_length": 312.75, + "completions/min_terminated_length": 312.75, "epoch": 0.021603469987407305, - "grad_norm": 0.055813531691409256, - "kl": 0.068603515625, - "learning_rate": 3.6629378513883852e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.9219945943560189, + "kl": 0.228759765625, + "learning_rate": 3.6762393735926245e-06, + "loss": 0.0478, + "num_tokens": 6343727.0, + "reward": 0.08743459376273677, + "reward_std": 0.060669250786304474, + "rewards/code_reward/mean": 0.08743459376273677, + "rewards/code_reward/std": 0.06066925637423992, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 193 }, { "clip_ratio": 0.0, - "completion_length": 279.171875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.75, + "completions/max_terminated_length": 747.75, + "completions/mean_length": 467.28125, + "completions/mean_terminated_length": 467.28125, + "completions/min_length": 274.25, + "completions/min_terminated_length": 274.25, "epoch": 0.021715405065062265, - "grad_norm": 52.57320558278293, - "kl": 0.1983642578125, - "learning_rate": 3.6495980241403307e-06, - "loss": 0.3357, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.087269428253631, + "kl": 0.224853515625, + "learning_rate": 3.6629378513883852e-06, + "loss": -0.0435, + "num_tokens": 6369656.0, + "reward": 0.01468671576003544, + "reward_std": 0.0163727342733182, + "rewards/code_reward/mean": 0.01468671576003544, + "rewards/code_reward/std": 0.0163727342733182, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 194 }, { "clip_ratio": 0.0, - "completion_length": 291.78125, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1376.0, + "completions/max_terminated_length": 711.5, + "completions/mean_length": 528.34375, + "completions/mean_terminated_length": 428.1741180419922, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, "epoch": 0.021827340142717224, - "grad_norm": 0.04396489484628073, - "kl": 0.0579833984375, - "learning_rate": 3.636220451560896e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.877705585064893, + "kl": 0.191162109375, + "learning_rate": 3.6495980241403307e-06, + "loss": -0.0435, + "num_tokens": 6402635.0, + "reward": 0.28124301601201296, + "reward_std": 0.1287369872443378, + "rewards/code_reward/mean": 0.28124301601201296, + "rewards/code_reward/std": 0.1287369979545474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 195 }, { "clip_ratio": 0.0, - "completion_length": 390.78125, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1505.75, + "completions/max_terminated_length": 884.0, + "completions/mean_length": 629.875, + "completions/mean_terminated_length": 531.4151840209961, + "completions/min_length": 338.5, + "completions/min_terminated_length": 338.5, "epoch": 0.021939275220372183, - "grad_norm": 1.0092735457153308, - "kl": 0.08642578125, - "learning_rate": 3.622805694946235e-06, - "loss": 0.0163, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.7532827929414431, + "kl": 0.193115234375, + "learning_rate": 3.636220451560896e-06, + "loss": 0.067, + "num_tokens": 6441607.0, + "reward": 0.07744654751149938, + "reward_std": 0.049496792489662766, + "rewards/code_reward/mean": 0.07744654751149938, + "rewards/code_reward/std": 0.04949679644778371, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 196 }, { "clip_ratio": 0.0, - "completion_length": 307.9375, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 1397.75, + "completions/max_terminated_length": 630.75, + "completions/mean_length": 608.0, + "completions/mean_terminated_length": 451.27679443359375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, "epoch": 0.022051210298027143, - "grad_norm": 0.11501833373142471, - "kl": 0.07763671875, - "learning_rate": 3.609354317152667e-06, - "loss": 0.0008, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.8485874245870415, + "kl": 0.191162109375, + "learning_rate": 3.622805694946235e-06, + "loss": -0.1349, + "num_tokens": 6479383.0, + "reward": 0.26938944309949875, + "reward_std": 0.26605916023254395, + "rewards/code_reward/mean": 0.26938944309949875, + "rewards/code_reward/std": 0.26605917513370514, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 197 }, { "clip_ratio": 0.0, - "completion_length": 354.546875, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1237.75, + "completions/max_terminated_length": 865.5, + "completions/mean_length": 552.3125, + "completions/mean_terminated_length": 501.0982208251953, + "completions/min_length": 266.25, + "completions/min_terminated_length": 266.25, "epoch": 0.022163145375682106, - "grad_norm": 0.24707486499522355, - "kl": 0.0819091796875, - "learning_rate": 3.595866882573063e-06, - "loss": 0.0008, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.824562163366152, + "kl": 0.19140625, + "learning_rate": 3.609354317152667e-06, + "loss": -0.025, + "num_tokens": 6511257.0, + "reward": 0.05427030206192285, + "reward_std": 0.047577258897945285, + "rewards/code_reward/mean": 0.05427030206192285, + "rewards/code_reward/std": 0.04757725913077593, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 198 }, { "clip_ratio": 0.0, - "completion_length": 307.640625, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1021.5, + "completions/max_terminated_length": 694.75, + "completions/mean_length": 498.78125, + "completions/mean_terminated_length": 447.3214340209961, + "completions/min_length": 226.5, + "completions/min_terminated_length": 226.5, "epoch": 0.022275080453337065, - "grad_norm": 27.72743469845055, - "kl": 0.6578369140625, - "learning_rate": 3.5823439571131675e-06, - "loss": 0.0387, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 1.0082207311300992, + "kl": 0.229736328125, + "learning_rate": 3.595866882573063e-06, + "loss": 0.0149, + "num_tokens": 6541178.0, + "reward": 0.2747242748737335, + "reward_std": 0.2067141029983759, + "rewards/code_reward/mean": 0.2747242748737335, + "rewards/code_reward/std": 0.20671410486102104, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 199 }, { "clip_ratio": 0.0, - "completion_length": 297.953125, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 980.0, + "completions/max_terminated_length": 573.25, + "completions/mean_length": 469.53125, + "completions/mean_terminated_length": 416.4330368041992, + "completions/min_length": 246.5, + "completions/min_terminated_length": 246.5, "epoch": 0.022387015530992024, - "grad_norm": 4.257281648957348, - "kl": 0.2021484375, - "learning_rate": 3.5687861081678477e-06, - "loss": 0.002, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.9952889000470718, + "kl": 0.216064453125, + "learning_rate": 3.5823439571131675e-06, + "loss": -0.1906, + "num_tokens": 6570243.0, + "reward": 0.1142054102383554, + "reward_std": 0.18550929613411427, + "rewards/code_reward/mean": 0.1142054102383554, + "rewards/code_reward/std": 0.18550931010395288, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 200 }, { "clip_ratio": 0.0, - "completion_length": 406.703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.25, + "completions/max_terminated_length": 764.25, + "completions/mean_length": 500.96875, + "completions/mean_terminated_length": 500.96875, + "completions/min_length": 323.5, + "completions/min_terminated_length": 323.5, "epoch": 0.022498950608646984, - "grad_norm": 1.991901860771057, - "kl": 0.127197265625, - "learning_rate": 3.555193904597291e-06, - "loss": -0.0177, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.9455318918616956, + "kl": 0.227294921875, + "learning_rate": 3.5687861081678477e-06, + "loss": 0.031, + "num_tokens": 6603946.0, + "reward": 0.16931893583387136, + "reward_std": 0.20531310141086578, + "rewards/code_reward/mean": 0.16931893583387136, + "rewards/code_reward/std": 0.20531310513615608, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 201 }, { "clip_ratio": 0.0, - "completion_length": 360.75, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1030.5, + "completions/max_terminated_length": 688.5, + "completions/mean_length": 418.1875, + "completions/mean_terminated_length": 367.9821472167969, + "completions/min_length": 194.75, + "completions/min_terminated_length": 194.75, "epoch": 0.022610885686301947, - "grad_norm": 22.923764220941024, - "kl": 0.1807861328125, - "learning_rate": 3.541567916703138e-06, - "loss": 0.1058, - "reward": 0.09687500260770321, - "reward_std": 0.008539125323295593, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 1.0325105547845679, + "kl": 0.2138671875, + "learning_rate": 3.555193904597291e-06, + "loss": 0.0613, + "num_tokens": 6636552.0, + "reward": 0.3791414946317673, + "reward_std": 0.17875608056783676, + "rewards/code_reward/mean": 0.3791414946317673, + "rewards/code_reward/std": 0.17875608801841736, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 202 }, { "clip_ratio": 0.0, - "completion_length": 361.609375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 754.25, + "completions/max_terminated_length": 754.25, + "completions/mean_length": 455.34375, + "completions/mean_terminated_length": 455.34375, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, "epoch": 0.022722820763956906, - "grad_norm": 148.13942760303817, - "kl": 10.0704345703125, - "learning_rate": 3.5279087162045517e-06, - "loss": 0.3985, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 0.9727989512601006, + "kl": 0.22119140625, + "learning_rate": 3.541567916703138e-06, + "loss": 0.0159, + "num_tokens": 6668595.0, + "reward": 0.19191165082156658, + "reward_std": 0.15741402097046375, + "rewards/code_reward/mean": 0.19191165082156658, + "rewards/code_reward/std": 0.15741403214633465, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 203 }, { "clip_ratio": 0.0, - "completion_length": 312.625, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 989.25, + "completions/max_terminated_length": 615.5, + "completions/mean_length": 476.40625, + "completions/mean_terminated_length": 425.71875762939453, + "completions/min_length": 255.25, + "completions/min_terminated_length": 255.25, "epoch": 0.022834755841611865, - "grad_norm": 5.396743744887109, - "kl": 0.3883056640625, - "learning_rate": 3.5142168762142265e-06, - "loss": -0.0111, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.7440983993424026, + "kl": 0.203857421875, + "learning_rate": 3.5279087162045517e-06, + "loss": 0.0571, + "num_tokens": 6702376.0, + "reward": 0.11487132962793112, + "reward_std": 0.1093948557972908, + "rewards/code_reward/mean": 0.11487132962793112, + "rewards/code_reward/std": 0.1093948557972908, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 204 }, { "clip_ratio": 0.0, - "completion_length": 409.84375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 861.25, + "completions/max_terminated_length": 861.25, + "completions/mean_length": 514.59375, + "completions/mean_terminated_length": 514.59375, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, "epoch": 0.022946690919266825, - "grad_norm": 2770.9102363815387, - "kl": 340.0640869140625, - "learning_rate": 3.500492971214347e-06, - "loss": 3.6234, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.9601225973651024, + "kl": 0.18505859375, + "learning_rate": 3.5142168762142265e-06, + "loss": 0.0168, + "num_tokens": 6739939.0, + "reward": 0.07519801473245025, + "reward_std": 0.09981238306500018, + "rewards/code_reward/mean": 0.07519801473245025, + "rewards/code_reward/std": 0.09981238329783082, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 205 }, { "clip_ratio": 0.0, - "completion_length": 254.1875, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 914.25, + "completions/max_terminated_length": 670.75, + "completions/mean_length": 427.03125, + "completions/mean_terminated_length": 379.75, + "completions/min_length": 184.25, + "completions/min_terminated_length": 184.25, "epoch": 0.023058625996921784, - "grad_norm": 11.015389916640636, - "kl": 1.8922119140625, - "learning_rate": 3.48673757703248e-06, - "loss": -0.0028, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 0.8984794673889067, + "kl": 0.207763671875, + "learning_rate": 3.500492971214347e-06, + "loss": 0.1382, + "num_tokens": 6769180.0, + "reward": 0.22681757621467113, + "reward_std": 0.20832497254014015, + "rewards/code_reward/mean": 0.22681757621467113, + "rewards/code_reward/std": 0.20832498744130135, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 206 }, { "clip_ratio": 0.0, - "completion_length": 350.78125, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1280.25, + "completions/max_terminated_length": 1128.0, + "completions/mean_length": 650.65625, + "completions/mean_terminated_length": 608.4821472167969, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, "epoch": 0.023170561074576747, - "grad_norm": 1.8419760739328712, - "kl": 0.1317138671875, - "learning_rate": 3.472951270817418e-06, - "loss": -0.0237, - "reward": 0.09062500298023224, - "reward_std": 0.024866947438567877, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.90625, + "grad_norm": 0.7922918264942025, + "kl": 0.1728515625, + "learning_rate": 3.48673757703248e-06, + "loss": -0.0944, + "num_tokens": 6805289.0, + "reward": 0.14009581343270838, + "reward_std": 0.11429419624619186, + "rewards/code_reward/mean": 0.14009581343270838, + "rewards/code_reward/std": 0.11429420742206275, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 207 }, { "clip_ratio": 0.0, - "completion_length": 384.078125, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1129.5, + "completions/max_terminated_length": 776.25, + "completions/mean_length": 539.34375, + "completions/mean_terminated_length": 491.6026916503906, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, "epoch": 0.023282496152231706, - "grad_norm": 1.681815519531453, - "kl": 0.16943359375, - "learning_rate": 3.4591346310149578e-06, - "loss": -0.0704, - "reward": 0.08906250260770321, - "reward_std": 0.025969466660171747, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.890625, + "grad_norm": 0.9119324704049495, + "kl": 0.1783447265625, + "learning_rate": 3.472951270817418e-06, + "loss": -0.064, + "num_tokens": 6837436.0, + "reward": 0.05461701576132327, + "reward_std": 0.09534355666255578, + "rewards/code_reward/mean": 0.05461701576132327, + "rewards/code_reward/std": 0.09534356038784608, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 208 }, { "clip_ratio": 0.0, - "completion_length": 265.984375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 903.25, + "completions/max_terminated_length": 903.25, + "completions/mean_length": 384.75, + "completions/mean_terminated_length": 384.75, + "completions/min_length": 159.75, + "completions/min_terminated_length": 159.75, "epoch": 0.023394431229886666, - "grad_norm": 4.149140370325476, - "kl": 0.343994140625, - "learning_rate": 3.445288237343632e-06, - "loss": 0.0316, - "reward": 0.08906250260770321, - "reward_std": 0.031116947531700134, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.890625, + "grad_norm": 1.2240979756453376, + "kl": 0.183349609375, + "learning_rate": 3.4591346310149578e-06, + "loss": 0.0503, + "num_tokens": 6864492.0, + "reward": 0.4224591121310368, + "reward_std": 0.2887880225898698, + "rewards/code_reward/mean": 0.4224591121310368, + "rewards/code_reward/std": 0.2887880523921922, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 209 }, { "clip_ratio": 0.0, - "completion_length": 323.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 767.75, + "completions/max_terminated_length": 767.75, + "completions/mean_length": 399.28125, + "completions/mean_terminated_length": 399.28125, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, "epoch": 0.023506366307541625, - "grad_norm": 7.97862155393176, - "kl": 0.59326171875, - "learning_rate": 3.4314126707703895e-06, - "loss": -0.0088, - "reward": 0.0937500037252903, - "reward_std": 0.017078250646591187, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.9375, + "grad_norm": 1.1994406910258044, + "kl": 0.204833984375, + "learning_rate": 3.445288237343632e-06, + "loss": -0.0509, + "num_tokens": 6891213.0, + "reward": 0.09320073015987873, + "reward_std": 0.12864024192094803, + "rewards/code_reward/mean": 0.09320073015987873, + "rewards/code_reward/std": 0.12864025123417377, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 210 }, { "clip_ratio": 0.0, - "completion_length": 410.84375, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1011.25, + "completions/max_terminated_length": 669.75, + "completions/mean_length": 461.25, + "completions/mean_terminated_length": 410.2276840209961, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, "epoch": 0.023618301385196588, - "grad_norm": 37.00032182098118, - "kl": 3.810546875, - "learning_rate": 3.4175085134862128e-06, - "loss": 0.1349, - "reward": 0.08281250484287739, - "reward_std": 0.03833641391247511, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.828125, + "grad_norm": 1.1498675164259378, + "kl": 0.2158203125, + "learning_rate": 3.4314126707703895e-06, + "loss": 0.0824, + "num_tokens": 6919749.0, + "reward": 0.2663097037002444, + "reward_std": 0.21830029226839542, + "rewards/code_reward/mean": 0.2663097037002444, + "rewards/code_reward/std": 0.21830029599368572, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 211 }, { "clip_ratio": 0.0, - "completion_length": 348.96875, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 868.25, + "completions/max_terminated_length": 621.75, + "completions/mean_length": 385.84375, + "completions/mean_terminated_length": 338.625, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, "epoch": 0.023730236462851547, - "grad_norm": 78.79622548097791, - "kl": 3.765380859375, - "learning_rate": 3.4035763488816953e-06, - "loss": 0.2076, - "reward": 0.09375000186264515, - "reward_std": 0.021039125509560108, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.9375, + "grad_norm": 1.220829227031836, + "kl": 0.20703125, + "learning_rate": 3.4175085134862128e-06, + "loss": 0.1624, + "num_tokens": 6948192.0, + "reward": 0.27685857750475407, + "reward_std": 0.24184924457222223, + "rewards/code_reward/mean": 0.27685857750475407, + "rewards/code_reward/std": 0.24184925481677055, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 212 }, { "clip_ratio": 0.0, - "completion_length": 321.921875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.25, + "completions/max_terminated_length": 540.25, + "completions/mean_length": 334.1875, + "completions/mean_terminated_length": 334.1875, + "completions/min_length": 196.25, + "completions/min_terminated_length": 196.25, "epoch": 0.023842171540506506, - "grad_norm": 10.816304901178587, - "kl": 3.8818359375, - "learning_rate": 3.3896167615225594e-06, - "loss": 0.1445, - "reward": 0.08906250260770321, - "reward_std": 0.025969465728849173, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.890625, + "grad_norm": 1.0550764471568521, + "kl": 0.224365234375, + "learning_rate": 3.4035763488816953e-06, + "loss": 0.1182, + "num_tokens": 6973222.0, + "reward": 0.5495182275772095, + "reward_std": 0.17330202460289001, + "rewards/code_reward/mean": 0.5495182275772095, + "rewards/code_reward/std": 0.17330202646553516, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 213 }, { "clip_ratio": 0.0, - "completion_length": 375.6875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.0, + "completions/max_terminated_length": 656.0, + "completions/mean_length": 360.46875, + "completions/mean_terminated_length": 360.46875, + "completions/min_length": 144.75, + "completions/min_terminated_length": 144.75, "epoch": 0.023954106618161466, - "grad_norm": 13.522641035708572, - "kl": 4.187744140625, - "learning_rate": 3.375630337125133e-06, - "loss": 0.0886, - "reward": 0.09062500111758709, - "reward_std": 0.020155644044280052, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.90625, + "grad_norm": 1.0108308143941853, + "kl": 0.258544921875, + "learning_rate": 3.3896167615225594e-06, + "loss": 0.0636, + "num_tokens": 6998765.0, + "reward": 0.13169488031417131, + "reward_std": 0.07389534078538418, + "rewards/code_reward/mean": 0.13169488031417131, + "rewards/code_reward/std": 0.07389534404501319, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 214 }, { "clip_ratio": 0.0, - "completion_length": 292.984375, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1026.5, + "completions/max_terminated_length": 619.5, + "completions/mean_length": 401.0, + "completions/mean_terminated_length": 346.1383972167969, + "completions/min_length": 137.25, + "completions/min_terminated_length": 137.25, "epoch": 0.024066041695816425, - "grad_norm": 9.482318398692025, - "kl": 5.662109375, - "learning_rate": 3.361617662531772e-06, - "loss": 0.135, - "reward": 0.09062500298023224, - "reward_std": 0.0295782508328557, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.90625, + "grad_norm": 0.750694546999131, + "kl": 0.19580078125, + "learning_rate": 3.375630337125133e-06, + "loss": 0.1223, + "num_tokens": 7028501.0, + "reward": 0.07791783940047026, + "reward_std": 0.08552672585938126, + "rewards/code_reward/mean": 0.07791783940047026, + "rewards/code_reward/std": 0.0855267186416313, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 215 }, { "clip_ratio": 0.0, - "completion_length": 330.234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 329.9375, + "completions/mean_terminated_length": 329.9375, + "completions/min_length": 163.75, + "completions/min_terminated_length": 163.75, "epoch": 0.024177976773471388, - "grad_norm": 9.298647150834725, - "kl": 1.287109375, - "learning_rate": 3.347579325686237e-06, - "loss": -0.0025, - "reward": 0.09218750335276127, - "reward_std": 0.01861694734543562, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.921875, + "grad_norm": 0.9861460225219651, + "kl": 0.20947265625, + "learning_rate": 3.361617662531772e-06, + "loss": 0.0307, + "num_tokens": 7059667.0, + "reward": 0.28345959074795246, + "reward_std": 0.10491538979113102, + "rewards/code_reward/mean": 0.28345959074795246, + "rewards/code_reward/std": 0.10491538792848587, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 216 }, { "clip_ratio": 0.0, - "completion_length": 321.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.5, + "completions/max_terminated_length": 487.5, + "completions/mean_length": 300.9375, + "completions/mean_terminated_length": 300.9375, + "completions/min_length": 150.5, + "completions/min_terminated_length": 150.5, "epoch": 0.024289911851126347, - "grad_norm": 9.33505130284713, - "kl": 2.80908203125, - "learning_rate": 3.333515915609027e-06, - "loss": 0.0542, - "reward": 0.09218750335276127, - "reward_std": 0.01861694734543562, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.921875, + "grad_norm": 1.2867180675052194, + "kl": 0.19677734375, + "learning_rate": 3.347579325686237e-06, + "loss": 0.0498, + "num_tokens": 7084721.0, + "reward": 0.38907771836966276, + "reward_std": 0.32206146977841854, + "rewards/code_reward/mean": 0.38907771836966276, + "rewards/code_reward/std": 0.3220614865422249, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 217 }, { "clip_ratio": 0.0, - "completion_length": 364.453125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.75, + "completions/max_terminated_length": 508.75, + "completions/mean_length": 332.0625, + "completions/mean_terminated_length": 332.0625, + "completions/min_length": 201.25, + "completions/min_terminated_length": 201.25, "epoch": 0.024401846928781307, - "grad_norm": 46.939949673345936, - "kl": 7.6962890625, - "learning_rate": 3.3194280223726616e-06, - "loss": 0.1244, - "reward": 0.09375000186264515, - "reward_std": 0.021039125509560108, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.9375, + "grad_norm": 1.221746654434671, + "kl": 0.192626953125, + "learning_rate": 3.333515915609027e-06, + "loss": -0.0326, + "num_tokens": 7112387.0, + "reward": 0.05860341805964708, + "reward_std": 0.07969626039266586, + "rewards/code_reward/mean": 0.05860341805964708, + "rewards/code_reward/std": 0.07969625853002071, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 218 }, { "clip_ratio": 0.0, - "completion_length": 389.140625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.25, + "completions/max_terminated_length": 567.25, + "completions/mean_length": 317.25, + "completions/mean_terminated_length": 317.25, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, "epoch": 0.024513782006436266, - "grad_norm": 8.965046605455813, - "kl": 1.474365234375, - "learning_rate": 3.305316237076927e-06, - "loss": 0.0928, - "reward": 0.09531250223517418, - "reward_std": 0.01478912541642785, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.953125, + "grad_norm": 1.0695893302551942, + "kl": 0.23388671875, + "learning_rate": 3.3194280223726616e-06, + "loss": 0.027, + "num_tokens": 7138323.0, + "reward": 0.14415738731622696, + "reward_std": 0.14080366492271423, + "rewards/code_reward/mean": 0.14415738731622696, + "rewards/code_reward/std": 0.14080367609858513, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 219 }, { "clip_ratio": 0.0, - "completion_length": 380.921875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.5, + "completions/max_terminated_length": 387.5, + "completions/mean_length": 239.6875, + "completions/mean_terminated_length": 239.6875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, "epoch": 0.024625717084091225, - "grad_norm": 30.26772960669773, - "kl": 3.83154296875, - "learning_rate": 3.291181151824071e-06, - "loss": 0.0081, - "reward": 0.09375000186264515, - "reward_std": 0.011180340312421322, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.9375, + "grad_norm": 1.763480550756643, + "kl": 0.20849609375, + "learning_rate": 3.305316237076927e-06, + "loss": -0.1439, + "num_tokens": 7159529.0, + "reward": 0.10015321767423302, + "reward_std": 0.17220470518805087, + "rewards/code_reward/mean": 0.10015321767423302, + "rewards/code_reward/std": 0.17220470635220408, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 220 }, { "clip_ratio": 0.0, - "completion_length": 431.640625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.5, + "completions/max_terminated_length": 487.5, + "completions/mean_length": 327.15625, + "completions/mean_terminated_length": 327.15625, + "completions/min_length": 177.5, + "completions/min_terminated_length": 177.5, "epoch": 0.024737652161746188, - "grad_norm": 9.50042126895199, - "kl": 0.977294921875, - "learning_rate": 3.27702335969396e-06, - "loss": -0.0179, - "reward": 0.09531250223517418, - "reward_std": 0.01478912541642785, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.953125, + "grad_norm": 0.8169319139286209, + "kl": 0.1607666015625, + "learning_rate": 3.291181151824071e-06, + "loss": 0.0895, + "num_tokens": 7191342.0, + "reward": 0.1822916641831398, + "reward_std": 0.2553221881389618, + "rewards/code_reward/mean": 0.1822916641831398, + "rewards/code_reward/std": 0.2553221881389618, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 221 }, { "clip_ratio": 0.0, - "completion_length": 363.90625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.25, + "completions/max_terminated_length": 436.25, + "completions/mean_length": 269.84375, + "completions/mean_terminated_length": 269.84375, + "completions/min_length": 153.25, + "completions/min_terminated_length": 153.25, "epoch": 0.024849587239401148, - "grad_norm": 8.200151360167519, - "kl": 0.697509765625, - "learning_rate": 3.2628434547191985e-06, - "loss": -0.037, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.1996415284846553, + "kl": 0.2158203125, + "learning_rate": 3.27702335969396e-06, + "loss": -0.0201, + "num_tokens": 7216737.0, + "reward": 0.10358373820781708, + "reward_std": 0.1254219285910949, + "rewards/code_reward/mean": 0.10358373820781708, + "rewards/code_reward/std": 0.1254219323163852, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 222 }, { "clip_ratio": 0.0, - "completion_length": 305.859375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.5, + "completions/max_terminated_length": 349.5, + "completions/mean_length": 217.03125, + "completions/mean_terminated_length": 217.03125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, "epoch": 0.024961522317056107, - "grad_norm": 18.01832349548423, - "kl": 0.634033203125, - "learning_rate": 3.2486420318601973e-06, - "loss": -0.0207, - "reward": 0.09218750335276127, - "reward_std": 0.01861694734543562, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.921875, + "grad_norm": 1.2608189223620252, + "kl": 0.2314453125, + "learning_rate": 3.2628434547191985e-06, + "loss": 0.0994, + "num_tokens": 7235498.0, + "reward": 0.14160977257415652, + "reward_std": 0.0918192695826292, + "rewards/code_reward/mean": 0.14160977257415652, + "rewards/code_reward/std": 0.0918192733079195, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 223 }, { "clip_ratio": 0.0, - "completion_length": 368.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 223.5625, + "completions/mean_terminated_length": 223.5625, + "completions/min_length": 149.25, + "completions/min_terminated_length": 149.25, "epoch": 0.025073457394711066, - "grad_norm": 99.84487944919411, - "kl": 4.774169921875, - "learning_rate": 3.2344196869802187e-06, - "loss": -0.0292, - "reward": 0.09218750149011612, - "reward_std": 0.02257782220840454, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.921875, + "grad_norm": 1.600431868786668, + "kl": 0.225341796875, + "learning_rate": 3.2486420318601973e-06, + "loss": 0.0364, + "num_tokens": 7262236.0, + "reward": 0.27536666474770755, + "reward_std": 0.14216232020407915, + "rewards/code_reward/mean": 0.27536666474770755, + "rewards/code_reward/std": 0.14216232066974044, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 224 }, { "clip_ratio": 0.0, - "completion_length": 553.921875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.75, + "completions/max_terminated_length": 559.75, + "completions/mean_length": 338.53125, + "completions/mean_terminated_length": 338.53125, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, "epoch": 0.02518539247236603, - "grad_norm": 4.540807623552518, - "kl": 0.2640380859375, - "learning_rate": 3.2201770168203694e-06, - "loss": 0.0409, - "reward": 0.09531250037252903, - "reward_std": 0.018750000279396772, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.953125, + "grad_norm": 0.8719466687856433, + "kl": 0.18359375, + "learning_rate": 3.2344196869802187e-06, + "loss": 0.0315, + "num_tokens": 7298189.0, + "reward": 0.015560166910290718, + "reward_std": 0.012789241969585419, + "rewards/code_reward/mean": 0.015560166910290718, + "rewards/code_reward/std": 0.012789241969585419, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 225 }, { "clip_ratio": 0.0, - "completion_length": 470.421875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.25, + "completions/max_terminated_length": 465.25, + "completions/mean_length": 276.65625, + "completions/mean_terminated_length": 276.65625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, "epoch": 0.02529732755002099, - "grad_norm": 1.9006028901018341, - "kl": 0.2882080078125, - "learning_rate": 3.205914618974563e-06, - "loss": -0.0101, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.0186655813271028, + "kl": 0.19287109375, + "learning_rate": 3.2201770168203694e-06, + "loss": 0.115, + "num_tokens": 7334746.0, + "reward": 0.10499188816174865, + "reward_std": 0.10288760857656598, + "rewards/code_reward/mean": 0.10499188816174865, + "rewards/code_reward/std": 0.10288760880939662, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 226 }, { "clip_ratio": 0.0, - "completion_length": 359.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.75, + "completions/max_terminated_length": 314.75, + "completions/mean_length": 205.90625, + "completions/mean_terminated_length": 205.90625, + "completions/min_length": 130.75, + "completions/min_terminated_length": 130.75, "epoch": 0.025409262627675948, - "grad_norm": 8.497076561424455, - "kl": 1.46826171875, - "learning_rate": 3.1916330918644496e-06, - "loss": 0.0076, - "reward": 0.0937500037252903, - "reward_std": 0.017078250646591187, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.9375, + "grad_norm": 0.7605637028441592, + "kl": 0.16796875, + "learning_rate": 3.205914618974563e-06, + "loss": 0.0079, + "num_tokens": 7353919.0, + "reward": 0.01245777029544115, + "reward_std": 0.01250904705375433, + "rewards/code_reward/mean": 0.01245777029544115, + "rewards/code_reward/std": 0.01250904705375433, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 227 }, { "clip_ratio": 0.0, - "completion_length": 398.203125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 191.46875, + "completions/mean_terminated_length": 191.46875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, "epoch": 0.025521197705330907, - "grad_norm": 6.49286525909613, - "kl": 0.5, - "learning_rate": 3.177333034714303e-06, - "loss": 0.0436, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 1.2858515605192504, + "kl": 0.259765625, + "learning_rate": 3.1916330918644496e-06, + "loss": 0.0768, + "num_tokens": 7377150.0, + "reward": 0.12344044167548418, + "reward_std": 0.12209718860685825, + "rewards/code_reward/mean": 0.12344044167548418, + "rewards/code_reward/std": 0.12209718953818083, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 228 }, { "clip_ratio": 0.0, - "completion_length": 340.09375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 226.6875, + "completions/mean_terminated_length": 226.6875, + "completions/min_length": 149.75, + "completions/min_terminated_length": 149.75, "epoch": 0.025633132782985867, - "grad_norm": 99.57124601550895, - "kl": 13.4130859375, - "learning_rate": 3.1630150475258813e-06, - "loss": 0.1554, - "reward": 0.09531250223517418, - "reward_std": 0.01478912541642785, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.953125, + "grad_norm": 0.9529571983949628, + "kl": 0.20458984375, + "learning_rate": 3.177333034714303e-06, + "loss": -0.0135, + "num_tokens": 7402444.0, + "reward": 0.06041666865348816, + "reward_std": 0.038540102541446686, + "rewards/code_reward/mean": 0.06041666865348816, + "rewards/code_reward/std": 0.038540102541446686, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 229 }, { "clip_ratio": 0.0, - "completion_length": 352.78125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.75, + "completions/max_terminated_length": 312.75, + "completions/mean_length": 147.1875, + "completions/mean_terminated_length": 147.1875, + "completions/min_length": 67.25, + "completions/min_terminated_length": 67.25, "epoch": 0.02574506786064083, - "grad_norm": 49.128639521298574, - "kl": 7.0673828125, - "learning_rate": 3.148679731053252e-06, - "loss": 0.0762, - "reward": 0.09218750335276127, - "reward_std": 0.023328250739723444, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.921875, + "grad_norm": 1.403338545743401, + "kl": 0.30029296875, + "learning_rate": 3.1630150475258813e-06, + "loss": 0.0342, + "num_tokens": 7429962.0, + "reward": 0.0703125, + "reward_std": 0.11608850955963135, + "rewards/code_reward/mean": 0.0703125, + "rewards/code_reward/std": 0.1160885114222765, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 230 }, { "clip_ratio": 0.0, - "completion_length": 351.09375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 183.46875, + "completions/mean_terminated_length": 183.46875, + "completions/min_length": 92.75, + "completions/min_terminated_length": 92.75, "epoch": 0.02585700293829579, - "grad_norm": 367.82367126923293, - "kl": 26.0625, - "learning_rate": 3.1343276867775805e-06, - "loss": 0.19, - "reward": 0.09531250037252903, - "reward_std": 0.018750000279396772, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.953125, + "grad_norm": 1.2487410703105084, + "kl": 0.259765625, + "learning_rate": 3.148679731053252e-06, + "loss": -0.0378, + "num_tokens": 7455169.0, + "reward": 0.21321137621998787, + "reward_std": 0.2805868834257126, + "rewards/code_reward/mean": 0.21321137621998787, + "rewards/code_reward/std": 0.2805868834257126, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 231 }, { "clip_ratio": 0.0, - "completion_length": 366.828125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.25, + "completions/max_terminated_length": 303.25, + "completions/mean_length": 153.84375, + "completions/mean_terminated_length": 153.84375, + "completions/min_length": 89.75, + "completions/min_terminated_length": 89.75, "epoch": 0.025968938015950748, - "grad_norm": 2760.034913740409, - "kl": 32.2548828125, - "learning_rate": 3.1199595168819043e-06, - "loss": 0.4045, - "reward": 0.09531250223517418, - "reward_std": 0.01478912541642785, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.953125, + "grad_norm": 1.0579237191929745, + "kl": 0.232421875, + "learning_rate": 3.1343276867775805e-06, + "loss": 0.0811, + "num_tokens": 7480004.0, + "reward": 0.1274509804788977, + "reward_std": 0.2177756354212761, + "rewards/code_reward/mean": 0.1274509804788977, + "rewards/code_reward/std": 0.2177756503224373, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 232 }, { "clip_ratio": 0.0, - "completion_length": 421.625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.75, + "completions/max_terminated_length": 246.75, + "completions/mean_length": 166.1875, + "completions/mean_terminated_length": 166.1875, + "completions/min_length": 100.5, + "completions/min_terminated_length": 100.5, "epoch": 0.026080873093605707, - "grad_norm": 24.759326550765813, - "kl": 2.56591796875, - "learning_rate": 3.105575824225852e-06, - "loss": 0.1236, - "reward": 0.09531250037252903, - "reward_std": 0.018750000279396772, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.953125, + "grad_norm": 1.373681707551141, + "kl": 0.2509765625, + "learning_rate": 3.1199595168819043e-06, + "loss": 0.005, + "num_tokens": 7508034.0, + "reward": 0.2599347122013569, + "reward_std": 0.22181765362620354, + "rewards/code_reward/mean": 0.2599347122013569, + "rewards/code_reward/std": 0.22181766107678413, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 233 }, { "clip_ratio": 0.0, - "completion_length": 352.453125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.25, + "completions/max_terminated_length": 369.25, + "completions/mean_length": 237.15625, + "completions/mean_terminated_length": 237.15625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, "epoch": 0.02619280817126067, - "grad_norm": 11.271203444155871, - "kl": 0.311767578125, - "learning_rate": 3.091177212320363e-06, - "loss": 0.0506, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 1.449039323532629, + "kl": 0.205322265625, + "learning_rate": 3.105575824225852e-06, + "loss": 0.0024, + "num_tokens": 7536911.0, + "reward": 0.17171062319539487, + "reward_std": 0.1438203388825059, + "rewards/code_reward/mean": 0.17171062319539487, + "rewards/code_reward/std": 0.1438203463330865, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 234 }, { "clip_ratio": 0.0, - "completion_length": 294.984375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.75, + "completions/max_terminated_length": 345.75, + "completions/mean_length": 170.21875, + "completions/mean_terminated_length": 170.21875, + "completions/min_length": 95.75, + "completions/min_terminated_length": 95.75, "epoch": 0.02630474324891563, - "grad_norm": 60.98662683249761, - "kl": 6.624267578125, - "learning_rate": 3.0767642853023538e-06, - "loss": 0.0659, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.0698079544355648, + "kl": 0.1624755859375, + "learning_rate": 3.091177212320363e-06, + "loss": -0.1894, + "num_tokens": 7554142.0, + "reward": 0.3125, + "reward_std": 0.1462521031498909, + "rewards/code_reward/mean": 0.3125, + "rewards/code_reward/std": 0.1462521031498909, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 235 }, { "clip_ratio": 0.0, - "completion_length": 387.59375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 175.46875, + "completions/mean_terminated_length": 175.46875, + "completions/min_length": 112.75, + "completions/min_terminated_length": 112.75, "epoch": 0.02641667832657059, - "grad_norm": 8.237085005507922, - "kl": 0.9478759765625, - "learning_rate": 3.062337647909376e-06, - "loss": -0.0391, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.4540255844594339, + "kl": 0.2158203125, + "learning_rate": 3.0767642853023538e-06, + "loss": -0.0223, + "num_tokens": 7584357.0, + "reward": 0.21975820884108543, + "reward_std": 0.1363154649734497, + "rewards/code_reward/mean": 0.21975820884108543, + "rewards/code_reward/std": 0.1363154649734497, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 236 }, { "clip_ratio": 0.0, - "completion_length": 260.09375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.25, + "completions/max_terminated_length": 228.25, + "completions/mean_length": 116.59375, + "completions/mean_terminated_length": 116.59375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, "epoch": 0.02652861340422555, - "grad_norm": 18.57477238550014, - "kl": 2.2236328125, - "learning_rate": 3.04789790545424e-06, - "loss": 0.0015, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.9465041168415773, + "kl": 0.247802734375, + "learning_rate": 3.062337647909376e-06, + "loss": -0.039, + "num_tokens": 7602040.0, + "reward": 0.4692905358970165, + "reward_std": 0.24660581722855568, + "rewards/code_reward/mean": 0.4692905358970165, + "rewards/code_reward/std": 0.24660583958029747, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 237 }, { "clip_ratio": 0.0, - "completion_length": 322.015625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.75, + "completions/max_terminated_length": 583.75, + "completions/mean_length": 281.21875, + "completions/mean_terminated_length": 281.21875, + "completions/min_length": 133.5, + "completions/min_terminated_length": 133.5, "epoch": 0.026640548481880508, - "grad_norm": 16.627244952129402, - "kl": 0.564453125, - "learning_rate": 3.033445663799621e-06, - "loss": 0.0793, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 1.333783365108213, + "kl": 0.1785888671875, + "learning_rate": 3.04789790545424e-06, + "loss": 0.0396, + "num_tokens": 7627319.0, + "reward": 0.17131002363748848, + "reward_std": 0.18950149056036025, + "rewards/code_reward/mean": 0.17131002363748848, + "rewards/code_reward/std": 0.18950149248121306, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 238 }, { "clip_ratio": 0.0, - "completion_length": 315.96875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.5, + "completions/max_terminated_length": 231.5, + "completions/mean_length": 128.28125, + "completions/mean_terminated_length": 128.28125, + "completions/min_length": 69.25, + "completions/min_terminated_length": 69.25, "epoch": 0.02675248355953547, - "grad_norm": 54.633519018974916, - "kl": 4.1083984375, - "learning_rate": 3.018981529332633e-06, - "loss": 0.1508, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.8510436652715602, + "kl": 0.250732421875, + "learning_rate": 3.033445663799621e-06, + "loss": -0.0327, + "num_tokens": 7644360.0, + "reward": 0.07549504935741425, + "reward_std": 0.07982433587312698, + "rewards/code_reward/mean": 0.07549504935741425, + "rewards/code_reward/std": 0.07982433587312698, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 239 }, { "clip_ratio": 0.0, - "completion_length": 380.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.25, + "completions/max_terminated_length": 198.25, + "completions/mean_length": 118.28125, + "completions/mean_terminated_length": 118.28125, + "completions/min_length": 79.5, + "completions/min_terminated_length": 79.5, "epoch": 0.02686441863719043, - "grad_norm": 8.038755997888481, - "kl": 3.0703125, - "learning_rate": 3.00450610893939e-06, - "loss": 0.0033, - "reward": 0.09531250223517418, - "reward_std": 0.01478912541642785, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.953125, + "grad_norm": 2.420152417522015, + "kl": 0.25146484375, + "learning_rate": 3.018981529332633e-06, + "loss": 0.0544, + "num_tokens": 7661793.0, + "reward": 0.07239184161880985, + "reward_std": 0.05399157607462257, + "rewards/code_reward/mean": 0.07239184161880985, + "rewards/code_reward/std": 0.05399157712236047, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 240 }, { "clip_ratio": 0.0, - "completion_length": 386.46875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 212.125, + "completions/mean_terminated_length": 212.125, + "completions/min_length": 119.25, + "completions/min_terminated_length": 119.25, "epoch": 0.02697635371484539, - "grad_norm": 12.489323661336277, - "kl": 1.4534912109375, - "learning_rate": 2.9900200099795396e-06, - "loss": 0.0417, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.3615781255434425, + "kl": 0.1885986328125, + "learning_rate": 3.00450610893939e-06, + "loss": 0.0817, + "num_tokens": 7685573.0, + "reward": 0.2050044471397996, + "reward_std": 0.1304325871169567, + "rewards/code_reward/mean": 0.2050044471397996, + "rewards/code_reward/std": 0.13043258781544864, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 241 }, { "clip_ratio": 0.0, - "completion_length": 345.390625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.75, + "completions/max_terminated_length": 230.75, + "completions/mean_length": 144.375, + "completions/mean_terminated_length": 144.375, + "completions/min_length": 89.5, + "completions/min_terminated_length": 89.5, "epoch": 0.02708828879250035, - "grad_norm": 5.250434377988738, - "kl": 1.822265625, - "learning_rate": 2.9755238402607826e-06, - "loss": -0.0469, - "reward": 0.0937500037252903, - "reward_std": 0.017078250646591187, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.9375, + "grad_norm": 1.4458971248752004, + "kl": 0.3115234375, + "learning_rate": 2.9900200099795396e-06, + "loss": 0.1362, + "num_tokens": 7711785.0, + "reward": 0.12025879789143801, + "reward_std": 0.10159321606624871, + "rewards/code_reward/mean": 0.12025879789143801, + "rewards/code_reward/std": 0.10159321606624871, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 242 }, { "clip_ratio": 0.0, - "completion_length": 369.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 615.5, + "completions/max_terminated_length": 615.5, + "completions/mean_length": 211.875, + "completions/mean_terminated_length": 211.875, + "completions/min_length": 117.75, + "completions/min_terminated_length": 117.75, "epoch": 0.02720022387015531, - "grad_norm": 57.97881368696394, - "kl": 5.73095703125, - "learning_rate": 2.961018208013367e-06, - "loss": 0.2336, - "reward": 0.09531250223517418, - "reward_std": 0.01478912541642785, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.953125, + "grad_norm": 0.9925754447279824, + "kl": 0.1759033203125, + "learning_rate": 2.9755238402607826e-06, + "loss": 0.0145, + "num_tokens": 7736909.0, + "reward": 0.2357253096997738, + "reward_std": 0.11498994007706642, + "rewards/code_reward/mean": 0.2357253096997738, + "rewards/code_reward/std": 0.11498994193971157, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 243 }, { "clip_ratio": 0.0, - "completion_length": 264.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.25, + "completions/max_terminated_length": 201.25, + "completions/mean_length": 117.375, + "completions/mean_terminated_length": 117.375, + "completions/min_length": 68.75, + "completions/min_terminated_length": 68.75, "epoch": 0.02731215894781027, - "grad_norm": 49.75089750933376, - "kl": 4.38427734375, - "learning_rate": 2.9465037218645694e-06, - "loss": 0.0314, - "reward": 0.09375000186264515, - "reward_std": 0.021039125509560108, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.9375, + "grad_norm": 1.7694959761730793, + "kl": 0.1773681640625, + "learning_rate": 2.961018208013367e-06, + "loss": 0.0806, + "num_tokens": 7753785.0, + "reward": 0.26032672822475433, + "reward_std": 0.2158903395757079, + "rewards/code_reward/mean": 0.26032672822475433, + "rewards/code_reward/std": 0.21589034423232079, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 244 }, { "clip_ratio": 0.0, - "completion_length": 322.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.5, + "completions/max_terminated_length": 259.5, + "completions/mean_length": 134.0, + "completions/mean_terminated_length": 134.0, + "completions/min_length": 77.75, + "completions/min_terminated_length": 77.75, "epoch": 0.02742409402546523, - "grad_norm": 1.707430633827994, - "kl": 0.204833984375, - "learning_rate": 2.9319809908131604e-06, - "loss": 0.002, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.5239142070937466, + "kl": 0.1982421875, + "learning_rate": 2.9465037218645694e-06, + "loss": 0.0341, + "num_tokens": 7770921.0, + "reward": 0.15353127755224705, + "reward_std": 0.1622099713422358, + "rewards/code_reward/mean": 0.15353127755224705, + "rewards/code_reward/std": 0.16220997110940516, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 245 }, { "clip_ratio": 0.0, - "completion_length": 376.109375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.25, + "completions/max_terminated_length": 193.25, + "completions/mean_length": 129.40625, + "completions/mean_terminated_length": 129.40625, + "completions/min_length": 83.75, + "completions/min_terminated_length": 83.75, "epoch": 0.02753602910312019, - "grad_norm": 13.47781256474575, - "kl": 0.590576171875, - "learning_rate": 2.917450624203847e-06, - "loss": 0.0719, - "reward": 0.09531250223517418, - "reward_std": 0.01478912541642785, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.953125, + "grad_norm": 1.4700135521247835, + "kl": 0.311767578125, + "learning_rate": 2.9319809908131604e-06, + "loss": -0.0235, + "num_tokens": 7793438.0, + "reward": 0.22987624257802963, + "reward_std": 0.19782325625419617, + "rewards/code_reward/mean": 0.22987624257802963, + "rewards/code_reward/std": 0.19782325625419617, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 246 }, { "clip_ratio": 0.0, - "completion_length": 376.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 221.25, + "completions/max_terminated_length": 221.25, + "completions/mean_length": 132.5625, + "completions/mean_terminated_length": 132.5625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, "epoch": 0.02764796418077515, - "grad_norm": 1680.1301953567038, - "kl": 79.1229248046875, - "learning_rate": 2.9029132317017118e-06, - "loss": 0.7462, - "reward": 0.09687500260770321, - "reward_std": 0.008539125323295593, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 0.7938111312829735, + "kl": 0.2685546875, + "learning_rate": 2.917450624203847e-06, + "loss": 0.0108, + "num_tokens": 7811344.0, + "reward": 0.1285112500190735, + "reward_std": 0.03530046343803406, + "rewards/code_reward/mean": 0.1285112500190735, + "rewards/code_reward/std": 0.03530046343803406, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 247 }, { "clip_ratio": 0.0, - "completion_length": 379.234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 155.75, + "completions/mean_terminated_length": 155.75, + "completions/min_length": 72.75, + "completions/min_terminated_length": 72.75, "epoch": 0.02775989925843011, - "grad_norm": 0.6798016263852162, - "kl": 0.1409912109375, - "learning_rate": 2.888369423266629e-06, - "loss": 0.0014, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.461851676320045, + "kl": 0.321533203125, + "learning_rate": 2.9029132317017118e-06, + "loss": 0.0822, + "num_tokens": 7836400.0, + "reward": 0.07058638549642637, + "reward_std": 0.09412376256659627, + "rewards/code_reward/mean": 0.07058638549642637, + "rewards/code_reward/std": 0.09412377001717687, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 248 }, { "clip_ratio": 0.0, - "completion_length": 352.953125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.25, + "completions/max_terminated_length": 184.25, + "completions/mean_length": 131.59375, + "completions/mean_terminated_length": 131.59375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, "epoch": 0.02787183433608507, - "grad_norm": 5.926518387479333, - "kl": 0.63525390625, - "learning_rate": 2.8738198091276712e-06, - "loss": -0.0057, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 1.6504030899968662, + "kl": 0.275634765625, + "learning_rate": 2.888369423266629e-06, + "loss": 0.0701, + "num_tokens": 7857059.0, + "reward": 0.059203914599493146, + "reward_std": 0.09994567523244768, + "rewards/code_reward/mean": 0.059203914599493146, + "rewards/code_reward/std": 0.09994568361435086, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 249 }, { "clip_ratio": 0.0, - "completion_length": 400.359375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.5, + "completions/max_terminated_length": 415.5, + "completions/mean_length": 206.15625, + "completions/mean_terminated_length": 206.15625, + "completions/min_length": 104.25, + "completions/min_terminated_length": 104.25, "epoch": 0.02798376941374003, - "grad_norm": 73.88428710060259, - "kl": 6.923828125, - "learning_rate": 2.859264999757509e-06, - "loss": 0.2651, - "reward": 0.09531250223517418, - "reward_std": 0.01478912541642785, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.953125, + "grad_norm": 1.159655895359276, + "kl": 0.165283203125, + "learning_rate": 2.8738198091276712e-06, + "loss": -0.0308, + "num_tokens": 7882080.0, + "reward": 0.10000000149011612, + "reward_std": 0.1331607922911644, + "rewards/code_reward/mean": 0.10000000149011612, + "rewards/code_reward/std": 0.1331607922911644, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 250 }, { "clip_ratio": 0.0, - "completion_length": 413.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.25, + "completions/max_terminated_length": 266.25, + "completions/mean_length": 179.0, + "completions/mean_terminated_length": 179.0, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, "epoch": 0.02809570449139499, - "grad_norm": 19.57610821275727, - "kl": 1.18603515625, - "learning_rate": 2.8447056058467928e-06, - "loss": -0.0419, - "reward": 0.09531250223517418, - "reward_std": 0.01478912541642785, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.953125, + "grad_norm": 1.3860909228149765, + "kl": 0.3243408203125, + "learning_rate": 2.859264999757509e-06, + "loss": -0.0087, + "num_tokens": 7904552.0, + "reward": 0.17192643135786057, + "reward_std": 0.12011632975190878, + "rewards/code_reward/mean": 0.17192643135786057, + "rewards/code_reward/std": 0.12011633953079581, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 251 }, { "clip_ratio": 0.0, - "completion_length": 357.59375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 111.34375, + "completions/mean_terminated_length": 111.34375, + "completions/min_length": 78.25, + "completions/min_terminated_length": 78.25, "epoch": 0.028207639569049953, - "grad_norm": 15.02657989678683, + "grad_norm": 2.5730420152805027, "kl": 0.35986328125, - "learning_rate": 2.830142238278531e-06, - "loss": 0.0012, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "learning_rate": 2.8447056058467928e-06, + "loss": -0.0585, + "num_tokens": 7929307.0, + "reward": 0.17651335208211094, + "reward_std": 0.22341035841964185, + "rewards/code_reward/mean": 0.17651335208211094, + "rewards/code_reward/std": 0.22341035841964185, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 252 }, { "clip_ratio": 0.0, - "completion_length": 381.203125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.5, + "completions/max_terminated_length": 225.5, + "completions/mean_length": 145.40625, + "completions/mean_terminated_length": 145.40625, + "completions/min_length": 86.25, + "completions/min_terminated_length": 86.25, "epoch": 0.028319574646704912, - "grad_norm": 2.875721583070933, - "kl": 0.475341796875, - "learning_rate": 2.81557550810246e-06, - "loss": -0.0231, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.8094443062077088, + "kl": 0.36767578125, + "learning_rate": 2.830142238278531e-06, + "loss": 0.0709, + "num_tokens": 7952504.0, + "reward": 0.2017338698497042, + "reward_std": 0.20673675020225346, + "rewards/code_reward/mean": 0.2017338698497042, + "rewards/code_reward/std": 0.20673674996942282, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 253 }, { "clip_ratio": 0.0, - "completion_length": 371.046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.25, + "completions/max_terminated_length": 203.25, + "completions/mean_length": 119.84375, + "completions/mean_terminated_length": 119.84375, + "completions/min_length": 75.75, + "completions/min_terminated_length": 75.75, "epoch": 0.02843150972435987, - "grad_norm": 0.6092524986743804, - "kl": 0.1376953125, - "learning_rate": 2.8010060265094026e-06, - "loss": 0.0014, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 2.3402410563756315, + "kl": 0.354736328125, + "learning_rate": 2.81557550810246e-06, + "loss": -0.0806, + "num_tokens": 7976539.0, + "reward": 0.3907702271826565, + "reward_std": 0.26296099089086056, + "rewards/code_reward/mean": 0.3907702271826565, + "rewards/code_reward/std": 0.26296099927276373, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 254 }, { "clip_ratio": 0.0, - "completion_length": 375.78125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 128.8125, + "completions/mean_terminated_length": 128.8125, + "completions/min_length": 72.5, + "completions/min_terminated_length": 72.5, "epoch": 0.02854344480201483, - "grad_norm": 1.3831215185283958, - "kl": 0.130615234375, - "learning_rate": 2.786434404805629e-06, - "loss": -0.032, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.6470365991478257, + "kl": 0.3701171875, + "learning_rate": 2.8010060265094026e-06, + "loss": 0.0623, + "num_tokens": 7998165.0, + "reward": 0.14626706298440695, + "reward_std": 0.13755429768934846, + "rewards/code_reward/mean": 0.14626706298440695, + "rewards/code_reward/std": 0.13755429675802588, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 255 }, { "clip_ratio": 0.0, - "completion_length": 394.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 214.6875, + "completions/mean_terminated_length": 214.6875, + "completions/min_length": 67.5, + "completions/min_terminated_length": 67.5, "epoch": 0.02865537987966979, - "grad_norm": 0.29391488966565826, - "kl": 0.1136474609375, - "learning_rate": 2.771861254387199e-06, - "loss": 0.0011, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.5339855216333798, + "kl": 0.224365234375, + "learning_rate": 2.786434404805629e-06, + "loss": 0.0387, + "num_tokens": 8031691.0, + "reward": 0.09155143890529871, + "reward_std": 0.09001913899555802, + "rewards/code_reward/mean": 0.09155143890529871, + "rewards/code_reward/std": 0.09001914283726364, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 256 }, { "clip_ratio": 0.0, - "completion_length": 418.015625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 145.6875, + "completions/mean_terminated_length": 145.6875, + "completions/min_length": 88.75, + "completions/min_terminated_length": 88.75, "epoch": 0.028767314957324753, - "grad_norm": 8.597539761339826, - "kl": 0.3472900390625, - "learning_rate": 2.7572871867143204e-06, - "loss": 0.0113, - "reward": 0.09687500260770321, - "reward_std": 0.008539125323295593, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 1.8276516553873643, + "kl": 0.37451171875, + "learning_rate": 2.771861254387199e-06, + "loss": -0.0261, + "num_tokens": 8052065.0, + "reward": 0.21322817541658878, + "reward_std": 0.16002243757247925, + "rewards/code_reward/mean": 0.21322817541658878, + "rewards/code_reward/std": 0.16002243757247925, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 257 }, { "clip_ratio": 0.0, - "completion_length": 425.578125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.75, + "completions/max_terminated_length": 243.75, + "completions/mean_length": 164.90625, + "completions/mean_terminated_length": 164.90625, + "completions/min_length": 109.25, + "completions/min_terminated_length": 109.25, "epoch": 0.028879250034979712, - "grad_norm": 0.5239580359655661, - "kl": 0.1436767578125, - "learning_rate": 2.742712813285681e-06, - "loss": 0.0014, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.559190798859591, + "kl": 0.28076171875, + "learning_rate": 2.7572871867143204e-06, + "loss": 0.0077, + "num_tokens": 8082230.0, + "reward": 0.19782285764813423, + "reward_std": 0.23767431639134884, + "rewards/code_reward/mean": 0.19782285764813423, + "rewards/code_reward/std": 0.23767432384192944, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 258 }, { "clip_ratio": 0.0, - "completion_length": 341.625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.5, + "completions/max_terminated_length": 329.5, + "completions/mean_length": 156.125, + "completions/mean_terminated_length": 156.125, + "completions/min_length": 83.5, + "completions/min_terminated_length": 83.5, "epoch": 0.02899118511263467, - "grad_norm": 1.9555271298208972, - "kl": 0.2032470703125, - "learning_rate": 2.7281387456128017e-06, - "loss": 0.002, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.9033316433119087, + "kl": 0.302978515625, + "learning_rate": 2.742712813285681e-06, + "loss": 0.0697, + "num_tokens": 8106786.0, + "reward": 0.0914294570684433, + "reward_std": 0.09275190159678459, + "rewards/code_reward/mean": 0.0914294570684433, + "rewards/code_reward/std": 0.09275190159678459, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 259 }, { "clip_ratio": 0.0, - "completion_length": 368.65625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 123.71875, + "completions/mean_terminated_length": 123.71875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, "epoch": 0.02910312019028963, - "grad_norm": 1.252836717471123, - "kl": 0.2208251953125, - "learning_rate": 2.7135655951943716e-06, - "loss": 0.0022, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.9152198651462768, + "kl": 0.333984375, + "learning_rate": 2.7281387456128017e-06, + "loss": 0.014, + "num_tokens": 8126217.0, + "reward": 0.21696891635656357, + "reward_std": 0.28822916746139526, + "rewards/code_reward/mean": 0.21696891635656357, + "rewards/code_reward/std": 0.28822918236255646, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 260 }, { "clip_ratio": 0.0, - "completion_length": 375.59375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.75, + "completions/max_terminated_length": 369.75, + "completions/mean_length": 167.9375, + "completions/mean_terminated_length": 167.9375, + "completions/min_length": 73.25, + "completions/min_terminated_length": 73.25, "epoch": 0.029215055267944594, - "grad_norm": 0.8719765743586952, - "kl": 0.1676025390625, - "learning_rate": 2.698993973490598e-06, - "loss": 0.0017, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.9135668163744887, + "kl": 0.24755859375, + "learning_rate": 2.7135655951943716e-06, + "loss": -0.0166, + "num_tokens": 8150855.0, + "reward": 0.03386699501425028, + "reward_std": 0.06380424555391073, + "rewards/code_reward/mean": 0.03386699501425028, + "rewards/code_reward/std": 0.06380424555391073, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 261 }, { "clip_ratio": 0.0, - "completion_length": 333.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.75, + "completions/max_terminated_length": 271.75, + "completions/mean_length": 116.125, + "completions/mean_terminated_length": 116.125, + "completions/min_length": 61.75, + "completions/min_terminated_length": 61.75, "epoch": 0.029326990345599553, - "grad_norm": 0.13713063096829928, - "kl": 0.10205078125, - "learning_rate": 2.6844244918975416e-06, - "loss": 0.001, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.6076094252791506, + "kl": 0.31201171875, + "learning_rate": 2.698993973490598e-06, + "loss": 0.0945, + "num_tokens": 8165387.0, + "reward": 0.20722341747023165, + "reward_std": 0.11584698176011443, + "rewards/code_reward/mean": 0.20722341747023165, + "rewards/code_reward/std": 0.11584698967635632, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 262 }, { "clip_ratio": 0.0, - "completion_length": 320.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.5, + "completions/max_terminated_length": 260.5, + "completions/mean_length": 160.3125, + "completions/mean_terminated_length": 160.3125, + "completions/min_length": 107.5, + "completions/min_terminated_length": 107.5, "epoch": 0.029438925423254513, - "grad_norm": 1.2802535059679794, - "kl": 0.09423828125, - "learning_rate": 2.66985776172147e-06, - "loss": 0.0411, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.2712990086273, + "kl": 0.266357421875, + "learning_rate": 2.6844244918975416e-06, + "loss": 0.0136, + "num_tokens": 8185445.0, + "reward": 0.12302206363528967, + "reward_std": 0.1178859043866396, + "rewards/code_reward/mean": 0.12302206363528967, + "rewards/code_reward/std": 0.11788590624928474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 263 }, { "clip_ratio": 0.0, - "completion_length": 434.359375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.75, + "completions/max_terminated_length": 276.75, + "completions/mean_length": 177.96875, + "completions/mean_terminated_length": 177.96875, + "completions/min_length": 109.25, + "completions/min_terminated_length": 109.25, "epoch": 0.029550860500909472, - "grad_norm": 3.6751799791380084, - "kl": 0.1397705078125, - "learning_rate": 2.6552943941532088e-06, - "loss": -0.0156, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 1.5932680686482104, + "kl": 0.27490234375, + "learning_rate": 2.66985776172147e-06, + "loss": -0.0647, + "num_tokens": 8214836.0, + "reward": 0.3632364124059677, + "reward_std": 0.24340662360191345, + "rewards/code_reward/mean": 0.3632364124059677, + "rewards/code_reward/std": 0.24340663105249405, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 264 }, { "clip_ratio": 0.0, - "completion_length": 352.90625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 131.03125, + "completions/mean_terminated_length": 131.03125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, "epoch": 0.02966279557856443, - "grad_norm": 0.07806269511080617, - "kl": 0.0855712890625, - "learning_rate": 2.6407350002424927e-06, - "loss": 0.0009, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.3583412717133385, + "kl": 0.3232421875, + "learning_rate": 2.6552943941532088e-06, + "loss": 0.0688, + "num_tokens": 8233101.0, + "reward": 0.19121321476995945, + "reward_std": 0.1444133589975536, + "rewards/code_reward/mean": 0.19121321476995945, + "rewards/code_reward/std": 0.14441336272284389, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 265 }, { "clip_ratio": 0.0, - "completion_length": 298.84375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.25, + "completions/max_terminated_length": 195.25, + "completions/mean_length": 131.9375, + "completions/mean_terminated_length": 131.9375, + "completions/min_length": 98.75, + "completions/min_terminated_length": 98.75, "epoch": 0.029774730656219394, - "grad_norm": 0.06731730411902957, - "kl": 0.087646484375, - "learning_rate": 2.626180190872329e-06, - "loss": 0.0009, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.2376740863800209, + "kl": 0.347900390625, + "learning_rate": 2.6407350002424927e-06, + "loss": -0.0064, + "num_tokens": 8253363.0, + "reward": 0.24439102411270142, + "reward_std": 0.17834187299013138, + "rewards/code_reward/mean": 0.24439102411270142, + "rewards/code_reward/std": 0.17834188044071198, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 266 }, { "clip_ratio": 0.0, - "completion_length": 295.734375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.5, + "completions/max_terminated_length": 177.5, + "completions/mean_length": 106.03125, + "completions/mean_terminated_length": 106.03125, + "completions/min_length": 66.5, + "completions/min_terminated_length": 66.5, "epoch": 0.029886665733874353, - "grad_norm": 0.13087790730798726, - "kl": 0.0902099609375, - "learning_rate": 2.611630576733372e-06, - "loss": 0.0009, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.4198187332614487, + "kl": 0.30126953125, + "learning_rate": 2.626180190872329e-06, + "loss": -0.047, + "num_tokens": 8267084.0, + "reward": 0.04570374824106693, + "reward_std": 0.034461796283721924, + "rewards/code_reward/mean": 0.04570374824106693, + "rewards/code_reward/std": 0.03446180047467351, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 267 }, { "clip_ratio": 0.0, - "completion_length": 273.078125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 119.0, + "completions/mean_terminated_length": 119.0, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, "epoch": 0.029998600811529313, - "grad_norm": 5.554675473835038, - "kl": 0.14794921875, - "learning_rate": 2.5970867682982885e-06, - "loss": -0.051, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 2.766302572356958, + "kl": 0.254150390625, + "learning_rate": 2.611630576733372e-06, + "loss": 0.0719, + "num_tokens": 8285988.0, + "reward": 0.20828989439178258, + "reward_std": 0.1855767808156088, + "rewards/code_reward/mean": 0.20828989439178258, + "rewards/code_reward/std": 0.18557679950026795, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 268 }, { "clip_ratio": 0.0, - "completion_length": 344.1875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 205.78125, + "completions/mean_terminated_length": 205.78125, + "completions/min_length": 125.5, + "completions/min_terminated_length": 125.5, "epoch": 0.030110535889184272, - "grad_norm": 0.14583524682041712, - "kl": 0.0927734375, - "learning_rate": 2.582549375796154e-06, - "loss": 0.0009, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.559152868059546, + "kl": 0.216064453125, + "learning_rate": 2.5970867682982885e-06, + "loss": 0.0113, + "num_tokens": 8315381.0, + "reward": 0.01448170654475689, + "reward_std": 0.03370444104075432, + "rewards/code_reward/mean": 0.01448170654475689, + "rewards/code_reward/std": 0.03370444104075432, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 269 }, { "clip_ratio": 0.0, - "completion_length": 290.578125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 146.375, + "completions/mean_terminated_length": 146.375, + "completions/min_length": 88.75, + "completions/min_terminated_length": 88.75, "epoch": 0.030222470966839235, - "grad_norm": 0.058225539376494065, - "kl": 0.077880859375, - "learning_rate": 2.568019009186841e-06, - "loss": 0.0008, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.7716322097224022, + "kl": 0.329833984375, + "learning_rate": 2.582549375796154e-06, + "loss": 0.1976, + "num_tokens": 8336289.0, + "reward": 0.1647916678339243, + "reward_std": 0.1911229882389307, + "rewards/code_reward/mean": 0.1647916678339243, + "rewards/code_reward/std": 0.1911229882389307, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 270 }, { "clip_ratio": 0.0, - "completion_length": 361.6875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.25, + "completions/max_terminated_length": 275.25, + "completions/mean_length": 184.71875, + "completions/mean_terminated_length": 184.71875, + "completions/min_length": 112.25, + "completions/min_terminated_length": 112.25, "epoch": 0.030334406044494194, - "grad_norm": 0.04344087707852648, - "kl": 0.0789794921875, - "learning_rate": 2.5534962781354317e-06, - "loss": 0.0008, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.9190427217910049, + "kl": 0.28369140625, + "learning_rate": 2.568019009186841e-06, + "loss": -0.014, + "num_tokens": 8358944.0, + "reward": 0.20673798964708112, + "reward_std": 0.11309454750153236, + "rewards/code_reward/mean": 0.20673798964708112, + "rewards/code_reward/std": 0.11309454750153236, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 271 }, { "clip_ratio": 0.0, - "completion_length": 305.953125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.5, + "completions/max_terminated_length": 260.5, + "completions/mean_length": 145.5625, + "completions/mean_terminated_length": 145.5625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, "epoch": 0.030446341122149154, - "grad_norm": 0.07717630877616868, - "kl": 0.07293701171875, - "learning_rate": 2.538981791986634e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.575353161865142, + "kl": 0.3779296875, + "learning_rate": 2.5534962781354317e-06, + "loss": 0.1436, + "num_tokens": 8380378.0, + "reward": 0.240084670484066, + "reward_std": 0.27030207961797714, + "rewards/code_reward/mean": 0.240084670484066, + "rewards/code_reward/std": 0.27030208706855774, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 272 }, { "clip_ratio": 0.0, - "completion_length": 419.171875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.25, + "completions/max_terminated_length": 411.25, + "completions/mean_length": 185.34375, + "completions/mean_terminated_length": 185.34375, + "completions/min_length": 99.25, + "completions/min_terminated_length": 99.25, "epoch": 0.030558276199804113, - "grad_norm": 0.22582461693188216, - "kl": 0.0968017578125, - "learning_rate": 2.524476159739218e-06, - "loss": 0.001, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.529799496784454, + "kl": 0.27734375, + "learning_rate": 2.538981791986634e-06, + "loss": -0.072, + "num_tokens": 8410077.0, + "reward": 0.36352282762527466, + "reward_std": 0.24801481142640114, + "rewards/code_reward/mean": 0.36352282762527466, + "rewards/code_reward/std": 0.24801481887698174, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 273 }, { "clip_ratio": 0.0, - "completion_length": 299.953125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.5, + "completions/max_terminated_length": 322.5, + "completions/mean_length": 156.71875, + "completions/mean_terminated_length": 156.71875, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, "epoch": 0.030670211277459072, - "grad_norm": 27.999475574336614, - "kl": 0.5250244140625, - "learning_rate": 2.5099799900204607e-06, - "loss": 0.0169, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.670315398590079, + "kl": 0.28173828125, + "learning_rate": 2.524476159739218e-06, + "loss": -0.0316, + "num_tokens": 8433564.0, + "reward": 0.08395027136430144, + "reward_std": 0.10782372578978539, + "rewards/code_reward/mean": 0.08395027136430144, + "rewards/code_reward/std": 0.10782372625544667, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 274 }, { "clip_ratio": 0.0, - "completion_length": 357.21875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 130.09375, + "completions/mean_terminated_length": 130.09375, + "completions/min_length": 76.75, + "completions/min_terminated_length": 76.75, "epoch": 0.030782146355114035, - "grad_norm": 0.034837371039734215, - "kl": 0.07232666015625, - "learning_rate": 2.4954938910606108e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.8106311365726324, + "kl": 0.3251953125, + "learning_rate": 2.5099799900204607e-06, + "loss": 0.0782, + "num_tokens": 8452687.0, + "reward": 0.32567203789949417, + "reward_std": 0.27224994264543056, + "rewards/code_reward/mean": 0.32567203789949417, + "rewards/code_reward/std": 0.27224994637072086, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 275 }, { "clip_ratio": 0.0, - "completion_length": 325.265625, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 657.75, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 171.0, + "completions/mean_terminated_length": 111.20089340209961, + "completions/min_length": 57.25, + "completions/min_terminated_length": 57.25, "epoch": 0.030894081432768995, - "grad_norm": 18.866980558523725, - "kl": 1.3262939453125, - "learning_rate": 2.481018470667368e-06, - "loss": 0.0199, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 1.8321308568390302, + "kl": 0.321533203125, + "learning_rate": 2.4954938910606108e-06, + "loss": 0.1624, + "num_tokens": 8475671.0, + "reward": 0.14915229193866253, + "reward_std": 0.11303082318045199, + "rewards/code_reward/mean": 0.14915229193866253, + "rewards/code_reward/std": 0.1130308248102665, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 276 }, { "clip_ratio": 0.0, - "completion_length": 375.71875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.25, + "completions/max_terminated_length": 378.25, + "completions/mean_length": 218.875, + "completions/mean_terminated_length": 218.875, + "completions/min_length": 114.5, + "completions/min_terminated_length": 114.5, "epoch": 0.031006016510423954, - "grad_norm": 0.08715091318862284, - "kl": 0.0843505859375, - "learning_rate": 2.4665543362003802e-06, - "loss": 0.0008, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.5281779015369072, + "kl": 0.2308349609375, + "learning_rate": 2.481018470667368e-06, + "loss": 0.1693, + "num_tokens": 8502299.0, + "reward": 0.18173168785870075, + "reward_std": 0.10828323196619749, + "rewards/code_reward/mean": 0.18173168785870075, + "rewards/code_reward/std": 0.10828323615714908, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 277 }, { "clip_ratio": 0.0, - "completion_length": 383.90625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.5, + "completions/max_terminated_length": 260.5, + "completions/mean_length": 150.34375, + "completions/mean_terminated_length": 150.34375, + "completions/min_length": 71.75, + "completions/min_terminated_length": 71.75, "epoch": 0.031117951588078913, - "grad_norm": 18.97786056185319, - "kl": 0.388671875, - "learning_rate": 2.4521020945457615e-06, - "loss": 0.0333, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.6397888672992769, + "kl": 0.310546875, + "learning_rate": 2.4665543362003802e-06, + "loss": 0.0215, + "num_tokens": 8528406.0, + "reward": 0.12074580090120435, + "reward_std": 0.17130711488425732, + "rewards/code_reward/mean": 0.12074580090120435, + "rewards/code_reward/std": 0.17130712047219276, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 278 }, { "clip_ratio": 0.0, - "completion_length": 292.890625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.5, + "completions/max_terminated_length": 241.5, + "completions/mean_length": 120.4375, + "completions/mean_terminated_length": 120.4375, + "completions/min_length": 69.75, + "completions/min_terminated_length": 69.75, "epoch": 0.031229886665733873, - "grad_norm": 0.3777705304986896, - "kl": 0.09979248046875, - "learning_rate": 2.4376623520906255e-06, - "loss": 0.001, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.8254531380803452, + "kl": 0.32275390625, + "learning_rate": 2.4521020945457615e-06, + "loss": 0.0678, + "num_tokens": 8549612.0, + "reward": 0.6036184206604958, + "reward_std": 0.3284572381526232, + "rewards/code_reward/mean": 0.6036184206604958, + "rewards/code_reward/std": 0.3284572381526232, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 279 }, { "clip_ratio": 0.0, - "completion_length": 306.59375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 126.84375, + "completions/mean_terminated_length": 126.84375, + "completions/min_length": 56.25, + "completions/min_terminated_length": 56.25, "epoch": 0.031341821743388835, - "grad_norm": 23.98251858869545, - "kl": 5.38348388671875, - "learning_rate": 2.4232357146976478e-06, - "loss": 0.0275, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 2.4668184273965648, + "kl": 0.4033203125, + "learning_rate": 2.4376623520906255e-06, + "loss": 0.1532, + "num_tokens": 8569279.0, + "reward": 0.18534822203218937, + "reward_std": 0.146333621814847, + "rewards/code_reward/mean": 0.18534822203218937, + "rewards/code_reward/std": 0.1463336320593953, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 280 }, { "clip_ratio": 0.0, - "completion_length": 403.40625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 150.90625, + "completions/mean_terminated_length": 150.90625, + "completions/min_length": 94.5, + "completions/min_terminated_length": 94.5, "epoch": 0.031453756821043795, - "grad_norm": 0.09304733402356336, - "kl": 0.0772705078125, - "learning_rate": 2.408822787679637e-06, - "loss": 0.0008, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.2334442728026884, + "kl": 0.3759765625, + "learning_rate": 2.4232357146976478e-06, + "loss": 0.0028, + "num_tokens": 8597444.0, + "reward": 0.1862155646085739, + "reward_std": 0.10262486711144447, + "rewards/code_reward/mean": 0.1862155646085739, + "rewards/code_reward/std": 0.10262487456202507, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 281 }, { "clip_ratio": 0.0, - "completion_length": 354.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.5, + "completions/max_terminated_length": 510.5, + "completions/mean_length": 210.78125, + "completions/mean_terminated_length": 210.78125, + "completions/min_length": 97.25, + "completions/min_terminated_length": 97.25, "epoch": 0.031565691898698754, - "grad_norm": 0.14440303776607324, - "kl": 0.0906982421875, - "learning_rate": 2.3944241757741475e-06, - "loss": 0.0009, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.484546879319261, + "kl": 0.309326171875, + "learning_rate": 2.408822787679637e-06, + "loss": -0.0366, + "num_tokens": 8622829.0, + "reward": 0.1472295392304659, + "reward_std": 0.10856602992862463, + "rewards/code_reward/mean": 0.1472295392304659, + "rewards/code_reward/std": 0.10856604157015681, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 282 }, { "clip_ratio": 0.0, - "completion_length": 323.765625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.25, + "completions/max_terminated_length": 266.25, + "completions/mean_length": 177.84375, + "completions/mean_terminated_length": 177.84375, + "completions/min_length": 121.5, + "completions/min_terminated_length": 121.5, "epoch": 0.031677626976353714, - "grad_norm": 2.359216668730623, - "kl": 0.90911865234375, - "learning_rate": 2.380040483118097e-06, - "loss": -0.0461, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.6429453484339698, + "kl": 0.318359375, + "learning_rate": 2.3944241757741475e-06, + "loss": 0.0429, + "num_tokens": 8643536.0, + "reward": 0.1551339291036129, + "reward_std": 0.21810386329889297, + "rewards/code_reward/mean": 0.1551339291036129, + "rewards/code_reward/std": 0.21810387633740902, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 283 }, { "clip_ratio": 0.0, - "completion_length": 269.328125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.5, + "completions/max_terminated_length": 218.5, + "completions/mean_length": 132.6875, + "completions/mean_terminated_length": 132.6875, + "completions/min_length": 90.75, + "completions/min_terminated_length": 90.75, "epoch": 0.03178956205400867, - "grad_norm": 0.045061338534023075, - "kl": 0.05657958984375, - "learning_rate": 2.365672313222419e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.9278006659321785, + "kl": 0.294677734375, + "learning_rate": 2.380040483118097e-06, + "loss": -0.0661, + "num_tokens": 8660110.0, + "reward": 0.1566466533113271, + "reward_std": 0.15316736698150635, + "rewards/code_reward/mean": 0.1566466533113271, + "rewards/code_reward/std": 0.1531673688441515, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 284 }, { "clip_ratio": 0.0, - "completion_length": 281.828125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.75, + "completions/max_terminated_length": 200.75, + "completions/mean_length": 111.15625, + "completions/mean_terminated_length": 111.15625, + "completions/min_length": 67.25, + "completions/min_terminated_length": 67.25, "epoch": 0.03190149713166363, - "grad_norm": 0.21772012539025168, - "kl": 0.0853271484375, - "learning_rate": 2.351320268946749e-06, - "loss": 0.0009, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.924152560934373, + "kl": 0.46728515625, + "learning_rate": 2.365672313222419e-06, + "loss": 0.0708, + "num_tokens": 8676963.0, + "reward": 0.3067304156720638, + "reward_std": 0.17022380698472261, + "rewards/code_reward/mean": 0.3067304156720638, + "rewards/code_reward/std": 0.1702238107100129, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 285 }, { "clip_ratio": 0.0, - "completion_length": 314.703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 146.5, + "completions/max_terminated_length": 146.5, + "completions/mean_length": 88.375, + "completions/mean_terminated_length": 88.375, + "completions/min_length": 53.5, + "completions/min_terminated_length": 53.5, "epoch": 0.0320134322093186, - "grad_norm": 0.6734360714201397, - "kl": 0.1312255859375, - "learning_rate": 2.336984952474119e-06, - "loss": 0.0013, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 2.52648816592765, + "kl": 0.4990234375, + "learning_rate": 2.351320268946749e-06, + "loss": -0.0968, + "num_tokens": 8696055.0, + "reward": 0.21885720640420914, + "reward_std": 0.18590925447642803, + "rewards/code_reward/mean": 0.21885720640420914, + "rewards/code_reward/std": 0.18590926192700863, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 286 }, { "clip_ratio": 0.0, - "completion_length": 415.15625, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 949.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 296.96875, + "completions/mean_terminated_length": 238.07143020629883, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, "epoch": 0.03212536728697356, - "grad_norm": 4.987286255355766, - "kl": 0.12139892578125, - "learning_rate": 2.322666965285697e-06, - "loss": -0.0532, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.4087649689505792, + "kl": 0.279296875, + "learning_rate": 2.336984952474119e-06, + "loss": 0.1631, + "num_tokens": 8732022.0, + "reward": 0.12815122242318466, + "reward_std": 0.13949624670203775, + "rewards/code_reward/mean": 0.12815122242318466, + "rewards/code_reward/std": 0.13949625426903367, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 287 }, { "clip_ratio": 0.0, - "completion_length": 400.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.5, + "completions/max_terminated_length": 313.5, + "completions/mean_length": 165.6875, + "completions/mean_terminated_length": 165.6875, + "completions/min_length": 89.75, + "completions/min_terminated_length": 89.75, "epoch": 0.03223730236462852, - "grad_norm": 0.6096977245482182, - "kl": 0.09234619140625, - "learning_rate": 2.3083669081355507e-06, - "loss": -0.0601, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.5911200294302021, + "kl": 0.309326171875, + "learning_rate": 2.322666965285697e-06, + "loss": -0.0499, + "num_tokens": 8752596.0, + "reward": 0.2135441319551319, + "reward_std": 0.1789869824424386, + "rewards/code_reward/mean": 0.2135441319551319, + "rewards/code_reward/std": 0.17898700083605945, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 288 }, { "clip_ratio": 0.0, - "completion_length": 502.640625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.25, + "completions/max_terminated_length": 487.25, + "completions/mean_length": 240.5625, + "completions/mean_terminated_length": 240.5625, + "completions/min_length": 111.5, + "completions/min_terminated_length": 111.5, "epoch": 0.03234923744228348, - "grad_norm": 0.04237782613357625, - "kl": 0.06573486328125, - "learning_rate": 2.2940853810254377e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.6229086799739825, + "kl": 0.305908203125, + "learning_rate": 2.3083669081355507e-06, + "loss": 0.1546, + "num_tokens": 8783550.0, + "reward": 0.060625465121120214, + "reward_std": 0.031893965788185596, + "rewards/code_reward/mean": 0.060625465121120214, + "rewards/code_reward/std": 0.03189396392554045, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 289 }, { "clip_ratio": 0.0, - "completion_length": 324.53125, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 736.75, + "completions/max_terminated_length": 277.75, + "completions/mean_length": 226.15625, + "completions/mean_terminated_length": 167.17411041259766, + "completions/min_length": 81.25, + "completions/min_terminated_length": 81.25, "epoch": 0.032461172519938436, - "grad_norm": 0.029403412259251353, - "kl": 0.06024169921875, - "learning_rate": 2.2798229831796313e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.9678300330589205, + "kl": 0.138671875, + "learning_rate": 2.2940853810254377e-06, + "loss": 0.1927, + "num_tokens": 8806243.0, + "reward": 0.301976312417537, + "reward_std": 0.05864762840792537, + "rewards/code_reward/mean": 0.301976312417537, + "rewards/code_reward/std": 0.058647628873586655, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 290 }, { "clip_ratio": 0.0, - "completion_length": 413.109375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.75, + "completions/max_terminated_length": 419.75, + "completions/mean_length": 161.28125, + "completions/mean_terminated_length": 161.28125, + "completions/min_length": 66.5, + "completions/min_terminated_length": 66.5, "epoch": 0.032573107597593395, - "grad_norm": 0.08363337386352669, - "kl": 0.0560302734375, - "learning_rate": 2.2655803130197816e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 2.0533336308641092, + "kl": 0.43505859375, + "learning_rate": 2.2798229831796313e-06, + "loss": 0.0806, + "num_tokens": 8830396.0, + "reward": 0.08084819512441754, + "reward_std": 0.043647464364767075, + "rewards/code_reward/mean": 0.08084819512441754, + "rewards/code_reward/std": 0.043647464364767075, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 291 }, { "clip_ratio": 0.0, - "completion_length": 318.734375, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 739.75, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 217.4375, + "completions/mean_terminated_length": 157.30357360839844, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, "epoch": 0.032685042675248355, - "grad_norm": 0.023053884997018263, - "kl": 0.06512451171875, - "learning_rate": 2.2513579681398034e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.637832465555356, + "kl": 0.2890625, + "learning_rate": 2.2655803130197816e-06, + "loss": 0.2231, + "num_tokens": 8853858.0, + "reward": 0.2264392450451851, + "reward_std": 0.21030585933476686, + "rewards/code_reward/mean": 0.2264392450451851, + "rewards/code_reward/std": 0.21030588168650866, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 292 }, { "clip_ratio": 0.0, - "completion_length": 460.546875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.5, + "completions/max_terminated_length": 349.5, + "completions/mean_length": 193.40625, + "completions/mean_terminated_length": 193.40625, + "completions/min_length": 119.25, + "completions/min_terminated_length": 119.25, "epoch": 0.032796977752903314, - "grad_norm": 1.460556497688483, - "kl": 0.15765380859375, - "learning_rate": 2.237156545280803e-06, - "loss": -0.026, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.574199000350919, + "kl": 0.384765625, + "learning_rate": 2.2513579681398034e-06, + "loss": 0.0158, + "num_tokens": 8878935.0, + "reward": 0.2135722152888775, + "reward_std": 0.17660537734627724, + "rewards/code_reward/mean": 0.2135722152888775, + "rewards/code_reward/std": 0.17660538339987397, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 293 }, { "clip_ratio": 0.0, - "completion_length": 282.609375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.25, + "completions/max_terminated_length": 311.25, + "completions/mean_length": 177.75, + "completions/mean_terminated_length": 177.75, + "completions/min_length": 91.25, + "completions/min_terminated_length": 91.25, "epoch": 0.03290891283055827, - "grad_norm": 7.935071492001548, - "kl": 1.8072509765625, - "learning_rate": 2.2229766403060403e-06, - "loss": -0.0278, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.6072325452218685, + "kl": 0.363037109375, + "learning_rate": 2.237156545280803e-06, + "loss": 0.0884, + "num_tokens": 8901727.0, + "reward": 0.3473220057785511, + "reward_std": 0.18608891125768423, + "rewards/code_reward/mean": 0.3473220057785511, + "rewards/code_reward/std": 0.18608891125768423, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 294 }, { "clip_ratio": 0.0, - "completion_length": 385.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.75, + "completions/max_terminated_length": 419.75, + "completions/mean_length": 209.28125, + "completions/mean_terminated_length": 209.28125, + "completions/min_length": 105.75, + "completions/min_terminated_length": 105.75, "epoch": 0.03302084790821324, - "grad_norm": 0.01681019430958899, - "kl": 0.0477294921875, - "learning_rate": 2.2088188481759305e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.6249151148932088, + "kl": 0.248779296875, + "learning_rate": 2.2229766403060403e-06, + "loss": -0.0182, + "num_tokens": 8925072.0, + "reward": 0.307357229758054, + "reward_std": 0.13686690758913755, + "rewards/code_reward/mean": 0.307357229758054, + "rewards/code_reward/std": 0.13686690386384726, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 295 }, { "clip_ratio": 0.0, - "completion_length": 414.484375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 178.40625, + "completions/mean_terminated_length": 178.40625, + "completions/min_length": 90.25, + "completions/min_terminated_length": 90.25, "epoch": 0.0331327829858682, - "grad_norm": 0.01847832809783982, - "kl": 0.04632568359375, - "learning_rate": 2.194683762923073e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.6672544303984767, + "kl": 0.32666015625, + "learning_rate": 2.2088188481759305e-06, + "loss": 0.0041, + "num_tokens": 8942397.0, + "reward": 0.17835952731547877, + "reward_std": 0.14297430915758014, + "rewards/code_reward/mean": 0.17835952731547877, + "rewards/code_reward/std": 0.1429743110202253, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 296 }, { "clip_ratio": 0.0, - "completion_length": 443.78125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 168.09375, + "completions/mean_terminated_length": 168.09375, + "completions/min_length": 82.75, + "completions/min_terminated_length": 82.75, "epoch": 0.03324471806352316, - "grad_norm": 0.02629542045427332, - "kl": 0.0531005859375, - "learning_rate": 2.1805719776273387e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.5915878451347125, + "kl": 0.42919921875, + "learning_rate": 2.194683762923073e-06, + "loss": -0.0342, + "num_tokens": 8967448.0, + "reward": 0.23388671875, + "reward_std": 0.09839868592098355, + "rewards/code_reward/mean": 0.23388671875, + "rewards/code_reward/std": 0.09839868592098355, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 297 }, { "clip_ratio": 0.0, - "completion_length": 371.09375, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1053.75, + "completions/max_terminated_length": 626.25, + "completions/mean_length": 331.40625, + "completions/mean_terminated_length": 275.5669708251953, + "completions/min_length": 122.5, + "completions/min_terminated_length": 122.5, "epoch": 0.03335665314117812, - "grad_norm": 0.039099364449013255, - "kl": 0.0689697265625, - "learning_rate": 2.166484084390974e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.1259318778679914, + "kl": 0.24853515625, + "learning_rate": 2.1805719776273387e-06, + "loss": 0.1031, + "num_tokens": 8996029.0, + "reward": 0.21752450801432133, + "reward_std": 0.22587602585554123, + "rewards/code_reward/mean": 0.21752450801432133, + "rewards/code_reward/std": 0.22587604075670242, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 298 }, { "clip_ratio": 0.0, - "completion_length": 348.171875, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 810.25, + "completions/max_terminated_length": 459.5, + "completions/mean_length": 282.46875, + "completions/mean_terminated_length": 228.1651840209961, + "completions/min_length": 90.25, + "completions/min_terminated_length": 90.25, "epoch": 0.03346858821883308, - "grad_norm": 0.054692981575624876, - "kl": 0.067138671875, - "learning_rate": 2.1524206743137636e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.327314761495312, + "kl": 0.2470703125, + "learning_rate": 2.166484084390974e-06, + "loss": -0.0158, + "num_tokens": 9024660.0, + "reward": 0.4248046875, + "reward_std": 0.41644760966300964, + "rewards/code_reward/mean": 0.4248046875, + "rewards/code_reward/std": 0.41644763946533203, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 299 }, { "clip_ratio": 0.0, - "completion_length": 453.71875, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 816.5, + "completions/max_terminated_length": 452.75, + "completions/mean_length": 257.34375, + "completions/mean_terminated_length": 201.17857360839844, + "completions/min_length": 88.5, + "completions/min_terminated_length": 88.5, "epoch": 0.033580523296488037, - "grad_norm": 0.020024754177952568, - "kl": 0.04815673828125, - "learning_rate": 2.1383823374682287e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.2867387634028893, + "kl": 0.251953125, + "learning_rate": 2.1524206743137636e-06, + "loss": -0.2782, + "num_tokens": 9049823.0, + "reward": 0.2559996712952852, + "reward_std": 0.17017995577771217, + "rewards/code_reward/mean": 0.2559996712952852, + "rewards/code_reward/std": 0.1701799522852525, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 300 }, { "clip_ratio": 0.0, - "completion_length": 397.609375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.5, + "completions/max_terminated_length": 544.5, + "completions/mean_length": 296.03125, + "completions/mean_terminated_length": 296.03125, + "completions/min_length": 139.25, + "completions/min_terminated_length": 139.25, "epoch": 0.033692458374142996, - "grad_norm": 0.7962215451972806, - "kl": 0.0548095703125, - "learning_rate": 2.124369662874868e-06, - "loss": 0.0015, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.3498052592722694, + "kl": 0.25732421875, + "learning_rate": 2.1383823374682287e-06, + "loss": 0.0851, + "num_tokens": 9079328.0, + "reward": 0.38671875, + "reward_std": 0.3085732739418745, + "rewards/code_reward/mean": 0.38671875, + "rewards/code_reward/std": 0.3085732851177454, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 301 }, { "clip_ratio": 0.0, - "completion_length": 439.640625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 205.40625, + "completions/mean_terminated_length": 205.40625, + "completions/min_length": 79.75, + "completions/min_terminated_length": 79.75, "epoch": 0.033804393451797955, - "grad_norm": 0.03594032335785641, - "kl": 0.04754638671875, - "learning_rate": 2.110383238477441e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.1786971151666847, + "kl": 0.253662109375, + "learning_rate": 2.124369662874868e-06, + "loss": 0.0537, + "num_tokens": 9103917.0, + "reward": 0.1022359449416399, + "reward_std": 0.1313032009638846, + "rewards/code_reward/mean": 0.1022359449416399, + "rewards/code_reward/std": 0.13130320748314261, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 302 }, { "clip_ratio": 0.0, - "completion_length": 382.828125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 785.5, + "completions/max_terminated_length": 785.5, + "completions/mean_length": 290.25, + "completions/mean_terminated_length": 290.25, + "completions/min_length": 83.75, + "completions/min_terminated_length": 83.75, "epoch": 0.033916328529452915, - "grad_norm": 0.02676423646625272, - "kl": 0.0552978515625, - "learning_rate": 2.096423651118305e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.2753854412194898, + "kl": 0.23779296875, + "learning_rate": 2.110383238477441e-06, + "loss": 0.1839, + "num_tokens": 9131989.0, + "reward": 0.3315134688746184, + "reward_std": 0.21372198988683522, + "rewards/code_reward/mean": 0.3315134688746184, + "rewards/code_reward/std": 0.21372198243625462, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 303 }, { "clip_ratio": 0.0, - "completion_length": 364.1875, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 834.75, + "completions/max_terminated_length": 440.25, + "completions/mean_length": 297.6875, + "completions/mean_terminated_length": 241.50446701049805, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, "epoch": 0.03402826360710788, - "grad_norm": 0.021001193681821163, - "kl": 0.04876708984375, - "learning_rate": 2.082491486513788e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.9686437804358383, + "kl": 0.25732421875, + "learning_rate": 2.096423651118305e-06, + "loss": 0.0919, + "num_tokens": 9155547.0, + "reward": 0.18424479104578495, + "reward_std": 0.2077017817646265, + "rewards/code_reward/mean": 0.18424479104578495, + "rewards/code_reward/std": 0.20770180504769087, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 304 }, { "clip_ratio": 0.0, - "completion_length": 353.9375, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 807.0, + "completions/max_terminated_length": 623.5, + "completions/mean_length": 361.53125, + "completions/mean_terminated_length": 315.49554443359375, + "completions/min_length": 142.25, + "completions/min_terminated_length": 142.25, "epoch": 0.03414019868476284, - "grad_norm": 0.015515949438366839, - "kl": 0.0455322265625, - "learning_rate": 2.0685873292296116e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.203308108515461, + "kl": 0.24169921875, + "learning_rate": 2.082491486513788e-06, + "loss": 0.0801, + "num_tokens": 9183796.0, + "reward": 0.22409930732101202, + "reward_std": 0.2232498861849308, + "rewards/code_reward/mean": 0.22409930732101202, + "rewards/code_reward/std": 0.22324990667402744, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 305 }, { "clip_ratio": 0.0, - "completion_length": 502.484375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.25, + "completions/max_terminated_length": 560.25, + "completions/mean_length": 283.53125, + "completions/mean_terminated_length": 283.53125, + "completions/min_length": 173.75, + "completions/min_terminated_length": 173.75, "epoch": 0.0342521337624178, - "grad_norm": 0.10070236191895302, - "kl": 0.042633056640625, - "learning_rate": 2.054711762656369e-06, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.0443749007299015, + "kl": 0.22998046875, + "learning_rate": 2.0685873292296116e-06, + "loss": -0.0535, + "num_tokens": 9212077.0, + "reward": 0.3671575216576457, + "reward_std": 0.18197334744036198, + "rewards/code_reward/mean": 0.3671575216576457, + "rewards/code_reward/std": 0.18197335489094257, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 306 }, { "clip_ratio": 0.0, - "completion_length": 423.890625, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 959.25, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 339.875, + "completions/mean_terminated_length": 283.05357360839844, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, "epoch": 0.03436406884007276, - "grad_norm": 8.449530301304263, - "kl": 0.17718505859375, - "learning_rate": 2.040865368985044e-06, - "loss": 0.0072, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.9280325752065158, + "kl": 0.26708984375, + "learning_rate": 2.054711762656369e-06, + "loss": 0.0166, + "num_tokens": 9245945.0, + "reward": 0.20142045244574547, + "reward_std": 0.1855016816407442, + "rewards/code_reward/mean": 0.20142045244574547, + "rewards/code_reward/std": 0.18550169840455055, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 307 }, { "clip_ratio": 0.0, - "completion_length": 365.953125, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 917.75, + "completions/max_terminated_length": 533.5, + "completions/mean_length": 328.03125, + "completions/mean_terminated_length": 271.48661041259766, + "completions/min_length": 96.25, + "completions/min_terminated_length": 96.25, "epoch": 0.03447600391772772, - "grad_norm": 4.554366640679164, - "kl": 0.10430908203125, - "learning_rate": 2.027048729182583e-06, - "loss": 0.0096, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.110612410740818, + "kl": 0.26904296875, + "learning_rate": 2.040865368985044e-06, + "loss": 0.1496, + "num_tokens": 9271114.0, + "reward": 0.2448565848171711, + "reward_std": 0.2629811018705368, + "rewards/code_reward/mean": 0.2448565848171711, + "rewards/code_reward/std": 0.26298110000789165, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 308 }, { "clip_ratio": 0.0, - "completion_length": 400.78125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.5, + "completions/max_terminated_length": 690.5, + "completions/mean_length": 286.53125, + "completions/mean_terminated_length": 286.53125, + "completions/min_length": 103.25, + "completions/min_terminated_length": 103.25, "epoch": 0.03458793899538268, - "grad_norm": 1.4037888132598981, - "kl": 0.065673828125, - "learning_rate": 2.0132624229675205e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.9114432475860804, + "kl": 0.25927734375, + "learning_rate": 2.027048729182583e-06, + "loss": 0.0919, + "num_tokens": 9294987.0, + "reward": 0.30750996619462967, + "reward_std": 0.21396427508443594, + "rewards/code_reward/mean": 0.30750996619462967, + "rewards/code_reward/std": 0.21396427601575851, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 309 }, { "clip_ratio": 0.0, - "completion_length": 356.390625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.75, + "completions/max_terminated_length": 448.75, + "completions/mean_length": 235.46875, + "completions/mean_terminated_length": 235.46875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, "epoch": 0.03469987407303764, - "grad_norm": 0.08573893743799793, - "kl": 0.04962158203125, - "learning_rate": 1.9995070287856546e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.1384514018728271, + "kl": 0.2802734375, + "learning_rate": 2.0132624229675205e-06, + "loss": 0.0654, + "num_tokens": 9320514.0, + "reward": 0.31562499701976776, + "reward_std": 0.12151388870552182, + "rewards/code_reward/mean": 0.31562499701976776, + "rewards/code_reward/std": 0.12151389149948955, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 310 }, { "clip_ratio": 0.0, - "completion_length": 412.375, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1206.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 407.65625, + "completions/mean_terminated_length": 296.8125, + "completions/min_length": 103.75, + "completions/min_terminated_length": 103.75, "epoch": 0.034811809150692596, - "grad_norm": 14.764228813015494, - "kl": 0.43438720703125, - "learning_rate": 1.985783123785774e-06, - "loss": -0.03, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.0596592897748647, + "kl": 0.222412109375, + "learning_rate": 1.9995070287856546e-06, + "loss": 0.1233, + "num_tokens": 9352039.0, + "reward": 0.10777858644723892, + "reward_std": 0.1648613102734089, + "rewards/code_reward/mean": 0.10777858644723892, + "rewards/code_reward/std": 0.1648613139986992, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 311 }, { "clip_ratio": 0.0, - "completion_length": 279.03125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.25, + "completions/max_terminated_length": 472.25, + "completions/mean_length": 266.09375, + "completions/mean_terminated_length": 266.09375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, "epoch": 0.034923744228347556, - "grad_norm": 0.2193000836614737, - "kl": 0.067138671875, - "learning_rate": 1.9720912837954486e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.3278661225451498, + "kl": 0.267333984375, + "learning_rate": 1.985783123785774e-06, + "loss": 0.1761, + "num_tokens": 9375930.0, + "reward": 0.6153363855555654, + "reward_std": 0.09626698028296232, + "rewards/code_reward/mean": 0.6153363855555654, + "rewards/code_reward/std": 0.09626698028296232, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 312 }, { "clip_ratio": 0.0, - "completion_length": 385.421875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.25, + "completions/max_terminated_length": 448.25, + "completions/mean_length": 285.46875, + "completions/mean_terminated_length": 285.46875, + "completions/min_length": 127.5, + "completions/min_terminated_length": 127.5, "epoch": 0.035035679306002515, - "grad_norm": 0.1633478285227259, - "kl": 0.05523681640625, - "learning_rate": 1.958432083296862e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.2384888571370045, + "kl": 0.280029296875, + "learning_rate": 1.9720912837954486e-06, + "loss": 0.0208, + "num_tokens": 9399217.0, + "reward": 0.26853298489004374, + "reward_std": 0.2630241848528385, + "rewards/code_reward/mean": 0.26853298489004374, + "rewards/code_reward/std": 0.26302417647093534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 313 }, { "clip_ratio": 0.0, - "completion_length": 388.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 755.0, + "completions/max_terminated_length": 755.0, + "completions/mean_length": 319.59375, + "completions/mean_terminated_length": 319.59375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, "epoch": 0.03514761438365748, - "grad_norm": 0.145955357953582, - "kl": 0.051025390625, - "learning_rate": 1.9448060954027093e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.5140046934528983, + "kl": 0.2685546875, + "learning_rate": 1.958432083296862e-06, + "loss": 0.1386, + "num_tokens": 9427972.0, + "reward": 0.3911227434873581, + "reward_std": 0.2716307928785682, + "rewards/code_reward/mean": 0.3911227434873581, + "rewards/code_reward/std": 0.2716307919472456, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 314 }, { "clip_ratio": 0.0, - "completion_length": 378.703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.25, + "completions/max_terminated_length": 329.25, + "completions/mean_length": 162.96875, + "completions/mean_terminated_length": 162.96875, + "completions/min_length": 85.75, + "completions/min_terminated_length": 85.75, "epoch": 0.03525954946131244, - "grad_norm": 7.683536847345325, - "kl": 0.1790771484375, - "learning_rate": 1.931213891832153e-06, - "loss": -0.0279, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.4191024334529987, + "kl": 0.25927734375, + "learning_rate": 1.9448060954027093e-06, + "loss": 0.0713, + "num_tokens": 9447267.0, + "reward": 0.5250866562128067, + "reward_std": 0.19822602486237884, + "rewards/code_reward/mean": 0.5250866562128067, + "rewards/code_reward/std": 0.19822603231295943, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 315 }, { "clip_ratio": 0.0, - "completion_length": 410.09375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.5, + "completions/max_terminated_length": 490.5, + "completions/mean_length": 262.53125, + "completions/mean_terminated_length": 262.53125, + "completions/min_length": 117.5, + "completions/min_terminated_length": 117.5, "epoch": 0.0353714845389674, - "grad_norm": 0.19129137636650337, - "kl": 0.05731201171875, - "learning_rate": 1.9176560428868336e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.2448753208331158, + "kl": 0.245361328125, + "learning_rate": 1.931213891832153e-06, + "loss": 0.251, + "num_tokens": 9471212.0, + "reward": 0.19074449688196182, + "reward_std": 0.07534042606130242, + "rewards/code_reward/mean": 0.19074449688196182, + "rewards/code_reward/std": 0.07534042652696371, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 316 }, { "clip_ratio": 0.0, - "completion_length": 390.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.25, + "completions/max_terminated_length": 464.25, + "completions/mean_length": 219.875, + "completions/mean_terminated_length": 219.875, + "completions/min_length": 93.25, + "completions/min_terminated_length": 93.25, "epoch": 0.03548341961662236, - "grad_norm": 6.515657611937849, - "kl": 1.45611572265625, - "learning_rate": 1.9041331174269373e-06, - "loss": -0.0073, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.3212499853268918, + "kl": 0.286865234375, + "learning_rate": 1.9176560428868336e-06, + "loss": -0.0392, + "num_tokens": 9494912.0, + "reward": 0.23464674223214388, + "reward_std": 0.1325080880196765, + "rewards/code_reward/mean": 0.23464674223214388, + "rewards/code_reward/std": 0.13250808895099908, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 317 }, { "clip_ratio": 0.0, - "completion_length": 454.234375, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 814.5, + "completions/max_terminated_length": 676.5, + "completions/mean_length": 313.59375, + "completions/mean_terminated_length": 265.1071472167969, + "completions/min_length": 79.5, + "completions/min_terminated_length": 79.5, "epoch": 0.03559535469427732, - "grad_norm": 0.2025260783774205, - "kl": 0.05340576171875, - "learning_rate": 1.8906456828473341e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.7038540009433336, + "kl": 0.28857421875, + "learning_rate": 1.9041331174269373e-06, + "loss": 0.4071, + "num_tokens": 9524787.0, + "reward": 0.3640685440041125, + "reward_std": 0.17312923236750066, + "rewards/code_reward/mean": 0.3640685440041125, + "rewards/code_reward/std": 0.173129228875041, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 318 }, { "clip_ratio": 0.0, - "completion_length": 407.5625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 232.84375, + "completions/mean_terminated_length": 232.84375, + "completions/min_length": 120.25, + "completions/min_terminated_length": 120.25, "epoch": 0.03570728977193228, - "grad_norm": 4.576351957328261, - "kl": 1.09912109375, - "learning_rate": 1.8771943050537656e-06, - "loss": -0.0441, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.3967647406257568, + "kl": 0.264892578125, + "learning_rate": 1.8906456828473341e-06, + "loss": 0.0554, + "num_tokens": 9548390.0, + "reward": 0.19295948650687933, + "reward_std": 0.11571824550628662, + "rewards/code_reward/mean": 0.19295948650687933, + "rewards/code_reward/std": 0.1157182501628995, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 319 }, { "clip_ratio": 0.0, - "completion_length": 414.34375, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 901.25, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 317.78125, + "completions/mean_terminated_length": 262.4151840209961, + "completions/min_length": 105.25, + "completions/min_terminated_length": 105.25, "epoch": 0.03581922484958724, - "grad_norm": 0.04886630482006485, - "kl": 0.0460205078125, - "learning_rate": 1.8637795484391046e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.2881984024223554, + "kl": 0.2744140625, + "learning_rate": 1.8771943050537656e-06, + "loss": -0.0248, + "num_tokens": 9578255.0, + "reward": 0.0896820523776114, + "reward_std": 0.08542403136380017, + "rewards/code_reward/mean": 0.0896820523776114, + "rewards/code_reward/std": 0.08542404044419527, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 320 }, { "clip_ratio": 0.0, - "completion_length": 456.3125, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1355.5, + "completions/max_terminated_length": 495.5, + "completions/mean_length": 346.8125, + "completions/mean_terminated_length": 230.4151840209961, + "completions/min_length": 79.5, + "completions/min_terminated_length": 79.5, "epoch": 0.0359311599272422, - "grad_norm": 0.21065449469813613, - "kl": 0.0640869140625, - "learning_rate": 1.8504019758596698e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.2010225217377024, + "kl": 0.206298828125, + "learning_rate": 1.8637795484391046e-06, + "loss": 0.562, + "num_tokens": 9612289.0, + "reward": 0.38001057505607605, + "reward_std": 0.18961793556809425, + "rewards/code_reward/mean": 0.38001057505607605, + "rewards/code_reward/std": 0.18961793649941683, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 321 }, { "clip_ratio": 0.0, - "completion_length": 353.09375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 219.4375, + "completions/mean_terminated_length": 219.4375, + "completions/min_length": 105.5, + "completions/min_terminated_length": 105.5, "epoch": 0.036043095004897156, - "grad_norm": 4.883286184793087, - "kl": 1.3126220703125, - "learning_rate": 1.8370621486116163e-06, - "loss": -0.0417, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.188208321486219, + "kl": 0.26806640625, + "learning_rate": 1.8504019758596698e-06, + "loss": -0.046, + "num_tokens": 9634663.0, + "reward": 0.1651124432682991, + "reward_std": 0.14856510423123837, + "rewards/code_reward/mean": 0.1651124432682991, + "rewards/code_reward/std": 0.14856510609388351, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 322 }, { "clip_ratio": 0.0, - "completion_length": 396.03125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.25, + "completions/max_terminated_length": 482.25, + "completions/mean_length": 215.125, + "completions/mean_terminated_length": 215.125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, "epoch": 0.03615503008255212, - "grad_norm": 2.4312943229919903, - "kl": 0.7298583984375, - "learning_rate": 1.823760626407377e-06, - "loss": -0.0482, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.1155401373073985, + "kl": 0.293701171875, + "learning_rate": 1.8370621486116163e-06, + "loss": 0.172, + "num_tokens": 9655579.0, + "reward": 0.074991176254116, + "reward_std": 0.06526870373636484, + "rewards/code_reward/mean": 0.074991176254116, + "rewards/code_reward/std": 0.06526870559900999, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 323 }, { "clip_ratio": 0.0, - "completion_length": 366.21875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.75, + "completions/max_terminated_length": 332.75, + "completions/mean_length": 169.78125, + "completions/mean_terminated_length": 169.78125, + "completions/min_length": 57.5, + "completions/min_terminated_length": 57.5, "epoch": 0.03626696516020708, - "grad_norm": 0.023794241531636316, - "kl": 0.0516357421875, - "learning_rate": 1.8104979673521838e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.4175182217669504, + "kl": 0.36181640625, + "learning_rate": 1.823760626407377e-06, + "loss": 0.0677, + "num_tokens": 9678716.0, + "reward": 0.5682446430437267, + "reward_std": 0.253071456681937, + "rewards/code_reward/mean": 0.5682446430437267, + "rewards/code_reward/std": 0.2530714562162757, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 324 }, { "clip_ratio": 0.0, - "completion_length": 338.34375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 195.03125, + "completions/mean_terminated_length": 195.03125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, "epoch": 0.03637890023786204, - "grad_norm": 0.06501484692075148, - "kl": 0.0458984375, - "learning_rate": 1.7972747279206482e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.466909812259168, + "kl": 0.302490234375, + "learning_rate": 1.8104979673521838e-06, + "loss": 0.0551, + "num_tokens": 9697405.0, + "reward": 0.3620302341878414, + "reward_std": 0.24883326888084412, + "rewards/code_reward/mean": 0.3620302341878414, + "rewards/code_reward/std": 0.2488332763314247, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 325 }, { "clip_ratio": 0.0, - "completion_length": 392.140625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.75, + "completions/max_terminated_length": 393.75, + "completions/mean_length": 216.21875, + "completions/mean_terminated_length": 216.21875, + "completions/min_length": 80.5, + "completions/min_terminated_length": 80.5, "epoch": 0.036490835315517, - "grad_norm": 0.0554933350153533, - "kl": 0.045013427734375, - "learning_rate": 1.7840914629334122e-06, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.6340135294611648, + "kl": 0.302978515625, + "learning_rate": 1.7972747279206482e-06, + "loss": 0.0425, + "num_tokens": 9716260.0, + "reward": 0.2104739099740982, + "reward_std": 0.11729209683835506, + "rewards/code_reward/mean": 0.2104739099740982, + "rewards/code_reward/std": 0.11729210242629051, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 326 }, { "clip_ratio": 0.0, - "completion_length": 361.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 241.09375, + "completions/mean_terminated_length": 241.09375, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, "epoch": 0.03660277039317196, - "grad_norm": 0.034907067888093626, - "kl": 0.05291748046875, - "learning_rate": 1.7709487255338731e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.1688628621437198, + "kl": 0.2314453125, + "learning_rate": 1.7840914629334122e-06, + "loss": -0.0012, + "num_tokens": 9739031.0, + "reward": 0.16952253691852093, + "reward_std": 0.042415026342496276, + "rewards/code_reward/mean": 0.16952253691852093, + "rewards/code_reward/std": 0.04241502704098821, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 327 }, { "clip_ratio": 0.0, - "completion_length": 375.453125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.5, + "completions/max_terminated_length": 523.5, + "completions/mean_length": 207.65625, + "completions/mean_terminated_length": 207.65625, + "completions/min_length": 90.5, + "completions/min_terminated_length": 90.5, "epoch": 0.03671470547082692, - "grad_norm": 0.04733189685853122, - "kl": 0.0457763671875, - "learning_rate": 1.7578470671649684e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.0941067328247338, + "kl": 0.2578125, + "learning_rate": 1.7709487255338731e-06, + "loss": 0.0704, + "num_tokens": 9761348.0, + "reward": 0.21878245938569307, + "reward_std": 0.10285742627456784, + "rewards/code_reward/mean": 0.21878245938569307, + "rewards/code_reward/std": 0.10285743046551943, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 328 }, { "clip_ratio": 0.0, - "completion_length": 337.875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.5, + "completions/max_terminated_length": 244.5, + "completions/mean_length": 134.71875, + "completions/mean_terminated_length": 134.71875, + "completions/min_length": 77.75, + "completions/min_terminated_length": 77.75, "epoch": 0.03682664054848188, - "grad_norm": 8.191477588254072, - "kl": 0.31414794921875, - "learning_rate": 1.744787037546045e-06, - "loss": -0.0328, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.9134234308602869, + "kl": 0.34326171875, + "learning_rate": 1.7578470671649684e-06, + "loss": 0.0705, + "num_tokens": 9781267.0, + "reward": 0.17752246744930744, + "reward_std": 0.12675740150734782, + "rewards/code_reward/mean": 0.17752246744930744, + "rewards/code_reward/std": 0.12675740336999297, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 329 }, { "clip_ratio": 0.0, - "completion_length": 380.625, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 795.0, + "completions/max_terminated_length": 382.5, + "completions/mean_length": 250.09375, + "completions/mean_terminated_length": 192.71875381469727, + "completions/min_length": 71.5, + "completions/min_terminated_length": 71.5, "epoch": 0.03693857562613684, - "grad_norm": 4.485682653488296, - "kl": 0.06427001953125, - "learning_rate": 1.731769184649788e-06, - "loss": -0.0303, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.5415476088332252, + "kl": 0.39501953125, + "learning_rate": 1.744787037546045e-06, + "loss": 0.2168, + "num_tokens": 9808326.0, + "reward": 0.21277573192492127, + "reward_std": 0.23475970514118671, + "rewards/code_reward/mean": 0.21277573192492127, + "rewards/code_reward/std": 0.234759708866477, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 330 }, { "clip_ratio": 0.0, - "completion_length": 421.796875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.75, + "completions/max_terminated_length": 282.75, + "completions/mean_length": 171.125, + "completions/mean_terminated_length": 171.125, + "completions/min_length": 96.25, + "completions/min_terminated_length": 96.25, "epoch": 0.0370505107037918, - "grad_norm": 0.15001206134528802, - "kl": 0.054931640625, - "learning_rate": 1.7187940546792325e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.7106509716957057, + "kl": 0.269775390625, + "learning_rate": 1.731769184649788e-06, + "loss": -0.0451, + "num_tokens": 9829634.0, + "reward": 0.09405737672932446, + "reward_std": 0.1773677747696638, + "rewards/code_reward/mean": 0.09405737672932446, + "rewards/code_reward/std": 0.17736777663230896, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 331 }, { "clip_ratio": 0.0, - "completion_length": 431.671875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.75, + "completions/max_terminated_length": 274.75, + "completions/mean_length": 149.90625, + "completions/mean_terminated_length": 149.90625, + "completions/min_length": 60.5, + "completions/min_terminated_length": 60.5, "epoch": 0.037162445781446764, - "grad_norm": 0.05596585517144101, - "kl": 0.04620361328125, - "learning_rate": 1.7058621920448465e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 2.6731683502607013, + "kl": 0.664306640625, + "learning_rate": 1.7187940546792325e-06, + "loss": 0.0639, + "num_tokens": 9848823.0, + "reward": 0.0996803566813469, + "reward_std": 0.07073929067701101, + "rewards/code_reward/mean": 0.0996803566813469, + "rewards/code_reward/std": 0.0707392911426723, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 332 }, { "clip_ratio": 0.0, - "completion_length": 374.171875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.75, + "completions/max_terminated_length": 291.75, + "completions/mean_length": 160.0, + "completions/mean_terminated_length": 160.0, + "completions/min_length": 67.5, + "completions/min_terminated_length": 67.5, "epoch": 0.03727438085910172, - "grad_norm": 0.21063855683262964, - "kl": 0.05145263671875, - "learning_rate": 1.6929741393416855e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.6268477182520276, + "kl": 0.28955078125, + "learning_rate": 1.7058621920448465e-06, + "loss": 0.0592, + "num_tokens": 9869263.0, + "reward": 0.15218693669885397, + "reward_std": 0.21367748617194593, + "rewards/code_reward/mean": 0.15218693669885397, + "rewards/code_reward/std": 0.21367749362252653, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 333 }, { "clip_ratio": 0.0, - "completion_length": 401.03125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.25, + "completions/max_terminated_length": 377.25, + "completions/mean_length": 193.34375, + "completions/mean_terminated_length": 193.34375, + "completions/min_length": 74.25, + "completions/min_terminated_length": 74.25, "epoch": 0.03738631593675668, - "grad_norm": 14.703670642314776, - "kl": 0.6478271484375, - "learning_rate": 1.6801304373266286e-06, - "loss": -0.0265, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.582329034740645, + "kl": 0.268798828125, + "learning_rate": 1.6929741393416855e-06, + "loss": -0.0098, + "num_tokens": 9902154.0, + "reward": 0.19454657658934593, + "reward_std": 0.20047161541879177, + "rewards/code_reward/mean": 0.19454657658934593, + "rewards/code_reward/std": 0.20047162100672722, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 334 }, { "clip_ratio": 0.0, - "completion_length": 405.28125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.75, + "completions/max_terminated_length": 293.75, + "completions/mean_length": 170.0625, + "completions/mean_terminated_length": 170.0625, + "completions/min_length": 79.5, + "completions/min_terminated_length": 79.5, "epoch": 0.03749825101441164, - "grad_norm": 3.2693646713839533, - "kl": 0.2021484375, - "learning_rate": 1.667331624895689e-06, - "loss": -0.0538, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.5771763046671186, + "kl": 0.250244140625, + "learning_rate": 1.6801304373266286e-06, + "loss": 0.0037, + "num_tokens": 9921964.0, + "reward": 0.20569872483611107, + "reward_std": 0.1606605793349445, + "rewards/code_reward/mean": 0.20569872483611107, + "rewards/code_reward/std": 0.16066057654097676, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 335 }, { "clip_ratio": 0.0, - "completion_length": 464.15625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.75, + "completions/max_terminated_length": 485.75, + "completions/mean_length": 203.8125, + "completions/mean_terminated_length": 203.8125, + "completions/min_length": 88.75, + "completions/min_terminated_length": 88.75, "epoch": 0.0376101860920666, - "grad_norm": 0.0326789044313002, - "kl": 0.06781005859375, - "learning_rate": 1.6545782390614037e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.7807442268855076, + "kl": 0.286865234375, + "learning_rate": 1.667331624895689e-06, + "loss": 0.0622, + "num_tokens": 9952862.0, + "reward": 0.1456711394712329, + "reward_std": 0.22775039146654308, + "rewards/code_reward/mean": 0.1456711394712329, + "rewards/code_reward/std": 0.22775039146654308, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 336 }, { "clip_ratio": 0.0, - "completion_length": 356.953125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.5, + "completions/max_terminated_length": 295.5, + "completions/mean_length": 133.75, + "completions/mean_terminated_length": 133.75, + "completions/min_length": 63.75, + "completions/min_terminated_length": 63.75, "epoch": 0.03772212116972156, - "grad_norm": 3.381374096168454, - "kl": 0.24603271484375, - "learning_rate": 1.6418708149302992e-06, - "loss": -0.0531, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.640447948869111, + "kl": 0.3310546875, + "learning_rate": 1.6545782390614037e-06, + "loss": 0.0577, + "num_tokens": 9970606.0, + "reward": 0.47745162434875965, + "reward_std": 0.23838305938988924, + "rewards/code_reward/mean": 0.47745162434875965, + "rewards/code_reward/std": 0.23838307429105043, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 337 }, { "clip_ratio": 0.0, - "completion_length": 350.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 127.84375, + "completions/mean_terminated_length": 127.84375, + "completions/min_length": 71.75, + "completions/min_terminated_length": 71.75, "epoch": 0.03783405624737652, - "grad_norm": 4.256284487475119, - "kl": 0.145263671875, - "learning_rate": 1.6292098856804423e-06, - "loss": -0.0518, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 2.018865032840707, + "kl": 0.361328125, + "learning_rate": 1.6418708149302992e-06, + "loss": 0.0511, + "num_tokens": 9992753.0, + "reward": 0.4375, + "reward_std": 0.17353582940995693, + "rewards/code_reward/mean": 0.4375, + "rewards/code_reward/std": 0.17353583686053753, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 338 }, { "clip_ratio": 0.0, - "completion_length": 244.484375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 118.28125, + "completions/mean_terminated_length": 118.28125, + "completions/min_length": 63.75, + "completions/min_terminated_length": 63.75, "epoch": 0.03794599132503148, - "grad_norm": 4.758319146409537, - "kl": 0.52178955078125, - "learning_rate": 1.6165959825390661e-06, - "loss": -0.0436, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.3053097026379892, + "kl": 0.3037109375, + "learning_rate": 1.6292098856804423e-06, + "loss": 0.0182, + "num_tokens": 10005258.0, + "reward": 0.3365098312497139, + "reward_std": 0.20095888897776604, + "rewards/code_reward/mean": 0.3365098312497139, + "rewards/code_reward/std": 0.20095889456570148, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 339 }, { "clip_ratio": 0.0, - "completion_length": 352.828125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 233.75, + "completions/mean_terminated_length": 233.75, + "completions/min_length": 133.25, + "completions/min_terminated_length": 133.25, "epoch": 0.03805792640268644, - "grad_norm": 0.06125517244847344, - "kl": 0.0491943359375, - "learning_rate": 1.604029634760284e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.9354111744654126, + "kl": 0.2366943359375, + "learning_rate": 1.6165959825390661e-06, + "loss": 0.0313, + "num_tokens": 10030994.0, + "reward": 0.05368073424324393, + "reward_std": 0.018324243370443583, + "rewards/code_reward/mean": 0.05368073424324393, + "rewards/code_reward/std": 0.018324245465919375, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 340 }, { "clip_ratio": 0.0, - "completion_length": 363.828125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.25, + "completions/max_terminated_length": 428.25, + "completions/mean_length": 181.5625, + "completions/mean_terminated_length": 181.5625, + "completions/min_length": 89.75, + "completions/min_terminated_length": 89.75, "epoch": 0.038169861480341405, - "grad_norm": 0.680605704729485, - "kl": 0.10308837890625, - "learning_rate": 1.59151136960288e-06, - "loss": 0.001, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.6930152926719917, + "kl": 0.37255859375, + "learning_rate": 1.604029634760284e-06, + "loss": 0.0426, + "num_tokens": 10053388.0, + "reward": 0.24092174973338842, + "reward_std": 0.18441250827163458, + "rewards/code_reward/mean": 0.24092174973338842, + "rewards/code_reward/std": 0.18441250827163458, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 341 }, { "clip_ratio": 0.0, - "completion_length": 393.265625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.25, + "completions/max_terminated_length": 191.25, + "completions/mean_length": 103.25, + "completions/mean_terminated_length": 103.25, + "completions/min_length": 50.25, + "completions/min_terminated_length": 50.25, "epoch": 0.038281796557996364, - "grad_norm": 0.06326323223836276, - "kl": 0.04736328125, - "learning_rate": 1.5790417123081903e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 2.4426044186229556, + "kl": 0.513427734375, + "learning_rate": 1.59151136960288e-06, + "loss": -0.1329, + "num_tokens": 10074740.0, + "reward": 0.43505216389894485, + "reward_std": 0.09191552549600601, + "rewards/code_reward/mean": 0.43505216389894485, + "rewards/code_reward/std": 0.09191552549600601, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 342 }, { "clip_ratio": 0.0, - "completion_length": 381.671875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.5, + "completions/max_terminated_length": 375.5, + "completions/mean_length": 191.9375, + "completions/mean_terminated_length": 191.9375, + "completions/min_length": 79.75, + "completions/min_terminated_length": 79.75, "epoch": 0.038393731635651324, - "grad_norm": 0.049854259040433335, - "kl": 0.048828125, - "learning_rate": 1.5666211860780583e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.4272881181440114, + "kl": 0.2998046875, + "learning_rate": 1.5790417123081903e-06, + "loss": 0.0731, + "num_tokens": 10095146.0, + "reward": 0.40253712981939316, + "reward_std": 0.4054878391325474, + "rewards/code_reward/mean": 0.40253712981939316, + "rewards/code_reward/std": 0.40548786148428917, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 343 }, { "clip_ratio": 0.0, - "completion_length": 403.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 190.59375, + "completions/mean_terminated_length": 190.59375, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, "epoch": 0.03850566671330628, - "grad_norm": 0.07874363283266796, - "kl": 0.05059814453125, - "learning_rate": 1.5542503120528918e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.5919937138741604, + "kl": 0.22802734375, + "learning_rate": 1.5666211860780583e-06, + "loss": 0.1869, + "num_tokens": 10115245.0, + "reward": 0.11901041585952044, + "reward_std": 0.06176098808646202, + "rewards/code_reward/mean": 0.11901041585952044, + "rewards/code_reward/std": 0.06176098808646202, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 344 }, { "clip_ratio": 0.0, - "completion_length": 413.3125, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 709.5, + "completions/max_terminated_length": 495.5, + "completions/mean_length": 248.71875, + "completions/mean_terminated_length": 197.40625762939453, + "completions/min_length": 66.5, + "completions/min_terminated_length": 66.5, "epoch": 0.03861760179096124, - "grad_norm": 0.12008769303035174, - "kl": 0.0606689453125, - "learning_rate": 1.5419296092897866e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.8660163924674529, + "kl": 0.5517578125, + "learning_rate": 1.5542503120528918e-06, + "loss": 0.1448, + "num_tokens": 10142828.0, + "reward": 0.4072798676788807, + "reward_std": 0.128664406016469, + "rewards/code_reward/mean": 0.4072798676788807, + "rewards/code_reward/std": 0.128664406016469, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 345 }, { "clip_ratio": 0.0, - "completion_length": 445.28125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 211.875, + "completions/mean_terminated_length": 211.875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, "epoch": 0.0387295368686162, - "grad_norm": 0.10834525189023946, - "kl": 0.0592041015625, - "learning_rate": 1.529659594740755e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.2237817773466955, + "kl": 0.3408203125, + "learning_rate": 1.5419296092897866e-06, + "loss": 0.1399, + "num_tokens": 10168664.0, + "reward": 0.02313591120764613, + "reward_std": 0.02397587802261114, + "rewards/code_reward/mean": 0.02313591120764613, + "rewards/code_reward/std": 0.023975879419595003, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 346 }, { "clip_ratio": 0.0, - "completion_length": 378.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.75, + "completions/max_terminated_length": 286.75, + "completions/mean_length": 148.8125, + "completions/mean_terminated_length": 148.8125, + "completions/min_length": 71.5, + "completions/min_terminated_length": 71.5, "epoch": 0.03884147194627116, - "grad_norm": 0.05706678172063102, - "kl": 0.040863037109375, - "learning_rate": 1.5174407832310338e-06, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 2.0973241700512655, + "kl": 0.444580078125, + "learning_rate": 1.529659594740755e-06, + "loss": 0.0837, + "num_tokens": 10185306.0, + "reward": 0.3948034793138504, + "reward_std": 0.1760760466568172, + "rewards/code_reward/mean": 0.3948034793138504, + "rewards/code_reward/std": 0.17607605503872037, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 347 }, { "clip_ratio": 0.0, - "completion_length": 316.15625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 158.78125, + "completions/mean_terminated_length": 158.78125, + "completions/min_length": 66.75, + "completions/min_terminated_length": 66.75, "epoch": 0.03895340702392612, - "grad_norm": 0.2260930410291458, - "kl": 0.0599365234375, - "learning_rate": 1.5052736874374815e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.4572287618545743, + "kl": 0.26953125, + "learning_rate": 1.5174407832310338e-06, + "loss": 0.0326, + "num_tokens": 10203115.0, + "reward": 0.4470205195248127, + "reward_std": 0.19961272552609444, + "rewards/code_reward/mean": 0.4470205195248127, + "rewards/code_reward/std": 0.19961273297667503, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 348 }, { "clip_ratio": 0.0, - "completion_length": 412.671875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 160.59375, + "completions/mean_terminated_length": 160.59375, + "completions/min_length": 91.75, + "completions/min_terminated_length": 91.75, "epoch": 0.03906534210158108, - "grad_norm": 0.03574155761474703, - "kl": 0.0439453125, - "learning_rate": 1.4931588178670695e-06, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.587914542250274, + "kl": 0.302978515625, + "learning_rate": 1.5052736874374815e-06, + "loss": 0.0176, + "num_tokens": 10226750.0, + "reward": 0.1811899826861918, + "reward_std": 0.15240496955811977, + "rewards/code_reward/mean": 0.1811899826861918, + "rewards/code_reward/std": 0.15240497328341007, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 349 }, { "clip_ratio": 0.0, - "completion_length": 325.40625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.5, + "completions/max_terminated_length": 255.5, + "completions/mean_length": 134.0625, + "completions/mean_terminated_length": 134.0625, + "completions/min_length": 62.25, + "completions/min_terminated_length": 62.25, "epoch": 0.039177277179236046, - "grad_norm": 0.06765325310501516, - "kl": 0.05303955078125, - "learning_rate": 1.4810966828354605e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 2.127239539473459, + "kl": 0.37744140625, + "learning_rate": 1.4931588178670695e-06, + "loss": 0.0222, + "num_tokens": 10248072.0, + "reward": 0.3137185089290142, + "reward_std": 0.06511987652629614, + "rewards/code_reward/mean": 0.3137185089290142, + "rewards/code_reward/std": 0.06511988304555416, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 350 }, { "clip_ratio": 0.0, - "completion_length": 382.625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.25, + "completions/max_terminated_length": 295.25, + "completions/mean_length": 149.15625, + "completions/mean_terminated_length": 149.15625, + "completions/min_length": 61.25, + "completions/min_terminated_length": 61.25, "epoch": 0.039289212256891005, - "grad_norm": 0.45741242496960755, - "kl": 0.063720703125, - "learning_rate": 1.469087788445684e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.4510463271520762, + "kl": 0.26611328125, + "learning_rate": 1.4810966828354605e-06, + "loss": 0.1718, + "num_tokens": 10270941.0, + "reward": 0.18543480592779815, + "reward_std": 0.12552618235349655, + "rewards/code_reward/mean": 0.18543480592779815, + "rewards/code_reward/std": 0.12552619352936745, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 351 }, { "clip_ratio": 0.0, - "completion_length": 415.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.25, + "completions/max_terminated_length": 232.25, + "completions/mean_length": 142.46875, + "completions/mean_terminated_length": 142.46875, + "completions/min_length": 60.5, + "completions/min_terminated_length": 60.5, "epoch": 0.039401147334545965, - "grad_norm": 0.059362152677566817, - "kl": 0.044921875, - "learning_rate": 1.4571326385668965e-06, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 2.0984464768903726, + "kl": 0.345703125, + "learning_rate": 1.469087788445684e-06, + "loss": -0.0558, + "num_tokens": 10291156.0, + "reward": 0.22514494694769382, + "reward_std": 0.23393048718571663, + "rewards/code_reward/mean": 0.22514494694769382, + "rewards/code_reward/std": 0.23393050953745842, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 352 }, { "clip_ratio": 0.0, - "completion_length": 409.265625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.5, + "completions/max_terminated_length": 250.5, + "completions/mean_length": 148.25, + "completions/mean_terminated_length": 148.25, + "completions/min_length": 71.5, + "completions/min_terminated_length": 71.5, "epoch": 0.039513082412200924, - "grad_norm": 0.03486583265069237, - "kl": 0.04302978515625, - "learning_rate": 1.4452317348132434e-06, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.6718356673761792, + "kl": 0.290771484375, + "learning_rate": 1.4571326385668965e-06, + "loss": -0.0597, + "num_tokens": 10315236.0, + "reward": 0.3922019712626934, + "reward_std": 0.3044360801577568, + "rewards/code_reward/mean": 0.3922019712626934, + "rewards/code_reward/std": 0.3044360838830471, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 353 }, { "clip_ratio": 0.0, - "completion_length": 549.171875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.5, + "completions/max_terminated_length": 239.5, + "completions/mean_length": 131.21875, + "completions/mean_terminated_length": 131.21875, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, "epoch": 0.039625017489855884, - "grad_norm": 0.044710017702799705, - "kl": 0.04583740234375, - "learning_rate": 1.4333855765228104e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.884602182380735, + "kl": 0.362060546875, + "learning_rate": 1.4452317348132434e-06, + "loss": 0.1699, + "num_tokens": 10342059.0, + "reward": 0.2568647051230073, + "reward_std": 0.057626438327133656, + "rewards/code_reward/mean": 0.2568647051230073, + "rewards/code_reward/std": 0.05762644065544009, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 354 }, { "clip_ratio": 0.0, - "completion_length": 374.359375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 216.78125, + "completions/mean_terminated_length": 216.78125, + "completions/min_length": 93.25, + "completions/min_terminated_length": 93.25, "epoch": 0.03973695256751084, - "grad_norm": 0.0864511831454649, - "kl": 0.04925537109375, - "learning_rate": 1.421594660736675e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.5608540814114584, + "kl": 0.2548828125, + "learning_rate": 1.4333855765228104e-06, + "loss": 0.0906, + "num_tokens": 10365764.0, + "reward": 0.1356297740712762, + "reward_std": 0.07451130566187203, + "rewards/code_reward/mean": 0.1356297740712762, + "rewards/code_reward/std": 0.07451130612753332, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 355 }, { "clip_ratio": 0.0, - "completion_length": 361.8125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.25, + "completions/max_terminated_length": 319.25, + "completions/mean_length": 133.78125, + "completions/mean_terminated_length": 133.78125, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, "epoch": 0.0398488876451658, - "grad_norm": 0.03699683675596586, - "kl": 0.04742431640625, - "learning_rate": 1.4098594821780476e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.8748645260654178, + "kl": 0.329833984375, + "learning_rate": 1.421594660736675e-06, + "loss": -0.0276, + "num_tokens": 10390429.0, + "reward": 0.4849093444645405, + "reward_std": 0.17599604558199644, + "rewards/code_reward/mean": 0.4849093444645405, + "rewards/code_reward/std": 0.17599604465067387, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 356 }, { "clip_ratio": 0.0, - "completion_length": 362.8125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.5, + "completions/max_terminated_length": 256.5, + "completions/mean_length": 138.90625, + "completions/mean_terminated_length": 138.90625, + "completions/min_length": 55.25, + "completions/min_terminated_length": 55.25, "epoch": 0.03996082272282076, - "grad_norm": 0.07941048515036817, - "kl": 0.05633544921875, - "learning_rate": 1.3981805332315174e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.9096248965574965, + "kl": 0.291015625, + "learning_rate": 1.4098594821780476e-06, + "loss": -0.0702, + "num_tokens": 10411850.0, + "reward": 0.1599155543372035, + "reward_std": 0.14085367415100336, + "rewards/code_reward/mean": 0.1599155543372035, + "rewards/code_reward/std": 0.14085367461666465, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 357 }, { "clip_ratio": 0.0, - "completion_length": 293.390625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 214.6875, + "completions/mean_terminated_length": 214.6875, + "completions/min_length": 64.25, + "completions/min_terminated_length": 64.25, "epoch": 0.04007275780047572, - "grad_norm": 12.642109265209918, - "kl": 1.81060791015625, - "learning_rate": 1.3865583039223929e-06, - "loss": -0.0185, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.9256594634997231, + "kl": 0.1795654296875, + "learning_rate": 1.3981805332315174e-06, + "loss": 0.0024, + "num_tokens": 10434984.0, + "reward": 0.3102440594229847, + "reward_std": 0.08654948882758617, + "rewards/code_reward/mean": 0.3102440594229847, + "rewards/code_reward/std": 0.08654948882758617, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 358 }, { "clip_ratio": 0.0, - "completion_length": 357.390625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.5, + "completions/max_terminated_length": 166.5, + "completions/mean_length": 91.75, + "completions/mean_terminated_length": 91.75, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, "epoch": 0.04018469287813069, - "grad_norm": 0.08034212517091562, - "kl": 0.0445556640625, - "learning_rate": 1.374993281896137e-06, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.8840891544197063, + "kl": 0.41650390625, + "learning_rate": 1.3865583039223929e-06, + "loss": -0.1494, + "num_tokens": 10457064.0, + "reward": 0.215488045476377, + "reward_std": 0.08538101147860289, + "rewards/code_reward/mean": 0.215488045476377, + "rewards/code_reward/std": 0.08538101892918348, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 359 }, { "clip_ratio": 0.0, - "completion_length": 367.171875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.25, + "completions/max_terminated_length": 268.25, + "completions/mean_length": 169.90625, + "completions/mean_terminated_length": 169.90625, + "completions/min_length": 92.25, + "completions/min_terminated_length": 92.25, "epoch": 0.04029662795578565, - "grad_norm": 0.09767901214575864, - "kl": 0.04425048828125, - "learning_rate": 1.3634859523979134e-06, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.543399169236028, + "kl": 0.379638671875, + "learning_rate": 1.374993281896137e-06, + "loss": -0.0768, + "num_tokens": 10481869.0, + "reward": 0.22161551751196384, + "reward_std": 0.23532075341790915, + "rewards/code_reward/mean": 0.22161551751196384, + "rewards/code_reward/std": 0.2353207627311349, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 360 }, { "clip_ratio": 0.0, - "completion_length": 392.578125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.5, + "completions/max_terminated_length": 324.5, + "completions/mean_length": 172.65625, + "completions/mean_terminated_length": 172.65625, + "completions/min_length": 72.5, + "completions/min_terminated_length": 72.5, "epoch": 0.040408563033440606, - "grad_norm": 0.07629861701658103, - "kl": 0.048583984375, - "learning_rate": 1.3520367982522208e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.4416318985457022, + "kl": 0.3037109375, + "learning_rate": 1.3634859523979134e-06, + "loss": -0.0104, + "num_tokens": 10507626.0, + "reward": 0.19733425695449114, + "reward_std": 0.2422337755560875, + "rewards/code_reward/mean": 0.19733425695449114, + "rewards/code_reward/std": 0.2422337755560875, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 361 }, { "clip_ratio": 0.0, - "completion_length": 359.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 132.1875, + "completions/mean_terminated_length": 132.1875, + "completions/min_length": 57.5, + "completions/min_terminated_length": 57.5, "epoch": 0.040520498111095565, - "grad_norm": 2.436909257977141, - "kl": 0.3397216796875, - "learning_rate": 1.3406462998426358e-06, - "loss": -0.0575, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.7918502710892381, + "kl": 0.283935546875, + "learning_rate": 1.3520367982522208e-06, + "loss": 0.0692, + "num_tokens": 10528088.0, + "reward": 0.31125493720173836, + "reward_std": 0.22952541639097035, + "rewards/code_reward/mean": 0.31125493720173836, + "rewards/code_reward/std": 0.22952541639097035, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 362 }, { "clip_ratio": 0.0, - "completion_length": 345.640625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.75, + "completions/max_terminated_length": 342.75, + "completions/mean_length": 163.375, + "completions/mean_terminated_length": 163.375, + "completions/min_length": 65.5, + "completions/min_terminated_length": 65.5, "epoch": 0.040632433188750525, - "grad_norm": 1.7488785493981298, - "kl": 0.41937255859375, - "learning_rate": 1.3293149350916595e-06, - "loss": -0.0315, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.5949676136183217, + "kl": 0.316162109375, + "learning_rate": 1.3406462998426358e-06, + "loss": -0.0073, + "num_tokens": 10547284.0, + "reward": 0.33256023190915585, + "reward_std": 0.23680819105356932, + "rewards/code_reward/mean": 0.33256023190915585, + "rewards/code_reward/std": 0.2368081919848919, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 363 }, { "clip_ratio": 0.0, - "completion_length": 340.265625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.75, + "completions/max_terminated_length": 246.75, + "completions/mean_length": 127.90625, + "completions/mean_terminated_length": 127.90625, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, "epoch": 0.040744368266405484, - "grad_norm": 0.042176241580247326, - "kl": 0.03741455078125, - "learning_rate": 1.3180431794406623e-06, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.889587616740625, + "kl": 0.40771484375, + "learning_rate": 1.3293149350916595e-06, + "loss": -0.0672, + "num_tokens": 10564489.0, + "reward": 0.3066699914634228, + "reward_std": 0.09056703024543822, + "rewards/code_reward/mean": 0.3066699914634228, + "rewards/code_reward/std": 0.09056703303940594, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 364 }, { "clip_ratio": 0.0, - "completion_length": 438.40625, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 926.75, + "completions/max_terminated_length": 492.75, + "completions/mean_length": 281.25, + "completions/mean_terminated_length": 221.99107360839844, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, "epoch": 0.04085630334406044, - "grad_norm": 0.06954183102024396, - "kl": 0.0513916015625, - "learning_rate": 1.3068315058299358e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.1530140221693261, + "kl": 0.166748046875, + "learning_rate": 1.3180431794406623e-06, + "loss": 0.5007, + "num_tokens": 10590441.0, + "reward": 0.25817783176898956, + "reward_std": 0.1975763700902462, + "rewards/code_reward/mean": 0.25817783176898956, + "rewards/code_reward/std": 0.1975763738155365, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 365 }, { "clip_ratio": 0.0, - "completion_length": 384.453125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 214.03125, + "completions/mean_terminated_length": 214.03125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, "epoch": 0.0409682384217154, - "grad_norm": 0.0585988602116313, - "kl": 0.03924560546875, - "learning_rate": 1.2956803846788503e-06, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.5427579182120241, + "kl": 0.2431640625, + "learning_rate": 1.3068315058299358e-06, + "loss": 0.0483, + "num_tokens": 10611458.0, + "reward": 0.2973039257340133, + "reward_std": 0.24095657613361254, + "rewards/code_reward/mean": 0.2973039257340133, + "rewards/code_reward/std": 0.2409565910929814, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 366 }, { "clip_ratio": 0.0, - "completion_length": 369.34375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 169.53125, + "completions/mean_terminated_length": 169.53125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, "epoch": 0.04108017349937036, - "grad_norm": 0.029227802328528226, - "kl": 0.05963134765625, - "learning_rate": 1.284590283866116e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.5032805435473915, + "kl": 0.297119140625, + "learning_rate": 1.2956803846788503e-06, + "loss": 0.0369, + "num_tokens": 10640499.0, + "reward": 0.21608419064432383, + "reward_std": 0.08488713996484876, + "rewards/code_reward/mean": 0.21608419064432383, + "rewards/code_reward/std": 0.08488714415580034, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 367 }, { "clip_ratio": 0.0, - "completion_length": 468.015625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 154.53125, + "completions/mean_terminated_length": 154.53125, + "completions/min_length": 63.5, + "completions/min_terminated_length": 63.5, "epoch": 0.04119210857702533, - "grad_norm": 0.024030066071591895, - "kl": 0.039306640625, - "learning_rate": 1.2735616687101518e-06, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.2609885337673685, + "kl": 0.3779296875, + "learning_rate": 1.284590283866116e-06, + "loss": -0.103, + "num_tokens": 10665612.0, + "reward": 0.6632775068283081, + "reward_std": 0.22238866239786148, + "rewards/code_reward/mean": 0.6632775068283081, + "rewards/code_reward/std": 0.22238866239786148, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 368 }, { "clip_ratio": 0.0, - "completion_length": 345.578125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.25, + "completions/max_terminated_length": 287.25, + "completions/mean_length": 146.5, + "completions/mean_terminated_length": 146.5, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, "epoch": 0.04130404365468029, - "grad_norm": 0.022956778080715768, - "kl": 0.04571533203125, - "learning_rate": 1.2625950019495614e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.6198013908337032, + "kl": 0.283203125, + "learning_rate": 1.2735616687101518e-06, + "loss": 0.0275, + "num_tokens": 10687588.0, + "reward": 0.04903295123949647, + "reward_std": 0.02168478211387992, + "rewards/code_reward/mean": 0.04903295123949647, + "rewards/code_reward/std": 0.02168478397652507, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 369 }, { "clip_ratio": 0.0, - "completion_length": 419.140625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 169.125, + "completions/mean_terminated_length": 169.125, + "completions/min_length": 76.5, + "completions/min_terminated_length": 76.5, "epoch": 0.04141597873233525, - "grad_norm": 0.03031369908455225, - "kl": 0.04296875, - "learning_rate": 1.251690743723718e-06, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.9528743354172768, + "kl": 0.283203125, + "learning_rate": 1.2625950019495614e-06, + "loss": 0.0836, + "num_tokens": 10710032.0, + "reward": 0.17378074233420193, + "reward_std": 0.162479427177459, + "rewards/code_reward/mean": 0.17378074233420193, + "rewards/code_reward/std": 0.16247944394126534, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 370 }, { "clip_ratio": 0.0, - "completion_length": 327.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.75, + "completions/max_terminated_length": 193.75, + "completions/mean_length": 113.15625, + "completions/mean_terminated_length": 113.15625, + "completions/min_length": 53.25, + "completions/min_terminated_length": 53.25, "epoch": 0.041527913809990206, - "grad_norm": 1.4619752022004908, - "kl": 0.42498779296875, - "learning_rate": 1.2408493515534581e-06, - "loss": -0.0518, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.5988336537399075, + "kl": 0.29150390625, + "learning_rate": 1.251690743723718e-06, + "loss": -0.0146, + "num_tokens": 10732597.0, + "reward": 0.6325892880558968, + "reward_std": 0.16140316799283028, + "rewards/code_reward/mean": 0.6325892880558968, + "rewards/code_reward/std": 0.16140317544341087, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 371 }, { "clip_ratio": 0.0, - "completion_length": 403.390625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 107.4375, + "completions/mean_terminated_length": 107.4375, + "completions/min_length": 54.5, + "completions/min_terminated_length": 54.5, "epoch": 0.041639848887645166, - "grad_norm": 0.06787211806577707, - "kl": 0.04547119140625, - "learning_rate": 1.2300712803218834e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.5312147706022616, + "kl": 0.34375, + "learning_rate": 1.2408493515534581e-06, + "loss": 0.0191, + "num_tokens": 10749987.0, + "reward": 0.47150277020409703, + "reward_std": 0.04196681221947074, + "rewards/code_reward/mean": 0.47150277020409703, + "rewards/code_reward/std": 0.04196681268513203, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 372 }, { "clip_ratio": 0.0, - "completion_length": 384.984375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.5, + "completions/max_terminated_length": 254.5, + "completions/mean_length": 179.5625, + "completions/mean_terminated_length": 179.5625, + "completions/min_length": 109.25, + "completions/min_terminated_length": 109.25, "epoch": 0.041751783965300125, - "grad_norm": 0.844949833560224, - "kl": 0.04034423828125, - "learning_rate": 1.2193569822552772e-06, - "loss": 0.0324, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.2602835048997771, + "kl": 0.37158203125, + "learning_rate": 1.2300712803218834e-06, + "loss": 0.0472, + "num_tokens": 10773077.0, + "reward": 0.2942133641336113, + "reward_std": 0.06957495538517833, + "rewards/code_reward/mean": 0.2942133641336113, + "rewards/code_reward/std": 0.06957494793459773, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 373 }, { "clip_ratio": 0.0, - "completion_length": 469.625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.25, + "completions/max_terminated_length": 411.25, + "completions/mean_length": 237.9375, + "completions/mean_terminated_length": 237.9375, + "completions/min_length": 83.5, + "completions/min_terminated_length": 83.5, "epoch": 0.041863719042955085, - "grad_norm": 0.5718530387307899, - "kl": 0.034820556640625, - "learning_rate": 1.2087069069041268e-06, - "loss": 0.014, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.77643334650333, + "kl": 0.2418212890625, + "learning_rate": 1.2193569822552772e-06, + "loss": -0.0534, + "num_tokens": 10800323.0, + "reward": 0.34760985895991325, + "reward_std": 0.09279043786227703, + "rewards/code_reward/mean": 0.34760985895991325, + "rewards/code_reward/std": 0.09279044345021248, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 374 }, { "clip_ratio": 0.0, - "completion_length": 408.0625, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 704.75, + "completions/max_terminated_length": 458.25, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 221.75000762939453, + "completions/min_length": 117.5, + "completions/min_terminated_length": 117.5, "epoch": 0.041975654120610044, - "grad_norm": 0.03780725899196792, - "kl": 0.0430908203125, - "learning_rate": 1.1981215011242654e-06, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.7108480467121594, + "kl": 0.26763916015625, + "learning_rate": 1.2087069069041268e-06, + "loss": 0.075, + "num_tokens": 10826268.0, + "reward": 0.09669792652130127, + "reward_std": 0.14347750786691904, + "rewards/code_reward/mean": 0.09669792652130127, + "rewards/code_reward/std": 0.14347750786691904, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 375 }, { "clip_ratio": 0.0, - "completion_length": 394.140625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.25, + "completions/max_terminated_length": 446.25, + "completions/mean_length": 234.9375, + "completions/mean_terminated_length": 234.9375, + "completions/min_length": 107.5, + "completions/min_terminated_length": 107.5, "epoch": 0.042087589198265, - "grad_norm": 0.022995562401170407, - "kl": 0.05279541015625, - "learning_rate": 1.1876012090581184e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.2434311763858945, + "kl": 0.213134765625, + "learning_rate": 1.1981215011242654e-06, + "loss": 0.131, + "num_tokens": 10854130.0, + "reward": 0.043518811551621184, + "reward_std": 0.047120289877057076, + "rewards/code_reward/mean": 0.043518811551621184, + "rewards/code_reward/std": 0.047120293602347374, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 376 }, { "clip_ratio": 0.0, - "completion_length": 411.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 136.03125, + "completions/mean_terminated_length": 136.03125, + "completions/min_length": 54.25, + "completions/min_terminated_length": 54.25, "epoch": 0.04219952427591997, - "grad_norm": 0.10984965670222785, - "kl": 0.04351806640625, - "learning_rate": 1.177146472116071e-06, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.671522684663737, + "kl": 0.36328125, + "learning_rate": 1.1876012090581184e-06, + "loss": 0.0534, + "num_tokens": 10877771.0, + "reward": 0.5510788485407829, + "reward_std": 0.11537208966910839, + "rewards/code_reward/mean": 0.5510788485407829, + "rewards/code_reward/std": 0.11537209153175354, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 377 }, { "clip_ratio": 0.0, - "completion_length": 400.6875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.25, + "completions/max_terminated_length": 304.25, + "completions/mean_length": 165.71875, + "completions/mean_terminated_length": 165.71875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, "epoch": 0.04231145935357493, - "grad_norm": 3.2788406827242613, - "kl": 0.182952880859375, - "learning_rate": 1.1667577289579462e-06, - "loss": -0.0157, - "reward": 0.09687500260770321, - "reward_std": 0.008539125323295593, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 1.1250436666643788, + "kl": 0.33447265625, + "learning_rate": 1.177146472116071e-06, + "loss": -0.0308, + "num_tokens": 10904074.0, + "reward": 0.04710310218797531, + "reward_std": 0.030758424138184637, + "rewards/code_reward/mean": 0.04710310218797531, + "rewards/code_reward/std": 0.030758424138184637, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 378 }, { "clip_ratio": 0.0, - "completion_length": 379.484375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.5, + "completions/max_terminated_length": 289.5, + "completions/mean_length": 151.8125, + "completions/mean_terminated_length": 151.8125, + "completions/min_length": 73.5, + "completions/min_terminated_length": 73.5, "epoch": 0.04242339443122989, - "grad_norm": 0.04631375660525243, - "kl": 0.0482177734375, - "learning_rate": 1.1564354154746007e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.360047496133987, + "kl": 0.3671875, + "learning_rate": 1.1667577289579462e-06, + "loss": -0.0164, + "num_tokens": 10930116.0, + "reward": 0.4695088779553771, + "reward_std": 0.12898865342140198, + "rewards/code_reward/mean": 0.4695088779553771, + "rewards/code_reward/std": 0.12898865342140198, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 379 }, { "clip_ratio": 0.0, - "completion_length": 361.921875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.5, + "completions/max_terminated_length": 281.5, + "completions/mean_length": 191.65625, + "completions/mean_terminated_length": 191.65625, + "completions/min_length": 91.75, + "completions/min_terminated_length": 91.75, "epoch": 0.04253532950888485, - "grad_norm": 0.5029931057300824, - "kl": 0.04144287109375, - "learning_rate": 1.146179964769635e-06, - "loss": -0.0096, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.396767149969482, + "kl": 0.222412109375, + "learning_rate": 1.1564354154746007e-06, + "loss": 0.0289, + "num_tokens": 10951289.0, + "reward": 0.38920454680919647, + "reward_std": 0.1452226829715073, + "rewards/code_reward/mean": 0.38920454680919647, + "rewards/code_reward/std": 0.14522269228473306, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 380 }, { "clip_ratio": 0.0, - "completion_length": 408.53125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.25, + "completions/max_terminated_length": 232.25, + "completions/mean_length": 126.9375, + "completions/mean_terminated_length": 126.9375, + "completions/min_length": 56.75, + "completions/min_terminated_length": 56.75, "epoch": 0.04264726458653981, - "grad_norm": 2.062066313854755, - "kl": 0.10888671875, - "learning_rate": 1.1359918071412195e-06, - "loss": 0.0137, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 2.272607027139455, + "kl": 0.4501953125, + "learning_rate": 1.146179964769635e-06, + "loss": -0.0172, + "num_tokens": 10973007.0, + "reward": 0.5514450334012508, + "reward_std": 0.1807562008034438, + "rewards/code_reward/mean": 0.5514450334012508, + "rewards/code_reward/std": 0.1807561982423067, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 381 }, { "clip_ratio": 0.0, - "completion_length": 473.390625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.25, + "completions/max_terminated_length": 377.25, + "completions/mean_length": 200.40625, + "completions/mean_terminated_length": 200.40625, + "completions/min_length": 75.5, + "completions/min_terminated_length": 75.5, "epoch": 0.042759199664194766, - "grad_norm": 5.689014120759552, - "kl": 0.19818115234375, - "learning_rate": 1.1258713700640456e-06, - "loss": 0.002, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.8894869007416317, + "kl": 0.28662109375, + "learning_rate": 1.1359918071412195e-06, + "loss": 0.0651, + "num_tokens": 11000324.0, + "reward": 0.3411928308196366, + "reward_std": 0.15844399761408567, + "rewards/code_reward/mean": 0.3411928308196366, + "rewards/code_reward/std": 0.15844399388879538, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 382 }, { "clip_ratio": 0.0, - "completion_length": 378.40625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.25, + "completions/max_terminated_length": 199.25, + "completions/mean_length": 122.78125, + "completions/mean_terminated_length": 122.78125, + "completions/min_length": 74.75, + "completions/min_terminated_length": 74.75, "epoch": 0.042871134741849726, - "grad_norm": 5.456471510055751, - "kl": 0.1583251953125, - "learning_rate": 1.115819078171383e-06, - "loss": -0.0152, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 2.4633639922004784, + "kl": 0.448486328125, + "learning_rate": 1.1258713700640456e-06, + "loss": -0.0042, + "num_tokens": 11025333.0, + "reward": 0.39490123838186264, + "reward_std": 0.09689067304134369, + "rewards/code_reward/mean": 0.39490123838186264, + "rewards/code_reward/std": 0.09689067304134369, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 383 }, { "clip_ratio": 0.0, - "completion_length": 405.59375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 170.78125, + "completions/mean_terminated_length": 170.78125, + "completions/min_length": 94.25, + "completions/min_terminated_length": 94.25, "epoch": 0.042983069819504685, - "grad_norm": 0.01860238339511713, - "kl": 0.04095458984375, - "learning_rate": 1.1058353532372667e-06, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.5596521493483513, + "kl": 0.26416015625, + "learning_rate": 1.115819078171383e-06, + "loss": -0.0304, + "num_tokens": 11052478.0, + "reward": 0.11266797501593828, + "reward_std": 0.04459898290224373, + "rewards/code_reward/mean": 0.11266797501593828, + "rewards/code_reward/std": 0.04459898569621146, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 384 }, { "clip_ratio": 0.0, - "completion_length": 382.15625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.75, + "completions/max_terminated_length": 396.75, + "completions/mean_length": 161.8125, + "completions/mean_terminated_length": 161.8125, + "completions/min_length": 66.25, + "completions/min_terminated_length": 66.25, "epoch": 0.043095004897159644, - "grad_norm": 0.13949154395909993, - "kl": 0.0496826171875, - "learning_rate": 1.0959206141587998e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.2904801129088306, + "kl": 0.337890625, + "learning_rate": 1.1058353532372667e-06, + "loss": 0.0852, + "num_tokens": 11072608.0, + "reward": 0.39945168420672417, + "reward_std": 0.24530693516135216, + "rewards/code_reward/mean": 0.39945168420672417, + "rewards/code_reward/std": 0.24530693143606186, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 385 }, { "clip_ratio": 0.0, - "completion_length": 315.453125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.5, + "completions/max_terminated_length": 270.5, + "completions/mean_length": 145.25, + "completions/mean_terminated_length": 145.25, + "completions/min_length": 80.5, + "completions/min_terminated_length": 80.5, "epoch": 0.04320693997481461, - "grad_norm": 1.107285367889642, - "kl": 0.3203125, - "learning_rate": 1.0860752769385766e-06, - "loss": -0.0542, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.533287258494552, + "kl": 0.321533203125, + "learning_rate": 1.0959206141587998e-06, + "loss": -0.0497, + "num_tokens": 11094568.0, + "reward": 0.32392971869558096, + "reward_std": 0.0603926875628531, + "rewards/code_reward/mean": 0.32392971869558096, + "rewards/code_reward/std": 0.060392691288143396, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 386 }, { "clip_ratio": 0.0, - "completion_length": 304.203125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.25, + "completions/max_terminated_length": 460.25, + "completions/mean_length": 244.65625, + "completions/mean_terminated_length": 244.65625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, "epoch": 0.04331887505246957, - "grad_norm": 14.993173199026563, - "kl": 1.521728515625, - "learning_rate": 1.0762997546672279e-06, - "loss": -0.0433, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.673535799840682, + "kl": 0.1998291015625, + "learning_rate": 1.0860752769385766e-06, + "loss": -0.0478, + "num_tokens": 11115893.0, + "reward": 0.19461633265018463, + "reward_std": 0.2882770374417305, + "rewards/code_reward/mean": 0.19461633265018463, + "rewards/code_reward/std": 0.2882770411670208, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 387 }, { "clip_ratio": 0.0, - "completion_length": 456.78125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.75, + "completions/max_terminated_length": 260.75, + "completions/mean_length": 164.25, + "completions/mean_terminated_length": 164.25, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, "epoch": 0.04343081013012453, - "grad_norm": 0.10799334625634623, - "kl": 0.041534423828125, - "learning_rate": 1.0665944575060914e-06, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.8694819476907374, + "kl": 0.28662109375, + "learning_rate": 1.0762997546672279e-06, + "loss": -0.1618, + "num_tokens": 11140117.0, + "reward": 0.13581378757953644, + "reward_std": 0.1375128449872136, + "rewards/code_reward/mean": 0.13581378757953644, + "rewards/code_reward/std": 0.1375128524377942, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 388 }, { "clip_ratio": 0.0, - "completion_length": 410.546875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.25, + "completions/max_terminated_length": 363.25, + "completions/mean_length": 193.84375, + "completions/mean_terminated_length": 193.84375, + "completions/min_length": 98.5, + "completions/min_terminated_length": 98.5, "epoch": 0.04354274520777949, - "grad_norm": 0.37423665656457383, - "kl": 0.061767578125, - "learning_rate": 1.056959792669997e-06, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.0753739650687322, + "kl": 0.333984375, + "learning_rate": 1.0665944575060914e-06, + "loss": 0.0196, + "num_tokens": 11165288.0, + "reward": 0.2044280730187893, + "reward_std": 0.20719696558080614, + "rewards/code_reward/mean": 0.2044280730187893, + "rewards/code_reward/std": 0.20719696604646742, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 389 }, { "clip_ratio": 0.0, - "completion_length": 373.703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.5, + "completions/max_terminated_length": 206.5, + "completions/mean_length": 101.28125, + "completions/mean_terminated_length": 101.28125, + "completions/min_length": 45.75, + "completions/min_terminated_length": 45.75, "epoch": 0.04365468028543445, - "grad_norm": 0.19089367747123068, - "kl": 0.05108642578125, - "learning_rate": 1.0473961644101856e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 2.0710686171209, + "kl": 0.34033203125, + "learning_rate": 1.056959792669997e-06, + "loss": 0.0855, + "num_tokens": 11184777.0, + "reward": 0.3098377622663975, + "reward_std": 0.11287019960582256, + "rewards/code_reward/mean": 0.3098377622663975, + "rewards/code_reward/std": 0.11287020146846771, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 390 }, { "clip_ratio": 0.0, - "completion_length": 358.71875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.5, + "completions/max_terminated_length": 224.5, + "completions/mean_length": 128.0625, + "completions/mean_terminated_length": 128.0625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, "epoch": 0.04376661536308941, - "grad_norm": 1.5778323232854403, - "kl": 0.1815185546875, - "learning_rate": 1.037903973997345e-06, - "loss": -0.0537, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.6186447722026904, + "kl": 0.3369140625, + "learning_rate": 1.0473961644101856e-06, + "loss": 0.0431, + "num_tokens": 11207051.0, + "reward": 0.40973464399576187, + "reward_std": 0.2817695839330554, + "rewards/code_reward/mean": 0.40973464399576187, + "rewards/code_reward/std": 0.281769591383636, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 391 }, { "clip_ratio": 0.0, - "completion_length": 326.71875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.25, + "completions/max_terminated_length": 391.25, + "completions/mean_length": 222.3125, + "completions/mean_terminated_length": 222.3125, + "completions/min_length": 73.25, + "completions/min_terminated_length": 73.25, "epoch": 0.04387855044074437, - "grad_norm": 0.31904972301610846, - "kl": 0.052734375, - "learning_rate": 1.0284836197047737e-06, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.1817397898346427, + "kl": 0.214599609375, + "learning_rate": 1.037903973997345e-06, + "loss": 0.0475, + "num_tokens": 11225013.0, + "reward": 0.30992063134908676, + "reward_std": 0.14644738845527172, + "rewards/code_reward/mean": 0.30992063134908676, + "rewards/code_reward/std": 0.14644739404320717, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 392 }, { "clip_ratio": 0.0, - "completion_length": 266.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.5, + "completions/max_terminated_length": 259.5, + "completions/mean_length": 149.8125, + "completions/mean_terminated_length": 149.8125, + "completions/min_length": 86.25, + "completions/min_terminated_length": 86.25, "epoch": 0.043990485518399326, - "grad_norm": 2.0851570445818868, - "kl": 0.2457275390625, - "learning_rate": 1.0191354967916712e-06, - "loss": 0.0287, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.2696184824554122, + "kl": 0.26806640625, + "learning_rate": 1.0284836197047737e-06, + "loss": -0.0078, + "num_tokens": 11242503.0, + "reward": 0.4278051145374775, + "reward_std": 0.09913837909698486, + "rewards/code_reward/mean": 0.4278051145374775, + "rewards/code_reward/std": 0.09913837816566229, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 393 }, { "clip_ratio": 0.0, - "completion_length": 415.109375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 150.03125, + "completions/mean_terminated_length": 150.03125, + "completions/min_length": 65.5, + "completions/min_terminated_length": 65.5, "epoch": 0.044102420596054286, - "grad_norm": 0.3993597476384507, - "kl": 0.07476806640625, - "learning_rate": 1.0098599974865515e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.45552546760352, + "kl": 0.28857421875, + "learning_rate": 1.0191354967916712e-06, + "loss": 0.0331, + "num_tokens": 11269880.0, + "reward": 0.24055082583799958, + "reward_std": 0.11007735197199509, + "rewards/code_reward/mean": 0.24055082583799958, + "rewards/code_reward/std": 0.11007736308965832, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 394 }, { "clip_ratio": 0.0, - "completion_length": 378.15625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 169.375, + "completions/mean_terminated_length": 169.375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, "epoch": 0.04421435567370925, - "grad_norm": 0.3569068198951939, - "kl": 0.06988525390625, - "learning_rate": 1.0006575109707898e-06, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.368061303454261, + "kl": 0.311279296875, + "learning_rate": 1.0098599974865515e-06, + "loss": 0.0704, + "num_tokens": 11297700.0, + "reward": 0.07068161107599735, + "reward_std": 0.11775721522280946, + "rewards/code_reward/mean": 0.07068161107599735, + "rewards/code_reward/std": 0.11775722278980538, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 395 }, { "clip_ratio": 0.0, - "completion_length": 314.234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 166.28125, + "completions/mean_terminated_length": 166.28125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, "epoch": 0.04432629075136421, - "grad_norm": 0.13286047664045245, - "kl": 0.04833984375, - "learning_rate": 9.915284233622877e-07, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 2.02580412202015, + "kl": 0.312255859375, + "learning_rate": 1.0006575109707898e-06, + "loss": 0.1445, + "num_tokens": 11315909.0, + "reward": 0.2796209901571274, + "reward_std": 0.20085123018361628, + "rewards/code_reward/mean": 0.2796209901571274, + "rewards/code_reward/std": 0.200851232977584, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 396 }, { "clip_ratio": 0.0, - "completion_length": 416.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.5, + "completions/max_terminated_length": 423.5, + "completions/mean_length": 216.125, + "completions/mean_terminated_length": 216.125, + "completions/min_length": 94.5, + "completions/min_terminated_length": 94.5, "epoch": 0.04443822582901917, - "grad_norm": 2.2497661432685447, - "kl": 0.05743408203125, - "learning_rate": 9.824731176992796e-07, - "loss": -0.002, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.711611197709339, + "kl": 0.398681640625, + "learning_rate": 9.915284233622877e-07, + "loss": -0.0014, + "num_tokens": 11345121.0, + "reward": 0.3469575219787657, + "reward_std": 0.2414399441331625, + "rewards/code_reward/mean": 0.3469575219787657, + "rewards/code_reward/std": 0.2414399590343237, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 397 }, { "clip_ratio": 0.0, - "completion_length": 365.140625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.25, + "completions/max_terminated_length": 339.25, + "completions/mean_length": 163.15625, + "completions/mean_terminated_length": 163.15625, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, "epoch": 0.04455016090667413, - "grad_norm": 3.30119576808313, - "kl": 1.076416015625, - "learning_rate": 9.734919739242543e-07, - "loss": -0.0157, - "reward": 0.09687500260770321, - "reward_std": 0.008539125323295593, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 1.6559828290200511, + "kl": 0.32080078125, + "learning_rate": 9.824731176992796e-07, + "loss": 0.0554, + "num_tokens": 11366862.0, + "reward": 0.21360408567124978, + "reward_std": 0.14055794943124056, + "rewards/code_reward/mean": 0.21360408567124978, + "rewards/code_reward/std": 0.14055794943124056, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 398 }, { "clip_ratio": 0.0, - "completion_length": 277.28125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.5, + "completions/max_terminated_length": 250.5, + "completions/mean_length": 163.09375, + "completions/mean_terminated_length": 163.09375, + "completions/min_length": 97.25, + "completions/min_terminated_length": 97.25, "epoch": 0.04466209598432909, - "grad_norm": 0.08032954719670685, - "kl": 0.06231689453125, - "learning_rate": 9.645853688680177e-07, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.4395787879499458, + "kl": 0.2861328125, + "learning_rate": 9.734919739242543e-07, + "loss": 0.0094, + "num_tokens": 11390465.0, + "reward": 0.37181805819272995, + "reward_std": 0.13883061078377068, + "rewards/code_reward/mean": 0.37181805819272995, + "rewards/code_reward/std": 0.1388306178851053, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 399 }, { "clip_ratio": 0.0, - "completion_length": 379.8125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.75, + "completions/max_terminated_length": 351.75, + "completions/mean_length": 168.9375, + "completions/mean_terminated_length": 168.9375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, "epoch": 0.04477403106198405, - "grad_norm": 0.30125779125635027, - "kl": 0.06890869140625, - "learning_rate": 9.557536762338786e-07, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.5497285885828123, + "kl": 0.28857421875, + "learning_rate": 9.645853688680177e-07, + "loss": -0.0077, + "num_tokens": 11412903.0, + "reward": 0.22598881646990776, + "reward_std": 0.05764714028919116, + "rewards/code_reward/mean": 0.22598881646990776, + "rewards/code_reward/std": 0.05764713906683028, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 400 }, { "clip_ratio": 0.0, - "completion_length": 408.609375, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 810.0, + "completions/max_terminated_length": 618.5, + "completions/mean_length": 326.8125, + "completions/mean_terminated_length": 279.2276916503906, + "completions/min_length": 143.75, + "completions/min_terminated_length": 143.75, "epoch": 0.04488596613963901, - "grad_norm": 0.10410594418892839, - "kl": 0.04498291015625, - "learning_rate": 9.46997266581973e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.3195383455561336, + "kl": 0.1973876953125, + "learning_rate": 9.557536762338786e-07, + "loss": 0.1984, + "num_tokens": 11445705.0, + "reward": 0.4391447389498353, + "reward_std": 0.2860143817961216, + "rewards/code_reward/mean": 0.4391447389498353, + "rewards/code_reward/std": 0.28601440228521824, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 401 }, { "clip_ratio": 0.0, - "completion_length": 388.671875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.25, + "completions/max_terminated_length": 287.25, + "completions/mean_length": 152.84375, + "completions/mean_terminated_length": 152.84375, + "completions/min_length": 75.25, + "completions/min_terminated_length": 75.25, "epoch": 0.04499790121729397, - "grad_norm": 0.21768463357305143, - "kl": 0.04937744140625, - "learning_rate": 9.383165073137115e-07, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.3849099249418695, + "kl": 0.2900390625, + "learning_rate": 9.46997266581973e-07, + "loss": 0.0243, + "num_tokens": 11470668.0, + "reward": 0.5938801132142544, + "reward_std": 0.22660082660149783, + "rewards/code_reward/mean": 0.5938801132142544, + "rewards/code_reward/std": 0.22660081752110273, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 402 }, { "clip_ratio": 0.0, - "completion_length": 340.828125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.5, + "completions/max_terminated_length": 446.5, + "completions/mean_length": 251.75, + "completions/mean_terminated_length": 251.75, + "completions/min_length": 149.25, + "completions/min_terminated_length": 149.25, "epoch": 0.04510983629494893, - "grad_norm": 0.24943422959107125, - "kl": 0.054443359375, - "learning_rate": 9.297117626563687e-07, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.1229376006957024, + "kl": 0.2052001953125, + "learning_rate": 9.383165073137115e-07, + "loss": -0.0179, + "num_tokens": 11493260.0, + "reward": 0.36087851971387863, + "reward_std": 0.14250769466161728, + "rewards/code_reward/mean": 0.36087851971387863, + "rewards/code_reward/std": 0.14250769466161728, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 403 }, { "clip_ratio": 0.0, - "completion_length": 398.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.75, + "completions/max_terminated_length": 299.75, + "completions/mean_length": 161.46875, + "completions/mean_terminated_length": 161.46875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, "epoch": 0.04522177137260389, - "grad_norm": 0.15214769860694116, - "kl": 0.05352783203125, - "learning_rate": 9.211833936477957e-07, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.3839235753481969, + "kl": 0.34619140625, + "learning_rate": 9.297117626563687e-07, + "loss": 0.1469, + "num_tokens": 11513939.0, + "reward": 0.6742284968495369, + "reward_std": 0.05968676181510091, + "rewards/code_reward/mean": 0.6742284968495369, + "rewards/code_reward/std": 0.059686762280762196, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 404 }, { "clip_ratio": 0.0, - "completion_length": 309.265625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 180.5, + "completions/max_terminated_length": 180.5, + "completions/mean_length": 114.53125, + "completions/mean_terminated_length": 114.53125, + "completions/min_length": 73.75, + "completions/min_terminated_length": 73.75, "epoch": 0.04533370645025885, - "grad_norm": 0.12737968723084875, - "kl": 0.04833984375, - "learning_rate": 9.127317581212753e-07, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 2.4362013118805237, + "kl": 0.326171875, + "learning_rate": 9.211833936477957e-07, + "loss": 0.0929, + "num_tokens": 11532444.0, + "reward": 0.33231060579419136, + "reward_std": 0.09006076445803046, + "rewards/code_reward/mean": 0.33231060579419136, + "rewards/code_reward/std": 0.09006076492369175, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 405 }, { "clip_ratio": 0.0, - "completion_length": 367.078125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.75, + "completions/max_terminated_length": 474.75, + "completions/mean_length": 210.15625, + "completions/mean_terminated_length": 210.15625, + "completions/min_length": 100.25, + "completions/min_terminated_length": 100.25, "epoch": 0.04544564152791381, - "grad_norm": 0.12491707622355717, - "kl": 0.0419921875, - "learning_rate": 9.043572106905084e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.3572230509856378, + "kl": 0.225341796875, + "learning_rate": 9.127317581212753e-07, + "loss": -0.13, + "num_tokens": 11553801.0, + "reward": 0.4415045604109764, + "reward_std": 0.1545610846951604, + "rewards/code_reward/mean": 0.4415045604109764, + "rewards/code_reward/std": 0.1545610912144184, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 406 }, { "clip_ratio": 0.0, - "completion_length": 347.171875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 135.875, + "completions/mean_terminated_length": 135.875, + "completions/min_length": 65.25, + "completions/min_terminated_length": 65.25, "epoch": 0.04555757660556877, - "grad_norm": 0.2782478324098304, - "kl": 0.04632568359375, - "learning_rate": 8.960601027347321e-07, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.7906307431939532, + "kl": 0.37890625, + "learning_rate": 9.043572106905084e-07, + "loss": -0.0301, + "num_tokens": 11574885.0, + "reward": 0.2516532065346837, + "reward_std": 0.1726220678538084, + "rewards/code_reward/mean": 0.2516532065346837, + "rewards/code_reward/std": 0.17262207716703415, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 407 }, { "clip_ratio": 0.0, - "completion_length": 344.59375, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 714.25, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 230.8125, + "completions/mean_terminated_length": 174.68303680419922, + "completions/min_length": 66.5, + "completions/min_terminated_length": 66.5, "epoch": 0.04566951168322373, - "grad_norm": 1.707651816399516, - "kl": 0.17364501953125, - "learning_rate": 8.878407823839788e-07, - "loss": 0.0017, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 2.5575273200953244, + "kl": 0.662841796875, + "learning_rate": 8.960601027347321e-07, + "loss": 0.2496, + "num_tokens": 11602207.0, + "reward": 0.5628770813345909, + "reward_std": 0.1447618722449988, + "rewards/code_reward/mean": 0.5628770813345909, + "rewards/code_reward/std": 0.14476187201216817, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 408 }, { "clip_ratio": 0.0, - "completion_length": 258.625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.75, + "completions/max_terminated_length": 240.75, + "completions/mean_length": 148.78125, + "completions/mean_terminated_length": 148.78125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, "epoch": 0.04578144676087869, - "grad_norm": 0.18198728965219338, - "kl": 0.05511474609375, - "learning_rate": 8.796995945044689e-07, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.6621037231884788, + "kl": 0.3466796875, + "learning_rate": 8.878407823839788e-07, + "loss": 0.0366, + "num_tokens": 11618608.0, + "reward": 0.25418527983129025, + "reward_std": 0.11003150884062052, + "rewards/code_reward/mean": 0.25418527983129025, + "rewards/code_reward/std": 0.11003150977194309, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 409 }, { "clip_ratio": 0.0, - "completion_length": 329.171875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.5, + "completions/max_terminated_length": 213.5, + "completions/mean_length": 146.90625, + "completions/mean_terminated_length": 146.90625, + "completions/min_length": 91.25, + "completions/min_terminated_length": 91.25, "epoch": 0.04589338183853365, - "grad_norm": 3.0116035887758312, - "kl": 0.07513427734375, - "learning_rate": 8.716368806841405e-07, - "loss": 0.0028, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.4559367674936419, + "kl": 0.26123046875, + "learning_rate": 8.796995945044689e-07, + "loss": 0.0374, + "num_tokens": 11637933.0, + "reward": 0.36544950399547815, + "reward_std": 0.024054846144281328, + "rewards/code_reward/mean": 0.36544950399547815, + "rewards/code_reward/std": 0.02405484637711197, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 410 }, { "clip_ratio": 0.0, - "completion_length": 370.46875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.5, + "completions/max_terminated_length": 236.5, + "completions/mean_length": 158.5, + "completions/mean_terminated_length": 158.5, + "completions/min_length": 87.25, + "completions/min_terminated_length": 87.25, "epoch": 0.04600531691618861, - "grad_norm": 0.036581584760347785, - "kl": 0.04638671875, - "learning_rate": 8.636529792183171e-07, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.6433420190741899, + "kl": 0.28955078125, + "learning_rate": 8.716368806841405e-07, + "loss": -0.0265, + "num_tokens": 11658509.0, + "reward": 0.3223713766783476, + "reward_std": 0.1642971858382225, + "rewards/code_reward/mean": 0.3223713766783476, + "rewards/code_reward/std": 0.16429719096049666, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 411 }, { "clip_ratio": 0.0, - "completion_length": 412.40625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.25, + "completions/max_terminated_length": 392.25, + "completions/mean_length": 195.6875, + "completions/mean_terminated_length": 195.6875, + "completions/min_length": 119.25, + "completions/min_terminated_length": 119.25, "epoch": 0.04611725199384357, - "grad_norm": 1.7675486622831695, - "kl": 0.08441162109375, - "learning_rate": 8.557482250955144e-07, - "loss": 0.0507, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.0424618486352764, + "kl": 0.280517578125, + "learning_rate": 8.636529792183171e-07, + "loss": 0.0174, + "num_tokens": 11683955.0, + "reward": 0.3553215153515339, + "reward_std": 0.1284142378717661, + "rewards/code_reward/mean": 0.3553215153515339, + "rewards/code_reward/std": 0.1284142378717661, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 412 }, { "clip_ratio": 0.0, - "completion_length": 376.046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 249.4375, + "completions/mean_terminated_length": 249.4375, + "completions/min_length": 124.25, + "completions/min_terminated_length": 124.25, "epoch": 0.046229187071498534, - "grad_norm": 0.8339879341516419, - "kl": 0.06793212890625, - "learning_rate": 8.479229499833844e-07, - "loss": -0.0386, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 0.9268703240625641, + "kl": 0.181884765625, + "learning_rate": 8.557482250955144e-07, + "loss": 0.0329, + "num_tokens": 11707953.0, + "reward": 0.5278465449810028, + "reward_std": 0.059144818456843495, + "rewards/code_reward/mean": 0.5278465449810028, + "rewards/code_reward/std": 0.05914481892250478, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 413 }, { "clip_ratio": 0.0, - "completion_length": 269.328125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.75, + "completions/max_terminated_length": 324.75, + "completions/mean_length": 194.625, + "completions/mean_terminated_length": 194.625, + "completions/min_length": 112.5, + "completions/min_terminated_length": 112.5, "epoch": 0.046341122149153494, - "grad_norm": 0.03758254954126514, - "kl": 0.046630859375, - "learning_rate": 8.401774822147976e-07, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.2660479068382882, + "kl": 0.270751953125, + "learning_rate": 8.479229499833844e-07, + "loss": 0.0482, + "num_tokens": 11731301.0, + "reward": 0.20539462007582188, + "reward_std": 0.1615639952942729, + "rewards/code_reward/mean": 0.20539462007582188, + "rewards/code_reward/std": 0.1615639952942729, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 414 }, { "clip_ratio": 0.0, - "completion_length": 426.90625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.25, + "completions/max_terminated_length": 262.25, + "completions/mean_length": 178.9375, + "completions/mean_terminated_length": 178.9375, + "completions/min_length": 97.75, + "completions/min_terminated_length": 97.75, "epoch": 0.04645305722680845, - "grad_norm": 0.03884055882244416, - "kl": 0.0457763671875, - "learning_rate": 8.325121467740695e-07, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.460499815260377, + "kl": 0.33349609375, + "learning_rate": 8.401774822147976e-07, + "loss": 0.0594, + "num_tokens": 11754019.0, + "reward": 0.3291256055235863, + "reward_std": 0.1305392780341208, + "rewards/code_reward/mean": 0.3291256055235863, + "rewards/code_reward/std": 0.13053929095622152, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 415 }, { "clip_ratio": 0.0, - "completion_length": 327.15625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 680.5, + "completions/max_terminated_length": 680.5, + "completions/mean_length": 255.3125, + "completions/mean_terminated_length": 255.3125, + "completions/min_length": 91.25, + "completions/min_terminated_length": 91.25, "epoch": 0.04656499230446341, - "grad_norm": 0.13058568836650014, - "kl": 0.06011962890625, - "learning_rate": 8.249272652833226e-07, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.5950867620592668, + "kl": 0.260498046875, + "learning_rate": 8.325121467740695e-07, + "loss": 0.0056, + "num_tokens": 11784677.0, + "reward": 0.4860835336148739, + "reward_std": 0.19814053922891617, + "rewards/code_reward/mean": 0.4860835336148739, + "rewards/code_reward/std": 0.19814054295420647, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 416 }, { "clip_ratio": 0.0, - "completion_length": 389.59375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.75, + "completions/max_terminated_length": 380.75, + "completions/mean_length": 203.8125, + "completions/mean_terminated_length": 203.8125, + "completions/min_length": 100.25, + "completions/min_terminated_length": 100.25, "epoch": 0.04667692738211837, - "grad_norm": 0.05421556382240225, - "kl": 0.0401611328125, - "learning_rate": 8.174231559889931e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.4255259834081973, + "kl": 0.23291015625, + "learning_rate": 8.249272652833226e-07, + "loss": 0.0277, + "num_tokens": 11812087.0, + "reward": 0.1272990070283413, + "reward_std": 0.05336737190373242, + "rewards/code_reward/mean": 0.1272990070283413, + "rewards/code_reward/std": 0.053367371554486454, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 417 }, { "clip_ratio": 0.0, - "completion_length": 266.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 193.78125, + "completions/mean_terminated_length": 193.78125, + "completions/min_length": 109.75, + "completions/min_terminated_length": 109.75, "epoch": 0.04678886245977333, - "grad_norm": 2.135742456611555, - "kl": 0.55340576171875, - "learning_rate": 8.100001337484787e-07, - "loss": -0.052, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.3483016005578479, + "kl": 0.382568359375, + "learning_rate": 8.174231559889931e-07, + "loss": -0.0485, + "num_tokens": 11828464.0, + "reward": 0.3802599012851715, + "reward_std": 0.24896394088864326, + "rewards/code_reward/mean": 0.3802599012851715, + "rewards/code_reward/std": 0.24896394088864326, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 418 }, { "clip_ratio": 0.0, - "completion_length": 301.5625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.75, + "completions/max_terminated_length": 240.75, + "completions/mean_length": 158.25, + "completions/mean_terminated_length": 158.25, + "completions/min_length": 95.5, + "completions/min_terminated_length": 95.5, "epoch": 0.04690079753742829, - "grad_norm": 27.79993671743234, - "kl": 2.71258544921875, - "learning_rate": 8.026585100169251e-07, - "loss": 0.1087, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 1.4554834982831377, + "kl": 0.31591796875, + "learning_rate": 8.100001337484787e-07, + "loss": 0.0997, + "num_tokens": 11847600.0, + "reward": 0.47475508879870176, + "reward_std": 0.10649433638900518, + "rewards/code_reward/mean": 0.47475508879870176, + "rewards/code_reward/std": 0.10649433825165033, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 419 }, { "clip_ratio": 0.0, - "completion_length": 381.640625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.5, + "completions/max_terminated_length": 308.5, + "completions/mean_length": 141.1875, + "completions/mean_terminated_length": 141.1875, + "completions/min_length": 68.75, + "completions/min_terminated_length": 68.75, "epoch": 0.04701273261508325, - "grad_norm": 0.3229458185249445, - "kl": 0.0728759765625, - "learning_rate": 7.953985928341601e-07, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 2.2293342326111887, + "kl": 0.29931640625, + "learning_rate": 8.026585100169251e-07, + "loss": -0.2137, + "num_tokens": 11868750.0, + "reward": 0.5101216156035662, + "reward_std": 0.014662902103736997, + "rewards/code_reward/mean": 0.5101216156035662, + "rewards/code_reward/std": 0.014662901870906353, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 420 }, { "clip_ratio": 0.0, - "completion_length": 352.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.5, + "completions/max_terminated_length": 297.5, + "completions/mean_length": 166.3125, + "completions/mean_terminated_length": 166.3125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, "epoch": 0.04712466769273821, - "grad_norm": 0.18899120451502113, - "kl": 0.04962158203125, - "learning_rate": 7.882206868117693e-07, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.7014274221258614, + "kl": 0.2890625, + "learning_rate": 7.953985928341601e-07, + "loss": 0.0663, + "num_tokens": 11895960.0, + "reward": 0.5694793821312487, + "reward_std": 0.16413932980503887, + "rewards/code_reward/mean": 0.5694793821312487, + "rewards/code_reward/std": 0.16413932980503887, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 421 }, { "clip_ratio": 0.0, - "completion_length": 285.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.75, + "completions/max_terminated_length": 279.75, + "completions/mean_length": 171.78125, + "completions/mean_terminated_length": 171.78125, + "completions/min_length": 101.5, + "completions/min_terminated_length": 101.5, "epoch": 0.047236602770393175, - "grad_norm": 0.05378027086791007, - "kl": 0.0703125, - "learning_rate": 7.81125093120313e-07, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.1761445629861498, + "kl": 0.256103515625, + "learning_rate": 7.882206868117693e-07, + "loss": -0.0198, + "num_tokens": 11919857.0, + "reward": 0.7860226929187775, + "reward_std": 0.15767237346153706, + "rewards/code_reward/mean": 0.7860226929187775, + "rewards/code_reward/std": 0.1576723720645532, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 422 }, { "clip_ratio": 0.0, - "completion_length": 366.609375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.5, + "completions/max_terminated_length": 335.5, + "completions/mean_length": 185.5, + "completions/mean_terminated_length": 185.5, + "completions/min_length": 92.5, + "completions/min_terminated_length": 92.5, "epoch": 0.047348537848048135, - "grad_norm": 0.275422779661784, - "kl": 0.05328369140625, - "learning_rate": 7.741121094766916e-07, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.8051288181053138, + "kl": 0.281982421875, + "learning_rate": 7.81125093120313e-07, + "loss": 0.0279, + "num_tokens": 11942537.0, + "reward": 0.3795018047094345, + "reward_std": 0.15914139337837696, + "rewards/code_reward/mean": 0.3795018047094345, + "rewards/code_reward/std": 0.15914138592779636, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 423 }, { "clip_ratio": 0.0, - "completion_length": 386.015625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.25, + "completions/max_terminated_length": 387.25, + "completions/mean_length": 178.40625, + "completions/mean_terminated_length": 178.40625, + "completions/min_length": 84.75, + "completions/min_terminated_length": 84.75, "epoch": 0.047460472925703094, - "grad_norm": 0.02946053453290786, - "kl": 0.04119873046875, - "learning_rate": 7.671820301316532e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.4032630704318265, + "kl": 0.30078125, + "learning_rate": 7.741121094766916e-07, + "loss": -0.1775, + "num_tokens": 11966390.0, + "reward": 0.6128955632448196, + "reward_std": 0.11446365877054632, + "rewards/code_reward/mean": 0.6128955632448196, + "rewards/code_reward/std": 0.11446366063319147, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 424 }, { "clip_ratio": 0.0, - "completion_length": 381.890625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.25, + "completions/max_terminated_length": 384.25, + "completions/mean_length": 168.40625, + "completions/mean_terminated_length": 168.40625, + "completions/min_length": 70.25, + "completions/min_terminated_length": 70.25, "epoch": 0.047572408003358053, - "grad_norm": 0.02258693420380951, - "kl": 0.0445556640625, - "learning_rate": 7.603351458574474e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.7373521797945488, + "kl": 0.27392578125, + "learning_rate": 7.671820301316532e-07, + "loss": 0.1031, + "num_tokens": 11991491.0, + "reward": 0.5329861111240461, + "reward_std": 0.2041158601641655, + "rewards/code_reward/mean": 0.5329861111240461, + "rewards/code_reward/std": 0.2041158601641655, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 425 }, { "clip_ratio": 0.0, - "completion_length": 416.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.5, + "completions/max_terminated_length": 376.5, + "completions/mean_length": 191.46875, + "completions/mean_terminated_length": 191.46875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, "epoch": 0.04768434308101301, - "grad_norm": 1.383085540888027, - "kl": 0.811279296875, - "learning_rate": 7.535717439356255e-07, - "loss": -0.0196, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 2.0077272533591266, + "kl": 0.25927734375, + "learning_rate": 7.603351458574474e-07, + "loss": 0.1358, + "num_tokens": 12013706.0, + "reward": 0.2916666716337204, + "reward_std": 0.07259188406169415, + "rewards/code_reward/mean": 0.2916666716337204, + "rewards/code_reward/std": 0.072591882199049, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 426 }, { "clip_ratio": 0.0, - "completion_length": 368.796875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.5, + "completions/max_terminated_length": 479.5, + "completions/mean_length": 218.28125, + "completions/mean_terminated_length": 218.28125, + "completions/min_length": 85.25, + "completions/min_terminated_length": 85.25, "epoch": 0.04779627815866797, - "grad_norm": 3.7777508498427164, - "kl": 0.77886962890625, - "learning_rate": 7.46892108144986e-07, - "loss": -0.0481, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.3492407743127255, + "kl": 0.30029296875, + "learning_rate": 7.535717439356255e-07, + "loss": 0.031, + "num_tokens": 12042155.0, + "reward": 0.5065476968884468, + "reward_std": 0.2563619986176491, + "rewards/code_reward/mean": 0.5065476968884468, + "rewards/code_reward/std": 0.2563620023429394, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 427 }, { "clip_ratio": 0.0, - "completion_length": 276.875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.5, + "completions/max_terminated_length": 268.5, + "completions/mean_length": 178.71875, + "completions/mean_terminated_length": 178.71875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, "epoch": 0.04790821323632293, - "grad_norm": 0.10955470879553754, - "kl": 0.05078125, - "learning_rate": 7.402965187496697e-07, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.279825584779865, + "kl": 0.3115234375, + "learning_rate": 7.46892108144986e-07, + "loss": -0.0739, + "num_tokens": 12066530.0, + "reward": 0.5710227191448212, + "reward_std": 0.18481200002133846, + "rewards/code_reward/mean": 0.5710227191448212, + "rewards/code_reward/std": 0.18481199722737074, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 428 }, { "clip_ratio": 0.0, - "completion_length": 380.671875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.75, + "completions/max_terminated_length": 262.75, + "completions/mean_length": 157.84375, + "completions/mean_terminated_length": 157.84375, + "completions/min_length": 91.5, + "completions/min_terminated_length": 91.5, "epoch": 0.04802014831397789, - "grad_norm": 0.04113851640884478, - "kl": 0.05523681640625, - "learning_rate": 7.337852524873974e-07, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.2537079099692274, + "kl": 0.32080078125, + "learning_rate": 7.402965187496697e-07, + "loss": -0.062, + "num_tokens": 12093773.0, + "reward": 0.40365831553936005, + "reward_std": 0.06347889173775911, + "rewards/code_reward/mean": 0.40365831553936005, + "rewards/code_reward/std": 0.06347889162134379, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 429 }, { "clip_ratio": 0.0, - "completion_length": 420.03125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.75, + "completions/max_terminated_length": 342.75, + "completions/mean_length": 178.625, + "completions/mean_terminated_length": 178.625, + "completions/min_length": 84.25, + "completions/min_terminated_length": 84.25, "epoch": 0.04813208339163285, - "grad_norm": 0.018684245104689922, - "kl": 0.03826904296875, - "learning_rate": 7.273585825578608e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.5682030729649754, + "kl": 0.3115234375, + "learning_rate": 7.337852524873974e-07, + "loss": 0.0633, + "num_tokens": 12119825.0, + "reward": 0.4007348418235779, + "reward_std": 0.2273004651069641, + "rewards/code_reward/mean": 0.4007348418235779, + "rewards/code_reward/std": 0.227300476282835, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 430 }, { "clip_ratio": 0.0, - "completion_length": 345.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.75, + "completions/max_terminated_length": 307.75, + "completions/mean_length": 166.8125, + "completions/mean_terminated_length": 166.8125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, "epoch": 0.04824401846928781, - "grad_norm": 8.259759861418749, - "kl": 1.716339111328125, - "learning_rate": 7.21016778611259e-07, - "loss": -0.0347, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.414420572830449, + "kl": 0.25830078125, + "learning_rate": 7.273585825578608e-07, + "loss": -0.004, + "num_tokens": 12141963.0, + "reward": 0.09160848939791322, + "reward_std": 0.09675811271881685, + "rewards/code_reward/mean": 0.09160848939791322, + "rewards/code_reward/std": 0.09675811271881685, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 431 }, { "clip_ratio": 0.0, - "completion_length": 402.265625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.5, + "completions/max_terminated_length": 289.5, + "completions/mean_length": 160.6875, + "completions/mean_terminated_length": 160.6875, + "completions/min_length": 85.75, + "completions/min_terminated_length": 85.75, "epoch": 0.048355953546942776, - "grad_norm": 0.020343976644658462, - "kl": 0.033447265625, - "learning_rate": 7.147601067369835e-07, - "loss": 0.0003, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.2819351419525538, + "kl": 0.299072265625, + "learning_rate": 7.21016778611259e-07, + "loss": 0.0442, + "num_tokens": 12160385.0, + "reward": 0.3042712155729532, + "reward_std": 0.1732648597098887, + "rewards/code_reward/mean": 0.3042712155729532, + "rewards/code_reward/std": 0.17326486064121127, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 432 }, { "clip_ratio": 0.0, - "completion_length": 351.609375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 199.65625, + "completions/mean_terminated_length": 199.65625, + "completions/min_length": 90.25, + "completions/min_terminated_length": 90.25, "epoch": 0.048467888624597735, - "grad_norm": 1.0066422924478642, - "kl": 0.75701904296875, - "learning_rate": 7.085888294524561e-07, - "loss": -0.0467, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.4343614720564808, + "kl": 0.3349609375, + "learning_rate": 7.147601067369835e-07, + "loss": -0.0444, + "num_tokens": 12183238.0, + "reward": 0.2736266343854368, + "reward_std": 0.11790771875530481, + "rewards/code_reward/mean": 0.2736266343854368, + "rewards/code_reward/std": 0.11790771875530481, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 433 }, { "clip_ratio": 0.0, - "completion_length": 415.34375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.5, + "completions/max_terminated_length": 430.5, + "completions/mean_length": 220.1875, + "completions/mean_terminated_length": 220.1875, + "completions/min_length": 140.25, + "completions/min_terminated_length": 140.25, "epoch": 0.048579823702252695, - "grad_norm": 0.026562111650317053, - "kl": 0.04150390625, - "learning_rate": 7.025032056921117e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.5149123099259716, + "kl": 0.208740234375, + "learning_rate": 7.085888294524561e-07, + "loss": 0.0555, + "num_tokens": 12205652.0, + "reward": 0.1229942380450666, + "reward_std": 0.1744669363833964, + "rewards/code_reward/mean": 0.1229942380450666, + "rewards/code_reward/std": 0.17446694057434797, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 434 }, { "clip_ratio": 0.0, - "completion_length": 308.921875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.5, + "completions/max_terminated_length": 280.5, + "completions/mean_length": 172.625, + "completions/mean_terminated_length": 172.625, + "completions/min_length": 68.25, + "completions/min_terminated_length": 68.25, "epoch": 0.048691758779907654, - "grad_norm": 0.03150175672968521, - "kl": 0.03955078125, - "learning_rate": 6.965034907965349e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.1887088926887228, + "kl": 0.293701171875, + "learning_rate": 7.025032056921117e-07, + "loss": 0.0018, + "num_tokens": 12225632.0, + "reward": 0.36100322124548256, + "reward_std": 0.18402530439198017, + "rewards/code_reward/mean": 0.36100322124548256, + "rewards/code_reward/std": 0.18402530439198017, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 435 }, { "clip_ratio": 0.0, - "completion_length": 343.296875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 165.9375, + "completions/mean_terminated_length": 165.9375, + "completions/min_length": 77.5, + "completions/min_terminated_length": 77.5, "epoch": 0.04880369385756261, - "grad_norm": 0.030310737599258293, - "kl": 0.04107666015625, - "learning_rate": 6.905899365017462e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.3337355570198512, + "kl": 0.3330078125, + "learning_rate": 6.965034907965349e-07, + "loss": -0.0914, + "num_tokens": 12244678.0, + "reward": 0.5988663695752621, + "reward_std": 0.20593830198049545, + "rewards/code_reward/mean": 0.5988663695752621, + "rewards/code_reward/std": 0.20593830046709627, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 436 }, { "clip_ratio": 0.0, - "completion_length": 319.328125, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 690.25, + "completions/max_terminated_length": 308.5, + "completions/mean_length": 242.875, + "completions/mean_terminated_length": 187.48214721679688, + "completions/min_length": 99.75, + "completions/min_terminated_length": 99.75, "epoch": 0.04891562893521757, - "grad_norm": 1.8528337687242826, - "kl": 1.40911865234375, - "learning_rate": 6.847627909286409e-07, - "loss": -0.0344, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.333932781238413, + "kl": 0.2191162109375, + "learning_rate": 6.905899365017462e-07, + "loss": 0.1709, + "num_tokens": 12265450.0, + "reward": 0.28224857337772846, + "reward_std": 0.13666313188150525, + "rewards/code_reward/mean": 0.28224857337772846, + "rewards/code_reward/std": 0.13666313188150525, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 437 }, { "clip_ratio": 0.0, - "completion_length": 328.515625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.5, + "completions/max_terminated_length": 410.5, + "completions/mean_length": 176.8125, + "completions/mean_terminated_length": 176.8125, + "completions/min_length": 76.25, + "completions/min_terminated_length": 76.25, "epoch": 0.04902756401287253, - "grad_norm": 0.028586652468959688, - "kl": 0.04345703125, - "learning_rate": 6.790222985725761e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.2985967740803328, + "kl": 0.283447265625, + "learning_rate": 6.847627909286409e-07, + "loss": 0.1118, + "num_tokens": 12284524.0, + "reward": 0.40528881177306175, + "reward_std": 0.15535564813762903, + "rewards/code_reward/mean": 0.40528881177306175, + "rewards/code_reward/std": 0.15535564627498388, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 438 }, { "clip_ratio": 0.0, - "completion_length": 335.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.25, + "completions/max_terminated_length": 376.25, + "completions/mean_length": 202.625, + "completions/mean_terminated_length": 202.625, + "completions/min_length": 91.75, + "completions/min_terminated_length": 91.75, "epoch": 0.04913949909052749, - "grad_norm": 0.08766284200366595, - "kl": 0.0438232421875, - "learning_rate": 6.733687002931141e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.1950680004085388, + "kl": 0.315185546875, + "learning_rate": 6.790222985725761e-07, + "loss": 0.025, + "num_tokens": 12306080.0, + "reward": 0.48305153474211693, + "reward_std": 0.12105439510196447, + "rewards/code_reward/mean": 0.48305153474211693, + "rewards/code_reward/std": 0.12105440441519022, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 439 }, { "clip_ratio": 0.0, - "completion_length": 305.109375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.5, + "completions/max_terminated_length": 223.5, + "completions/mean_length": 148.75, + "completions/mean_terminated_length": 148.75, + "completions/min_length": 79.5, + "completions/min_terminated_length": 79.5, "epoch": 0.04925143416818245, - "grad_norm": 2.391400552951245, - "kl": 0.09228515625, - "learning_rate": 6.678022333039158e-07, - "loss": -0.0495, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.7349156594295274, + "kl": 0.23681640625, + "learning_rate": 6.733687002931141e-07, + "loss": -0.0181, + "num_tokens": 12327448.0, + "reward": 0.3754356447607279, + "reward_std": 0.11030078027397394, + "rewards/code_reward/mean": 0.3754356447607279, + "rewards/code_reward/std": 0.11030078679323196, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 440 }, { "clip_ratio": 0.0, - "completion_length": 322.40625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.5, + "completions/max_terminated_length": 367.5, + "completions/mean_length": 209.40625, + "completions/mean_terminated_length": 209.40625, + "completions/min_length": 101.75, + "completions/min_terminated_length": 101.75, "epoch": 0.04936336924583742, - "grad_norm": 0.02480090522832702, - "kl": 0.0423583984375, - "learning_rate": 6.623231311627876e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.4076525318028708, + "kl": 0.3115234375, + "learning_rate": 6.678022333039158e-07, + "loss": -0.0373, + "num_tokens": 12347901.0, + "reward": 0.09711253456771374, + "reward_std": 0.057268128264695406, + "rewards/code_reward/mean": 0.09711253456771374, + "rewards/code_reward/std": 0.05726812733337283, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 441 }, { "clip_ratio": 0.0, - "completion_length": 395.546875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.25, + "completions/max_terminated_length": 265.25, + "completions/mean_length": 161.5, + "completions/mean_terminated_length": 161.5, + "completions/min_length": 74.75, + "completions/min_terminated_length": 74.75, "epoch": 0.049475304323492376, - "grad_norm": 0.019725004180409796, - "kl": 0.0413818359375, - "learning_rate": 6.569316237618811e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.4640386964818788, + "kl": 0.293701171875, + "learning_rate": 6.623231311627876e-07, + "loss": 0.0452, + "num_tokens": 12372045.0, + "reward": 0.4493050128221512, + "reward_std": 0.08245376159902662, + "rewards/code_reward/mean": 0.4493050128221512, + "rewards/code_reward/std": 0.08245376858394593, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 442 }, { "clip_ratio": 0.0, - "completion_length": 359.765625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.25, + "completions/max_terminated_length": 381.25, + "completions/mean_length": 170.96875, + "completions/mean_terminated_length": 170.96875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, "epoch": 0.049587239401147336, - "grad_norm": 0.04162342966221699, - "kl": 0.0498046875, - "learning_rate": 6.516279373180499e-07, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 2.1086386312820666, + "kl": 0.27001953125, + "learning_rate": 6.569316237618811e-07, + "loss": 0.1217, + "num_tokens": 12396860.0, + "reward": 0.34658948611468077, + "reward_std": 0.23270382836926728, + "rewards/code_reward/mean": 0.34658948611468077, + "rewards/code_reward/std": 0.23270384327042848, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 443 }, { "clip_ratio": 0.0, - "completion_length": 344.640625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.75, + "completions/max_terminated_length": 419.75, + "completions/mean_length": 220.5625, + "completions/mean_terminated_length": 220.5625, + "completions/min_length": 120.25, + "completions/min_terminated_length": 120.25, "epoch": 0.049699174478802295, - "grad_norm": 0.02391594067619232, - "kl": 0.0406494140625, - "learning_rate": 6.464122943633543e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.4630455489204783, + "kl": 0.26171875, + "learning_rate": 6.516279373180499e-07, + "loss": 0.2184, + "num_tokens": 12422750.0, + "reward": 0.37283046543598175, + "reward_std": 0.12992971763014793, + "rewards/code_reward/mean": 0.37283046543598175, + "rewards/code_reward/std": 0.12992972321808338, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 444 }, { "clip_ratio": 0.0, - "completion_length": 369.609375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.25, + "completions/max_terminated_length": 268.25, + "completions/mean_length": 143.125, + "completions/mean_terminated_length": 143.125, + "completions/min_length": 72.75, + "completions/min_terminated_length": 72.75, "epoch": 0.049811109556457255, - "grad_norm": 0.026401154596888455, - "kl": 0.03631591796875, - "learning_rate": 6.412849137357271e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.6080266837223955, + "kl": 0.27734375, + "learning_rate": 6.464122943633543e-07, + "loss": -0.0419, + "num_tokens": 12441410.0, + "reward": 0.14340316224843264, + "reward_std": 0.15656377002596855, + "rewards/code_reward/mean": 0.14340316224843264, + "rewards/code_reward/std": 0.15656376257538795, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 445 }, { "clip_ratio": 0.0, - "completion_length": 378.40625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 222.3125, + "completions/mean_terminated_length": 222.3125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, "epoch": 0.049923044634112214, - "grad_norm": 0.02165150509733936, - "kl": 0.04083251953125, - "learning_rate": 6.3624601056979e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.2494209477442706, + "kl": 0.2666015625, + "learning_rate": 6.412849137357271e-07, + "loss": -0.0008, + "num_tokens": 12469060.0, + "reward": 0.41476833214983344, + "reward_std": 0.1326767287682742, + "rewards/code_reward/mean": 0.41476833214983344, + "rewards/code_reward/std": 0.1326767250429839, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 446 }, { "clip_ratio": 0.0, - "completion_length": 424.734375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.25, + "completions/max_terminated_length": 367.25, + "completions/mean_length": 207.5, + "completions/mean_terminated_length": 207.5, + "completions/min_length": 119.5, + "completions/min_terminated_length": 119.5, "epoch": 0.05003497971176717, - "grad_norm": 0.056253725649908846, - "kl": 0.04132080078125, - "learning_rate": 6.312957962878278e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.5486642122578553, + "kl": 0.235107421875, + "learning_rate": 6.3624601056979e-07, + "loss": 0.1428, + "num_tokens": 12493716.0, + "reward": 0.5466772168874741, + "reward_std": 0.3743356466293335, + "rewards/code_reward/mean": 0.5466772168874741, + "rewards/code_reward/std": 0.37433566339313984, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 447 }, { "clip_ratio": 0.0, - "completion_length": 332.90625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.5, + "completions/max_terminated_length": 450.5, + "completions/mean_length": 191.28125, + "completions/mean_terminated_length": 191.28125, + "completions/min_length": 94.5, + "completions/min_terminated_length": 94.5, "epoch": 0.05014691478942213, - "grad_norm": 0.026148724794812197, - "kl": 0.05267333984375, - "learning_rate": 6.264344785909181e-07, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.3722991110183906, + "kl": 0.2452392578125, + "learning_rate": 6.312957962878278e-07, + "loss": 0.2083, + "num_tokens": 12519901.0, + "reward": 0.4153126999735832, + "reward_std": 0.04972913861274719, + "rewards/code_reward/mean": 0.4153126999735832, + "rewards/code_reward/std": 0.04972913861274719, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 448 }, { "clip_ratio": 0.0, - "completion_length": 310.484375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.25, + "completions/max_terminated_length": 242.25, + "completions/mean_length": 156.4375, + "completions/mean_terminated_length": 156.4375, + "completions/min_length": 76.75, + "completions/min_terminated_length": 76.75, "epoch": 0.05025884986707709, - "grad_norm": 0.1390653311408599, - "kl": 0.049560546875, - "learning_rate": 6.216622614502149e-07, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.6733807611752833, + "kl": 0.339599609375, + "learning_rate": 6.264344785909181e-07, + "loss": 0.0653, + "num_tokens": 12537763.0, + "reward": 0.27301738993264735, + "reward_std": 0.14166639209724963, + "rewards/code_reward/mean": 0.27301738993264735, + "rewards/code_reward/std": 0.14166639978066087, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 449 }, { "clip_ratio": 0.0, - "completion_length": 388.671875, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 685.0, + "completions/max_terminated_length": 364.25, + "completions/mean_length": 219.84375, + "completions/mean_terminated_length": 163.70536041259766, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, "epoch": 0.05037078494473206, - "grad_norm": 0.07042561536180422, - "kl": 0.048095703125, - "learning_rate": 6.169793450983916e-07, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.3730882038714862, + "kl": 0.3033447265625, + "learning_rate": 6.216622614502149e-07, + "loss": 0.2151, + "num_tokens": 12564502.0, + "reward": 0.27368341060355306, + "reward_std": 0.12103560357354581, + "rewards/code_reward/mean": 0.27368341060355306, + "rewards/code_reward/std": 0.1210356056690216, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 450 }, { "clip_ratio": 0.0, - "completion_length": 368.046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 192.75, + "completions/mean_terminated_length": 192.75, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, "epoch": 0.05048272002238702, - "grad_norm": 0.8824459143035142, - "kl": 0.163818359375, - "learning_rate": 6.123859260212393e-07, - "loss": 0.0016, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.1120930858197564, + "kl": 0.256103515625, + "learning_rate": 6.169793450983916e-07, + "loss": 0.0663, + "num_tokens": 12595766.0, + "reward": 0.2891203761100769, + "reward_std": 0.005760519183240831, + "rewards/code_reward/mean": 0.2891203761100769, + "rewards/code_reward/std": 0.005760519299656153, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 451 }, { "clip_ratio": 0.0, - "completion_length": 328.421875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.75, + "completions/max_terminated_length": 274.75, + "completions/mean_length": 180.71875, + "completions/mean_terminated_length": 180.71875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, "epoch": 0.05059465510004198, - "grad_norm": 0.9135028043813113, - "kl": 0.13751220703125, - "learning_rate": 6.07882196949423e-07, - "loss": -0.0596, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.5436514492166131, + "kl": 0.3642578125, + "learning_rate": 6.123859260212393e-07, + "loss": 0.07, + "num_tokens": 12617805.0, + "reward": 0.3369871713221073, + "reward_std": 0.1278561158105731, + "rewards/code_reward/mean": 0.3369871713221073, + "rewards/code_reward/std": 0.12785612046718597, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 452 }, { "clip_ratio": 0.0, - "completion_length": 396.40625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.5, + "completions/max_terminated_length": 417.5, + "completions/mean_length": 189.8125, + "completions/mean_terminated_length": 189.8125, + "completions/min_length": 80.25, + "completions/min_terminated_length": 80.25, "epoch": 0.050706590177696936, - "grad_norm": 0.02488219762931012, - "kl": 0.0374755859375, - "learning_rate": 6.034683468503948e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.655439639393973, + "kl": 0.302978515625, + "learning_rate": 6.07882196949423e-07, + "loss": -0.0797, + "num_tokens": 12641655.0, + "reward": 0.19023456424474716, + "reward_std": 0.11184495687484741, + "rewards/code_reward/mean": 0.19023456424474716, + "rewards/code_reward/std": 0.11184496060013771, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 453 }, { "clip_ratio": 0.0, - "completion_length": 339.765625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 267.09375, + "completions/mean_terminated_length": 267.09375, + "completions/min_length": 137.5, + "completions/min_terminated_length": 137.5, "epoch": 0.050818525255351896, - "grad_norm": 0.07775771681049337, - "kl": 0.040863037109375, - "learning_rate": 5.991445609204641e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.0464296647386528, + "kl": 0.1806640625, + "learning_rate": 6.034683468503948e-07, + "loss": -0.0225, + "num_tokens": 12663874.0, + "reward": 0.3470753263682127, + "reward_std": 0.1855549574829638, + "rewards/code_reward/mean": 0.3470753263682127, + "rewards/code_reward/std": 0.1855549574829638, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 454 }, { "clip_ratio": 0.0, - "completion_length": 323.953125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 245.15625, + "completions/mean_terminated_length": 245.15625, + "completions/min_length": 89.75, + "completions/min_terminated_length": 89.75, "epoch": 0.050930460333006855, - "grad_norm": 0.23941770812751062, - "kl": 0.07257080078125, - "learning_rate": 5.949110205770292e-07, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.3181084417709616, + "kl": 0.22265625, + "learning_rate": 5.991445609204641e-07, + "loss": -0.2186, + "num_tokens": 12690983.0, + "reward": 0.3848821893334389, + "reward_std": 0.1244323942810297, + "rewards/code_reward/mean": 0.3848821893334389, + "rewards/code_reward/std": 0.12443239195272326, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 455 }, { "clip_ratio": 0.0, - "completion_length": 377.90625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.5, + "completions/max_terminated_length": 298.5, + "completions/mean_length": 179.1875, + "completions/mean_terminated_length": 179.1875, + "completions/min_length": 97.5, + "completions/min_terminated_length": 97.5, "epoch": 0.051042395410661814, - "grad_norm": 0.838372427480566, - "kl": 0.10174560546875, - "learning_rate": 5.90767903450964e-07, - "loss": -0.0595, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.1842353804957628, + "kl": 0.24755859375, + "learning_rate": 5.949110205770292e-07, + "loss": -0.0126, + "num_tokens": 12714285.0, + "reward": 0.23306879866868258, + "reward_std": 0.058227866189554334, + "rewards/code_reward/mean": 0.23306879866868258, + "rewards/code_reward/std": 0.05822786991484463, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 456 }, { "clip_ratio": 0.0, - "completion_length": 354.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.75, + "completions/max_terminated_length": 424.75, + "completions/mean_length": 212.28125, + "completions/mean_terminated_length": 212.28125, + "completions/min_length": 89.25, + "completions/min_terminated_length": 89.25, "epoch": 0.051154330488316774, - "grad_norm": 0.02696775344751943, - "kl": 0.0364990234375, - "learning_rate": 5.867153833791652e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.361690839229129, + "kl": 0.21875, + "learning_rate": 5.90767903450964e-07, + "loss": 0.0986, + "num_tokens": 12738542.0, + "reward": 0.07210950274020433, + "reward_std": 0.05199644831009209, + "rewards/code_reward/mean": 0.07210950274020433, + "rewards/code_reward/std": 0.05199644842650741, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 457 }, { "clip_ratio": 0.0, - "completion_length": 293.890625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.75, + "completions/max_terminated_length": 288.75, + "completions/mean_length": 172.09375, + "completions/mean_terminated_length": 172.09375, + "completions/min_length": 76.25, + "completions/min_terminated_length": 76.25, "epoch": 0.05126626556597173, - "grad_norm": 2.593781001852504, - "kl": 0.27880859375, - "learning_rate": 5.827536303972587e-07, - "loss": 0.0028, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.627223053243688, + "kl": 0.25048828125, + "learning_rate": 5.867153833791652e-07, + "loss": -0.0443, + "num_tokens": 12761809.0, + "reward": 0.2288264101371169, + "reward_std": 0.18154687527567148, + "rewards/code_reward/mean": 0.2288264101371169, + "rewards/code_reward/std": 0.18154688365757465, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 458 }, { "clip_ratio": 0.0, - "completion_length": 440.28125, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 853.75, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 259.6875, + "completions/mean_terminated_length": 201.59375381469727, + "completions/min_length": 119.25, + "completions/min_terminated_length": 119.25, "epoch": 0.0513782006436267, - "grad_norm": 0.09378689742071065, - "kl": 0.0506591796875, - "learning_rate": 5.78882810732465e-07, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.37901004815236, + "kl": 0.231201171875, + "learning_rate": 5.827536303972587e-07, + "loss": 0.3202, + "num_tokens": 12797623.0, + "reward": 0.4052652306854725, + "reward_std": 0.09384694416075945, + "rewards/code_reward/mean": 0.4052652306854725, + "rewards/code_reward/std": 0.09384695184417069, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 459 }, { "clip_ratio": 0.0, - "completion_length": 286.09375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.5, + "completions/max_terminated_length": 467.5, + "completions/mean_length": 190.0625, + "completions/mean_terminated_length": 190.0625, + "completions/min_length": 88.25, + "completions/min_terminated_length": 88.25, "epoch": 0.05149013572128166, - "grad_norm": 3.1614265708752236, - "kl": 0.16015625, - "learning_rate": 5.75103086796625e-07, - "loss": 0.0016, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.1377121561938153, + "kl": 0.241943359375, + "learning_rate": 5.78882810732465e-07, + "loss": -0.0275, + "num_tokens": 12819217.0, + "reward": 0.45869156159460545, + "reward_std": 0.021421764977276325, + "rewards/code_reward/mean": 0.45869156159460545, + "rewards/code_reward/std": 0.021421766839921474, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 460 }, { "clip_ratio": 0.0, - "completion_length": 311.296875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.25, + "completions/max_terminated_length": 263.25, + "completions/mean_length": 155.9375, + "completions/mean_terminated_length": 155.9375, + "completions/min_length": 84.5, + "completions/min_terminated_length": 84.5, "epoch": 0.05160207079893662, - "grad_norm": 1.7687368403392676, - "kl": 0.1033935546875, - "learning_rate": 5.714146171793846e-07, - "loss": -0.0578, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.1807451529495485, + "kl": 0.306884765625, + "learning_rate": 5.75103086796625e-07, + "loss": -0.0194, + "num_tokens": 12843095.0, + "reward": 0.018822902347892523, + "reward_std": 0.015156067907810211, + "rewards/code_reward/mean": 0.018822902347892523, + "rewards/code_reward/std": 0.015156067907810211, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 461 }, { "clip_ratio": 0.0, - "completion_length": 377.265625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 258.4375, + "completions/mean_terminated_length": 258.4375, + "completions/min_length": 161.25, + "completions/min_terminated_length": 161.25, "epoch": 0.05171400587659158, - "grad_norm": 0.049362194712489586, - "kl": 0.035614013671875, - "learning_rate": 5.678175566415422e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.336417383594584, + "kl": 0.199951171875, + "learning_rate": 5.714146171793846e-07, + "loss": 0.1694, + "num_tokens": 12866173.0, + "reward": 0.13850605115294456, + "reward_std": 0.0918186865746975, + "rewards/code_reward/mean": 0.13850605115294456, + "rewards/code_reward/std": 0.09181869029998779, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 462 }, { "clip_ratio": 0.0, - "completion_length": 364.171875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.5, + "completions/max_terminated_length": 452.5, + "completions/mean_length": 190.40625, + "completions/mean_terminated_length": 190.40625, + "completions/min_length": 83.75, + "completions/min_terminated_length": 83.75, "epoch": 0.05182594095424654, - "grad_norm": 0.02337468550122759, - "kl": 0.0418701171875, - "learning_rate": 5.643120561085528e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.7230754097104986, + "kl": 0.298583984375, + "learning_rate": 5.678175566415422e-07, + "loss": 0.0863, + "num_tokens": 12892290.0, + "reward": 0.33602308854460716, + "reward_std": 0.07540364377200603, + "rewards/code_reward/mean": 0.33602308854460716, + "rewards/code_reward/std": 0.07540364749729633, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 463 }, { "clip_ratio": 0.0, - "completion_length": 286.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.75, + "completions/max_terminated_length": 373.75, + "completions/mean_length": 159.21875, + "completions/mean_terminated_length": 159.21875, + "completions/min_length": 69.25, + "completions/min_terminated_length": 69.25, "epoch": 0.051937876031901496, - "grad_norm": 0.19211609005932148, - "kl": 0.06243896484375, - "learning_rate": 5.608982626641991e-07, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.5832482382803177, + "kl": 0.1796875, + "learning_rate": 5.643120561085528e-07, + "loss": -0.0099, + "num_tokens": 12911025.0, + "reward": 0.5693264603614807, + "reward_std": 0.09451888594776392, + "rewards/code_reward/mean": 0.5693264603614807, + "rewards/code_reward/std": 0.09451888781040907, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 464 }, { "clip_ratio": 0.0, - "completion_length": 460.9375, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 771.25, + "completions/max_terminated_length": 698.25, + "completions/mean_length": 317.71875, + "completions/mean_terminated_length": 270.93304443359375, + "completions/min_length": 121.25, + "completions/min_terminated_length": 121.25, "epoch": 0.052049811109556456, - "grad_norm": 0.02527496415068125, - "kl": 0.0364990234375, - "learning_rate": 5.575763195444166e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.5755700855439375, + "kl": 0.2149658203125, + "learning_rate": 5.608982626641991e-07, + "loss": 0.033, + "num_tokens": 12946576.0, + "reward": 0.3332868255674839, + "reward_std": 0.08039725571870804, + "rewards/code_reward/mean": 0.3332868255674839, + "rewards/code_reward/std": 0.08039725571870804, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 465 }, { "clip_ratio": 0.0, - "completion_length": 337.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.25, + "completions/max_terminated_length": 229.25, + "completions/mean_length": 158.65625, + "completions/mean_terminated_length": 158.65625, + "completions/min_length": 84.25, + "completions/min_terminated_length": 84.25, "epoch": 0.052161746187211415, - "grad_norm": 0.02483601186922771, - "kl": 0.0379638671875, - "learning_rate": 5.543463661312847e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.4999164462281511, + "kl": 0.25244140625, + "learning_rate": 5.575763195444166e-07, + "loss": 0.1101, + "num_tokens": 12962893.0, + "reward": 0.26572345197200775, + "reward_std": 0.08325213519856334, + "rewards/code_reward/mean": 0.26572345197200775, + "rewards/code_reward/std": 0.08325213845819235, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 466 }, { "clip_ratio": 0.0, - "completion_length": 382.203125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.75, + "completions/max_terminated_length": 256.75, + "completions/mean_length": 156.65625, + "completions/mean_terminated_length": 156.65625, + "completions/min_length": 84.25, + "completions/min_terminated_length": 84.25, "epoch": 0.052273681264866374, - "grad_norm": 0.030569824177275738, - "kl": 0.036956787109375, - "learning_rate": 5.512085379471808e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.5654218625865297, + "kl": 0.29931640625, + "learning_rate": 5.543463661312847e-07, + "loss": 0.0124, + "num_tokens": 12989394.0, + "reward": 0.4510860964655876, + "reward_std": 0.038132989313453436, + "rewards/code_reward/mean": 0.4510860964655876, + "rewards/code_reward/std": 0.03813299024477601, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 467 }, { "clip_ratio": 0.0, - "completion_length": 372.96875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.75, + "completions/max_terminated_length": 284.75, + "completions/mean_length": 189.0625, + "completions/mean_terminated_length": 189.0625, + "completions/min_length": 110.25, + "completions/min_terminated_length": 110.25, "epoch": 0.05238561634252134, - "grad_norm": 0.870995271718928, - "kl": 0.053680419921875, - "learning_rate": 5.481629666490903e-07, - "loss": -0.0283, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.1317457646109947, + "kl": 0.223388671875, + "learning_rate": 5.512085379471808e-07, + "loss": -0.0249, + "num_tokens": 13014908.0, + "reward": 0.3786849081516266, + "reward_std": 0.1657154718413949, + "rewards/code_reward/mean": 0.3786849081516266, + "rewards/code_reward/std": 0.16571548115462065, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 468 }, { "clip_ratio": 0.0, - "completion_length": 255.546875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.25, + "completions/max_terminated_length": 308.25, + "completions/mean_length": 182.21875, + "completions/mean_terminated_length": 182.21875, + "completions/min_length": 90.75, + "completions/min_terminated_length": 90.75, "epoch": 0.0524975514201763, - "grad_norm": 0.054911233572157374, - "kl": 0.04248046875, - "learning_rate": 5.452097800230853e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.3580873503854825, + "kl": 0.2578125, + "learning_rate": 5.481629666490903e-07, + "loss": 0.041, + "num_tokens": 13033779.0, + "reward": 0.6068142354488373, + "reward_std": 0.21611789241433144, + "rewards/code_reward/mean": 0.6068142354488373, + "rewards/code_reward/std": 0.21611790172755718, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 469 }, { "clip_ratio": 0.0, - "completion_length": 365.671875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.75, + "completions/max_terminated_length": 379.75, + "completions/mean_length": 216.59375, + "completions/mean_terminated_length": 216.59375, + "completions/min_length": 120.5, + "completions/min_terminated_length": 120.5, "epoch": 0.05260948649783126, - "grad_norm": 0.9189343613920824, - "kl": 0.35272216796875, - "learning_rate": 5.423491019789623e-07, - "loss": -0.0551, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.2530077335631926, + "kl": 0.28271484375, + "learning_rate": 5.452097800230853e-07, + "loss": 0.0203, + "num_tokens": 13058070.0, + "reward": 0.4219468259252608, + "reward_std": 0.08188007143326104, + "rewards/code_reward/mean": 0.4219468259252608, + "rewards/code_reward/std": 0.08188007143326104, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 470 }, { "clip_ratio": 0.0, - "completion_length": 364.390625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.25, + "completions/max_terminated_length": 373.25, + "completions/mean_length": 230.09375, + "completions/mean_terminated_length": 230.09375, + "completions/min_length": 153.25, + "completions/min_terminated_length": 153.25, "epoch": 0.05272142157548622, - "grad_norm": 0.0633802951576789, - "kl": 0.03961181640625, - "learning_rate": 5.395810525450425e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.8321679081555318, + "kl": 0.239013671875, + "learning_rate": 5.423491019789623e-07, + "loss": -0.1368, + "num_tokens": 13082769.0, + "reward": 0.30861951038241386, + "reward_std": 0.16406975965946913, + "rewards/code_reward/mean": 0.30861951038241386, + "rewards/code_reward/std": 0.16406976664438844, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 471 }, { "clip_ratio": 0.0, - "completion_length": 332.03125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.75, + "completions/max_terminated_length": 404.75, + "completions/mean_length": 182.65625, + "completions/mean_terminated_length": 182.65625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, "epoch": 0.05283335665314118, - "grad_norm": 0.1011785822244599, - "kl": 0.053466796875, - "learning_rate": 5.369057478631359e-07, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.9729476481805254, + "kl": 0.255126953125, + "learning_rate": 5.395810525450425e-07, + "loss": 0.0919, + "num_tokens": 13106534.0, + "reward": 0.21364107308909297, + "reward_std": 0.09037529258057475, + "rewards/code_reward/mean": 0.21364107308909297, + "rewards/code_reward/std": 0.09037529304623604, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 472 }, { "clip_ratio": 0.0, - "completion_length": 287.609375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.5, + "completions/max_terminated_length": 315.5, + "completions/mean_length": 170.96875, + "completions/mean_terminated_length": 170.96875, + "completions/min_length": 92.75, + "completions/min_terminated_length": 92.75, "epoch": 0.05294529173079614, - "grad_norm": 0.13769887874885445, - "kl": 0.0439453125, - "learning_rate": 5.343233001836694e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.9407242390344841, + "kl": 0.236083984375, + "learning_rate": 5.369057478631359e-07, + "loss": 0.0076, + "num_tokens": 13125717.0, + "reward": 0.18790849673678167, + "reward_std": 0.11648390302434564, + "rewards/code_reward/mean": 0.18790849673678167, + "rewards/code_reward/std": 0.11648390302434564, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 473 }, { "clip_ratio": 0.0, - "completion_length": 387.34375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 287.9375, + "completions/mean_terminated_length": 287.9375, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, "epoch": 0.0530572268084511, - "grad_norm": 0.09302862947626754, - "kl": 0.03985595703125, - "learning_rate": 5.318338178609754e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.6022039821926372, + "kl": 0.2093505859375, + "learning_rate": 5.343233001836694e-07, + "loss": -0.0311, + "num_tokens": 13152515.0, + "reward": 0.46875, + "reward_std": 0.0883883461356163, + "rewards/code_reward/mean": 0.46875, + "rewards/code_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 474 }, { "clip_ratio": 0.0, - "completion_length": 374.40625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.75, + "completions/max_terminated_length": 255.75, + "completions/mean_length": 157.5625, + "completions/mean_terminated_length": 157.5625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, "epoch": 0.053169161886106056, - "grad_norm": 0.1263278332387542, - "kl": 0.062255859375, - "learning_rate": 5.294374053487459e-07, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.3145736475742311, + "kl": 0.292236328125, + "learning_rate": 5.318338178609754e-07, + "loss": -0.0802, + "num_tokens": 13178797.0, + "reward": 0.07483806018717587, + "reward_std": 0.034171308507211506, + "rewards/code_reward/mean": 0.07483806018717587, + "rewards/code_reward/std": 0.03417130699381232, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 475 }, { "clip_ratio": 0.0, - "completion_length": 358.015625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.25, + "completions/max_terminated_length": 396.25, + "completions/mean_length": 235.125, + "completions/mean_terminated_length": 235.125, + "completions/min_length": 119.25, + "completions/min_terminated_length": 119.25, "epoch": 0.053281096963761015, - "grad_norm": 0.020700901911474265, - "kl": 0.0355224609375, - "learning_rate": 5.271341631956511e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.0884074198105211, + "kl": 0.156005859375, + "learning_rate": 5.294374053487459e-07, + "loss": 0.0987, + "num_tokens": 13203897.0, + "reward": 0.15914655849337578, + "reward_std": 0.08056560717523098, + "rewards/code_reward/mean": 0.15914655849337578, + "rewards/code_reward/std": 0.0805656099691987, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 476 }, { "clip_ratio": 0.0, - "completion_length": 401.046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.75, + "completions/max_terminated_length": 263.75, + "completions/mean_length": 168.9375, + "completions/mean_terminated_length": 168.9375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, "epoch": 0.05339303204141598, - "grad_norm": 0.027139069654941325, - "kl": 0.0396728515625, - "learning_rate": 5.249241880411181e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.9450638032428148, + "kl": 0.3330078125, + "learning_rate": 5.271341631956511e-07, + "loss": -0.038, + "num_tokens": 13233671.0, + "reward": 0.5676594115793705, + "reward_std": 0.0675080195069313, + "rewards/code_reward/mean": 0.5676594115793705, + "rewards/code_reward/std": 0.0675080232322216, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 477 }, { "clip_ratio": 0.0, - "completion_length": 515.703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.75, + "completions/max_terminated_length": 334.75, + "completions/mean_length": 192.8125, + "completions/mean_terminated_length": 192.8125, + "completions/min_length": 88.75, + "completions/min_terminated_length": 88.75, "epoch": 0.05350496711907094, - "grad_norm": 0.04265200650936088, - "kl": 0.060302734375, - "learning_rate": 5.228075726112785e-07, - "loss": 0.0006, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.336348928731024, + "kl": 0.270751953125, + "learning_rate": 5.249241880411181e-07, + "loss": -0.0158, + "num_tokens": 13265057.0, + "reward": 0.381644893437624, + "reward_std": 0.09550960175693035, + "rewards/code_reward/mean": 0.381644893437624, + "rewards/code_reward/std": 0.09550959896296263, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 478 }, { "clip_ratio": 0.0, - "completion_length": 386.640625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.25, + "completions/max_terminated_length": 299.25, + "completions/mean_length": 193.1875, + "completions/mean_terminated_length": 193.1875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, "epoch": 0.0536169021967259, - "grad_norm": 0.025504104198776033, - "kl": 0.03570556640625, - "learning_rate": 5.207844057150768e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.5049532201349622, + "kl": 0.23974609375, + "learning_rate": 5.228075726112785e-07, + "loss": 0.0894, + "num_tokens": 13285927.0, + "reward": 0.16660759504884481, + "reward_std": 0.04511617706157267, + "rewards/code_reward/mean": 0.16660759504884481, + "rewards/code_reward/std": 0.04511618078686297, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 479 }, { "clip_ratio": 0.0, - "completion_length": 213.78125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.5, + "completions/max_terminated_length": 325.5, + "completions/mean_length": 193.0, + "completions/mean_terminated_length": 193.0, + "completions/min_length": 116.75, + "completions/min_terminated_length": 116.75, "epoch": 0.05372883727438086, - "grad_norm": 18.185082196534506, - "kl": 1.0419921875, - "learning_rate": 5.188547722405437e-07, - "loss": 0.0419, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.3864869015820263, + "kl": 0.21337890625, + "learning_rate": 5.207844057150768e-07, + "loss": 0.158, + "num_tokens": 13303407.0, + "reward": 0.6197916716337204, + "reward_std": 0.20998739823698997, + "rewards/code_reward/mean": 0.6197916716337204, + "rewards/code_reward/std": 0.20998739078640938, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 480 }, { "clip_ratio": 0.0, - "completion_length": 390.96875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.5, + "completions/max_terminated_length": 265.5, + "completions/mean_length": 170.75, + "completions/mean_terminated_length": 170.75, + "completions/min_length": 92.75, + "completions/min_terminated_length": 92.75, "epoch": 0.05384077235203582, - "grad_norm": 0.03327137868690443, - "kl": 0.0408935546875, - "learning_rate": 5.170187531512351e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.7579933265437377, + "kl": 0.256591796875, + "learning_rate": 5.188547722405437e-07, + "loss": 0.0498, + "num_tokens": 13323183.0, + "reward": 0.30053258687257767, + "reward_std": 0.14006465952843428, + "rewards/code_reward/mean": 0.30053258687257767, + "rewards/code_reward/std": 0.1400646585971117, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 481 }, { "clip_ratio": 0.0, - "completion_length": 426.703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.75, + "completions/max_terminated_length": 254.75, + "completions/mean_length": 146.4375, + "completions/mean_terminated_length": 146.4375, + "completions/min_length": 70.25, + "completions/min_terminated_length": 70.25, "epoch": 0.05395270742969078, - "grad_norm": 1.7420279438246644, - "kl": 1.31329345703125, - "learning_rate": 5.152764254828348e-07, - "loss": -0.0354, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.4622693544793195, + "kl": 0.31591796875, + "learning_rate": 5.170187531512351e-07, + "loss": 0.0813, + "num_tokens": 13348197.0, + "reward": 0.2889851483050734, + "reward_std": 0.04471541300881654, + "rewards/code_reward/mean": 0.2889851483050734, + "rewards/code_reward/std": 0.044715409399941564, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 482 }, { "clip_ratio": 0.0, - "completion_length": 432.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.5, + "completions/max_terminated_length": 292.5, + "completions/mean_length": 178.875, + "completions/mean_terminated_length": 178.875, + "completions/min_length": 100.75, + "completions/min_terminated_length": 100.75, "epoch": 0.05406464250734574, - "grad_norm": 0.33787614688903167, - "kl": 0.06884765625, - "learning_rate": 5.136278623399225e-07, - "loss": 0.0007, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.1459398821291238, + "kl": 0.280029296875, + "learning_rate": 5.152764254828348e-07, + "loss": 0.1023, + "num_tokens": 13372969.0, + "reward": 0.5509072579443455, + "reward_std": 0.1530819907784462, + "rewards/code_reward/mean": 0.5509072579443455, + "rewards/code_reward/std": 0.1530819982290268, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 483 }, { "clip_ratio": 0.0, - "completion_length": 291.515625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.25, + "completions/max_terminated_length": 220.25, + "completions/mean_length": 151.3125, + "completions/mean_terminated_length": 151.3125, + "completions/min_length": 80.5, + "completions/min_terminated_length": 80.5, "epoch": 0.0541765775850007, - "grad_norm": 0.023809496275349114, - "kl": 0.05426025390625, - "learning_rate": 5.120731328929058e-07, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.5763809825687893, + "kl": 0.2509765625, + "learning_rate": 5.136278623399225e-07, + "loss": -0.0076, + "num_tokens": 13397611.0, + "reward": 0.43999266996979713, + "reward_std": 0.15885511133819818, + "rewards/code_reward/mean": 0.43999266996979713, + "rewards/code_reward/std": 0.15885510575026274, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 484 }, { "clip_ratio": 0.0, - "completion_length": 383.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.25, + "completions/max_terminated_length": 348.25, + "completions/mean_length": 190.71875, + "completions/mean_terminated_length": 190.71875, + "completions/min_length": 70.25, + "completions/min_terminated_length": 70.25, "epoch": 0.05428851266265566, - "grad_norm": 0.015058815890629395, - "kl": 0.0362548828125, - "learning_rate": 5.106123023751187e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.543893067503445, + "kl": 0.22705078125, + "learning_rate": 5.120731328929058e-07, + "loss": 0.1822, + "num_tokens": 13421994.0, + "reward": 0.4371974468231201, + "reward_std": 0.06127816252410412, + "rewards/code_reward/mean": 0.4371974468231201, + "rewards/code_reward/std": 0.06127816252410412, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 485 }, { "clip_ratio": 0.0, - "completion_length": 373.515625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 158.1875, + "completions/mean_terminated_length": 158.1875, + "completions/min_length": 79.25, + "completions/min_terminated_length": 79.25, "epoch": 0.05440044774031062, - "grad_norm": 0.019845886834378008, - "kl": 0.03704833984375, - "learning_rate": 5.092454320800833e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 2.287656708183697, + "kl": 0.248046875, + "learning_rate": 5.106123023751187e-07, + "loss": 0.1385, + "num_tokens": 13446792.0, + "reward": 0.37210020469501615, + "reward_std": 0.13278006156906486, + "rewards/code_reward/mean": 0.37210020469501615, + "rewards/code_reward/std": 0.132780060172081, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 486 }, { "clip_ratio": 0.0, - "completion_length": 336.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.75, + "completions/max_terminated_length": 296.75, + "completions/mean_length": 184.6875, + "completions/mean_terminated_length": 184.6875, + "completions/min_length": 116.75, + "completions/min_terminated_length": 116.75, "epoch": 0.05451238281796558, - "grad_norm": 0.05683684336283696, - "kl": 0.0426025390625, - "learning_rate": 5.079725793589405e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.5495749732421749, + "kl": 0.2958984375, + "learning_rate": 5.092454320800833e-07, + "loss": 0.0935, + "num_tokens": 13472366.0, + "reward": 0.29983099177479744, + "reward_std": 0.1254198516253382, + "rewards/code_reward/mean": 0.29983099177479744, + "rewards/code_reward/std": 0.1254198516253382, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 487 }, { "clip_ratio": 0.0, - "completion_length": 286.71875, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 726.5, + "completions/max_terminated_length": 252.75, + "completions/mean_length": 214.78125, + "completions/mean_terminated_length": 153.8258934020996, + "completions/min_length": 74.75, + "completions/min_terminated_length": 74.75, "epoch": 0.05462431789562054, - "grad_norm": 0.03418640906474549, - "kl": 0.04443359375, - "learning_rate": 5.067937976180407e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.1195840237499781, + "kl": 0.3201904296875, + "learning_rate": 5.079725793589405e-07, + "loss": 0.0209, + "num_tokens": 13498479.0, + "reward": 0.5496091386303306, + "reward_std": 0.08358209393918514, + "rewards/code_reward/mean": 0.5496091386303306, + "rewards/code_reward/std": 0.08358209580183029, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 488 }, { "clip_ratio": 0.0, - "completion_length": 318.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.5, + "completions/max_terminated_length": 250.5, + "completions/mean_length": 134.15625, + "completions/mean_terminated_length": 134.15625, + "completions/min_length": 74.75, + "completions/min_terminated_length": 74.75, "epoch": 0.0547362529732755, - "grad_norm": 0.036114039198282856, - "kl": 0.05450439453125, - "learning_rate": 5.057091363167046e-07, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.6494391124504628, + "kl": 0.277099609375, + "learning_rate": 5.067937976180407e-07, + "loss": 0.158, + "num_tokens": 13520068.0, + "reward": 0.4070262387394905, + "reward_std": 0.1368686156347394, + "rewards/code_reward/mean": 0.4070262387394905, + "rewards/code_reward/std": 0.13686862308532, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 489 }, { "clip_ratio": 0.0, - "completion_length": 376.90625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 142.4375, + "completions/mean_terminated_length": 142.4375, + "completions/min_length": 90.75, + "completions/min_terminated_length": 90.75, "epoch": 0.05484818805093046, - "grad_norm": 0.023332365324867536, - "kl": 0.03631591796875, - "learning_rate": 5.047186409651489e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.8021016908957268, + "kl": 0.306640625, + "learning_rate": 5.057091363167046e-07, + "loss": -0.0293, + "num_tokens": 13540754.0, + "reward": 0.2056608572602272, + "reward_std": 0.09596531838178635, + "rewards/code_reward/mean": 0.2056608572602272, + "rewards/code_reward/std": 0.09596531558781862, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 490 }, { "clip_ratio": 0.0, - "completion_length": 388.671875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.75, + "completions/max_terminated_length": 342.75, + "completions/mean_length": 205.40625, + "completions/mean_terminated_length": 205.40625, + "completions/min_length": 105.25, + "completions/min_terminated_length": 105.25, "epoch": 0.05496012312858542, - "grad_norm": 13.172841203218344, - "kl": 2.97564697265625, - "learning_rate": 5.038223531225742e-07, - "loss": 0.0339, - "reward": 0.09687500074505806, - "reward_std": 0.012500000186264515, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.96875, + "grad_norm": 1.1900185406401222, + "kl": 0.287109375, + "learning_rate": 5.047186409651489e-07, + "loss": -0.002, + "num_tokens": 13564991.0, + "reward": 0.4886061754077673, + "reward_std": 0.2226488906890154, + "rewards/code_reward/mean": 0.4886061754077673, + "rewards/code_reward/std": 0.22264889813959599, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 491 }, { "clip_ratio": 0.0, - "completion_length": 310.796875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.5, + "completions/max_terminated_length": 295.5, + "completions/mean_length": 171.8125, + "completions/mean_terminated_length": 171.8125, + "completions/min_length": 101.75, + "completions/min_terminated_length": 101.75, "epoch": 0.05507205820624038, - "grad_norm": 9.652533322003624, - "kl": 0.4891357421875, - "learning_rate": 5.030203103954232e-07, - "loss": -0.0404, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.2327221390909047, + "kl": 0.21728515625, + "learning_rate": 5.038223531225742e-07, + "loss": -0.0388, + "num_tokens": 13586065.0, + "reward": 0.5599798411130905, + "reward_std": 0.1288916040211916, + "rewards/code_reward/mean": 0.5599798411130905, + "rewards/code_reward/std": 0.12889160588383675, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 492 }, { "clip_ratio": 0.0, - "completion_length": 302.296875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.5, + "completions/max_terminated_length": 265.5, + "completions/mean_length": 154.625, + "completions/mean_terminated_length": 154.625, + "completions/min_length": 77.25, + "completions/min_terminated_length": 77.25, "epoch": 0.05518399328389534, - "grad_norm": 0.030140393405088423, - "kl": 0.0369873046875, - "learning_rate": 5.023125464358026e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.7391260110979152, + "kl": 0.26513671875, + "learning_rate": 5.030203103954232e-07, + "loss": -0.1875, + "num_tokens": 13605021.0, + "reward": 0.431189201772213, + "reward_std": 0.26025911793112755, + "rewards/code_reward/mean": 0.431189201772213, + "rewards/code_reward/std": 0.2602591188624501, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 493 }, { "clip_ratio": 0.0, - "completion_length": 353.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.25, + "completions/max_terminated_length": 289.25, + "completions/mean_length": 161.4375, + "completions/mean_terminated_length": 161.4375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, "epoch": 0.0552959283615503, - "grad_norm": 0.02422588912502369, - "kl": 0.037200927734375, - "learning_rate": 5.016990909400709e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.4124131086208604, + "kl": 0.2646484375, + "learning_rate": 5.023125464358026e-07, + "loss": 0.0666, + "num_tokens": 13624443.0, + "reward": 0.33201567456126213, + "reward_std": 0.02758226078003645, + "rewards/code_reward/mean": 0.33201567456126213, + "rewards/code_reward/std": 0.027582260314375162, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 494 }, { "clip_ratio": 0.0, - "completion_length": 370.34375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.25, + "completions/max_terminated_length": 228.25, + "completions/mean_length": 148.9375, + "completions/mean_terminated_length": 148.9375, + "completions/min_length": 84.5, + "completions/min_terminated_length": 84.5, "epoch": 0.055407863439205264, - "grad_norm": 0.0372886708561457, - "kl": 0.05389404296875, - "learning_rate": 5.011799696475915e-07, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.6145368535030744, + "kl": 0.35107421875, + "learning_rate": 5.016990909400709e-07, + "loss": -0.0059, + "num_tokens": 13651457.0, + "reward": 0.326155461370945, + "reward_std": 0.1283545382320881, + "rewards/code_reward/mean": 0.326155461370945, + "rewards/code_reward/std": 0.1283545382320881, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 495 }, { "clip_ratio": 0.0, - "completion_length": 384.703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.5, + "completions/max_terminated_length": 320.5, + "completions/mean_length": 155.40625, + "completions/mean_terminated_length": 155.40625, + "completions/min_length": 78.75, + "completions/min_terminated_length": 78.75, "epoch": 0.05551979851686022, - "grad_norm": 0.01359500582951308, - "kl": 0.035980224609375, - "learning_rate": 5.007552043396547e-07, - "loss": 0.0004, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.6838493896543587, + "kl": 0.251220703125, + "learning_rate": 5.011799696475915e-07, + "loss": 0.0376, + "num_tokens": 13676038.0, + "reward": 0.4704548120498657, + "reward_std": 0.23622475564479828, + "rewards/code_reward/mean": 0.4704548120498657, + "rewards/code_reward/std": 0.23622475564479828, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 496 }, { "clip_ratio": 0.0, - "completion_length": 400.46875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 189.15625, + "completions/mean_terminated_length": 189.15625, + "completions/min_length": 118.75, + "completions/min_terminated_length": 118.75, "epoch": 0.05563173359451518, - "grad_norm": 0.018893589493420345, - "kl": 0.04736328125, - "learning_rate": 5.004248128385618e-07, - "loss": 0.0005, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 1.7113766777536588, + "kl": 0.2880859375, + "learning_rate": 5.007552043396547e-07, + "loss": -0.0331, + "num_tokens": 13705947.0, + "reward": 0.404338245280087, + "reward_std": 0.2840197389014065, + "rewards/code_reward/mean": 0.404338245280087, + "rewards/code_reward/std": 0.2840197426266968, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 497 }, { "clip_ratio": 0.0, - "completion_length": 356.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.75, + "completions/max_terminated_length": 386.75, + "completions/mean_length": 227.40625, + "completions/mean_terminated_length": 227.40625, + "completions/min_length": 119.25, + "completions/min_terminated_length": 119.25, "epoch": 0.05574366867217014, - "grad_norm": 2.740738937008755, - "kl": 0.7550048828125, - "learning_rate": 5.001888090068784e-07, - "loss": -0.0421, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.3321241616316153, + "kl": 0.192626953125, + "learning_rate": 5.004248128385618e-07, + "loss": 0.1036, + "num_tokens": 13729656.0, + "reward": 0.19941096380352974, + "reward_std": 0.1264824215322733, + "rewards/code_reward/mean": 0.19941096380352974, + "rewards/code_reward/std": 0.1264824327081442, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 498 }, { "clip_ratio": 0.0, - "completion_length": 386.421875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.5, + "completions/max_terminated_length": 302.5, + "completions/mean_length": 159.75, + "completions/mean_terminated_length": 159.75, + "completions/min_length": 73.5, + "completions/min_terminated_length": 73.5, "epoch": 0.0558556037498251, - "grad_norm": 0.9492236255194394, - "kl": 0.24884033203125, - "learning_rate": 5.000472027468528e-07, - "loss": -0.0556, - "reward": 0.09843750111758709, - "reward_std": 0.0062500000931322575, - "rewards/code_reward": 0.0, - "rewards/format_reward": 0.984375, + "grad_norm": 1.4751264574193441, + "kl": 0.318359375, + "learning_rate": 5.001888090068784e-07, + "loss": -0.0364, + "num_tokens": 13749472.0, + "reward": 0.44989876449108124, + "reward_std": 0.047168461605906487, + "rewards/code_reward/mean": 0.44989876449108124, + "rewards/code_reward/std": 0.04716846067458391, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 499 }, { "clip_ratio": 0.0, - "completion_length": 344.65625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 119.34375, + "completions/mean_terminated_length": 119.34375, + "completions/min_length": 56.75, + "completions/min_terminated_length": 56.75, "epoch": 0.05596753882748006, - "grad_norm": 0.024972922874043218, - "kl": 0.033905029296875, - "learning_rate": 5.000000000000001e-07, - "loss": 0.0003, - "reward": 0.10000000149011612, - "reward_std": 0.0, - "rewards/code_reward": 0.0, - "rewards/format_reward": 1.0, + "grad_norm": 0.48660836341370794, + "kl": 0.421875, + "learning_rate": 5.000472027468528e-07, + "loss": 0.0205, + "num_tokens": 13768043.0, + "reward": 0.5847536753863096, + "reward_std": 0.04997873678803444, + "rewards/code_reward/mean": 0.5847536753863096, + "rewards/code_reward/std": 0.04997873678803444, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, "step": 500 }, { "epoch": 0.05596753882748006, "step": 500, "total_flos": 0.0, - "train_loss": 0.028706138839246706, - "train_runtime": 9937.8104, - "train_samples_per_second": 3.22, - "train_steps_per_second": 0.05 + "train_loss": 0.02433196935596061, + "train_runtime": 50427.8575, + "train_samples_per_second": 0.317, + "train_steps_per_second": 0.01 } ], "logging_steps": 1, "max_steps": 500, - "num_input_tokens_seen": 0, + "num_input_tokens_seen": 13768043, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { @@ -7036,7 +11537,7 @@ } }, "total_flos": 0.0, - "train_batch_size": 16, + "train_batch_size": 8, "trial_name": null, "trial_params": null }