diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7042 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.05596753882748006, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 624.28125, + "epoch": 0.00011193507765496012, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.3333333333333335e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.90625, + "epoch": 0.00022387015530992023, + "grad_norm": 0.7040169045969362, + "kl": 0.0, + "learning_rate": 6.666666666666667e-07, + "loss": 0.0601, + "reward": 0.007812500116415322, + "reward_std": 0.01743034040555358, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.078125, + "step": 2 + }, + { + "clip_ratio": 0.0, + "completion_length": 444.75, + "epoch": 0.00033580523296488035, + "grad_norm": 0.8629215889276903, + "kl": 0.00027060508728027344, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.0038, + "reward": 0.004687500069849193, + "reward_std": 0.018750000279396772, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.046875, + "step": 3 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.078125, + "epoch": 0.00044774031061984047, + "grad_norm": 0.7016431850799195, + "kl": 0.00026345252990722656, + "learning_rate": 1.3333333333333334e-06, + "loss": 0.0435, + "reward": 0.006250000209547579, + "reward_std": 0.016327823046594858, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.0625, + "step": 4 + }, + { + "clip_ratio": 0.0, + "completion_length": 466.0, + "epoch": 0.0005596753882748006, + "grad_norm": 0.4574220548610273, + "kl": 0.0002818107604980469, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.007, + "reward": 0.0031250000465661287, + "reward_std": 0.008539125323295593, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.03125, + "step": 5 + }, + { + "clip_ratio": 0.0, + "completion_length": 495.09375, + "epoch": 0.0006716104659297607, + "grad_norm": 0.5434000185158782, + "kl": 0.0002865791320800781, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0427, + "reward": 0.0031250000465661287, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.03125, + "step": 6 + }, + { + "clip_ratio": 0.0, + "completion_length": 509.765625, + "epoch": 0.0007835455435847208, + "grad_norm": 7.737527191229261, + "kl": 0.0010457038879394531, + "learning_rate": 2.3333333333333336e-06, + "loss": -0.0273, + "reward": 0.0031250000465661287, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.03125, + "step": 7 + }, + { + "clip_ratio": 0.0, + "completion_length": 401.4375, + "epoch": 0.0008954806212396809, + "grad_norm": 0.9652984425394716, + "kl": 0.000743865966796875, + "learning_rate": 2.666666666666667e-06, + "loss": 0.0031, + "reward": 0.0062500000931322575, + "reward_std": 0.021039125509560108, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.0625, + "step": 8 + }, + { + "clip_ratio": 0.0, + "completion_length": 631.203125, + "epoch": 0.0010074156988946412, + "grad_norm": 0.565508681156182, + "kl": 0.001064300537109375, + "learning_rate": 3e-06, + "loss": 0.0314, + "reward": 0.004687500069849193, + "reward_std": 0.01478912541642785, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.046875, + "step": 9 + }, + { + "clip_ratio": 0.0, + "completion_length": 509.109375, + "epoch": 0.0011193507765496012, + "grad_norm": 29.074260549966585, + "kl": 0.7770309448242188, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.0532, + "reward": 0.02031250041909516, + "reward_std": 0.04097762983292341, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.203125, + "step": 10 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.96875, + "epoch": 0.0012312858542045614, + "grad_norm": 1.335155005624375, + "kl": 0.03216552734375, + "learning_rate": 3.6666666666666666e-06, + "loss": 0.0838, + "reward": 0.025000000605359674, + "reward_std": 0.043084788136184216, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.25, + "step": 11 + }, + { + "clip_ratio": 0.0, + "completion_length": 469.265625, + "epoch": 0.0013432209318595214, + "grad_norm": 0.9728533904128814, + "kl": 0.11004638671875, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0547, + "reward": 0.02187500020954758, + "reward_std": 0.04057852132245898, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.21875, + "step": 12 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.71875, + "epoch": 0.0014551560095144816, + "grad_norm": 3.1109091298399707, + "kl": 0.34326171875, + "learning_rate": 4.333333333333334e-06, + "loss": 0.1651, + "reward": 0.05625000037252903, + "reward_std": 0.04977653082460165, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.5625, + "step": 13 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.4375, + "epoch": 0.0015670910871694416, + "grad_norm": 1.4333106697160516, + "kl": 0.2135009765625, + "learning_rate": 4.666666666666667e-06, + "loss": 0.0872, + "reward": 0.06718750111758709, + "reward_std": 0.04840352013707161, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.671875, + "step": 14 + }, + { + "clip_ratio": 0.0, + "completion_length": 459.671875, + "epoch": 0.0016790261648244019, + "grad_norm": 1.1323714811109955, + "kl": 0.0590972900390625, + "learning_rate": 5e-06, + "loss": 0.0902, + "reward": 0.07343750260770321, + "reward_std": 0.04493850376456976, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.734375, + "step": 15 + }, + { + "clip_ratio": 0.0, + "completion_length": 430.921875, + "epoch": 0.0017909612424793619, + "grad_norm": 1.1229606735516884, + "kl": 0.0349578857421875, + "learning_rate": 4.999952797253148e-06, + "loss": 0.0231, + "reward": 0.07031250186264515, + "reward_std": 0.04625816363841295, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.703125, + "step": 16 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.84375, + "epoch": 0.001902896320134322, + "grad_norm": 1.3958687154501264, + "kl": 0.040618896484375, + "learning_rate": 4.9998111909931225e-06, + "loss": 0.0841, + "reward": 0.07656250149011612, + "reward_std": 0.04255262762308121, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.765625, + "step": 17 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.265625, + "epoch": 0.0020148313977892823, + "grad_norm": 1.2810159881634564, + "kl": 0.058685302734375, + "learning_rate": 4.999575187161439e-06, + "loss": -0.0583, + "reward": 0.08125000260770321, + "reward_std": 0.038373483810573816, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.8125, + "step": 18 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.984375, + "epoch": 0.0021267664754442426, + "grad_norm": 0.6496120974985506, + "kl": 0.0307464599609375, + "learning_rate": 4.9992447956603455e-06, + "loss": 0.0475, + "reward": 0.09218750149011612, + "reward_std": 0.01743034040555358, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.921875, + "step": 19 + }, + { + "clip_ratio": 0.0, + "completion_length": 470.90625, + "epoch": 0.0022387015530992023, + "grad_norm": 1.1640288172191966, + "kl": 0.016815185546875, + "learning_rate": 4.998820030352409e-06, + "loss": 0.12, + "reward": 0.09375000186264515, + "reward_std": 0.021039125509560108, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 20 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.90625, + "epoch": 0.0023506366307541626, + "grad_norm": 2.092394327511984, + "kl": 0.4071197509765625, + "learning_rate": 4.998300909059929e-06, + "loss": 0.077, + "reward": 0.08906250260770321, + "reward_std": 0.025969465728849173, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.890625, + "step": 21 + }, + { + "clip_ratio": 0.0, + "completion_length": 466.609375, + "epoch": 0.002462571708409123, + "grad_norm": 1.7353937284994216, + "kl": 0.130462646484375, + "learning_rate": 4.997687453564198e-06, + "loss": 0.0013, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 22 + }, + { + "clip_ratio": 0.0, + "completion_length": 364.75, + "epoch": 0.002574506786064083, + "grad_norm": 0.6464548918875196, + "kl": 0.0576171875, + "learning_rate": 4.9969796896045775e-06, + "loss": -0.0217, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 23 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.421875, + "epoch": 0.002686441863719043, + "grad_norm": 116.68805529495698, + "kl": 6.03173828125, + "learning_rate": 4.996177646877426e-06, + "loss": 0.0415, + "reward": 0.09218750149011612, + "reward_std": 0.017430341336876154, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.921875, + "step": 24 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.5625, + "epoch": 0.002798376941374003, + "grad_norm": 3.181683689923044, + "kl": 0.4976806640625, + "learning_rate": 4.995281359034851e-06, + "loss": -0.0548, + "reward": 0.09218750335276127, + "reward_std": 0.023328250739723444, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.921875, + "step": 25 + }, + { + "clip_ratio": 0.0, + "completion_length": 395.578125, + "epoch": 0.0029103120190289633, + "grad_norm": 0.9693325036669225, + "kl": 0.07562255859375, + "learning_rate": 4.994290863683296e-06, + "loss": 0.0118, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 26 + }, + { + "clip_ratio": 0.0, + "completion_length": 505.640625, + "epoch": 0.0030222470966839235, + "grad_norm": 0.762738499506801, + "kl": 0.1248779296875, + "learning_rate": 4.99320620238196e-06, + "loss": 0.0572, + "reward": 0.09375000186264515, + "reward_std": 0.016327822115272284, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 27 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.65625, + "epoch": 0.0031341821743388833, + "grad_norm": 0.08544860970511584, + "kl": 0.04632568359375, + "learning_rate": 4.99202742064106e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 28 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.015625, + "epoch": 0.0032461172519938435, + "grad_norm": 1.1227892135326387, + "kl": 0.03948974609375, + "learning_rate": 4.990754567919917e-06, + "loss": -0.0086, + "reward": 0.09531250223517418, + "reward_std": 0.01478912541642785, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.953125, + "step": 29 + }, + { + "clip_ratio": 0.0, + "completion_length": 413.34375, + "epoch": 0.0033580523296488037, + "grad_norm": 0.8427884883196753, + "kl": 0.03790283203125, + "learning_rate": 4.989387697624881e-06, + "loss": 0.0001, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 30 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.09375, + "epoch": 0.003469987407303764, + "grad_norm": 0.916292516898012, + "kl": 0.03955078125, + "learning_rate": 4.987926867107095e-06, + "loss": -0.0193, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 31 + }, + { + "clip_ratio": 0.0, + "completion_length": 403.28125, + "epoch": 0.0035819224849587238, + "grad_norm": 0.5282928671535011, + "kl": 0.043212890625, + "learning_rate": 4.986372137660078e-06, + "loss": -0.0401, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 32 + }, + { + "clip_ratio": 0.0, + "completion_length": 376.0, + "epoch": 0.003693857562613684, + "grad_norm": 0.8076561222873071, + "kl": 0.06463623046875, + "learning_rate": 4.984723574517165e-06, + "loss": 0.0103, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 33 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.015625, + "epoch": 0.003805792640268644, + "grad_norm": 0.5820201891858064, + "kl": 0.04254150390625, + "learning_rate": 4.9829812468487655e-06, + "loss": -0.0118, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 34 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.765625, + "epoch": 0.0039177277179236044, + "grad_norm": 1.30271756609257, + "kl": 0.07342529296875, + "learning_rate": 4.981145227759457e-06, + "loss": 0.0033, + "reward": 0.09375000186264515, + "reward_std": 0.021039125509560108, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 35 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.328125, + "epoch": 0.004029662795578565, + "grad_norm": 0.5253064587253158, + "kl": 0.0435791015625, + "learning_rate": 4.979215594284924e-06, + "loss": -0.004, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 36 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.046875, + "epoch": 0.004141597873233525, + "grad_norm": 0.03440107214885444, + "kl": 0.03900146484375, + "learning_rate": 4.977192427388722e-06, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 37 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.390625, + "epoch": 0.004253532950888485, + "grad_norm": 0.03871583904571488, + "kl": 0.033447265625, + "learning_rate": 4.9750758119588824e-06, + "loss": 0.0003, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 38 + }, + { + "clip_ratio": 0.0, + "completion_length": 407.453125, + "epoch": 0.0043654680285434445, + "grad_norm": 0.14679688464116405, + "kl": 0.05303955078125, + "learning_rate": 4.972865836804349e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 39 + }, + { + "clip_ratio": 0.0, + "completion_length": 386.578125, + "epoch": 0.004477403106198405, + "grad_norm": 1.1845417806658105, + "kl": 0.06744384765625, + "learning_rate": 4.970562594651254e-06, + "loss": 0.0676, + "reward": 0.09531250037252903, + "reward_std": 0.018750000279396772, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.953125, + "step": 40 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.359375, + "epoch": 0.004589338183853365, + "grad_norm": 0.6033536519344826, + "kl": 0.05682373046875, + "learning_rate": 4.968166182139026e-06, + "loss": 0.1634, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 41 + }, + { + "clip_ratio": 0.0, + "completion_length": 419.03125, + "epoch": 0.004701273261508325, + "grad_norm": 0.03504335311347305, + "kl": 0.042327880859375, + "learning_rate": 4.9656766998163306e-06, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 42 + }, + { + "clip_ratio": 0.0, + "completion_length": 414.453125, + "epoch": 0.004813208339163285, + "grad_norm": 0.11226404372767065, + "kl": 0.0537109375, + "learning_rate": 4.963094252136865e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 43 + }, + { + "clip_ratio": 0.0, + "completion_length": 470.296875, + "epoch": 0.004925143416818246, + "grad_norm": 1.682711207148907, + "kl": 0.2684326171875, + "learning_rate": 4.960418947454958e-06, + "loss": 0.0222, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 44 + }, + { + "clip_ratio": 0.0, + "completion_length": 405.625, + "epoch": 0.005037078494473206, + "grad_norm": 0.04103769366987546, + "kl": 0.0435791015625, + "learning_rate": 4.957650898021038e-06, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 45 + }, + { + "clip_ratio": 0.0, + "completion_length": 422.765625, + "epoch": 0.005149013572128166, + "grad_norm": 0.524235834468468, + "kl": 0.065673828125, + "learning_rate": 4.954790219976915e-06, + "loss": -0.0335, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 46 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.984375, + "epoch": 0.005260948649783125, + "grad_norm": 105.38307149631456, + "kl": 0.32183837890625, + "learning_rate": 4.95183703335091e-06, + "loss": 0.0819, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 47 + }, + { + "clip_ratio": 0.0, + "completion_length": 385.65625, + "epoch": 0.005372883727438086, + "grad_norm": 0.6435525752943102, + "kl": 0.1368408203125, + "learning_rate": 4.948791462052819e-06, + "loss": 0.0042, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 48 + }, + { + "clip_ratio": 0.0, + "completion_length": 630.0, + "epoch": 0.005484818805093046, + "grad_norm": 0.5344456298032659, + "kl": 0.060546875, + "learning_rate": 4.945653633868716e-06, + "loss": 0.0254, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 49 + }, + { + "clip_ratio": 0.0, + "completion_length": 422.75, + "epoch": 0.005596753882748006, + "grad_norm": 0.7155667213796403, + "kl": 0.08978271484375, + "learning_rate": 4.942423680455584e-06, + "loss": 0.0132, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 50 + }, + { + "clip_ratio": 0.0, + "completion_length": 596.296875, + "epoch": 0.005708688960402966, + "grad_norm": 0.6239904143882911, + "kl": 0.04815673828125, + "learning_rate": 4.939101737335802e-06, + "loss": 0.0135, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 51 + }, + { + "clip_ratio": 0.0, + "completion_length": 402.71875, + "epoch": 0.0058206240380579265, + "grad_norm": 1.104725268564954, + "kl": 0.145263671875, + "learning_rate": 4.935687943891447e-06, + "loss": 0.0015, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 52 + }, + { + "clip_ratio": 0.0, + "completion_length": 477.328125, + "epoch": 0.005932559115712887, + "grad_norm": 0.5835454079488682, + "kl": 0.075927734375, + "learning_rate": 4.932182443358458e-06, + "loss": 0.1512, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 53 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.578125, + "epoch": 0.006044494193367847, + "grad_norm": 0.49184303981459476, + "kl": 0.05926513671875, + "learning_rate": 4.928585382820616e-06, + "loss": 0.0194, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 54 + }, + { + "clip_ratio": 0.0, + "completion_length": 392.046875, + "epoch": 0.006156429271022806, + "grad_norm": 1.0291704424132635, + "kl": 0.1390380859375, + "learning_rate": 4.924896913203376e-06, + "loss": 0.0102, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 55 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.0625, + "epoch": 0.006268364348677767, + "grad_norm": 0.5613069680179144, + "kl": 0.08880615234375, + "learning_rate": 4.921117189267535e-06, + "loss": 0.0121, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 56 + }, + { + "clip_ratio": 0.0, + "completion_length": 480.359375, + "epoch": 0.006380299426332727, + "grad_norm": 0.7905247960482367, + "kl": 0.06884765625, + "learning_rate": 4.917246369602742e-06, + "loss": 0.0134, + "reward": 0.09531250223517418, + "reward_std": 0.01478912541642785, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.953125, + "step": 57 + }, + { + "clip_ratio": 0.0, + "completion_length": 393.28125, + "epoch": 0.006492234503987687, + "grad_norm": 18.99182061239259, + "kl": 1.30133056640625, + "learning_rate": 4.9132846166208355e-06, + "loss": -0.0248, + "reward": 0.09375000186264515, + "reward_std": 0.021039125509560108, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 58 + }, + { + "clip_ratio": 0.0, + "completion_length": 509.59375, + "epoch": 0.006604169581642647, + "grad_norm": 1.0573356685875819, + "kl": 0.06280517578125, + "learning_rate": 4.9092320965490365e-06, + "loss": 0.0153, + "reward": 0.09375, + "reward_std": 0.02500000037252903, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 59 + }, + { + "clip_ratio": 0.0, + "completion_length": 415.171875, + "epoch": 0.0067161046592976075, + "grad_norm": 0.5498420295992223, + "kl": 0.070556640625, + "learning_rate": 4.905088979422971e-06, + "loss": -0.0324, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.109375, + "epoch": 0.006828039736952568, + "grad_norm": 0.4563712143777335, + "kl": 0.065673828125, + "learning_rate": 4.900855439079536e-06, + "loss": -0.0263, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 61 + }, + { + "clip_ratio": 0.0, + "completion_length": 470.03125, + "epoch": 0.006939974814607528, + "grad_norm": 0.022676570314052416, + "kl": 0.0540771484375, + "learning_rate": 4.8965316531496055e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 62 + }, + { + "clip_ratio": 0.0, + "completion_length": 442.109375, + "epoch": 0.007051909892262488, + "grad_norm": 0.37277836211463244, + "kl": 0.0537109375, + "learning_rate": 4.892117803050578e-06, + "loss": -0.0112, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 63 + }, + { + "clip_ratio": 0.0, + "completion_length": 407.3125, + "epoch": 0.0071638449699174475, + "grad_norm": 0.6045549634649411, + "kl": 0.06939697265625, + "learning_rate": 4.887614073978761e-06, + "loss": 0.0378, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 64 + }, + { + "clip_ratio": 0.0, + "completion_length": 490.53125, + "epoch": 0.007275780047572408, + "grad_norm": 0.4804494114616987, + "kl": 0.0440673828125, + "learning_rate": 4.883020654901609e-06, + "loss": 0.0326, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 65 + }, + { + "clip_ratio": 0.0, + "completion_length": 414.625, + "epoch": 0.007387715125227368, + "grad_norm": 0.02779607160337418, + "kl": 0.04901123046875, + "learning_rate": 4.878337738549785e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 66 + }, + { + "clip_ratio": 0.0, + "completion_length": 391.34375, + "epoch": 0.007499650202882328, + "grad_norm": 0.019327395350080524, + "kl": 0.05145263671875, + "learning_rate": 4.873565521409082e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 67 + }, + { + "clip_ratio": 0.0, + "completion_length": 408.140625, + "epoch": 0.007611585280537288, + "grad_norm": 0.7217823002699405, + "kl": 0.07281494140625, + "learning_rate": 4.868704203712173e-06, + "loss": 0.0201, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 68 + }, + { + "clip_ratio": 0.0, + "completion_length": 488.171875, + "epoch": 0.007723520358192249, + "grad_norm": 0.01773957052813651, + "kl": 0.04571533203125, + "learning_rate": 4.86375398943021e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 69 + }, + { + "clip_ratio": 0.0, + "completion_length": 405.109375, + "epoch": 0.007835455435847209, + "grad_norm": 0.5495053845625697, + "kl": 0.05712890625, + "learning_rate": 4.858715086264274e-06, + "loss": 0.0313, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 70 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.9375, + "epoch": 0.007947390513502168, + "grad_norm": 221.95709261316097, + "kl": 29.68072509765625, + "learning_rate": 4.853587705636646e-06, + "loss": 0.4784, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 71 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.046875, + "epoch": 0.00805932559115713, + "grad_norm": 0.6777938636001378, + "kl": 0.0565185546875, + "learning_rate": 4.84837206268195e-06, + "loss": -0.034, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 72 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.0, + "epoch": 0.008171260668812089, + "grad_norm": 0.03603965081169223, + "kl": 0.07452392578125, + "learning_rate": 4.8430683762381195e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 73 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.4375, + "epoch": 0.00828319574646705, + "grad_norm": 0.024934251266286088, + "kl": 0.06591796875, + "learning_rate": 4.837676868837213e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 74 + }, + { + "clip_ratio": 0.0, + "completion_length": 395.796875, + "epoch": 0.008395130824122009, + "grad_norm": 0.025440481130695015, + "kl": 0.05645751953125, + "learning_rate": 4.832197766696085e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 75 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.171875, + "epoch": 0.00850706590177697, + "grad_norm": 0.053611239059879696, + "kl": 0.056640625, + "learning_rate": 4.826631299706887e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 76 + }, + { + "clip_ratio": 0.0, + "completion_length": 498.859375, + "epoch": 0.00861900097943193, + "grad_norm": 0.6813369559573024, + "kl": 0.05133056640625, + "learning_rate": 4.820977701427424e-06, + "loss": 0.1191, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 77 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.421875, + "epoch": 0.008730936057086889, + "grad_norm": 0.7460088285353335, + "kl": 0.0655517578125, + "learning_rate": 4.81523720907136e-06, + "loss": 0.0598, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 78 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.640625, + "epoch": 0.00884287113474185, + "grad_norm": 0.6085630747158598, + "kl": 0.06109619140625, + "learning_rate": 4.809410063498254e-06, + "loss": 0.0091, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 79 + }, + { + "clip_ratio": 0.0, + "completion_length": 373.125, + "epoch": 0.00895480621239681, + "grad_norm": 0.05060284374443874, + "kl": 0.06451416015625, + "learning_rate": 4.8034965092034656e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 80 + }, + { + "clip_ratio": 0.0, + "completion_length": 479.15625, + "epoch": 0.00906674129005177, + "grad_norm": 0.020553083133118145, + "kl": 0.05169677734375, + "learning_rate": 4.797496794307889e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 81 + }, + { + "clip_ratio": 0.0, + "completion_length": 383.921875, + "epoch": 0.00917867636770673, + "grad_norm": 0.02331309838539552, + "kl": 0.05242919921875, + "learning_rate": 4.791411170547545e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 82 + }, + { + "clip_ratio": 0.0, + "completion_length": 470.296875, + "epoch": 0.009290611445361691, + "grad_norm": 0.8366878780204882, + "kl": 0.0616455078125, + "learning_rate": 4.785239893263017e-06, + "loss": 0.019, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 83 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.609375, + "epoch": 0.00940254652301665, + "grad_norm": 0.4621690087605406, + "kl": 0.07269287109375, + "learning_rate": 4.778983221388742e-06, + "loss": -0.0043, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 84 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.453125, + "epoch": 0.00951448160067161, + "grad_norm": 0.7196439160838366, + "kl": 0.07952880859375, + "learning_rate": 4.77264141744214e-06, + "loss": 0.0222, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 85 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.40625, + "epoch": 0.00962641667832657, + "grad_norm": 0.42630497015032043, + "kl": 0.10479736328125, + "learning_rate": 4.766214747512603e-06, + "loss": -0.0067, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 86 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.90625, + "epoch": 0.00973835175598153, + "grad_norm": 0.0717989075808753, + "kl": 0.06146240234375, + "learning_rate": 4.759703481250331e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 87 + }, + { + "clip_ratio": 0.0, + "completion_length": 514.953125, + "epoch": 0.009850286833636491, + "grad_norm": 0.5199995504833264, + "kl": 0.0433349609375, + "learning_rate": 4.753107891855015e-06, + "loss": -0.0066, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 88 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.765625, + "epoch": 0.00996222191129145, + "grad_norm": 0.04715436027273259, + "kl": 0.07342529296875, + "learning_rate": 4.746428256064375e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 89 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.0, + "epoch": 0.010074156988946412, + "grad_norm": 0.03668445511393964, + "kl": 0.05902099609375, + "learning_rate": 4.7396648541425534e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 90 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.015625, + "epoch": 0.010186092066601371, + "grad_norm": 0.025197578254458317, + "kl": 0.080322265625, + "learning_rate": 4.732817969868348e-06, + "loss": 0.0008, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 91 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.984375, + "epoch": 0.010298027144256332, + "grad_norm": 0.027850466760737828, + "kl": 0.057373046875, + "learning_rate": 4.7258878905233095e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 92 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.03125, + "epoch": 0.010409962221911291, + "grad_norm": 0.8302898332353863, + "kl": 0.06591796875, + "learning_rate": 4.718874906879688e-06, + "loss": 0.2943, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 93 + }, + { + "clip_ratio": 0.0, + "completion_length": 412.859375, + "epoch": 0.01052189729956625, + "grad_norm": 0.030863660874464384, + "kl": 0.052978515625, + "learning_rate": 4.711779313188231e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 94 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.265625, + "epoch": 0.010633832377221212, + "grad_norm": 0.04564269693547498, + "kl": 0.06561279296875, + "learning_rate": 4.70460140716584e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 95 + }, + { + "clip_ratio": 0.0, + "completion_length": 372.734375, + "epoch": 0.010745767454876171, + "grad_norm": 1.0393244090787366, + "kl": 0.06671142578125, + "learning_rate": 4.697341489983076e-06, + "loss": 0.0811, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 96 + }, + { + "clip_ratio": 0.0, + "completion_length": 362.25, + "epoch": 0.010857702532531132, + "grad_norm": 0.023478872872016283, + "kl": 0.0535888671875, + "learning_rate": 4.6899998662515215e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 97 + }, + { + "clip_ratio": 0.0, + "completion_length": 373.90625, + "epoch": 0.010969637610186092, + "grad_norm": 0.019758461356285777, + "kl": 0.0531005859375, + "learning_rate": 4.682576844011007e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 98 + }, + { + "clip_ratio": 0.0, + "completion_length": 395.90625, + "epoch": 0.011081572687841053, + "grad_norm": 0.01976638212707949, + "kl": 0.0645751953125, + "learning_rate": 4.675072734716678e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 99 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.859375, + "epoch": 0.011193507765496012, + "grad_norm": 2.002878750503418, + "kl": 0.0760498046875, + "learning_rate": 4.667487853225931e-06, + "loss": 0.3922, + "reward": 0.09531250037252903, + "reward_std": 0.018750000279396772, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.953125, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completion_length": 477.859375, + "epoch": 0.011305442843150973, + "grad_norm": 0.6672407190007655, + "kl": 0.05157470703125, + "learning_rate": 4.659822517785203e-06, + "loss": 0.1641, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 101 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.8125, + "epoch": 0.011417377920805933, + "grad_norm": 0.9439619070630538, + "kl": 0.061767578125, + "learning_rate": 4.6520770500166165e-06, + "loss": 0.3783, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 102 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.90625, + "epoch": 0.011529312998460892, + "grad_norm": 0.7519670307168462, + "kl": 0.05535888671875, + "learning_rate": 4.644251774904487e-06, + "loss": 0.0952, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 103 + }, + { + "clip_ratio": 0.0, + "completion_length": 396.53125, + "epoch": 0.011641248076115853, + "grad_norm": 1.2637256595006654, + "kl": 0.160400390625, + "learning_rate": 4.636347020781684e-06, + "loss": -0.0378, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 104 + }, + { + "clip_ratio": 0.0, + "completion_length": 403.40625, + "epoch": 0.011753183153770812, + "grad_norm": 0.9414284114713096, + "kl": 0.0699462890625, + "learning_rate": 4.6283631193158605e-06, + "loss": -0.0089, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 105 + }, + { + "clip_ratio": 0.0, + "completion_length": 404.9375, + "epoch": 0.011865118231425774, + "grad_norm": 0.15424717287769374, + "kl": 0.0926513671875, + "learning_rate": 4.620300405495532e-06, + "loss": 0.0009, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 106 + }, + { + "clip_ratio": 0.0, + "completion_length": 376.765625, + "epoch": 0.011977053309080733, + "grad_norm": 0.6366716085256641, + "kl": 0.0911865234375, + "learning_rate": 4.612159217616022e-06, + "loss": 0.0133, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 107 + }, + { + "clip_ratio": 0.0, + "completion_length": 385.484375, + "epoch": 0.012088988386735694, + "grad_norm": 0.020907253695270293, + "kl": 0.06060791015625, + "learning_rate": 4.603939897265268e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 108 + }, + { + "clip_ratio": 0.0, + "completion_length": 381.453125, + "epoch": 0.012200923464390653, + "grad_norm": 0.024537257651566672, + "kl": 0.06396484375, + "learning_rate": 4.595642789309492e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 109 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.421875, + "epoch": 0.012312858542045613, + "grad_norm": 0.02438549022281726, + "kl": 0.06048583984375, + "learning_rate": 4.587268241878724e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 110 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.796875, + "epoch": 0.012424793619700574, + "grad_norm": 0.5942167965684412, + "kl": 0.0833740234375, + "learning_rate": 4.578816606352205e-06, + "loss": 0.0065, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 111 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.953125, + "epoch": 0.012536728697355533, + "grad_norm": 0.023256524966985128, + "kl": 0.05865478515625, + "learning_rate": 4.570288237343632e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 112 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.203125, + "epoch": 0.012648663775010494, + "grad_norm": 0.4917785489739923, + "kl": 0.0616455078125, + "learning_rate": 4.561683492686289e-06, + "loss": 0.0131, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 113 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.609375, + "epoch": 0.012760598852665454, + "grad_norm": 0.4189078075428464, + "kl": 0.06109619140625, + "learning_rate": 4.5530027334180285e-06, + "loss": -0.0367, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 114 + }, + { + "clip_ratio": 0.0, + "completion_length": 415.71875, + "epoch": 0.012872533930320415, + "grad_norm": 0.03244601571119012, + "kl": 0.0718994140625, + "learning_rate": 4.544246323766122e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 115 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.453125, + "epoch": 0.012984469007975374, + "grad_norm": 0.04077561657409853, + "kl": 0.0740966796875, + "learning_rate": 4.535414631131983e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 116 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.078125, + "epoch": 0.013096404085630335, + "grad_norm": 0.04561395129255693, + "kl": 0.0780029296875, + "learning_rate": 4.526508026075746e-06, + "loss": 0.0008, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 117 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.875, + "epoch": 0.013208339163285295, + "grad_norm": 0.032666442461546756, + "kl": 0.071533203125, + "learning_rate": 4.517526882300721e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 118 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.484375, + "epoch": 0.013320274240940254, + "grad_norm": 0.030745081226969093, + "kl": 0.0496826171875, + "learning_rate": 4.508471576637713e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 119 + }, + { + "clip_ratio": 0.0, + "completion_length": 410.46875, + "epoch": 0.013432209318595215, + "grad_norm": 0.025825744807750583, + "kl": 0.0751953125, + "learning_rate": 4.499342489029211e-06, + "loss": 0.0008, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.15625, + "epoch": 0.013544144396250174, + "grad_norm": 0.027055040661668105, + "kl": 0.071533203125, + "learning_rate": 4.490140002513449e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 121 + }, + { + "clip_ratio": 0.0, + "completion_length": 503.46875, + "epoch": 0.013656079473905135, + "grad_norm": 0.9837824917503732, + "kl": 0.05841064453125, + "learning_rate": 4.48086450320833e-06, + "loss": 0.156, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 122 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.9375, + "epoch": 0.013768014551560095, + "grad_norm": 0.36909382007632124, + "kl": 0.0616455078125, + "learning_rate": 4.4715163802952266e-06, + "loss": 0.034, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 123 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.875, + "epoch": 0.013879949629215056, + "grad_norm": 0.02289685144542213, + "kl": 0.0621337890625, + "learning_rate": 4.462096026002655e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 124 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.609375, + "epoch": 0.013991884706870015, + "grad_norm": 0.5288400884276316, + "kl": 0.06207275390625, + "learning_rate": 4.4526038355898144e-06, + "loss": -0.0128, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 125 + }, + { + "clip_ratio": 0.0, + "completion_length": 376.234375, + "epoch": 0.014103819784524976, + "grad_norm": 0.019666228568501372, + "kl": 0.056396484375, + "learning_rate": 4.4430402073300035e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 126 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.671875, + "epoch": 0.014215754862179936, + "grad_norm": 0.022162440604770836, + "kl": 0.0638427734375, + "learning_rate": 4.433405542493909e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 127 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.328125, + "epoch": 0.014327689939834895, + "grad_norm": 0.5613568768967913, + "kl": 0.06121826171875, + "learning_rate": 4.4237002453327734e-06, + "loss": -0.0001, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 128 + }, + { + "clip_ratio": 0.0, + "completion_length": 375.328125, + "epoch": 0.014439625017489856, + "grad_norm": 0.025068232916270288, + "kl": 0.06744384765625, + "learning_rate": 4.4139247230614245e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 129 + }, + { + "clip_ratio": 0.0, + "completion_length": 407.34375, + "epoch": 0.014551560095144815, + "grad_norm": 0.018723191179922716, + "kl": 0.05828857421875, + "learning_rate": 4.404079385841201e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 130 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.3125, + "epoch": 0.014663495172799777, + "grad_norm": 0.16972024878699354, + "kl": 0.09710693359375, + "learning_rate": 4.394164646762734e-06, + "loss": 0.001, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 131 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.984375, + "epoch": 0.014775430250454736, + "grad_norm": 0.0237436227671808, + "kl": 0.06884765625, + "learning_rate": 4.384180921828618e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 132 + }, + { + "clip_ratio": 0.0, + "completion_length": 419.296875, + "epoch": 0.014887365328109697, + "grad_norm": 0.6392088951882646, + "kl": 0.05865478515625, + "learning_rate": 4.374128629935955e-06, + "loss": 0.1335, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 133 + }, + { + "clip_ratio": 0.0, + "completion_length": 371.859375, + "epoch": 0.014999300405764656, + "grad_norm": 0.0258451541588282, + "kl": 0.0711669921875, + "learning_rate": 4.364008192858781e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 134 + }, + { + "clip_ratio": 0.0, + "completion_length": 386.5625, + "epoch": 0.015111235483419617, + "grad_norm": 0.023614977303173818, + "kl": 0.0672607421875, + "learning_rate": 4.353820035230366e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 135 + }, + { + "clip_ratio": 0.0, + "completion_length": 466.734375, + "epoch": 0.015223170561074577, + "grad_norm": 0.019070024346192215, + "kl": 0.066162109375, + "learning_rate": 4.3435645845254e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 136 + }, + { + "clip_ratio": 0.0, + "completion_length": 504.890625, + "epoch": 0.015335105638729536, + "grad_norm": 0.37696582370006476, + "kl": 0.06463623046875, + "learning_rate": 4.333242271042054e-06, + "loss": -0.0004, + "reward": 0.09687500260770321, + "reward_std": 0.008539125323295593, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 137 + }, + { + "clip_ratio": 0.0, + "completion_length": 567.234375, + "epoch": 0.015447040716384497, + "grad_norm": 3.0390479634412357, + "kl": 0.071533203125, + "learning_rate": 4.32285352788393e-06, + "loss": 0.2188, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 138 + }, + { + "clip_ratio": 0.0, + "completion_length": 457.21875, + "epoch": 0.015558975794039457, + "grad_norm": 1.8150343506782198, + "kl": 0.106689453125, + "learning_rate": 4.312398790941882e-06, + "loss": 0.3003, + "reward": 0.09531250223517418, + "reward_std": 0.01478912541642785, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.953125, + "step": 139 + }, + { + "clip_ratio": 0.0, + "completion_length": 609.28125, + "epoch": 0.015670910871694418, + "grad_norm": 0.03462843424405208, + "kl": 0.05792236328125, + "learning_rate": 4.301878498875735e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completion_length": 473.421875, + "epoch": 0.015782845949349377, + "grad_norm": 1.7729469082974438, + "kl": 0.06402587890625, + "learning_rate": 4.291293093095873e-06, + "loss": 0.1156, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 141 + }, + { + "clip_ratio": 0.0, + "completion_length": 440.90625, + "epoch": 0.015894781027004336, + "grad_norm": 0.678889245969432, + "kl": 0.0787353515625, + "learning_rate": 4.280643017744723e-06, + "loss": 0.0363, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 142 + }, + { + "clip_ratio": 0.0, + "completion_length": 615.828125, + "epoch": 0.0160067161046593, + "grad_norm": 3.1600108504474265, + "kl": 0.103759765625, + "learning_rate": 4.269928719678117e-06, + "loss": 0.2578, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 143 + }, + { + "clip_ratio": 0.0, + "completion_length": 383.375, + "epoch": 0.01611865118231426, + "grad_norm": 0.13214155840655448, + "kl": 0.101318359375, + "learning_rate": 4.2591506484465426e-06, + "loss": 0.001, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 144 + }, + { + "clip_ratio": 0.0, + "completion_length": 414.90625, + "epoch": 0.016230586259969218, + "grad_norm": 0.7194283098737337, + "kl": 0.08380126953125, + "learning_rate": 4.248309256276283e-06, + "loss": -0.0069, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 145 + }, + { + "clip_ratio": 0.0, + "completion_length": 480.046875, + "epoch": 0.016342521337624177, + "grad_norm": 0.10784588867048772, + "kl": 0.0704345703125, + "learning_rate": 4.23740499805044e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 146 + }, + { + "clip_ratio": 0.0, + "completion_length": 386.234375, + "epoch": 0.016454456415279137, + "grad_norm": 1.2992200606773725, + "kl": 0.1204833984375, + "learning_rate": 4.22643833128985e-06, + "loss": 0.0012, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 147 + }, + { + "clip_ratio": 0.0, + "completion_length": 430.171875, + "epoch": 0.0165663914929341, + "grad_norm": 16.491496062528398, + "kl": 0.4547119140625, + "learning_rate": 4.215409716133885e-06, + "loss": 0.131, + "reward": 0.09531250223517418, + "reward_std": 0.01478912541642785, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.953125, + "step": 148 + }, + { + "clip_ratio": 0.0, + "completion_length": 471.171875, + "epoch": 0.01667832657058906, + "grad_norm": 4.096997318208475, + "kl": 0.2073974609375, + "learning_rate": 4.204319615321151e-06, + "loss": 0.0021, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 149 + }, + { + "clip_ratio": 0.0, + "completion_length": 423.59375, + "epoch": 0.016790261648244018, + "grad_norm": 3.910798929312304, + "kl": 0.2662353515625, + "learning_rate": 4.193168494170065e-06, + "loss": 0.2077, + "reward": 0.09218750335276127, + "reward_std": 0.01861694734543562, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.921875, + "step": 150 + }, + { + "clip_ratio": 0.0, + "completion_length": 388.78125, + "epoch": 0.016902196725898978, + "grad_norm": 13.951120691377671, + "kl": 0.37646484375, + "learning_rate": 4.181956820559339e-06, + "loss": 0.5985, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 151 + }, + { + "clip_ratio": 0.0, + "completion_length": 348.0, + "epoch": 0.01701413180355394, + "grad_norm": 5.2519598128920455, + "kl": 0.1241455078125, + "learning_rate": 4.170685064908342e-06, + "loss": 0.2291, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 152 + }, + { + "clip_ratio": 0.0, + "completion_length": 459.453125, + "epoch": 0.0171260668812089, + "grad_norm": 10.132405471868221, + "kl": 0.1990966796875, + "learning_rate": 4.159353700157365e-06, + "loss": 0.1752, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 153 + }, + { + "clip_ratio": 0.0, + "completion_length": 386.171875, + "epoch": 0.01723800195886386, + "grad_norm": 0.784730424677537, + "kl": 0.0968017578125, + "learning_rate": 4.14796320174778e-06, + "loss": 0.001, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 154 + }, + { + "clip_ratio": 0.0, + "completion_length": 368.65625, + "epoch": 0.01734993703651882, + "grad_norm": 27.02725934627477, + "kl": 0.109619140625, + "learning_rate": 4.136514047602087e-06, + "loss": 0.1772, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 155 + }, + { + "clip_ratio": 0.0, + "completion_length": 477.296875, + "epoch": 0.017461872114173778, + "grad_norm": 14.370055706061692, + "kl": 0.1593017578125, + "learning_rate": 4.1250067181038635e-06, + "loss": 0.2029, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 156 + }, + { + "clip_ratio": 0.0, + "completion_length": 480.53125, + "epoch": 0.01757380719182874, + "grad_norm": 13.084686861766809, + "kl": 0.1204833984375, + "learning_rate": 4.113441696077608e-06, + "loss": 0.1918, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 157 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.359375, + "epoch": 0.0176857422694837, + "grad_norm": 0.6843422318132463, + "kl": 0.07861328125, + "learning_rate": 4.101819466768484e-06, + "loss": 0.017, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 158 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.34375, + "epoch": 0.01779767734713866, + "grad_norm": 6.203392924309637, + "kl": 0.2252197265625, + "learning_rate": 4.0901405178219535e-06, + "loss": -0.0466, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 159 + }, + { + "clip_ratio": 0.0, + "completion_length": 404.71875, + "epoch": 0.01790961242479362, + "grad_norm": 0.7482548079571155, + "kl": 0.15234375, + "learning_rate": 4.078405339263326e-06, + "loss": 0.0015, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completion_length": 392.734375, + "epoch": 0.018021547502448578, + "grad_norm": 0.821487756754832, + "kl": 0.095458984375, + "learning_rate": 4.06661442347719e-06, + "loss": 0.008, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 161 + }, + { + "clip_ratio": 0.0, + "completion_length": 370.734375, + "epoch": 0.01813348258010354, + "grad_norm": 0.25693644980051783, + "kl": 0.1165771484375, + "learning_rate": 4.054768265186758e-06, + "loss": 0.0012, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 162 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.859375, + "epoch": 0.0182454176577585, + "grad_norm": 0.3151740457382109, + "kl": 0.0853271484375, + "learning_rate": 4.0428673614331036e-06, + "loss": 0.0009, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 163 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.203125, + "epoch": 0.01835735273541346, + "grad_norm": 0.2872706706321094, + "kl": 0.090087890625, + "learning_rate": 4.030912211554316e-06, + "loss": 0.0009, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 164 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.90625, + "epoch": 0.01846928781306842, + "grad_norm": 0.11020779139062825, + "kl": 0.0782470703125, + "learning_rate": 4.018903317164539e-06, + "loss": 0.0008, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 165 + }, + { + "clip_ratio": 0.0, + "completion_length": 346.25, + "epoch": 0.018581222890723382, + "grad_norm": 0.045653419133126164, + "kl": 0.0740966796875, + "learning_rate": 4.006841182132932e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 166 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.359375, + "epoch": 0.01869315796837834, + "grad_norm": 0.021075436862415513, + "kl": 0.06011962890625, + "learning_rate": 3.9947263125625195e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 167 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.21875, + "epoch": 0.0188050930460333, + "grad_norm": 0.04494777486555804, + "kl": 0.07147216796875, + "learning_rate": 3.982559216768967e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 168 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.40625, + "epoch": 0.01891702812368826, + "grad_norm": 0.018822857218736996, + "kl": 0.0582275390625, + "learning_rate": 3.970340405259245e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 169 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.8125, + "epoch": 0.01902896320134322, + "grad_norm": 0.023628578386486077, + "kl": 0.07000732421875, + "learning_rate": 3.958070390710214e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 170 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.921875, + "epoch": 0.019140898278998182, + "grad_norm": 0.023811978335883294, + "kl": 0.0609130859375, + "learning_rate": 3.945749687947109e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 171 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.046875, + "epoch": 0.01925283335665314, + "grad_norm": 0.8750934577120364, + "kl": 0.07550048828125, + "learning_rate": 3.933378813921942e-06, + "loss": 0.013, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 172 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.546875, + "epoch": 0.0193647684343081, + "grad_norm": 0.6477727504447495, + "kl": 0.071044921875, + "learning_rate": 3.920958287691811e-06, + "loss": -0.0026, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 173 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.5, + "epoch": 0.01947670351196306, + "grad_norm": 27.871539574475218, + "kl": 0.44085693359375, + "learning_rate": 3.908488630397121e-06, + "loss": -0.0071, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 174 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.78125, + "epoch": 0.019588638589618023, + "grad_norm": 0.057425424775596354, + "kl": 0.06646728515625, + "learning_rate": 3.8959703652397175e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 175 + }, + { + "clip_ratio": 0.0, + "completion_length": 359.21875, + "epoch": 0.019700573667272982, + "grad_norm": 0.02368246574314423, + "kl": 0.055419921875, + "learning_rate": 3.883404017460935e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 176 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.234375, + "epoch": 0.019812508744927942, + "grad_norm": 0.03358527901376269, + "kl": 0.05792236328125, + "learning_rate": 3.870790114319559e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 177 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.78125, + "epoch": 0.0199244438225829, + "grad_norm": 0.5392466815020299, + "kl": 0.06982421875, + "learning_rate": 3.858129185069701e-06, + "loss": -0.0209, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 178 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.09375, + "epoch": 0.02003637890023786, + "grad_norm": 0.17046312215041995, + "kl": 0.08056640625, + "learning_rate": 3.845421760938597e-06, + "loss": 0.0008, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 179 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.234375, + "epoch": 0.020148313977892823, + "grad_norm": 0.05778936711697761, + "kl": 0.05181884765625, + "learning_rate": 3.832668375104312e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.75, + "epoch": 0.020260249055547783, + "grad_norm": 0.9368586862857481, + "kl": 0.0615234375, + "learning_rate": 3.8198695626733725e-06, + "loss": -0.0006, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 181 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.6875, + "epoch": 0.020372184133202742, + "grad_norm": 0.1463135701740714, + "kl": 0.05364990234375, + "learning_rate": 3.8070258606583156e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 182 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.671875, + "epoch": 0.0204841192108577, + "grad_norm": 24.196429667374314, + "kl": 0.20501708984375, + "learning_rate": 3.7941378079551544e-06, + "loss": 0.0021, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 183 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.625, + "epoch": 0.020596054288512664, + "grad_norm": 0.7575728537579188, + "kl": 0.067138671875, + "learning_rate": 3.7812059453207677e-06, + "loss": -0.0088, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 184 + }, + { + "clip_ratio": 0.0, + "completion_length": 381.71875, + "epoch": 0.020707989366167624, + "grad_norm": 1.5781775679619348, + "kl": 0.129150390625, + "learning_rate": 3.768230815350213e-06, + "loss": -0.0091, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 185 + }, + { + "clip_ratio": 0.0, + "completion_length": 384.6875, + "epoch": 0.020819924443822583, + "grad_norm": 3.7258182746280184, + "kl": 0.70550537109375, + "learning_rate": 3.7552129624539557e-06, + "loss": 0.2283, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 186 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.1875, + "epoch": 0.020931859521477542, + "grad_norm": 23.600980161646945, + "kl": 2.8865966796875, + "learning_rate": 3.7421529328350316e-06, + "loss": 0.2557, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 187 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.265625, + "epoch": 0.0210437945991325, + "grad_norm": 36.37207086101098, + "kl": 1.830078125, + "learning_rate": 3.7290512744661274e-06, + "loss": 0.3201, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 188 + }, + { + "clip_ratio": 0.0, + "completion_length": 396.90625, + "epoch": 0.021155729676787464, + "grad_norm": 20.154618516530974, + "kl": 0.8580322265625, + "learning_rate": 3.715908537066589e-06, + "loss": 0.236, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 189 + }, + { + "clip_ratio": 0.0, + "completion_length": 371.265625, + "epoch": 0.021267664754442424, + "grad_norm": 1.0506630322756636, + "kl": 0.18212890625, + "learning_rate": 3.7027252720793538e-06, + "loss": -0.0025, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 190 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.59375, + "epoch": 0.021379599832097383, + "grad_norm": 0.10894834856423846, + "kl": 0.0850830078125, + "learning_rate": 3.689502032647817e-06, + "loss": 0.0009, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 191 + }, + { + "clip_ratio": 0.0, + "completion_length": 406.421875, + "epoch": 0.021491534909752343, + "grad_norm": 0.070012885417872, + "kl": 0.0955810546875, + "learning_rate": 3.6762393735926245e-06, + "loss": 0.001, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 192 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.953125, + "epoch": 0.021603469987407305, + "grad_norm": 0.055813531691409256, + "kl": 0.068603515625, + "learning_rate": 3.6629378513883852e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 193 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.171875, + "epoch": 0.021715405065062265, + "grad_norm": 52.57320558278293, + "kl": 0.1983642578125, + "learning_rate": 3.6495980241403307e-06, + "loss": 0.3357, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 194 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.78125, + "epoch": 0.021827340142717224, + "grad_norm": 0.04396489484628073, + "kl": 0.0579833984375, + "learning_rate": 3.636220451560896e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 195 + }, + { + "clip_ratio": 0.0, + "completion_length": 390.78125, + "epoch": 0.021939275220372183, + "grad_norm": 1.0092735457153308, + "kl": 0.08642578125, + "learning_rate": 3.622805694946235e-06, + "loss": 0.0163, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 196 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.9375, + "epoch": 0.022051210298027143, + "grad_norm": 0.11501833373142471, + "kl": 0.07763671875, + "learning_rate": 3.609354317152667e-06, + "loss": 0.0008, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 197 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.546875, + "epoch": 0.022163145375682106, + "grad_norm": 0.24707486499522355, + "kl": 0.0819091796875, + "learning_rate": 3.595866882573063e-06, + "loss": 0.0008, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 198 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.640625, + "epoch": 0.022275080453337065, + "grad_norm": 27.72743469845055, + "kl": 0.6578369140625, + "learning_rate": 3.5823439571131675e-06, + "loss": 0.0387, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 199 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.953125, + "epoch": 0.022387015530992024, + "grad_norm": 4.257281648957348, + "kl": 0.2021484375, + "learning_rate": 3.5687861081678477e-06, + "loss": 0.002, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 200 + }, + { + "clip_ratio": 0.0, + "completion_length": 406.703125, + "epoch": 0.022498950608646984, + "grad_norm": 1.991901860771057, + "kl": 0.127197265625, + "learning_rate": 3.555193904597291e-06, + "loss": -0.0177, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 201 + }, + { + "clip_ratio": 0.0, + "completion_length": 360.75, + "epoch": 0.022610885686301947, + "grad_norm": 22.923764220941024, + "kl": 0.1807861328125, + "learning_rate": 3.541567916703138e-06, + "loss": 0.1058, + "reward": 0.09687500260770321, + "reward_std": 0.008539125323295593, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 202 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.609375, + "epoch": 0.022722820763956906, + "grad_norm": 148.13942760303817, + "kl": 10.0704345703125, + "learning_rate": 3.5279087162045517e-06, + "loss": 0.3985, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 203 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.625, + "epoch": 0.022834755841611865, + "grad_norm": 5.396743744887109, + "kl": 0.3883056640625, + "learning_rate": 3.5142168762142265e-06, + "loss": -0.0111, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 204 + }, + { + "clip_ratio": 0.0, + "completion_length": 409.84375, + "epoch": 0.022946690919266825, + "grad_norm": 2770.9102363815387, + "kl": 340.0640869140625, + "learning_rate": 3.500492971214347e-06, + "loss": 3.6234, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 205 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.1875, + "epoch": 0.023058625996921784, + "grad_norm": 11.015389916640636, + "kl": 1.8922119140625, + "learning_rate": 3.48673757703248e-06, + "loss": -0.0028, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 206 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.78125, + "epoch": 0.023170561074576747, + "grad_norm": 1.8419760739328712, + "kl": 0.1317138671875, + "learning_rate": 3.472951270817418e-06, + "loss": -0.0237, + "reward": 0.09062500298023224, + "reward_std": 0.024866947438567877, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.90625, + "step": 207 + }, + { + "clip_ratio": 0.0, + "completion_length": 384.078125, + "epoch": 0.023282496152231706, + "grad_norm": 1.681815519531453, + "kl": 0.16943359375, + "learning_rate": 3.4591346310149578e-06, + "loss": -0.0704, + "reward": 0.08906250260770321, + "reward_std": 0.025969466660171747, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.890625, + "step": 208 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.984375, + "epoch": 0.023394431229886666, + "grad_norm": 4.149140370325476, + "kl": 0.343994140625, + "learning_rate": 3.445288237343632e-06, + "loss": 0.0316, + "reward": 0.08906250260770321, + "reward_std": 0.031116947531700134, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.890625, + "step": 209 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.3125, + "epoch": 0.023506366307541625, + "grad_norm": 7.97862155393176, + "kl": 0.59326171875, + "learning_rate": 3.4314126707703895e-06, + "loss": -0.0088, + "reward": 0.0937500037252903, + "reward_std": 0.017078250646591187, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 210 + }, + { + "clip_ratio": 0.0, + "completion_length": 410.84375, + "epoch": 0.023618301385196588, + "grad_norm": 37.00032182098118, + "kl": 3.810546875, + "learning_rate": 3.4175085134862128e-06, + "loss": 0.1349, + "reward": 0.08281250484287739, + "reward_std": 0.03833641391247511, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.828125, + "step": 211 + }, + { + "clip_ratio": 0.0, + "completion_length": 348.96875, + "epoch": 0.023730236462851547, + "grad_norm": 78.79622548097791, + "kl": 3.765380859375, + "learning_rate": 3.4035763488816953e-06, + "loss": 0.2076, + "reward": 0.09375000186264515, + "reward_std": 0.021039125509560108, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 212 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.921875, + "epoch": 0.023842171540506506, + "grad_norm": 10.816304901178587, + "kl": 3.8818359375, + "learning_rate": 3.3896167615225594e-06, + "loss": 0.1445, + "reward": 0.08906250260770321, + "reward_std": 0.025969465728849173, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.890625, + "step": 213 + }, + { + "clip_ratio": 0.0, + "completion_length": 375.6875, + "epoch": 0.023954106618161466, + "grad_norm": 13.522641035708572, + "kl": 4.187744140625, + "learning_rate": 3.375630337125133e-06, + "loss": 0.0886, + "reward": 0.09062500111758709, + "reward_std": 0.020155644044280052, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.90625, + "step": 214 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.984375, + "epoch": 0.024066041695816425, + "grad_norm": 9.482318398692025, + "kl": 5.662109375, + "learning_rate": 3.361617662531772e-06, + "loss": 0.135, + "reward": 0.09062500298023224, + "reward_std": 0.0295782508328557, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.90625, + "step": 215 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.234375, + "epoch": 0.024177976773471388, + "grad_norm": 9.298647150834725, + "kl": 1.287109375, + "learning_rate": 3.347579325686237e-06, + "loss": -0.0025, + "reward": 0.09218750335276127, + "reward_std": 0.01861694734543562, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.921875, + "step": 216 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.375, + "epoch": 0.024289911851126347, + "grad_norm": 9.33505130284713, + "kl": 2.80908203125, + "learning_rate": 3.333515915609027e-06, + "loss": 0.0542, + "reward": 0.09218750335276127, + "reward_std": 0.01861694734543562, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.921875, + "step": 217 + }, + { + "clip_ratio": 0.0, + "completion_length": 364.453125, + "epoch": 0.024401846928781307, + "grad_norm": 46.939949673345936, + "kl": 7.6962890625, + "learning_rate": 3.3194280223726616e-06, + "loss": 0.1244, + "reward": 0.09375000186264515, + "reward_std": 0.021039125509560108, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 218 + }, + { + "clip_ratio": 0.0, + "completion_length": 389.140625, + "epoch": 0.024513782006436266, + "grad_norm": 8.965046605455813, + "kl": 1.474365234375, + "learning_rate": 3.305316237076927e-06, + "loss": 0.0928, + "reward": 0.09531250223517418, + "reward_std": 0.01478912541642785, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.953125, + "step": 219 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.921875, + "epoch": 0.024625717084091225, + "grad_norm": 30.26772960669773, + "kl": 3.83154296875, + "learning_rate": 3.291181151824071e-06, + "loss": 0.0081, + "reward": 0.09375000186264515, + "reward_std": 0.011180340312421322, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 220 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.640625, + "epoch": 0.024737652161746188, + "grad_norm": 9.50042126895199, + "kl": 0.977294921875, + "learning_rate": 3.27702335969396e-06, + "loss": -0.0179, + "reward": 0.09531250223517418, + "reward_std": 0.01478912541642785, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.953125, + "step": 221 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.90625, + "epoch": 0.024849587239401148, + "grad_norm": 8.200151360167519, + "kl": 0.697509765625, + "learning_rate": 3.2628434547191985e-06, + "loss": -0.037, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 222 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.859375, + "epoch": 0.024961522317056107, + "grad_norm": 18.01832349548423, + "kl": 0.634033203125, + "learning_rate": 3.2486420318601973e-06, + "loss": -0.0207, + "reward": 0.09218750335276127, + "reward_std": 0.01861694734543562, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.921875, + "step": 223 + }, + { + "clip_ratio": 0.0, + "completion_length": 368.5, + "epoch": 0.025073457394711066, + "grad_norm": 99.84487944919411, + "kl": 4.774169921875, + "learning_rate": 3.2344196869802187e-06, + "loss": -0.0292, + "reward": 0.09218750149011612, + "reward_std": 0.02257782220840454, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.921875, + "step": 224 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.921875, + "epoch": 0.02518539247236603, + "grad_norm": 4.540807623552518, + "kl": 0.2640380859375, + "learning_rate": 3.2201770168203694e-06, + "loss": 0.0409, + "reward": 0.09531250037252903, + "reward_std": 0.018750000279396772, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.953125, + "step": 225 + }, + { + "clip_ratio": 0.0, + "completion_length": 470.421875, + "epoch": 0.02529732755002099, + "grad_norm": 1.9006028901018341, + "kl": 0.2882080078125, + "learning_rate": 3.205914618974563e-06, + "loss": -0.0101, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 226 + }, + { + "clip_ratio": 0.0, + "completion_length": 359.5, + "epoch": 0.025409262627675948, + "grad_norm": 8.497076561424455, + "kl": 1.46826171875, + "learning_rate": 3.1916330918644496e-06, + "loss": 0.0076, + "reward": 0.0937500037252903, + "reward_std": 0.017078250646591187, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 227 + }, + { + "clip_ratio": 0.0, + "completion_length": 398.203125, + "epoch": 0.025521197705330907, + "grad_norm": 6.49286525909613, + "kl": 0.5, + "learning_rate": 3.177333034714303e-06, + "loss": 0.0436, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 228 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.09375, + "epoch": 0.025633132782985867, + "grad_norm": 99.57124601550895, + "kl": 13.4130859375, + "learning_rate": 3.1630150475258813e-06, + "loss": 0.1554, + "reward": 0.09531250223517418, + "reward_std": 0.01478912541642785, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.953125, + "step": 229 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.78125, + "epoch": 0.02574506786064083, + "grad_norm": 49.128639521298574, + "kl": 7.0673828125, + "learning_rate": 3.148679731053252e-06, + "loss": 0.0762, + "reward": 0.09218750335276127, + "reward_std": 0.023328250739723444, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.921875, + "step": 230 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.09375, + "epoch": 0.02585700293829579, + "grad_norm": 367.82367126923293, + "kl": 26.0625, + "learning_rate": 3.1343276867775805e-06, + "loss": 0.19, + "reward": 0.09531250037252903, + "reward_std": 0.018750000279396772, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.953125, + "step": 231 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.828125, + "epoch": 0.025968938015950748, + "grad_norm": 2760.034913740409, + "kl": 32.2548828125, + "learning_rate": 3.1199595168819043e-06, + "loss": 0.4045, + "reward": 0.09531250223517418, + "reward_std": 0.01478912541642785, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.953125, + "step": 232 + }, + { + "clip_ratio": 0.0, + "completion_length": 421.625, + "epoch": 0.026080873093605707, + "grad_norm": 24.759326550765813, + "kl": 2.56591796875, + "learning_rate": 3.105575824225852e-06, + "loss": 0.1236, + "reward": 0.09531250037252903, + "reward_std": 0.018750000279396772, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.953125, + "step": 233 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.453125, + "epoch": 0.02619280817126067, + "grad_norm": 11.271203444155871, + "kl": 0.311767578125, + "learning_rate": 3.091177212320363e-06, + "loss": 0.0506, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 234 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.984375, + "epoch": 0.02630474324891563, + "grad_norm": 60.98662683249761, + "kl": 6.624267578125, + "learning_rate": 3.0767642853023538e-06, + "loss": 0.0659, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 235 + }, + { + "clip_ratio": 0.0, + "completion_length": 387.59375, + "epoch": 0.02641667832657059, + "grad_norm": 8.237085005507922, + "kl": 0.9478759765625, + "learning_rate": 3.062337647909376e-06, + "loss": -0.0391, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 236 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.09375, + "epoch": 0.02652861340422555, + "grad_norm": 18.57477238550014, + "kl": 2.2236328125, + "learning_rate": 3.04789790545424e-06, + "loss": 0.0015, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 237 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.015625, + "epoch": 0.026640548481880508, + "grad_norm": 16.627244952129402, + "kl": 0.564453125, + "learning_rate": 3.033445663799621e-06, + "loss": 0.0793, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 238 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.96875, + "epoch": 0.02675248355953547, + "grad_norm": 54.633519018974916, + "kl": 4.1083984375, + "learning_rate": 3.018981529332633e-06, + "loss": 0.1508, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 239 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.375, + "epoch": 0.02686441863719043, + "grad_norm": 8.038755997888481, + "kl": 3.0703125, + "learning_rate": 3.00450610893939e-06, + "loss": 0.0033, + "reward": 0.09531250223517418, + "reward_std": 0.01478912541642785, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.953125, + "step": 240 + }, + { + "clip_ratio": 0.0, + "completion_length": 386.46875, + "epoch": 0.02697635371484539, + "grad_norm": 12.489323661336277, + "kl": 1.4534912109375, + "learning_rate": 2.9900200099795396e-06, + "loss": 0.0417, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 241 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.390625, + "epoch": 0.02708828879250035, + "grad_norm": 5.250434377988738, + "kl": 1.822265625, + "learning_rate": 2.9755238402607826e-06, + "loss": -0.0469, + "reward": 0.0937500037252903, + "reward_std": 0.017078250646591187, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 242 + }, + { + "clip_ratio": 0.0, + "completion_length": 369.125, + "epoch": 0.02720022387015531, + "grad_norm": 57.97881368696394, + "kl": 5.73095703125, + "learning_rate": 2.961018208013367e-06, + "loss": 0.2336, + "reward": 0.09531250223517418, + "reward_std": 0.01478912541642785, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.953125, + "step": 243 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.375, + "epoch": 0.02731215894781027, + "grad_norm": 49.75089750933376, + "kl": 4.38427734375, + "learning_rate": 2.9465037218645694e-06, + "loss": 0.0314, + "reward": 0.09375000186264515, + "reward_std": 0.021039125509560108, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 244 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.0625, + "epoch": 0.02742409402546523, + "grad_norm": 1.707430633827994, + "kl": 0.204833984375, + "learning_rate": 2.9319809908131604e-06, + "loss": 0.002, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 245 + }, + { + "clip_ratio": 0.0, + "completion_length": 376.109375, + "epoch": 0.02753602910312019, + "grad_norm": 13.47781256474575, + "kl": 0.590576171875, + "learning_rate": 2.917450624203847e-06, + "loss": 0.0719, + "reward": 0.09531250223517418, + "reward_std": 0.01478912541642785, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.953125, + "step": 246 + }, + { + "clip_ratio": 0.0, + "completion_length": 376.0625, + "epoch": 0.02764796418077515, + "grad_norm": 1680.1301953567038, + "kl": 79.1229248046875, + "learning_rate": 2.9029132317017118e-06, + "loss": 0.7462, + "reward": 0.09687500260770321, + "reward_std": 0.008539125323295593, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 247 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.234375, + "epoch": 0.02775989925843011, + "grad_norm": 0.6798016263852162, + "kl": 0.1409912109375, + "learning_rate": 2.888369423266629e-06, + "loss": 0.0014, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 248 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.953125, + "epoch": 0.02787183433608507, + "grad_norm": 5.926518387479333, + "kl": 0.63525390625, + "learning_rate": 2.8738198091276712e-06, + "loss": -0.0057, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 249 + }, + { + "clip_ratio": 0.0, + "completion_length": 400.359375, + "epoch": 0.02798376941374003, + "grad_norm": 73.88428710060259, + "kl": 6.923828125, + "learning_rate": 2.859264999757509e-06, + "loss": 0.2651, + "reward": 0.09531250223517418, + "reward_std": 0.01478912541642785, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.953125, + "step": 250 + }, + { + "clip_ratio": 0.0, + "completion_length": 413.0625, + "epoch": 0.02809570449139499, + "grad_norm": 19.57610821275727, + "kl": 1.18603515625, + "learning_rate": 2.8447056058467928e-06, + "loss": -0.0419, + "reward": 0.09531250223517418, + "reward_std": 0.01478912541642785, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.953125, + "step": 251 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.59375, + "epoch": 0.028207639569049953, + "grad_norm": 15.02657989678683, + "kl": 0.35986328125, + "learning_rate": 2.830142238278531e-06, + "loss": 0.0012, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 252 + }, + { + "clip_ratio": 0.0, + "completion_length": 381.203125, + "epoch": 0.028319574646704912, + "grad_norm": 2.875721583070933, + "kl": 0.475341796875, + "learning_rate": 2.81557550810246e-06, + "loss": -0.0231, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 253 + }, + { + "clip_ratio": 0.0, + "completion_length": 371.046875, + "epoch": 0.02843150972435987, + "grad_norm": 0.6092524986743804, + "kl": 0.1376953125, + "learning_rate": 2.8010060265094026e-06, + "loss": 0.0014, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 254 + }, + { + "clip_ratio": 0.0, + "completion_length": 375.78125, + "epoch": 0.02854344480201483, + "grad_norm": 1.3831215185283958, + "kl": 0.130615234375, + "learning_rate": 2.786434404805629e-06, + "loss": -0.032, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 255 + }, + { + "clip_ratio": 0.0, + "completion_length": 394.375, + "epoch": 0.02865537987966979, + "grad_norm": 0.29391488966565826, + "kl": 0.1136474609375, + "learning_rate": 2.771861254387199e-06, + "loss": 0.0011, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 256 + }, + { + "clip_ratio": 0.0, + "completion_length": 418.015625, + "epoch": 0.028767314957324753, + "grad_norm": 8.597539761339826, + "kl": 0.3472900390625, + "learning_rate": 2.7572871867143204e-06, + "loss": 0.0113, + "reward": 0.09687500260770321, + "reward_std": 0.008539125323295593, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 257 + }, + { + "clip_ratio": 0.0, + "completion_length": 425.578125, + "epoch": 0.028879250034979712, + "grad_norm": 0.5239580359655661, + "kl": 0.1436767578125, + "learning_rate": 2.742712813285681e-06, + "loss": 0.0014, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 258 + }, + { + "clip_ratio": 0.0, + "completion_length": 341.625, + "epoch": 0.02899118511263467, + "grad_norm": 1.9555271298208972, + "kl": 0.2032470703125, + "learning_rate": 2.7281387456128017e-06, + "loss": 0.002, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 259 + }, + { + "clip_ratio": 0.0, + "completion_length": 368.65625, + "epoch": 0.02910312019028963, + "grad_norm": 1.252836717471123, + "kl": 0.2208251953125, + "learning_rate": 2.7135655951943716e-06, + "loss": 0.0022, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 260 + }, + { + "clip_ratio": 0.0, + "completion_length": 375.59375, + "epoch": 0.029215055267944594, + "grad_norm": 0.8719765743586952, + "kl": 0.1676025390625, + "learning_rate": 2.698993973490598e-06, + "loss": 0.0017, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 261 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.0625, + "epoch": 0.029326990345599553, + "grad_norm": 0.13713063096829928, + "kl": 0.10205078125, + "learning_rate": 2.6844244918975416e-06, + "loss": 0.001, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 262 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.5, + "epoch": 0.029438925423254513, + "grad_norm": 1.2802535059679794, + "kl": 0.09423828125, + "learning_rate": 2.66985776172147e-06, + "loss": 0.0411, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 263 + }, + { + "clip_ratio": 0.0, + "completion_length": 434.359375, + "epoch": 0.029550860500909472, + "grad_norm": 3.6751799791380084, + "kl": 0.1397705078125, + "learning_rate": 2.6552943941532088e-06, + "loss": -0.0156, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 264 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.90625, + "epoch": 0.02966279557856443, + "grad_norm": 0.07806269511080617, + "kl": 0.0855712890625, + "learning_rate": 2.6407350002424927e-06, + "loss": 0.0009, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 265 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.84375, + "epoch": 0.029774730656219394, + "grad_norm": 0.06731730411902957, + "kl": 0.087646484375, + "learning_rate": 2.626180190872329e-06, + "loss": 0.0009, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 266 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.734375, + "epoch": 0.029886665733874353, + "grad_norm": 0.13087790730798726, + "kl": 0.0902099609375, + "learning_rate": 2.611630576733372e-06, + "loss": 0.0009, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 267 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.078125, + "epoch": 0.029998600811529313, + "grad_norm": 5.554675473835038, + "kl": 0.14794921875, + "learning_rate": 2.5970867682982885e-06, + "loss": -0.051, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 268 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.1875, + "epoch": 0.030110535889184272, + "grad_norm": 0.14583524682041712, + "kl": 0.0927734375, + "learning_rate": 2.582549375796154e-06, + "loss": 0.0009, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 269 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.578125, + "epoch": 0.030222470966839235, + "grad_norm": 0.058225539376494065, + "kl": 0.077880859375, + "learning_rate": 2.568019009186841e-06, + "loss": 0.0008, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 270 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.6875, + "epoch": 0.030334406044494194, + "grad_norm": 0.04344087707852648, + "kl": 0.0789794921875, + "learning_rate": 2.5534962781354317e-06, + "loss": 0.0008, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 271 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.953125, + "epoch": 0.030446341122149154, + "grad_norm": 0.07717630877616868, + "kl": 0.07293701171875, + "learning_rate": 2.538981791986634e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 272 + }, + { + "clip_ratio": 0.0, + "completion_length": 419.171875, + "epoch": 0.030558276199804113, + "grad_norm": 0.22582461693188216, + "kl": 0.0968017578125, + "learning_rate": 2.524476159739218e-06, + "loss": 0.001, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 273 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.953125, + "epoch": 0.030670211277459072, + "grad_norm": 27.999475574336614, + "kl": 0.5250244140625, + "learning_rate": 2.5099799900204607e-06, + "loss": 0.0169, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 274 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.21875, + "epoch": 0.030782146355114035, + "grad_norm": 0.034837371039734215, + "kl": 0.07232666015625, + "learning_rate": 2.4954938910606108e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 275 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.265625, + "epoch": 0.030894081432768995, + "grad_norm": 18.866980558523725, + "kl": 1.3262939453125, + "learning_rate": 2.481018470667368e-06, + "loss": 0.0199, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 276 + }, + { + "clip_ratio": 0.0, + "completion_length": 375.71875, + "epoch": 0.031006016510423954, + "grad_norm": 0.08715091318862284, + "kl": 0.0843505859375, + "learning_rate": 2.4665543362003802e-06, + "loss": 0.0008, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 277 + }, + { + "clip_ratio": 0.0, + "completion_length": 383.90625, + "epoch": 0.031117951588078913, + "grad_norm": 18.97786056185319, + "kl": 0.388671875, + "learning_rate": 2.4521020945457615e-06, + "loss": 0.0333, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 278 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.890625, + "epoch": 0.031229886665733873, + "grad_norm": 0.3777705304986896, + "kl": 0.09979248046875, + "learning_rate": 2.4376623520906255e-06, + "loss": 0.001, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 279 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.59375, + "epoch": 0.031341821743388835, + "grad_norm": 23.98251858869545, + "kl": 5.38348388671875, + "learning_rate": 2.4232357146976478e-06, + "loss": 0.0275, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 280 + }, + { + "clip_ratio": 0.0, + "completion_length": 403.40625, + "epoch": 0.031453756821043795, + "grad_norm": 0.09304733402356336, + "kl": 0.0772705078125, + "learning_rate": 2.408822787679637e-06, + "loss": 0.0008, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 281 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.25, + "epoch": 0.031565691898698754, + "grad_norm": 0.14440303776607324, + "kl": 0.0906982421875, + "learning_rate": 2.3944241757741475e-06, + "loss": 0.0009, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 282 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.765625, + "epoch": 0.031677626976353714, + "grad_norm": 2.359216668730623, + "kl": 0.90911865234375, + "learning_rate": 2.380040483118097e-06, + "loss": -0.0461, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 283 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.328125, + "epoch": 0.03178956205400867, + "grad_norm": 0.045061338534023075, + "kl": 0.05657958984375, + "learning_rate": 2.365672313222419e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 284 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.828125, + "epoch": 0.03190149713166363, + "grad_norm": 0.21772012539025168, + "kl": 0.0853271484375, + "learning_rate": 2.351320268946749e-06, + "loss": 0.0009, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 285 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.703125, + "epoch": 0.0320134322093186, + "grad_norm": 0.6734360714201397, + "kl": 0.1312255859375, + "learning_rate": 2.336984952474119e-06, + "loss": 0.0013, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 286 + }, + { + "clip_ratio": 0.0, + "completion_length": 415.15625, + "epoch": 0.03212536728697356, + "grad_norm": 4.987286255355766, + "kl": 0.12139892578125, + "learning_rate": 2.322666965285697e-06, + "loss": -0.0532, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 287 + }, + { + "clip_ratio": 0.0, + "completion_length": 400.9375, + "epoch": 0.03223730236462852, + "grad_norm": 0.6096977245482182, + "kl": 0.09234619140625, + "learning_rate": 2.3083669081355507e-06, + "loss": -0.0601, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 288 + }, + { + "clip_ratio": 0.0, + "completion_length": 502.640625, + "epoch": 0.03234923744228348, + "grad_norm": 0.04237782613357625, + "kl": 0.06573486328125, + "learning_rate": 2.2940853810254377e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 289 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.53125, + "epoch": 0.032461172519938436, + "grad_norm": 0.029403412259251353, + "kl": 0.06024169921875, + "learning_rate": 2.2798229831796313e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 290 + }, + { + "clip_ratio": 0.0, + "completion_length": 413.109375, + "epoch": 0.032573107597593395, + "grad_norm": 0.08363337386352669, + "kl": 0.0560302734375, + "learning_rate": 2.2655803130197816e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 291 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.734375, + "epoch": 0.032685042675248355, + "grad_norm": 0.023053884997018263, + "kl": 0.06512451171875, + "learning_rate": 2.2513579681398034e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 292 + }, + { + "clip_ratio": 0.0, + "completion_length": 460.546875, + "epoch": 0.032796977752903314, + "grad_norm": 1.460556497688483, + "kl": 0.15765380859375, + "learning_rate": 2.237156545280803e-06, + "loss": -0.026, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 293 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.609375, + "epoch": 0.03290891283055827, + "grad_norm": 7.935071492001548, + "kl": 1.8072509765625, + "learning_rate": 2.2229766403060403e-06, + "loss": -0.0278, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 294 + }, + { + "clip_ratio": 0.0, + "completion_length": 385.125, + "epoch": 0.03302084790821324, + "grad_norm": 0.01681019430958899, + "kl": 0.0477294921875, + "learning_rate": 2.2088188481759305e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 295 + }, + { + "clip_ratio": 0.0, + "completion_length": 414.484375, + "epoch": 0.0331327829858682, + "grad_norm": 0.01847832809783982, + "kl": 0.04632568359375, + "learning_rate": 2.194683762923073e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 296 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.78125, + "epoch": 0.03324471806352316, + "grad_norm": 0.02629542045427332, + "kl": 0.0531005859375, + "learning_rate": 2.1805719776273387e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 297 + }, + { + "clip_ratio": 0.0, + "completion_length": 371.09375, + "epoch": 0.03335665314117812, + "grad_norm": 0.039099364449013255, + "kl": 0.0689697265625, + "learning_rate": 2.166484084390974e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 298 + }, + { + "clip_ratio": 0.0, + "completion_length": 348.171875, + "epoch": 0.03346858821883308, + "grad_norm": 0.054692981575624876, + "kl": 0.067138671875, + "learning_rate": 2.1524206743137636e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 299 + }, + { + "clip_ratio": 0.0, + "completion_length": 453.71875, + "epoch": 0.033580523296488037, + "grad_norm": 0.020024754177952568, + "kl": 0.04815673828125, + "learning_rate": 2.1383823374682287e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 300 + }, + { + "clip_ratio": 0.0, + "completion_length": 397.609375, + "epoch": 0.033692458374142996, + "grad_norm": 0.7962215451972806, + "kl": 0.0548095703125, + "learning_rate": 2.124369662874868e-06, + "loss": 0.0015, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 301 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.640625, + "epoch": 0.033804393451797955, + "grad_norm": 0.03594032335785641, + "kl": 0.04754638671875, + "learning_rate": 2.110383238477441e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 302 + }, + { + "clip_ratio": 0.0, + "completion_length": 382.828125, + "epoch": 0.033916328529452915, + "grad_norm": 0.02676423646625272, + "kl": 0.0552978515625, + "learning_rate": 2.096423651118305e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 303 + }, + { + "clip_ratio": 0.0, + "completion_length": 364.1875, + "epoch": 0.03402826360710788, + "grad_norm": 0.021001193681821163, + "kl": 0.04876708984375, + "learning_rate": 2.082491486513788e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 304 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.9375, + "epoch": 0.03414019868476284, + "grad_norm": 0.015515949438366839, + "kl": 0.0455322265625, + "learning_rate": 2.0685873292296116e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 305 + }, + { + "clip_ratio": 0.0, + "completion_length": 502.484375, + "epoch": 0.0342521337624178, + "grad_norm": 0.10070236191895302, + "kl": 0.042633056640625, + "learning_rate": 2.054711762656369e-06, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 306 + }, + { + "clip_ratio": 0.0, + "completion_length": 423.890625, + "epoch": 0.03436406884007276, + "grad_norm": 8.449530301304263, + "kl": 0.17718505859375, + "learning_rate": 2.040865368985044e-06, + "loss": 0.0072, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 307 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.953125, + "epoch": 0.03447600391772772, + "grad_norm": 4.554366640679164, + "kl": 0.10430908203125, + "learning_rate": 2.027048729182583e-06, + "loss": 0.0096, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 308 + }, + { + "clip_ratio": 0.0, + "completion_length": 400.78125, + "epoch": 0.03458793899538268, + "grad_norm": 1.4037888132598981, + "kl": 0.065673828125, + "learning_rate": 2.0132624229675205e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 309 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.390625, + "epoch": 0.03469987407303764, + "grad_norm": 0.08573893743799793, + "kl": 0.04962158203125, + "learning_rate": 1.9995070287856546e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 310 + }, + { + "clip_ratio": 0.0, + "completion_length": 412.375, + "epoch": 0.034811809150692596, + "grad_norm": 14.764228813015494, + "kl": 0.43438720703125, + "learning_rate": 1.985783123785774e-06, + "loss": -0.03, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 311 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.03125, + "epoch": 0.034923744228347556, + "grad_norm": 0.2193000836614737, + "kl": 0.067138671875, + "learning_rate": 1.9720912837954486e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 312 + }, + { + "clip_ratio": 0.0, + "completion_length": 385.421875, + "epoch": 0.035035679306002515, + "grad_norm": 0.1633478285227259, + "kl": 0.05523681640625, + "learning_rate": 1.958432083296862e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 313 + }, + { + "clip_ratio": 0.0, + "completion_length": 388.4375, + "epoch": 0.03514761438365748, + "grad_norm": 0.145955357953582, + "kl": 0.051025390625, + "learning_rate": 1.9448060954027093e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 314 + }, + { + "clip_ratio": 0.0, + "completion_length": 378.703125, + "epoch": 0.03525954946131244, + "grad_norm": 7.683536847345325, + "kl": 0.1790771484375, + "learning_rate": 1.931213891832153e-06, + "loss": -0.0279, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 315 + }, + { + "clip_ratio": 0.0, + "completion_length": 410.09375, + "epoch": 0.0353714845389674, + "grad_norm": 0.19129137636650337, + "kl": 0.05731201171875, + "learning_rate": 1.9176560428868336e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 316 + }, + { + "clip_ratio": 0.0, + "completion_length": 390.3125, + "epoch": 0.03548341961662236, + "grad_norm": 6.515657611937849, + "kl": 1.45611572265625, + "learning_rate": 1.9041331174269373e-06, + "loss": -0.0073, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 317 + }, + { + "clip_ratio": 0.0, + "completion_length": 454.234375, + "epoch": 0.03559535469427732, + "grad_norm": 0.2025260783774205, + "kl": 0.05340576171875, + "learning_rate": 1.8906456828473341e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 318 + }, + { + "clip_ratio": 0.0, + "completion_length": 407.5625, + "epoch": 0.03570728977193228, + "grad_norm": 4.576351957328261, + "kl": 1.09912109375, + "learning_rate": 1.8771943050537656e-06, + "loss": -0.0441, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 319 + }, + { + "clip_ratio": 0.0, + "completion_length": 414.34375, + "epoch": 0.03581922484958724, + "grad_norm": 0.04886630482006485, + "kl": 0.0460205078125, + "learning_rate": 1.8637795484391046e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 320 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.3125, + "epoch": 0.0359311599272422, + "grad_norm": 0.21065449469813613, + "kl": 0.0640869140625, + "learning_rate": 1.8504019758596698e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 321 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.09375, + "epoch": 0.036043095004897156, + "grad_norm": 4.883286184793087, + "kl": 1.3126220703125, + "learning_rate": 1.8370621486116163e-06, + "loss": -0.0417, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 322 + }, + { + "clip_ratio": 0.0, + "completion_length": 396.03125, + "epoch": 0.03615503008255212, + "grad_norm": 2.4312943229919903, + "kl": 0.7298583984375, + "learning_rate": 1.823760626407377e-06, + "loss": -0.0482, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 323 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.21875, + "epoch": 0.03626696516020708, + "grad_norm": 0.023794241531636316, + "kl": 0.0516357421875, + "learning_rate": 1.8104979673521838e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 324 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.34375, + "epoch": 0.03637890023786204, + "grad_norm": 0.06501484692075148, + "kl": 0.0458984375, + "learning_rate": 1.7972747279206482e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 325 + }, + { + "clip_ratio": 0.0, + "completion_length": 392.140625, + "epoch": 0.036490835315517, + "grad_norm": 0.0554933350153533, + "kl": 0.045013427734375, + "learning_rate": 1.7840914629334122e-06, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 326 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.5, + "epoch": 0.03660277039317196, + "grad_norm": 0.034907067888093626, + "kl": 0.05291748046875, + "learning_rate": 1.7709487255338731e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 327 + }, + { + "clip_ratio": 0.0, + "completion_length": 375.453125, + "epoch": 0.03671470547082692, + "grad_norm": 0.04733189685853122, + "kl": 0.0457763671875, + "learning_rate": 1.7578470671649684e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 328 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.875, + "epoch": 0.03682664054848188, + "grad_norm": 8.191477588254072, + "kl": 0.31414794921875, + "learning_rate": 1.744787037546045e-06, + "loss": -0.0328, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 329 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.625, + "epoch": 0.03693857562613684, + "grad_norm": 4.485682653488296, + "kl": 0.06427001953125, + "learning_rate": 1.731769184649788e-06, + "loss": -0.0303, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 330 + }, + { + "clip_ratio": 0.0, + "completion_length": 421.796875, + "epoch": 0.0370505107037918, + "grad_norm": 0.15001206134528802, + "kl": 0.054931640625, + "learning_rate": 1.7187940546792325e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 331 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.671875, + "epoch": 0.037162445781446764, + "grad_norm": 0.05596585517144101, + "kl": 0.04620361328125, + "learning_rate": 1.7058621920448465e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 332 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.171875, + "epoch": 0.03727438085910172, + "grad_norm": 0.21063855683262964, + "kl": 0.05145263671875, + "learning_rate": 1.6929741393416855e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 333 + }, + { + "clip_ratio": 0.0, + "completion_length": 401.03125, + "epoch": 0.03738631593675668, + "grad_norm": 14.703670642314776, + "kl": 0.6478271484375, + "learning_rate": 1.6801304373266286e-06, + "loss": -0.0265, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 334 + }, + { + "clip_ratio": 0.0, + "completion_length": 405.28125, + "epoch": 0.03749825101441164, + "grad_norm": 3.2693646713839533, + "kl": 0.2021484375, + "learning_rate": 1.667331624895689e-06, + "loss": -0.0538, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 335 + }, + { + "clip_ratio": 0.0, + "completion_length": 464.15625, + "epoch": 0.0376101860920666, + "grad_norm": 0.0326789044313002, + "kl": 0.06781005859375, + "learning_rate": 1.6545782390614037e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 336 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.953125, + "epoch": 0.03772212116972156, + "grad_norm": 3.381374096168454, + "kl": 0.24603271484375, + "learning_rate": 1.6418708149302992e-06, + "loss": -0.0531, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 337 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.0625, + "epoch": 0.03783405624737652, + "grad_norm": 4.256284487475119, + "kl": 0.145263671875, + "learning_rate": 1.6292098856804423e-06, + "loss": -0.0518, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 338 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.484375, + "epoch": 0.03794599132503148, + "grad_norm": 4.758319146409537, + "kl": 0.52178955078125, + "learning_rate": 1.6165959825390661e-06, + "loss": -0.0436, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 339 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.828125, + "epoch": 0.03805792640268644, + "grad_norm": 0.06125517244847344, + "kl": 0.0491943359375, + "learning_rate": 1.604029634760284e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 340 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.828125, + "epoch": 0.038169861480341405, + "grad_norm": 0.680605704729485, + "kl": 0.10308837890625, + "learning_rate": 1.59151136960288e-06, + "loss": 0.001, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 341 + }, + { + "clip_ratio": 0.0, + "completion_length": 393.265625, + "epoch": 0.038281796557996364, + "grad_norm": 0.06326323223836276, + "kl": 0.04736328125, + "learning_rate": 1.5790417123081903e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 342 + }, + { + "clip_ratio": 0.0, + "completion_length": 381.671875, + "epoch": 0.038393731635651324, + "grad_norm": 0.049854259040433335, + "kl": 0.048828125, + "learning_rate": 1.5666211860780583e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 343 + }, + { + "clip_ratio": 0.0, + "completion_length": 403.125, + "epoch": 0.03850566671330628, + "grad_norm": 0.07874363283266796, + "kl": 0.05059814453125, + "learning_rate": 1.5542503120528918e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 344 + }, + { + "clip_ratio": 0.0, + "completion_length": 413.3125, + "epoch": 0.03861760179096124, + "grad_norm": 0.12008769303035174, + "kl": 0.0606689453125, + "learning_rate": 1.5419296092897866e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 345 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.28125, + "epoch": 0.0387295368686162, + "grad_norm": 0.10834525189023946, + "kl": 0.0592041015625, + "learning_rate": 1.529659594740755e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 346 + }, + { + "clip_ratio": 0.0, + "completion_length": 378.25, + "epoch": 0.03884147194627116, + "grad_norm": 0.05706678172063102, + "kl": 0.040863037109375, + "learning_rate": 1.5174407832310338e-06, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 347 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.15625, + "epoch": 0.03895340702392612, + "grad_norm": 0.2260930410291458, + "kl": 0.0599365234375, + "learning_rate": 1.5052736874374815e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 348 + }, + { + "clip_ratio": 0.0, + "completion_length": 412.671875, + "epoch": 0.03906534210158108, + "grad_norm": 0.03574155761474703, + "kl": 0.0439453125, + "learning_rate": 1.4931588178670695e-06, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 349 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.40625, + "epoch": 0.039177277179236046, + "grad_norm": 0.06765325310501516, + "kl": 0.05303955078125, + "learning_rate": 1.4810966828354605e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 350 + }, + { + "clip_ratio": 0.0, + "completion_length": 382.625, + "epoch": 0.039289212256891005, + "grad_norm": 0.45741242496960755, + "kl": 0.063720703125, + "learning_rate": 1.469087788445684e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 351 + }, + { + "clip_ratio": 0.0, + "completion_length": 415.0625, + "epoch": 0.039401147334545965, + "grad_norm": 0.059362152677566817, + "kl": 0.044921875, + "learning_rate": 1.4571326385668965e-06, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 352 + }, + { + "clip_ratio": 0.0, + "completion_length": 409.265625, + "epoch": 0.039513082412200924, + "grad_norm": 0.03486583265069237, + "kl": 0.04302978515625, + "learning_rate": 1.4452317348132434e-06, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 353 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.171875, + "epoch": 0.039625017489855884, + "grad_norm": 0.044710017702799705, + "kl": 0.04583740234375, + "learning_rate": 1.4333855765228104e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 354 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.359375, + "epoch": 0.03973695256751084, + "grad_norm": 0.0864511831454649, + "kl": 0.04925537109375, + "learning_rate": 1.421594660736675e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 355 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.8125, + "epoch": 0.0398488876451658, + "grad_norm": 0.03699683675596586, + "kl": 0.04742431640625, + "learning_rate": 1.4098594821780476e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 356 + }, + { + "clip_ratio": 0.0, + "completion_length": 362.8125, + "epoch": 0.03996082272282076, + "grad_norm": 0.07941048515036817, + "kl": 0.05633544921875, + "learning_rate": 1.3981805332315174e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 357 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.390625, + "epoch": 0.04007275780047572, + "grad_norm": 12.642109265209918, + "kl": 1.81060791015625, + "learning_rate": 1.3865583039223929e-06, + "loss": -0.0185, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 358 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.390625, + "epoch": 0.04018469287813069, + "grad_norm": 0.08034212517091562, + "kl": 0.0445556640625, + "learning_rate": 1.374993281896137e-06, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 359 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.171875, + "epoch": 0.04029662795578565, + "grad_norm": 0.09767901214575864, + "kl": 0.04425048828125, + "learning_rate": 1.3634859523979134e-06, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 360 + }, + { + "clip_ratio": 0.0, + "completion_length": 392.578125, + "epoch": 0.040408563033440606, + "grad_norm": 0.07629861701658103, + "kl": 0.048583984375, + "learning_rate": 1.3520367982522208e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 361 + }, + { + "clip_ratio": 0.0, + "completion_length": 359.375, + "epoch": 0.040520498111095565, + "grad_norm": 2.436909257977141, + "kl": 0.3397216796875, + "learning_rate": 1.3406462998426358e-06, + "loss": -0.0575, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 362 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.640625, + "epoch": 0.040632433188750525, + "grad_norm": 1.7488785493981298, + "kl": 0.41937255859375, + "learning_rate": 1.3293149350916595e-06, + "loss": -0.0315, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 363 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.265625, + "epoch": 0.040744368266405484, + "grad_norm": 0.042176241580247326, + "kl": 0.03741455078125, + "learning_rate": 1.3180431794406623e-06, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 364 + }, + { + "clip_ratio": 0.0, + "completion_length": 438.40625, + "epoch": 0.04085630334406044, + "grad_norm": 0.06954183102024396, + "kl": 0.0513916015625, + "learning_rate": 1.3068315058299358e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 365 + }, + { + "clip_ratio": 0.0, + "completion_length": 384.453125, + "epoch": 0.0409682384217154, + "grad_norm": 0.0585988602116313, + "kl": 0.03924560546875, + "learning_rate": 1.2956803846788503e-06, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 366 + }, + { + "clip_ratio": 0.0, + "completion_length": 369.34375, + "epoch": 0.04108017349937036, + "grad_norm": 0.029227802328528226, + "kl": 0.05963134765625, + "learning_rate": 1.284590283866116e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 367 + }, + { + "clip_ratio": 0.0, + "completion_length": 468.015625, + "epoch": 0.04119210857702533, + "grad_norm": 0.024030066071591895, + "kl": 0.039306640625, + "learning_rate": 1.2735616687101518e-06, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 368 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.578125, + "epoch": 0.04130404365468029, + "grad_norm": 0.022956778080715768, + "kl": 0.04571533203125, + "learning_rate": 1.2625950019495614e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 369 + }, + { + "clip_ratio": 0.0, + "completion_length": 419.140625, + "epoch": 0.04141597873233525, + "grad_norm": 0.03031369908455225, + "kl": 0.04296875, + "learning_rate": 1.251690743723718e-06, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 370 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.125, + "epoch": 0.041527913809990206, + "grad_norm": 1.4619752022004908, + "kl": 0.42498779296875, + "learning_rate": 1.2408493515534581e-06, + "loss": -0.0518, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 371 + }, + { + "clip_ratio": 0.0, + "completion_length": 403.390625, + "epoch": 0.041639848887645166, + "grad_norm": 0.06787211806577707, + "kl": 0.04547119140625, + "learning_rate": 1.2300712803218834e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 372 + }, + { + "clip_ratio": 0.0, + "completion_length": 384.984375, + "epoch": 0.041751783965300125, + "grad_norm": 0.844949833560224, + "kl": 0.04034423828125, + "learning_rate": 1.2193569822552772e-06, + "loss": 0.0324, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 373 + }, + { + "clip_ratio": 0.0, + "completion_length": 469.625, + "epoch": 0.041863719042955085, + "grad_norm": 0.5718530387307899, + "kl": 0.034820556640625, + "learning_rate": 1.2087069069041268e-06, + "loss": 0.014, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 374 + }, + { + "clip_ratio": 0.0, + "completion_length": 408.0625, + "epoch": 0.041975654120610044, + "grad_norm": 0.03780725899196792, + "kl": 0.0430908203125, + "learning_rate": 1.1981215011242654e-06, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 375 + }, + { + "clip_ratio": 0.0, + "completion_length": 394.140625, + "epoch": 0.042087589198265, + "grad_norm": 0.022995562401170407, + "kl": 0.05279541015625, + "learning_rate": 1.1876012090581184e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 376 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.75, + "epoch": 0.04219952427591997, + "grad_norm": 0.10984965670222785, + "kl": 0.04351806640625, + "learning_rate": 1.177146472116071e-06, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 377 + }, + { + "clip_ratio": 0.0, + "completion_length": 400.6875, + "epoch": 0.04231145935357493, + "grad_norm": 3.2788406827242613, + "kl": 0.182952880859375, + "learning_rate": 1.1667577289579462e-06, + "loss": -0.0157, + "reward": 0.09687500260770321, + "reward_std": 0.008539125323295593, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 378 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.484375, + "epoch": 0.04242339443122989, + "grad_norm": 0.04631375660525243, + "kl": 0.0482177734375, + "learning_rate": 1.1564354154746007e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 379 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.921875, + "epoch": 0.04253532950888485, + "grad_norm": 0.5029931057300824, + "kl": 0.04144287109375, + "learning_rate": 1.146179964769635e-06, + "loss": -0.0096, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 380 + }, + { + "clip_ratio": 0.0, + "completion_length": 408.53125, + "epoch": 0.04264726458653981, + "grad_norm": 2.062066313854755, + "kl": 0.10888671875, + "learning_rate": 1.1359918071412195e-06, + "loss": 0.0137, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 381 + }, + { + "clip_ratio": 0.0, + "completion_length": 473.390625, + "epoch": 0.042759199664194766, + "grad_norm": 5.689014120759552, + "kl": 0.19818115234375, + "learning_rate": 1.1258713700640456e-06, + "loss": 0.002, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 382 + }, + { + "clip_ratio": 0.0, + "completion_length": 378.40625, + "epoch": 0.042871134741849726, + "grad_norm": 5.456471510055751, + "kl": 0.1583251953125, + "learning_rate": 1.115819078171383e-06, + "loss": -0.0152, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 383 + }, + { + "clip_ratio": 0.0, + "completion_length": 405.59375, + "epoch": 0.042983069819504685, + "grad_norm": 0.01860238339511713, + "kl": 0.04095458984375, + "learning_rate": 1.1058353532372667e-06, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 384 + }, + { + "clip_ratio": 0.0, + "completion_length": 382.15625, + "epoch": 0.043095004897159644, + "grad_norm": 0.13949154395909993, + "kl": 0.0496826171875, + "learning_rate": 1.0959206141587998e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 385 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.453125, + "epoch": 0.04320693997481461, + "grad_norm": 1.107285367889642, + "kl": 0.3203125, + "learning_rate": 1.0860752769385766e-06, + "loss": -0.0542, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 386 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.203125, + "epoch": 0.04331887505246957, + "grad_norm": 14.993173199026563, + "kl": 1.521728515625, + "learning_rate": 1.0762997546672279e-06, + "loss": -0.0433, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 387 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.78125, + "epoch": 0.04343081013012453, + "grad_norm": 0.10799334625634623, + "kl": 0.041534423828125, + "learning_rate": 1.0665944575060914e-06, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 388 + }, + { + "clip_ratio": 0.0, + "completion_length": 410.546875, + "epoch": 0.04354274520777949, + "grad_norm": 0.37423665656457383, + "kl": 0.061767578125, + "learning_rate": 1.056959792669997e-06, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 389 + }, + { + "clip_ratio": 0.0, + "completion_length": 373.703125, + "epoch": 0.04365468028543445, + "grad_norm": 0.19089367747123068, + "kl": 0.05108642578125, + "learning_rate": 1.0473961644101856e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 390 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.71875, + "epoch": 0.04376661536308941, + "grad_norm": 1.5778323232854403, + "kl": 0.1815185546875, + "learning_rate": 1.037903973997345e-06, + "loss": -0.0537, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 391 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.71875, + "epoch": 0.04387855044074437, + "grad_norm": 0.31904972301610846, + "kl": 0.052734375, + "learning_rate": 1.0284836197047737e-06, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 392 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.0625, + "epoch": 0.043990485518399326, + "grad_norm": 2.0851570445818868, + "kl": 0.2457275390625, + "learning_rate": 1.0191354967916712e-06, + "loss": 0.0287, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 393 + }, + { + "clip_ratio": 0.0, + "completion_length": 415.109375, + "epoch": 0.044102420596054286, + "grad_norm": 0.3993597476384507, + "kl": 0.07476806640625, + "learning_rate": 1.0098599974865515e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 394 + }, + { + "clip_ratio": 0.0, + "completion_length": 378.15625, + "epoch": 0.04421435567370925, + "grad_norm": 0.3569068198951939, + "kl": 0.06988525390625, + "learning_rate": 1.0006575109707898e-06, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 395 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.234375, + "epoch": 0.04432629075136421, + "grad_norm": 0.13286047664045245, + "kl": 0.04833984375, + "learning_rate": 9.915284233622877e-07, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 396 + }, + { + "clip_ratio": 0.0, + "completion_length": 416.9375, + "epoch": 0.04443822582901917, + "grad_norm": 2.2497661432685447, + "kl": 0.05743408203125, + "learning_rate": 9.824731176992796e-07, + "loss": -0.002, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 397 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.140625, + "epoch": 0.04455016090667413, + "grad_norm": 3.30119576808313, + "kl": 1.076416015625, + "learning_rate": 9.734919739242543e-07, + "loss": -0.0157, + "reward": 0.09687500260770321, + "reward_std": 0.008539125323295593, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 398 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.28125, + "epoch": 0.04466209598432909, + "grad_norm": 0.08032954719670685, + "kl": 0.06231689453125, + "learning_rate": 9.645853688680177e-07, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 399 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.8125, + "epoch": 0.04477403106198405, + "grad_norm": 0.30125779125635027, + "kl": 0.06890869140625, + "learning_rate": 9.557536762338786e-07, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 400 + }, + { + "clip_ratio": 0.0, + "completion_length": 408.609375, + "epoch": 0.04488596613963901, + "grad_norm": 0.10410594418892839, + "kl": 0.04498291015625, + "learning_rate": 9.46997266581973e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 401 + }, + { + "clip_ratio": 0.0, + "completion_length": 388.671875, + "epoch": 0.04499790121729397, + "grad_norm": 0.21768463357305143, + "kl": 0.04937744140625, + "learning_rate": 9.383165073137115e-07, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 402 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.828125, + "epoch": 0.04510983629494893, + "grad_norm": 0.24943422959107125, + "kl": 0.054443359375, + "learning_rate": 9.297117626563687e-07, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 403 + }, + { + "clip_ratio": 0.0, + "completion_length": 398.4375, + "epoch": 0.04522177137260389, + "grad_norm": 0.15214769860694116, + "kl": 0.05352783203125, + "learning_rate": 9.211833936477957e-07, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 404 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.265625, + "epoch": 0.04533370645025885, + "grad_norm": 0.12737968723084875, + "kl": 0.04833984375, + "learning_rate": 9.127317581212753e-07, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 405 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.078125, + "epoch": 0.04544564152791381, + "grad_norm": 0.12491707622355717, + "kl": 0.0419921875, + "learning_rate": 9.043572106905084e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 406 + }, + { + "clip_ratio": 0.0, + "completion_length": 347.171875, + "epoch": 0.04555757660556877, + "grad_norm": 0.2782478324098304, + "kl": 0.04632568359375, + "learning_rate": 8.960601027347321e-07, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 407 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.59375, + "epoch": 0.04566951168322373, + "grad_norm": 1.707651816399516, + "kl": 0.17364501953125, + "learning_rate": 8.878407823839788e-07, + "loss": 0.0017, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 408 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.625, + "epoch": 0.04578144676087869, + "grad_norm": 0.18198728965219338, + "kl": 0.05511474609375, + "learning_rate": 8.796995945044689e-07, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 409 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.171875, + "epoch": 0.04589338183853365, + "grad_norm": 3.0116035887758312, + "kl": 0.07513427734375, + "learning_rate": 8.716368806841405e-07, + "loss": 0.0028, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 410 + }, + { + "clip_ratio": 0.0, + "completion_length": 370.46875, + "epoch": 0.04600531691618861, + "grad_norm": 0.036581584760347785, + "kl": 0.04638671875, + "learning_rate": 8.636529792183171e-07, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 411 + }, + { + "clip_ratio": 0.0, + "completion_length": 412.40625, + "epoch": 0.04611725199384357, + "grad_norm": 1.7675486622831695, + "kl": 0.08441162109375, + "learning_rate": 8.557482250955144e-07, + "loss": 0.0507, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 412 + }, + { + "clip_ratio": 0.0, + "completion_length": 376.046875, + "epoch": 0.046229187071498534, + "grad_norm": 0.8339879341516419, + "kl": 0.06793212890625, + "learning_rate": 8.479229499833844e-07, + "loss": -0.0386, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 413 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.328125, + "epoch": 0.046341122149153494, + "grad_norm": 0.03758254954126514, + "kl": 0.046630859375, + "learning_rate": 8.401774822147976e-07, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 414 + }, + { + "clip_ratio": 0.0, + "completion_length": 426.90625, + "epoch": 0.04645305722680845, + "grad_norm": 0.03884055882244416, + "kl": 0.0457763671875, + "learning_rate": 8.325121467740695e-07, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 415 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.15625, + "epoch": 0.04656499230446341, + "grad_norm": 0.13058568836650014, + "kl": 0.06011962890625, + "learning_rate": 8.249272652833226e-07, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 416 + }, + { + "clip_ratio": 0.0, + "completion_length": 389.59375, + "epoch": 0.04667692738211837, + "grad_norm": 0.05421556382240225, + "kl": 0.0401611328125, + "learning_rate": 8.174231559889931e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 417 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.25, + "epoch": 0.04678886245977333, + "grad_norm": 2.135742456611555, + "kl": 0.55340576171875, + "learning_rate": 8.100001337484787e-07, + "loss": -0.052, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 418 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.5625, + "epoch": 0.04690079753742829, + "grad_norm": 27.79993671743234, + "kl": 2.71258544921875, + "learning_rate": 8.026585100169251e-07, + "loss": 0.1087, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 419 + }, + { + "clip_ratio": 0.0, + "completion_length": 381.640625, + "epoch": 0.04701273261508325, + "grad_norm": 0.3229458185249445, + "kl": 0.0728759765625, + "learning_rate": 7.953985928341601e-07, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 420 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.0625, + "epoch": 0.04712466769273821, + "grad_norm": 0.18899120451502113, + "kl": 0.04962158203125, + "learning_rate": 7.882206868117693e-07, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 421 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.9375, + "epoch": 0.047236602770393175, + "grad_norm": 0.05378027086791007, + "kl": 0.0703125, + "learning_rate": 7.81125093120313e-07, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 422 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.609375, + "epoch": 0.047348537848048135, + "grad_norm": 0.275422779661784, + "kl": 0.05328369140625, + "learning_rate": 7.741121094766916e-07, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 423 + }, + { + "clip_ratio": 0.0, + "completion_length": 386.015625, + "epoch": 0.047460472925703094, + "grad_norm": 0.02946053453290786, + "kl": 0.04119873046875, + "learning_rate": 7.671820301316532e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 424 + }, + { + "clip_ratio": 0.0, + "completion_length": 381.890625, + "epoch": 0.047572408003358053, + "grad_norm": 0.02258693420380951, + "kl": 0.0445556640625, + "learning_rate": 7.603351458574474e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 425 + }, + { + "clip_ratio": 0.0, + "completion_length": 416.9375, + "epoch": 0.04768434308101301, + "grad_norm": 1.383085540888027, + "kl": 0.811279296875, + "learning_rate": 7.535717439356255e-07, + "loss": -0.0196, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 426 + }, + { + "clip_ratio": 0.0, + "completion_length": 368.796875, + "epoch": 0.04779627815866797, + "grad_norm": 3.7777508498427164, + "kl": 0.77886962890625, + "learning_rate": 7.46892108144986e-07, + "loss": -0.0481, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 427 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.875, + "epoch": 0.04790821323632293, + "grad_norm": 0.10955470879553754, + "kl": 0.05078125, + "learning_rate": 7.402965187496697e-07, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 428 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.671875, + "epoch": 0.04802014831397789, + "grad_norm": 0.04113851640884478, + "kl": 0.05523681640625, + "learning_rate": 7.337852524873974e-07, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 429 + }, + { + "clip_ratio": 0.0, + "completion_length": 420.03125, + "epoch": 0.04813208339163285, + "grad_norm": 0.018684245104689922, + "kl": 0.03826904296875, + "learning_rate": 7.273585825578608e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 430 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.5, + "epoch": 0.04824401846928781, + "grad_norm": 8.259759861418749, + "kl": 1.716339111328125, + "learning_rate": 7.21016778611259e-07, + "loss": -0.0347, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 431 + }, + { + "clip_ratio": 0.0, + "completion_length": 402.265625, + "epoch": 0.048355953546942776, + "grad_norm": 0.020343976644658462, + "kl": 0.033447265625, + "learning_rate": 7.147601067369835e-07, + "loss": 0.0003, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 432 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.609375, + "epoch": 0.048467888624597735, + "grad_norm": 1.0066422924478642, + "kl": 0.75701904296875, + "learning_rate": 7.085888294524561e-07, + "loss": -0.0467, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 433 + }, + { + "clip_ratio": 0.0, + "completion_length": 415.34375, + "epoch": 0.048579823702252695, + "grad_norm": 0.026562111650317053, + "kl": 0.04150390625, + "learning_rate": 7.025032056921117e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 434 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.921875, + "epoch": 0.048691758779907654, + "grad_norm": 0.03150175672968521, + "kl": 0.03955078125, + "learning_rate": 6.965034907965349e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 435 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.296875, + "epoch": 0.04880369385756261, + "grad_norm": 0.030310737599258293, + "kl": 0.04107666015625, + "learning_rate": 6.905899365017462e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 436 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.328125, + "epoch": 0.04891562893521757, + "grad_norm": 1.8528337687242826, + "kl": 1.40911865234375, + "learning_rate": 6.847627909286409e-07, + "loss": -0.0344, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 437 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.515625, + "epoch": 0.04902756401287253, + "grad_norm": 0.028586652468959688, + "kl": 0.04345703125, + "learning_rate": 6.790222985725761e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 438 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.4375, + "epoch": 0.04913949909052749, + "grad_norm": 0.08766284200366595, + "kl": 0.0438232421875, + "learning_rate": 6.733687002931141e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 439 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.109375, + "epoch": 0.04925143416818245, + "grad_norm": 2.391400552951245, + "kl": 0.09228515625, + "learning_rate": 6.678022333039158e-07, + "loss": -0.0495, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 440 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.40625, + "epoch": 0.04936336924583742, + "grad_norm": 0.02480090522832702, + "kl": 0.0423583984375, + "learning_rate": 6.623231311627876e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 441 + }, + { + "clip_ratio": 0.0, + "completion_length": 395.546875, + "epoch": 0.049475304323492376, + "grad_norm": 0.019725004180409796, + "kl": 0.0413818359375, + "learning_rate": 6.569316237618811e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 442 + }, + { + "clip_ratio": 0.0, + "completion_length": 359.765625, + "epoch": 0.049587239401147336, + "grad_norm": 0.04162342966221699, + "kl": 0.0498046875, + "learning_rate": 6.516279373180499e-07, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 443 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.640625, + "epoch": 0.049699174478802295, + "grad_norm": 0.02391594067619232, + "kl": 0.0406494140625, + "learning_rate": 6.464122943633543e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 444 + }, + { + "clip_ratio": 0.0, + "completion_length": 369.609375, + "epoch": 0.049811109556457255, + "grad_norm": 0.026401154596888455, + "kl": 0.03631591796875, + "learning_rate": 6.412849137357271e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 445 + }, + { + "clip_ratio": 0.0, + "completion_length": 378.40625, + "epoch": 0.049923044634112214, + "grad_norm": 0.02165150509733936, + "kl": 0.04083251953125, + "learning_rate": 6.3624601056979e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 446 + }, + { + "clip_ratio": 0.0, + "completion_length": 424.734375, + "epoch": 0.05003497971176717, + "grad_norm": 0.056253725649908846, + "kl": 0.04132080078125, + "learning_rate": 6.312957962878278e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 447 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.90625, + "epoch": 0.05014691478942213, + "grad_norm": 0.026148724794812197, + "kl": 0.05267333984375, + "learning_rate": 6.264344785909181e-07, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 448 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.484375, + "epoch": 0.05025884986707709, + "grad_norm": 0.1390653311408599, + "kl": 0.049560546875, + "learning_rate": 6.216622614502149e-07, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 449 + }, + { + "clip_ratio": 0.0, + "completion_length": 388.671875, + "epoch": 0.05037078494473206, + "grad_norm": 0.07042561536180422, + "kl": 0.048095703125, + "learning_rate": 6.169793450983916e-07, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 450 + }, + { + "clip_ratio": 0.0, + "completion_length": 368.046875, + "epoch": 0.05048272002238702, + "grad_norm": 0.8824459143035142, + "kl": 0.163818359375, + "learning_rate": 6.123859260212393e-07, + "loss": 0.0016, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 451 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.421875, + "epoch": 0.05059465510004198, + "grad_norm": 0.9135028043813113, + "kl": 0.13751220703125, + "learning_rate": 6.07882196949423e-07, + "loss": -0.0596, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 452 + }, + { + "clip_ratio": 0.0, + "completion_length": 396.40625, + "epoch": 0.050706590177696936, + "grad_norm": 0.02488219762931012, + "kl": 0.0374755859375, + "learning_rate": 6.034683468503948e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 453 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.765625, + "epoch": 0.050818525255351896, + "grad_norm": 0.07775771681049337, + "kl": 0.040863037109375, + "learning_rate": 5.991445609204641e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 454 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.953125, + "epoch": 0.050930460333006855, + "grad_norm": 0.23941770812751062, + "kl": 0.07257080078125, + "learning_rate": 5.949110205770292e-07, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 455 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.90625, + "epoch": 0.051042395410661814, + "grad_norm": 0.838372427480566, + "kl": 0.10174560546875, + "learning_rate": 5.90767903450964e-07, + "loss": -0.0595, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 456 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.3125, + "epoch": 0.051154330488316774, + "grad_norm": 0.02696775344751943, + "kl": 0.0364990234375, + "learning_rate": 5.867153833791652e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 457 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.890625, + "epoch": 0.05126626556597173, + "grad_norm": 2.593781001852504, + "kl": 0.27880859375, + "learning_rate": 5.827536303972587e-07, + "loss": 0.0028, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 458 + }, + { + "clip_ratio": 0.0, + "completion_length": 440.28125, + "epoch": 0.0513782006436267, + "grad_norm": 0.09378689742071065, + "kl": 0.0506591796875, + "learning_rate": 5.78882810732465e-07, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 459 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.09375, + "epoch": 0.05149013572128166, + "grad_norm": 3.1614265708752236, + "kl": 0.16015625, + "learning_rate": 5.75103086796625e-07, + "loss": 0.0016, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 460 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.296875, + "epoch": 0.05160207079893662, + "grad_norm": 1.7687368403392676, + "kl": 0.1033935546875, + "learning_rate": 5.714146171793846e-07, + "loss": -0.0578, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 461 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.265625, + "epoch": 0.05171400587659158, + "grad_norm": 0.049362194712489586, + "kl": 0.035614013671875, + "learning_rate": 5.678175566415422e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 462 + }, + { + "clip_ratio": 0.0, + "completion_length": 364.171875, + "epoch": 0.05182594095424654, + "grad_norm": 0.02337468550122759, + "kl": 0.0418701171875, + "learning_rate": 5.643120561085528e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 463 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.125, + "epoch": 0.051937876031901496, + "grad_norm": 0.19211609005932148, + "kl": 0.06243896484375, + "learning_rate": 5.608982626641991e-07, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 464 + }, + { + "clip_ratio": 0.0, + "completion_length": 460.9375, + "epoch": 0.052049811109556456, + "grad_norm": 0.02527496415068125, + "kl": 0.0364990234375, + "learning_rate": 5.575763195444166e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 465 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.75, + "epoch": 0.052161746187211415, + "grad_norm": 0.02483601186922771, + "kl": 0.0379638671875, + "learning_rate": 5.543463661312847e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 466 + }, + { + "clip_ratio": 0.0, + "completion_length": 382.203125, + "epoch": 0.052273681264866374, + "grad_norm": 0.030569824177275738, + "kl": 0.036956787109375, + "learning_rate": 5.512085379471808e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 467 + }, + { + "clip_ratio": 0.0, + "completion_length": 372.96875, + "epoch": 0.05238561634252134, + "grad_norm": 0.870995271718928, + "kl": 0.053680419921875, + "learning_rate": 5.481629666490903e-07, + "loss": -0.0283, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 468 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.546875, + "epoch": 0.0524975514201763, + "grad_norm": 0.054911233572157374, + "kl": 0.04248046875, + "learning_rate": 5.452097800230853e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 469 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.671875, + "epoch": 0.05260948649783126, + "grad_norm": 0.9189343613920824, + "kl": 0.35272216796875, + "learning_rate": 5.423491019789623e-07, + "loss": -0.0551, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 470 + }, + { + "clip_ratio": 0.0, + "completion_length": 364.390625, + "epoch": 0.05272142157548622, + "grad_norm": 0.0633802951576789, + "kl": 0.03961181640625, + "learning_rate": 5.395810525450425e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 471 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.03125, + "epoch": 0.05283335665314118, + "grad_norm": 0.1011785822244599, + "kl": 0.053466796875, + "learning_rate": 5.369057478631359e-07, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 472 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.609375, + "epoch": 0.05294529173079614, + "grad_norm": 0.13769887874885445, + "kl": 0.0439453125, + "learning_rate": 5.343233001836694e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 473 + }, + { + "clip_ratio": 0.0, + "completion_length": 387.34375, + "epoch": 0.0530572268084511, + "grad_norm": 0.09302862947626754, + "kl": 0.03985595703125, + "learning_rate": 5.318338178609754e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 474 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.40625, + "epoch": 0.053169161886106056, + "grad_norm": 0.1263278332387542, + "kl": 0.062255859375, + "learning_rate": 5.294374053487459e-07, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 475 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.015625, + "epoch": 0.053281096963761015, + "grad_norm": 0.020700901911474265, + "kl": 0.0355224609375, + "learning_rate": 5.271341631956511e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 476 + }, + { + "clip_ratio": 0.0, + "completion_length": 401.046875, + "epoch": 0.05339303204141598, + "grad_norm": 0.027139069654941325, + "kl": 0.0396728515625, + "learning_rate": 5.249241880411181e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 477 + }, + { + "clip_ratio": 0.0, + "completion_length": 515.703125, + "epoch": 0.05350496711907094, + "grad_norm": 0.04265200650936088, + "kl": 0.060302734375, + "learning_rate": 5.228075726112785e-07, + "loss": 0.0006, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 478 + }, + { + "clip_ratio": 0.0, + "completion_length": 386.640625, + "epoch": 0.0536169021967259, + "grad_norm": 0.025504104198776033, + "kl": 0.03570556640625, + "learning_rate": 5.207844057150768e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 479 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.78125, + "epoch": 0.05372883727438086, + "grad_norm": 18.185082196534506, + "kl": 1.0419921875, + "learning_rate": 5.188547722405437e-07, + "loss": 0.0419, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 480 + }, + { + "clip_ratio": 0.0, + "completion_length": 390.96875, + "epoch": 0.05384077235203582, + "grad_norm": 0.03327137868690443, + "kl": 0.0408935546875, + "learning_rate": 5.170187531512351e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 481 + }, + { + "clip_ratio": 0.0, + "completion_length": 426.703125, + "epoch": 0.05395270742969078, + "grad_norm": 1.7420279438246644, + "kl": 1.31329345703125, + "learning_rate": 5.152764254828348e-07, + "loss": -0.0354, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 482 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.5, + "epoch": 0.05406464250734574, + "grad_norm": 0.33787614688903167, + "kl": 0.06884765625, + "learning_rate": 5.136278623399225e-07, + "loss": 0.0007, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 483 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.515625, + "epoch": 0.0541765775850007, + "grad_norm": 0.023809496275349114, + "kl": 0.05426025390625, + "learning_rate": 5.120731328929058e-07, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 484 + }, + { + "clip_ratio": 0.0, + "completion_length": 383.3125, + "epoch": 0.05428851266265566, + "grad_norm": 0.015058815890629395, + "kl": 0.0362548828125, + "learning_rate": 5.106123023751187e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 485 + }, + { + "clip_ratio": 0.0, + "completion_length": 373.515625, + "epoch": 0.05440044774031062, + "grad_norm": 0.019845886834378008, + "kl": 0.03704833984375, + "learning_rate": 5.092454320800833e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 486 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.0, + "epoch": 0.05451238281796558, + "grad_norm": 0.05683684336283696, + "kl": 0.0426025390625, + "learning_rate": 5.079725793589405e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 487 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.71875, + "epoch": 0.05462431789562054, + "grad_norm": 0.03418640906474549, + "kl": 0.04443359375, + "learning_rate": 5.067937976180407e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 488 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.0625, + "epoch": 0.0547362529732755, + "grad_norm": 0.036114039198282856, + "kl": 0.05450439453125, + "learning_rate": 5.057091363167046e-07, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 489 + }, + { + "clip_ratio": 0.0, + "completion_length": 376.90625, + "epoch": 0.05484818805093046, + "grad_norm": 0.023332365324867536, + "kl": 0.03631591796875, + "learning_rate": 5.047186409651489e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 490 + }, + { + "clip_ratio": 0.0, + "completion_length": 388.671875, + "epoch": 0.05496012312858542, + "grad_norm": 13.172841203218344, + "kl": 2.97564697265625, + "learning_rate": 5.038223531225742e-07, + "loss": 0.0339, + "reward": 0.09687500074505806, + "reward_std": 0.012500000186264515, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 491 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.796875, + "epoch": 0.05507205820624038, + "grad_norm": 9.652533322003624, + "kl": 0.4891357421875, + "learning_rate": 5.030203103954232e-07, + "loss": -0.0404, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 492 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.296875, + "epoch": 0.05518399328389534, + "grad_norm": 0.030140393405088423, + "kl": 0.0369873046875, + "learning_rate": 5.023125464358026e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 493 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.0, + "epoch": 0.0552959283615503, + "grad_norm": 0.02422588912502369, + "kl": 0.037200927734375, + "learning_rate": 5.016990909400709e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 494 + }, + { + "clip_ratio": 0.0, + "completion_length": 370.34375, + "epoch": 0.055407863439205264, + "grad_norm": 0.0372886708561457, + "kl": 0.05389404296875, + "learning_rate": 5.011799696475915e-07, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 495 + }, + { + "clip_ratio": 0.0, + "completion_length": 384.703125, + "epoch": 0.05551979851686022, + "grad_norm": 0.01359500582951308, + "kl": 0.035980224609375, + "learning_rate": 5.007552043396547e-07, + "loss": 0.0004, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 496 + }, + { + "clip_ratio": 0.0, + "completion_length": 400.46875, + "epoch": 0.05563173359451518, + "grad_norm": 0.018893589493420345, + "kl": 0.04736328125, + "learning_rate": 5.004248128385618e-07, + "loss": 0.0005, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 497 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.9375, + "epoch": 0.05574366867217014, + "grad_norm": 2.740738937008755, + "kl": 0.7550048828125, + "learning_rate": 5.001888090068784e-07, + "loss": -0.0421, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 498 + }, + { + "clip_ratio": 0.0, + "completion_length": 386.421875, + "epoch": 0.0558556037498251, + "grad_norm": 0.9492236255194394, + "kl": 0.24884033203125, + "learning_rate": 5.000472027468528e-07, + "loss": -0.0556, + "reward": 0.09843750111758709, + "reward_std": 0.0062500000931322575, + "rewards/code_reward": 0.0, + "rewards/format_reward": 0.984375, + "step": 499 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.65625, + "epoch": 0.05596753882748006, + "grad_norm": 0.024972922874043218, + "kl": 0.033905029296875, + "learning_rate": 5.000000000000001e-07, + "loss": 0.0003, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 500 + }, + { + "epoch": 0.05596753882748006, + "step": 500, + "total_flos": 0.0, + "train_loss": 0.028706138839246706, + "train_runtime": 9937.8104, + "train_samples_per_second": 3.22, + "train_steps_per_second": 0.05 + } + ], + "logging_steps": 1, + "max_steps": 500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}