Qwen2.5-1.5B-Open-R1-Code-GRPO / trainer_state.json
zhimeng's picture
Model save
0584706 verified
raw
history blame
206 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.05596753882748006,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 624.28125,
"epoch": 0.00011193507765496012,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.3333333333333335e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 458.90625,
"epoch": 0.00022387015530992023,
"grad_norm": 0.7040169045969362,
"kl": 0.0,
"learning_rate": 6.666666666666667e-07,
"loss": 0.0601,
"reward": 0.007812500116415322,
"reward_std": 0.01743034040555358,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.078125,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 444.75,
"epoch": 0.00033580523296488035,
"grad_norm": 0.8629215889276903,
"kl": 0.00027060508728027344,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0038,
"reward": 0.004687500069849193,
"reward_std": 0.018750000279396772,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.046875,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 443.078125,
"epoch": 0.00044774031061984047,
"grad_norm": 0.7016431850799195,
"kl": 0.00026345252990722656,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.0435,
"reward": 0.006250000209547579,
"reward_std": 0.016327823046594858,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0625,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 466.0,
"epoch": 0.0005596753882748006,
"grad_norm": 0.4574220548610273,
"kl": 0.0002818107604980469,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.007,
"reward": 0.0031250000465661287,
"reward_std": 0.008539125323295593,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.03125,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 495.09375,
"epoch": 0.0006716104659297607,
"grad_norm": 0.5434000185158782,
"kl": 0.0002865791320800781,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0427,
"reward": 0.0031250000465661287,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.03125,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 509.765625,
"epoch": 0.0007835455435847208,
"grad_norm": 7.737527191229261,
"kl": 0.0010457038879394531,
"learning_rate": 2.3333333333333336e-06,
"loss": -0.0273,
"reward": 0.0031250000465661287,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.03125,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 401.4375,
"epoch": 0.0008954806212396809,
"grad_norm": 0.9652984425394716,
"kl": 0.000743865966796875,
"learning_rate": 2.666666666666667e-06,
"loss": 0.0031,
"reward": 0.0062500000931322575,
"reward_std": 0.021039125509560108,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0625,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 631.203125,
"epoch": 0.0010074156988946412,
"grad_norm": 0.565508681156182,
"kl": 0.001064300537109375,
"learning_rate": 3e-06,
"loss": 0.0314,
"reward": 0.004687500069849193,
"reward_std": 0.01478912541642785,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.046875,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 509.109375,
"epoch": 0.0011193507765496012,
"grad_norm": 29.074260549966585,
"kl": 0.7770309448242188,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0532,
"reward": 0.02031250041909516,
"reward_std": 0.04097762983292341,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.203125,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 306.96875,
"epoch": 0.0012312858542045614,
"grad_norm": 1.335155005624375,
"kl": 0.03216552734375,
"learning_rate": 3.6666666666666666e-06,
"loss": 0.0838,
"reward": 0.025000000605359674,
"reward_std": 0.043084788136184216,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.25,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 469.265625,
"epoch": 0.0013432209318595214,
"grad_norm": 0.9728533904128814,
"kl": 0.11004638671875,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0547,
"reward": 0.02187500020954758,
"reward_std": 0.04057852132245898,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.21875,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 411.71875,
"epoch": 0.0014551560095144816,
"grad_norm": 3.1109091298399707,
"kl": 0.34326171875,
"learning_rate": 4.333333333333334e-06,
"loss": 0.1651,
"reward": 0.05625000037252903,
"reward_std": 0.04977653082460165,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.5625,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 437.4375,
"epoch": 0.0015670910871694416,
"grad_norm": 1.4333106697160516,
"kl": 0.2135009765625,
"learning_rate": 4.666666666666667e-06,
"loss": 0.0872,
"reward": 0.06718750111758709,
"reward_std": 0.04840352013707161,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.671875,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 459.671875,
"epoch": 0.0016790261648244019,
"grad_norm": 1.1323714811109955,
"kl": 0.0590972900390625,
"learning_rate": 5e-06,
"loss": 0.0902,
"reward": 0.07343750260770321,
"reward_std": 0.04493850376456976,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.734375,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 430.921875,
"epoch": 0.0017909612424793619,
"grad_norm": 1.1229606735516884,
"kl": 0.0349578857421875,
"learning_rate": 4.999952797253148e-06,
"loss": 0.0231,
"reward": 0.07031250186264515,
"reward_std": 0.04625816363841295,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.703125,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 361.84375,
"epoch": 0.001902896320134322,
"grad_norm": 1.3958687154501264,
"kl": 0.040618896484375,
"learning_rate": 4.9998111909931225e-06,
"loss": 0.0841,
"reward": 0.07656250149011612,
"reward_std": 0.04255262762308121,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.765625,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 366.265625,
"epoch": 0.0020148313977892823,
"grad_norm": 1.2810159881634564,
"kl": 0.058685302734375,
"learning_rate": 4.999575187161439e-06,
"loss": -0.0583,
"reward": 0.08125000260770321,
"reward_std": 0.038373483810573816,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 544.984375,
"epoch": 0.0021267664754442426,
"grad_norm": 0.6496120974985506,
"kl": 0.0307464599609375,
"learning_rate": 4.9992447956603455e-06,
"loss": 0.0475,
"reward": 0.09218750149011612,
"reward_std": 0.01743034040555358,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.921875,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 470.90625,
"epoch": 0.0022387015530992023,
"grad_norm": 1.1640288172191966,
"kl": 0.016815185546875,
"learning_rate": 4.998820030352409e-06,
"loss": 0.12,
"reward": 0.09375000186264515,
"reward_std": 0.021039125509560108,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 295.90625,
"epoch": 0.0023506366307541626,
"grad_norm": 2.092394327511984,
"kl": 0.4071197509765625,
"learning_rate": 4.998300909059929e-06,
"loss": 0.077,
"reward": 0.08906250260770321,
"reward_std": 0.025969465728849173,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.890625,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 466.609375,
"epoch": 0.002462571708409123,
"grad_norm": 1.7353937284994216,
"kl": 0.130462646484375,
"learning_rate": 4.997687453564198e-06,
"loss": 0.0013,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 364.75,
"epoch": 0.002574506786064083,
"grad_norm": 0.6464548918875196,
"kl": 0.0576171875,
"learning_rate": 4.9969796896045775e-06,
"loss": -0.0217,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 342.421875,
"epoch": 0.002686441863719043,
"grad_norm": 116.68805529495698,
"kl": 6.03173828125,
"learning_rate": 4.996177646877426e-06,
"loss": 0.0415,
"reward": 0.09218750149011612,
"reward_std": 0.017430341336876154,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.921875,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 380.5625,
"epoch": 0.002798376941374003,
"grad_norm": 3.181683689923044,
"kl": 0.4976806640625,
"learning_rate": 4.995281359034851e-06,
"loss": -0.0548,
"reward": 0.09218750335276127,
"reward_std": 0.023328250739723444,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.921875,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 395.578125,
"epoch": 0.0029103120190289633,
"grad_norm": 0.9693325036669225,
"kl": 0.07562255859375,
"learning_rate": 4.994290863683296e-06,
"loss": 0.0118,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 505.640625,
"epoch": 0.0030222470966839235,
"grad_norm": 0.762738499506801,
"kl": 0.1248779296875,
"learning_rate": 4.99320620238196e-06,
"loss": 0.0572,
"reward": 0.09375000186264515,
"reward_std": 0.016327822115272284,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 356.65625,
"epoch": 0.0031341821743388833,
"grad_norm": 0.08544860970511584,
"kl": 0.04632568359375,
"learning_rate": 4.99202742064106e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 345.015625,
"epoch": 0.0032461172519938435,
"grad_norm": 1.1227892135326387,
"kl": 0.03948974609375,
"learning_rate": 4.990754567919917e-06,
"loss": -0.0086,
"reward": 0.09531250223517418,
"reward_std": 0.01478912541642785,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.953125,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 413.34375,
"epoch": 0.0033580523296488037,
"grad_norm": 0.8427884883196753,
"kl": 0.03790283203125,
"learning_rate": 4.989387697624881e-06,
"loss": 0.0001,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 290.09375,
"epoch": 0.003469987407303764,
"grad_norm": 0.916292516898012,
"kl": 0.03955078125,
"learning_rate": 4.987926867107095e-06,
"loss": -0.0193,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 403.28125,
"epoch": 0.0035819224849587238,
"grad_norm": 0.5282928671535011,
"kl": 0.043212890625,
"learning_rate": 4.986372137660078e-06,
"loss": -0.0401,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 376.0,
"epoch": 0.003693857562613684,
"grad_norm": 0.8076561222873071,
"kl": 0.06463623046875,
"learning_rate": 4.984723574517165e-06,
"loss": 0.0103,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 352.015625,
"epoch": 0.003805792640268644,
"grad_norm": 0.5820201891858064,
"kl": 0.04254150390625,
"learning_rate": 4.9829812468487655e-06,
"loss": -0.0118,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 321.765625,
"epoch": 0.0039177277179236044,
"grad_norm": 1.30271756609257,
"kl": 0.07342529296875,
"learning_rate": 4.981145227759457e-06,
"loss": 0.0033,
"reward": 0.09375000186264515,
"reward_std": 0.021039125509560108,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 325.328125,
"epoch": 0.004029662795578565,
"grad_norm": 0.5253064587253158,
"kl": 0.0435791015625,
"learning_rate": 4.979215594284924e-06,
"loss": -0.004,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 285.046875,
"epoch": 0.004141597873233525,
"grad_norm": 0.03440107214885444,
"kl": 0.03900146484375,
"learning_rate": 4.977192427388722e-06,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 446.390625,
"epoch": 0.004253532950888485,
"grad_norm": 0.03871583904571488,
"kl": 0.033447265625,
"learning_rate": 4.9750758119588824e-06,
"loss": 0.0003,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 407.453125,
"epoch": 0.0043654680285434445,
"grad_norm": 0.14679688464116405,
"kl": 0.05303955078125,
"learning_rate": 4.972865836804349e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 386.578125,
"epoch": 0.004477403106198405,
"grad_norm": 1.1845417806658105,
"kl": 0.06744384765625,
"learning_rate": 4.970562594651254e-06,
"loss": 0.0676,
"reward": 0.09531250037252903,
"reward_std": 0.018750000279396772,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.953125,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 411.359375,
"epoch": 0.004589338183853365,
"grad_norm": 0.6033536519344826,
"kl": 0.05682373046875,
"learning_rate": 4.968166182139026e-06,
"loss": 0.1634,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 419.03125,
"epoch": 0.004701273261508325,
"grad_norm": 0.03504335311347305,
"kl": 0.042327880859375,
"learning_rate": 4.9656766998163306e-06,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 414.453125,
"epoch": 0.004813208339163285,
"grad_norm": 0.11226404372767065,
"kl": 0.0537109375,
"learning_rate": 4.963094252136865e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 470.296875,
"epoch": 0.004925143416818246,
"grad_norm": 1.682711207148907,
"kl": 0.2684326171875,
"learning_rate": 4.960418947454958e-06,
"loss": 0.0222,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 405.625,
"epoch": 0.005037078494473206,
"grad_norm": 0.04103769366987546,
"kl": 0.0435791015625,
"learning_rate": 4.957650898021038e-06,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 422.765625,
"epoch": 0.005149013572128166,
"grad_norm": 0.524235834468468,
"kl": 0.065673828125,
"learning_rate": 4.954790219976915e-06,
"loss": -0.0335,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 443.984375,
"epoch": 0.005260948649783125,
"grad_norm": 105.38307149631456,
"kl": 0.32183837890625,
"learning_rate": 4.95183703335091e-06,
"loss": 0.0819,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 385.65625,
"epoch": 0.005372883727438086,
"grad_norm": 0.6435525752943102,
"kl": 0.1368408203125,
"learning_rate": 4.948791462052819e-06,
"loss": 0.0042,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 630.0,
"epoch": 0.005484818805093046,
"grad_norm": 0.5344456298032659,
"kl": 0.060546875,
"learning_rate": 4.945653633868716e-06,
"loss": 0.0254,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 422.75,
"epoch": 0.005596753882748006,
"grad_norm": 0.7155667213796403,
"kl": 0.08978271484375,
"learning_rate": 4.942423680455584e-06,
"loss": 0.0132,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 596.296875,
"epoch": 0.005708688960402966,
"grad_norm": 0.6239904143882911,
"kl": 0.04815673828125,
"learning_rate": 4.939101737335802e-06,
"loss": 0.0135,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 402.71875,
"epoch": 0.0058206240380579265,
"grad_norm": 1.104725268564954,
"kl": 0.145263671875,
"learning_rate": 4.935687943891447e-06,
"loss": 0.0015,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 477.328125,
"epoch": 0.005932559115712887,
"grad_norm": 0.5835454079488682,
"kl": 0.075927734375,
"learning_rate": 4.932182443358458e-06,
"loss": 0.1512,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 580.578125,
"epoch": 0.006044494193367847,
"grad_norm": 0.49184303981459476,
"kl": 0.05926513671875,
"learning_rate": 4.928585382820616e-06,
"loss": 0.0194,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 392.046875,
"epoch": 0.006156429271022806,
"grad_norm": 1.0291704424132635,
"kl": 0.1390380859375,
"learning_rate": 4.924896913203376e-06,
"loss": 0.0102,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 465.0625,
"epoch": 0.006268364348677767,
"grad_norm": 0.5613069680179144,
"kl": 0.08880615234375,
"learning_rate": 4.921117189267535e-06,
"loss": 0.0121,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 480.359375,
"epoch": 0.006380299426332727,
"grad_norm": 0.7905247960482367,
"kl": 0.06884765625,
"learning_rate": 4.917246369602742e-06,
"loss": 0.0134,
"reward": 0.09531250223517418,
"reward_std": 0.01478912541642785,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.953125,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 393.28125,
"epoch": 0.006492234503987687,
"grad_norm": 18.99182061239259,
"kl": 1.30133056640625,
"learning_rate": 4.9132846166208355e-06,
"loss": -0.0248,
"reward": 0.09375000186264515,
"reward_std": 0.021039125509560108,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 509.59375,
"epoch": 0.006604169581642647,
"grad_norm": 1.0573356685875819,
"kl": 0.06280517578125,
"learning_rate": 4.9092320965490365e-06,
"loss": 0.0153,
"reward": 0.09375,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 415.171875,
"epoch": 0.0067161046592976075,
"grad_norm": 0.5498420295992223,
"kl": 0.070556640625,
"learning_rate": 4.905088979422971e-06,
"loss": -0.0324,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 374.109375,
"epoch": 0.006828039736952568,
"grad_norm": 0.4563712143777335,
"kl": 0.065673828125,
"learning_rate": 4.900855439079536e-06,
"loss": -0.0263,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 470.03125,
"epoch": 0.006939974814607528,
"grad_norm": 0.022676570314052416,
"kl": 0.0540771484375,
"learning_rate": 4.8965316531496055e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 442.109375,
"epoch": 0.007051909892262488,
"grad_norm": 0.37277836211463244,
"kl": 0.0537109375,
"learning_rate": 4.892117803050578e-06,
"loss": -0.0112,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 407.3125,
"epoch": 0.0071638449699174475,
"grad_norm": 0.6045549634649411,
"kl": 0.06939697265625,
"learning_rate": 4.887614073978761e-06,
"loss": 0.0378,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 490.53125,
"epoch": 0.007275780047572408,
"grad_norm": 0.4804494114616987,
"kl": 0.0440673828125,
"learning_rate": 4.883020654901609e-06,
"loss": 0.0326,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 414.625,
"epoch": 0.007387715125227368,
"grad_norm": 0.02779607160337418,
"kl": 0.04901123046875,
"learning_rate": 4.878337738549785e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 391.34375,
"epoch": 0.007499650202882328,
"grad_norm": 0.019327395350080524,
"kl": 0.05145263671875,
"learning_rate": 4.873565521409082e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 408.140625,
"epoch": 0.007611585280537288,
"grad_norm": 0.7217823002699405,
"kl": 0.07281494140625,
"learning_rate": 4.868704203712173e-06,
"loss": 0.0201,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 488.171875,
"epoch": 0.007723520358192249,
"grad_norm": 0.01773957052813651,
"kl": 0.04571533203125,
"learning_rate": 4.86375398943021e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 405.109375,
"epoch": 0.007835455435847209,
"grad_norm": 0.5495053845625697,
"kl": 0.05712890625,
"learning_rate": 4.858715086264274e-06,
"loss": 0.0313,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 340.9375,
"epoch": 0.007947390513502168,
"grad_norm": 221.95709261316097,
"kl": 29.68072509765625,
"learning_rate": 4.853587705636646e-06,
"loss": 0.4784,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 328.046875,
"epoch": 0.00805932559115713,
"grad_norm": 0.6777938636001378,
"kl": 0.0565185546875,
"learning_rate": 4.84837206268195e-06,
"loss": -0.034,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 283.0,
"epoch": 0.008171260668812089,
"grad_norm": 0.03603965081169223,
"kl": 0.07452392578125,
"learning_rate": 4.8430683762381195e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 357.4375,
"epoch": 0.00828319574646705,
"grad_norm": 0.024934251266286088,
"kl": 0.06591796875,
"learning_rate": 4.837676868837213e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 395.796875,
"epoch": 0.008395130824122009,
"grad_norm": 0.025440481130695015,
"kl": 0.05645751953125,
"learning_rate": 4.832197766696085e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 448.171875,
"epoch": 0.00850706590177697,
"grad_norm": 0.053611239059879696,
"kl": 0.056640625,
"learning_rate": 4.826631299706887e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 498.859375,
"epoch": 0.00861900097943193,
"grad_norm": 0.6813369559573024,
"kl": 0.05133056640625,
"learning_rate": 4.820977701427424e-06,
"loss": 0.1191,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 345.421875,
"epoch": 0.008730936057086889,
"grad_norm": 0.7460088285353335,
"kl": 0.0655517578125,
"learning_rate": 4.81523720907136e-06,
"loss": 0.0598,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 374.640625,
"epoch": 0.00884287113474185,
"grad_norm": 0.6085630747158598,
"kl": 0.06109619140625,
"learning_rate": 4.809410063498254e-06,
"loss": 0.0091,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 373.125,
"epoch": 0.00895480621239681,
"grad_norm": 0.05060284374443874,
"kl": 0.06451416015625,
"learning_rate": 4.8034965092034656e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 479.15625,
"epoch": 0.00906674129005177,
"grad_norm": 0.020553083133118145,
"kl": 0.05169677734375,
"learning_rate": 4.797496794307889e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 383.921875,
"epoch": 0.00917867636770673,
"grad_norm": 0.02331309838539552,
"kl": 0.05242919921875,
"learning_rate": 4.791411170547545e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 470.296875,
"epoch": 0.009290611445361691,
"grad_norm": 0.8366878780204882,
"kl": 0.0616455078125,
"learning_rate": 4.785239893263017e-06,
"loss": 0.019,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 379.609375,
"epoch": 0.00940254652301665,
"grad_norm": 0.4621690087605406,
"kl": 0.07269287109375,
"learning_rate": 4.778983221388742e-06,
"loss": -0.0043,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 323.453125,
"epoch": 0.00951448160067161,
"grad_norm": 0.7196439160838366,
"kl": 0.07952880859375,
"learning_rate": 4.77264141744214e-06,
"loss": 0.0222,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 363.40625,
"epoch": 0.00962641667832657,
"grad_norm": 0.42630497015032043,
"kl": 0.10479736328125,
"learning_rate": 4.766214747512603e-06,
"loss": -0.0067,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 301.90625,
"epoch": 0.00973835175598153,
"grad_norm": 0.0717989075808753,
"kl": 0.06146240234375,
"learning_rate": 4.759703481250331e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 514.953125,
"epoch": 0.009850286833636491,
"grad_norm": 0.5199995504833264,
"kl": 0.0433349609375,
"learning_rate": 4.753107891855015e-06,
"loss": -0.0066,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 270.765625,
"epoch": 0.00996222191129145,
"grad_norm": 0.04715436027273259,
"kl": 0.07342529296875,
"learning_rate": 4.746428256064375e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 313.0,
"epoch": 0.010074156988946412,
"grad_norm": 0.03668445511393964,
"kl": 0.05902099609375,
"learning_rate": 4.7396648541425534e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 343.015625,
"epoch": 0.010186092066601371,
"grad_norm": 0.025197578254458317,
"kl": 0.080322265625,
"learning_rate": 4.732817969868348e-06,
"loss": 0.0008,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 337.984375,
"epoch": 0.010298027144256332,
"grad_norm": 0.027850466760737828,
"kl": 0.057373046875,
"learning_rate": 4.7258878905233095e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 340.03125,
"epoch": 0.010409962221911291,
"grad_norm": 0.8302898332353863,
"kl": 0.06591796875,
"learning_rate": 4.718874906879688e-06,
"loss": 0.2943,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 412.859375,
"epoch": 0.01052189729956625,
"grad_norm": 0.030863660874464384,
"kl": 0.052978515625,
"learning_rate": 4.711779313188231e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 377.265625,
"epoch": 0.010633832377221212,
"grad_norm": 0.04564269693547498,
"kl": 0.06561279296875,
"learning_rate": 4.70460140716584e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 372.734375,
"epoch": 0.010745767454876171,
"grad_norm": 1.0393244090787366,
"kl": 0.06671142578125,
"learning_rate": 4.697341489983076e-06,
"loss": 0.0811,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 362.25,
"epoch": 0.010857702532531132,
"grad_norm": 0.023478872872016283,
"kl": 0.0535888671875,
"learning_rate": 4.6899998662515215e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 373.90625,
"epoch": 0.010969637610186092,
"grad_norm": 0.019758461356285777,
"kl": 0.0531005859375,
"learning_rate": 4.682576844011007e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 395.90625,
"epoch": 0.011081572687841053,
"grad_norm": 0.01976638212707949,
"kl": 0.0645751953125,
"learning_rate": 4.675072734716678e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 366.859375,
"epoch": 0.011193507765496012,
"grad_norm": 2.002878750503418,
"kl": 0.0760498046875,
"learning_rate": 4.667487853225931e-06,
"loss": 0.3922,
"reward": 0.09531250037252903,
"reward_std": 0.018750000279396772,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.953125,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 477.859375,
"epoch": 0.011305442843150973,
"grad_norm": 0.6672407190007655,
"kl": 0.05157470703125,
"learning_rate": 4.659822517785203e-06,
"loss": 0.1641,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 345.8125,
"epoch": 0.011417377920805933,
"grad_norm": 0.9439619070630538,
"kl": 0.061767578125,
"learning_rate": 4.6520770500166165e-06,
"loss": 0.3783,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 539.90625,
"epoch": 0.011529312998460892,
"grad_norm": 0.7519670307168462,
"kl": 0.05535888671875,
"learning_rate": 4.644251774904487e-06,
"loss": 0.0952,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 396.53125,
"epoch": 0.011641248076115853,
"grad_norm": 1.2637256595006654,
"kl": 0.160400390625,
"learning_rate": 4.636347020781684e-06,
"loss": -0.0378,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 403.40625,
"epoch": 0.011753183153770812,
"grad_norm": 0.9414284114713096,
"kl": 0.0699462890625,
"learning_rate": 4.6283631193158605e-06,
"loss": -0.0089,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 404.9375,
"epoch": 0.011865118231425774,
"grad_norm": 0.15424717287769374,
"kl": 0.0926513671875,
"learning_rate": 4.620300405495532e-06,
"loss": 0.0009,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 376.765625,
"epoch": 0.011977053309080733,
"grad_norm": 0.6366716085256641,
"kl": 0.0911865234375,
"learning_rate": 4.612159217616022e-06,
"loss": 0.0133,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 385.484375,
"epoch": 0.012088988386735694,
"grad_norm": 0.020907253695270293,
"kl": 0.06060791015625,
"learning_rate": 4.603939897265268e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 381.453125,
"epoch": 0.012200923464390653,
"grad_norm": 0.024537257651566672,
"kl": 0.06396484375,
"learning_rate": 4.595642789309492e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 344.421875,
"epoch": 0.012312858542045613,
"grad_norm": 0.02438549022281726,
"kl": 0.06048583984375,
"learning_rate": 4.587268241878724e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 338.796875,
"epoch": 0.012424793619700574,
"grad_norm": 0.5942167965684412,
"kl": 0.0833740234375,
"learning_rate": 4.578816606352205e-06,
"loss": 0.0065,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 379.953125,
"epoch": 0.012536728697355533,
"grad_norm": 0.023256524966985128,
"kl": 0.05865478515625,
"learning_rate": 4.570288237343632e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 344.203125,
"epoch": 0.012648663775010494,
"grad_norm": 0.4917785489739923,
"kl": 0.0616455078125,
"learning_rate": 4.561683492686289e-06,
"loss": 0.0131,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 323.609375,
"epoch": 0.012760598852665454,
"grad_norm": 0.4189078075428464,
"kl": 0.06109619140625,
"learning_rate": 4.5530027334180285e-06,
"loss": -0.0367,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 415.71875,
"epoch": 0.012872533930320415,
"grad_norm": 0.03244601571119012,
"kl": 0.0718994140625,
"learning_rate": 4.544246323766122e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 294.453125,
"epoch": 0.012984469007975374,
"grad_norm": 0.04077561657409853,
"kl": 0.0740966796875,
"learning_rate": 4.535414631131983e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 259.078125,
"epoch": 0.013096404085630335,
"grad_norm": 0.04561395129255693,
"kl": 0.0780029296875,
"learning_rate": 4.526508026075746e-06,
"loss": 0.0008,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 353.875,
"epoch": 0.013208339163285295,
"grad_norm": 0.032666442461546756,
"kl": 0.071533203125,
"learning_rate": 4.517526882300721e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 445.484375,
"epoch": 0.013320274240940254,
"grad_norm": 0.030745081226969093,
"kl": 0.0496826171875,
"learning_rate": 4.508471576637713e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 410.46875,
"epoch": 0.013432209318595215,
"grad_norm": 0.025825744807750583,
"kl": 0.0751953125,
"learning_rate": 4.499342489029211e-06,
"loss": 0.0008,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 261.15625,
"epoch": 0.013544144396250174,
"grad_norm": 0.027055040661668105,
"kl": 0.071533203125,
"learning_rate": 4.490140002513449e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 503.46875,
"epoch": 0.013656079473905135,
"grad_norm": 0.9837824917503732,
"kl": 0.05841064453125,
"learning_rate": 4.48086450320833e-06,
"loss": 0.156,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 551.9375,
"epoch": 0.013768014551560095,
"grad_norm": 0.36909382007632124,
"kl": 0.0616455078125,
"learning_rate": 4.4715163802952266e-06,
"loss": 0.034,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 311.875,
"epoch": 0.013879949629215056,
"grad_norm": 0.02289685144542213,
"kl": 0.0621337890625,
"learning_rate": 4.462096026002655e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 349.609375,
"epoch": 0.013991884706870015,
"grad_norm": 0.5288400884276316,
"kl": 0.06207275390625,
"learning_rate": 4.4526038355898144e-06,
"loss": -0.0128,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 376.234375,
"epoch": 0.014103819784524976,
"grad_norm": 0.019666228568501372,
"kl": 0.056396484375,
"learning_rate": 4.4430402073300035e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 338.671875,
"epoch": 0.014215754862179936,
"grad_norm": 0.022162440604770836,
"kl": 0.0638427734375,
"learning_rate": 4.433405542493909e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 356.328125,
"epoch": 0.014327689939834895,
"grad_norm": 0.5613568768967913,
"kl": 0.06121826171875,
"learning_rate": 4.4237002453327734e-06,
"loss": -0.0001,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 375.328125,
"epoch": 0.014439625017489856,
"grad_norm": 0.025068232916270288,
"kl": 0.06744384765625,
"learning_rate": 4.4139247230614245e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 407.34375,
"epoch": 0.014551560095144815,
"grad_norm": 0.018723191179922716,
"kl": 0.05828857421875,
"learning_rate": 4.404079385841201e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 265.3125,
"epoch": 0.014663495172799777,
"grad_norm": 0.16972024878699354,
"kl": 0.09710693359375,
"learning_rate": 4.394164646762734e-06,
"loss": 0.001,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 321.984375,
"epoch": 0.014775430250454736,
"grad_norm": 0.0237436227671808,
"kl": 0.06884765625,
"learning_rate": 4.384180921828618e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 419.296875,
"epoch": 0.014887365328109697,
"grad_norm": 0.6392088951882646,
"kl": 0.05865478515625,
"learning_rate": 4.374128629935955e-06,
"loss": 0.1335,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 371.859375,
"epoch": 0.014999300405764656,
"grad_norm": 0.0258451541588282,
"kl": 0.0711669921875,
"learning_rate": 4.364008192858781e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 386.5625,
"epoch": 0.015111235483419617,
"grad_norm": 0.023614977303173818,
"kl": 0.0672607421875,
"learning_rate": 4.353820035230366e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 466.734375,
"epoch": 0.015223170561074577,
"grad_norm": 0.019070024346192215,
"kl": 0.066162109375,
"learning_rate": 4.3435645845254e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 504.890625,
"epoch": 0.015335105638729536,
"grad_norm": 0.37696582370006476,
"kl": 0.06463623046875,
"learning_rate": 4.333242271042054e-06,
"loss": -0.0004,
"reward": 0.09687500260770321,
"reward_std": 0.008539125323295593,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 567.234375,
"epoch": 0.015447040716384497,
"grad_norm": 3.0390479634412357,
"kl": 0.071533203125,
"learning_rate": 4.32285352788393e-06,
"loss": 0.2188,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 457.21875,
"epoch": 0.015558975794039457,
"grad_norm": 1.8150343506782198,
"kl": 0.106689453125,
"learning_rate": 4.312398790941882e-06,
"loss": 0.3003,
"reward": 0.09531250223517418,
"reward_std": 0.01478912541642785,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.953125,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 609.28125,
"epoch": 0.015670910871694418,
"grad_norm": 0.03462843424405208,
"kl": 0.05792236328125,
"learning_rate": 4.301878498875735e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 473.421875,
"epoch": 0.015782845949349377,
"grad_norm": 1.7729469082974438,
"kl": 0.06402587890625,
"learning_rate": 4.291293093095873e-06,
"loss": 0.1156,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 440.90625,
"epoch": 0.015894781027004336,
"grad_norm": 0.678889245969432,
"kl": 0.0787353515625,
"learning_rate": 4.280643017744723e-06,
"loss": 0.0363,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 615.828125,
"epoch": 0.0160067161046593,
"grad_norm": 3.1600108504474265,
"kl": 0.103759765625,
"learning_rate": 4.269928719678117e-06,
"loss": 0.2578,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 383.375,
"epoch": 0.01611865118231426,
"grad_norm": 0.13214155840655448,
"kl": 0.101318359375,
"learning_rate": 4.2591506484465426e-06,
"loss": 0.001,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 414.90625,
"epoch": 0.016230586259969218,
"grad_norm": 0.7194283098737337,
"kl": 0.08380126953125,
"learning_rate": 4.248309256276283e-06,
"loss": -0.0069,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 480.046875,
"epoch": 0.016342521337624177,
"grad_norm": 0.10784588867048772,
"kl": 0.0704345703125,
"learning_rate": 4.23740499805044e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 386.234375,
"epoch": 0.016454456415279137,
"grad_norm": 1.2992200606773725,
"kl": 0.1204833984375,
"learning_rate": 4.22643833128985e-06,
"loss": 0.0012,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 430.171875,
"epoch": 0.0165663914929341,
"grad_norm": 16.491496062528398,
"kl": 0.4547119140625,
"learning_rate": 4.215409716133885e-06,
"loss": 0.131,
"reward": 0.09531250223517418,
"reward_std": 0.01478912541642785,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.953125,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 471.171875,
"epoch": 0.01667832657058906,
"grad_norm": 4.096997318208475,
"kl": 0.2073974609375,
"learning_rate": 4.204319615321151e-06,
"loss": 0.0021,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 423.59375,
"epoch": 0.016790261648244018,
"grad_norm": 3.910798929312304,
"kl": 0.2662353515625,
"learning_rate": 4.193168494170065e-06,
"loss": 0.2077,
"reward": 0.09218750335276127,
"reward_std": 0.01861694734543562,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.921875,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 388.78125,
"epoch": 0.016902196725898978,
"grad_norm": 13.951120691377671,
"kl": 0.37646484375,
"learning_rate": 4.181956820559339e-06,
"loss": 0.5985,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 348.0,
"epoch": 0.01701413180355394,
"grad_norm": 5.2519598128920455,
"kl": 0.1241455078125,
"learning_rate": 4.170685064908342e-06,
"loss": 0.2291,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 459.453125,
"epoch": 0.0171260668812089,
"grad_norm": 10.132405471868221,
"kl": 0.1990966796875,
"learning_rate": 4.159353700157365e-06,
"loss": 0.1752,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 386.171875,
"epoch": 0.01723800195886386,
"grad_norm": 0.784730424677537,
"kl": 0.0968017578125,
"learning_rate": 4.14796320174778e-06,
"loss": 0.001,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 368.65625,
"epoch": 0.01734993703651882,
"grad_norm": 27.02725934627477,
"kl": 0.109619140625,
"learning_rate": 4.136514047602087e-06,
"loss": 0.1772,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 477.296875,
"epoch": 0.017461872114173778,
"grad_norm": 14.370055706061692,
"kl": 0.1593017578125,
"learning_rate": 4.1250067181038635e-06,
"loss": 0.2029,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 480.53125,
"epoch": 0.01757380719182874,
"grad_norm": 13.084686861766809,
"kl": 0.1204833984375,
"learning_rate": 4.113441696077608e-06,
"loss": 0.1918,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 379.359375,
"epoch": 0.0176857422694837,
"grad_norm": 0.6843422318132463,
"kl": 0.07861328125,
"learning_rate": 4.101819466768484e-06,
"loss": 0.017,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 366.34375,
"epoch": 0.01779767734713866,
"grad_norm": 6.203392924309637,
"kl": 0.2252197265625,
"learning_rate": 4.0901405178219535e-06,
"loss": -0.0466,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 404.71875,
"epoch": 0.01790961242479362,
"grad_norm": 0.7482548079571155,
"kl": 0.15234375,
"learning_rate": 4.078405339263326e-06,
"loss": 0.0015,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 392.734375,
"epoch": 0.018021547502448578,
"grad_norm": 0.821487756754832,
"kl": 0.095458984375,
"learning_rate": 4.06661442347719e-06,
"loss": 0.008,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 370.734375,
"epoch": 0.01813348258010354,
"grad_norm": 0.25693644980051783,
"kl": 0.1165771484375,
"learning_rate": 4.054768265186758e-06,
"loss": 0.0012,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 336.859375,
"epoch": 0.0182454176577585,
"grad_norm": 0.3151740457382109,
"kl": 0.0853271484375,
"learning_rate": 4.0428673614331036e-06,
"loss": 0.0009,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 354.203125,
"epoch": 0.01835735273541346,
"grad_norm": 0.2872706706321094,
"kl": 0.090087890625,
"learning_rate": 4.030912211554316e-06,
"loss": 0.0009,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 337.90625,
"epoch": 0.01846928781306842,
"grad_norm": 0.11020779139062825,
"kl": 0.0782470703125,
"learning_rate": 4.018903317164539e-06,
"loss": 0.0008,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 346.25,
"epoch": 0.018581222890723382,
"grad_norm": 0.045653419133126164,
"kl": 0.0740966796875,
"learning_rate": 4.006841182132932e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 343.359375,
"epoch": 0.01869315796837834,
"grad_norm": 0.021075436862415513,
"kl": 0.06011962890625,
"learning_rate": 3.9947263125625195e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 263.21875,
"epoch": 0.0188050930460333,
"grad_norm": 0.04494777486555804,
"kl": 0.07147216796875,
"learning_rate": 3.982559216768967e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 339.40625,
"epoch": 0.01891702812368826,
"grad_norm": 0.018822857218736996,
"kl": 0.0582275390625,
"learning_rate": 3.970340405259245e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 350.8125,
"epoch": 0.01902896320134322,
"grad_norm": 0.023628578386486077,
"kl": 0.07000732421875,
"learning_rate": 3.958070390710214e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 367.921875,
"epoch": 0.019140898278998182,
"grad_norm": 0.023811978335883294,
"kl": 0.0609130859375,
"learning_rate": 3.945749687947109e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 300.046875,
"epoch": 0.01925283335665314,
"grad_norm": 0.8750934577120364,
"kl": 0.07550048828125,
"learning_rate": 3.933378813921942e-06,
"loss": 0.013,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 335.546875,
"epoch": 0.0193647684343081,
"grad_norm": 0.6477727504447495,
"kl": 0.071044921875,
"learning_rate": 3.920958287691811e-06,
"loss": -0.0026,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 268.5,
"epoch": 0.01947670351196306,
"grad_norm": 27.871539574475218,
"kl": 0.44085693359375,
"learning_rate": 3.908488630397121e-06,
"loss": -0.0071,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 328.78125,
"epoch": 0.019588638589618023,
"grad_norm": 0.057425424775596354,
"kl": 0.06646728515625,
"learning_rate": 3.8959703652397175e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 359.21875,
"epoch": 0.019700573667272982,
"grad_norm": 0.02368246574314423,
"kl": 0.055419921875,
"learning_rate": 3.883404017460935e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 320.234375,
"epoch": 0.019812508744927942,
"grad_norm": 0.03358527901376269,
"kl": 0.05792236328125,
"learning_rate": 3.870790114319559e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 249.78125,
"epoch": 0.0199244438225829,
"grad_norm": 0.5392466815020299,
"kl": 0.06982421875,
"learning_rate": 3.858129185069701e-06,
"loss": -0.0209,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 322.09375,
"epoch": 0.02003637890023786,
"grad_norm": 0.17046312215041995,
"kl": 0.08056640625,
"learning_rate": 3.845421760938597e-06,
"loss": 0.0008,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 343.234375,
"epoch": 0.020148313977892823,
"grad_norm": 0.05778936711697761,
"kl": 0.05181884765625,
"learning_rate": 3.832668375104312e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 328.75,
"epoch": 0.020260249055547783,
"grad_norm": 0.9368586862857481,
"kl": 0.0615234375,
"learning_rate": 3.8198695626733725e-06,
"loss": -0.0006,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 303.6875,
"epoch": 0.020372184133202742,
"grad_norm": 0.1463135701740714,
"kl": 0.05364990234375,
"learning_rate": 3.8070258606583156e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 268.671875,
"epoch": 0.0204841192108577,
"grad_norm": 24.196429667374314,
"kl": 0.20501708984375,
"learning_rate": 3.7941378079551544e-06,
"loss": 0.0021,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 350.625,
"epoch": 0.020596054288512664,
"grad_norm": 0.7575728537579188,
"kl": 0.067138671875,
"learning_rate": 3.7812059453207677e-06,
"loss": -0.0088,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 381.71875,
"epoch": 0.020707989366167624,
"grad_norm": 1.5781775679619348,
"kl": 0.129150390625,
"learning_rate": 3.768230815350213e-06,
"loss": -0.0091,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 384.6875,
"epoch": 0.020819924443822583,
"grad_norm": 3.7258182746280184,
"kl": 0.70550537109375,
"learning_rate": 3.7552129624539557e-06,
"loss": 0.2283,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 449.1875,
"epoch": 0.020931859521477542,
"grad_norm": 23.600980161646945,
"kl": 2.8865966796875,
"learning_rate": 3.7421529328350316e-06,
"loss": 0.2557,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 352.265625,
"epoch": 0.0210437945991325,
"grad_norm": 36.37207086101098,
"kl": 1.830078125,
"learning_rate": 3.7290512744661274e-06,
"loss": 0.3201,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 396.90625,
"epoch": 0.021155729676787464,
"grad_norm": 20.154618516530974,
"kl": 0.8580322265625,
"learning_rate": 3.715908537066589e-06,
"loss": 0.236,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 371.265625,
"epoch": 0.021267664754442424,
"grad_norm": 1.0506630322756636,
"kl": 0.18212890625,
"learning_rate": 3.7027252720793538e-06,
"loss": -0.0025,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 290.59375,
"epoch": 0.021379599832097383,
"grad_norm": 0.10894834856423846,
"kl": 0.0850830078125,
"learning_rate": 3.689502032647817e-06,
"loss": 0.0009,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 406.421875,
"epoch": 0.021491534909752343,
"grad_norm": 0.070012885417872,
"kl": 0.0955810546875,
"learning_rate": 3.6762393735926245e-06,
"loss": 0.001,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 288.953125,
"epoch": 0.021603469987407305,
"grad_norm": 0.055813531691409256,
"kl": 0.068603515625,
"learning_rate": 3.6629378513883852e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 279.171875,
"epoch": 0.021715405065062265,
"grad_norm": 52.57320558278293,
"kl": 0.1983642578125,
"learning_rate": 3.6495980241403307e-06,
"loss": 0.3357,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 291.78125,
"epoch": 0.021827340142717224,
"grad_norm": 0.04396489484628073,
"kl": 0.0579833984375,
"learning_rate": 3.636220451560896e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 390.78125,
"epoch": 0.021939275220372183,
"grad_norm": 1.0092735457153308,
"kl": 0.08642578125,
"learning_rate": 3.622805694946235e-06,
"loss": 0.0163,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 307.9375,
"epoch": 0.022051210298027143,
"grad_norm": 0.11501833373142471,
"kl": 0.07763671875,
"learning_rate": 3.609354317152667e-06,
"loss": 0.0008,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 354.546875,
"epoch": 0.022163145375682106,
"grad_norm": 0.24707486499522355,
"kl": 0.0819091796875,
"learning_rate": 3.595866882573063e-06,
"loss": 0.0008,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 307.640625,
"epoch": 0.022275080453337065,
"grad_norm": 27.72743469845055,
"kl": 0.6578369140625,
"learning_rate": 3.5823439571131675e-06,
"loss": 0.0387,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 297.953125,
"epoch": 0.022387015530992024,
"grad_norm": 4.257281648957348,
"kl": 0.2021484375,
"learning_rate": 3.5687861081678477e-06,
"loss": 0.002,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 406.703125,
"epoch": 0.022498950608646984,
"grad_norm": 1.991901860771057,
"kl": 0.127197265625,
"learning_rate": 3.555193904597291e-06,
"loss": -0.0177,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 360.75,
"epoch": 0.022610885686301947,
"grad_norm": 22.923764220941024,
"kl": 0.1807861328125,
"learning_rate": 3.541567916703138e-06,
"loss": 0.1058,
"reward": 0.09687500260770321,
"reward_std": 0.008539125323295593,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 361.609375,
"epoch": 0.022722820763956906,
"grad_norm": 148.13942760303817,
"kl": 10.0704345703125,
"learning_rate": 3.5279087162045517e-06,
"loss": 0.3985,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 312.625,
"epoch": 0.022834755841611865,
"grad_norm": 5.396743744887109,
"kl": 0.3883056640625,
"learning_rate": 3.5142168762142265e-06,
"loss": -0.0111,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 409.84375,
"epoch": 0.022946690919266825,
"grad_norm": 2770.9102363815387,
"kl": 340.0640869140625,
"learning_rate": 3.500492971214347e-06,
"loss": 3.6234,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 254.1875,
"epoch": 0.023058625996921784,
"grad_norm": 11.015389916640636,
"kl": 1.8922119140625,
"learning_rate": 3.48673757703248e-06,
"loss": -0.0028,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 350.78125,
"epoch": 0.023170561074576747,
"grad_norm": 1.8419760739328712,
"kl": 0.1317138671875,
"learning_rate": 3.472951270817418e-06,
"loss": -0.0237,
"reward": 0.09062500298023224,
"reward_std": 0.024866947438567877,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.90625,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 384.078125,
"epoch": 0.023282496152231706,
"grad_norm": 1.681815519531453,
"kl": 0.16943359375,
"learning_rate": 3.4591346310149578e-06,
"loss": -0.0704,
"reward": 0.08906250260770321,
"reward_std": 0.025969466660171747,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.890625,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 265.984375,
"epoch": 0.023394431229886666,
"grad_norm": 4.149140370325476,
"kl": 0.343994140625,
"learning_rate": 3.445288237343632e-06,
"loss": 0.0316,
"reward": 0.08906250260770321,
"reward_std": 0.031116947531700134,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.890625,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 323.3125,
"epoch": 0.023506366307541625,
"grad_norm": 7.97862155393176,
"kl": 0.59326171875,
"learning_rate": 3.4314126707703895e-06,
"loss": -0.0088,
"reward": 0.0937500037252903,
"reward_std": 0.017078250646591187,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 410.84375,
"epoch": 0.023618301385196588,
"grad_norm": 37.00032182098118,
"kl": 3.810546875,
"learning_rate": 3.4175085134862128e-06,
"loss": 0.1349,
"reward": 0.08281250484287739,
"reward_std": 0.03833641391247511,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.828125,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 348.96875,
"epoch": 0.023730236462851547,
"grad_norm": 78.79622548097791,
"kl": 3.765380859375,
"learning_rate": 3.4035763488816953e-06,
"loss": 0.2076,
"reward": 0.09375000186264515,
"reward_std": 0.021039125509560108,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 321.921875,
"epoch": 0.023842171540506506,
"grad_norm": 10.816304901178587,
"kl": 3.8818359375,
"learning_rate": 3.3896167615225594e-06,
"loss": 0.1445,
"reward": 0.08906250260770321,
"reward_std": 0.025969465728849173,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.890625,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 375.6875,
"epoch": 0.023954106618161466,
"grad_norm": 13.522641035708572,
"kl": 4.187744140625,
"learning_rate": 3.375630337125133e-06,
"loss": 0.0886,
"reward": 0.09062500111758709,
"reward_std": 0.020155644044280052,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.90625,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 292.984375,
"epoch": 0.024066041695816425,
"grad_norm": 9.482318398692025,
"kl": 5.662109375,
"learning_rate": 3.361617662531772e-06,
"loss": 0.135,
"reward": 0.09062500298023224,
"reward_std": 0.0295782508328557,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.90625,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 330.234375,
"epoch": 0.024177976773471388,
"grad_norm": 9.298647150834725,
"kl": 1.287109375,
"learning_rate": 3.347579325686237e-06,
"loss": -0.0025,
"reward": 0.09218750335276127,
"reward_std": 0.01861694734543562,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.921875,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 321.375,
"epoch": 0.024289911851126347,
"grad_norm": 9.33505130284713,
"kl": 2.80908203125,
"learning_rate": 3.333515915609027e-06,
"loss": 0.0542,
"reward": 0.09218750335276127,
"reward_std": 0.01861694734543562,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.921875,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 364.453125,
"epoch": 0.024401846928781307,
"grad_norm": 46.939949673345936,
"kl": 7.6962890625,
"learning_rate": 3.3194280223726616e-06,
"loss": 0.1244,
"reward": 0.09375000186264515,
"reward_std": 0.021039125509560108,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 389.140625,
"epoch": 0.024513782006436266,
"grad_norm": 8.965046605455813,
"kl": 1.474365234375,
"learning_rate": 3.305316237076927e-06,
"loss": 0.0928,
"reward": 0.09531250223517418,
"reward_std": 0.01478912541642785,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.953125,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 380.921875,
"epoch": 0.024625717084091225,
"grad_norm": 30.26772960669773,
"kl": 3.83154296875,
"learning_rate": 3.291181151824071e-06,
"loss": 0.0081,
"reward": 0.09375000186264515,
"reward_std": 0.011180340312421322,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 431.640625,
"epoch": 0.024737652161746188,
"grad_norm": 9.50042126895199,
"kl": 0.977294921875,
"learning_rate": 3.27702335969396e-06,
"loss": -0.0179,
"reward": 0.09531250223517418,
"reward_std": 0.01478912541642785,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.953125,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 363.90625,
"epoch": 0.024849587239401148,
"grad_norm": 8.200151360167519,
"kl": 0.697509765625,
"learning_rate": 3.2628434547191985e-06,
"loss": -0.037,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 305.859375,
"epoch": 0.024961522317056107,
"grad_norm": 18.01832349548423,
"kl": 0.634033203125,
"learning_rate": 3.2486420318601973e-06,
"loss": -0.0207,
"reward": 0.09218750335276127,
"reward_std": 0.01861694734543562,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.921875,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 368.5,
"epoch": 0.025073457394711066,
"grad_norm": 99.84487944919411,
"kl": 4.774169921875,
"learning_rate": 3.2344196869802187e-06,
"loss": -0.0292,
"reward": 0.09218750149011612,
"reward_std": 0.02257782220840454,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.921875,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 553.921875,
"epoch": 0.02518539247236603,
"grad_norm": 4.540807623552518,
"kl": 0.2640380859375,
"learning_rate": 3.2201770168203694e-06,
"loss": 0.0409,
"reward": 0.09531250037252903,
"reward_std": 0.018750000279396772,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.953125,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 470.421875,
"epoch": 0.02529732755002099,
"grad_norm": 1.9006028901018341,
"kl": 0.2882080078125,
"learning_rate": 3.205914618974563e-06,
"loss": -0.0101,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 359.5,
"epoch": 0.025409262627675948,
"grad_norm": 8.497076561424455,
"kl": 1.46826171875,
"learning_rate": 3.1916330918644496e-06,
"loss": 0.0076,
"reward": 0.0937500037252903,
"reward_std": 0.017078250646591187,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 398.203125,
"epoch": 0.025521197705330907,
"grad_norm": 6.49286525909613,
"kl": 0.5,
"learning_rate": 3.177333034714303e-06,
"loss": 0.0436,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 340.09375,
"epoch": 0.025633132782985867,
"grad_norm": 99.57124601550895,
"kl": 13.4130859375,
"learning_rate": 3.1630150475258813e-06,
"loss": 0.1554,
"reward": 0.09531250223517418,
"reward_std": 0.01478912541642785,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.953125,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 352.78125,
"epoch": 0.02574506786064083,
"grad_norm": 49.128639521298574,
"kl": 7.0673828125,
"learning_rate": 3.148679731053252e-06,
"loss": 0.0762,
"reward": 0.09218750335276127,
"reward_std": 0.023328250739723444,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.921875,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 351.09375,
"epoch": 0.02585700293829579,
"grad_norm": 367.82367126923293,
"kl": 26.0625,
"learning_rate": 3.1343276867775805e-06,
"loss": 0.19,
"reward": 0.09531250037252903,
"reward_std": 0.018750000279396772,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.953125,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 366.828125,
"epoch": 0.025968938015950748,
"grad_norm": 2760.034913740409,
"kl": 32.2548828125,
"learning_rate": 3.1199595168819043e-06,
"loss": 0.4045,
"reward": 0.09531250223517418,
"reward_std": 0.01478912541642785,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.953125,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 421.625,
"epoch": 0.026080873093605707,
"grad_norm": 24.759326550765813,
"kl": 2.56591796875,
"learning_rate": 3.105575824225852e-06,
"loss": 0.1236,
"reward": 0.09531250037252903,
"reward_std": 0.018750000279396772,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.953125,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 352.453125,
"epoch": 0.02619280817126067,
"grad_norm": 11.271203444155871,
"kl": 0.311767578125,
"learning_rate": 3.091177212320363e-06,
"loss": 0.0506,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 294.984375,
"epoch": 0.02630474324891563,
"grad_norm": 60.98662683249761,
"kl": 6.624267578125,
"learning_rate": 3.0767642853023538e-06,
"loss": 0.0659,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 387.59375,
"epoch": 0.02641667832657059,
"grad_norm": 8.237085005507922,
"kl": 0.9478759765625,
"learning_rate": 3.062337647909376e-06,
"loss": -0.0391,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 260.09375,
"epoch": 0.02652861340422555,
"grad_norm": 18.57477238550014,
"kl": 2.2236328125,
"learning_rate": 3.04789790545424e-06,
"loss": 0.0015,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 322.015625,
"epoch": 0.026640548481880508,
"grad_norm": 16.627244952129402,
"kl": 0.564453125,
"learning_rate": 3.033445663799621e-06,
"loss": 0.0793,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 315.96875,
"epoch": 0.02675248355953547,
"grad_norm": 54.633519018974916,
"kl": 4.1083984375,
"learning_rate": 3.018981529332633e-06,
"loss": 0.1508,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 380.375,
"epoch": 0.02686441863719043,
"grad_norm": 8.038755997888481,
"kl": 3.0703125,
"learning_rate": 3.00450610893939e-06,
"loss": 0.0033,
"reward": 0.09531250223517418,
"reward_std": 0.01478912541642785,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.953125,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 386.46875,
"epoch": 0.02697635371484539,
"grad_norm": 12.489323661336277,
"kl": 1.4534912109375,
"learning_rate": 2.9900200099795396e-06,
"loss": 0.0417,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 345.390625,
"epoch": 0.02708828879250035,
"grad_norm": 5.250434377988738,
"kl": 1.822265625,
"learning_rate": 2.9755238402607826e-06,
"loss": -0.0469,
"reward": 0.0937500037252903,
"reward_std": 0.017078250646591187,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 369.125,
"epoch": 0.02720022387015531,
"grad_norm": 57.97881368696394,
"kl": 5.73095703125,
"learning_rate": 2.961018208013367e-06,
"loss": 0.2336,
"reward": 0.09531250223517418,
"reward_std": 0.01478912541642785,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.953125,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 264.375,
"epoch": 0.02731215894781027,
"grad_norm": 49.75089750933376,
"kl": 4.38427734375,
"learning_rate": 2.9465037218645694e-06,
"loss": 0.0314,
"reward": 0.09375000186264515,
"reward_std": 0.021039125509560108,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 322.0625,
"epoch": 0.02742409402546523,
"grad_norm": 1.707430633827994,
"kl": 0.204833984375,
"learning_rate": 2.9319809908131604e-06,
"loss": 0.002,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 376.109375,
"epoch": 0.02753602910312019,
"grad_norm": 13.47781256474575,
"kl": 0.590576171875,
"learning_rate": 2.917450624203847e-06,
"loss": 0.0719,
"reward": 0.09531250223517418,
"reward_std": 0.01478912541642785,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.953125,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 376.0625,
"epoch": 0.02764796418077515,
"grad_norm": 1680.1301953567038,
"kl": 79.1229248046875,
"learning_rate": 2.9029132317017118e-06,
"loss": 0.7462,
"reward": 0.09687500260770321,
"reward_std": 0.008539125323295593,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 379.234375,
"epoch": 0.02775989925843011,
"grad_norm": 0.6798016263852162,
"kl": 0.1409912109375,
"learning_rate": 2.888369423266629e-06,
"loss": 0.0014,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 352.953125,
"epoch": 0.02787183433608507,
"grad_norm": 5.926518387479333,
"kl": 0.63525390625,
"learning_rate": 2.8738198091276712e-06,
"loss": -0.0057,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 400.359375,
"epoch": 0.02798376941374003,
"grad_norm": 73.88428710060259,
"kl": 6.923828125,
"learning_rate": 2.859264999757509e-06,
"loss": 0.2651,
"reward": 0.09531250223517418,
"reward_std": 0.01478912541642785,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.953125,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 413.0625,
"epoch": 0.02809570449139499,
"grad_norm": 19.57610821275727,
"kl": 1.18603515625,
"learning_rate": 2.8447056058467928e-06,
"loss": -0.0419,
"reward": 0.09531250223517418,
"reward_std": 0.01478912541642785,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.953125,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 357.59375,
"epoch": 0.028207639569049953,
"grad_norm": 15.02657989678683,
"kl": 0.35986328125,
"learning_rate": 2.830142238278531e-06,
"loss": 0.0012,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 381.203125,
"epoch": 0.028319574646704912,
"grad_norm": 2.875721583070933,
"kl": 0.475341796875,
"learning_rate": 2.81557550810246e-06,
"loss": -0.0231,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 371.046875,
"epoch": 0.02843150972435987,
"grad_norm": 0.6092524986743804,
"kl": 0.1376953125,
"learning_rate": 2.8010060265094026e-06,
"loss": 0.0014,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 375.78125,
"epoch": 0.02854344480201483,
"grad_norm": 1.3831215185283958,
"kl": 0.130615234375,
"learning_rate": 2.786434404805629e-06,
"loss": -0.032,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 394.375,
"epoch": 0.02865537987966979,
"grad_norm": 0.29391488966565826,
"kl": 0.1136474609375,
"learning_rate": 2.771861254387199e-06,
"loss": 0.0011,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 418.015625,
"epoch": 0.028767314957324753,
"grad_norm": 8.597539761339826,
"kl": 0.3472900390625,
"learning_rate": 2.7572871867143204e-06,
"loss": 0.0113,
"reward": 0.09687500260770321,
"reward_std": 0.008539125323295593,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 425.578125,
"epoch": 0.028879250034979712,
"grad_norm": 0.5239580359655661,
"kl": 0.1436767578125,
"learning_rate": 2.742712813285681e-06,
"loss": 0.0014,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 341.625,
"epoch": 0.02899118511263467,
"grad_norm": 1.9555271298208972,
"kl": 0.2032470703125,
"learning_rate": 2.7281387456128017e-06,
"loss": 0.002,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 368.65625,
"epoch": 0.02910312019028963,
"grad_norm": 1.252836717471123,
"kl": 0.2208251953125,
"learning_rate": 2.7135655951943716e-06,
"loss": 0.0022,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 375.59375,
"epoch": 0.029215055267944594,
"grad_norm": 0.8719765743586952,
"kl": 0.1676025390625,
"learning_rate": 2.698993973490598e-06,
"loss": 0.0017,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 333.0625,
"epoch": 0.029326990345599553,
"grad_norm": 0.13713063096829928,
"kl": 0.10205078125,
"learning_rate": 2.6844244918975416e-06,
"loss": 0.001,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 320.5,
"epoch": 0.029438925423254513,
"grad_norm": 1.2802535059679794,
"kl": 0.09423828125,
"learning_rate": 2.66985776172147e-06,
"loss": 0.0411,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 434.359375,
"epoch": 0.029550860500909472,
"grad_norm": 3.6751799791380084,
"kl": 0.1397705078125,
"learning_rate": 2.6552943941532088e-06,
"loss": -0.0156,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 352.90625,
"epoch": 0.02966279557856443,
"grad_norm": 0.07806269511080617,
"kl": 0.0855712890625,
"learning_rate": 2.6407350002424927e-06,
"loss": 0.0009,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 298.84375,
"epoch": 0.029774730656219394,
"grad_norm": 0.06731730411902957,
"kl": 0.087646484375,
"learning_rate": 2.626180190872329e-06,
"loss": 0.0009,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 295.734375,
"epoch": 0.029886665733874353,
"grad_norm": 0.13087790730798726,
"kl": 0.0902099609375,
"learning_rate": 2.611630576733372e-06,
"loss": 0.0009,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 273.078125,
"epoch": 0.029998600811529313,
"grad_norm": 5.554675473835038,
"kl": 0.14794921875,
"learning_rate": 2.5970867682982885e-06,
"loss": -0.051,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 344.1875,
"epoch": 0.030110535889184272,
"grad_norm": 0.14583524682041712,
"kl": 0.0927734375,
"learning_rate": 2.582549375796154e-06,
"loss": 0.0009,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 290.578125,
"epoch": 0.030222470966839235,
"grad_norm": 0.058225539376494065,
"kl": 0.077880859375,
"learning_rate": 2.568019009186841e-06,
"loss": 0.0008,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 361.6875,
"epoch": 0.030334406044494194,
"grad_norm": 0.04344087707852648,
"kl": 0.0789794921875,
"learning_rate": 2.5534962781354317e-06,
"loss": 0.0008,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 305.953125,
"epoch": 0.030446341122149154,
"grad_norm": 0.07717630877616868,
"kl": 0.07293701171875,
"learning_rate": 2.538981791986634e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 419.171875,
"epoch": 0.030558276199804113,
"grad_norm": 0.22582461693188216,
"kl": 0.0968017578125,
"learning_rate": 2.524476159739218e-06,
"loss": 0.001,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 299.953125,
"epoch": 0.030670211277459072,
"grad_norm": 27.999475574336614,
"kl": 0.5250244140625,
"learning_rate": 2.5099799900204607e-06,
"loss": 0.0169,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 357.21875,
"epoch": 0.030782146355114035,
"grad_norm": 0.034837371039734215,
"kl": 0.07232666015625,
"learning_rate": 2.4954938910606108e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 325.265625,
"epoch": 0.030894081432768995,
"grad_norm": 18.866980558523725,
"kl": 1.3262939453125,
"learning_rate": 2.481018470667368e-06,
"loss": 0.0199,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 375.71875,
"epoch": 0.031006016510423954,
"grad_norm": 0.08715091318862284,
"kl": 0.0843505859375,
"learning_rate": 2.4665543362003802e-06,
"loss": 0.0008,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 383.90625,
"epoch": 0.031117951588078913,
"grad_norm": 18.97786056185319,
"kl": 0.388671875,
"learning_rate": 2.4521020945457615e-06,
"loss": 0.0333,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 292.890625,
"epoch": 0.031229886665733873,
"grad_norm": 0.3777705304986896,
"kl": 0.09979248046875,
"learning_rate": 2.4376623520906255e-06,
"loss": 0.001,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 306.59375,
"epoch": 0.031341821743388835,
"grad_norm": 23.98251858869545,
"kl": 5.38348388671875,
"learning_rate": 2.4232357146976478e-06,
"loss": 0.0275,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 403.40625,
"epoch": 0.031453756821043795,
"grad_norm": 0.09304733402356336,
"kl": 0.0772705078125,
"learning_rate": 2.408822787679637e-06,
"loss": 0.0008,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 354.25,
"epoch": 0.031565691898698754,
"grad_norm": 0.14440303776607324,
"kl": 0.0906982421875,
"learning_rate": 2.3944241757741475e-06,
"loss": 0.0009,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 323.765625,
"epoch": 0.031677626976353714,
"grad_norm": 2.359216668730623,
"kl": 0.90911865234375,
"learning_rate": 2.380040483118097e-06,
"loss": -0.0461,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 269.328125,
"epoch": 0.03178956205400867,
"grad_norm": 0.045061338534023075,
"kl": 0.05657958984375,
"learning_rate": 2.365672313222419e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 281.828125,
"epoch": 0.03190149713166363,
"grad_norm": 0.21772012539025168,
"kl": 0.0853271484375,
"learning_rate": 2.351320268946749e-06,
"loss": 0.0009,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 314.703125,
"epoch": 0.0320134322093186,
"grad_norm": 0.6734360714201397,
"kl": 0.1312255859375,
"learning_rate": 2.336984952474119e-06,
"loss": 0.0013,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 415.15625,
"epoch": 0.03212536728697356,
"grad_norm": 4.987286255355766,
"kl": 0.12139892578125,
"learning_rate": 2.322666965285697e-06,
"loss": -0.0532,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 400.9375,
"epoch": 0.03223730236462852,
"grad_norm": 0.6096977245482182,
"kl": 0.09234619140625,
"learning_rate": 2.3083669081355507e-06,
"loss": -0.0601,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 502.640625,
"epoch": 0.03234923744228348,
"grad_norm": 0.04237782613357625,
"kl": 0.06573486328125,
"learning_rate": 2.2940853810254377e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 324.53125,
"epoch": 0.032461172519938436,
"grad_norm": 0.029403412259251353,
"kl": 0.06024169921875,
"learning_rate": 2.2798229831796313e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 413.109375,
"epoch": 0.032573107597593395,
"grad_norm": 0.08363337386352669,
"kl": 0.0560302734375,
"learning_rate": 2.2655803130197816e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 318.734375,
"epoch": 0.032685042675248355,
"grad_norm": 0.023053884997018263,
"kl": 0.06512451171875,
"learning_rate": 2.2513579681398034e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 460.546875,
"epoch": 0.032796977752903314,
"grad_norm": 1.460556497688483,
"kl": 0.15765380859375,
"learning_rate": 2.237156545280803e-06,
"loss": -0.026,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 282.609375,
"epoch": 0.03290891283055827,
"grad_norm": 7.935071492001548,
"kl": 1.8072509765625,
"learning_rate": 2.2229766403060403e-06,
"loss": -0.0278,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 385.125,
"epoch": 0.03302084790821324,
"grad_norm": 0.01681019430958899,
"kl": 0.0477294921875,
"learning_rate": 2.2088188481759305e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 414.484375,
"epoch": 0.0331327829858682,
"grad_norm": 0.01847832809783982,
"kl": 0.04632568359375,
"learning_rate": 2.194683762923073e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 443.78125,
"epoch": 0.03324471806352316,
"grad_norm": 0.02629542045427332,
"kl": 0.0531005859375,
"learning_rate": 2.1805719776273387e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 371.09375,
"epoch": 0.03335665314117812,
"grad_norm": 0.039099364449013255,
"kl": 0.0689697265625,
"learning_rate": 2.166484084390974e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 348.171875,
"epoch": 0.03346858821883308,
"grad_norm": 0.054692981575624876,
"kl": 0.067138671875,
"learning_rate": 2.1524206743137636e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 453.71875,
"epoch": 0.033580523296488037,
"grad_norm": 0.020024754177952568,
"kl": 0.04815673828125,
"learning_rate": 2.1383823374682287e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 397.609375,
"epoch": 0.033692458374142996,
"grad_norm": 0.7962215451972806,
"kl": 0.0548095703125,
"learning_rate": 2.124369662874868e-06,
"loss": 0.0015,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 439.640625,
"epoch": 0.033804393451797955,
"grad_norm": 0.03594032335785641,
"kl": 0.04754638671875,
"learning_rate": 2.110383238477441e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 382.828125,
"epoch": 0.033916328529452915,
"grad_norm": 0.02676423646625272,
"kl": 0.0552978515625,
"learning_rate": 2.096423651118305e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 364.1875,
"epoch": 0.03402826360710788,
"grad_norm": 0.021001193681821163,
"kl": 0.04876708984375,
"learning_rate": 2.082491486513788e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 353.9375,
"epoch": 0.03414019868476284,
"grad_norm": 0.015515949438366839,
"kl": 0.0455322265625,
"learning_rate": 2.0685873292296116e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 502.484375,
"epoch": 0.0342521337624178,
"grad_norm": 0.10070236191895302,
"kl": 0.042633056640625,
"learning_rate": 2.054711762656369e-06,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 423.890625,
"epoch": 0.03436406884007276,
"grad_norm": 8.449530301304263,
"kl": 0.17718505859375,
"learning_rate": 2.040865368985044e-06,
"loss": 0.0072,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 365.953125,
"epoch": 0.03447600391772772,
"grad_norm": 4.554366640679164,
"kl": 0.10430908203125,
"learning_rate": 2.027048729182583e-06,
"loss": 0.0096,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 400.78125,
"epoch": 0.03458793899538268,
"grad_norm": 1.4037888132598981,
"kl": 0.065673828125,
"learning_rate": 2.0132624229675205e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 356.390625,
"epoch": 0.03469987407303764,
"grad_norm": 0.08573893743799793,
"kl": 0.04962158203125,
"learning_rate": 1.9995070287856546e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 412.375,
"epoch": 0.034811809150692596,
"grad_norm": 14.764228813015494,
"kl": 0.43438720703125,
"learning_rate": 1.985783123785774e-06,
"loss": -0.03,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 279.03125,
"epoch": 0.034923744228347556,
"grad_norm": 0.2193000836614737,
"kl": 0.067138671875,
"learning_rate": 1.9720912837954486e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 385.421875,
"epoch": 0.035035679306002515,
"grad_norm": 0.1633478285227259,
"kl": 0.05523681640625,
"learning_rate": 1.958432083296862e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 388.4375,
"epoch": 0.03514761438365748,
"grad_norm": 0.145955357953582,
"kl": 0.051025390625,
"learning_rate": 1.9448060954027093e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 378.703125,
"epoch": 0.03525954946131244,
"grad_norm": 7.683536847345325,
"kl": 0.1790771484375,
"learning_rate": 1.931213891832153e-06,
"loss": -0.0279,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 410.09375,
"epoch": 0.0353714845389674,
"grad_norm": 0.19129137636650337,
"kl": 0.05731201171875,
"learning_rate": 1.9176560428868336e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 390.3125,
"epoch": 0.03548341961662236,
"grad_norm": 6.515657611937849,
"kl": 1.45611572265625,
"learning_rate": 1.9041331174269373e-06,
"loss": -0.0073,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 454.234375,
"epoch": 0.03559535469427732,
"grad_norm": 0.2025260783774205,
"kl": 0.05340576171875,
"learning_rate": 1.8906456828473341e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 407.5625,
"epoch": 0.03570728977193228,
"grad_norm": 4.576351957328261,
"kl": 1.09912109375,
"learning_rate": 1.8771943050537656e-06,
"loss": -0.0441,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 414.34375,
"epoch": 0.03581922484958724,
"grad_norm": 0.04886630482006485,
"kl": 0.0460205078125,
"learning_rate": 1.8637795484391046e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 456.3125,
"epoch": 0.0359311599272422,
"grad_norm": 0.21065449469813613,
"kl": 0.0640869140625,
"learning_rate": 1.8504019758596698e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 353.09375,
"epoch": 0.036043095004897156,
"grad_norm": 4.883286184793087,
"kl": 1.3126220703125,
"learning_rate": 1.8370621486116163e-06,
"loss": -0.0417,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 396.03125,
"epoch": 0.03615503008255212,
"grad_norm": 2.4312943229919903,
"kl": 0.7298583984375,
"learning_rate": 1.823760626407377e-06,
"loss": -0.0482,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 366.21875,
"epoch": 0.03626696516020708,
"grad_norm": 0.023794241531636316,
"kl": 0.0516357421875,
"learning_rate": 1.8104979673521838e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 338.34375,
"epoch": 0.03637890023786204,
"grad_norm": 0.06501484692075148,
"kl": 0.0458984375,
"learning_rate": 1.7972747279206482e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 392.140625,
"epoch": 0.036490835315517,
"grad_norm": 0.0554933350153533,
"kl": 0.045013427734375,
"learning_rate": 1.7840914629334122e-06,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 361.5,
"epoch": 0.03660277039317196,
"grad_norm": 0.034907067888093626,
"kl": 0.05291748046875,
"learning_rate": 1.7709487255338731e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 375.453125,
"epoch": 0.03671470547082692,
"grad_norm": 0.04733189685853122,
"kl": 0.0457763671875,
"learning_rate": 1.7578470671649684e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 337.875,
"epoch": 0.03682664054848188,
"grad_norm": 8.191477588254072,
"kl": 0.31414794921875,
"learning_rate": 1.744787037546045e-06,
"loss": -0.0328,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 380.625,
"epoch": 0.03693857562613684,
"grad_norm": 4.485682653488296,
"kl": 0.06427001953125,
"learning_rate": 1.731769184649788e-06,
"loss": -0.0303,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 421.796875,
"epoch": 0.0370505107037918,
"grad_norm": 0.15001206134528802,
"kl": 0.054931640625,
"learning_rate": 1.7187940546792325e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 431.671875,
"epoch": 0.037162445781446764,
"grad_norm": 0.05596585517144101,
"kl": 0.04620361328125,
"learning_rate": 1.7058621920448465e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 374.171875,
"epoch": 0.03727438085910172,
"grad_norm": 0.21063855683262964,
"kl": 0.05145263671875,
"learning_rate": 1.6929741393416855e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 401.03125,
"epoch": 0.03738631593675668,
"grad_norm": 14.703670642314776,
"kl": 0.6478271484375,
"learning_rate": 1.6801304373266286e-06,
"loss": -0.0265,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 405.28125,
"epoch": 0.03749825101441164,
"grad_norm": 3.2693646713839533,
"kl": 0.2021484375,
"learning_rate": 1.667331624895689e-06,
"loss": -0.0538,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 464.15625,
"epoch": 0.0376101860920666,
"grad_norm": 0.0326789044313002,
"kl": 0.06781005859375,
"learning_rate": 1.6545782390614037e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 356.953125,
"epoch": 0.03772212116972156,
"grad_norm": 3.381374096168454,
"kl": 0.24603271484375,
"learning_rate": 1.6418708149302992e-06,
"loss": -0.0531,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 350.0625,
"epoch": 0.03783405624737652,
"grad_norm": 4.256284487475119,
"kl": 0.145263671875,
"learning_rate": 1.6292098856804423e-06,
"loss": -0.0518,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 244.484375,
"epoch": 0.03794599132503148,
"grad_norm": 4.758319146409537,
"kl": 0.52178955078125,
"learning_rate": 1.6165959825390661e-06,
"loss": -0.0436,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 352.828125,
"epoch": 0.03805792640268644,
"grad_norm": 0.06125517244847344,
"kl": 0.0491943359375,
"learning_rate": 1.604029634760284e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 363.828125,
"epoch": 0.038169861480341405,
"grad_norm": 0.680605704729485,
"kl": 0.10308837890625,
"learning_rate": 1.59151136960288e-06,
"loss": 0.001,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 393.265625,
"epoch": 0.038281796557996364,
"grad_norm": 0.06326323223836276,
"kl": 0.04736328125,
"learning_rate": 1.5790417123081903e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 381.671875,
"epoch": 0.038393731635651324,
"grad_norm": 0.049854259040433335,
"kl": 0.048828125,
"learning_rate": 1.5666211860780583e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 403.125,
"epoch": 0.03850566671330628,
"grad_norm": 0.07874363283266796,
"kl": 0.05059814453125,
"learning_rate": 1.5542503120528918e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 413.3125,
"epoch": 0.03861760179096124,
"grad_norm": 0.12008769303035174,
"kl": 0.0606689453125,
"learning_rate": 1.5419296092897866e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 445.28125,
"epoch": 0.0387295368686162,
"grad_norm": 0.10834525189023946,
"kl": 0.0592041015625,
"learning_rate": 1.529659594740755e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 378.25,
"epoch": 0.03884147194627116,
"grad_norm": 0.05706678172063102,
"kl": 0.040863037109375,
"learning_rate": 1.5174407832310338e-06,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 316.15625,
"epoch": 0.03895340702392612,
"grad_norm": 0.2260930410291458,
"kl": 0.0599365234375,
"learning_rate": 1.5052736874374815e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 412.671875,
"epoch": 0.03906534210158108,
"grad_norm": 0.03574155761474703,
"kl": 0.0439453125,
"learning_rate": 1.4931588178670695e-06,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 325.40625,
"epoch": 0.039177277179236046,
"grad_norm": 0.06765325310501516,
"kl": 0.05303955078125,
"learning_rate": 1.4810966828354605e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 382.625,
"epoch": 0.039289212256891005,
"grad_norm": 0.45741242496960755,
"kl": 0.063720703125,
"learning_rate": 1.469087788445684e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 415.0625,
"epoch": 0.039401147334545965,
"grad_norm": 0.059362152677566817,
"kl": 0.044921875,
"learning_rate": 1.4571326385668965e-06,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 409.265625,
"epoch": 0.039513082412200924,
"grad_norm": 0.03486583265069237,
"kl": 0.04302978515625,
"learning_rate": 1.4452317348132434e-06,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 549.171875,
"epoch": 0.039625017489855884,
"grad_norm": 0.044710017702799705,
"kl": 0.04583740234375,
"learning_rate": 1.4333855765228104e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 374.359375,
"epoch": 0.03973695256751084,
"grad_norm": 0.0864511831454649,
"kl": 0.04925537109375,
"learning_rate": 1.421594660736675e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 361.8125,
"epoch": 0.0398488876451658,
"grad_norm": 0.03699683675596586,
"kl": 0.04742431640625,
"learning_rate": 1.4098594821780476e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 362.8125,
"epoch": 0.03996082272282076,
"grad_norm": 0.07941048515036817,
"kl": 0.05633544921875,
"learning_rate": 1.3981805332315174e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 293.390625,
"epoch": 0.04007275780047572,
"grad_norm": 12.642109265209918,
"kl": 1.81060791015625,
"learning_rate": 1.3865583039223929e-06,
"loss": -0.0185,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 357.390625,
"epoch": 0.04018469287813069,
"grad_norm": 0.08034212517091562,
"kl": 0.0445556640625,
"learning_rate": 1.374993281896137e-06,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 367.171875,
"epoch": 0.04029662795578565,
"grad_norm": 0.09767901214575864,
"kl": 0.04425048828125,
"learning_rate": 1.3634859523979134e-06,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 392.578125,
"epoch": 0.040408563033440606,
"grad_norm": 0.07629861701658103,
"kl": 0.048583984375,
"learning_rate": 1.3520367982522208e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 359.375,
"epoch": 0.040520498111095565,
"grad_norm": 2.436909257977141,
"kl": 0.3397216796875,
"learning_rate": 1.3406462998426358e-06,
"loss": -0.0575,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 345.640625,
"epoch": 0.040632433188750525,
"grad_norm": 1.7488785493981298,
"kl": 0.41937255859375,
"learning_rate": 1.3293149350916595e-06,
"loss": -0.0315,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 340.265625,
"epoch": 0.040744368266405484,
"grad_norm": 0.042176241580247326,
"kl": 0.03741455078125,
"learning_rate": 1.3180431794406623e-06,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 438.40625,
"epoch": 0.04085630334406044,
"grad_norm": 0.06954183102024396,
"kl": 0.0513916015625,
"learning_rate": 1.3068315058299358e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 384.453125,
"epoch": 0.0409682384217154,
"grad_norm": 0.0585988602116313,
"kl": 0.03924560546875,
"learning_rate": 1.2956803846788503e-06,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 369.34375,
"epoch": 0.04108017349937036,
"grad_norm": 0.029227802328528226,
"kl": 0.05963134765625,
"learning_rate": 1.284590283866116e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 468.015625,
"epoch": 0.04119210857702533,
"grad_norm": 0.024030066071591895,
"kl": 0.039306640625,
"learning_rate": 1.2735616687101518e-06,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 345.578125,
"epoch": 0.04130404365468029,
"grad_norm": 0.022956778080715768,
"kl": 0.04571533203125,
"learning_rate": 1.2625950019495614e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 419.140625,
"epoch": 0.04141597873233525,
"grad_norm": 0.03031369908455225,
"kl": 0.04296875,
"learning_rate": 1.251690743723718e-06,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 327.125,
"epoch": 0.041527913809990206,
"grad_norm": 1.4619752022004908,
"kl": 0.42498779296875,
"learning_rate": 1.2408493515534581e-06,
"loss": -0.0518,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 403.390625,
"epoch": 0.041639848887645166,
"grad_norm": 0.06787211806577707,
"kl": 0.04547119140625,
"learning_rate": 1.2300712803218834e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 384.984375,
"epoch": 0.041751783965300125,
"grad_norm": 0.844949833560224,
"kl": 0.04034423828125,
"learning_rate": 1.2193569822552772e-06,
"loss": 0.0324,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 469.625,
"epoch": 0.041863719042955085,
"grad_norm": 0.5718530387307899,
"kl": 0.034820556640625,
"learning_rate": 1.2087069069041268e-06,
"loss": 0.014,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 408.0625,
"epoch": 0.041975654120610044,
"grad_norm": 0.03780725899196792,
"kl": 0.0430908203125,
"learning_rate": 1.1981215011242654e-06,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 394.140625,
"epoch": 0.042087589198265,
"grad_norm": 0.022995562401170407,
"kl": 0.05279541015625,
"learning_rate": 1.1876012090581184e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 411.75,
"epoch": 0.04219952427591997,
"grad_norm": 0.10984965670222785,
"kl": 0.04351806640625,
"learning_rate": 1.177146472116071e-06,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 400.6875,
"epoch": 0.04231145935357493,
"grad_norm": 3.2788406827242613,
"kl": 0.182952880859375,
"learning_rate": 1.1667577289579462e-06,
"loss": -0.0157,
"reward": 0.09687500260770321,
"reward_std": 0.008539125323295593,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 379.484375,
"epoch": 0.04242339443122989,
"grad_norm": 0.04631375660525243,
"kl": 0.0482177734375,
"learning_rate": 1.1564354154746007e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 379
},
{
"clip_ratio": 0.0,
"completion_length": 361.921875,
"epoch": 0.04253532950888485,
"grad_norm": 0.5029931057300824,
"kl": 0.04144287109375,
"learning_rate": 1.146179964769635e-06,
"loss": -0.0096,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 408.53125,
"epoch": 0.04264726458653981,
"grad_norm": 2.062066313854755,
"kl": 0.10888671875,
"learning_rate": 1.1359918071412195e-06,
"loss": 0.0137,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 473.390625,
"epoch": 0.042759199664194766,
"grad_norm": 5.689014120759552,
"kl": 0.19818115234375,
"learning_rate": 1.1258713700640456e-06,
"loss": 0.002,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 378.40625,
"epoch": 0.042871134741849726,
"grad_norm": 5.456471510055751,
"kl": 0.1583251953125,
"learning_rate": 1.115819078171383e-06,
"loss": -0.0152,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 405.59375,
"epoch": 0.042983069819504685,
"grad_norm": 0.01860238339511713,
"kl": 0.04095458984375,
"learning_rate": 1.1058353532372667e-06,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 382.15625,
"epoch": 0.043095004897159644,
"grad_norm": 0.13949154395909993,
"kl": 0.0496826171875,
"learning_rate": 1.0959206141587998e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 315.453125,
"epoch": 0.04320693997481461,
"grad_norm": 1.107285367889642,
"kl": 0.3203125,
"learning_rate": 1.0860752769385766e-06,
"loss": -0.0542,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 304.203125,
"epoch": 0.04331887505246957,
"grad_norm": 14.993173199026563,
"kl": 1.521728515625,
"learning_rate": 1.0762997546672279e-06,
"loss": -0.0433,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 456.78125,
"epoch": 0.04343081013012453,
"grad_norm": 0.10799334625634623,
"kl": 0.041534423828125,
"learning_rate": 1.0665944575060914e-06,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 410.546875,
"epoch": 0.04354274520777949,
"grad_norm": 0.37423665656457383,
"kl": 0.061767578125,
"learning_rate": 1.056959792669997e-06,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 389
},
{
"clip_ratio": 0.0,
"completion_length": 373.703125,
"epoch": 0.04365468028543445,
"grad_norm": 0.19089367747123068,
"kl": 0.05108642578125,
"learning_rate": 1.0473961644101856e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 358.71875,
"epoch": 0.04376661536308941,
"grad_norm": 1.5778323232854403,
"kl": 0.1815185546875,
"learning_rate": 1.037903973997345e-06,
"loss": -0.0537,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 326.71875,
"epoch": 0.04387855044074437,
"grad_norm": 0.31904972301610846,
"kl": 0.052734375,
"learning_rate": 1.0284836197047737e-06,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 266.0625,
"epoch": 0.043990485518399326,
"grad_norm": 2.0851570445818868,
"kl": 0.2457275390625,
"learning_rate": 1.0191354967916712e-06,
"loss": 0.0287,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 415.109375,
"epoch": 0.044102420596054286,
"grad_norm": 0.3993597476384507,
"kl": 0.07476806640625,
"learning_rate": 1.0098599974865515e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 378.15625,
"epoch": 0.04421435567370925,
"grad_norm": 0.3569068198951939,
"kl": 0.06988525390625,
"learning_rate": 1.0006575109707898e-06,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 314.234375,
"epoch": 0.04432629075136421,
"grad_norm": 0.13286047664045245,
"kl": 0.04833984375,
"learning_rate": 9.915284233622877e-07,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 416.9375,
"epoch": 0.04443822582901917,
"grad_norm": 2.2497661432685447,
"kl": 0.05743408203125,
"learning_rate": 9.824731176992796e-07,
"loss": -0.002,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 365.140625,
"epoch": 0.04455016090667413,
"grad_norm": 3.30119576808313,
"kl": 1.076416015625,
"learning_rate": 9.734919739242543e-07,
"loss": -0.0157,
"reward": 0.09687500260770321,
"reward_std": 0.008539125323295593,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 277.28125,
"epoch": 0.04466209598432909,
"grad_norm": 0.08032954719670685,
"kl": 0.06231689453125,
"learning_rate": 9.645853688680177e-07,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 399
},
{
"clip_ratio": 0.0,
"completion_length": 379.8125,
"epoch": 0.04477403106198405,
"grad_norm": 0.30125779125635027,
"kl": 0.06890869140625,
"learning_rate": 9.557536762338786e-07,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 408.609375,
"epoch": 0.04488596613963901,
"grad_norm": 0.10410594418892839,
"kl": 0.04498291015625,
"learning_rate": 9.46997266581973e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 401
},
{
"clip_ratio": 0.0,
"completion_length": 388.671875,
"epoch": 0.04499790121729397,
"grad_norm": 0.21768463357305143,
"kl": 0.04937744140625,
"learning_rate": 9.383165073137115e-07,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 402
},
{
"clip_ratio": 0.0,
"completion_length": 340.828125,
"epoch": 0.04510983629494893,
"grad_norm": 0.24943422959107125,
"kl": 0.054443359375,
"learning_rate": 9.297117626563687e-07,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 403
},
{
"clip_ratio": 0.0,
"completion_length": 398.4375,
"epoch": 0.04522177137260389,
"grad_norm": 0.15214769860694116,
"kl": 0.05352783203125,
"learning_rate": 9.211833936477957e-07,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 404
},
{
"clip_ratio": 0.0,
"completion_length": 309.265625,
"epoch": 0.04533370645025885,
"grad_norm": 0.12737968723084875,
"kl": 0.04833984375,
"learning_rate": 9.127317581212753e-07,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 367.078125,
"epoch": 0.04544564152791381,
"grad_norm": 0.12491707622355717,
"kl": 0.0419921875,
"learning_rate": 9.043572106905084e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 406
},
{
"clip_ratio": 0.0,
"completion_length": 347.171875,
"epoch": 0.04555757660556877,
"grad_norm": 0.2782478324098304,
"kl": 0.04632568359375,
"learning_rate": 8.960601027347321e-07,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 407
},
{
"clip_ratio": 0.0,
"completion_length": 344.59375,
"epoch": 0.04566951168322373,
"grad_norm": 1.707651816399516,
"kl": 0.17364501953125,
"learning_rate": 8.878407823839788e-07,
"loss": 0.0017,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 408
},
{
"clip_ratio": 0.0,
"completion_length": 258.625,
"epoch": 0.04578144676087869,
"grad_norm": 0.18198728965219338,
"kl": 0.05511474609375,
"learning_rate": 8.796995945044689e-07,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 409
},
{
"clip_ratio": 0.0,
"completion_length": 329.171875,
"epoch": 0.04589338183853365,
"grad_norm": 3.0116035887758312,
"kl": 0.07513427734375,
"learning_rate": 8.716368806841405e-07,
"loss": 0.0028,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 370.46875,
"epoch": 0.04600531691618861,
"grad_norm": 0.036581584760347785,
"kl": 0.04638671875,
"learning_rate": 8.636529792183171e-07,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 411
},
{
"clip_ratio": 0.0,
"completion_length": 412.40625,
"epoch": 0.04611725199384357,
"grad_norm": 1.7675486622831695,
"kl": 0.08441162109375,
"learning_rate": 8.557482250955144e-07,
"loss": 0.0507,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 412
},
{
"clip_ratio": 0.0,
"completion_length": 376.046875,
"epoch": 0.046229187071498534,
"grad_norm": 0.8339879341516419,
"kl": 0.06793212890625,
"learning_rate": 8.479229499833844e-07,
"loss": -0.0386,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 413
},
{
"clip_ratio": 0.0,
"completion_length": 269.328125,
"epoch": 0.046341122149153494,
"grad_norm": 0.03758254954126514,
"kl": 0.046630859375,
"learning_rate": 8.401774822147976e-07,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 414
},
{
"clip_ratio": 0.0,
"completion_length": 426.90625,
"epoch": 0.04645305722680845,
"grad_norm": 0.03884055882244416,
"kl": 0.0457763671875,
"learning_rate": 8.325121467740695e-07,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 327.15625,
"epoch": 0.04656499230446341,
"grad_norm": 0.13058568836650014,
"kl": 0.06011962890625,
"learning_rate": 8.249272652833226e-07,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 416
},
{
"clip_ratio": 0.0,
"completion_length": 389.59375,
"epoch": 0.04667692738211837,
"grad_norm": 0.05421556382240225,
"kl": 0.0401611328125,
"learning_rate": 8.174231559889931e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 417
},
{
"clip_ratio": 0.0,
"completion_length": 266.25,
"epoch": 0.04678886245977333,
"grad_norm": 2.135742456611555,
"kl": 0.55340576171875,
"learning_rate": 8.100001337484787e-07,
"loss": -0.052,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 418
},
{
"clip_ratio": 0.0,
"completion_length": 301.5625,
"epoch": 0.04690079753742829,
"grad_norm": 27.79993671743234,
"kl": 2.71258544921875,
"learning_rate": 8.026585100169251e-07,
"loss": 0.1087,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 419
},
{
"clip_ratio": 0.0,
"completion_length": 381.640625,
"epoch": 0.04701273261508325,
"grad_norm": 0.3229458185249445,
"kl": 0.0728759765625,
"learning_rate": 7.953985928341601e-07,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 352.0625,
"epoch": 0.04712466769273821,
"grad_norm": 0.18899120451502113,
"kl": 0.04962158203125,
"learning_rate": 7.882206868117693e-07,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 421
},
{
"clip_ratio": 0.0,
"completion_length": 285.9375,
"epoch": 0.047236602770393175,
"grad_norm": 0.05378027086791007,
"kl": 0.0703125,
"learning_rate": 7.81125093120313e-07,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 422
},
{
"clip_ratio": 0.0,
"completion_length": 366.609375,
"epoch": 0.047348537848048135,
"grad_norm": 0.275422779661784,
"kl": 0.05328369140625,
"learning_rate": 7.741121094766916e-07,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 423
},
{
"clip_ratio": 0.0,
"completion_length": 386.015625,
"epoch": 0.047460472925703094,
"grad_norm": 0.02946053453290786,
"kl": 0.04119873046875,
"learning_rate": 7.671820301316532e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 424
},
{
"clip_ratio": 0.0,
"completion_length": 381.890625,
"epoch": 0.047572408003358053,
"grad_norm": 0.02258693420380951,
"kl": 0.0445556640625,
"learning_rate": 7.603351458574474e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 416.9375,
"epoch": 0.04768434308101301,
"grad_norm": 1.383085540888027,
"kl": 0.811279296875,
"learning_rate": 7.535717439356255e-07,
"loss": -0.0196,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 426
},
{
"clip_ratio": 0.0,
"completion_length": 368.796875,
"epoch": 0.04779627815866797,
"grad_norm": 3.7777508498427164,
"kl": 0.77886962890625,
"learning_rate": 7.46892108144986e-07,
"loss": -0.0481,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 427
},
{
"clip_ratio": 0.0,
"completion_length": 276.875,
"epoch": 0.04790821323632293,
"grad_norm": 0.10955470879553754,
"kl": 0.05078125,
"learning_rate": 7.402965187496697e-07,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 428
},
{
"clip_ratio": 0.0,
"completion_length": 380.671875,
"epoch": 0.04802014831397789,
"grad_norm": 0.04113851640884478,
"kl": 0.05523681640625,
"learning_rate": 7.337852524873974e-07,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 429
},
{
"clip_ratio": 0.0,
"completion_length": 420.03125,
"epoch": 0.04813208339163285,
"grad_norm": 0.018684245104689922,
"kl": 0.03826904296875,
"learning_rate": 7.273585825578608e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 345.5,
"epoch": 0.04824401846928781,
"grad_norm": 8.259759861418749,
"kl": 1.716339111328125,
"learning_rate": 7.21016778611259e-07,
"loss": -0.0347,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 431
},
{
"clip_ratio": 0.0,
"completion_length": 402.265625,
"epoch": 0.048355953546942776,
"grad_norm": 0.020343976644658462,
"kl": 0.033447265625,
"learning_rate": 7.147601067369835e-07,
"loss": 0.0003,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 432
},
{
"clip_ratio": 0.0,
"completion_length": 351.609375,
"epoch": 0.048467888624597735,
"grad_norm": 1.0066422924478642,
"kl": 0.75701904296875,
"learning_rate": 7.085888294524561e-07,
"loss": -0.0467,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 433
},
{
"clip_ratio": 0.0,
"completion_length": 415.34375,
"epoch": 0.048579823702252695,
"grad_norm": 0.026562111650317053,
"kl": 0.04150390625,
"learning_rate": 7.025032056921117e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 434
},
{
"clip_ratio": 0.0,
"completion_length": 308.921875,
"epoch": 0.048691758779907654,
"grad_norm": 0.03150175672968521,
"kl": 0.03955078125,
"learning_rate": 6.965034907965349e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 343.296875,
"epoch": 0.04880369385756261,
"grad_norm": 0.030310737599258293,
"kl": 0.04107666015625,
"learning_rate": 6.905899365017462e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 436
},
{
"clip_ratio": 0.0,
"completion_length": 319.328125,
"epoch": 0.04891562893521757,
"grad_norm": 1.8528337687242826,
"kl": 1.40911865234375,
"learning_rate": 6.847627909286409e-07,
"loss": -0.0344,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 437
},
{
"clip_ratio": 0.0,
"completion_length": 328.515625,
"epoch": 0.04902756401287253,
"grad_norm": 0.028586652468959688,
"kl": 0.04345703125,
"learning_rate": 6.790222985725761e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 438
},
{
"clip_ratio": 0.0,
"completion_length": 335.4375,
"epoch": 0.04913949909052749,
"grad_norm": 0.08766284200366595,
"kl": 0.0438232421875,
"learning_rate": 6.733687002931141e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 439
},
{
"clip_ratio": 0.0,
"completion_length": 305.109375,
"epoch": 0.04925143416818245,
"grad_norm": 2.391400552951245,
"kl": 0.09228515625,
"learning_rate": 6.678022333039158e-07,
"loss": -0.0495,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 322.40625,
"epoch": 0.04936336924583742,
"grad_norm": 0.02480090522832702,
"kl": 0.0423583984375,
"learning_rate": 6.623231311627876e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 441
},
{
"clip_ratio": 0.0,
"completion_length": 395.546875,
"epoch": 0.049475304323492376,
"grad_norm": 0.019725004180409796,
"kl": 0.0413818359375,
"learning_rate": 6.569316237618811e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 442
},
{
"clip_ratio": 0.0,
"completion_length": 359.765625,
"epoch": 0.049587239401147336,
"grad_norm": 0.04162342966221699,
"kl": 0.0498046875,
"learning_rate": 6.516279373180499e-07,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 443
},
{
"clip_ratio": 0.0,
"completion_length": 344.640625,
"epoch": 0.049699174478802295,
"grad_norm": 0.02391594067619232,
"kl": 0.0406494140625,
"learning_rate": 6.464122943633543e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 444
},
{
"clip_ratio": 0.0,
"completion_length": 369.609375,
"epoch": 0.049811109556457255,
"grad_norm": 0.026401154596888455,
"kl": 0.03631591796875,
"learning_rate": 6.412849137357271e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 378.40625,
"epoch": 0.049923044634112214,
"grad_norm": 0.02165150509733936,
"kl": 0.04083251953125,
"learning_rate": 6.3624601056979e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 446
},
{
"clip_ratio": 0.0,
"completion_length": 424.734375,
"epoch": 0.05003497971176717,
"grad_norm": 0.056253725649908846,
"kl": 0.04132080078125,
"learning_rate": 6.312957962878278e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 447
},
{
"clip_ratio": 0.0,
"completion_length": 332.90625,
"epoch": 0.05014691478942213,
"grad_norm": 0.026148724794812197,
"kl": 0.05267333984375,
"learning_rate": 6.264344785909181e-07,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 448
},
{
"clip_ratio": 0.0,
"completion_length": 310.484375,
"epoch": 0.05025884986707709,
"grad_norm": 0.1390653311408599,
"kl": 0.049560546875,
"learning_rate": 6.216622614502149e-07,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 449
},
{
"clip_ratio": 0.0,
"completion_length": 388.671875,
"epoch": 0.05037078494473206,
"grad_norm": 0.07042561536180422,
"kl": 0.048095703125,
"learning_rate": 6.169793450983916e-07,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 368.046875,
"epoch": 0.05048272002238702,
"grad_norm": 0.8824459143035142,
"kl": 0.163818359375,
"learning_rate": 6.123859260212393e-07,
"loss": 0.0016,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 451
},
{
"clip_ratio": 0.0,
"completion_length": 328.421875,
"epoch": 0.05059465510004198,
"grad_norm": 0.9135028043813113,
"kl": 0.13751220703125,
"learning_rate": 6.07882196949423e-07,
"loss": -0.0596,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 452
},
{
"clip_ratio": 0.0,
"completion_length": 396.40625,
"epoch": 0.050706590177696936,
"grad_norm": 0.02488219762931012,
"kl": 0.0374755859375,
"learning_rate": 6.034683468503948e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 453
},
{
"clip_ratio": 0.0,
"completion_length": 339.765625,
"epoch": 0.050818525255351896,
"grad_norm": 0.07775771681049337,
"kl": 0.040863037109375,
"learning_rate": 5.991445609204641e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 454
},
{
"clip_ratio": 0.0,
"completion_length": 323.953125,
"epoch": 0.050930460333006855,
"grad_norm": 0.23941770812751062,
"kl": 0.07257080078125,
"learning_rate": 5.949110205770292e-07,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 377.90625,
"epoch": 0.051042395410661814,
"grad_norm": 0.838372427480566,
"kl": 0.10174560546875,
"learning_rate": 5.90767903450964e-07,
"loss": -0.0595,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 456
},
{
"clip_ratio": 0.0,
"completion_length": 354.3125,
"epoch": 0.051154330488316774,
"grad_norm": 0.02696775344751943,
"kl": 0.0364990234375,
"learning_rate": 5.867153833791652e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 457
},
{
"clip_ratio": 0.0,
"completion_length": 293.890625,
"epoch": 0.05126626556597173,
"grad_norm": 2.593781001852504,
"kl": 0.27880859375,
"learning_rate": 5.827536303972587e-07,
"loss": 0.0028,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 458
},
{
"clip_ratio": 0.0,
"completion_length": 440.28125,
"epoch": 0.0513782006436267,
"grad_norm": 0.09378689742071065,
"kl": 0.0506591796875,
"learning_rate": 5.78882810732465e-07,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 459
},
{
"clip_ratio": 0.0,
"completion_length": 286.09375,
"epoch": 0.05149013572128166,
"grad_norm": 3.1614265708752236,
"kl": 0.16015625,
"learning_rate": 5.75103086796625e-07,
"loss": 0.0016,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 311.296875,
"epoch": 0.05160207079893662,
"grad_norm": 1.7687368403392676,
"kl": 0.1033935546875,
"learning_rate": 5.714146171793846e-07,
"loss": -0.0578,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 461
},
{
"clip_ratio": 0.0,
"completion_length": 377.265625,
"epoch": 0.05171400587659158,
"grad_norm": 0.049362194712489586,
"kl": 0.035614013671875,
"learning_rate": 5.678175566415422e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 462
},
{
"clip_ratio": 0.0,
"completion_length": 364.171875,
"epoch": 0.05182594095424654,
"grad_norm": 0.02337468550122759,
"kl": 0.0418701171875,
"learning_rate": 5.643120561085528e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 463
},
{
"clip_ratio": 0.0,
"completion_length": 286.125,
"epoch": 0.051937876031901496,
"grad_norm": 0.19211609005932148,
"kl": 0.06243896484375,
"learning_rate": 5.608982626641991e-07,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 464
},
{
"clip_ratio": 0.0,
"completion_length": 460.9375,
"epoch": 0.052049811109556456,
"grad_norm": 0.02527496415068125,
"kl": 0.0364990234375,
"learning_rate": 5.575763195444166e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 337.75,
"epoch": 0.052161746187211415,
"grad_norm": 0.02483601186922771,
"kl": 0.0379638671875,
"learning_rate": 5.543463661312847e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 466
},
{
"clip_ratio": 0.0,
"completion_length": 382.203125,
"epoch": 0.052273681264866374,
"grad_norm": 0.030569824177275738,
"kl": 0.036956787109375,
"learning_rate": 5.512085379471808e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 467
},
{
"clip_ratio": 0.0,
"completion_length": 372.96875,
"epoch": 0.05238561634252134,
"grad_norm": 0.870995271718928,
"kl": 0.053680419921875,
"learning_rate": 5.481629666490903e-07,
"loss": -0.0283,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 468
},
{
"clip_ratio": 0.0,
"completion_length": 255.546875,
"epoch": 0.0524975514201763,
"grad_norm": 0.054911233572157374,
"kl": 0.04248046875,
"learning_rate": 5.452097800230853e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 469
},
{
"clip_ratio": 0.0,
"completion_length": 365.671875,
"epoch": 0.05260948649783126,
"grad_norm": 0.9189343613920824,
"kl": 0.35272216796875,
"learning_rate": 5.423491019789623e-07,
"loss": -0.0551,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 364.390625,
"epoch": 0.05272142157548622,
"grad_norm": 0.0633802951576789,
"kl": 0.03961181640625,
"learning_rate": 5.395810525450425e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 471
},
{
"clip_ratio": 0.0,
"completion_length": 332.03125,
"epoch": 0.05283335665314118,
"grad_norm": 0.1011785822244599,
"kl": 0.053466796875,
"learning_rate": 5.369057478631359e-07,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 472
},
{
"clip_ratio": 0.0,
"completion_length": 287.609375,
"epoch": 0.05294529173079614,
"grad_norm": 0.13769887874885445,
"kl": 0.0439453125,
"learning_rate": 5.343233001836694e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 473
},
{
"clip_ratio": 0.0,
"completion_length": 387.34375,
"epoch": 0.0530572268084511,
"grad_norm": 0.09302862947626754,
"kl": 0.03985595703125,
"learning_rate": 5.318338178609754e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 474
},
{
"clip_ratio": 0.0,
"completion_length": 374.40625,
"epoch": 0.053169161886106056,
"grad_norm": 0.1263278332387542,
"kl": 0.062255859375,
"learning_rate": 5.294374053487459e-07,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 358.015625,
"epoch": 0.053281096963761015,
"grad_norm": 0.020700901911474265,
"kl": 0.0355224609375,
"learning_rate": 5.271341631956511e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 476
},
{
"clip_ratio": 0.0,
"completion_length": 401.046875,
"epoch": 0.05339303204141598,
"grad_norm": 0.027139069654941325,
"kl": 0.0396728515625,
"learning_rate": 5.249241880411181e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 477
},
{
"clip_ratio": 0.0,
"completion_length": 515.703125,
"epoch": 0.05350496711907094,
"grad_norm": 0.04265200650936088,
"kl": 0.060302734375,
"learning_rate": 5.228075726112785e-07,
"loss": 0.0006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 478
},
{
"clip_ratio": 0.0,
"completion_length": 386.640625,
"epoch": 0.0536169021967259,
"grad_norm": 0.025504104198776033,
"kl": 0.03570556640625,
"learning_rate": 5.207844057150768e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 479
},
{
"clip_ratio": 0.0,
"completion_length": 213.78125,
"epoch": 0.05372883727438086,
"grad_norm": 18.185082196534506,
"kl": 1.0419921875,
"learning_rate": 5.188547722405437e-07,
"loss": 0.0419,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 390.96875,
"epoch": 0.05384077235203582,
"grad_norm": 0.03327137868690443,
"kl": 0.0408935546875,
"learning_rate": 5.170187531512351e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 481
},
{
"clip_ratio": 0.0,
"completion_length": 426.703125,
"epoch": 0.05395270742969078,
"grad_norm": 1.7420279438246644,
"kl": 1.31329345703125,
"learning_rate": 5.152764254828348e-07,
"loss": -0.0354,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 482
},
{
"clip_ratio": 0.0,
"completion_length": 432.5,
"epoch": 0.05406464250734574,
"grad_norm": 0.33787614688903167,
"kl": 0.06884765625,
"learning_rate": 5.136278623399225e-07,
"loss": 0.0007,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 483
},
{
"clip_ratio": 0.0,
"completion_length": 291.515625,
"epoch": 0.0541765775850007,
"grad_norm": 0.023809496275349114,
"kl": 0.05426025390625,
"learning_rate": 5.120731328929058e-07,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 484
},
{
"clip_ratio": 0.0,
"completion_length": 383.3125,
"epoch": 0.05428851266265566,
"grad_norm": 0.015058815890629395,
"kl": 0.0362548828125,
"learning_rate": 5.106123023751187e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 373.515625,
"epoch": 0.05440044774031062,
"grad_norm": 0.019845886834378008,
"kl": 0.03704833984375,
"learning_rate": 5.092454320800833e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 486
},
{
"clip_ratio": 0.0,
"completion_length": 336.0,
"epoch": 0.05451238281796558,
"grad_norm": 0.05683684336283696,
"kl": 0.0426025390625,
"learning_rate": 5.079725793589405e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 487
},
{
"clip_ratio": 0.0,
"completion_length": 286.71875,
"epoch": 0.05462431789562054,
"grad_norm": 0.03418640906474549,
"kl": 0.04443359375,
"learning_rate": 5.067937976180407e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 488
},
{
"clip_ratio": 0.0,
"completion_length": 318.0625,
"epoch": 0.0547362529732755,
"grad_norm": 0.036114039198282856,
"kl": 0.05450439453125,
"learning_rate": 5.057091363167046e-07,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 489
},
{
"clip_ratio": 0.0,
"completion_length": 376.90625,
"epoch": 0.05484818805093046,
"grad_norm": 0.023332365324867536,
"kl": 0.03631591796875,
"learning_rate": 5.047186409651489e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 388.671875,
"epoch": 0.05496012312858542,
"grad_norm": 13.172841203218344,
"kl": 2.97564697265625,
"learning_rate": 5.038223531225742e-07,
"loss": 0.0339,
"reward": 0.09687500074505806,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.96875,
"step": 491
},
{
"clip_ratio": 0.0,
"completion_length": 310.796875,
"epoch": 0.05507205820624038,
"grad_norm": 9.652533322003624,
"kl": 0.4891357421875,
"learning_rate": 5.030203103954232e-07,
"loss": -0.0404,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 492
},
{
"clip_ratio": 0.0,
"completion_length": 302.296875,
"epoch": 0.05518399328389534,
"grad_norm": 0.030140393405088423,
"kl": 0.0369873046875,
"learning_rate": 5.023125464358026e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 493
},
{
"clip_ratio": 0.0,
"completion_length": 353.0,
"epoch": 0.0552959283615503,
"grad_norm": 0.02422588912502369,
"kl": 0.037200927734375,
"learning_rate": 5.016990909400709e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 494
},
{
"clip_ratio": 0.0,
"completion_length": 370.34375,
"epoch": 0.055407863439205264,
"grad_norm": 0.0372886708561457,
"kl": 0.05389404296875,
"learning_rate": 5.011799696475915e-07,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 495
},
{
"clip_ratio": 0.0,
"completion_length": 384.703125,
"epoch": 0.05551979851686022,
"grad_norm": 0.01359500582951308,
"kl": 0.035980224609375,
"learning_rate": 5.007552043396547e-07,
"loss": 0.0004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 496
},
{
"clip_ratio": 0.0,
"completion_length": 400.46875,
"epoch": 0.05563173359451518,
"grad_norm": 0.018893589493420345,
"kl": 0.04736328125,
"learning_rate": 5.004248128385618e-07,
"loss": 0.0005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 497
},
{
"clip_ratio": 0.0,
"completion_length": 356.9375,
"epoch": 0.05574366867217014,
"grad_norm": 2.740738937008755,
"kl": 0.7550048828125,
"learning_rate": 5.001888090068784e-07,
"loss": -0.0421,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 498
},
{
"clip_ratio": 0.0,
"completion_length": 386.421875,
"epoch": 0.0558556037498251,
"grad_norm": 0.9492236255194394,
"kl": 0.24884033203125,
"learning_rate": 5.000472027468528e-07,
"loss": -0.0556,
"reward": 0.09843750111758709,
"reward_std": 0.0062500000931322575,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.984375,
"step": 499
},
{
"clip_ratio": 0.0,
"completion_length": 344.65625,
"epoch": 0.05596753882748006,
"grad_norm": 0.024972922874043218,
"kl": 0.033905029296875,
"learning_rate": 5.000000000000001e-07,
"loss": 0.0003,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 500
},
{
"epoch": 0.05596753882748006,
"step": 500,
"total_flos": 0.0,
"train_loss": 0.028706138839246706,
"train_runtime": 9937.8104,
"train_samples_per_second": 3.22,
"train_steps_per_second": 0.05
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}