VISTA-4B / trainer_state.json
m1ngcheng's picture
Add files using upload-large-folder tool
c43f667 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.3372681281618887,
"eval_steps": 500,
"global_step": 600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 10.032812595367432,
"epoch": 0.0005621135469364812,
"kl": 0.0,
"learning_rate": 1e-06,
"loss": -0.07857463508844376,
"ratio/all_0": 0.125,
"ratio/all_2": 0.46875,
"reward": 1.776562511920929,
"reward_std": 0.7588308304548264,
"rewards/avg_0": 1.71875,
"rewards/avg_1": 1.71875,
"rewards/avg_2": 1.765625,
"rewards/avg_3": 1.703125,
"rewards/avg_4": 1.75,
"rewards/avg_5": 1.71875,
"rewards/avg_6": 1.6875,
"rewards/avg_7": 1.703125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9921875298023224,
"rewards/point_reward": 0.784375011920929,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 9.984375238418579,
"epoch": 0.0011242270938729624,
"kl": 0.2562527507543564,
"learning_rate": 9.999943788645306e-07,
"loss": -0.03776644542813301,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.578125,
"reward": 1.8640625476837158,
"reward_std": 0.7996459901332855,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.765625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.765625,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8640625178813934,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 10.035937547683716,
"epoch": 0.0016863406408094434,
"kl": 0.7022024244070053,
"learning_rate": 9.999887577290613e-07,
"loss": -0.0651589035987854,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.375,
"reward": 1.7562500536441803,
"reward_std": 0.702281191945076,
"rewards/avg_0": 1.6875,
"rewards/avg_1": 1.6875,
"rewards/avg_2": 1.75,
"rewards/avg_3": 1.75,
"rewards/avg_4": 1.65625,
"rewards/avg_5": 1.6875,
"rewards/avg_6": 1.703125,
"rewards/avg_7": 1.640625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.7578125149011612,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 10.115624904632568,
"epoch": 0.0022484541877459247,
"kl": 1.118683785200119,
"learning_rate": 9.99983136593592e-07,
"loss": -0.04716287553310394,
"ratio/all_0": 0.125,
"ratio/all_2": 0.5,
"reward": 1.748437523841858,
"reward_std": 0.7817670404911041,
"rewards/avg_0": 1.734375,
"rewards/avg_1": 1.65625,
"rewards/avg_2": 1.6875,
"rewards/avg_3": 1.6875,
"rewards/avg_4": 1.671875,
"rewards/avg_5": 1.65625,
"rewards/avg_6": 1.671875,
"rewards/avg_7": 1.71875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.7484375238418579,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 9.982812404632568,
"epoch": 0.002810567734682406,
"kl": 1.4118792414665222,
"learning_rate": 9.999775154581225e-07,
"loss": -0.0010041920468211174,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.546875,
"reward": 1.8250000178813934,
"reward_std": 0.8004997074604034,
"rewards/avg_0": 1.765625,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.765625,
"rewards/avg_3": 1.765625,
"rewards/avg_4": 1.765625,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.75,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8250000029802322,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 10.034375429153442,
"epoch": 0.003372681281618887,
"kl": 1.565059244632721,
"learning_rate": 9.999718943226531e-07,
"loss": -0.005607172846794128,
"ratio/all_0": 0.109375,
"ratio/all_2": 0.453125,
"reward": 1.784375011920929,
"reward_std": 0.7476067095994949,
"rewards/avg_0": 1.75,
"rewards/avg_1": 1.703125,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.71875,
"rewards/avg_4": 1.734375,
"rewards/avg_5": 1.75,
"rewards/avg_6": 1.6875,
"rewards/avg_7": 1.703125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.7859375178813934,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 10.014062881469727,
"epoch": 0.0039347948285553686,
"kl": 1.4795698821544647,
"learning_rate": 9.999662731871838e-07,
"loss": 0.002856176346540451,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.546875,
"reward": 1.8359375298023224,
"reward_std": 0.7821812778711319,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.734375,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.78125,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8359375,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 10.068750381469727,
"epoch": 0.004496908375491849,
"kl": 1.4396315813064575,
"learning_rate": 9.999606520517145e-07,
"loss": -0.008815726265311241,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.609375,
"reward": 1.854687511920929,
"reward_std": 0.803190678358078,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9968750029802322,
"rewards/point_reward": 0.8578125089406967,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 9.831250190734863,
"epoch": 0.00505902192242833,
"kl": 1.3827219307422638,
"learning_rate": 9.99955030916245e-07,
"loss": -0.043409671634435654,
"ratio/all_0": 0.125,
"ratio/all_2": 0.390625,
"reward": 1.7234375178813934,
"reward_std": 0.7338877320289612,
"rewards/avg_0": 1.6875,
"rewards/avg_1": 1.671875,
"rewards/avg_2": 1.671875,
"rewards/avg_3": 1.640625,
"rewards/avg_4": 1.671875,
"rewards/avg_5": 1.578125,
"rewards/avg_6": 1.640625,
"rewards/avg_7": 1.671875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9968750178813934,
"rewards/point_reward": 0.7265625,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 9.918750286102295,
"epoch": 0.005621135469364812,
"kl": 1.4110659658908844,
"learning_rate": 9.999494097807756e-07,
"loss": -0.04152492806315422,
"ratio/all_0": 0.140625,
"ratio/all_2": 0.46875,
"reward": 1.737500011920929,
"reward_std": 0.7673228681087494,
"rewards/avg_0": 1.6875,
"rewards/avg_1": 1.640625,
"rewards/avg_2": 1.6875,
"rewards/avg_3": 1.671875,
"rewards/avg_4": 1.671875,
"rewards/avg_5": 1.6875,
"rewards/avg_6": 1.6875,
"rewards/avg_7": 1.640625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9937500059604645,
"rewards/point_reward": 0.7437500208616257,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 10.062500238418579,
"epoch": 0.006183249016301293,
"kl": 1.4412920773029327,
"learning_rate": 9.999437886453063e-07,
"loss": -0.0002207462675869465,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.53125,
"reward": 1.8234375417232513,
"reward_std": 0.7895885556936264,
"rewards/avg_0": 1.765625,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.765625,
"rewards/avg_4": 1.75,
"rewards/avg_5": 1.734375,
"rewards/avg_6": 1.765625,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8250000178813934,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 9.984375238418579,
"epoch": 0.006745362563237774,
"kl": 1.4790368676185608,
"learning_rate": 9.99938167509837e-07,
"loss": -0.04130768030881882,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.421875,
"reward": 1.7734375298023224,
"reward_std": 0.7166178226470947,
"rewards/avg_0": 1.65625,
"rewards/avg_1": 1.71875,
"rewards/avg_2": 1.71875,
"rewards/avg_3": 1.765625,
"rewards/avg_4": 1.734375,
"rewards/avg_5": 1.703125,
"rewards/avg_6": 1.6875,
"rewards/avg_7": 1.75,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.7750000059604645,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 9.83750033378601,
"epoch": 0.007307476110174255,
"kl": 1.534344106912613,
"learning_rate": 9.999325463743674e-07,
"loss": -0.026084139943122864,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.421875,
"reward": 1.7765625417232513,
"reward_std": 0.719652533531189,
"rewards/avg_0": 1.703125,
"rewards/avg_1": 1.671875,
"rewards/avg_2": 1.6875,
"rewards/avg_3": 1.734375,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.703125,
"rewards/avg_6": 1.71875,
"rewards/avg_7": 1.75,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.7781250029802322,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 10.014062643051147,
"epoch": 0.007869589657110737,
"kl": 1.600354939699173,
"learning_rate": 9.999269252388981e-07,
"loss": -0.010605765506625175,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.453125,
"reward": 1.8046875298023224,
"reward_std": 0.7357720136642456,
"rewards/avg_0": 1.703125,
"rewards/avg_1": 1.734375,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.75,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.75,
"rewards/avg_7": 1.71875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8046875,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 9.998437404632568,
"epoch": 0.008431703204047217,
"kl": 1.5717801451683044,
"learning_rate": 9.999213041034288e-07,
"loss": 0.0074955616146326065,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.609375,
"reward": 1.834375023841858,
"reward_std": 0.8261165618896484,
"rewards/avg_0": 1.765625,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.765625,
"rewards/avg_5": 1.734375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8343750089406967,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 10.057812690734863,
"epoch": 0.008993816750983699,
"kl": 1.4447836577892303,
"learning_rate": 9.999156829679595e-07,
"loss": -0.013929950073361397,
"ratio/all_0": 0.109375,
"ratio/all_2": 0.515625,
"reward": 1.7890625298023224,
"reward_std": 0.786567360162735,
"rewards/avg_0": 1.765625,
"rewards/avg_1": 1.734375,
"rewards/avg_2": 1.703125,
"rewards/avg_3": 1.765625,
"rewards/avg_4": 1.6875,
"rewards/avg_5": 1.765625,
"rewards/avg_6": 1.71875,
"rewards/avg_7": 1.75,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.7890625,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 9.904687643051147,
"epoch": 0.00955593029792018,
"kl": 1.501718521118164,
"learning_rate": 9.999100618324902e-07,
"loss": -0.019112706184387207,
"ratio/all_0": 0.140625,
"ratio/all_2": 0.375,
"reward": 1.7453125417232513,
"reward_std": 0.7173045426607132,
"rewards/avg_0": 1.75,
"rewards/avg_1": 1.6875,
"rewards/avg_2": 1.65625,
"rewards/avg_3": 1.609375,
"rewards/avg_4": 1.71875,
"rewards/avg_5": 1.65625,
"rewards/avg_6": 1.6875,
"rewards/avg_7": 1.6875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.7453124970197678,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 9.840625047683716,
"epoch": 0.01011804384485666,
"kl": 1.4566405415534973,
"learning_rate": 9.999044406970208e-07,
"loss": 0.005196334794163704,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.59375,
"reward": 1.846875011920929,
"reward_std": 0.8059732168912888,
"rewards/avg_0": 1.75,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.846875011920929,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 9.921875476837158,
"epoch": 0.010680157391793142,
"kl": 1.509264975786209,
"learning_rate": 9.998988195615515e-07,
"loss": 0.006893848069012165,
"ratio/all_0": 0.09375,
"ratio/all_2": 0.625,
"reward": 1.84375,
"reward_std": 0.8279983550310135,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8453125208616257,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 10.048437595367432,
"epoch": 0.011242270938729624,
"kl": 1.4839653670787811,
"learning_rate": 9.99893198426082e-07,
"loss": 0.005196704529225826,
"ratio/all_0": 0.109375,
"ratio/all_2": 0.53125,
"reward": 1.8062500357627869,
"reward_std": 0.7997609674930573,
"rewards/avg_0": 1.75,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.765625,
"rewards/avg_5": 1.75,
"rewards/avg_6": 1.71875,
"rewards/avg_7": 1.71875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8062500208616257,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 9.96875,
"epoch": 0.011804384485666104,
"kl": 1.5253983438014984,
"learning_rate": 9.998875772906127e-07,
"loss": -0.01355843897908926,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.40625,
"reward": 1.8078125417232513,
"reward_std": 0.7039558738470078,
"rewards/avg_0": 1.75,
"rewards/avg_1": 1.765625,
"rewards/avg_2": 1.71875,
"rewards/avg_3": 1.734375,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.75,
"rewards/avg_7": 1.71875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8093750029802322,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 9.98593783378601,
"epoch": 0.012366498032602586,
"kl": 1.5911829769611359,
"learning_rate": 9.998819561551433e-07,
"loss": -0.026118595153093338,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.546875,
"reward": 1.796875,
"reward_std": 0.7733737677335739,
"rewards/avg_0": 1.765625,
"rewards/avg_1": 1.6875,
"rewards/avg_2": 1.734375,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.75,
"rewards/avg_5": 1.765625,
"rewards/avg_6": 1.71875,
"rewards/avg_7": 1.75,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.7984375208616257,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 10.207812547683716,
"epoch": 0.012928611579539067,
"kl": 1.4360250234603882,
"learning_rate": 9.99876335019674e-07,
"loss": -0.007857441902160645,
"ratio/all_0": 0.09375,
"ratio/all_2": 0.4375,
"reward": 1.7984375059604645,
"reward_std": 0.736803725361824,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.734375,
"rewards/avg_2": 1.75,
"rewards/avg_3": 1.71875,
"rewards/avg_4": 1.765625,
"rewards/avg_5": 1.765625,
"rewards/avg_6": 1.75,
"rewards/avg_7": 1.703125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.7984375208616257,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 10.175000190734863,
"epoch": 0.013490725126475547,
"kl": 1.6245340406894684,
"learning_rate": 9.998707138842045e-07,
"loss": -0.008536879904568195,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.53125,
"reward": 1.8359375298023224,
"reward_std": 0.7617449015378952,
"rewards/avg_0": 1.75,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.75,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8359375149011612,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 10.134375095367432,
"epoch": 0.014052838673412029,
"kl": 1.8844734132289886,
"learning_rate": 9.998650927487351e-07,
"loss": -0.01100741233676672,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.5,
"reward": 1.8343750536441803,
"reward_std": 0.723815992474556,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.734375,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8343750089406967,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 10.245312690734863,
"epoch": 0.01461495222034851,
"kl": 1.6238836646080017,
"learning_rate": 9.998594716132658e-07,
"loss": -0.013836846686899662,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.578125,
"reward": 1.8046875298023224,
"reward_std": 0.802058219909668,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.6875,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.765625,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.765625,
"rewards/avg_6": 1.734375,
"rewards/avg_7": 1.71875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8046875149011612,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 10.129687786102295,
"epoch": 0.01517706576728499,
"kl": 1.5611946880817413,
"learning_rate": 9.998538504777965e-07,
"loss": -0.03556900471448898,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.359375,
"reward": 1.7734375298023224,
"reward_std": 0.6553197205066681,
"rewards/avg_0": 1.65625,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.765625,
"rewards/avg_3": 1.65625,
"rewards/avg_4": 1.65625,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.71875,
"rewards/avg_7": 1.6875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.7734375149011612,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 10.168750047683716,
"epoch": 0.015739179314221474,
"kl": 1.512777954339981,
"learning_rate": 9.998482293423272e-07,
"loss": -0.014125403016805649,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.515625,
"reward": 1.834375023841858,
"reward_std": 0.7443395406007767,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.703125,
"rewards/avg_6": 1.734375,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8343750238418579,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 10.273437738418579,
"epoch": 0.016301292861157952,
"kl": 1.5330235958099365,
"learning_rate": 9.998426082068576e-07,
"loss": -0.003860312746837735,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.484375,
"reward": 1.8406250178813934,
"reward_std": 0.729529395699501,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.75,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8406250029802322,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 10.125000238418579,
"epoch": 0.016863406408094434,
"kl": 1.6126181781291962,
"learning_rate": 9.998369870713883e-07,
"loss": -0.00027049880009144545,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.46875,
"reward": 1.8328125178813934,
"reward_std": 0.7283511310815811,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.75,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.734375,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8328125178813934,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 10.178124904632568,
"epoch": 0.017425519955030916,
"kl": 1.4546434581279755,
"learning_rate": 9.99831365935919e-07,
"loss": 0.0018729576840996742,
"ratio/all_0": 0.109375,
"ratio/all_2": 0.53125,
"reward": 1.8062500357627869,
"reward_std": 0.8023815006017685,
"rewards/avg_0": 1.75,
"rewards/avg_1": 1.71875,
"rewards/avg_2": 1.765625,
"rewards/avg_3": 1.765625,
"rewards/avg_4": 1.734375,
"rewards/avg_5": 1.734375,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8062500059604645,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 10.1640625,
"epoch": 0.017987633501967398,
"kl": 1.4049744606018066,
"learning_rate": 9.998257448004497e-07,
"loss": -0.0031263651326298714,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.640625,
"reward": 1.8437500596046448,
"reward_std": 0.8310911953449249,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.84375,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 10.075000286102295,
"epoch": 0.01854974704890388,
"kl": 1.3538335859775543,
"learning_rate": 9.998201236649804e-07,
"loss": -0.032431505620479584,
"ratio/all_0": 0.09375,
"ratio/all_2": 0.421875,
"reward": 1.7656250298023224,
"reward_std": 0.7300598323345184,
"rewards/avg_0": 1.625,
"rewards/avg_1": 1.75,
"rewards/avg_2": 1.734375,
"rewards/avg_3": 1.703125,
"rewards/avg_4": 1.640625,
"rewards/avg_5": 1.71875,
"rewards/avg_6": 1.71875,
"rewards/avg_7": 1.765625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.7656250149011612,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 10.176562547683716,
"epoch": 0.01911186059584036,
"kl": 1.4589167535305023,
"learning_rate": 9.99814502529511e-07,
"loss": 0.011930462904274464,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6875,
"reward": 1.885937511920929,
"reward_std": 0.8517080992460251,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8859375268220901,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 10.060937881469727,
"epoch": 0.01967397414277684,
"kl": 1.5603070259094238,
"learning_rate": 9.998088813940415e-07,
"loss": -0.02111126109957695,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.53125,
"reward": 1.8140625357627869,
"reward_std": 0.7738387882709503,
"rewards/avg_0": 1.65625,
"rewards/avg_1": 1.765625,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.765625,
"rewards/avg_7": 1.765625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8156249970197678,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 10.029687643051147,
"epoch": 0.02023608768971332,
"kl": 1.587103694677353,
"learning_rate": 9.998032602585722e-07,
"loss": 0.02595140039920807,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.609375,
"reward": 1.8718750476837158,
"reward_std": 0.8129217773675919,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.871874988079071,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 10.100000143051147,
"epoch": 0.020798201236649803,
"kl": 1.5513676702976227,
"learning_rate": 9.997976391231028e-07,
"loss": -0.009216181933879852,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.578125,
"reward": 1.857812523841858,
"reward_std": 0.7749081701040268,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.75,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8578125238418579,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 10.182812452316284,
"epoch": 0.021360314783586284,
"kl": 1.4709611535072327,
"learning_rate": 9.997920179876335e-07,
"loss": -0.029708366841077805,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.625,
"reward": 1.8078124821186066,
"reward_std": 0.8228506743907928,
"rewards/avg_0": 1.765625,
"rewards/avg_1": 1.75,
"rewards/avg_2": 1.71875,
"rewards/avg_3": 1.75,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.75,
"rewards/avg_7": 1.765625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8078125268220901,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 10.150000095367432,
"epoch": 0.021922428330522766,
"kl": 1.5169185996055603,
"learning_rate": 9.99786396852164e-07,
"loss": 0.017741110175848007,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.578125,
"reward": 1.8906250298023224,
"reward_std": 0.7767912149429321,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8906250149011612,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 10.046875238418579,
"epoch": 0.022484541877459248,
"kl": 1.4632358253002167,
"learning_rate": 9.997807757166947e-07,
"loss": -0.03963226079940796,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.53125,
"reward": 1.803125023841858,
"reward_std": 0.7558850347995758,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.71875,
"rewards/avg_2": 1.734375,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.734375,
"rewards/avg_5": 1.75,
"rewards/avg_6": 1.765625,
"rewards/avg_7": 1.765625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8046875149011612,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 10.187500238418579,
"epoch": 0.02304665542439573,
"kl": 1.3824447095394135,
"learning_rate": 9.997751545812253e-07,
"loss": 0.010992420837283134,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.671875,
"reward": 1.8593750298023224,
"reward_std": 0.8612623810768127,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8609375059604645,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 10.057812690734863,
"epoch": 0.023608768971332208,
"kl": 1.5909472107887268,
"learning_rate": 9.99769533445756e-07,
"loss": -0.013673016801476479,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.53125,
"reward": 1.823437511920929,
"reward_std": 0.7688121795654297,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.75,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.75,
"rewards/avg_4": 1.75,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.75,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.823437511920929,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 10.285937547683716,
"epoch": 0.02417088251826869,
"kl": 1.4693297147750854,
"learning_rate": 9.997639123102867e-07,
"loss": -0.0068480633199214935,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.53125,
"reward": 1.8453125357627869,
"reward_std": 0.7629244476556778,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.765625,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8453125059604645,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 10.30625033378601,
"epoch": 0.02473299606520517,
"kl": 1.4473371505737305,
"learning_rate": 9.997582911748172e-07,
"loss": -0.00031005823984742165,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.53125,
"reward": 1.8234375417232513,
"reward_std": 0.7852741330862045,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.75,
"rewards/avg_6": 1.765625,
"rewards/avg_7": 1.734375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.823437511920929,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 10.312500476837158,
"epoch": 0.025295109612141653,
"kl": 1.4984396696090698,
"learning_rate": 9.997526700393478e-07,
"loss": -0.007865441963076591,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.453125,
"reward": 1.8234375417232513,
"reward_std": 0.7250258326530457,
"rewards/avg_0": 1.765625,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.765625,
"rewards/avg_5": 1.765625,
"rewards/avg_6": 1.78125,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.823437511920929,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 10.32968783378601,
"epoch": 0.025857223159078135,
"kl": 1.4959508776664734,
"learning_rate": 9.997470489038785e-07,
"loss": 0.0012173890136182308,
"ratio/all_0": 0.09375,
"ratio/all_2": 0.609375,
"reward": 1.8203125298023224,
"reward_std": 0.8282635509967804,
"rewards/avg_0": 1.734375,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.75,
"rewards/avg_5": 1.75,
"rewards/avg_6": 1.765625,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8203125,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 10.228125095367432,
"epoch": 0.026419336706014616,
"kl": 1.4259069859981537,
"learning_rate": 9.997414277684092e-07,
"loss": -0.006716692354530096,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.484375,
"reward": 1.8125,
"reward_std": 0.7544415444135666,
"rewards/avg_0": 1.765625,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.75,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.75,
"rewards/avg_6": 1.734375,
"rewards/avg_7": 1.75,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8125000298023224,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 10.353125095367432,
"epoch": 0.026981450252951095,
"kl": 1.383703500032425,
"learning_rate": 9.997358066329399e-07,
"loss": -0.014200778678059578,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.515625,
"reward": 1.8359375,
"reward_std": 0.7453095018863678,
"rewards/avg_0": 1.75,
"rewards/avg_1": 1.765625,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8375000059604645,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 10.187500476837158,
"epoch": 0.027543563799887576,
"kl": 1.6698559820652008,
"learning_rate": 9.997301854974706e-07,
"loss": 0.007298845797777176,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.546875,
"reward": 1.8218750059604645,
"reward_std": 0.7926801592111588,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.671875,
"rewards/avg_3": 1.75,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.734375,
"rewards/avg_6": 1.765625,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8234375268220901,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 10.160937547683716,
"epoch": 0.028105677346824058,
"kl": 1.2673736810684204,
"learning_rate": 9.99724564362001e-07,
"loss": -0.008520995266735554,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.515625,
"reward": 1.8296875059604645,
"reward_std": 0.7676666676998138,
"rewards/avg_0": 1.734375,
"rewards/avg_1": 1.765625,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.765625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8312499970197678,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 10.223437547683716,
"epoch": 0.02866779089376054,
"kl": 1.329988956451416,
"learning_rate": 9.997189432265317e-07,
"loss": -0.023494983091950417,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.5,
"reward": 1.815625011920929,
"reward_std": 0.7471684217453003,
"rewards/avg_0": 1.75,
"rewards/avg_1": 1.75,
"rewards/avg_2": 1.75,
"rewards/avg_3": 1.734375,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.75,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.815625011920929,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 10.117187976837158,
"epoch": 0.02922990444069702,
"kl": 1.569214403629303,
"learning_rate": 9.997133220910624e-07,
"loss": -0.0041077700443565845,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.546875,
"reward": 1.8109375536441803,
"reward_std": 0.7974056601524353,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.75,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.71875,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.75,
"rewards/avg_6": 1.734375,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8109375089406967,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 10.22031283378601,
"epoch": 0.029792017987633503,
"kl": 1.572588711977005,
"learning_rate": 9.99707700955593e-07,
"loss": -0.012450341135263443,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.5,
"reward": 1.8390625417232513,
"reward_std": 0.7325338274240494,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.75,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.765625,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.839062511920929,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 10.168750524520874,
"epoch": 0.03035413153456998,
"kl": 1.601712167263031,
"learning_rate": 9.997020798201237e-07,
"loss": 0.008456533774733543,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.65625,
"reward": 1.8796875178813934,
"reward_std": 0.821140706539154,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.765625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8796875029802322,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 10.239062547683716,
"epoch": 0.030916245081506463,
"kl": 1.5150468349456787,
"learning_rate": 9.996964586846542e-07,
"loss": -0.010614115744829178,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.5,
"reward": 1.7984375059604645,
"reward_std": 0.773614913225174,
"rewards/avg_0": 1.734375,
"rewards/avg_1": 1.765625,
"rewards/avg_2": 1.734375,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.71875,
"rewards/avg_5": 1.734375,
"rewards/avg_6": 1.703125,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.7984375059604645,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 10.107812643051147,
"epoch": 0.03147835862844295,
"kl": 1.594663679599762,
"learning_rate": 9.996908375491849e-07,
"loss": 0.005751887336373329,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.484375,
"reward": 1.8390624821186066,
"reward_std": 0.73665352165699,
"rewards/avg_0": 1.765625,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.75,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8390625268220901,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 10.292187690734863,
"epoch": 0.03204047217537943,
"kl": 1.5536901950836182,
"learning_rate": 9.996852164137155e-07,
"loss": 0.016127917915582657,
"ratio/all_0": 0.0,
"ratio/all_2": 0.671875,
"reward": 1.9140625,
"reward_std": 0.8086505830287933,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.96875,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9140625149011612,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 10.364062547683716,
"epoch": 0.032602585722315905,
"kl": 1.610331416130066,
"learning_rate": 9.996795952782462e-07,
"loss": -0.0014691497199237347,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.5625,
"reward": 1.8609375357627869,
"reward_std": 0.7582378685474396,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8609375059604645,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 10.270312786102295,
"epoch": 0.03316469926925239,
"kl": 1.541633427143097,
"learning_rate": 9.996739741427767e-07,
"loss": 0.01057470217347145,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.546875,
"reward": 1.8750000298023224,
"reward_std": 0.7591521888971329,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.875,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 10.290625095367432,
"epoch": 0.03372681281618887,
"kl": 1.5340014100074768,
"learning_rate": 9.996683530073074e-07,
"loss": 0.009979034774005413,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.515625,
"reward": 1.8750000298023224,
"reward_std": 0.7346808016300201,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.765625,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8750000149011612,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 10.28125,
"epoch": 0.034288926363125354,
"kl": 1.5578001737594604,
"learning_rate": 9.99662731871838e-07,
"loss": -0.021913019940257072,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.546875,
"reward": 1.8281250298023224,
"reward_std": 0.7614506334066391,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.765625,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.75,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.765625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8296875059604645,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 10.281250238418579,
"epoch": 0.03485103991006183,
"kl": 1.4720918536186218,
"learning_rate": 9.996571107363687e-07,
"loss": -0.0258667953312397,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.46875,
"reward": 1.8062500357627869,
"reward_std": 0.7150072902441025,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.703125,
"rewards/avg_2": 1.75,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.75,
"rewards/avg_5": 1.734375,
"rewards/avg_6": 1.703125,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8062500059604645,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 10.309375286102295,
"epoch": 0.03541315345699832,
"kl": 1.5031334161758423,
"learning_rate": 9.996514896008992e-07,
"loss": 0.0031455708667635918,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.625,
"reward": 1.8453125357627869,
"reward_std": 0.8244078755378723,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.78125,
"rewards/avg_7": 1.765625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8453125059604645,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 10.251562595367432,
"epoch": 0.035975267003934795,
"kl": 1.509196251630783,
"learning_rate": 9.9964586846543e-07,
"loss": -0.022226005792617798,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.578125,
"reward": 1.842187523841858,
"reward_std": 0.7700472772121429,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.765625,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.75,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8421875089406967,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 10.271875143051147,
"epoch": 0.03653738055087127,
"kl": 1.5132167041301727,
"learning_rate": 9.996402473299608e-07,
"loss": 0.0028878115117549896,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.578125,
"reward": 1.873437523841858,
"reward_std": 0.7714907974004745,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8734375238418579,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 10.362500429153442,
"epoch": 0.03709949409780776,
"kl": 1.5417117476463318,
"learning_rate": 9.996346261944912e-07,
"loss": 0.0048205191269516945,
"ratio/all_0": 0.0,
"ratio/all_2": 0.5625,
"reward": 1.8921875059604645,
"reward_std": 0.7501703798770905,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8921875208616257,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 10.425000190734863,
"epoch": 0.03766160764474424,
"kl": 1.5405822396278381,
"learning_rate": 9.996290050590219e-07,
"loss": -7.154536433517933e-06,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.484375,
"reward": 1.8203125298023224,
"reward_std": 0.7464889883995056,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.765625,
"rewards/avg_2": 1.734375,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.734375,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8203125149011612,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 10.325000286102295,
"epoch": 0.03822372119168072,
"kl": 1.4785286784172058,
"learning_rate": 9.996233839235526e-07,
"loss": -0.049676112830638885,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.4375,
"reward": 1.792187511920929,
"reward_std": 0.6961659491062164,
"rewards/avg_0": 1.71875,
"rewards/avg_1": 1.734375,
"rewards/avg_2": 1.734375,
"rewards/avg_3": 1.765625,
"rewards/avg_4": 1.765625,
"rewards/avg_5": 1.75,
"rewards/avg_6": 1.734375,
"rewards/avg_7": 1.71875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.7921874970197678,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 10.375,
"epoch": 0.0387858347386172,
"kl": 1.439880520105362,
"learning_rate": 9.996177627880832e-07,
"loss": -0.0004722205922007561,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.515625,
"reward": 1.8671875298023224,
"reward_std": 0.7345052808523178,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8687500059604645,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 10.292187690734863,
"epoch": 0.03934794828555368,
"kl": 1.4667574167251587,
"learning_rate": 9.996121416526137e-07,
"loss": -0.004125084728002548,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.484375,
"reward": 1.826562523841858,
"reward_std": 0.7579498142004013,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.71875,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.734375,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.703125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8265625089406967,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 10.384375095367432,
"epoch": 0.039910061832490164,
"kl": 1.4546357691287994,
"learning_rate": 9.996065205171444e-07,
"loss": -0.0041849445551633835,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.5,
"reward": 1.8359375,
"reward_std": 0.7519094198942184,
"rewards/avg_0": 1.734375,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.765625,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.734375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8359375149011612,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 10.379687547683716,
"epoch": 0.04047217537942664,
"kl": 1.4776123464107513,
"learning_rate": 9.99600899381675e-07,
"loss": 0.012721182778477669,
"ratio/all_0": 0.09375,
"ratio/all_2": 0.5625,
"reward": 1.8375000357627869,
"reward_std": 0.7974309027194977,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.734375,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8375000059604645,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 10.325000047683716,
"epoch": 0.04103428892636313,
"kl": 1.4323228895664215,
"learning_rate": 9.995952782462057e-07,
"loss": -0.0011236064601689577,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.625,
"reward": 1.8796875178813934,
"reward_std": 0.7972553670406342,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8796875178813934,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 10.217187643051147,
"epoch": 0.041596402473299605,
"kl": 1.5072983801364899,
"learning_rate": 9.995896571107362e-07,
"loss": -0.02287622168660164,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.5,
"reward": 1.8250000476837158,
"reward_std": 0.7327965945005417,
"rewards/avg_0": 1.734375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.75,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.734375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8265625089406967,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 10.421875238418579,
"epoch": 0.04215851602023609,
"kl": 1.5226522088050842,
"learning_rate": 9.995840359752669e-07,
"loss": 0.014585795812308788,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.59375,
"reward": 1.8640625476837158,
"reward_std": 0.8066390603780746,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8656250089406967,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 10.390625,
"epoch": 0.04272062956717257,
"kl": 1.5350313782691956,
"learning_rate": 9.995784148397976e-07,
"loss": 0.0005742218345403671,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.53125,
"reward": 1.831250011920929,
"reward_std": 0.7712255567312241,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.6875,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.831250011920929,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 10.404687643051147,
"epoch": 0.04328274311410905,
"kl": 1.487417221069336,
"learning_rate": 9.995727937043282e-07,
"loss": -0.0033020656555891037,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.5625,
"reward": 1.865625023841858,
"reward_std": 0.7682248502969742,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.765625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8656250089406967,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 10.354687452316284,
"epoch": 0.04384485666104553,
"kl": 1.5165870487689972,
"learning_rate": 9.99567172568859e-07,
"loss": -0.024435177445411682,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.453125,
"reward": 1.7906250357627869,
"reward_std": 0.7295899838209152,
"rewards/avg_0": 1.765625,
"rewards/avg_1": 1.734375,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.671875,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.703125,
"rewards/avg_6": 1.765625,
"rewards/avg_7": 1.703125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.7906250059604645,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 10.387500286102295,
"epoch": 0.04440697020798201,
"kl": 1.452039659023285,
"learning_rate": 9.995615514333896e-07,
"loss": -0.03055991232395172,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.46875,
"reward": 1.803125023841858,
"reward_std": 0.725995808839798,
"rewards/avg_0": 1.765625,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.703125,
"rewards/avg_3": 1.75,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.734375,
"rewards/avg_6": 1.71875,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8031250089406967,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 10.582812786102295,
"epoch": 0.044969083754918496,
"kl": 1.5275309681892395,
"learning_rate": 9.995559302979203e-07,
"loss": 0.016731752082705498,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.703125,
"reward": 1.9000000357627869,
"reward_std": 0.848938599228859,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9000000059604645,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 10.4609375,
"epoch": 0.045531197301854974,
"kl": 1.53338223695755,
"learning_rate": 9.995503091624507e-07,
"loss": -0.010927405208349228,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.59375,
"reward": 1.8546875417232513,
"reward_std": 0.7863930761814117,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8546874970197678,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 10.557812690734863,
"epoch": 0.04609331084879146,
"kl": 1.5185647904872894,
"learning_rate": 9.995446880269814e-07,
"loss": 0.02417963184416294,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.65625,
"reward": 1.910937488079071,
"reward_std": 0.8183117806911469,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9109375327825546,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 10.412499904632568,
"epoch": 0.04665542439572794,
"kl": 1.5981625616550446,
"learning_rate": 9.99539066891512e-07,
"loss": 0.014372358098626137,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.609375,
"reward": 1.8687500357627869,
"reward_std": 0.7999352663755417,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8687500059604645,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 10.357812643051147,
"epoch": 0.047217537942664416,
"kl": 1.584965080022812,
"learning_rate": 9.995334457560428e-07,
"loss": 0.0024953281972557306,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.59375,
"reward": 1.8640625178813934,
"reward_std": 0.7857124209403992,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8640625029802322,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 10.412500143051147,
"epoch": 0.0477796514896009,
"kl": 1.5639266967773438,
"learning_rate": 9.995278246205732e-07,
"loss": 0.014907699078321457,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.625,
"reward": 1.8984375298023224,
"reward_std": 0.787330225110054,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8984375,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 10.464062690734863,
"epoch": 0.04834176503653738,
"kl": 1.5700583755970001,
"learning_rate": 9.99522203485104e-07,
"loss": 0.02378089912235737,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.625,
"reward": 1.8828125298023224,
"reward_std": 0.8185782730579376,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8828125,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 10.39218783378601,
"epoch": 0.048903878583473864,
"kl": 1.5734200477600098,
"learning_rate": 9.995165823496346e-07,
"loss": -0.0061425757594406605,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.53125,
"reward": 1.8328125178813934,
"reward_std": 0.7643389850854874,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.765625,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.71875,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.734375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8343750089406967,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 10.504687547683716,
"epoch": 0.04946599213041034,
"kl": 1.4525836110115051,
"learning_rate": 9.995109612141653e-07,
"loss": 0.030096255242824554,
"ratio/all_0": 0.0,
"ratio/all_2": 0.640625,
"reward": 1.9406250417232513,
"reward_std": 0.7833255678415298,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.984375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.96875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9406249970197678,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 10.453125,
"epoch": 0.05002810567734682,
"kl": 1.550961047410965,
"learning_rate": 9.995053400786957e-07,
"loss": 0.01746845617890358,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6875,
"reward": 1.8875000476837158,
"reward_std": 0.8394832015037537,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.887499988079071,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 10.529687881469727,
"epoch": 0.050590219224283306,
"kl": 1.5564002692699432,
"learning_rate": 9.994997189432264e-07,
"loss": 0.009959584102034569,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.5625,
"reward": 1.8703125417232513,
"reward_std": 0.7675454616546631,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.870312511920929,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 10.375000238418579,
"epoch": 0.051152332771219784,
"kl": 1.5659159719944,
"learning_rate": 9.99494097807757e-07,
"loss": 0.02350633218884468,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.65625,
"reward": 1.8843750357627869,
"reward_std": 0.8328011333942413,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8843750208616257,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 10.443750381469727,
"epoch": 0.05171444631815627,
"kl": 1.5553985238075256,
"learning_rate": 9.994884766722878e-07,
"loss": 0.011613225564360619,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.640625,
"reward": 1.8828125298023224,
"reward_std": 0.8189027905464172,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8828125149011612,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 10.3828125,
"epoch": 0.05227655986509275,
"kl": 1.5122833847999573,
"learning_rate": 9.994828555368184e-07,
"loss": 0.03290847688913345,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.6875,
"reward": 1.9328125417232513,
"reward_std": 0.8211647868156433,
"rewards/avg_0": 1.9375,
"rewards/avg_1": 1.953125,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.9375,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.932812511920929,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 10.500000238418579,
"epoch": 0.05283867341202923,
"kl": 1.4713640809059143,
"learning_rate": 9.994772344013491e-07,
"loss": 0.025454115122556686,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.65625,
"reward": 1.9218750298023224,
"reward_std": 0.8077968657016754,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.953125,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.921875,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 10.546875238418579,
"epoch": 0.05340078695896571,
"kl": 1.5354516804218292,
"learning_rate": 9.994716132658798e-07,
"loss": -0.03127738833427429,
"ratio/all_0": 0.0,
"ratio/all_2": 0.5,
"reward": 1.8312500417232513,
"reward_std": 0.7202664166688919,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.75,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.734375,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.765625,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8328125029802322,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 10.439062595367432,
"epoch": 0.05396290050590219,
"kl": 1.5282418131828308,
"learning_rate": 9.994659921304103e-07,
"loss": 0.012253593653440475,
"ratio/all_0": 0.0,
"ratio/all_2": 0.65625,
"reward": 1.904687523841858,
"reward_std": 0.801643967628479,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9046875089406967,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 10.551562547683716,
"epoch": 0.054525014052838675,
"kl": 1.4728964567184448,
"learning_rate": 9.99460370994941e-07,
"loss": -0.01994449459016323,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.578125,
"reward": 1.807812511920929,
"reward_std": 0.804884597659111,
"rewards/avg_0": 1.734375,
"rewards/avg_1": 1.765625,
"rewards/avg_2": 1.765625,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.75,
"rewards/avg_6": 1.78125,
"rewards/avg_7": 1.71875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.807812511920929,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 10.481250286102295,
"epoch": 0.05508712759977515,
"kl": 1.5632469356060028,
"learning_rate": 9.994547498594716e-07,
"loss": 0.020106907933950424,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.609375,
"reward": 1.873437523841858,
"reward_std": 0.8139888793230057,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.75,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8750000149011612,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 10.428125143051147,
"epoch": 0.05564924114671164,
"kl": 1.521868348121643,
"learning_rate": 9.994491287240023e-07,
"loss": 0.0054258364252746105,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.5,
"reward": 1.846875011920929,
"reward_std": 0.7462553530931473,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.765625,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.78125,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.846875011920929,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 10.45468783378601,
"epoch": 0.056211354693648116,
"kl": 1.505905568599701,
"learning_rate": 9.994435075885328e-07,
"loss": 0.008497299626469612,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.5625,
"reward": 1.8718750178813934,
"reward_std": 0.7737879902124405,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8718750178813934,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 10.315625429153442,
"epoch": 0.0567734682405846,
"kl": 1.5346179604530334,
"learning_rate": 9.994378864530634e-07,
"loss": -0.004003023728728294,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.578125,
"reward": 1.8593750298023224,
"reward_std": 0.7765247821807861,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.734375,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8593750149011612,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 10.559375047683716,
"epoch": 0.05733558178752108,
"kl": 1.5174490213394165,
"learning_rate": 9.994322653175941e-07,
"loss": -0.007914504036307335,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.5,
"reward": 1.8375000059604645,
"reward_std": 0.7441652566194534,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.765625,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8375000059604645,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 10.350000143051147,
"epoch": 0.05789769533445756,
"kl": 1.5438537895679474,
"learning_rate": 9.994266441821248e-07,
"loss": -0.005718717817217112,
"ratio/all_0": 0.0,
"ratio/all_2": 0.59375,
"reward": 1.8765625357627869,
"reward_std": 0.7701685428619385,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.75,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8781249970197678,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 10.517187595367432,
"epoch": 0.05845980888139404,
"kl": 1.5379104316234589,
"learning_rate": 9.994210230466555e-07,
"loss": -0.0068434132263064384,
"ratio/all_0": 0.0,
"ratio/all_2": 0.5625,
"reward": 1.8796875178813934,
"reward_std": 0.7376827597618103,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8796875178813934,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 10.384375095367432,
"epoch": 0.05902192242833052,
"kl": 1.5371789634227753,
"learning_rate": 9.99415401911186e-07,
"loss": 0.011498227715492249,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.625,
"reward": 1.8906250298023224,
"reward_std": 0.7910128086805344,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.890625,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 10.320312738418579,
"epoch": 0.059584035975267007,
"kl": 1.4497211277484894,
"learning_rate": 9.994097807757166e-07,
"loss": 0.023686472326517105,
"ratio/all_0": 0.09375,
"ratio/all_2": 0.609375,
"reward": 1.854687511920929,
"reward_std": 0.8297070264816284,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.854687511920929,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 10.340625286102295,
"epoch": 0.060146149522203485,
"kl": 1.56570765376091,
"learning_rate": 9.994041596402473e-07,
"loss": -0.023295482620596886,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.4375,
"reward": 1.8187500536441803,
"reward_std": 0.6901557743549347,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.703125,
"rewards/avg_3": 1.765625,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.765625,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.75,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8187500089406967,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 10.503125190734863,
"epoch": 0.06070826306913996,
"kl": 1.4991244673728943,
"learning_rate": 9.99398538504778e-07,
"loss": -0.0025712046772241592,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.5625,
"reward": 1.8531250059604645,
"reward_std": 0.7726690620183945,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8531250208616257,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 10.43906283378601,
"epoch": 0.06127037661607645,
"kl": 1.52072274684906,
"learning_rate": 9.993929173693084e-07,
"loss": -0.0034492649137973785,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.59375,
"reward": 1.859375,
"reward_std": 0.7871571332216263,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.765625,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8593750149011612,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 10.625000476837158,
"epoch": 0.061832490163012926,
"kl": 1.4775553047657013,
"learning_rate": 9.993872962338393e-07,
"loss": -0.004010038450360298,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.5,
"reward": 1.8609375655651093,
"reward_std": 0.7292641997337341,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8609375059604645,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 10.553124904632568,
"epoch": 0.06239460370994941,
"kl": 1.537141591310501,
"learning_rate": 9.993816750983698e-07,
"loss": 0.012600191868841648,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.515625,
"reward": 1.8531250357627869,
"reward_std": 0.7582390755414963,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8531250059604645,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 10.557812452316284,
"epoch": 0.0629567172568859,
"kl": 1.4744674563407898,
"learning_rate": 9.993760539629005e-07,
"loss": 0.017602285370230675,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.609375,
"reward": 1.8593750298023224,
"reward_std": 0.8213162273168564,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.78125,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8593750149011612,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 10.635937452316284,
"epoch": 0.06351883080382237,
"kl": 1.5100781619548798,
"learning_rate": 9.993704328274311e-07,
"loss": -0.005892171524465084,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.5,
"reward": 1.8359375,
"reward_std": 0.7448712140321732,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.765625,
"rewards/avg_2": 1.75,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.765625,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8359375298023224,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 10.535937786102295,
"epoch": 0.06408094435075885,
"kl": 1.4870060682296753,
"learning_rate": 9.993648116919618e-07,
"loss": -0.0005512246862053871,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.53125,
"reward": 1.850000023841858,
"reward_std": 0.7565291672945023,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.75,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8500000238418579,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 10.523437738418579,
"epoch": 0.06464305789769534,
"kl": 1.5237656235694885,
"learning_rate": 9.993591905564923e-07,
"loss": 0.0005614343099296093,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.53125,
"reward": 1.8578125536441803,
"reward_std": 0.745839923620224,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8578124940395355,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 10.535937547683716,
"epoch": 0.06520517144463181,
"kl": 1.5184479653835297,
"learning_rate": 9.99353569421023e-07,
"loss": 0.018243275582790375,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.625,
"reward": 1.8921875357627869,
"reward_std": 0.8082122951745987,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8921875059604645,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 10.553125381469727,
"epoch": 0.0657672849915683,
"kl": 1.5549254715442657,
"learning_rate": 9.993479482855536e-07,
"loss": 0.029085306450724602,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6875,
"reward": 1.9109375178813934,
"reward_std": 0.8369801193475723,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9109375178813934,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 10.506250143051147,
"epoch": 0.06632939853850478,
"kl": 1.5414323210716248,
"learning_rate": 9.993423271500843e-07,
"loss": 0.011565025895833969,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.578125,
"reward": 1.8671875298023224,
"reward_std": 0.7845959216356277,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8671875,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 10.417187690734863,
"epoch": 0.06689151208544127,
"kl": 1.533017873764038,
"learning_rate": 9.99336706014615e-07,
"loss": 0.0029391534626483917,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.625,
"reward": 1.865625023841858,
"reward_std": 0.8084775656461716,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.75,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8656250089406967,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 10.514062643051147,
"epoch": 0.06745362563237774,
"kl": 1.5064394176006317,
"learning_rate": 9.993310848791455e-07,
"loss": -0.0017734202556312084,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.53125,
"reward": 1.865625023841858,
"reward_std": 0.7416293770074844,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8656250089406967,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 10.545312643051147,
"epoch": 0.06801573917931422,
"kl": 1.5602343082427979,
"learning_rate": 9.993254637436761e-07,
"loss": -0.006767038721591234,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.5,
"reward": 1.8250000178813934,
"reward_std": 0.7424818426370621,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.765625,
"rewards/avg_2": 1.765625,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.734375,
"rewards/avg_6": 1.765625,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8250000029802322,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 10.515625,
"epoch": 0.06857785272625071,
"kl": 1.5196544229984283,
"learning_rate": 9.993198426082068e-07,
"loss": -0.00423224875703454,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.53125,
"reward": 1.8250000178813934,
"reward_std": 0.7697820663452148,
"rewards/avg_0": 1.75,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.71875,
"rewards/avg_7": 1.765625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8250000178813934,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 10.515625238418579,
"epoch": 0.06913996627318718,
"kl": 1.453963041305542,
"learning_rate": 9.993142214727375e-07,
"loss": -0.01648056134581566,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.515625,
"reward": 1.8375000357627869,
"reward_std": 0.7505858242511749,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.734375,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.75,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8375000059604645,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 10.528125047683716,
"epoch": 0.06970207982012366,
"kl": 1.4716617166996002,
"learning_rate": 9.99308600337268e-07,
"loss": 0.02104857563972473,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.53125,
"reward": 1.857812523841858,
"reward_std": 0.7756710201501846,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.765625,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8578125089406967,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 10.478125095367432,
"epoch": 0.07026419336706015,
"kl": 1.5574918985366821,
"learning_rate": 9.993029792017988e-07,
"loss": -0.0042773825116455555,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.53125,
"reward": 1.8406250178813934,
"reward_std": 0.7628335356712341,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.765625,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.765625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.75,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8406250178813934,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 10.543750286102295,
"epoch": 0.07082630691399663,
"kl": 1.4953092634677887,
"learning_rate": 9.992973580663293e-07,
"loss": 0.021830586716532707,
"ratio/all_0": 0.09375,
"ratio/all_2": 0.53125,
"reward": 1.8656249940395355,
"reward_std": 0.7679849863052368,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8671875149011612,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 10.440625190734863,
"epoch": 0.0713884204609331,
"kl": 1.5018656253814697,
"learning_rate": 9.9929173693086e-07,
"loss": -0.0003001997247338295,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.609375,
"reward": 1.8640625178813934,
"reward_std": 0.7929854989051819,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8640625178813934,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 10.370312690734863,
"epoch": 0.07195053400786959,
"kl": 1.4831855297088623,
"learning_rate": 9.992861157953907e-07,
"loss": 0.0010036500170826912,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.609375,
"reward": 1.8750000298023224,
"reward_std": 0.79169100522995,
"rewards/avg_0": 1.75,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8750000149011612,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 10.362500190734863,
"epoch": 0.07251264755480608,
"kl": 1.5014575123786926,
"learning_rate": 9.992804946599213e-07,
"loss": -0.005141148809343576,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.515625,
"reward": 1.8125,
"reward_std": 0.7820057570934296,
"rewards/avg_0": 1.765625,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.65625,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.765625,
"rewards/avg_5": 1.765625,
"rewards/avg_6": 1.734375,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8125000298023224,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 10.350000143051147,
"epoch": 0.07307476110174255,
"kl": 1.5373378098011017,
"learning_rate": 9.99274873524452e-07,
"loss": 0.018027551472187042,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.625,
"reward": 1.9062500298023224,
"reward_std": 0.7975836396217346,
"rewards/avg_0": 1.9375,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9062500149011612,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 10.303125381469727,
"epoch": 0.07363687464867903,
"kl": 1.5327981412410736,
"learning_rate": 9.992692523889825e-07,
"loss": -0.019817432388663292,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.53125,
"reward": 1.8281250298023224,
"reward_std": 0.7495806366205215,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.765625,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.6875,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8281250149011612,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 10.450000047683716,
"epoch": 0.07419898819561552,
"kl": 1.5207752287387848,
"learning_rate": 9.992636312535132e-07,
"loss": 0.024002227932214737,
"ratio/all_0": 0.0,
"ratio/all_2": 0.6875,
"reward": 1.9218750298023224,
"reward_std": 0.8269690722227097,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.96875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9218750149011612,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 10.307812452316284,
"epoch": 0.07476110174255199,
"kl": 1.5719003975391388,
"learning_rate": 9.992580101180438e-07,
"loss": 0.026019521057605743,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.640625,
"reward": 1.8765625059604645,
"reward_std": 0.830683246254921,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.878125011920929,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 10.370312929153442,
"epoch": 0.07532321528948847,
"kl": 1.516474723815918,
"learning_rate": 9.992523889825745e-07,
"loss": -0.010382582433521748,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.53125,
"reward": 1.8406250476837158,
"reward_std": 0.7487256675958633,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.75,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.75,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8406250029802322,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 10.26718783378601,
"epoch": 0.07588532883642496,
"kl": 1.586272120475769,
"learning_rate": 9.99246767847105e-07,
"loss": 0.007272321730852127,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.59375,
"reward": 1.846875011920929,
"reward_std": 0.8077567517757416,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.75,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8484375178813934,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 10.437500238418579,
"epoch": 0.07644744238336144,
"kl": 1.5889216363430023,
"learning_rate": 9.992411467116357e-07,
"loss": 0.00854005478322506,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.609375,
"reward": 1.8828125298023224,
"reward_std": 0.7845934629440308,
"rewards/avg_0": 1.765625,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8828125149011612,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 10.453125238418579,
"epoch": 0.07700955593029792,
"kl": 1.5352471768856049,
"learning_rate": 9.992355255761663e-07,
"loss": 0.01773301512002945,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.515625,
"reward": 1.8843750059604645,
"reward_std": 0.7357997745275497,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8843750208616257,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 10.485937595367432,
"epoch": 0.0775716694772344,
"kl": 1.5447623431682587,
"learning_rate": 9.99229904440697e-07,
"loss": 0.012218187563121319,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.625,
"reward": 1.893750011920929,
"reward_std": 0.79325070977211,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.893750011920929,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 10.45468783378601,
"epoch": 0.07813378302417089,
"kl": 1.6122995018959045,
"learning_rate": 9.992242833052275e-07,
"loss": -0.013333135284483433,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.53125,
"reward": 1.826562523841858,
"reward_std": 0.7633627504110336,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.734375,
"rewards/avg_5": 1.765625,
"rewards/avg_6": 1.71875,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8265625089406967,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 10.484375,
"epoch": 0.07869589657110736,
"kl": 1.593853771686554,
"learning_rate": 9.992186621697581e-07,
"loss": 0.020144429057836533,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.546875,
"reward": 1.854687511920929,
"reward_std": 0.7729583382606506,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.75,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8546874970197678,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 10.33750033378601,
"epoch": 0.07925801011804384,
"kl": 1.5863473117351532,
"learning_rate": 9.992130410342888e-07,
"loss": 0.015076573938131332,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.5625,
"reward": 1.8484375178813934,
"reward_std": 0.7912780493497849,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8484375178813934,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 10.457812547683716,
"epoch": 0.07982012366498033,
"kl": 1.5253994762897491,
"learning_rate": 9.992074198988195e-07,
"loss": 0.02953893318772316,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.671875,
"reward": 1.8859375417232513,
"reward_std": 0.8484649807214737,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.885937511920929,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 10.367187976837158,
"epoch": 0.08038223721191681,
"kl": 1.569258987903595,
"learning_rate": 9.992017987633502e-07,
"loss": 0.033501721918582916,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.734375,
"reward": 1.9156250357627869,
"reward_std": 0.8672823309898376,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9156250059604645,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 10.559375286102295,
"epoch": 0.08094435075885328,
"kl": 1.5631862878799438,
"learning_rate": 9.991961776278809e-07,
"loss": -0.0034087556414306164,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.546875,
"reward": 1.8421874940395355,
"reward_std": 0.7658355087041855,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.703125,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8421875238418579,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 10.410937786102295,
"epoch": 0.08150646430578977,
"kl": 1.4907237887382507,
"learning_rate": 9.991905564924115e-07,
"loss": 0.00786613579839468,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.53125,
"reward": 1.8671875298023224,
"reward_std": 0.7537936717271805,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.765625,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8671875,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 10.439062595367432,
"epoch": 0.08206857785272625,
"kl": 1.4968520998954773,
"learning_rate": 9.99184935356942e-07,
"loss": 0.01350158266723156,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.625,
"reward": 1.9000000357627869,
"reward_std": 0.7897035330533981,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9968750178813934,
"rewards/point_reward": 0.9031250029802322,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 10.484375238418579,
"epoch": 0.08263069139966273,
"kl": 1.5990287363529205,
"learning_rate": 9.991793142214727e-07,
"loss": 0.031508635729551315,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.625,
"reward": 1.893750011920929,
"reward_std": 0.8107470273971558,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8953125029802322,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 10.526562929153442,
"epoch": 0.08319280494659921,
"kl": 1.5559196174144745,
"learning_rate": 9.991736930860034e-07,
"loss": 0.011043448001146317,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.640625,
"reward": 1.8718750178813934,
"reward_std": 0.8150699585676193,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9968750029802322,
"rewards/point_reward": 0.8750000149011612,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 10.415625095367432,
"epoch": 0.0837549184935357,
"kl": 1.5151457488536835,
"learning_rate": 9.99168071950534e-07,
"loss": 0.013916858471930027,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.546875,
"reward": 1.8671875,
"reward_std": 0.7652469575405121,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8671875149011612,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 10.537499904632568,
"epoch": 0.08431703204047218,
"kl": 1.5213910639286041,
"learning_rate": 9.991624508150645e-07,
"loss": 0.02209000289440155,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.609375,
"reward": 1.8812500536441803,
"reward_std": 0.806503638625145,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8812500089406967,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 10.379687786102295,
"epoch": 0.08487914558740865,
"kl": 1.5504529476165771,
"learning_rate": 9.991568296795952e-07,
"loss": 0.02314211055636406,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.546875,
"reward": 1.854687511920929,
"reward_std": 0.7863009423017502,
"rewards/avg_0": 1.765625,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.854687511920929,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 10.426562547683716,
"epoch": 0.08544125913434514,
"kl": 1.493497610092163,
"learning_rate": 9.991512085441259e-07,
"loss": 0.016514841467142105,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.625,
"reward": 1.9015625417232513,
"reward_std": 0.7979805618524551,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.9375,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.9031250029802322,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 10.464062690734863,
"epoch": 0.08600337268128162,
"kl": 1.5301005244255066,
"learning_rate": 9.991455874086565e-07,
"loss": 0.019722461700439453,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.640625,
"reward": 1.8796875178813934,
"reward_std": 0.8272948712110519,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8796875178813934,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 10.364062547683716,
"epoch": 0.0865654862282181,
"kl": 1.5509155094623566,
"learning_rate": 9.99139966273187e-07,
"loss": 0.010571800172328949,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.59375,
"reward": 1.8718750178813934,
"reward_std": 0.7905745208263397,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8718750029802322,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 10.395312786102295,
"epoch": 0.08712759977515458,
"kl": 1.5481449365615845,
"learning_rate": 9.991343451377177e-07,
"loss": -0.004659176804125309,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.5,
"reward": 1.850000023841858,
"reward_std": 0.7373934984207153,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.765625,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8500000089406967,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 10.460937738418579,
"epoch": 0.08768971332209106,
"kl": 1.5249327719211578,
"learning_rate": 9.991287240022486e-07,
"loss": 0.0059776403941214085,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.53125,
"reward": 1.8531250357627869,
"reward_std": 0.7579726427793503,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.765625,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8531250208616257,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 10.409374952316284,
"epoch": 0.08825182686902755,
"kl": 1.5869796872138977,
"learning_rate": 9.99123102866779e-07,
"loss": 0.008219568058848381,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.515625,
"reward": 1.8406250476837158,
"reward_std": 0.7582390904426575,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.734375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8406250178813934,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 10.471874952316284,
"epoch": 0.08881394041596402,
"kl": 1.4864901006221771,
"learning_rate": 9.991174817313097e-07,
"loss": 0.019063614308834076,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.734375,
"reward": 1.8781250417232513,
"reward_std": 0.880060464143753,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8796875029802322,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 10.4296875,
"epoch": 0.0893760539629005,
"kl": 1.5587861239910126,
"learning_rate": 9.991118605958404e-07,
"loss": 0.005818033590912819,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.609375,
"reward": 1.8562500178813934,
"reward_std": 0.8070340156555176,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.765625,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8562500029802322,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 10.453125238418579,
"epoch": 0.08993816750983699,
"kl": 1.5292396545410156,
"learning_rate": 9.99106239460371e-07,
"loss": 0.03895500302314758,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.734375,
"reward": 1.904687523841858,
"reward_std": 0.886305496096611,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9046875089406967,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 10.471874952316284,
"epoch": 0.09050028105677346,
"kl": 1.5067280530929565,
"learning_rate": 9.991006183249015e-07,
"loss": 0.012101344764232635,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.578125,
"reward": 1.8718750476837158,
"reward_std": 0.7817658483982086,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8718750029802322,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 10.331250190734863,
"epoch": 0.09106239460370995,
"kl": 1.6083339750766754,
"learning_rate": 9.990949971894322e-07,
"loss": 0.014146502129733562,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.5625,
"reward": 1.8437500298023224,
"reward_std": 0.7921317666769028,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.765625,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.75,
"rewards/avg_5": 1.765625,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8437500149011612,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 10.457812786102295,
"epoch": 0.09162450815064643,
"kl": 1.5441269874572754,
"learning_rate": 9.990893760539629e-07,
"loss": 0.0075755673460662365,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.625,
"reward": 1.8453125357627869,
"reward_std": 0.8307097852230072,
"rewards/avg_0": 1.765625,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.765625,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8453125059604645,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 10.354687690734863,
"epoch": 0.09218662169758292,
"kl": 1.6081570386886597,
"learning_rate": 9.990837549184936e-07,
"loss": 0.028595969080924988,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.65625,
"reward": 1.9156250059604645,
"reward_std": 0.8074723035097122,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.9375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9156250059604645,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 10.518750190734863,
"epoch": 0.09274873524451939,
"kl": 1.5291050374507904,
"learning_rate": 9.99078133783024e-07,
"loss": 0.008415000513195992,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.578125,
"reward": 1.889062523841858,
"reward_std": 0.7609771192073822,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8890625089406967,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 10.365625143051147,
"epoch": 0.09331084879145587,
"kl": 1.6111627221107483,
"learning_rate": 9.990725126475547e-07,
"loss": 0.00948229618370533,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.640625,
"reward": 1.870312511920929,
"reward_std": 0.8168683350086212,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.765625,
"rewards/avg_4": 1.765625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8703124970197678,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 10.459375143051147,
"epoch": 0.09387296233839236,
"kl": 1.5411999225616455,
"learning_rate": 9.990668915120854e-07,
"loss": 0.018182864412665367,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.640625,
"reward": 1.854687511920929,
"reward_std": 0.8434903770685196,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.854687511920929,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 10.379687786102295,
"epoch": 0.09443507588532883,
"kl": 1.5454858243465424,
"learning_rate": 9.99061270376616e-07,
"loss": 0.010270864702761173,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.671875,
"reward": 1.8828125,
"reward_std": 0.8237272053956985,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8828125149011612,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 10.451562643051147,
"epoch": 0.09499718943226532,
"kl": 1.509293019771576,
"learning_rate": 9.990556492411467e-07,
"loss": 0.029691193252801895,
"ratio/all_0": 0.0,
"ratio/all_2": 0.71875,
"reward": 1.9359374940395355,
"reward_std": 0.8380990624427795,
"rewards/avg_0": 1.96875,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.9375,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.9375,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9359375238418579,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 10.370312690734863,
"epoch": 0.0955593029792018,
"kl": 1.6071103811264038,
"learning_rate": 9.990500281056772e-07,
"loss": 0.004686681553721428,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.640625,
"reward": 1.8687500059604645,
"reward_std": 0.8097708225250244,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.870312511920929,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 10.392187595367432,
"epoch": 0.09612141652613827,
"kl": 1.6090731918811798,
"learning_rate": 9.99044406970208e-07,
"loss": 0.011083896271884441,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.59375,
"reward": 1.885937511920929,
"reward_std": 0.772784024477005,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8859375268220901,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 10.264062643051147,
"epoch": 0.09668353007307476,
"kl": 1.5098300874233246,
"learning_rate": 9.990387858347385e-07,
"loss": 0.013532341457903385,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.65625,
"reward": 1.8750000298023224,
"reward_std": 0.8310911804437637,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8750000149011612,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 10.426562786102295,
"epoch": 0.09724564362001124,
"kl": 1.5602307319641113,
"learning_rate": 9.990331646992692e-07,
"loss": 0.01641331985592842,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.515625,
"reward": 1.881250023841858,
"reward_std": 0.7340317219495773,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8812500089406967,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 10.332812786102295,
"epoch": 0.09780775716694773,
"kl": 1.5638643205165863,
"learning_rate": 9.990275435638e-07,
"loss": 0.005013991147279739,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.640625,
"reward": 1.8578124940395355,
"reward_std": 0.8247323632240295,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8578125238418579,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 10.34531283378601,
"epoch": 0.0983698707138842,
"kl": 1.5222603976726532,
"learning_rate": 9.990219224283306e-07,
"loss": 0.0225371066480875,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.65625,
"reward": 1.8875000476837158,
"reward_std": 0.8316797167062759,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8875000178813934,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 10.446875095367432,
"epoch": 0.09893198426082069,
"kl": 1.522368609905243,
"learning_rate": 9.99016301292861e-07,
"loss": 0.026052303612232208,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.625,
"reward": 1.9000000059604645,
"reward_std": 0.8042051494121552,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.9375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9000000208616257,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 10.289062738418579,
"epoch": 0.09949409780775717,
"kl": 1.5553119480609894,
"learning_rate": 9.990106801573917e-07,
"loss": 0.03195918723940849,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.640625,
"reward": 1.9109375178813934,
"reward_std": 0.8066186159849167,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9109375178813934,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 10.404687881469727,
"epoch": 0.10005621135469364,
"kl": 1.5360974669456482,
"learning_rate": 9.990050590219224e-07,
"loss": 0.014459017664194107,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.546875,
"reward": 1.873437523841858,
"reward_std": 0.7592671066522598,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.78125,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8750000149011612,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 10.451562643051147,
"epoch": 0.10061832490163013,
"kl": 1.5627336502075195,
"learning_rate": 9.98999437886453e-07,
"loss": 0.0008577615953981876,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.53125,
"reward": 1.870312511920929,
"reward_std": 0.7360649853944778,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.870312511920929,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 10.4296875,
"epoch": 0.10118043844856661,
"kl": 1.5364422798156738,
"learning_rate": 9.989938167509838e-07,
"loss": 0.013105844147503376,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.546875,
"reward": 1.885937511920929,
"reward_std": 0.7499039620161057,
"rewards/avg_0": 1.9375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8859374970197678,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 10.578125,
"epoch": 0.1017425519955031,
"kl": 1.5224189162254333,
"learning_rate": 9.989881956155142e-07,
"loss": 0.006834621541202068,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.5625,
"reward": 1.8328125178813934,
"reward_std": 0.7984373569488525,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.75,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.765625,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8328125178813934,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 10.600000381469727,
"epoch": 0.10230466554243957,
"kl": 1.5363523662090302,
"learning_rate": 9.989825744800449e-07,
"loss": 0.002911627758294344,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.546875,
"reward": 1.8765625059604645,
"reward_std": 0.7439265847206116,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8765625208616257,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 10.567187547683716,
"epoch": 0.10286677908937605,
"kl": 1.5349900424480438,
"learning_rate": 9.989769533445756e-07,
"loss": 0.020761406049132347,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.625,
"reward": 1.8718750476837158,
"reward_std": 0.8221687376499176,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8718750029802322,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 10.546875,
"epoch": 0.10342889263631254,
"kl": 1.5807002484798431,
"learning_rate": 9.989713322091062e-07,
"loss": 0.001111285062506795,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.4375,
"reward": 1.8453125357627869,
"reward_std": 0.6977837383747101,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.71875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8453125059604645,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 10.664062976837158,
"epoch": 0.10399100618324901,
"kl": 1.558085858821869,
"learning_rate": 9.989657110736367e-07,
"loss": 0.015460247173905373,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.53125,
"reward": 1.8562500178813934,
"reward_std": 0.762683317065239,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8562500178813934,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 10.529687881469727,
"epoch": 0.1045531197301855,
"kl": 1.5969407558441162,
"learning_rate": 9.989600899381674e-07,
"loss": 0.012307515367865562,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.578125,
"reward": 1.8578124940395355,
"reward_std": 0.7931357175111771,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.765625,
"rewards/avg_2": 1.765625,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8578125238418579,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 10.448437929153442,
"epoch": 0.10511523327712198,
"kl": 1.5586650669574738,
"learning_rate": 9.98954468802698e-07,
"loss": 0.020341139286756516,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.609375,
"reward": 1.8765625059604645,
"reward_std": 0.8082123398780823,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8765625059604645,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 10.440625190734863,
"epoch": 0.10567734682405847,
"kl": 1.5379924774169922,
"learning_rate": 9.989488476672287e-07,
"loss": 0.011121533811092377,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.59375,
"reward": 1.857812523841858,
"reward_std": 0.8036759942770004,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.765625,
"rewards/avg_7": 1.71875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8578125089406967,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 10.321875095367432,
"epoch": 0.10623946037099494,
"kl": 1.6033551394939423,
"learning_rate": 9.989432265317594e-07,
"loss": 0.009739244356751442,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.546875,
"reward": 1.878125011920929,
"reward_std": 0.7489920854568481,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.878125011920929,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 10.498437404632568,
"epoch": 0.10680157391793142,
"kl": 1.5498007535934448,
"learning_rate": 9.9893760539629e-07,
"loss": 0.03089289367198944,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.65625,
"reward": 1.8843750655651093,
"reward_std": 0.8420456349849701,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8843750059604645,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 10.40000033378601,
"epoch": 0.10736368746486791,
"kl": 1.5609517395496368,
"learning_rate": 9.989319842608206e-07,
"loss": 0.026409577578306198,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.671875,
"reward": 1.896875023841858,
"reward_std": 0.8327089995145798,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8968749940395355,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 10.593750238418579,
"epoch": 0.10792580101180438,
"kl": 1.5071128010749817,
"learning_rate": 9.989263631253512e-07,
"loss": 0.007572017144411802,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.578125,
"reward": 1.8796875178813934,
"reward_std": 0.770546168088913,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8796875178813934,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 10.456250190734863,
"epoch": 0.10848791455874086,
"kl": 1.519468516111374,
"learning_rate": 9.98920741989882e-07,
"loss": 0.020838668569922447,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.65625,
"reward": 1.9031250476837158,
"reward_std": 0.81875379383564,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9031250029802322,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 10.401562690734863,
"epoch": 0.10905002810567735,
"kl": 1.6071525514125824,
"learning_rate": 9.989151208544126e-07,
"loss": 0.01021914929151535,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.5,
"reward": 1.8750000298023224,
"reward_std": 0.7244045436382294,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.75,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8750000149011612,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 10.456250429153442,
"epoch": 0.10961214165261383,
"kl": 1.5413286685943604,
"learning_rate": 9.989094997189433e-07,
"loss": 0.025953728705644608,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.609375,
"reward": 1.901562511920929,
"reward_std": 0.7901591062545776,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.901562511920929,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 10.44687533378601,
"epoch": 0.1101742551995503,
"kl": 1.528151273727417,
"learning_rate": 9.989038785834737e-07,
"loss": 0.03157404437661171,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.703125,
"reward": 1.9249999821186066,
"reward_std": 0.8380978256464005,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9250000268220901,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 10.5,
"epoch": 0.11073636874648679,
"kl": 1.5467852056026459,
"learning_rate": 9.988982574480044e-07,
"loss": 0.010904987342655659,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.625,
"reward": 1.8843750357627869,
"reward_std": 0.8058620393276215,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.885937511920929,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 10.365625143051147,
"epoch": 0.11129848229342328,
"kl": 1.5382420718669891,
"learning_rate": 9.98892636312535e-07,
"loss": 0.01803477108478546,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.625,
"reward": 1.8890624940395355,
"reward_std": 0.8067688345909119,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8890625238418579,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 10.318750143051147,
"epoch": 0.11186059584035975,
"kl": 1.6021558046340942,
"learning_rate": 9.988870151770658e-07,
"loss": 0.004164498299360275,
"ratio/all_0": 0.09375,
"ratio/all_2": 0.515625,
"reward": 1.810937523841858,
"reward_std": 0.7804713249206543,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.75,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.75,
"rewards/avg_5": 1.71875,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.6875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8109375089406967,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 10.292187452316284,
"epoch": 0.11242270938729623,
"kl": 1.6093721091747284,
"learning_rate": 9.988813940415962e-07,
"loss": 0.024522338062524796,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.65625,
"reward": 1.889062523841858,
"reward_std": 0.8257010728120804,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8890625089406967,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 10.400000095367432,
"epoch": 0.11298482293423272,
"kl": 1.527627944946289,
"learning_rate": 9.98875772906127e-07,
"loss": 0.015955956652760506,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.671875,
"reward": 1.8953125178813934,
"reward_std": 0.8191908448934555,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8953125178813934,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 10.30625057220459,
"epoch": 0.1135469364811692,
"kl": 1.6383016109466553,
"learning_rate": 9.988701517706576e-07,
"loss": 0.035872433334589005,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.625,
"reward": 1.9234375357627869,
"reward_std": 0.7858867347240448,
"rewards/avg_0": 1.96875,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.953125,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9234375059604645,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 10.281250238418579,
"epoch": 0.11410905002810567,
"kl": 1.553186446428299,
"learning_rate": 9.988645306351883e-07,
"loss": 0.007708997465670109,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.5625,
"reward": 1.8515625298023224,
"reward_std": 0.7847436964511871,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.765625,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.734375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8515625,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 10.273437738418579,
"epoch": 0.11467116357504216,
"kl": 1.5798857808113098,
"learning_rate": 9.98858909499719e-07,
"loss": 0.03088054247200489,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.6875,
"reward": 1.8953125178813934,
"reward_std": 0.8464910984039307,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8953125178813934,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 10.371875286102295,
"epoch": 0.11523327712197864,
"kl": 1.6130147278308868,
"learning_rate": 9.988532883642496e-07,
"loss": 0.012614361010491848,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.609375,
"reward": 1.8843750357627869,
"reward_std": 0.7893950045108795,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8843750059604645,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 10.323437690734863,
"epoch": 0.11579539066891512,
"kl": 1.5593442618846893,
"learning_rate": 9.988476672287803e-07,
"loss": 0.016722215339541435,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.609375,
"reward": 1.8843750059604645,
"reward_std": 0.8005313277244568,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8859375268220901,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 10.379688024520874,
"epoch": 0.1163575042158516,
"kl": 1.5896699726581573,
"learning_rate": 9.988420460933108e-07,
"loss": 0.010505123063921928,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.546875,
"reward": 1.834375023841858,
"reward_std": 0.7868907004594803,
"rewards/avg_0": 1.75,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.75,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8343750089406967,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 10.426562547683716,
"epoch": 0.11691961776278809,
"kl": 1.5763377845287323,
"learning_rate": 9.988364249578414e-07,
"loss": 0.008979510515928268,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.609375,
"reward": 1.8640625178813934,
"reward_std": 0.7982265949249268,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8640625178813934,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 10.460937738418579,
"epoch": 0.11748173130972456,
"kl": 1.5620132386684418,
"learning_rate": 9.988308038223721e-07,
"loss": 0.015451719984412193,
"ratio/all_0": 0.0,
"ratio/all_2": 0.59375,
"reward": 1.9062500298023224,
"reward_std": 0.769840195775032,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9062500149011612,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 10.368750095367432,
"epoch": 0.11804384485666104,
"kl": 1.602480798959732,
"learning_rate": 9.988251826869028e-07,
"loss": 0.005209244322031736,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.484375,
"reward": 1.8562500178813934,
"reward_std": 0.7441752403974533,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.703125,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8578125089406967,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 10.512500286102295,
"epoch": 0.11860595840359753,
"kl": 1.5443048775196075,
"learning_rate": 9.988195615514333e-07,
"loss": 0.023446325212717056,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6875,
"reward": 1.9015625417232513,
"reward_std": 0.8415164798498154,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9015624970197678,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 10.526562690734863,
"epoch": 0.11916807195053401,
"kl": 1.4916993379592896,
"learning_rate": 9.98813940415964e-07,
"loss": 0.007108633406460285,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.453125,
"reward": 1.8437500298023224,
"reward_std": 0.7147433012723923,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8453125208616257,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 10.500000238418579,
"epoch": 0.11973018549747048,
"kl": 1.5132508873939514,
"learning_rate": 9.988083192804946e-07,
"loss": 0.009618530049920082,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.59375,
"reward": 1.8921875059604645,
"reward_std": 0.7719315439462662,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8921875059604645,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 10.487500190734863,
"epoch": 0.12029229904440697,
"kl": 1.5387333631515503,
"learning_rate": 9.988026981450253e-07,
"loss": 0.016562655568122864,
"ratio/all_0": 0.0,
"ratio/all_2": 0.625,
"reward": 1.9156250059604645,
"reward_std": 0.7766409516334534,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.953125,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9156250208616257,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 10.473437786102295,
"epoch": 0.12085441259134345,
"kl": 1.6194742918014526,
"learning_rate": 9.987970770095558e-07,
"loss": 0.023598913103342056,
"ratio/all_0": 0.0,
"ratio/all_2": 0.5625,
"reward": 1.9140625298023224,
"reward_std": 0.7459004819393158,
"rewards/avg_0": 1.9375,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9140625,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 10.490625143051147,
"epoch": 0.12141652613827993,
"kl": 1.5474883019924164,
"learning_rate": 9.987914558740864e-07,
"loss": -0.01752345636487007,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.546875,
"reward": 1.8328125178813934,
"reward_std": 0.7599477916955948,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.765625,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.765625,
"rewards/avg_6": 1.75,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8328125178813934,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 10.504687786102295,
"epoch": 0.12197863968521641,
"kl": 1.5022177398204803,
"learning_rate": 9.987858347386171e-07,
"loss": 0.027818644419312477,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.625,
"reward": 1.9078125357627869,
"reward_std": 0.805649921298027,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9078125208616257,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 10.465625286102295,
"epoch": 0.1225407532321529,
"kl": 1.5558375120162964,
"learning_rate": 9.987802136031478e-07,
"loss": 0.010633164085447788,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.5625,
"reward": 1.8484375476837158,
"reward_std": 0.7891297936439514,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.75,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8484375178813934,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 10.471875190734863,
"epoch": 0.12310286677908938,
"kl": 1.5038470327854156,
"learning_rate": 9.987745924676785e-07,
"loss": -0.008047796785831451,
"ratio/all_0": 0.0,
"ratio/all_2": 0.578125,
"reward": 1.865625023841858,
"reward_std": 0.7684888392686844,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.765625,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.765625,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8656249940395355,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 10.456250190734863,
"epoch": 0.12366498032602585,
"kl": 1.557765245437622,
"learning_rate": 9.987689713322091e-07,
"loss": 0.0018783528357744217,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.59375,
"reward": 1.862500011920929,
"reward_std": 0.7834757715463638,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.862500011920929,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 10.362500190734863,
"epoch": 0.12422709387296234,
"kl": 1.5848875045776367,
"learning_rate": 9.987633501967398e-07,
"loss": -0.004130351357161999,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.515625,
"reward": 1.854687511920929,
"reward_std": 0.7420046925544739,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.734375,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9968750178813934,
"rewards/point_reward": 0.8578125238418579,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 10.387500047683716,
"epoch": 0.12478920741989882,
"kl": 1.5096738934516907,
"learning_rate": 9.987577290612703e-07,
"loss": 0.017767585813999176,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.609375,
"reward": 1.870312511920929,
"reward_std": 0.8075316995382309,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.78125,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.870312511920929,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 10.434375047683716,
"epoch": 0.1253513209668353,
"kl": 1.586805284023285,
"learning_rate": 9.98752107925801e-07,
"loss": 0.028075214475393295,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.65625,
"reward": 1.9109375178813934,
"reward_std": 0.8107988387346268,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.953125,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9109375178813934,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 10.417187690734863,
"epoch": 0.1259134345137718,
"kl": 1.5914437770843506,
"learning_rate": 9.987464867903316e-07,
"loss": 0.022167479619383812,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.640625,
"reward": 1.8843750357627869,
"reward_std": 0.8190177977085114,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8843750208616257,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 10.428125143051147,
"epoch": 0.12647554806070826,
"kl": 1.5555630326271057,
"learning_rate": 9.987408656548623e-07,
"loss": 0.027459319680929184,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.609375,
"reward": 1.9140625298023224,
"reward_std": 0.7828859984874725,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.9375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.9375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9140625149011612,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 10.518750190734863,
"epoch": 0.12703766160764474,
"kl": 1.540842980146408,
"learning_rate": 9.987352445193928e-07,
"loss": -0.01165848784148693,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.484375,
"reward": 1.8125000596046448,
"reward_std": 0.7399534285068512,
"rewards/avg_0": 1.734375,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.734375,
"rewards/avg_4": 1.734375,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.75,
"rewards/avg_7": 1.734375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8125,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 10.38437533378601,
"epoch": 0.12759977515458124,
"kl": 1.5715409219264984,
"learning_rate": 9.987296233839235e-07,
"loss": 0.0172878485172987,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.640625,
"reward": 1.8828125,
"reward_std": 0.8174593150615692,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8828125149011612,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 10.49375033378601,
"epoch": 0.1281618887015177,
"kl": 1.44660022854805,
"learning_rate": 9.987240022484541e-07,
"loss": 0.004869728349149227,
"ratio/all_0": 0.0,
"ratio/all_2": 0.5625,
"reward": 1.8906250298023224,
"reward_std": 0.7453107237815857,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.9375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8906250149011612,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 10.456250429153442,
"epoch": 0.12872400224845418,
"kl": 1.543150782585144,
"learning_rate": 9.987183811129848e-07,
"loss": 0.021657878533005714,
"ratio/all_0": 0.0,
"ratio/all_2": 0.671875,
"reward": 1.9218750298023224,
"reward_std": 0.8066779375076294,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.96875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9218750149011612,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 10.453125238418579,
"epoch": 0.12928611579539068,
"kl": 1.5754946768283844,
"learning_rate": 9.987127599775153e-07,
"loss": 0.007582786493003368,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.609375,
"reward": 1.8843750357627869,
"reward_std": 0.7794698178768158,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8843750059604645,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 10.390625238418579,
"epoch": 0.12984822934232715,
"kl": 1.5105478763580322,
"learning_rate": 9.98707138842046e-07,
"loss": 0.012311361730098724,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.625,
"reward": 1.854687511920929,
"reward_std": 0.833659902215004,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.765625,
"rewards/avg_7": 1.765625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8562500178813934,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 10.39218783378601,
"epoch": 0.13041034288926362,
"kl": 1.5355972051620483,
"learning_rate": 9.987015177065766e-07,
"loss": 0.012546722777187824,
"ratio/all_0": 0.0,
"ratio/all_2": 0.71875,
"reward": 1.9078125357627869,
"reward_std": 0.8449338972568512,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9078125208616257,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 10.446875095367432,
"epoch": 0.13097245643620012,
"kl": 1.5727481544017792,
"learning_rate": 9.986958965711073e-07,
"loss": 0.004496240522712469,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.640625,
"reward": 1.8906250298023224,
"reward_std": 0.8004075884819031,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8921875208616257,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 10.396875381469727,
"epoch": 0.1315345699831366,
"kl": 1.5587262213230133,
"learning_rate": 9.98690275435638e-07,
"loss": -0.0068831006065011024,
"ratio/all_0": 0.0,
"ratio/all_2": 0.5625,
"reward": 1.875,
"reward_std": 0.7379455119371414,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8750000149011612,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 10.4609375,
"epoch": 0.13209668353007306,
"kl": 1.5907757580280304,
"learning_rate": 9.986846543001687e-07,
"loss": 0.03402874246239662,
"ratio/all_0": 0.109375,
"ratio/all_2": 0.578125,
"reward": 1.850000023841858,
"reward_std": 0.8150699734687805,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8499999940395355,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 10.531250238418579,
"epoch": 0.13265879707700956,
"kl": 1.5824647843837738,
"learning_rate": 9.986790331646993e-07,
"loss": 0.021441660821437836,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.65625,
"reward": 1.901562511920929,
"reward_std": 0.8082363605499268,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.901562511920929,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 10.535937547683716,
"epoch": 0.13322091062394603,
"kl": 1.5818893015384674,
"learning_rate": 9.986734120292298e-07,
"loss": 0.0017864266410470009,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.578125,
"reward": 1.873437523841858,
"reward_std": 0.7641280144453049,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8734375089406967,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 10.562500238418579,
"epoch": 0.13378302417088253,
"kl": 1.5562168657779694,
"learning_rate": 9.986677908937605e-07,
"loss": 0.015783432871103287,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.578125,
"reward": 1.8796875178813934,
"reward_std": 0.7744939774274826,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8796875178813934,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 10.482812643051147,
"epoch": 0.134345137717819,
"kl": 1.5716504752635956,
"learning_rate": 9.986621697582912e-07,
"loss": 0.026386769488453865,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6875,
"reward": 1.9031250178813934,
"reward_std": 0.8459218442440033,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.953125,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.9046875238418579,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 10.529687643051147,
"epoch": 0.13490725126475547,
"kl": 1.5155498683452606,
"learning_rate": 9.986565486228218e-07,
"loss": 0.028578244149684906,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.625,
"reward": 1.8953125476837158,
"reward_std": 0.8043807148933411,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8968750238418579,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 10.468750238418579,
"epoch": 0.13546936481169197,
"kl": 1.5957022309303284,
"learning_rate": 9.986509274873523e-07,
"loss": 0.030983472242951393,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.625,
"reward": 1.9171875417232513,
"reward_std": 0.7883910536766052,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.917187511920929,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 10.460937738418579,
"epoch": 0.13603147835862844,
"kl": 1.5496326088905334,
"learning_rate": 9.98645306351883e-07,
"loss": 0.019616173580288887,
"ratio/all_0": 0.0,
"ratio/all_2": 0.578125,
"reward": 1.9156250059604645,
"reward_std": 0.7500213980674744,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9156250208616257,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 10.590625286102295,
"epoch": 0.13659359190556492,
"kl": 1.5903447270393372,
"learning_rate": 9.986396852164137e-07,
"loss": 0.005906625185161829,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.5,
"reward": 1.8656249940395355,
"reward_std": 0.7259642332792282,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8656250238418579,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 10.357812643051147,
"epoch": 0.13715570545250141,
"kl": 1.5682716965675354,
"learning_rate": 9.986340640809443e-07,
"loss": 0.027652684599161148,
"ratio/all_0": 0.09375,
"ratio/all_2": 0.578125,
"reward": 1.8531250357627869,
"reward_std": 0.8177826255559921,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8531250059604645,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 10.401562690734863,
"epoch": 0.13771781899943789,
"kl": 1.5091069638729095,
"learning_rate": 9.98628442945475e-07,
"loss": 0.00808423850685358,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.671875,
"reward": 1.870312511920929,
"reward_std": 0.83954256772995,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8703125268220901,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 10.432812690734863,
"epoch": 0.13827993254637436,
"kl": 1.5255979001522064,
"learning_rate": 9.986228218100055e-07,
"loss": 0.013790331780910492,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.59375,
"reward": 1.8796875178813934,
"reward_std": 0.7863009124994278,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8796875178813934,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 10.234375238418579,
"epoch": 0.13884204609331086,
"kl": 1.5829472541809082,
"learning_rate": 9.986172006745362e-07,
"loss": 0.02624700777232647,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.671875,
"reward": 1.904687523841858,
"reward_std": 0.8320054858922958,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9046875089406967,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 10.339062452316284,
"epoch": 0.13940415964024733,
"kl": 1.5977038741111755,
"learning_rate": 9.986115795390668e-07,
"loss": 0.006217719055712223,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.5625,
"reward": 1.8515625,
"reward_std": 0.7779113799333572,
"rewards/avg_0": 1.75,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8515625298023224,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 10.41562533378601,
"epoch": 0.1399662731871838,
"kl": 1.5816867351531982,
"learning_rate": 9.986059584035975e-07,
"loss": 0.014478843659162521,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.578125,
"reward": 1.8656249940395355,
"reward_std": 0.7894555926322937,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.765625,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8656250238418579,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 10.537500143051147,
"epoch": 0.1405283867341203,
"kl": 1.564496248960495,
"learning_rate": 9.986003372681282e-07,
"loss": 0.0180707685649395,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.578125,
"reward": 1.8421875536441803,
"reward_std": 0.8131468445062637,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.765625,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8437500149011612,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 10.4375,
"epoch": 0.14109050028105677,
"kl": 1.5500454604625702,
"learning_rate": 9.985947161326589e-07,
"loss": 0.02213384583592415,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.609375,
"reward": 1.8765625357627869,
"reward_std": 0.8081517666578293,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8765625208616257,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 10.276562690734863,
"epoch": 0.14165261382799327,
"kl": 1.6070986092090607,
"learning_rate": 9.985890949971893e-07,
"loss": 0.02211572788655758,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.625,
"reward": 1.881250023841858,
"reward_std": 0.8086518347263336,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8812500089406967,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 10.381250143051147,
"epoch": 0.14221472737492974,
"kl": 1.4865486919879913,
"learning_rate": 9.9858347386172e-07,
"loss": 0.013803028501570225,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.546875,
"reward": 1.8609375357627869,
"reward_std": 0.7731085568666458,
"rewards/avg_0": 1.75,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.78125,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8609375208616257,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 10.46250033378601,
"epoch": 0.1427768409218662,
"kl": 1.562472254037857,
"learning_rate": 9.985778527262507e-07,
"loss": 0.0062060109339654446,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.5625,
"reward": 1.8687500059604645,
"reward_std": 0.763007864356041,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8687500208616257,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 10.40781283378601,
"epoch": 0.1433389544688027,
"kl": 1.6123566925525665,
"learning_rate": 9.985722315907814e-07,
"loss": 0.01719619520008564,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.59375,
"reward": 1.8640625476837158,
"reward_std": 0.797432154417038,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8640625029802322,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 10.40781283378601,
"epoch": 0.14390106801573918,
"kl": 1.5859326422214508,
"learning_rate": 9.985666104553118e-07,
"loss": 0.02508179470896721,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.609375,
"reward": 1.912500023841858,
"reward_std": 0.7811773121356964,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.9375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9125000089406967,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 10.42343783378601,
"epoch": 0.14446318156267565,
"kl": 1.5853928923606873,
"learning_rate": 9.985609893198425e-07,
"loss": 0.029612088575959206,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.671875,
"reward": 1.921875,
"reward_std": 0.8175742924213409,
"rewards/avg_0": 1.953125,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.9375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.9234375208616257,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 10.448437929153442,
"epoch": 0.14502529510961215,
"kl": 1.5741258263587952,
"learning_rate": 9.985553681843732e-07,
"loss": 0.0034783557057380676,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.546875,
"reward": 1.8625000417232513,
"reward_std": 0.7571201622486115,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.862500011920929,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 10.471875190734863,
"epoch": 0.14558740865654862,
"kl": 1.4908913373947144,
"learning_rate": 9.985497470489039e-07,
"loss": 0.0013852245174348354,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.546875,
"reward": 1.8765625357627869,
"reward_std": 0.7487268894910812,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.734375,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.878125011920929,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 10.364062786102295,
"epoch": 0.1461495222034851,
"kl": 1.5390149652957916,
"learning_rate": 9.985441259134345e-07,
"loss": 0.011133184656500816,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.53125,
"reward": 1.8687500357627869,
"reward_std": 0.7598303556442261,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.78125,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8687500059604645,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 10.467187643051147,
"epoch": 0.1467116357504216,
"kl": 1.6012259721755981,
"learning_rate": 9.98538504777965e-07,
"loss": 0.006842806003987789,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.546875,
"reward": 1.8734375536441803,
"reward_std": 0.745634064078331,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8734374940395355,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 10.485937595367432,
"epoch": 0.14727374929735806,
"kl": 1.5561028718948364,
"learning_rate": 9.985328836424957e-07,
"loss": 0.0026737055741250515,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.53125,
"reward": 1.8609375655651093,
"reward_std": 0.7467529475688934,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.71875,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8609375059604645,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 10.328125238418579,
"epoch": 0.14783586284429454,
"kl": 1.61723193526268,
"learning_rate": 9.985272625070264e-07,
"loss": 0.03143702447414398,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.65625,
"reward": 1.8937500417232513,
"reward_std": 0.8296476751565933,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8937499970197678,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 10.489062786102295,
"epoch": 0.14839797639123103,
"kl": 1.558815211057663,
"learning_rate": 9.98521641371557e-07,
"loss": 0.014333357103168964,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.625,
"reward": 1.8765625357627869,
"reward_std": 0.8043806999921799,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8765625059604645,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 10.493750095367432,
"epoch": 0.1489600899381675,
"kl": 1.5746314227581024,
"learning_rate": 9.985160202360877e-07,
"loss": 0.04017938673496246,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.6875,
"reward": 1.8921875357627869,
"reward_std": 0.8675475120544434,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8921875059604645,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 10.496875286102295,
"epoch": 0.14952220348510398,
"kl": 1.532246470451355,
"learning_rate": 9.985103991006184e-07,
"loss": -0.004921544808894396,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.578125,
"reward": 1.8421874940395355,
"reward_std": 0.7891298532485962,
"rewards/avg_0": 1.765625,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.765625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8421875238418579,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 10.459375143051147,
"epoch": 0.15008431703204048,
"kl": 1.5063121914863586,
"learning_rate": 9.985047779651489e-07,
"loss": 0.02832210063934326,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.703125,
"reward": 1.9000000357627869,
"reward_std": 0.8565918058156967,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9000000059604645,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 10.609375238418579,
"epoch": 0.15064643057897695,
"kl": 1.541852205991745,
"learning_rate": 9.984991568296795e-07,
"loss": 0.00951352994889021,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.640625,
"reward": 1.8765625357627869,
"reward_std": 0.8181691318750381,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.878125011920929,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 10.385937452316284,
"epoch": 0.15120854412591345,
"kl": 1.6570470035076141,
"learning_rate": 9.984935356942102e-07,
"loss": 0.027027206495404243,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.609375,
"reward": 1.904687523841858,
"reward_std": 0.7799990177154541,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.953125,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9046875238418579,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 10.460937738418579,
"epoch": 0.15177065767284992,
"kl": 1.6034429669380188,
"learning_rate": 9.984879145587409e-07,
"loss": 0.015010214410722256,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.609375,
"reward": 1.842187523841858,
"reward_std": 0.8224339187145233,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.765625,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8421875089406967,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 10.409375190734863,
"epoch": 0.1523327712197864,
"kl": 1.6018818020820618,
"learning_rate": 9.984822934232716e-07,
"loss": 0.033464521169662476,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.609375,
"reward": 1.901562511920929,
"reward_std": 0.7854724824428558,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.901562511920929,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 10.456249952316284,
"epoch": 0.1528948847667229,
"kl": 1.558174967765808,
"learning_rate": 9.98476672287802e-07,
"loss": 0.02480011060833931,
"ratio/all_0": 0.09375,
"ratio/all_2": 0.640625,
"reward": 1.850000023841858,
"reward_std": 0.8561535179615021,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8500000238418579,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 10.542187452316284,
"epoch": 0.15345699831365936,
"kl": 1.4964491426944733,
"learning_rate": 9.984710511523327e-07,
"loss": 0.037152573466300964,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.75,
"reward": 1.926562488079071,
"reward_std": 0.8781774640083313,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.9375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9265625327825546,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 10.478125095367432,
"epoch": 0.15401911186059583,
"kl": 1.6227607131004333,
"learning_rate": 9.984654300168634e-07,
"loss": 0.04696275666356087,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.703125,
"reward": 1.9234375357627869,
"reward_std": 0.8446333706378937,
"rewards/avg_0": 1.9375,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9234375208616257,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 10.473437786102295,
"epoch": 0.15458122540753233,
"kl": 1.5785993933677673,
"learning_rate": 9.98459808881394e-07,
"loss": 0.032174110412597656,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.671875,
"reward": 1.889062523841858,
"reward_std": 0.8476100713014603,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8890625089406967,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 10.475000381469727,
"epoch": 0.1551433389544688,
"kl": 1.6346567571163177,
"learning_rate": 9.984541877459245e-07,
"loss": 0.02995876967906952,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.578125,
"reward": 1.8984375,
"reward_std": 0.7702215760946274,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8984375149011612,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 10.375000238418579,
"epoch": 0.15570545250140527,
"kl": 1.5467197000980377,
"learning_rate": 9.984485666104552e-07,
"loss": 0.007414577528834343,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.609375,
"reward": 1.871874988079071,
"reward_std": 0.7875954359769821,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8718750327825546,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 10.318750381469727,
"epoch": 0.15626756604834177,
"kl": 1.605959415435791,
"learning_rate": 9.984429454749859e-07,
"loss": 0.016059426590800285,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.578125,
"reward": 1.8718750476837158,
"reward_std": 0.7766409367322922,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8718750178813934,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 10.562500238418579,
"epoch": 0.15682967959527824,
"kl": 1.5902847647666931,
"learning_rate": 9.984373243395166e-07,
"loss": 0.023069556802511215,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.640625,
"reward": 1.8875000476837158,
"reward_std": 0.8167774081230164,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8875000029802322,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 10.340625047683716,
"epoch": 0.15739179314221471,
"kl": 1.5816633105278015,
"learning_rate": 9.984317032040472e-07,
"loss": 0.0046473303809762,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.640625,
"reward": 1.881250023841858,
"reward_std": 0.8019672930240631,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8812500089406967,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 10.440625190734863,
"epoch": 0.1579539066891512,
"kl": 1.6009697616100311,
"learning_rate": 9.98426082068578e-07,
"loss": 0.02072141319513321,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.640625,
"reward": 1.9000000357627869,
"reward_std": 0.8073585629463196,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.9375,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9000000208616257,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 10.429687738418579,
"epoch": 0.15851602023608768,
"kl": 1.6120758056640625,
"learning_rate": 9.984204609331084e-07,
"loss": 0.004281133413314819,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.53125,
"reward": 1.857812523841858,
"reward_std": 0.7453094720840454,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8578125238418579,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 10.454687595367432,
"epoch": 0.15907813378302418,
"kl": 1.5999435186386108,
"learning_rate": 9.98414839797639e-07,
"loss": 0.017133023589849472,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.59375,
"reward": 1.8953125178813934,
"reward_std": 0.7781753242015839,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8953125178813934,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 10.448437690734863,
"epoch": 0.15964024732996066,
"kl": 1.5763249397277832,
"learning_rate": 9.984092186621697e-07,
"loss": 0.012010748498141766,
"ratio/all_0": 0.0,
"ratio/all_2": 0.609375,
"reward": 1.9031250476837158,
"reward_std": 0.7713418155908585,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.9375,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9031250178813934,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 10.237499952316284,
"epoch": 0.16020236087689713,
"kl": 1.6282682716846466,
"learning_rate": 9.984035975267004e-07,
"loss": 0.025732139125466347,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.65625,
"reward": 1.909375011920929,
"reward_std": 0.8114782422780991,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9093750268220901,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 10.395312786102295,
"epoch": 0.16076447442383363,
"kl": 1.6078749597072601,
"learning_rate": 9.98397976391231e-07,
"loss": 0.002353701274842024,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.453125,
"reward": 1.8125000298023224,
"reward_std": 0.7375993728637695,
"rewards/avg_0": 1.734375,
"rewards/avg_1": 1.765625,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.734375,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.75,
"rewards/avg_7": 1.734375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8125,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 10.382812738418579,
"epoch": 0.1613265879707701,
"kl": 1.615327775478363,
"learning_rate": 9.983923552557615e-07,
"loss": 0.000707289669662714,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.453125,
"reward": 1.8218750059604645,
"reward_std": 0.729204848408699,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.734375,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.765625,
"rewards/avg_6": 1.765625,
"rewards/avg_7": 1.75,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8218750208616257,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 10.442187786102295,
"epoch": 0.16188870151770657,
"kl": 1.5665834844112396,
"learning_rate": 9.983867341202922e-07,
"loss": 0.018719196319580078,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.609375,
"reward": 1.901562511920929,
"reward_std": 0.7854484617710114,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9015625268220901,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 10.479687452316284,
"epoch": 0.16245081506464307,
"kl": 1.5476877391338348,
"learning_rate": 9.98381112984823e-07,
"loss": 0.029271548613905907,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.65625,
"reward": 1.9218750298023224,
"reward_std": 0.805764839053154,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9218750149011612,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 10.376562595367432,
"epoch": 0.16301292861157954,
"kl": 1.5733246803283691,
"learning_rate": 9.983754918493536e-07,
"loss": 0.032239705324172974,
"ratio/all_0": 0.0,
"ratio/all_2": 0.671875,
"reward": 1.934374988079071,
"reward_std": 0.8063534200191498,
"rewards/avg_0": 1.953125,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.953125,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.953125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9343750178813934,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 10.378124952316284,
"epoch": 0.163575042158516,
"kl": 1.6404075622558594,
"learning_rate": 9.98369870713884e-07,
"loss": 0.029129423201084137,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.6875,
"reward": 1.8843750357627869,
"reward_std": 0.8548843264579773,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8843750059604645,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 10.44687533378601,
"epoch": 0.1641371557054525,
"kl": 1.5501161813735962,
"learning_rate": 9.983642495784147e-07,
"loss": 0.017258845269680023,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.59375,
"reward": 1.8921875357627869,
"reward_std": 0.7841527462005615,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8921875059604645,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 10.470312595367432,
"epoch": 0.16469926925238898,
"kl": 1.6193291246891022,
"learning_rate": 9.983586284429454e-07,
"loss": 0.004365533124655485,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.671875,
"reward": 1.8843750059604645,
"reward_std": 0.8215802311897278,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8843750208616257,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 10.409375190734863,
"epoch": 0.16526138279932545,
"kl": 1.625065267086029,
"learning_rate": 9.98353007307476e-07,
"loss": 0.031236734241247177,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.625,
"reward": 1.9171875417232513,
"reward_std": 0.7906895130872726,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.9375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.917187511920929,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 10.48593783378601,
"epoch": 0.16582349634626195,
"kl": 1.6580379605293274,
"learning_rate": 9.983473861720065e-07,
"loss": -0.010036187246441841,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.53125,
"reward": 1.8171875178813934,
"reward_std": 0.7649829834699631,
"rewards/avg_0": 1.75,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.75,
"rewards/avg_6": 1.734375,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8171875029802322,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 10.325000047683716,
"epoch": 0.16638560989319842,
"kl": 1.6484183073043823,
"learning_rate": 9.983417650365374e-07,
"loss": 0.011245986446738243,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.5625,
"reward": 1.857812523841858,
"reward_std": 0.7721955329179764,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8578125089406967,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 10.548437595367432,
"epoch": 0.16694772344013492,
"kl": 1.6036081910133362,
"learning_rate": 9.983361439010681e-07,
"loss": 0.02160033956170082,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.5625,
"reward": 1.885937511920929,
"reward_std": 0.7658355087041855,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.885937511920929,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 10.30625033378601,
"epoch": 0.1675098369870714,
"kl": 1.5906179249286652,
"learning_rate": 9.983305227655986e-07,
"loss": 0.023189380764961243,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.625,
"reward": 1.8828125298023224,
"reward_std": 0.8100359886884689,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8828125149011612,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 10.362500429153442,
"epoch": 0.16807195053400786,
"kl": 1.57911616563797,
"learning_rate": 9.983249016301292e-07,
"loss": 0.024665620177984238,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.640625,
"reward": 1.8953125476837158,
"reward_std": 0.813511535525322,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.9375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8953125029802322,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 10.420312643051147,
"epoch": 0.16863406408094436,
"kl": 1.5858178734779358,
"learning_rate": 9.9831928049466e-07,
"loss": 0.019302021712064743,
"ratio/all_0": 0.0,
"ratio/all_2": 0.5625,
"reward": 1.9109375178813934,
"reward_std": 0.7481989115476608,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9109375178813934,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 10.367187738418579,
"epoch": 0.16919617762788083,
"kl": 1.5977429747581482,
"learning_rate": 9.983136593591906e-07,
"loss": 0.03787188231945038,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.625,
"reward": 1.8937499821186066,
"reward_std": 0.8181628286838531,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8937500268220901,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 10.29062533378601,
"epoch": 0.1697582911748173,
"kl": 1.5648789703845978,
"learning_rate": 9.98308038223721e-07,
"loss": 0.011258106678724289,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.59375,
"reward": 1.865625023841858,
"reward_std": 0.7881258726119995,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8656250089406967,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 10.531250238418579,
"epoch": 0.1703204047217538,
"kl": 1.592081367969513,
"learning_rate": 9.983024170882517e-07,
"loss": 0.009078697301447392,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.625,
"reward": 1.8406250178813934,
"reward_std": 0.8274097889661789,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.75,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.78125,
"rewards/avg_7": 1.75,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8406250178813934,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 10.617187738418579,
"epoch": 0.17088251826869028,
"kl": 1.5056523084640503,
"learning_rate": 9.982967959527824e-07,
"loss": 0.014645126648247242,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.703125,
"reward": 1.9046874940395355,
"reward_std": 0.8363915532827377,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9046875238418579,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 10.526562690734863,
"epoch": 0.17144463181562675,
"kl": 1.6216383576393127,
"learning_rate": 9.98291174817313e-07,
"loss": 0.018544694408774376,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.65625,
"reward": 1.8843750059604645,
"reward_std": 0.822757288813591,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8843750208616257,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 10.337500095367432,
"epoch": 0.17200674536256325,
"kl": 1.5746294260025024,
"learning_rate": 9.982855536818436e-07,
"loss": 0.029133716598153114,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.75,
"reward": 1.9234375357627869,
"reward_std": 0.8675475269556046,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9234375059604645,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 10.406250238418579,
"epoch": 0.17256885890949972,
"kl": 1.6034705936908722,
"learning_rate": 9.982799325463742e-07,
"loss": 0.022716794162988663,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.609375,
"reward": 1.8796875178813934,
"reward_std": 0.8130151182413101,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8796875029802322,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 10.504687547683716,
"epoch": 0.1731309724564362,
"kl": 1.6118988692760468,
"learning_rate": 9.98274311410905e-07,
"loss": 0.025835514068603516,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.65625,
"reward": 1.9109375476837158,
"reward_std": 0.8112130612134933,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9109375029802322,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 10.340625286102295,
"epoch": 0.1736930860033727,
"kl": 1.5647907555103302,
"learning_rate": 9.982686902754356e-07,
"loss": -0.005121363326907158,
"ratio/all_0": 0.0,
"ratio/all_2": 0.515625,
"reward": 1.8703125417232513,
"reward_std": 0.7209883630275726,
"rewards/avg_0": 1.765625,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8703124970197678,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 10.40625,
"epoch": 0.17425519955030916,
"kl": 1.6206236481666565,
"learning_rate": 9.982630691399663e-07,
"loss": 0.014173053205013275,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.59375,
"reward": 1.854687511920929,
"reward_std": 0.8075632303953171,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.75,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.78125,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.854687511920929,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 10.479687690734863,
"epoch": 0.17481731309724563,
"kl": 1.4940095245838165,
"learning_rate": 9.98257448004497e-07,
"loss": -0.020530520007014275,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.515625,
"reward": 1.8250000476837158,
"reward_std": 0.7503218501806259,
"rewards/avg_0": 1.75,
"rewards/avg_1": 1.765625,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.71875,
"rewards/avg_7": 1.765625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8250000029802322,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 10.342187643051147,
"epoch": 0.17537942664418213,
"kl": 1.6300634145736694,
"learning_rate": 9.982518268690276e-07,
"loss": 0.021781207993626595,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.609375,
"reward": 1.889062523841858,
"reward_std": 0.7929854989051819,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8890625238418579,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 10.31406283378601,
"epoch": 0.1759415401911186,
"kl": 1.5964065194129944,
"learning_rate": 9.98246205733558e-07,
"loss": 0.019931331276893616,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.640625,
"reward": 1.8781250417232513,
"reward_std": 0.8184268027544022,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.878125011920929,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 10.473437786102295,
"epoch": 0.1765036537380551,
"kl": 1.5781501233577728,
"learning_rate": 9.982405845980888e-07,
"loss": 0.014711914584040642,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.609375,
"reward": 1.8828125298023224,
"reward_std": 0.7929855138063431,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8828125,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 10.487500190734863,
"epoch": 0.17706576728499157,
"kl": 1.5839548408985138,
"learning_rate": 9.982349634626194e-07,
"loss": 0.007520720828324556,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.5625,
"reward": 1.885937511920929,
"reward_std": 0.7497561424970627,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.885937511920929,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 10.245312690734863,
"epoch": 0.17762788083192804,
"kl": 1.586982786655426,
"learning_rate": 9.982293423271501e-07,
"loss": 0.00036374107003211975,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.546875,
"reward": 1.854687511920929,
"reward_std": 0.7707860469818115,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.765625,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.854687511920929,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 10.434375047683716,
"epoch": 0.17818999437886454,
"kl": 1.5729843974113464,
"learning_rate": 9.982237211916806e-07,
"loss": 0.007898079231381416,
"ratio/all_0": 0.0,
"ratio/all_2": 0.625,
"reward": 1.8953125476837158,
"reward_std": 0.7891903966665268,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8953125029802322,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 10.385937690734863,
"epoch": 0.178752107925801,
"kl": 1.5544759631156921,
"learning_rate": 9.982181000562113e-07,
"loss": 0.006270549260079861,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.515625,
"reward": 1.857812523841858,
"reward_std": 0.7436625808477402,
"rewards/avg_0": 1.75,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.75,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8578125089406967,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 10.350000381469727,
"epoch": 0.17931422147273748,
"kl": 1.5743531286716461,
"learning_rate": 9.98212478920742e-07,
"loss": 0.023270972073078156,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.59375,
"reward": 1.890625,
"reward_std": 0.7828847467899323,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8906250149011612,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 10.340625286102295,
"epoch": 0.17987633501967398,
"kl": 1.5790450274944305,
"learning_rate": 9.982068577852726e-07,
"loss": 0.02013951539993286,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.515625,
"reward": 1.865625023841858,
"reward_std": 0.7560012191534042,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.765625,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8656250089406967,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 10.237500190734863,
"epoch": 0.18043844856661045,
"kl": 1.616362452507019,
"learning_rate": 9.982012366498033e-07,
"loss": 0.0052131060510873795,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.46875,
"reward": 1.8421875536441803,
"reward_std": 0.7244033068418503,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.75,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8421875089406967,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 10.382812976837158,
"epoch": 0.18100056211354693,
"kl": 1.5818106830120087,
"learning_rate": 9.981956155143338e-07,
"loss": 0.014903640374541283,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.5625,
"reward": 1.842187523841858,
"reward_std": 0.7995538413524628,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.75,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.75,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8421875238418579,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 10.398437738418579,
"epoch": 0.18156267566048342,
"kl": 1.5807396471500397,
"learning_rate": 9.981899943788644e-07,
"loss": 0.0053629628382623196,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.53125,
"reward": 1.8484375476837158,
"reward_std": 0.7667220234870911,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.75,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8484375029802322,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 10.39218783378601,
"epoch": 0.1821247892074199,
"kl": 1.5583379864692688,
"learning_rate": 9.981843732433951e-07,
"loss": -0.00255510862916708,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.484375,
"reward": 1.8250000178813934,
"reward_std": 0.7457249462604523,
"rewards/avg_0": 1.71875,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.75,
"rewards/avg_3": 1.765625,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.765625,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8250000029802322,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 10.398437738418579,
"epoch": 0.18268690275435637,
"kl": 1.5831120014190674,
"learning_rate": 9.981787521079258e-07,
"loss": -0.011474616825580597,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.5,
"reward": 1.8171875178813934,
"reward_std": 0.7433380484580994,
"rewards/avg_0": 1.765625,
"rewards/avg_1": 1.75,
"rewards/avg_2": 1.765625,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.71875,
"rewards/avg_6": 1.765625,
"rewards/avg_7": 1.765625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8171875178813934,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 10.418750286102295,
"epoch": 0.18324901630129287,
"kl": 1.5725690126419067,
"learning_rate": 9.981731309724565e-07,
"loss": 0.016651010140776634,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.59375,
"reward": 1.8953125178813934,
"reward_std": 0.7747579514980316,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8953125029802322,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 10.435937643051147,
"epoch": 0.18381112984822934,
"kl": 1.591273546218872,
"learning_rate": 9.981675098369872e-07,
"loss": 0.013820771127939224,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.578125,
"reward": 1.8906250298023224,
"reward_std": 0.7669556438922882,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8906250149011612,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 10.481250047683716,
"epoch": 0.18437324339516584,
"kl": 1.6049018502235413,
"learning_rate": 9.981618887015176e-07,
"loss": -0.015562339685857296,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.46875,
"reward": 1.8421875536441803,
"reward_std": 0.6979580372571945,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.71875,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8421875089406967,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 10.570312738418579,
"epoch": 0.1849353569421023,
"kl": 1.596179574728012,
"learning_rate": 9.981562675660483e-07,
"loss": 0.015992822125554085,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.640625,
"reward": 1.8562500476837158,
"reward_std": 0.8342421501874924,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.765625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8562500029802322,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 10.570312738418579,
"epoch": 0.18549747048903878,
"kl": 1.5825116634368896,
"learning_rate": 9.98150646430579e-07,
"loss": -0.016910620033740997,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.484375,
"reward": 1.8078125417232513,
"reward_std": 0.7369780391454697,
"rewards/avg_0": 1.71875,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.75,
"rewards/avg_4": 1.765625,
"rewards/avg_5": 1.75,
"rewards/avg_6": 1.75,
"rewards/avg_7": 1.6875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.807812511920929,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 10.49375033378601,
"epoch": 0.18605958403597528,
"kl": 1.592354953289032,
"learning_rate": 9.981450252951096e-07,
"loss": 0.02186637371778488,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.59375,
"reward": 1.8921875357627869,
"reward_std": 0.7789381444454193,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8921875059604645,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 10.5546875,
"epoch": 0.18662169758291175,
"kl": 1.5267528891563416,
"learning_rate": 9.981394041596401e-07,
"loss": 0.032644666731357574,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.640625,
"reward": 1.889062523841858,
"reward_std": 0.8288520723581314,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8890625238418579,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 10.526562452316284,
"epoch": 0.18718381112984822,
"kl": 1.5329179763793945,
"learning_rate": 9.981337830241708e-07,
"loss": 0.0030208320822566748,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.5,
"reward": 1.862500011920929,
"reward_std": 0.7397526204586029,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.734375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8640625029802322,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 10.589062690734863,
"epoch": 0.18774592467678472,
"kl": 1.6342924535274506,
"learning_rate": 9.981281618887015e-07,
"loss": 0.0077019971795380116,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.5625,
"reward": 1.8656249940395355,
"reward_std": 0.7672802358865738,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.765625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8656250238418579,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 10.593750238418579,
"epoch": 0.1883080382237212,
"kl": 1.6173183023929596,
"learning_rate": 9.981225407532321e-07,
"loss": 0.03500716760754585,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.609375,
"reward": 1.901562511920929,
"reward_std": 0.7957247048616409,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.901562511920929,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 10.456250190734863,
"epoch": 0.18887015177065766,
"kl": 1.5762000679969788,
"learning_rate": 9.981169196177628e-07,
"loss": 0.02564232610166073,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.671875,
"reward": 1.9000000059604645,
"reward_std": 0.8302361965179443,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.953125,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9000000208616257,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 10.412500381469727,
"epoch": 0.18943226531759416,
"kl": 1.6118149757385254,
"learning_rate": 9.981112984822933e-07,
"loss": -0.015139142982661724,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.515625,
"reward": 1.792187511920929,
"reward_std": 0.780357614159584,
"rewards/avg_0": 1.765625,
"rewards/avg_1": 1.734375,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.71875,
"rewards/avg_4": 1.734375,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.6875,
"rewards/avg_7": 1.703125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.792187511920929,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 10.465625286102295,
"epoch": 0.18999437886453063,
"kl": 1.5685740411281586,
"learning_rate": 9.98105677346824e-07,
"loss": 0.013548189774155617,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.546875,
"reward": 1.8656249940395355,
"reward_std": 0.7624193429946899,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8656250089406967,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 10.489062547683716,
"epoch": 0.1905564924114671,
"kl": 1.6768341958522797,
"learning_rate": 9.981000562113546e-07,
"loss": 0.027506785467267036,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.609375,
"reward": 1.9078125357627869,
"reward_std": 0.7817645967006683,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.9375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9078125059604645,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 10.407812595367432,
"epoch": 0.1911186059584036,
"kl": 1.5220493972301483,
"learning_rate": 9.980944350758853e-07,
"loss": 0.014555826783180237,
"ratio/all_0": 0.0,
"ratio/all_2": 0.59375,
"reward": 1.90625,
"reward_std": 0.763274297118187,
"rewards/avg_0": 1.9375,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.90625,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 10.437500238418579,
"epoch": 0.19168071950534007,
"kl": 1.5265831649303436,
"learning_rate": 9.980888139404158e-07,
"loss": 0.010628137737512589,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.6875,
"reward": 1.8937500417232513,
"reward_std": 0.8350971043109894,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.893750011920929,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 10.521875143051147,
"epoch": 0.19224283305227655,
"kl": 1.488110363483429,
"learning_rate": 9.980831928049467e-07,
"loss": 0.006948402151465416,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.59375,
"reward": 1.889062523841858,
"reward_std": 0.774493932723999,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8890625089406967,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 10.481250286102295,
"epoch": 0.19280494659921305,
"kl": 1.5184958577156067,
"learning_rate": 9.980775716694771e-07,
"loss": 0.01944681815803051,
"ratio/all_0": 0.0,
"ratio/all_2": 0.671875,
"reward": 1.9156250357627869,
"reward_std": 0.8254118114709854,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.9375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.9375,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9156250059604645,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 10.668750286102295,
"epoch": 0.19336706014614952,
"kl": 1.4792628586292267,
"learning_rate": 9.980719505340078e-07,
"loss": 0.010150870308279991,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.59375,
"reward": 1.8953125476837158,
"reward_std": 0.7756129056215286,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.895312488079071,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 10.492187738418579,
"epoch": 0.19392917369308602,
"kl": 1.5462858974933624,
"learning_rate": 9.980663293985385e-07,
"loss": 0.006519682239741087,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.5625,
"reward": 1.8796875476837158,
"reward_std": 0.7582378536462784,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8796875029802322,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 10.500000238418579,
"epoch": 0.1944912872400225,
"kl": 1.6059279441833496,
"learning_rate": 9.980607082630692e-07,
"loss": -0.009531073272228241,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.53125,
"reward": 1.8375000357627869,
"reward_std": 0.7587139010429382,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.734375,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.75,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8375000059604645,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 10.500000238418579,
"epoch": 0.19505340078695896,
"kl": 1.581993728876114,
"learning_rate": 9.980550871275998e-07,
"loss": 0.027156025171279907,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.609375,
"reward": 1.8984375,
"reward_std": 0.7981103658676147,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8984375149011612,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 10.546875,
"epoch": 0.19561551433389546,
"kl": 1.5765255689620972,
"learning_rate": 9.980494659921303e-07,
"loss": 0.008464457467198372,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.5625,
"reward": 1.8718750476837158,
"reward_std": 0.7553205490112305,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8718750178813934,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 10.450000286102295,
"epoch": 0.19617762788083193,
"kl": 1.5645933151245117,
"learning_rate": 9.98043844856661e-07,
"loss": 0.024853838607668877,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.640625,
"reward": 1.8718750178813934,
"reward_std": 0.8353610932826996,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8718750029802322,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 10.345312595367432,
"epoch": 0.1967397414277684,
"kl": 1.6021571457386017,
"learning_rate": 9.980382237211917e-07,
"loss": 0.014802216552197933,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.640625,
"reward": 1.8609375059604645,
"reward_std": 0.8342433571815491,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8609375208616257,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 10.448437929153442,
"epoch": 0.1973018549747049,
"kl": 1.5465485453605652,
"learning_rate": 9.980326025857223e-07,
"loss": 0.015489017590880394,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6875,
"reward": 1.8843750059604645,
"reward_std": 0.8451384902000427,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8843750208616257,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 10.354687690734863,
"epoch": 0.19786396852164137,
"kl": 1.5961918532848358,
"learning_rate": 9.980269814502528e-07,
"loss": 0.0025184075348079205,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.65625,
"reward": 1.8828125596046448,
"reward_std": 0.8035257160663605,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8828125149011612,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 10.414062738418579,
"epoch": 0.19842608206857784,
"kl": 1.6324600279331207,
"learning_rate": 9.980213603147835e-07,
"loss": 0.03646010905504227,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.671875,
"reward": 1.896875023841858,
"reward_std": 0.8411010056734085,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8968750089406967,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 10.350000143051147,
"epoch": 0.19898819561551434,
"kl": 1.555162936449051,
"learning_rate": 9.980157391793142e-07,
"loss": 0.03622834011912346,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.703125,
"reward": 1.932812511920929,
"reward_std": 0.8318552523851395,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.9375,
"rewards/avg_3": 1.9375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.96875,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.932812511920929,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 10.462500095367432,
"epoch": 0.1995503091624508,
"kl": 1.6028335988521576,
"learning_rate": 9.980101180438448e-07,
"loss": 0.017967145889997482,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.65625,
"reward": 1.873437523841858,
"reward_std": 0.8339188247919083,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8734375089406967,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 10.381250381469727,
"epoch": 0.20011242270938728,
"kl": 1.5512114763259888,
"learning_rate": 9.980044969083753e-07,
"loss": 0.028043627738952637,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.671875,
"reward": 1.920312523841858,
"reward_std": 0.8178975880146027,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.9218750149011612,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 10.398437738418579,
"epoch": 0.20067453625632378,
"kl": 1.64331915974617,
"learning_rate": 9.979988757729062e-07,
"loss": 0.01075400784611702,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.53125,
"reward": 1.8484375476837158,
"reward_std": 0.7601196467876434,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.734375,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8484375178813934,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 10.457812547683716,
"epoch": 0.20123664980326025,
"kl": 1.6044352352619171,
"learning_rate": 9.979932546374367e-07,
"loss": 0.01619352400302887,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.578125,
"reward": 1.8796875476837158,
"reward_std": 0.7670706510543823,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8796875029802322,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 10.553125143051147,
"epoch": 0.20179876335019675,
"kl": 1.579399436712265,
"learning_rate": 9.979876335019673e-07,
"loss": 0.017630768939852715,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.5625,
"reward": 1.8687500059604645,
"reward_std": 0.7773216217756271,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8687500208616257,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 10.58750033378601,
"epoch": 0.20236087689713322,
"kl": 1.5743523836135864,
"learning_rate": 9.97982012366498e-07,
"loss": 0.028611905872821808,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.6875,
"reward": 1.8953125178813934,
"reward_std": 0.8501724600791931,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8953125178813934,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 10.462500095367432,
"epoch": 0.2029229904440697,
"kl": 1.5823079943656921,
"learning_rate": 9.979763912310287e-07,
"loss": 0.018070163205266,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.515625,
"reward": 1.889062523841858,
"reward_std": 0.7269922941923141,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8890625089406967,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 10.409375190734863,
"epoch": 0.2034851039910062,
"kl": 1.5337883830070496,
"learning_rate": 9.979707700955594e-07,
"loss": 0.02222316712141037,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.625,
"reward": 1.9124999940395355,
"reward_std": 0.7893053442239761,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9125000089406967,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 10.546875,
"epoch": 0.20404721753794267,
"kl": 1.4978066980838776,
"learning_rate": 9.979651489600898e-07,
"loss": 0.028549140319228172,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.59375,
"reward": 1.9078125357627869,
"reward_std": 0.7738133370876312,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9078125059604645,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 10.428125143051147,
"epoch": 0.20460933108487914,
"kl": 1.5625019669532776,
"learning_rate": 9.979595278246205e-07,
"loss": 0.022949017584323883,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.59375,
"reward": 1.893750011920929,
"reward_std": 0.7820322662591934,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.893750011920929,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 10.537500143051147,
"epoch": 0.20517144463181564,
"kl": 1.6440396308898926,
"learning_rate": 9.979539066891512e-07,
"loss": 0.012113522738218307,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.578125,
"reward": 1.8531250059604645,
"reward_std": 0.7996447384357452,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8531250059604645,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 10.534375190734863,
"epoch": 0.2057335581787521,
"kl": 1.5494819581508636,
"learning_rate": 9.979482855536819e-07,
"loss": 0.038545746356248856,
"ratio/all_0": 0.0,
"ratio/all_2": 0.75,
"reward": 1.9484375417232513,
"reward_std": 0.8534396290779114,
"rewards/avg_0": 1.9375,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.953125,
"rewards/avg_3": 1.9375,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.953125,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.9375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.948437511920929,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 10.525000095367432,
"epoch": 0.20629567172568858,
"kl": 1.5757357478141785,
"learning_rate": 9.979426644182123e-07,
"loss": 0.015849340707063675,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.578125,
"reward": 1.8718750178813934,
"reward_std": 0.7840642780065536,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8718750178813934,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 10.434375047683716,
"epoch": 0.20685778527262508,
"kl": 1.6400718986988068,
"learning_rate": 9.97937043282743e-07,
"loss": 0.0333583801984787,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.671875,
"reward": 1.909375011920929,
"reward_std": 0.8277343511581421,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.9375,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.909375011920929,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 10.59375,
"epoch": 0.20741989881956155,
"kl": 1.5465018153190613,
"learning_rate": 9.979314221472737e-07,
"loss": 0.015456119552254677,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.59375,
"reward": 1.8828125596046448,
"reward_std": 0.7825614660978317,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8828125,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 10.57968783378601,
"epoch": 0.20798201236649802,
"kl": 1.5840701758861542,
"learning_rate": 9.979258010118044e-07,
"loss": 0.013504572212696075,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.625,
"reward": 1.8812500536441803,
"reward_std": 0.8036747425794601,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8812500089406967,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 10.364062547683716,
"epoch": 0.20854412591343452,
"kl": 1.5801215767860413,
"learning_rate": 9.979201798763348e-07,
"loss": 0.007882913574576378,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.640625,
"reward": 1.8703125417232513,
"reward_std": 0.8143058866262436,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.870312511920929,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 10.595312595367432,
"epoch": 0.209106239460371,
"kl": 1.5764483511447906,
"learning_rate": 9.979145587408655e-07,
"loss": 0.02095562405884266,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.609375,
"reward": 1.8687500357627869,
"reward_std": 0.8148956596851349,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8687500059604645,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 10.579687595367432,
"epoch": 0.2096683530073075,
"kl": 1.5262295305728912,
"learning_rate": 9.979089376053964e-07,
"loss": -0.005769479088485241,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.53125,
"reward": 1.8609375059604645,
"reward_std": 0.7386855036020279,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.78125,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8609375208616257,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 10.607812404632568,
"epoch": 0.21023046655424396,
"kl": 1.5864124596118927,
"learning_rate": 9.979033164699269e-07,
"loss": 0.017187146469950676,
"ratio/all_0": 0.09375,
"ratio/all_2": 0.5,
"reward": 1.8375000357627869,
"reward_std": 0.7572944611310959,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.734375,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.78125,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8375000059604645,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 10.489062786102295,
"epoch": 0.21079258010118043,
"kl": 1.592396318912506,
"learning_rate": 9.978976953344575e-07,
"loss": 0.012976432219147682,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.5625,
"reward": 1.8859375417232513,
"reward_std": 0.7594173699617386,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.885937511920929,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 10.604687929153442,
"epoch": 0.21135469364811693,
"kl": 1.5425201952457428,
"learning_rate": 9.978920741989882e-07,
"loss": 0.02891225554049015,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.671875,
"reward": 1.9078125059604645,
"reward_std": 0.8302794843912125,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.909375011920929,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 10.482812881469727,
"epoch": 0.2119168071950534,
"kl": 1.5657863020896912,
"learning_rate": 9.978864530635189e-07,
"loss": 0.025767218321561813,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.671875,
"reward": 1.9015625417232513,
"reward_std": 0.8274085968732834,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.901562511920929,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 10.575000286102295,
"epoch": 0.21247892074198987,
"kl": 1.5857770144939423,
"learning_rate": 9.978808319280494e-07,
"loss": 0.01667657122015953,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.625,
"reward": 1.8765625357627869,
"reward_std": 0.8095056265592575,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8765625208616257,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 10.503125190734863,
"epoch": 0.21304103428892637,
"kl": 1.5959461033344269,
"learning_rate": 9.9787521079258e-07,
"loss": 0.02422156184911728,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.609375,
"reward": 1.8828125,
"reward_std": 0.79684117436409,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8828125149011612,
"step": 379
},
{
"clip_ratio": 0.0,
"completion_length": 10.410937547683716,
"epoch": 0.21360314783586284,
"kl": 1.606498122215271,
"learning_rate": 9.978695896571107e-07,
"loss": 0.031145939603447914,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.625,
"reward": 1.873437523841858,
"reward_std": 0.8260256797075272,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.765625,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8734375238418579,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 10.540625095367432,
"epoch": 0.21416526138279932,
"kl": 1.6109911799430847,
"learning_rate": 9.978639685216414e-07,
"loss": 0.027983110398054123,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.671875,
"reward": 1.893750011920929,
"reward_std": 0.8392192274332047,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.893750011920929,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 10.517187595367432,
"epoch": 0.21472737492973581,
"kl": 1.5722034871578217,
"learning_rate": 9.978583473861719e-07,
"loss": 0.008846685290336609,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.65625,
"reward": 1.8687500357627869,
"reward_std": 0.8315659910440445,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8687500059604645,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 10.415625095367432,
"epoch": 0.21528948847667229,
"kl": 1.5834924280643463,
"learning_rate": 9.978527262507025e-07,
"loss": 0.017309188842773438,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.640625,
"reward": 1.895312488079071,
"reward_std": 0.8090067058801651,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8953125327825546,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 10.556250095367432,
"epoch": 0.21585160202360876,
"kl": 1.5703161358833313,
"learning_rate": 9.978471051152332e-07,
"loss": 0.019084349274635315,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.671875,
"reward": 1.893750011920929,
"reward_std": 0.8291173279285431,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.893750011920929,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 10.362500190734863,
"epoch": 0.21641371557054526,
"kl": 1.5886655449867249,
"learning_rate": 9.978414839797639e-07,
"loss": 0.028999771922826767,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.625,
"reward": 1.9187500178813934,
"reward_std": 0.7858879417181015,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.9375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9187500029802322,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 10.295312643051147,
"epoch": 0.21697582911748173,
"kl": 1.6783213019371033,
"learning_rate": 9.978358628442946e-07,
"loss": 0.04459211230278015,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.6875,
"reward": 1.9031250178813934,
"reward_std": 0.8525859266519547,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9031250178813934,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 10.375,
"epoch": 0.2175379426644182,
"kl": 1.597625732421875,
"learning_rate": 9.97830241708825e-07,
"loss": 0.012688029557466507,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.546875,
"reward": 1.8765625357627869,
"reward_std": 0.7545565068721771,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.765625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8765625059604645,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 10.442187786102295,
"epoch": 0.2181000562113547,
"kl": 1.5851397812366486,
"learning_rate": 9.97824620573356e-07,
"loss": 0.03079412877559662,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.625,
"reward": 1.8984375298023224,
"reward_std": 0.8054997026920319,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8984375149011612,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 10.317187547683716,
"epoch": 0.21866216975829117,
"kl": 1.564811259508133,
"learning_rate": 9.978189994378864e-07,
"loss": 0.013478105887770653,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.609375,
"reward": 1.857812523841858,
"reward_std": 0.814572349190712,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.765625,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8578125089406967,
"step": 389
},
{
"clip_ratio": 0.0,
"completion_length": 10.317187786102295,
"epoch": 0.21922428330522767,
"kl": 1.582626074552536,
"learning_rate": 9.97813378302417e-07,
"loss": 0.030684249475598335,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.59375,
"reward": 1.8734374940395355,
"reward_std": 0.7998443990945816,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8734375238418579,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 10.339062690734863,
"epoch": 0.21978639685216414,
"kl": 1.6444810032844543,
"learning_rate": 9.978077571669477e-07,
"loss": 0.017499804496765137,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.65625,
"reward": 1.8984375596046448,
"reward_std": 0.8140418827533722,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8984375,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 10.404687881469727,
"epoch": 0.2203485103991006,
"kl": 1.6334162950515747,
"learning_rate": 9.978021360314784e-07,
"loss": 0.020157240331172943,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.578125,
"reward": 1.8687500059604645,
"reward_std": 0.7964091897010803,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.765625,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8703125268220901,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 10.318750381469727,
"epoch": 0.2209106239460371,
"kl": 1.6211105585098267,
"learning_rate": 9.977965148960089e-07,
"loss": 0.017307277768850327,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.578125,
"reward": 1.850000023841858,
"reward_std": 0.7988163083791733,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.765625,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8500000238418579,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 10.357812881469727,
"epoch": 0.22147273749297358,
"kl": 1.5798417031764984,
"learning_rate": 9.977908937605396e-07,
"loss": 0.019197724759578705,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.65625,
"reward": 1.8921875357627869,
"reward_std": 0.8184860795736313,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8921875059604645,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 10.396874904632568,
"epoch": 0.22203485103991005,
"kl": 1.5937089622020721,
"learning_rate": 9.977852726250702e-07,
"loss": -0.003021673299372196,
"ratio/all_0": 0.0,
"ratio/all_2": 0.578125,
"reward": 1.8718750476837158,
"reward_std": 0.766514927148819,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8718750178813934,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 10.267187595367432,
"epoch": 0.22259696458684655,
"kl": 1.5950027704238892,
"learning_rate": 9.97779651489601e-07,
"loss": 0.03185337409377098,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.703125,
"reward": 1.9000000059604645,
"reward_std": 0.8525859266519547,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9000000059604645,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 10.409375190734863,
"epoch": 0.22315907813378302,
"kl": 1.5446783006191254,
"learning_rate": 9.977740303541314e-07,
"loss": 0.020665908232331276,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.625,
"reward": 1.9015625417232513,
"reward_std": 0.7969914227724075,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.901562511920929,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 10.395312786102295,
"epoch": 0.2237211916807195,
"kl": 1.5845154523849487,
"learning_rate": 9.97768409218662e-07,
"loss": 0.0206165611743927,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.609375,
"reward": 1.8484375476837158,
"reward_std": 0.8286777585744858,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8484375029802322,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 10.417187690734863,
"epoch": 0.224283305227656,
"kl": 1.6193600594997406,
"learning_rate": 9.977627880831927e-07,
"loss": 0.030626408755779266,
"ratio/all_0": 0.0,
"ratio/all_2": 0.71875,
"reward": 1.932812511920929,
"reward_std": 0.8446698635816574,
"rewards/avg_0": 1.96875,
"rewards/avg_1": 1.96875,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.9375,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.9343750178813934,
"step": 399
},
{
"clip_ratio": 0.0,
"completion_length": 10.428125143051147,
"epoch": 0.22484541877459246,
"kl": 1.623527467250824,
"learning_rate": 9.977571669477234e-07,
"loss": 0.024629278108477592,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.71875,
"reward": 1.8828125298023224,
"reward_std": 0.8720838725566864,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8828125149011612,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 10.539062738418579,
"epoch": 0.22540753232152894,
"kl": 1.56095489859581,
"learning_rate": 9.97751545812254e-07,
"loss": -0.004245146177709103,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.46875,
"reward": 1.8296875357627869,
"reward_std": 0.7254326045513153,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.75,
"rewards/avg_7": 1.765625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8296875208616257,
"step": 401
},
{
"clip_ratio": 0.0,
"completion_length": 10.514062881469727,
"epoch": 0.22596964586846544,
"kl": 1.557588368654251,
"learning_rate": 9.977459246767845e-07,
"loss": 0.0035413671284914017,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.671875,
"reward": 1.885937511920929,
"reward_std": 0.8230249434709549,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8859375268220901,
"step": 402
},
{
"clip_ratio": 0.0,
"completion_length": 10.495312690734863,
"epoch": 0.2265317594154019,
"kl": 1.5131984651088715,
"learning_rate": 9.977403035413154e-07,
"loss": 0.02268044278025627,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.625,
"reward": 1.9093749821186066,
"reward_std": 0.793575257062912,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.953125,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9093750268220901,
"step": 403
},
{
"clip_ratio": 0.0,
"completion_length": 10.500000238418579,
"epoch": 0.2270938729623384,
"kl": 1.5539019405841827,
"learning_rate": 9.97734682405846e-07,
"loss": 0.006285407580435276,
"ratio/all_0": 0.09375,
"ratio/all_2": 0.53125,
"reward": 1.8171875178813934,
"reward_std": 0.7920168191194534,
"rewards/avg_0": 1.734375,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.734375,
"rewards/avg_3": 1.765625,
"rewards/avg_4": 1.75,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.765625,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8171875178813934,
"step": 404
},
{
"clip_ratio": 0.0,
"completion_length": 10.506250143051147,
"epoch": 0.22765598650927488,
"kl": 1.6138789355754852,
"learning_rate": 9.977290612703766e-07,
"loss": 0.011725908145308495,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.671875,
"reward": 1.8921875357627869,
"reward_std": 0.8255873620510101,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8921875208616257,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 10.471875190734863,
"epoch": 0.22821810005621135,
"kl": 1.526683360338211,
"learning_rate": 9.977234401349073e-07,
"loss": 0.027411654591560364,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.65625,
"reward": 1.9093750417232513,
"reward_std": 0.8125075399875641,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.953125,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.909375011920929,
"step": 406
},
{
"clip_ratio": 0.0,
"completion_length": 10.47812533378601,
"epoch": 0.22878021360314785,
"kl": 1.6264634728431702,
"learning_rate": 9.97717818999438e-07,
"loss": 0.01731857657432556,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.625,
"reward": 1.889062523841858,
"reward_std": 0.8020582050085068,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8890625089406967,
"step": 407
},
{
"clip_ratio": 0.0,
"completion_length": 10.4765625,
"epoch": 0.22934232715008432,
"kl": 1.6295714378356934,
"learning_rate": 9.977121978639684e-07,
"loss": 0.017600316554307938,
"ratio/all_0": 0.09375,
"ratio/all_2": 0.5625,
"reward": 1.8375000059604645,
"reward_std": 0.8005238026380539,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.765625,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8375000059604645,
"step": 408
},
{
"clip_ratio": 0.0,
"completion_length": 10.534375190734863,
"epoch": 0.2299044406970208,
"kl": 1.6157631576061249,
"learning_rate": 9.97706576728499e-07,
"loss": 0.015536559745669365,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.625,
"reward": 1.8968750536441803,
"reward_std": 0.7858879566192627,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8968749940395355,
"step": 409
},
{
"clip_ratio": 0.0,
"completion_length": 10.476562738418579,
"epoch": 0.2304665542439573,
"kl": 1.5186310112476349,
"learning_rate": 9.977009555930298e-07,
"loss": 0.020370997488498688,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.546875,
"reward": 1.8531250059604645,
"reward_std": 0.7863021939992905,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.75,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8531250208616257,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 10.484375238418579,
"epoch": 0.23102866779089376,
"kl": 1.554486483335495,
"learning_rate": 9.976953344575604e-07,
"loss": 0.016492923721671104,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.59375,
"reward": 1.8765625059604645,
"reward_std": 0.7910115867853165,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8765625059604645,
"step": 411
},
{
"clip_ratio": 0.0,
"completion_length": 10.554687738418579,
"epoch": 0.23159078133783023,
"kl": 1.5660745203495026,
"learning_rate": 9.976897133220911e-07,
"loss": 0.015180633403360844,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.53125,
"reward": 1.878125011920929,
"reward_std": 0.7512881010770798,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8781250268220901,
"step": 412
},
{
"clip_ratio": 0.0,
"completion_length": 10.562500238418579,
"epoch": 0.23215289488476673,
"kl": 1.5972300469875336,
"learning_rate": 9.976840921866216e-07,
"loss": -0.0018402566201984882,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.65625,
"reward": 1.859375,
"reward_std": 0.819339856505394,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8593750298023224,
"step": 413
},
{
"clip_ratio": 0.0,
"completion_length": 10.426562547683716,
"epoch": 0.2327150084317032,
"kl": 1.564846783876419,
"learning_rate": 9.976784710511522e-07,
"loss": 0.011204143986105919,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.484375,
"reward": 1.8796875476837158,
"reward_std": 0.7146098762750626,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8812500089406967,
"step": 414
},
{
"clip_ratio": 0.0,
"completion_length": 10.589062690734863,
"epoch": 0.23327712197863967,
"kl": 1.5199431777000427,
"learning_rate": 9.97672849915683e-07,
"loss": 0.01307937502861023,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.578125,
"reward": 1.8921875357627869,
"reward_std": 0.7638034969568253,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8921875059604645,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 10.439062595367432,
"epoch": 0.23383923552557617,
"kl": 1.6180415451526642,
"learning_rate": 9.976672287802136e-07,
"loss": 0.022075064480304718,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.625,
"reward": 1.9078125357627869,
"reward_std": 0.7796428948640823,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9078125208616257,
"step": 416
},
{
"clip_ratio": 0.0,
"completion_length": 10.46875,
"epoch": 0.23440134907251264,
"kl": 1.6099268794059753,
"learning_rate": 9.97661607644744e-07,
"loss": 0.03135301545262337,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.703125,
"reward": 1.8953125178813934,
"reward_std": 0.8648688793182373,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8953125029802322,
"step": 417
},
{
"clip_ratio": 0.0,
"completion_length": 10.514062643051147,
"epoch": 0.23496346261944911,
"kl": 1.589457243680954,
"learning_rate": 9.976559865092747e-07,
"loss": 0.007210549898445606,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.5625,
"reward": 1.8828125298023224,
"reward_std": 0.7537620812654495,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8828125149011612,
"step": 418
},
{
"clip_ratio": 0.0,
"completion_length": 10.415624856948853,
"epoch": 0.23552557616638561,
"kl": 1.6951603591442108,
"learning_rate": 9.976503653738054e-07,
"loss": 0.03695042431354523,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.609375,
"reward": 1.9000000357627869,
"reward_std": 0.7999933958053589,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9000000059604645,
"step": 419
},
{
"clip_ratio": 0.0,
"completion_length": 10.485937595367432,
"epoch": 0.23608768971332209,
"kl": 1.585318922996521,
"learning_rate": 9.97644744238336e-07,
"loss": 0.026742789894342422,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.640625,
"reward": 1.9031250178813934,
"reward_std": 0.8052344620227814,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9031250178813934,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 10.468750238418579,
"epoch": 0.23664980326025858,
"kl": 1.5903295278549194,
"learning_rate": 9.976391231028668e-07,
"loss": 0.015053427778184414,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.609375,
"reward": 1.8875000476837158,
"reward_std": 0.7833255380392075,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8875000029802322,
"step": 421
},
{
"clip_ratio": 0.0,
"completion_length": 10.546875238418579,
"epoch": 0.23721191680719506,
"kl": 1.6383749842643738,
"learning_rate": 9.976335019673975e-07,
"loss": -0.012720446102321148,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.515625,
"reward": 1.810937523841858,
"reward_std": 0.7563245743513107,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.765625,
"rewards/avg_2": 1.765625,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.703125,
"rewards/avg_6": 1.765625,
"rewards/avg_7": 1.734375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8109375089406967,
"step": 422
},
{
"clip_ratio": 0.0,
"completion_length": 10.335937738418579,
"epoch": 0.23777403035413153,
"kl": 1.6610612869262695,
"learning_rate": 9.976278808319281e-07,
"loss": 0.023902423679828644,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.640625,
"reward": 1.8828125,
"reward_std": 0.8249382674694061,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8828125149011612,
"step": 423
},
{
"clip_ratio": 0.0,
"completion_length": 10.459375143051147,
"epoch": 0.23833614390106803,
"kl": 1.5190238654613495,
"learning_rate": 9.976222596964586e-07,
"loss": 0.028971953317523003,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.640625,
"reward": 1.873437523841858,
"reward_std": 0.8383654952049255,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8734375089406967,
"step": 424
},
{
"clip_ratio": 0.0,
"completion_length": 10.475000381469727,
"epoch": 0.2388982574480045,
"kl": 1.6028160750865936,
"learning_rate": 9.976166385609893e-07,
"loss": 0.009334921836853027,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.5625,
"reward": 1.846875011920929,
"reward_std": 0.7874211072921753,
"rewards/avg_0": 1.765625,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.765625,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.846875011920929,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 10.36093783378601,
"epoch": 0.23946037099494097,
"kl": 1.523706704378128,
"learning_rate": 9.9761101742552e-07,
"loss": -0.0016508856788277626,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.703125,
"reward": 1.8828125298023224,
"reward_std": 0.8340931683778763,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8828125149011612,
"step": 426
},
{
"clip_ratio": 0.0,
"completion_length": 10.381250381469727,
"epoch": 0.24002248454187747,
"kl": 1.5795050263404846,
"learning_rate": 9.976053962900506e-07,
"loss": 0.010042035952210426,
"ratio/all_0": 0.0,
"ratio/all_2": 0.609375,
"reward": 1.9000000357627869,
"reward_std": 0.7759362161159515,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9000000059604645,
"step": 427
},
{
"clip_ratio": 0.0,
"completion_length": 10.395312786102295,
"epoch": 0.24058459808881394,
"kl": 1.561740130186081,
"learning_rate": 9.97599775154581e-07,
"loss": 0.010532597079873085,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6875,
"reward": 1.878125011920929,
"reward_std": 0.8440195620059967,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.878125011920929,
"step": 428
},
{
"clip_ratio": 0.0,
"completion_length": 10.446875095367432,
"epoch": 0.2411467116357504,
"kl": 1.5893949568271637,
"learning_rate": 9.975941540191118e-07,
"loss": 0.018870336934924126,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.625,
"reward": 1.873437523841858,
"reward_std": 0.8117434531450272,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8734375089406967,
"step": 429
},
{
"clip_ratio": 0.0,
"completion_length": 10.371875286102295,
"epoch": 0.2417088251826869,
"kl": 1.6396957039833069,
"learning_rate": 9.975885328836424e-07,
"loss": 0.015891946852207184,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.546875,
"reward": 1.8750000298023224,
"reward_std": 0.7589618265628815,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8765625059604645,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 10.46250033378601,
"epoch": 0.24227093872962338,
"kl": 1.5293635129928589,
"learning_rate": 9.975829117481731e-07,
"loss": 0.02730741538107395,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.625,
"reward": 1.8796875476837158,
"reward_std": 0.8261153250932693,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8796875178813934,
"step": 431
},
{
"clip_ratio": 0.0,
"completion_length": 10.426562786102295,
"epoch": 0.24283305227655985,
"kl": 1.5679107904434204,
"learning_rate": 9.975772906127036e-07,
"loss": 0.027399998158216476,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.65625,
"reward": 1.9109375178813934,
"reward_std": 0.8114770352840424,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.9375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9109375178813934,
"step": 432
},
{
"clip_ratio": 0.0,
"completion_length": 10.462500095367432,
"epoch": 0.24339516582349635,
"kl": 1.5263629853725433,
"learning_rate": 9.975716694772343e-07,
"loss": 0.028345463797450066,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.609375,
"reward": 1.9078125059604645,
"reward_std": 0.781880795955658,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9078125059604645,
"step": 433
},
{
"clip_ratio": 0.0,
"completion_length": 10.364062547683716,
"epoch": 0.24395727937043282,
"kl": 1.6106272339820862,
"learning_rate": 9.97566048341765e-07,
"loss": 0.027817612513899803,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.625,
"reward": 1.8875000178813934,
"reward_std": 0.8168696016073227,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8875000178813934,
"step": 434
},
{
"clip_ratio": 0.0,
"completion_length": 10.362500429153442,
"epoch": 0.24451939291736932,
"kl": 1.698000580072403,
"learning_rate": 9.975604272062956e-07,
"loss": 0.025879831984639168,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.578125,
"reward": 1.885937511920929,
"reward_std": 0.7702215611934662,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.885937511920929,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 10.412500143051147,
"epoch": 0.2450815064643058,
"kl": 1.6197902262210846,
"learning_rate": 9.975548060708263e-07,
"loss": 0.02497243694961071,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.625,
"reward": 1.870312511920929,
"reward_std": 0.8267063200473785,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.870312511920929,
"step": 436
},
{
"clip_ratio": 0.0,
"completion_length": 10.443750381469727,
"epoch": 0.24564362001124226,
"kl": 1.4857002794742584,
"learning_rate": 9.97549184935357e-07,
"loss": 0.011337099596858025,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.6875,
"reward": 1.9015625417232513,
"reward_std": 0.8217532634735107,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.901562511920929,
"step": 437
},
{
"clip_ratio": 0.0,
"completion_length": 10.474999904632568,
"epoch": 0.24620573355817876,
"kl": 1.6131562292575836,
"learning_rate": 9.975435637998877e-07,
"loss": 0.023611659184098244,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.59375,
"reward": 1.9078125059604645,
"reward_std": 0.7654212862253189,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9078125208616257,
"step": 438
},
{
"clip_ratio": 0.0,
"completion_length": 10.414062738418579,
"epoch": 0.24676784710511523,
"kl": 1.5450926423072815,
"learning_rate": 9.975379426644181e-07,
"loss": 0.006920941174030304,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.5625,
"reward": 1.881250023841858,
"reward_std": 0.7529083490371704,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8812500089406967,
"step": 439
},
{
"clip_ratio": 0.0,
"completion_length": 10.464062690734863,
"epoch": 0.2473299606520517,
"kl": 1.5010447204113007,
"learning_rate": 9.975323215289488e-07,
"loss": 0.03678865730762482,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.71875,
"reward": 1.925000011920929,
"reward_std": 0.8583438247442245,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.9265625178813934,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 10.3125,
"epoch": 0.2478920741989882,
"kl": 1.600469321012497,
"learning_rate": 9.975267003934795e-07,
"loss": 0.02798604406416416,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.671875,
"reward": 1.889062523841858,
"reward_std": 0.84578637778759,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8890625089406967,
"step": 441
},
{
"clip_ratio": 0.0,
"completion_length": 10.450000047683716,
"epoch": 0.24845418774592468,
"kl": 1.568362832069397,
"learning_rate": 9.975210792580102e-07,
"loss": 0.021247677505016327,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.59375,
"reward": 1.8921875357627869,
"reward_std": 0.7840049564838409,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8921875059604645,
"step": 442
},
{
"clip_ratio": 0.0,
"completion_length": 10.414062738418579,
"epoch": 0.24901630129286115,
"kl": 1.5653727054595947,
"learning_rate": 9.975154581225406e-07,
"loss": 0.00297177373431623,
"ratio/all_0": 0.09375,
"ratio/all_2": 0.546875,
"reward": 1.8359375298023224,
"reward_std": 0.7760512083768845,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.734375,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.78125,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8375000208616257,
"step": 443
},
{
"clip_ratio": 0.0,
"completion_length": 10.399999856948853,
"epoch": 0.24957841483979765,
"kl": 1.6329529285430908,
"learning_rate": 9.975098369870713e-07,
"loss": 0.023191925138235092,
"ratio/all_0": 0.0,
"ratio/all_2": 0.703125,
"reward": 1.9124999940395355,
"reward_std": 0.8387785255908966,
"rewards/avg_0": 1.953125,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9125000089406967,
"step": 444
},
{
"clip_ratio": 0.0,
"completion_length": 10.535937786102295,
"epoch": 0.2501405283867341,
"kl": 1.5339020490646362,
"learning_rate": 9.97504215851602e-07,
"loss": 0.04132520407438278,
"ratio/all_0": 0.0,
"ratio/all_2": 0.75,
"reward": 1.9562500417232513,
"reward_std": 0.8514669239521027,
"rewards/avg_0": 1.9375,
"rewards/avg_1": 1.96875,
"rewards/avg_2": 1.9375,
"rewards/avg_3": 1.9375,
"rewards/avg_4": 1.953125,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.953125,
"rewards/avg_7": 1.953125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.956250011920929,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 10.375000238418579,
"epoch": 0.2507026419336706,
"kl": 1.5444872379302979,
"learning_rate": 9.974985947161326e-07,
"loss": 0.032432883977890015,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.71875,
"reward": 1.9140625596046448,
"reward_std": 0.861393392086029,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.9375,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9140625149011612,
"step": 446
},
{
"clip_ratio": 0.0,
"completion_length": 10.546875238418579,
"epoch": 0.25126475548060706,
"kl": 1.5955162942409515,
"learning_rate": 9.974929735806631e-07,
"loss": 0.028477732092142105,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.671875,
"reward": 1.8906250298023224,
"reward_std": 0.8443440943956375,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8906250149011612,
"step": 447
},
{
"clip_ratio": 0.0,
"completion_length": 10.315625190734863,
"epoch": 0.2518268690275436,
"kl": 1.578765332698822,
"learning_rate": 9.974873524451938e-07,
"loss": 0.019804082810878754,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.703125,
"reward": 1.9078125357627869,
"reward_std": 0.8340931385755539,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9078125059604645,
"step": 448
},
{
"clip_ratio": 0.0,
"completion_length": 10.485937356948853,
"epoch": 0.25238898257448006,
"kl": 1.5700731873512268,
"learning_rate": 9.974817313097245e-07,
"loss": 0.030755460262298584,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.734375,
"reward": 1.9171875417232513,
"reward_std": 0.8595938086509705,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.917187511920929,
"step": 449
},
{
"clip_ratio": 0.0,
"completion_length": 10.506249904632568,
"epoch": 0.25295109612141653,
"kl": 1.6172282099723816,
"learning_rate": 9.974761101742551e-07,
"loss": 0.025174066424369812,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.625,
"reward": 1.8968749940395355,
"reward_std": 0.8019672930240631,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8968750089406967,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 10.470312595367432,
"epoch": 0.253513209668353,
"kl": 1.5827001631259918,
"learning_rate": 9.974704890387858e-07,
"loss": 0.027356263250112534,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.6875,
"reward": 1.8984375298023224,
"reward_std": 0.842809721827507,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8984375,
"step": 451
},
{
"clip_ratio": 0.0,
"completion_length": 10.407812595367432,
"epoch": 0.2540753232152895,
"kl": 1.5556872487068176,
"learning_rate": 9.974648679033165e-07,
"loss": 0.028104005381464958,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.625,
"reward": 1.9062500298023224,
"reward_std": 0.8070946335792542,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.90625,
"step": 452
},
{
"clip_ratio": 0.0,
"completion_length": 10.442187786102295,
"epoch": 0.25463743676222594,
"kl": 1.5716063380241394,
"learning_rate": 9.974592467678472e-07,
"loss": 0.02172784134745598,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.640625,
"reward": 1.8796875178813934,
"reward_std": 0.8272948265075684,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8796875178813934,
"step": 453
},
{
"clip_ratio": 0.0,
"completion_length": 10.418750286102295,
"epoch": 0.25519955030916247,
"kl": 1.5538042485713959,
"learning_rate": 9.974536256323776e-07,
"loss": 0.0027528153732419014,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.578125,
"reward": 1.8515625298023224,
"reward_std": 0.7925472110509872,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8515625149011612,
"step": 454
},
{
"clip_ratio": 0.0,
"completion_length": 10.464062929153442,
"epoch": 0.25576166385609894,
"kl": 1.5341682434082031,
"learning_rate": 9.974480044969083e-07,
"loss": 0.027387108653783798,
"ratio/all_0": 0.0,
"ratio/all_2": 0.734375,
"reward": 1.9328125417232513,
"reward_std": 0.8470215350389481,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.9375,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.953125,
"rewards/avg_5": 1.9375,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.932812511920929,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 10.554687738418579,
"epoch": 0.2563237774030354,
"kl": 1.5187903046607971,
"learning_rate": 9.97442383361439e-07,
"loss": 0.00495085958391428,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.578125,
"reward": 1.878125011920929,
"reward_std": 0.7772054225206375,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8781250268220901,
"step": 456
},
{
"clip_ratio": 0.0,
"completion_length": 10.518750190734863,
"epoch": 0.2568858909499719,
"kl": 1.5951554477214813,
"learning_rate": 9.974367622259697e-07,
"loss": 0.007676597684621811,
"ratio/all_0": 0.0,
"ratio/all_2": 0.640625,
"reward": 1.8953125178813934,
"reward_std": 0.7952839285135269,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8953125178813934,
"step": 457
},
{
"clip_ratio": 0.0,
"completion_length": 10.503125190734863,
"epoch": 0.25744800449690836,
"kl": 1.6238690912723541,
"learning_rate": 9.974311410905001e-07,
"loss": 0.031223241239786148,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6875,
"reward": 1.90625,
"reward_std": 0.8409267216920853,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9062500298023224,
"step": 458
},
{
"clip_ratio": 0.0,
"completion_length": 10.607812881469727,
"epoch": 0.2580101180438449,
"kl": 1.5360244810581207,
"learning_rate": 9.974255199550308e-07,
"loss": 0.009246742352843285,
"ratio/all_0": 0.0,
"ratio/all_2": 0.640625,
"reward": 1.896875023841858,
"reward_std": 0.8043554276227951,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8968750089406967,
"step": 459
},
{
"clip_ratio": 0.0,
"completion_length": 10.387500047683716,
"epoch": 0.25857223159078135,
"kl": 1.5769808888435364,
"learning_rate": 9.974198988195615e-07,
"loss": 0.008776286616921425,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.625,
"reward": 1.862500011920929,
"reward_std": 0.8146316558122635,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.78125,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8625000268220901,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 10.512500047683716,
"epoch": 0.2591343451377178,
"kl": 1.588375449180603,
"learning_rate": 9.974142776840922e-07,
"loss": 0.023817773908376694,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.625,
"reward": 1.8843750357627869,
"reward_std": 0.8069419264793396,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8843750059604645,
"step": 461
},
{
"clip_ratio": 0.0,
"completion_length": 10.506250143051147,
"epoch": 0.2596964586846543,
"kl": 1.561067909002304,
"learning_rate": 9.974086565486228e-07,
"loss": 0.014013110660016537,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.578125,
"reward": 1.8671875298023224,
"reward_std": 0.7865673452615738,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8671875,
"step": 462
},
{
"clip_ratio": 0.0,
"completion_length": 10.548437595367432,
"epoch": 0.26025857223159077,
"kl": 1.557085394859314,
"learning_rate": 9.974030354131533e-07,
"loss": 0.031534142792224884,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.78125,
"reward": 1.9296875298023224,
"reward_std": 0.8877490013837814,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.953125,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.9375,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9296875149011612,
"step": 463
},
{
"clip_ratio": 0.0,
"completion_length": 10.304687738418579,
"epoch": 0.26082068577852724,
"kl": 1.5971083343029022,
"learning_rate": 9.97397414277684e-07,
"loss": 0.015254084020853043,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.625,
"reward": 1.8546875417232513,
"reward_std": 0.8286777883768082,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.854687511920929,
"step": 464
},
{
"clip_ratio": 0.0,
"completion_length": 10.368750095367432,
"epoch": 0.26138279932546377,
"kl": 1.6045811474323273,
"learning_rate": 9.973917931422147e-07,
"loss": 0.022352047264575958,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.625,
"reward": 1.9046875536441803,
"reward_std": 0.7893041223287582,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.9375,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9046874940395355,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 10.520312547683716,
"epoch": 0.26194491287240024,
"kl": 1.5685151815414429,
"learning_rate": 9.973861720067453e-07,
"loss": 0.006927984766662121,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.578125,
"reward": 1.875,
"reward_std": 0.7674835920333862,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8750000149011612,
"step": 466
},
{
"clip_ratio": 0.0,
"completion_length": 10.440625190734863,
"epoch": 0.2625070264193367,
"kl": 1.6096564829349518,
"learning_rate": 9.97380550871276e-07,
"loss": 0.0011199398431926966,
"ratio/all_0": 0.0,
"ratio/all_2": 0.640625,
"reward": 1.8843750357627869,
"reward_std": 0.7997609376907349,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8843750059604645,
"step": 467
},
{
"clip_ratio": 0.0,
"completion_length": 10.407812595367432,
"epoch": 0.2630691399662732,
"kl": 1.6002353727817535,
"learning_rate": 9.973749297358067e-07,
"loss": 0.0033882353454828262,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.53125,
"reward": 1.8484375178813934,
"reward_std": 0.7542925029993057,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.78125,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8484375178813934,
"step": 468
},
{
"clip_ratio": 0.0,
"completion_length": 10.417187690734863,
"epoch": 0.26363125351320965,
"kl": 1.6339455544948578,
"learning_rate": 9.973693086003372e-07,
"loss": 0.02932353876531124,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.625,
"reward": 1.9000000059604645,
"reward_std": 0.8019672483205795,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9000000208616257,
"step": 469
},
{
"clip_ratio": 0.0,
"completion_length": 10.435937881469727,
"epoch": 0.2641933670601461,
"kl": 1.6218569576740265,
"learning_rate": 9.973636874648678e-07,
"loss": 0.03466630354523659,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.65625,
"reward": 1.9265625178813934,
"reward_std": 0.8086530715227127,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.953125,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9265625178813934,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 10.546875,
"epoch": 0.26475548060708265,
"kl": 1.5886335372924805,
"learning_rate": 9.973580663293985e-07,
"loss": 0.030788417905569077,
"ratio/all_0": 0.0,
"ratio/all_2": 0.625,
"reward": 1.9343750178813934,
"reward_std": 0.7689536660909653,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.9375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.9375,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.9375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9343750029802322,
"step": 471
},
{
"clip_ratio": 0.0,
"completion_length": 10.364062786102295,
"epoch": 0.2653175941540191,
"kl": 1.619934231042862,
"learning_rate": 9.973524451939292e-07,
"loss": 0.04304520785808563,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.703125,
"reward": 1.9062500298023224,
"reward_std": 0.8607139140367508,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.90625,
"step": 472
},
{
"clip_ratio": 0.0,
"completion_length": 10.498437881469727,
"epoch": 0.2658797077009556,
"kl": 1.579685002565384,
"learning_rate": 9.973468240584597e-07,
"loss": 0.02157140150666237,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.640625,
"reward": 1.8906250298023224,
"reward_std": 0.8131882101297379,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.9375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8906250149011612,
"step": 473
},
{
"clip_ratio": 0.0,
"completion_length": 10.5859375,
"epoch": 0.26644182124789206,
"kl": 1.4751878082752228,
"learning_rate": 9.973412029229903e-07,
"loss": 0.02198803797364235,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.609375,
"reward": 1.8859375417232513,
"reward_std": 0.8028210699558258,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.885937511920929,
"step": 474
},
{
"clip_ratio": 0.0,
"completion_length": 10.456249952316284,
"epoch": 0.26700393479482853,
"kl": 1.5577751994132996,
"learning_rate": 9.97335581787521e-07,
"loss": 0.018937459215521812,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6875,
"reward": 1.8968750536441803,
"reward_std": 0.8412512391805649,
"rewards/avg_0": 1.9375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8968750089406967,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 10.490625143051147,
"epoch": 0.26756604834176506,
"kl": 1.5770387947559357,
"learning_rate": 9.973299606520517e-07,
"loss": 0.02208469621837139,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.578125,
"reward": 1.856249988079071,
"reward_std": 0.8088020831346512,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8562500327825546,
"step": 476
},
{
"clip_ratio": 0.0,
"completion_length": 10.36875033378601,
"epoch": 0.26812816188870153,
"kl": 1.6067821681499481,
"learning_rate": 9.973243395165824e-07,
"loss": 0.0007646908052265644,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.578125,
"reward": 1.870312511920929,
"reward_std": 0.7674848735332489,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8718750178813934,
"step": 477
},
{
"clip_ratio": 0.0,
"completion_length": 10.517187595367432,
"epoch": 0.268690275435638,
"kl": 1.5556055009365082,
"learning_rate": 9.973187183811128e-07,
"loss": 0.019349299371242523,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.609375,
"reward": 1.890625,
"reward_std": 0.794694185256958,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8906250149011612,
"step": 478
},
{
"clip_ratio": 0.0,
"completion_length": 10.41562533378601,
"epoch": 0.2692523889825745,
"kl": 1.5935364365577698,
"learning_rate": 9.973130972456435e-07,
"loss": 0.023987704887986183,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.625,
"reward": 1.9109375178813934,
"reward_std": 0.7916025668382645,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9109375029802322,
"step": 479
},
{
"clip_ratio": 0.0,
"completion_length": 10.3984375,
"epoch": 0.26981450252951095,
"kl": 1.6000608801841736,
"learning_rate": 9.973074761101742e-07,
"loss": 0.0358889140188694,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.625,
"reward": 1.9296875298023224,
"reward_std": 0.7762255221605301,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.9375,
"rewards/avg_2": 1.953125,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9296875149011612,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 10.443750381469727,
"epoch": 0.2703766160764474,
"kl": 1.5882815420627594,
"learning_rate": 9.973018549747049e-07,
"loss": 0.03253752738237381,
"ratio/all_0": 0.0,
"ratio/all_2": 0.671875,
"reward": 1.9328125417232513,
"reward_std": 0.8097695410251617,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.953125,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.9375,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.932812511920929,
"step": 481
},
{
"clip_ratio": 0.0,
"completion_length": 10.576562643051147,
"epoch": 0.27093872962338394,
"kl": 1.5390391051769257,
"learning_rate": 9.972962338392355e-07,
"loss": 0.030903009697794914,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.65625,
"reward": 1.9031250178813934,
"reward_std": 0.824316993355751,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9031250178813934,
"step": 482
},
{
"clip_ratio": 0.0,
"completion_length": 10.390625476837158,
"epoch": 0.2715008431703204,
"kl": 1.5586816370487213,
"learning_rate": 9.972906127037662e-07,
"loss": 0.012824834324419498,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.5625,
"reward": 1.8546875417232513,
"reward_std": 0.785448431968689,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.854687511920929,
"step": 483
},
{
"clip_ratio": 0.0,
"completion_length": 10.442187786102295,
"epoch": 0.2720629567172569,
"kl": 1.5961687564849854,
"learning_rate": 9.972849915682967e-07,
"loss": 0.021679479628801346,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.609375,
"reward": 1.8984375,
"reward_std": 0.78426893055439,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8984375298023224,
"step": 484
},
{
"clip_ratio": 0.0,
"completion_length": 10.471875190734863,
"epoch": 0.27262507026419336,
"kl": 1.5359916687011719,
"learning_rate": 9.972793704328274e-07,
"loss": 0.02804931253194809,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.640625,
"reward": 1.8828125298023224,
"reward_std": 0.829443022608757,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8828125149011612,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 10.435937881469727,
"epoch": 0.27318718381112983,
"kl": 1.5890219509601593,
"learning_rate": 9.97273749297358e-07,
"loss": 0.016086315736174583,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.5625,
"reward": 1.881250023841858,
"reward_std": 0.7686631232500076,
"rewards/avg_0": 1.9375,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.734375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8812500089406967,
"step": 486
},
{
"clip_ratio": 0.0,
"completion_length": 10.521875143051147,
"epoch": 0.27374929735806636,
"kl": 1.567686676979065,
"learning_rate": 9.972681281618887e-07,
"loss": 0.02141323685646057,
"ratio/all_0": 0.0,
"ratio/all_2": 0.671875,
"reward": 1.921875,
"reward_std": 0.8018170297145844,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.9234375208616257,
"step": 487
},
{
"clip_ratio": 0.0,
"completion_length": 10.406250238418579,
"epoch": 0.27431141090500283,
"kl": 1.607559710741043,
"learning_rate": 9.972625070264194e-07,
"loss": 0.026632366701960564,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.6875,
"reward": 1.9093750417232513,
"reward_std": 0.8379234969615936,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.909375011920929,
"step": 488
},
{
"clip_ratio": 0.0,
"completion_length": 10.351562738418579,
"epoch": 0.2748735244519393,
"kl": 1.5950010418891907,
"learning_rate": 9.972568858909499e-07,
"loss": 0.03235364705324173,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.703125,
"reward": 1.9078125059604645,
"reward_std": 0.8532653301954269,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9078125059604645,
"step": 489
},
{
"clip_ratio": 0.0,
"completion_length": 10.367187738418579,
"epoch": 0.27543563799887577,
"kl": 1.6210860908031464,
"learning_rate": 9.972512647554805e-07,
"loss": 0.016208132728934288,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.578125,
"reward": 1.8765625357627869,
"reward_std": 0.7815902531147003,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8765624910593033,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 10.451562643051147,
"epoch": 0.27599775154581224,
"kl": 1.5678941011428833,
"learning_rate": 9.972456436200112e-07,
"loss": 0.009668991900980473,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.65625,
"reward": 1.8875000178813934,
"reward_std": 0.8114782571792603,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.9375,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8875000029802322,
"step": 491
},
{
"clip_ratio": 0.0,
"completion_length": 10.390625238418579,
"epoch": 0.2765598650927487,
"kl": 1.5274541079998016,
"learning_rate": 9.972400224845419e-07,
"loss": 0.030864091590046883,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6875,
"reward": 1.912500023841858,
"reward_std": 0.8409267067909241,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.953125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9125000089406967,
"step": 492
},
{
"clip_ratio": 0.0,
"completion_length": 10.442187786102295,
"epoch": 0.27712197863968524,
"kl": 1.6061663329601288,
"learning_rate": 9.972344013490724e-07,
"loss": 0.017122022807598114,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.640625,
"reward": 1.8718750178813934,
"reward_std": 0.8296681493520737,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8734375238418579,
"step": 493
},
{
"clip_ratio": 0.0,
"completion_length": 10.246875047683716,
"epoch": 0.2776840921866217,
"kl": 1.5769842863082886,
"learning_rate": 9.97228780213603e-07,
"loss": 0.00853213481605053,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.609375,
"reward": 1.8718750476837158,
"reward_std": 0.7921317666769028,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8718750029802322,
"step": 494
},
{
"clip_ratio": 0.0,
"completion_length": 10.370312929153442,
"epoch": 0.2782462057335582,
"kl": 1.6459482312202454,
"learning_rate": 9.972231590781337e-07,
"loss": 0.034749262034893036,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.6875,
"reward": 1.928125023841858,
"reward_std": 0.8268794566392899,
"rewards/avg_0": 1.9375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.9375,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9281250089406967,
"step": 495
},
{
"clip_ratio": 0.0,
"completion_length": 10.273437738418579,
"epoch": 0.27880831928049465,
"kl": 1.5733148455619812,
"learning_rate": 9.972175379426644e-07,
"loss": 0.009545298293232918,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.65625,
"reward": 1.8875000178813934,
"reward_std": 0.8128636181354523,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8875000178813934,
"step": 496
},
{
"clip_ratio": 0.0,
"completion_length": 10.364062786102295,
"epoch": 0.2793704328274311,
"kl": 1.6750578880310059,
"learning_rate": 9.97211916807195e-07,
"loss": 0.035452038049697876,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.71875,
"reward": 1.9234375357627869,
"reward_std": 0.8499085307121277,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.953125,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.9375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9234375059604645,
"step": 497
},
{
"clip_ratio": 0.0,
"completion_length": 10.317187547683716,
"epoch": 0.2799325463743676,
"kl": 1.5531387031078339,
"learning_rate": 9.972062956717257e-07,
"loss": 0.03181811794638634,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.75,
"reward": 1.9265625178813934,
"reward_std": 0.86952143907547,
"rewards/avg_0": 1.9375,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.953125,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.9375,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9265625178813934,
"step": 498
},
{
"clip_ratio": 0.0,
"completion_length": 10.400000095367432,
"epoch": 0.2804946599213041,
"kl": 1.5031760036945343,
"learning_rate": 9.972006745362562e-07,
"loss": 0.0034638706129044294,
"ratio/all_0": 0.0,
"ratio/all_2": 0.625,
"reward": 1.8937500417232513,
"reward_std": 0.7857717573642731,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.893750011920929,
"step": 499
},
{
"clip_ratio": 0.0,
"completion_length": 10.295312643051147,
"epoch": 0.2810567734682406,
"kl": 1.576939582824707,
"learning_rate": 9.971950534007869e-07,
"loss": 0.034424349665641785,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.609375,
"reward": 1.9125000536441803,
"reward_std": 0.7881839871406555,
"rewards/avg_0": 1.9375,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9125000089406967,
"step": 500
},
{
"clip_ratio": 0.0,
"completion_length": 10.325000286102295,
"epoch": 0.28161888701517707,
"kl": 1.6134376525878906,
"learning_rate": 9.971894322653176e-07,
"loss": 0.01385501679033041,
"ratio/all_0": 0.0,
"ratio/all_2": 0.625,
"reward": 1.8984375298023224,
"reward_std": 0.78426893055439,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8984375,
"step": 501
},
{
"clip_ratio": 0.0,
"completion_length": 10.582812309265137,
"epoch": 0.28218100056211354,
"kl": 1.587764173746109,
"learning_rate": 9.971838111298482e-07,
"loss": 0.01922702230513096,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.609375,
"reward": 1.9000000357627869,
"reward_std": 0.783739760518074,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9000000059604645,
"step": 502
},
{
"clip_ratio": 0.0,
"completion_length": 10.487500190734863,
"epoch": 0.28274311410905,
"kl": 1.5082274377346039,
"learning_rate": 9.97178189994379e-07,
"loss": 0.009739737957715988,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.5625,
"reward": 1.890625,
"reward_std": 0.753702774643898,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8906250149011612,
"step": 503
},
{
"clip_ratio": 0.0,
"completion_length": 10.554687738418579,
"epoch": 0.28330522765598654,
"kl": 1.5581303536891937,
"learning_rate": 9.971725688589094e-07,
"loss": 0.028458768501877785,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.703125,
"reward": 1.9218750298023224,
"reward_std": 0.835271418094635,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.9375,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.921875,
"step": 504
},
{
"clip_ratio": 0.0,
"completion_length": 10.362500429153442,
"epoch": 0.283867341202923,
"kl": 1.585391253232956,
"learning_rate": 9.9716694772344e-07,
"loss": 0.02503221668303013,
"ratio/all_0": 0.0,
"ratio/all_2": 0.734375,
"reward": 1.9296875298023224,
"reward_std": 0.8459025174379349,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.9375,
"rewards/avg_3": 1.9375,
"rewards/avg_4": 1.96875,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9296875149011612,
"step": 505
},
{
"clip_ratio": 0.0,
"completion_length": 10.503124952316284,
"epoch": 0.2844294547498595,
"kl": 1.5686596930027008,
"learning_rate": 9.971613265879707e-07,
"loss": 0.027785908430814743,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.75,
"reward": 1.925000011920929,
"reward_std": 0.8607139587402344,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9250000268220901,
"step": 506
},
{
"clip_ratio": 0.0,
"completion_length": 10.531250238418579,
"epoch": 0.28499156829679595,
"kl": 1.5021850764751434,
"learning_rate": 9.971557054525014e-07,
"loss": 0.0024004708975553513,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.59375,
"reward": 1.8781250417232513,
"reward_std": 0.7744927406311035,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8781249970197678,
"step": 507
},
{
"clip_ratio": 0.0,
"completion_length": 10.490625143051147,
"epoch": 0.2855536818437324,
"kl": 1.5643205046653748,
"learning_rate": 9.971500843170319e-07,
"loss": 0.03464096784591675,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.75,
"reward": 1.921875,
"reward_std": 0.8716684281826019,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.953125,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.921875,
"step": 508
},
{
"clip_ratio": 0.0,
"completion_length": 10.617187738418579,
"epoch": 0.2861157953906689,
"kl": 1.5092883110046387,
"learning_rate": 9.971444631815626e-07,
"loss": 0.021037127822637558,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.5625,
"reward": 1.893750011920929,
"reward_std": 0.7613003849983215,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.893750011920929,
"step": 509
},
{
"clip_ratio": 0.0,
"completion_length": 10.392187595367432,
"epoch": 0.2866779089376054,
"kl": 1.5385313928127289,
"learning_rate": 9.971388420460932e-07,
"loss": 0.023627443239092827,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.71875,
"reward": 1.9218750298023224,
"reward_std": 0.837095096707344,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.9234375059604645,
"step": 510
},
{
"clip_ratio": 0.0,
"completion_length": 10.495312690734863,
"epoch": 0.2872400224845419,
"kl": 1.5302962362766266,
"learning_rate": 9.97133220910624e-07,
"loss": 0.0014047394506633282,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.53125,
"reward": 1.8359375298023224,
"reward_std": 0.7675429880619049,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.765625,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.71875,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8359375,
"step": 511
},
{
"clip_ratio": 0.0,
"completion_length": 10.453125238418579,
"epoch": 0.28780213603147836,
"kl": 1.5516058504581451,
"learning_rate": 9.971275997751546e-07,
"loss": 0.023453330621123314,
"ratio/all_0": 0.09375,
"ratio/all_2": 0.625,
"reward": 1.8562500476837158,
"reward_std": 0.8335639387369156,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8562500029802322,
"step": 512
},
{
"clip_ratio": 0.0,
"completion_length": 10.467187881469727,
"epoch": 0.28836424957841483,
"kl": 1.5807332396507263,
"learning_rate": 9.971219786396853e-07,
"loss": 0.002808466088026762,
"ratio/all_0": 0.0,
"ratio/all_2": 0.515625,
"reward": 1.885937511920929,
"reward_std": 0.7218433618545532,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.9375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8859375268220901,
"step": 513
},
{
"clip_ratio": 0.0,
"completion_length": 10.543750286102295,
"epoch": 0.2889263631253513,
"kl": 1.5733287930488586,
"learning_rate": 9.97116357504216e-07,
"loss": 0.02668760158121586,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.625,
"reward": 1.873437523841858,
"reward_std": 0.8334064036607742,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8750000298023224,
"step": 514
},
{
"clip_ratio": 0.0,
"completion_length": 10.467187881469727,
"epoch": 0.2894884766722878,
"kl": 1.6143159568309784,
"learning_rate": 9.971107363687464e-07,
"loss": 0.025657769292593002,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.671875,
"reward": 1.8953125178813934,
"reward_std": 0.8407426327466965,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9968750178813934,
"rewards/point_reward": 0.8984375298023224,
"step": 515
},
{
"clip_ratio": 0.0,
"completion_length": 10.417187929153442,
"epoch": 0.2900505902192243,
"kl": 1.5857263207435608,
"learning_rate": 9.97105115233277e-07,
"loss": 0.01410939171910286,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.625,
"reward": 1.878125011920929,
"reward_std": 0.8141909092664719,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.878125011920929,
"step": 516
},
{
"clip_ratio": 0.0,
"completion_length": 10.425000429153442,
"epoch": 0.2906127037661608,
"kl": 1.6421043276786804,
"learning_rate": 9.970994940978078e-07,
"loss": 0.019229963421821594,
"ratio/all_0": 0.0,
"ratio/all_2": 0.65625,
"reward": 1.907812476158142,
"reward_std": 0.8077450841665268,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.9093750268220901,
"step": 517
},
{
"clip_ratio": 0.0,
"completion_length": 10.395312547683716,
"epoch": 0.29117481731309725,
"kl": 1.5684965550899506,
"learning_rate": 9.970938729623384e-07,
"loss": 0.025036733597517014,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.59375,
"reward": 1.8843750059604645,
"reward_std": 0.7915432304143906,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8843750208616257,
"step": 518
},
{
"clip_ratio": 0.0,
"completion_length": 10.353125095367432,
"epoch": 0.2917369308600337,
"kl": 1.5684770345687866,
"learning_rate": 9.97088251826869e-07,
"loss": 0.02313578687608242,
"ratio/all_0": 0.0,
"ratio/all_2": 0.703125,
"reward": 1.9265625178813934,
"reward_std": 0.8191908597946167,
"rewards/avg_0": 1.9375,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9265625178813934,
"step": 519
},
{
"clip_ratio": 0.0,
"completion_length": 10.453125,
"epoch": 0.2922990444069702,
"kl": 1.5355479717254639,
"learning_rate": 9.970826306913996e-07,
"loss": 0.014254674315452576,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.59375,
"reward": 1.878125011920929,
"reward_std": 0.7888064831495285,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.878125011920929,
"step": 520
},
{
"clip_ratio": 0.0,
"completion_length": 10.4453125,
"epoch": 0.2928611579539067,
"kl": 1.6413032114505768,
"learning_rate": 9.970770095559303e-07,
"loss": 0.0015225689858198166,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.609375,
"reward": 1.8484375178813934,
"reward_std": 0.8030874729156494,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.765625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8500000089406967,
"step": 521
},
{
"clip_ratio": 0.0,
"completion_length": 10.562500238418579,
"epoch": 0.2934232715008432,
"kl": 1.550758719444275,
"learning_rate": 9.97071388420461e-07,
"loss": 0.002582995221018791,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.609375,
"reward": 1.8828125298023224,
"reward_std": 0.7766397297382355,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8828125,
"step": 522
},
{
"clip_ratio": 0.0,
"completion_length": 10.451562643051147,
"epoch": 0.29398538504777966,
"kl": 1.594634473323822,
"learning_rate": 9.970657672849914e-07,
"loss": 0.016852285712957382,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.578125,
"reward": 1.879687488079071,
"reward_std": 0.77764493227005,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8796875327825546,
"step": 523
},
{
"clip_ratio": 0.0,
"completion_length": 10.415625095367432,
"epoch": 0.29454749859471613,
"kl": 1.5855125486850739,
"learning_rate": 9.97060146149522e-07,
"loss": 0.011170834302902222,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.53125,
"reward": 1.8578124940395355,
"reward_std": 0.7613887935876846,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8578125238418579,
"step": 524
},
{
"clip_ratio": 0.0,
"completion_length": 10.465625286102295,
"epoch": 0.2951096121416526,
"kl": 1.5837196707725525,
"learning_rate": 9.970545250140528e-07,
"loss": 0.02334693819284439,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.578125,
"reward": 1.8593750298023224,
"reward_std": 0.8002597838640213,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.734375,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8593750149011612,
"step": 525
},
{
"clip_ratio": 0.0,
"completion_length": 10.393750429153442,
"epoch": 0.29567172568858907,
"kl": 1.5401901602745056,
"learning_rate": 9.970489038785834e-07,
"loss": 0.016585461795330048,
"ratio/all_0": 0.0,
"ratio/all_2": 0.65625,
"reward": 1.909375011920929,
"reward_std": 0.8099210113286972,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.909375011920929,
"step": 526
},
{
"clip_ratio": 0.0,
"completion_length": 10.382812738418579,
"epoch": 0.2962338392355256,
"kl": 1.5949873328208923,
"learning_rate": 9.970432827431141e-07,
"loss": 0.025830864906311035,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.671875,
"reward": 1.896875023841858,
"reward_std": 0.8379149883985519,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.765625,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9968750178813934,
"rewards/point_reward": 0.9000000059604645,
"step": 527
},
{
"clip_ratio": 0.0,
"completion_length": 10.221875190734863,
"epoch": 0.29679595278246207,
"kl": 1.5679085850715637,
"learning_rate": 9.970376616076448e-07,
"loss": 0.04403117299079895,
"ratio/all_0": 0.0,
"ratio/all_2": 0.75,
"reward": 1.959375023841858,
"reward_std": 0.850878432393074,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.984375,
"rewards/avg_2": 1.9375,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.953125,
"rewards/avg_5": 1.9375,
"rewards/avg_6": 1.984375,
"rewards/avg_7": 1.953125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9593750089406967,
"step": 528
},
{
"clip_ratio": 0.0,
"completion_length": 10.404687643051147,
"epoch": 0.29735806632939854,
"kl": 1.5711309611797333,
"learning_rate": 9.970320404721755e-07,
"loss": 0.03724472224712372,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.734375,
"reward": 1.9390625059604645,
"reward_std": 0.8469318300485611,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.953125,
"rewards/avg_5": 1.96875,
"rewards/avg_6": 1.953125,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9390625208616257,
"step": 529
},
{
"clip_ratio": 0.0,
"completion_length": 10.362500429153442,
"epoch": 0.297920179876335,
"kl": 1.5913471281528473,
"learning_rate": 9.97026419336706e-07,
"loss": 0.014495437033474445,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.640625,
"reward": 1.8828125298023224,
"reward_std": 0.8245996832847595,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.953125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.995312511920929,
"rewards/point_reward": 0.8875000029802322,
"step": 530
},
{
"clip_ratio": 0.0,
"completion_length": 10.245312929153442,
"epoch": 0.2984822934232715,
"kl": 1.5916757583618164,
"learning_rate": 9.970207982012366e-07,
"loss": 0.04658622294664383,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.75,
"reward": 1.9250000417232513,
"reward_std": 0.8800604343414307,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.9375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.925000011920929,
"step": 531
},
{
"clip_ratio": 0.0,
"completion_length": 10.359375476837158,
"epoch": 0.29904440697020795,
"kl": 1.4983412623405457,
"learning_rate": 9.970151770657673e-07,
"loss": 0.023762550204992294,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.65625,
"reward": 1.890625,
"reward_std": 0.8271434009075165,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8906250149011612,
"step": 532
},
{
"clip_ratio": 0.0,
"completion_length": 10.406250238418579,
"epoch": 0.2996065205171445,
"kl": 1.6456592977046967,
"learning_rate": 9.97009555930298e-07,
"loss": 0.02334306575357914,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.5,
"reward": 1.8828125298023224,
"reward_std": 0.7357584089040756,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8843750059604645,
"step": 533
},
{
"clip_ratio": 0.0,
"completion_length": 10.381250381469727,
"epoch": 0.30016863406408095,
"kl": 1.5607287883758545,
"learning_rate": 9.970039347948284e-07,
"loss": 0.0018885387107729912,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.640625,
"reward": 1.873437523841858,
"reward_std": 0.8119989484548569,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.995312511920929,
"rewards/point_reward": 0.878125011920929,
"step": 534
},
{
"clip_ratio": 0.0,
"completion_length": 10.375,
"epoch": 0.3007307476110174,
"kl": 1.6651538908481598,
"learning_rate": 9.96998313659359e-07,
"loss": 0.02762552909553051,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.625,
"reward": 1.8953125476837158,
"reward_std": 0.8060480654239655,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8968750089406967,
"step": 535
},
{
"clip_ratio": 0.0,
"completion_length": 10.467187643051147,
"epoch": 0.3012928611579539,
"kl": 1.5861826539039612,
"learning_rate": 9.969926925238898e-07,
"loss": 0.011971995234489441,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.59375,
"reward": 1.878125011920929,
"reward_std": 0.7851677536964417,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9968750178813934,
"rewards/point_reward": 0.8812500089406967,
"step": 536
},
{
"clip_ratio": 0.0,
"completion_length": 10.456250190734863,
"epoch": 0.30185497470489037,
"kl": 1.6371749639511108,
"learning_rate": 9.969870713884205e-07,
"loss": 0.023145480081439018,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.65625,
"reward": 1.885937511920929,
"reward_std": 0.8331510424613953,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9937500059604645,
"rewards/point_reward": 0.8921875208616257,
"step": 537
},
{
"clip_ratio": 0.0,
"completion_length": 10.387500286102295,
"epoch": 0.3024170882518269,
"kl": 1.6015640199184418,
"learning_rate": 9.969814502529511e-07,
"loss": 0.009054386988282204,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.6875,
"reward": 1.8765625357627869,
"reward_std": 0.8440788686275482,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.878125011920929,
"step": 538
},
{
"clip_ratio": 0.0,
"completion_length": 10.440625190734863,
"epoch": 0.30297920179876336,
"kl": 1.582722783088684,
"learning_rate": 9.969758291174816e-07,
"loss": 0.015762681141495705,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.578125,
"reward": 1.8796875178813934,
"reward_std": 0.7894563376903534,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9968750029802322,
"rewards/point_reward": 0.8828125149011612,
"step": 539
},
{
"clip_ratio": 0.0,
"completion_length": 10.434375286102295,
"epoch": 0.30354131534569984,
"kl": 1.558344304561615,
"learning_rate": 9.969702079820123e-07,
"loss": 0.011870039626955986,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.484375,
"reward": 1.8765625357627869,
"reward_std": 0.7154543548822403,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.78125,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8781250268220901,
"step": 540
},
{
"clip_ratio": 0.0,
"completion_length": 10.453125,
"epoch": 0.3041034288926363,
"kl": 1.5748683512210846,
"learning_rate": 9.96964586846543e-07,
"loss": 0.03740207105875015,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.75,
"reward": 1.9312500357627869,
"reward_std": 0.8683542609214783,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.953125,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.9375,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.9375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.9328124970197678,
"step": 541
},
{
"clip_ratio": 0.0,
"completion_length": 10.41562533378601,
"epoch": 0.3046655424395728,
"kl": 1.5355166792869568,
"learning_rate": 9.969589657110736e-07,
"loss": 0.04280543699860573,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.765625,
"reward": 1.9453125298023224,
"reward_std": 0.8763377517461777,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.9375,
"rewards/avg_3": 1.9375,
"rewards/avg_4": 1.96875,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.9375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.9468750059604645,
"step": 542
},
{
"clip_ratio": 0.0,
"completion_length": 10.500000476837158,
"epoch": 0.30522765598650925,
"kl": 1.543639749288559,
"learning_rate": 9.969533445756043e-07,
"loss": 0.015493150800466537,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.5625,
"reward": 1.8750000298023224,
"reward_std": 0.7713418304920197,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8750000149011612,
"step": 543
},
{
"clip_ratio": 0.0,
"completion_length": 10.548437595367432,
"epoch": 0.3057897695334458,
"kl": 1.5717388689517975,
"learning_rate": 9.96947723440135e-07,
"loss": 0.01888006553053856,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.65625,
"reward": 1.9000000357627869,
"reward_std": 0.8184342980384827,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.901562511920929,
"step": 544
},
{
"clip_ratio": 0.0,
"completion_length": 10.504687547683716,
"epoch": 0.30635188308038225,
"kl": 1.5470468699932098,
"learning_rate": 9.969421023046654e-07,
"loss": 0.020793110132217407,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.578125,
"reward": 1.878125011920929,
"reward_std": 0.7924162298440933,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8796875178813934,
"step": 545
},
{
"clip_ratio": 0.0,
"completion_length": 10.40000033378601,
"epoch": 0.3069139966273187,
"kl": 1.5460689961910248,
"learning_rate": 9.969364811691961e-07,
"loss": 0.03830792009830475,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.71875,
"reward": 1.9375000298023224,
"reward_std": 0.8405124992132187,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.9375,
"rewards/avg_2": 1.953125,
"rewards/avg_3": 1.9375,
"rewards/avg_4": 1.953125,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9375,
"step": 546
},
{
"clip_ratio": 0.0,
"completion_length": 10.550000429153442,
"epoch": 0.3074761101742552,
"kl": 1.5856299698352814,
"learning_rate": 9.969308600337268e-07,
"loss": 0.01524635311216116,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.65625,
"reward": 1.885937511920929,
"reward_std": 0.8167786598205566,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.885937511920929,
"step": 547
},
{
"clip_ratio": 0.0,
"completion_length": 10.48593783378601,
"epoch": 0.30803822372119166,
"kl": 1.5868599712848663,
"learning_rate": 9.969252388982575e-07,
"loss": 0.029491964727640152,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.6875,
"reward": 1.9156250059604645,
"reward_std": 0.839767649769783,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.953125,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.9171875268220901,
"step": 548
},
{
"clip_ratio": 0.0,
"completion_length": 10.570312738418579,
"epoch": 0.3086003372681282,
"kl": 1.582980364561081,
"learning_rate": 9.96919617762788e-07,
"loss": 0.017688706517219543,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.640625,
"reward": 1.896875023841858,
"reward_std": 0.8135938942432404,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9968750178813934,
"rewards/point_reward": 0.9000000208616257,
"step": 549
},
{
"clip_ratio": 0.0,
"completion_length": 10.457812786102295,
"epoch": 0.30916245081506466,
"kl": 1.5583482682704926,
"learning_rate": 9.969139966273186e-07,
"loss": 0.009958229959011078,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6875,
"reward": 1.896875023841858,
"reward_std": 0.8217545002698898,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8984375149011612,
"step": 550
},
{
"clip_ratio": 0.0,
"completion_length": 10.673437595367432,
"epoch": 0.30972456436200113,
"kl": 1.573817253112793,
"learning_rate": 9.969083754918493e-07,
"loss": 0.027634086087346077,
"ratio/all_0": 0.0,
"ratio/all_2": 0.671875,
"reward": 1.9203125536441803,
"reward_std": 0.8161921203136444,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.995312511920929,
"rewards/point_reward": 0.925000011920929,
"step": 551
},
{
"clip_ratio": 0.0,
"completion_length": 10.523437738418579,
"epoch": 0.3102866779089376,
"kl": 1.537708729505539,
"learning_rate": 9.9690275435638e-07,
"loss": 0.017526382580399513,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.640625,
"reward": 1.8875000178813934,
"reward_std": 0.8181226849555969,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8890625238418579,
"step": 552
},
{
"clip_ratio": 0.0,
"completion_length": 10.4609375,
"epoch": 0.3108487914558741,
"kl": 1.6289916336536407,
"learning_rate": 9.968971332209107e-07,
"loss": 0.01097562350332737,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.59375,
"reward": 1.8828125298023224,
"reward_std": 0.7881049513816833,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9937500208616257,
"rewards/point_reward": 0.8890624940395355,
"step": 553
},
{
"clip_ratio": 0.0,
"completion_length": 10.576562643051147,
"epoch": 0.31141090500281055,
"kl": 1.5423737466335297,
"learning_rate": 9.968915120854411e-07,
"loss": 0.021829504519701004,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.59375,
"reward": 1.8750000298023224,
"reward_std": 0.7977400124073029,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9968750029802322,
"rewards/point_reward": 0.878125011920929,
"step": 554
},
{
"clip_ratio": 0.0,
"completion_length": 10.65781283378601,
"epoch": 0.31197301854974707,
"kl": 1.5545784831047058,
"learning_rate": 9.968858909499718e-07,
"loss": 0.030509337782859802,
"ratio/all_0": 0.0,
"ratio/all_2": 0.75,
"reward": 1.9375,
"reward_std": 0.8586998879909515,
"rewards/avg_0": 1.953125,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.953125,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.9375,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.9375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.9390625059604645,
"step": 555
},
{
"clip_ratio": 0.0,
"completion_length": 10.456249952316284,
"epoch": 0.31253513209668354,
"kl": 1.5957116782665253,
"learning_rate": 9.968802698145025e-07,
"loss": 0.023539943620562553,
"ratio/all_0": 0.0,
"ratio/all_2": 0.65625,
"reward": 1.9171875417232513,
"reward_std": 0.8016439527273178,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.917187511920929,
"step": 556
},
{
"clip_ratio": 0.0,
"completion_length": 10.578125238418579,
"epoch": 0.31309724564362,
"kl": 1.5839785635471344,
"learning_rate": 9.968746486790332e-07,
"loss": 0.020525125786662102,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.609375,
"reward": 1.8859375417232513,
"reward_std": 0.7854725420475006,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.885937511920929,
"step": 557
},
{
"clip_ratio": 0.0,
"completion_length": 10.565625190734863,
"epoch": 0.3136593591905565,
"kl": 1.6725057363510132,
"learning_rate": 9.968690275435636e-07,
"loss": 0.01900116726756096,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.625,
"reward": 1.862500011920929,
"reward_std": 0.8175742924213409,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.862500011920929,
"step": 558
},
{
"clip_ratio": 0.0,
"completion_length": 10.484375238418579,
"epoch": 0.31422147273749296,
"kl": 1.5269731283187866,
"learning_rate": 9.968634064080945e-07,
"loss": 0.0200035460293293,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.5625,
"reward": 1.8640625178813934,
"reward_std": 0.7826195657253265,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8640625029802322,
"step": 559
},
{
"clip_ratio": 0.0,
"completion_length": 10.370312690734863,
"epoch": 0.31478358628442943,
"kl": 1.5648876428604126,
"learning_rate": 9.96857785272625e-07,
"loss": 0.024630874395370483,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.75,
"reward": 1.9093750417232513,
"reward_std": 0.8682510405778885,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.909375011920929,
"step": 560
},
{
"clip_ratio": 0.0,
"completion_length": 10.45468783378601,
"epoch": 0.31534569983136596,
"kl": 1.5348423719406128,
"learning_rate": 9.968521641371556e-07,
"loss": 0.01418773178011179,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.640625,
"reward": 1.896875023841858,
"reward_std": 0.8042657375335693,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8968750089406967,
"step": 561
},
{
"clip_ratio": 0.0,
"completion_length": 10.44687533378601,
"epoch": 0.3159078133783024,
"kl": 1.5903120338916779,
"learning_rate": 9.968465430016863e-07,
"loss": 0.01568039134144783,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.609375,
"reward": 1.8859374821186066,
"reward_std": 0.7972578704357147,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8875000178813934,
"step": 562
},
{
"clip_ratio": 0.0,
"completion_length": 10.389062881469727,
"epoch": 0.3164699269252389,
"kl": 1.5598368048667908,
"learning_rate": 9.96840921866217e-07,
"loss": 0.015399058349430561,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.765625,
"reward": 1.8984375,
"reward_std": 0.8830222487449646,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9953125268220901,
"rewards/point_reward": 0.9031250029802322,
"step": 563
},
{
"clip_ratio": 0.0,
"completion_length": 10.385937452316284,
"epoch": 0.31703204047217537,
"kl": 1.5837746560573578,
"learning_rate": 9.968353007307477e-07,
"loss": 0.0344664491713047,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.71875,
"reward": 1.9031250178813934,
"reward_std": 0.8720826357603073,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9031250029802322,
"step": 564
},
{
"clip_ratio": 0.0,
"completion_length": 10.389062643051147,
"epoch": 0.31759415401911184,
"kl": 1.5606088936328888,
"learning_rate": 9.968296795952781e-07,
"loss": 0.01733417436480522,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.5625,
"reward": 1.8921875357627869,
"reward_std": 0.7611248642206192,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8921875059604645,
"step": 565
},
{
"clip_ratio": 0.0,
"completion_length": 10.4375,
"epoch": 0.31815626756604837,
"kl": 1.586754322052002,
"learning_rate": 9.968240584598088e-07,
"loss": 0.019815683364868164,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.578125,
"reward": 1.8765625059604645,
"reward_std": 0.7880108803510666,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.765625,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8765625208616257,
"step": 566
},
{
"clip_ratio": 0.0,
"completion_length": 10.346875190734863,
"epoch": 0.31871838111298484,
"kl": 1.628895491361618,
"learning_rate": 9.968184373243395e-07,
"loss": 0.0360635444521904,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.703125,
"reward": 1.9140625298023224,
"reward_std": 0.8494412302970886,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.9156250208616257,
"step": 567
},
{
"clip_ratio": 0.0,
"completion_length": 10.350000143051147,
"epoch": 0.3192804946599213,
"kl": 1.6824475228786469,
"learning_rate": 9.968128161888702e-07,
"loss": 0.028761250898241997,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.640625,
"reward": 1.8953125476837158,
"reward_std": 0.8177232891321182,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8953125178813934,
"step": 568
},
{
"clip_ratio": 0.0,
"completion_length": 10.35312533378601,
"epoch": 0.3198426082068578,
"kl": 1.5600067675113678,
"learning_rate": 9.968071950534006e-07,
"loss": 0.027938317507505417,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.765625,
"reward": 1.9187500178813934,
"reward_std": 0.8827731013298035,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9187500178813934,
"step": 569
},
{
"clip_ratio": 0.0,
"completion_length": 10.323437690734863,
"epoch": 0.32040472175379425,
"kl": 1.5906377136707306,
"learning_rate": 9.968015739179313e-07,
"loss": 0.0034601669758558273,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.640625,
"reward": 1.8765625059604645,
"reward_std": 0.8050589561462402,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8765625208616257,
"step": 570
},
{
"clip_ratio": 0.0,
"completion_length": 10.2578125,
"epoch": 0.3209668353007307,
"kl": 1.598751276731491,
"learning_rate": 9.96795952782462e-07,
"loss": 0.0312412828207016,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.703125,
"reward": 1.9187500476837158,
"reward_std": 0.8491444438695908,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9187500178813934,
"step": 571
},
{
"clip_ratio": 0.0,
"completion_length": 10.268750190734863,
"epoch": 0.32152894884766725,
"kl": 1.614193320274353,
"learning_rate": 9.967903316469927e-07,
"loss": 0.03290371596813202,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.75,
"reward": 1.9046874940395355,
"reward_std": 0.881769135594368,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.9062500149011612,
"step": 572
},
{
"clip_ratio": 0.0,
"completion_length": 10.412500143051147,
"epoch": 0.3220910623946037,
"kl": 1.5136121213436127,
"learning_rate": 9.967847105115231e-07,
"loss": 0.021257780492305756,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.65625,
"reward": 1.9078125059604645,
"reward_std": 0.816282257437706,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9078125208616257,
"step": 573
},
{
"clip_ratio": 0.0,
"completion_length": 10.325000286102295,
"epoch": 0.3226531759415402,
"kl": 1.5708222687244415,
"learning_rate": 9.96779089376054e-07,
"loss": 0.03907536342740059,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.703125,
"reward": 1.9250000417232513,
"reward_std": 0.84419384598732,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.953125,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.953125,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.9265625178813934,
"step": 574
},
{
"clip_ratio": 0.0,
"completion_length": 10.3359375,
"epoch": 0.32321528948847666,
"kl": 1.5484612584114075,
"learning_rate": 9.967734682405845e-07,
"loss": 0.011735038831830025,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.609375,
"reward": 1.870312511920929,
"reward_std": 0.8025570511817932,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.870312511920929,
"step": 575
},
{
"clip_ratio": 0.0,
"completion_length": 10.39218783378601,
"epoch": 0.32377740303541314,
"kl": 1.5881067514419556,
"learning_rate": 9.967678471051152e-07,
"loss": 0.01938185654580593,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.625,
"reward": 1.890625,
"reward_std": 0.7996688187122345,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8906250149011612,
"step": 576
},
{
"clip_ratio": 0.0,
"completion_length": 10.467187881469727,
"epoch": 0.3243395165823496,
"kl": 1.5919342041015625,
"learning_rate": 9.967622259696458e-07,
"loss": 0.03571542724967003,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.65625,
"reward": 1.8921875059604645,
"reward_std": 0.8346816897392273,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8921875208616257,
"step": 577
},
{
"clip_ratio": 0.0,
"completion_length": 10.492187738418579,
"epoch": 0.32490163012928613,
"kl": 1.517667442560196,
"learning_rate": 9.967566048341765e-07,
"loss": 0.012189960107207298,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.6875,
"reward": 1.896875023841858,
"reward_std": 0.8305607587099075,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.9375,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8968750089406967,
"step": 578
},
{
"clip_ratio": 0.0,
"completion_length": 10.50156283378601,
"epoch": 0.3254637436762226,
"kl": 1.6046421825885773,
"learning_rate": 9.967509836987072e-07,
"loss": 0.01817990466952324,
"ratio/all_0": 0.0,
"ratio/all_2": 0.625,
"reward": 1.9124999940395355,
"reward_std": 0.7751974761486053,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9125000238418579,
"step": 579
},
{
"clip_ratio": 0.0,
"completion_length": 10.565624952316284,
"epoch": 0.3260258572231591,
"kl": 1.532376229763031,
"learning_rate": 9.967453625632377e-07,
"loss": 0.009712206199765205,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.65625,
"reward": 1.878125011920929,
"reward_std": 0.825791984796524,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.878125011920929,
"step": 580
},
{
"clip_ratio": 0.0,
"completion_length": 10.450000286102295,
"epoch": 0.32658797077009555,
"kl": 1.5661653578281403,
"learning_rate": 9.967397414277683e-07,
"loss": 0.021667201071977615,
"ratio/all_0": 0.0,
"ratio/all_2": 0.578125,
"reward": 1.9140625298023224,
"reward_std": 0.7548810392618179,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9140625149011612,
"step": 581
},
{
"clip_ratio": 0.0,
"completion_length": 10.564062595367432,
"epoch": 0.327150084317032,
"kl": 1.586315929889679,
"learning_rate": 9.96734120292299e-07,
"loss": 0.01577918976545334,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.6875,
"reward": 1.8953125178813934,
"reward_std": 0.8373362272977829,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8953125029802322,
"step": 582
},
{
"clip_ratio": 0.0,
"completion_length": 10.425000190734863,
"epoch": 0.32771219786396855,
"kl": 1.5910922586917877,
"learning_rate": 9.967284991568297e-07,
"loss": 0.022063709795475006,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.65625,
"reward": 1.9109375178813934,
"reward_std": 0.8032617419958115,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.9375,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9109375178813934,
"step": 583
},
{
"clip_ratio": 0.0,
"completion_length": 10.426562547683716,
"epoch": 0.328274311410905,
"kl": 1.5398958623409271,
"learning_rate": 9.967228780213602e-07,
"loss": 0.010586312040686607,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.65625,
"reward": 1.884374976158142,
"reward_std": 0.8187513500452042,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8843750357627869,
"step": 584
},
{
"clip_ratio": 0.0,
"completion_length": 10.479687690734863,
"epoch": 0.3288364249578415,
"kl": 1.6160912215709686,
"learning_rate": 9.967172568858908e-07,
"loss": 0.02243340015411377,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.59375,
"reward": 1.8937500417232513,
"reward_std": 0.778084471821785,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.9375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.893750011920929,
"step": 585
},
{
"clip_ratio": 0.0,
"completion_length": 10.428125143051147,
"epoch": 0.32939853850477796,
"kl": 1.5488245785236359,
"learning_rate": 9.967116357504215e-07,
"loss": 0.026065057143568993,
"ratio/all_0": 0.0,
"ratio/all_2": 0.671875,
"reward": 1.9281249940395355,
"reward_std": 0.8089158087968826,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.984375,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9281250238418579,
"step": 586
},
{
"clip_ratio": 0.0,
"completion_length": 10.481250047683716,
"epoch": 0.32996065205171443,
"kl": 1.566488653421402,
"learning_rate": 9.967060146149522e-07,
"loss": 0.02423604018986225,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.671875,
"reward": 1.8921875059604645,
"reward_std": 0.8361251354217529,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8921875208616257,
"step": 587
},
{
"clip_ratio": 0.0,
"completion_length": 10.476562738418579,
"epoch": 0.3305227655986509,
"kl": 1.5282692313194275,
"learning_rate": 9.967003934794827e-07,
"loss": 0.024469148367643356,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.625,
"reward": 1.9125000536441803,
"reward_std": 0.7933112531900406,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9125000089406967,
"step": 588
},
{
"clip_ratio": 0.0,
"completion_length": 10.512500047683716,
"epoch": 0.33108487914558743,
"kl": 1.6268229484558105,
"learning_rate": 9.966947723440135e-07,
"loss": 0.028700783848762512,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.671875,
"reward": 1.904687523841858,
"reward_std": 0.8226082473993301,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9046875089406967,
"step": 589
},
{
"clip_ratio": 0.0,
"completion_length": 10.434375286102295,
"epoch": 0.3316469926925239,
"kl": 1.5528870820999146,
"learning_rate": 9.966891512085442e-07,
"loss": 0.031011313199996948,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.703125,
"reward": 1.910937488079071,
"reward_std": 0.8530013412237167,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9109375327825546,
"step": 590
},
{
"clip_ratio": 0.0,
"completion_length": 10.526562690734863,
"epoch": 0.33220910623946037,
"kl": 1.521255999803543,
"learning_rate": 9.966835300730747e-07,
"loss": 0.013384701684117317,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.59375,
"reward": 1.8906250298023224,
"reward_std": 0.7729595750570297,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8906250149011612,
"step": 591
},
{
"clip_ratio": 0.0,
"completion_length": 10.656250238418579,
"epoch": 0.33277121978639684,
"kl": 1.5627491772174835,
"learning_rate": 9.966779089376054e-07,
"loss": 0.01156577654182911,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.5625,
"reward": 1.878125011920929,
"reward_std": 0.7626833021640778,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8781250268220901,
"step": 592
},
{
"clip_ratio": 0.0,
"completion_length": 10.473437547683716,
"epoch": 0.3333333333333333,
"kl": 1.6590400338172913,
"learning_rate": 9.96672287802136e-07,
"loss": 0.03726842254400253,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.65625,
"reward": 1.9000000059604645,
"reward_std": 0.8307802230119705,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.901562511920929,
"step": 593
},
{
"clip_ratio": 0.0,
"completion_length": 10.389062404632568,
"epoch": 0.33389544688026984,
"kl": 1.6278806328773499,
"learning_rate": 9.966666666666667e-07,
"loss": 0.010789453983306885,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.609375,
"reward": 1.875,
"reward_std": 0.7950026988983154,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.8765625208616257,
"step": 594
},
{
"clip_ratio": 0.0,
"completion_length": 10.534374952316284,
"epoch": 0.3344575604272063,
"kl": 1.603445678949356,
"learning_rate": 9.966610455311972e-07,
"loss": 0.02332155592739582,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.671875,
"reward": 1.8765625059604645,
"reward_std": 0.84195476770401,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8765625059604645,
"step": 595
},
{
"clip_ratio": 0.0,
"completion_length": 10.4765625,
"epoch": 0.3350196739741428,
"kl": 1.5592808425426483,
"learning_rate": 9.966554243957279e-07,
"loss": 0.02063816972076893,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.625,
"reward": 1.8781249821186066,
"reward_std": 0.8197565823793411,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8781250268220901,
"step": 596
},
{
"clip_ratio": 0.0,
"completion_length": 10.306249856948853,
"epoch": 0.33558178752107926,
"kl": 1.6125241816043854,
"learning_rate": 9.966498032602585e-07,
"loss": 0.020839476957917213,
"ratio/all_0": 0.0,
"ratio/all_2": 0.640625,
"reward": 1.9109375178813934,
"reward_std": 0.7973314970731735,
"rewards/avg_0": 1.9375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375089406967,
"rewards/point_reward": 0.9125000089406967,
"step": 597
},
{
"clip_ratio": 0.0,
"completion_length": 10.382812738418579,
"epoch": 0.3361439010680157,
"kl": 1.6878159642219543,
"learning_rate": 9.966441821247892e-07,
"loss": 0.024872442707419395,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.625,
"reward": 1.8984375,
"reward_std": 0.800084263086319,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8984375149011612,
"step": 598
},
{
"clip_ratio": 0.0,
"completion_length": 10.443750143051147,
"epoch": 0.3367060146149522,
"kl": 1.5578432381153107,
"learning_rate": 9.966385609893197e-07,
"loss": 0.022587155923247337,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.65625,
"reward": 1.8921875059604645,
"reward_std": 0.829266294836998,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8921875208616257,
"step": 599
},
{
"clip_ratio": 0.0,
"completion_length": 10.32187533378601,
"epoch": 0.3372681281618887,
"kl": 1.5858546495437622,
"learning_rate": 9.966329398538504e-07,
"loss": 0.01929290220141411,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.65625,
"reward": 1.8906250596046448,
"reward_std": 0.8204612880945206,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.890625,
"step": 600
}
],
"logging_steps": 1.0,
"max_steps": 177900,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}