{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3372681281618887, "eval_steps": 500, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 10.032812595367432, "epoch": 0.0005621135469364812, "kl": 0.0, "learning_rate": 1e-06, "loss": -0.07857463508844376, "ratio/all_0": 0.125, "ratio/all_2": 0.46875, "reward": 1.776562511920929, "reward_std": 0.7588308304548264, "rewards/avg_0": 1.71875, "rewards/avg_1": 1.71875, "rewards/avg_2": 1.765625, "rewards/avg_3": 1.703125, "rewards/avg_4": 1.75, "rewards/avg_5": 1.71875, "rewards/avg_6": 1.6875, "rewards/avg_7": 1.703125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9921875298023224, "rewards/point_reward": 0.784375011920929, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 9.984375238418579, "epoch": 0.0011242270938729624, "kl": 0.2562527507543564, "learning_rate": 9.999943788645306e-07, "loss": -0.03776644542813301, "ratio/all_0": 0.046875, "ratio/all_2": 0.578125, "reward": 1.8640625476837158, "reward_std": 0.7996459901332855, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.765625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.765625, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8640625178813934, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 10.035937547683716, "epoch": 0.0016863406408094434, "kl": 0.7022024244070053, "learning_rate": 9.999887577290613e-07, "loss": -0.0651589035987854, "ratio/all_0": 0.078125, "ratio/all_2": 0.375, "reward": 1.7562500536441803, "reward_std": 0.702281191945076, "rewards/avg_0": 1.6875, "rewards/avg_1": 1.6875, "rewards/avg_2": 1.75, "rewards/avg_3": 1.75, "rewards/avg_4": 1.65625, "rewards/avg_5": 1.6875, "rewards/avg_6": 1.703125, "rewards/avg_7": 1.640625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.7578125149011612, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 10.115624904632568, "epoch": 0.0022484541877459247, "kl": 1.118683785200119, "learning_rate": 9.99983136593592e-07, "loss": -0.04716287553310394, "ratio/all_0": 0.125, "ratio/all_2": 0.5, "reward": 1.748437523841858, "reward_std": 0.7817670404911041, "rewards/avg_0": 1.734375, "rewards/avg_1": 1.65625, "rewards/avg_2": 1.6875, "rewards/avg_3": 1.6875, "rewards/avg_4": 1.671875, "rewards/avg_5": 1.65625, "rewards/avg_6": 1.671875, "rewards/avg_7": 1.71875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.7484375238418579, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 9.982812404632568, "epoch": 0.002810567734682406, "kl": 1.4118792414665222, "learning_rate": 9.999775154581225e-07, "loss": -0.0010041920468211174, "ratio/all_0": 0.078125, "ratio/all_2": 0.546875, "reward": 1.8250000178813934, "reward_std": 0.8004997074604034, "rewards/avg_0": 1.765625, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.765625, "rewards/avg_3": 1.765625, "rewards/avg_4": 1.765625, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.75, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8250000029802322, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 10.034375429153442, "epoch": 0.003372681281618887, "kl": 1.565059244632721, "learning_rate": 9.999718943226531e-07, "loss": -0.005607172846794128, "ratio/all_0": 0.109375, "ratio/all_2": 0.453125, "reward": 1.784375011920929, "reward_std": 0.7476067095994949, "rewards/avg_0": 1.75, "rewards/avg_1": 1.703125, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.71875, "rewards/avg_4": 1.734375, "rewards/avg_5": 1.75, "rewards/avg_6": 1.6875, "rewards/avg_7": 1.703125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.7859375178813934, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 10.014062881469727, "epoch": 0.0039347948285553686, "kl": 1.4795698821544647, "learning_rate": 9.999662731871838e-07, "loss": 0.002856176346540451, "ratio/all_0": 0.0625, "ratio/all_2": 0.546875, "reward": 1.8359375298023224, "reward_std": 0.7821812778711319, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.734375, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.78125, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8359375, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 10.068750381469727, "epoch": 0.004496908375491849, "kl": 1.4396315813064575, "learning_rate": 9.999606520517145e-07, "loss": -0.008815726265311241, "ratio/all_0": 0.046875, "ratio/all_2": 0.609375, "reward": 1.854687511920929, "reward_std": 0.803190678358078, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.875, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9968750029802322, "rewards/point_reward": 0.8578125089406967, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 9.831250190734863, "epoch": 0.00505902192242833, "kl": 1.3827219307422638, "learning_rate": 9.99955030916245e-07, "loss": -0.043409671634435654, "ratio/all_0": 0.125, "ratio/all_2": 0.390625, "reward": 1.7234375178813934, "reward_std": 0.7338877320289612, "rewards/avg_0": 1.6875, "rewards/avg_1": 1.671875, "rewards/avg_2": 1.671875, "rewards/avg_3": 1.640625, "rewards/avg_4": 1.671875, "rewards/avg_5": 1.578125, "rewards/avg_6": 1.640625, "rewards/avg_7": 1.671875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9968750178813934, "rewards/point_reward": 0.7265625, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 9.918750286102295, "epoch": 0.005621135469364812, "kl": 1.4110659658908844, "learning_rate": 9.999494097807756e-07, "loss": -0.04152492806315422, "ratio/all_0": 0.140625, "ratio/all_2": 0.46875, "reward": 1.737500011920929, "reward_std": 0.7673228681087494, "rewards/avg_0": 1.6875, "rewards/avg_1": 1.640625, "rewards/avg_2": 1.6875, "rewards/avg_3": 1.671875, "rewards/avg_4": 1.671875, "rewards/avg_5": 1.6875, "rewards/avg_6": 1.6875, "rewards/avg_7": 1.640625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9937500059604645, "rewards/point_reward": 0.7437500208616257, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 10.062500238418579, "epoch": 0.006183249016301293, "kl": 1.4412920773029327, "learning_rate": 9.999437886453063e-07, "loss": -0.0002207462675869465, "ratio/all_0": 0.078125, "ratio/all_2": 0.53125, "reward": 1.8234375417232513, "reward_std": 0.7895885556936264, "rewards/avg_0": 1.765625, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.765625, "rewards/avg_4": 1.75, "rewards/avg_5": 1.734375, "rewards/avg_6": 1.765625, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8250000178813934, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 9.984375238418579, "epoch": 0.006745362563237774, "kl": 1.4790368676185608, "learning_rate": 9.99938167509837e-07, "loss": -0.04130768030881882, "ratio/all_0": 0.046875, "ratio/all_2": 0.421875, "reward": 1.7734375298023224, "reward_std": 0.7166178226470947, "rewards/avg_0": 1.65625, "rewards/avg_1": 1.71875, "rewards/avg_2": 1.71875, "rewards/avg_3": 1.765625, "rewards/avg_4": 1.734375, "rewards/avg_5": 1.703125, "rewards/avg_6": 1.6875, "rewards/avg_7": 1.75, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.7750000059604645, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 9.83750033378601, "epoch": 0.007307476110174255, "kl": 1.534344106912613, "learning_rate": 9.999325463743674e-07, "loss": -0.026084139943122864, "ratio/all_0": 0.0625, "ratio/all_2": 0.421875, "reward": 1.7765625417232513, "reward_std": 0.719652533531189, "rewards/avg_0": 1.703125, "rewards/avg_1": 1.671875, "rewards/avg_2": 1.6875, "rewards/avg_3": 1.734375, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.703125, "rewards/avg_6": 1.71875, "rewards/avg_7": 1.75, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.7781250029802322, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 10.014062643051147, "epoch": 0.007869589657110737, "kl": 1.600354939699173, "learning_rate": 9.999269252388981e-07, "loss": -0.010605765506625175, "ratio/all_0": 0.0625, "ratio/all_2": 0.453125, "reward": 1.8046875298023224, "reward_std": 0.7357720136642456, "rewards/avg_0": 1.703125, "rewards/avg_1": 1.734375, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.75, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.75, "rewards/avg_7": 1.71875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8046875, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 9.998437404632568, "epoch": 0.008431703204047217, "kl": 1.5717801451683044, "learning_rate": 9.999213041034288e-07, "loss": 0.0074955616146326065, "ratio/all_0": 0.078125, "ratio/all_2": 0.609375, "reward": 1.834375023841858, "reward_std": 0.8261165618896484, "rewards/avg_0": 1.765625, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.765625, "rewards/avg_5": 1.734375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8343750089406967, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 10.057812690734863, "epoch": 0.008993816750983699, "kl": 1.4447836577892303, "learning_rate": 9.999156829679595e-07, "loss": -0.013929950073361397, "ratio/all_0": 0.109375, "ratio/all_2": 0.515625, "reward": 1.7890625298023224, "reward_std": 0.786567360162735, "rewards/avg_0": 1.765625, "rewards/avg_1": 1.734375, "rewards/avg_2": 1.703125, "rewards/avg_3": 1.765625, "rewards/avg_4": 1.6875, "rewards/avg_5": 1.765625, "rewards/avg_6": 1.71875, "rewards/avg_7": 1.75, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.7890625, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 9.904687643051147, "epoch": 0.00955593029792018, "kl": 1.501718521118164, "learning_rate": 9.999100618324902e-07, "loss": -0.019112706184387207, "ratio/all_0": 0.140625, "ratio/all_2": 0.375, "reward": 1.7453125417232513, "reward_std": 0.7173045426607132, "rewards/avg_0": 1.75, "rewards/avg_1": 1.6875, "rewards/avg_2": 1.65625, "rewards/avg_3": 1.609375, "rewards/avg_4": 1.71875, "rewards/avg_5": 1.65625, "rewards/avg_6": 1.6875, "rewards/avg_7": 1.6875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.7453124970197678, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 9.840625047683716, "epoch": 0.01011804384485666, "kl": 1.4566405415534973, "learning_rate": 9.999044406970208e-07, "loss": 0.005196334794163704, "ratio/all_0": 0.0625, "ratio/all_2": 0.59375, "reward": 1.846875011920929, "reward_std": 0.8059732168912888, "rewards/avg_0": 1.75, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.846875011920929, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 9.921875476837158, "epoch": 0.010680157391793142, "kl": 1.509264975786209, "learning_rate": 9.998988195615515e-07, "loss": 0.006893848069012165, "ratio/all_0": 0.09375, "ratio/all_2": 0.625, "reward": 1.84375, "reward_std": 0.8279983550310135, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8453125208616257, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 10.048437595367432, "epoch": 0.011242270938729624, "kl": 1.4839653670787811, "learning_rate": 9.99893198426082e-07, "loss": 0.005196704529225826, "ratio/all_0": 0.109375, "ratio/all_2": 0.53125, "reward": 1.8062500357627869, "reward_std": 0.7997609674930573, "rewards/avg_0": 1.75, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.765625, "rewards/avg_5": 1.75, "rewards/avg_6": 1.71875, "rewards/avg_7": 1.71875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8062500208616257, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 9.96875, "epoch": 0.011804384485666104, "kl": 1.5253983438014984, "learning_rate": 9.998875772906127e-07, "loss": -0.01355843897908926, "ratio/all_0": 0.046875, "ratio/all_2": 0.40625, "reward": 1.8078125417232513, "reward_std": 0.7039558738470078, "rewards/avg_0": 1.75, "rewards/avg_1": 1.765625, "rewards/avg_2": 1.71875, "rewards/avg_3": 1.734375, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.75, "rewards/avg_7": 1.71875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8093750029802322, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 9.98593783378601, "epoch": 0.012366498032602586, "kl": 1.5911829769611359, "learning_rate": 9.998819561551433e-07, "loss": -0.026118595153093338, "ratio/all_0": 0.078125, "ratio/all_2": 0.546875, "reward": 1.796875, "reward_std": 0.7733737677335739, "rewards/avg_0": 1.765625, "rewards/avg_1": 1.6875, "rewards/avg_2": 1.734375, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.75, "rewards/avg_5": 1.765625, "rewards/avg_6": 1.71875, "rewards/avg_7": 1.75, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.7984375208616257, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 10.207812547683716, "epoch": 0.012928611579539067, "kl": 1.4360250234603882, "learning_rate": 9.99876335019674e-07, "loss": -0.007857441902160645, "ratio/all_0": 0.09375, "ratio/all_2": 0.4375, "reward": 1.7984375059604645, "reward_std": 0.736803725361824, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.734375, "rewards/avg_2": 1.75, "rewards/avg_3": 1.71875, "rewards/avg_4": 1.765625, "rewards/avg_5": 1.765625, "rewards/avg_6": 1.75, "rewards/avg_7": 1.703125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.7984375208616257, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 10.175000190734863, "epoch": 0.013490725126475547, "kl": 1.6245340406894684, "learning_rate": 9.998707138842045e-07, "loss": -0.008536879904568195, "ratio/all_0": 0.03125, "ratio/all_2": 0.53125, "reward": 1.8359375298023224, "reward_std": 0.7617449015378952, "rewards/avg_0": 1.75, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.75, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8359375149011612, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 10.134375095367432, "epoch": 0.014052838673412029, "kl": 1.8844734132289886, "learning_rate": 9.998650927487351e-07, "loss": -0.01100741233676672, "ratio/all_0": 0.015625, "ratio/all_2": 0.5, "reward": 1.8343750536441803, "reward_std": 0.723815992474556, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.734375, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8343750089406967, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 10.245312690734863, "epoch": 0.01461495222034851, "kl": 1.6238836646080017, "learning_rate": 9.998594716132658e-07, "loss": -0.013836846686899662, "ratio/all_0": 0.0625, "ratio/all_2": 0.578125, "reward": 1.8046875298023224, "reward_std": 0.802058219909668, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.6875, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.765625, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.765625, "rewards/avg_6": 1.734375, "rewards/avg_7": 1.71875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8046875149011612, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 10.129687786102295, "epoch": 0.01517706576728499, "kl": 1.5611946880817413, "learning_rate": 9.998538504777965e-07, "loss": -0.03556900471448898, "ratio/all_0": 0.0625, "ratio/all_2": 0.359375, "reward": 1.7734375298023224, "reward_std": 0.6553197205066681, "rewards/avg_0": 1.65625, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.765625, "rewards/avg_3": 1.65625, "rewards/avg_4": 1.65625, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.71875, "rewards/avg_7": 1.6875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.7734375149011612, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 10.168750047683716, "epoch": 0.015739179314221474, "kl": 1.512777954339981, "learning_rate": 9.998482293423272e-07, "loss": -0.014125403016805649, "ratio/all_0": 0.03125, "ratio/all_2": 0.515625, "reward": 1.834375023841858, "reward_std": 0.7443395406007767, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.703125, "rewards/avg_6": 1.734375, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8343750238418579, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 10.273437738418579, "epoch": 0.016301292861157952, "kl": 1.5330235958099365, "learning_rate": 9.998426082068576e-07, "loss": -0.003860312746837735, "ratio/all_0": 0.03125, "ratio/all_2": 0.484375, "reward": 1.8406250178813934, "reward_std": 0.729529395699501, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.75, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8406250029802322, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 10.125000238418579, "epoch": 0.016863406408094434, "kl": 1.6126181781291962, "learning_rate": 9.998369870713883e-07, "loss": -0.00027049880009144545, "ratio/all_0": 0.046875, "ratio/all_2": 0.46875, "reward": 1.8328125178813934, "reward_std": 0.7283511310815811, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.75, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.875, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.734375, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8328125178813934, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 10.178124904632568, "epoch": 0.017425519955030916, "kl": 1.4546434581279755, "learning_rate": 9.99831365935919e-07, "loss": 0.0018729576840996742, "ratio/all_0": 0.109375, "ratio/all_2": 0.53125, "reward": 1.8062500357627869, "reward_std": 0.8023815006017685, "rewards/avg_0": 1.75, "rewards/avg_1": 1.71875, "rewards/avg_2": 1.765625, "rewards/avg_3": 1.765625, "rewards/avg_4": 1.734375, "rewards/avg_5": 1.734375, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8062500059604645, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 10.1640625, "epoch": 0.017987633501967398, "kl": 1.4049744606018066, "learning_rate": 9.998257448004497e-07, "loss": -0.0031263651326298714, "ratio/all_0": 0.0625, "ratio/all_2": 0.640625, "reward": 1.8437500596046448, "reward_std": 0.8310911953449249, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.84375, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 10.075000286102295, "epoch": 0.01854974704890388, "kl": 1.3538335859775543, "learning_rate": 9.998201236649804e-07, "loss": -0.032431505620479584, "ratio/all_0": 0.09375, "ratio/all_2": 0.421875, "reward": 1.7656250298023224, "reward_std": 0.7300598323345184, "rewards/avg_0": 1.625, "rewards/avg_1": 1.75, "rewards/avg_2": 1.734375, "rewards/avg_3": 1.703125, "rewards/avg_4": 1.640625, "rewards/avg_5": 1.71875, "rewards/avg_6": 1.71875, "rewards/avg_7": 1.765625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.7656250149011612, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 10.176562547683716, "epoch": 0.01911186059584036, "kl": 1.4589167535305023, "learning_rate": 9.99814502529511e-07, "loss": 0.011930462904274464, "ratio/all_0": 0.03125, "ratio/all_2": 0.6875, "reward": 1.885937511920929, "reward_std": 0.8517080992460251, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.875, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8859375268220901, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 10.060937881469727, "epoch": 0.01967397414277684, "kl": 1.5603070259094238, "learning_rate": 9.998088813940415e-07, "loss": -0.02111126109957695, "ratio/all_0": 0.03125, "ratio/all_2": 0.53125, "reward": 1.8140625357627869, "reward_std": 0.7738387882709503, "rewards/avg_0": 1.65625, "rewards/avg_1": 1.765625, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.765625, "rewards/avg_7": 1.765625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8156249970197678, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 10.029687643051147, "epoch": 0.02023608768971332, "kl": 1.587103694677353, "learning_rate": 9.998032602585722e-07, "loss": 0.02595140039920807, "ratio/all_0": 0.0625, "ratio/all_2": 0.609375, "reward": 1.8718750476837158, "reward_std": 0.8129217773675919, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.871874988079071, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 10.100000143051147, "epoch": 0.020798201236649803, "kl": 1.5513676702976227, "learning_rate": 9.997976391231028e-07, "loss": -0.009216181933879852, "ratio/all_0": 0.015625, "ratio/all_2": 0.578125, "reward": 1.857812523841858, "reward_std": 0.7749081701040268, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.75, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8578125238418579, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 10.182812452316284, "epoch": 0.021360314783586284, "kl": 1.4709611535072327, "learning_rate": 9.997920179876335e-07, "loss": -0.029708366841077805, "ratio/all_0": 0.046875, "ratio/all_2": 0.625, "reward": 1.8078124821186066, "reward_std": 0.8228506743907928, "rewards/avg_0": 1.765625, "rewards/avg_1": 1.75, "rewards/avg_2": 1.71875, "rewards/avg_3": 1.75, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.75, "rewards/avg_7": 1.765625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8078125268220901, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 10.150000095367432, "epoch": 0.021922428330522766, "kl": 1.5169185996055603, "learning_rate": 9.99786396852164e-07, "loss": 0.017741110175848007, "ratio/all_0": 0.03125, "ratio/all_2": 0.578125, "reward": 1.8906250298023224, "reward_std": 0.7767912149429321, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8906250149011612, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 10.046875238418579, "epoch": 0.022484541877459248, "kl": 1.4632358253002167, "learning_rate": 9.997807757166947e-07, "loss": -0.03963226079940796, "ratio/all_0": 0.046875, "ratio/all_2": 0.53125, "reward": 1.803125023841858, "reward_std": 0.7558850347995758, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.71875, "rewards/avg_2": 1.734375, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.734375, "rewards/avg_5": 1.75, "rewards/avg_6": 1.765625, "rewards/avg_7": 1.765625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8046875149011612, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 10.187500238418579, "epoch": 0.02304665542439573, "kl": 1.3824447095394135, "learning_rate": 9.997751545812253e-07, "loss": 0.010992420837283134, "ratio/all_0": 0.078125, "ratio/all_2": 0.671875, "reward": 1.8593750298023224, "reward_std": 0.8612623810768127, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8609375059604645, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 10.057812690734863, "epoch": 0.023608768971332208, "kl": 1.5909472107887268, "learning_rate": 9.99769533445756e-07, "loss": -0.013673016801476479, "ratio/all_0": 0.046875, "ratio/all_2": 0.53125, "reward": 1.823437511920929, "reward_std": 0.7688121795654297, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.75, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.75, "rewards/avg_4": 1.75, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.75, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.823437511920929, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 10.285937547683716, "epoch": 0.02417088251826869, "kl": 1.4693297147750854, "learning_rate": 9.997639123102867e-07, "loss": -0.0068480633199214935, "ratio/all_0": 0.03125, "ratio/all_2": 0.53125, "reward": 1.8453125357627869, "reward_std": 0.7629244476556778, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.765625, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8453125059604645, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 10.30625033378601, "epoch": 0.02473299606520517, "kl": 1.4473371505737305, "learning_rate": 9.997582911748172e-07, "loss": -0.00031005823984742165, "ratio/all_0": 0.078125, "ratio/all_2": 0.53125, "reward": 1.8234375417232513, "reward_std": 0.7852741330862045, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.75, "rewards/avg_6": 1.765625, "rewards/avg_7": 1.734375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.823437511920929, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 10.312500476837158, "epoch": 0.025295109612141653, "kl": 1.4984396696090698, "learning_rate": 9.997526700393478e-07, "loss": -0.007865441963076591, "ratio/all_0": 0.046875, "ratio/all_2": 0.453125, "reward": 1.8234375417232513, "reward_std": 0.7250258326530457, "rewards/avg_0": 1.765625, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.765625, "rewards/avg_5": 1.765625, "rewards/avg_6": 1.78125, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.823437511920929, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 10.32968783378601, "epoch": 0.025857223159078135, "kl": 1.4959508776664734, "learning_rate": 9.997470489038785e-07, "loss": 0.0012173890136182308, "ratio/all_0": 0.09375, "ratio/all_2": 0.609375, "reward": 1.8203125298023224, "reward_std": 0.8282635509967804, "rewards/avg_0": 1.734375, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.75, "rewards/avg_5": 1.75, "rewards/avg_6": 1.765625, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8203125, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 10.228125095367432, "epoch": 0.026419336706014616, "kl": 1.4259069859981537, "learning_rate": 9.997414277684092e-07, "loss": -0.006716692354530096, "ratio/all_0": 0.078125, "ratio/all_2": 0.484375, "reward": 1.8125, "reward_std": 0.7544415444135666, "rewards/avg_0": 1.765625, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.75, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.75, "rewards/avg_6": 1.734375, "rewards/avg_7": 1.75, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8125000298023224, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 10.353125095367432, "epoch": 0.026981450252951095, "kl": 1.383703500032425, "learning_rate": 9.997358066329399e-07, "loss": -0.014200778678059578, "ratio/all_0": 0.046875, "ratio/all_2": 0.515625, "reward": 1.8359375, "reward_std": 0.7453095018863678, "rewards/avg_0": 1.75, "rewards/avg_1": 1.765625, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8375000059604645, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 10.187500476837158, "epoch": 0.027543563799887576, "kl": 1.6698559820652008, "learning_rate": 9.997301854974706e-07, "loss": 0.007298845797777176, "ratio/all_0": 0.078125, "ratio/all_2": 0.546875, "reward": 1.8218750059604645, "reward_std": 0.7926801592111588, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.671875, "rewards/avg_3": 1.75, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.734375, "rewards/avg_6": 1.765625, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8234375268220901, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 10.160937547683716, "epoch": 0.028105677346824058, "kl": 1.2673736810684204, "learning_rate": 9.99724564362001e-07, "loss": -0.008520995266735554, "ratio/all_0": 0.078125, "ratio/all_2": 0.515625, "reward": 1.8296875059604645, "reward_std": 0.7676666676998138, "rewards/avg_0": 1.734375, "rewards/avg_1": 1.765625, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.765625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8312499970197678, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 10.223437547683716, "epoch": 0.02866779089376054, "kl": 1.329988956451416, "learning_rate": 9.997189432265317e-07, "loss": -0.023494983091950417, "ratio/all_0": 0.046875, "ratio/all_2": 0.5, "reward": 1.815625011920929, "reward_std": 0.7471684217453003, "rewards/avg_0": 1.75, "rewards/avg_1": 1.75, "rewards/avg_2": 1.75, "rewards/avg_3": 1.734375, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.75, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.815625011920929, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 10.117187976837158, "epoch": 0.02922990444069702, "kl": 1.569214403629303, "learning_rate": 9.997133220910624e-07, "loss": -0.0041077700443565845, "ratio/all_0": 0.078125, "ratio/all_2": 0.546875, "reward": 1.8109375536441803, "reward_std": 0.7974056601524353, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.75, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.71875, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.75, "rewards/avg_6": 1.734375, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8109375089406967, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 10.22031283378601, "epoch": 0.029792017987633503, "kl": 1.572588711977005, "learning_rate": 9.99707700955593e-07, "loss": -0.012450341135263443, "ratio/all_0": 0.03125, "ratio/all_2": 0.5, "reward": 1.8390625417232513, "reward_std": 0.7325338274240494, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.75, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.765625, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.839062511920929, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 10.168750524520874, "epoch": 0.03035413153456998, "kl": 1.601712167263031, "learning_rate": 9.997020798201237e-07, "loss": 0.008456533774733543, "ratio/all_0": 0.015625, "ratio/all_2": 0.65625, "reward": 1.8796875178813934, "reward_std": 0.821140706539154, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.765625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8796875029802322, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 10.239062547683716, "epoch": 0.030916245081506463, "kl": 1.5150468349456787, "learning_rate": 9.996964586846542e-07, "loss": -0.010614115744829178, "ratio/all_0": 0.078125, "ratio/all_2": 0.5, "reward": 1.7984375059604645, "reward_std": 0.773614913225174, "rewards/avg_0": 1.734375, "rewards/avg_1": 1.765625, "rewards/avg_2": 1.734375, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.71875, "rewards/avg_5": 1.734375, "rewards/avg_6": 1.703125, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.7984375059604645, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 10.107812643051147, "epoch": 0.03147835862844295, "kl": 1.594663679599762, "learning_rate": 9.996908375491849e-07, "loss": 0.005751887336373329, "ratio/all_0": 0.0625, "ratio/all_2": 0.484375, "reward": 1.8390624821186066, "reward_std": 0.73665352165699, "rewards/avg_0": 1.765625, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.75, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8390625268220901, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 10.292187690734863, "epoch": 0.03204047217537943, "kl": 1.5536901950836182, "learning_rate": 9.996852164137155e-07, "loss": 0.016127917915582657, "ratio/all_0": 0.0, "ratio/all_2": 0.671875, "reward": 1.9140625, "reward_std": 0.8086505830287933, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.96875, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9140625149011612, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 10.364062547683716, "epoch": 0.032602585722315905, "kl": 1.610331416130066, "learning_rate": 9.996795952782462e-07, "loss": -0.0014691497199237347, "ratio/all_0": 0.015625, "ratio/all_2": 0.5625, "reward": 1.8609375357627869, "reward_std": 0.7582378685474396, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8609375059604645, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 10.270312786102295, "epoch": 0.03316469926925239, "kl": 1.541633427143097, "learning_rate": 9.996739741427767e-07, "loss": 0.01057470217347145, "ratio/all_0": 0.03125, "ratio/all_2": 0.546875, "reward": 1.8750000298023224, "reward_std": 0.7591521888971329, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.875, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 10.290625095367432, "epoch": 0.03372681281618887, "kl": 1.5340014100074768, "learning_rate": 9.996683530073074e-07, "loss": 0.009979034774005413, "ratio/all_0": 0.03125, "ratio/all_2": 0.515625, "reward": 1.8750000298023224, "reward_std": 0.7346808016300201, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.765625, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8750000149011612, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 10.28125, "epoch": 0.034288926363125354, "kl": 1.5578001737594604, "learning_rate": 9.99662731871838e-07, "loss": -0.021913019940257072, "ratio/all_0": 0.046875, "ratio/all_2": 0.546875, "reward": 1.8281250298023224, "reward_std": 0.7614506334066391, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.765625, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.75, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.765625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8296875059604645, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 10.281250238418579, "epoch": 0.03485103991006183, "kl": 1.4720918536186218, "learning_rate": 9.996571107363687e-07, "loss": -0.0258667953312397, "ratio/all_0": 0.046875, "ratio/all_2": 0.46875, "reward": 1.8062500357627869, "reward_std": 0.7150072902441025, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.703125, "rewards/avg_2": 1.75, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.75, "rewards/avg_5": 1.734375, "rewards/avg_6": 1.703125, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8062500059604645, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 10.309375286102295, "epoch": 0.03541315345699832, "kl": 1.5031334161758423, "learning_rate": 9.996514896008992e-07, "loss": 0.0031455708667635918, "ratio/all_0": 0.0625, "ratio/all_2": 0.625, "reward": 1.8453125357627869, "reward_std": 0.8244078755378723, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.78125, "rewards/avg_7": 1.765625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8453125059604645, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 10.251562595367432, "epoch": 0.035975267003934795, "kl": 1.509196251630783, "learning_rate": 9.9964586846543e-07, "loss": -0.022226005792617798, "ratio/all_0": 0.015625, "ratio/all_2": 0.578125, "reward": 1.842187523841858, "reward_std": 0.7700472772121429, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.765625, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.75, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8421875089406967, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 10.271875143051147, "epoch": 0.03653738055087127, "kl": 1.5132167041301727, "learning_rate": 9.996402473299608e-07, "loss": 0.0028878115117549896, "ratio/all_0": 0.015625, "ratio/all_2": 0.578125, "reward": 1.873437523841858, "reward_std": 0.7714907974004745, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8734375238418579, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 10.362500429153442, "epoch": 0.03709949409780776, "kl": 1.5417117476463318, "learning_rate": 9.996346261944912e-07, "loss": 0.0048205191269516945, "ratio/all_0": 0.0, "ratio/all_2": 0.5625, "reward": 1.8921875059604645, "reward_std": 0.7501703798770905, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8921875208616257, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 10.425000190734863, "epoch": 0.03766160764474424, "kl": 1.5405822396278381, "learning_rate": 9.996290050590219e-07, "loss": -7.154536433517933e-06, "ratio/all_0": 0.078125, "ratio/all_2": 0.484375, "reward": 1.8203125298023224, "reward_std": 0.7464889883995056, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.765625, "rewards/avg_2": 1.734375, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.734375, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8203125149011612, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 10.325000286102295, "epoch": 0.03822372119168072, "kl": 1.4785286784172058, "learning_rate": 9.996233839235526e-07, "loss": -0.049676112830638885, "ratio/all_0": 0.015625, "ratio/all_2": 0.4375, "reward": 1.792187511920929, "reward_std": 0.6961659491062164, "rewards/avg_0": 1.71875, "rewards/avg_1": 1.734375, "rewards/avg_2": 1.734375, "rewards/avg_3": 1.765625, "rewards/avg_4": 1.765625, "rewards/avg_5": 1.75, "rewards/avg_6": 1.734375, "rewards/avg_7": 1.71875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.7921874970197678, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 10.375, "epoch": 0.0387858347386172, "kl": 1.439880520105362, "learning_rate": 9.996177627880832e-07, "loss": -0.0004722205922007561, "ratio/all_0": 0.015625, "ratio/all_2": 0.515625, "reward": 1.8671875298023224, "reward_std": 0.7345052808523178, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8687500059604645, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 10.292187690734863, "epoch": 0.03934794828555368, "kl": 1.4667574167251587, "learning_rate": 9.996121416526137e-07, "loss": -0.004125084728002548, "ratio/all_0": 0.0625, "ratio/all_2": 0.484375, "reward": 1.826562523841858, "reward_std": 0.7579498142004013, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.71875, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.734375, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.703125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8265625089406967, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 10.384375095367432, "epoch": 0.039910061832490164, "kl": 1.4546357691287994, "learning_rate": 9.996065205171444e-07, "loss": -0.0041849445551633835, "ratio/all_0": 0.046875, "ratio/all_2": 0.5, "reward": 1.8359375, "reward_std": 0.7519094198942184, "rewards/avg_0": 1.734375, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.765625, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.734375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8359375149011612, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 10.379687547683716, "epoch": 0.04047217537942664, "kl": 1.4776123464107513, "learning_rate": 9.99600899381675e-07, "loss": 0.012721182778477669, "ratio/all_0": 0.09375, "ratio/all_2": 0.5625, "reward": 1.8375000357627869, "reward_std": 0.7974309027194977, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.875, "rewards/avg_4": 1.734375, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8375000059604645, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 10.325000047683716, "epoch": 0.04103428892636313, "kl": 1.4323228895664215, "learning_rate": 9.995952782462057e-07, "loss": -0.0011236064601689577, "ratio/all_0": 0.015625, "ratio/all_2": 0.625, "reward": 1.8796875178813934, "reward_std": 0.7972553670406342, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8796875178813934, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 10.217187643051147, "epoch": 0.041596402473299605, "kl": 1.5072983801364899, "learning_rate": 9.995896571107362e-07, "loss": -0.02287622168660164, "ratio/all_0": 0.046875, "ratio/all_2": 0.5, "reward": 1.8250000476837158, "reward_std": 0.7327965945005417, "rewards/avg_0": 1.734375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.75, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.734375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8265625089406967, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 10.421875238418579, "epoch": 0.04215851602023609, "kl": 1.5226522088050842, "learning_rate": 9.995840359752669e-07, "loss": 0.014585795812308788, "ratio/all_0": 0.0625, "ratio/all_2": 0.59375, "reward": 1.8640625476837158, "reward_std": 0.8066390603780746, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8656250089406967, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 10.390625, "epoch": 0.04272062956717257, "kl": 1.5350313782691956, "learning_rate": 9.995784148397976e-07, "loss": 0.0005742218345403671, "ratio/all_0": 0.0625, "ratio/all_2": 0.53125, "reward": 1.831250011920929, "reward_std": 0.7712255567312241, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.6875, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.831250011920929, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 10.404687643051147, "epoch": 0.04328274311410905, "kl": 1.487417221069336, "learning_rate": 9.995727937043282e-07, "loss": -0.0033020656555891037, "ratio/all_0": 0.015625, "ratio/all_2": 0.5625, "reward": 1.865625023841858, "reward_std": 0.7682248502969742, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.875, "rewards/avg_7": 1.765625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8656250089406967, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 10.354687452316284, "epoch": 0.04384485666104553, "kl": 1.5165870487689972, "learning_rate": 9.99567172568859e-07, "loss": -0.024435177445411682, "ratio/all_0": 0.0625, "ratio/all_2": 0.453125, "reward": 1.7906250357627869, "reward_std": 0.7295899838209152, "rewards/avg_0": 1.765625, "rewards/avg_1": 1.734375, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.671875, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.703125, "rewards/avg_6": 1.765625, "rewards/avg_7": 1.703125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.7906250059604645, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 10.387500286102295, "epoch": 0.04440697020798201, "kl": 1.452039659023285, "learning_rate": 9.995615514333896e-07, "loss": -0.03055991232395172, "ratio/all_0": 0.03125, "ratio/all_2": 0.46875, "reward": 1.803125023841858, "reward_std": 0.725995808839798, "rewards/avg_0": 1.765625, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.703125, "rewards/avg_3": 1.75, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.734375, "rewards/avg_6": 1.71875, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8031250089406967, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 10.582812786102295, "epoch": 0.044969083754918496, "kl": 1.5275309681892395, "learning_rate": 9.995559302979203e-07, "loss": 0.016731752082705498, "ratio/all_0": 0.015625, "ratio/all_2": 0.703125, "reward": 1.9000000357627869, "reward_std": 0.848938599228859, "rewards/avg_0": 1.875, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9000000059604645, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 10.4609375, "epoch": 0.045531197301854974, "kl": 1.53338223695755, "learning_rate": 9.995503091624507e-07, "loss": -0.010927405208349228, "ratio/all_0": 0.015625, "ratio/all_2": 0.59375, "reward": 1.8546875417232513, "reward_std": 0.7863930761814117, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8546874970197678, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 10.557812690734863, "epoch": 0.04609331084879146, "kl": 1.5185647904872894, "learning_rate": 9.995446880269814e-07, "loss": 0.02417963184416294, "ratio/all_0": 0.015625, "ratio/all_2": 0.65625, "reward": 1.910937488079071, "reward_std": 0.8183117806911469, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9109375327825546, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 10.412499904632568, "epoch": 0.04665542439572794, "kl": 1.5981625616550446, "learning_rate": 9.99539066891512e-07, "loss": 0.014372358098626137, "ratio/all_0": 0.046875, "ratio/all_2": 0.609375, "reward": 1.8687500357627869, "reward_std": 0.7999352663755417, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8687500059604645, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 10.357812643051147, "epoch": 0.047217537942664416, "kl": 1.584965080022812, "learning_rate": 9.995334457560428e-07, "loss": 0.0024953281972557306, "ratio/all_0": 0.03125, "ratio/all_2": 0.59375, "reward": 1.8640625178813934, "reward_std": 0.7857124209403992, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8640625029802322, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 10.412500143051147, "epoch": 0.0477796514896009, "kl": 1.5639266967773438, "learning_rate": 9.995278246205732e-07, "loss": 0.014907699078321457, "ratio/all_0": 0.015625, "ratio/all_2": 0.625, "reward": 1.8984375298023224, "reward_std": 0.787330225110054, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8984375, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 10.464062690734863, "epoch": 0.04834176503653738, "kl": 1.5700583755970001, "learning_rate": 9.99522203485104e-07, "loss": 0.02378089912235737, "ratio/all_0": 0.046875, "ratio/all_2": 0.625, "reward": 1.8828125298023224, "reward_std": 0.8185782730579376, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8828125, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 10.39218783378601, "epoch": 0.048903878583473864, "kl": 1.5734200477600098, "learning_rate": 9.995165823496346e-07, "loss": -0.0061425757594406605, "ratio/all_0": 0.046875, "ratio/all_2": 0.53125, "reward": 1.8328125178813934, "reward_std": 0.7643389850854874, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.765625, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.71875, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.734375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8343750089406967, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 10.504687547683716, "epoch": 0.04946599213041034, "kl": 1.4525836110115051, "learning_rate": 9.995109612141653e-07, "loss": 0.030096255242824554, "ratio/all_0": 0.0, "ratio/all_2": 0.640625, "reward": 1.9406250417232513, "reward_std": 0.7833255678415298, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.984375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.96875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9406249970197678, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 10.453125, "epoch": 0.05002810567734682, "kl": 1.550961047410965, "learning_rate": 9.995053400786957e-07, "loss": 0.01746845617890358, "ratio/all_0": 0.03125, "ratio/all_2": 0.6875, "reward": 1.8875000476837158, "reward_std": 0.8394832015037537, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.887499988079071, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 10.529687881469727, "epoch": 0.050590219224283306, "kl": 1.5564002692699432, "learning_rate": 9.994997189432264e-07, "loss": 0.009959584102034569, "ratio/all_0": 0.03125, "ratio/all_2": 0.5625, "reward": 1.8703125417232513, "reward_std": 0.7675454616546631, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.870312511920929, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 10.375000238418579, "epoch": 0.051152332771219784, "kl": 1.5659159719944, "learning_rate": 9.99494097807757e-07, "loss": 0.02350633218884468, "ratio/all_0": 0.046875, "ratio/all_2": 0.65625, "reward": 1.8843750357627869, "reward_std": 0.8328011333942413, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8843750208616257, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 10.443750381469727, "epoch": 0.05171444631815627, "kl": 1.5553985238075256, "learning_rate": 9.994884766722878e-07, "loss": 0.011613225564360619, "ratio/all_0": 0.03125, "ratio/all_2": 0.640625, "reward": 1.8828125298023224, "reward_std": 0.8189027905464172, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8828125149011612, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 10.3828125, "epoch": 0.05227655986509275, "kl": 1.5122833847999573, "learning_rate": 9.994828555368184e-07, "loss": 0.03290847688913345, "ratio/all_0": 0.015625, "ratio/all_2": 0.6875, "reward": 1.9328125417232513, "reward_std": 0.8211647868156433, "rewards/avg_0": 1.9375, "rewards/avg_1": 1.953125, "rewards/avg_2": 1.875, "rewards/avg_3": 1.9375, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.932812511920929, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 10.500000238418579, "epoch": 0.05283867341202923, "kl": 1.4713640809059143, "learning_rate": 9.994772344013491e-07, "loss": 0.025454115122556686, "ratio/all_0": 0.015625, "ratio/all_2": 0.65625, "reward": 1.9218750298023224, "reward_std": 0.8077968657016754, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.953125, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.875, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.921875, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 10.546875238418579, "epoch": 0.05340078695896571, "kl": 1.5354516804218292, "learning_rate": 9.994716132658798e-07, "loss": -0.03127738833427429, "ratio/all_0": 0.0, "ratio/all_2": 0.5, "reward": 1.8312500417232513, "reward_std": 0.7202664166688919, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.75, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.734375, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.765625, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8328125029802322, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 10.439062595367432, "epoch": 0.05396290050590219, "kl": 1.5282418131828308, "learning_rate": 9.994659921304103e-07, "loss": 0.012253593653440475, "ratio/all_0": 0.0, "ratio/all_2": 0.65625, "reward": 1.904687523841858, "reward_std": 0.801643967628479, "rewards/avg_0": 1.875, "rewards/avg_1": 1.875, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9046875089406967, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 10.551562547683716, "epoch": 0.054525014052838675, "kl": 1.4728964567184448, "learning_rate": 9.99460370994941e-07, "loss": -0.01994449459016323, "ratio/all_0": 0.0625, "ratio/all_2": 0.578125, "reward": 1.807812511920929, "reward_std": 0.804884597659111, "rewards/avg_0": 1.734375, "rewards/avg_1": 1.765625, "rewards/avg_2": 1.765625, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.75, "rewards/avg_6": 1.78125, "rewards/avg_7": 1.71875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.807812511920929, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 10.481250286102295, "epoch": 0.05508712759977515, "kl": 1.5632469356060028, "learning_rate": 9.994547498594716e-07, "loss": 0.020106907933950424, "ratio/all_0": 0.046875, "ratio/all_2": 0.609375, "reward": 1.873437523841858, "reward_std": 0.8139888793230057, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.875, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.75, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8750000149011612, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 10.428125143051147, "epoch": 0.05564924114671164, "kl": 1.521868348121643, "learning_rate": 9.994491287240023e-07, "loss": 0.0054258364252746105, "ratio/all_0": 0.046875, "ratio/all_2": 0.5, "reward": 1.846875011920929, "reward_std": 0.7462553530931473, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.765625, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.78125, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.846875011920929, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 10.45468783378601, "epoch": 0.056211354693648116, "kl": 1.505905568599701, "learning_rate": 9.994435075885328e-07, "loss": 0.008497299626469612, "ratio/all_0": 0.03125, "ratio/all_2": 0.5625, "reward": 1.8718750178813934, "reward_std": 0.7737879902124405, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8718750178813934, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 10.315625429153442, "epoch": 0.0567734682405846, "kl": 1.5346179604530334, "learning_rate": 9.994378864530634e-07, "loss": -0.004003023728728294, "ratio/all_0": 0.03125, "ratio/all_2": 0.578125, "reward": 1.8593750298023224, "reward_std": 0.7765247821807861, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.734375, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.875, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8593750149011612, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 10.559375047683716, "epoch": 0.05733558178752108, "kl": 1.5174490213394165, "learning_rate": 9.994322653175941e-07, "loss": -0.007914504036307335, "ratio/all_0": 0.03125, "ratio/all_2": 0.5, "reward": 1.8375000059604645, "reward_std": 0.7441652566194534, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.765625, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8375000059604645, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 10.350000143051147, "epoch": 0.05789769533445756, "kl": 1.5438537895679474, "learning_rate": 9.994266441821248e-07, "loss": -0.005718717817217112, "ratio/all_0": 0.0, "ratio/all_2": 0.59375, "reward": 1.8765625357627869, "reward_std": 0.7701685428619385, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.75, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8781249970197678, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 10.517187595367432, "epoch": 0.05845980888139404, "kl": 1.5379104316234589, "learning_rate": 9.994210230466555e-07, "loss": -0.0068434132263064384, "ratio/all_0": 0.0, "ratio/all_2": 0.5625, "reward": 1.8796875178813934, "reward_std": 0.7376827597618103, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8796875178813934, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 10.384375095367432, "epoch": 0.05902192242833052, "kl": 1.5371789634227753, "learning_rate": 9.99415401911186e-07, "loss": 0.011498227715492249, "ratio/all_0": 0.015625, "ratio/all_2": 0.625, "reward": 1.8906250298023224, "reward_std": 0.7910128086805344, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.890625, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 10.320312738418579, "epoch": 0.059584035975267007, "kl": 1.4497211277484894, "learning_rate": 9.994097807757166e-07, "loss": 0.023686472326517105, "ratio/all_0": 0.09375, "ratio/all_2": 0.609375, "reward": 1.854687511920929, "reward_std": 0.8297070264816284, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.854687511920929, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 10.340625286102295, "epoch": 0.060146149522203485, "kl": 1.56570765376091, "learning_rate": 9.994041596402473e-07, "loss": -0.023295482620596886, "ratio/all_0": 0.03125, "ratio/all_2": 0.4375, "reward": 1.8187500536441803, "reward_std": 0.6901557743549347, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.703125, "rewards/avg_3": 1.765625, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.765625, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.75, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8187500089406967, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 10.503125190734863, "epoch": 0.06070826306913996, "kl": 1.4991244673728943, "learning_rate": 9.99398538504778e-07, "loss": -0.0025712046772241592, "ratio/all_0": 0.03125, "ratio/all_2": 0.5625, "reward": 1.8531250059604645, "reward_std": 0.7726690620183945, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8531250208616257, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 10.43906283378601, "epoch": 0.06127037661607645, "kl": 1.52072274684906, "learning_rate": 9.993929173693084e-07, "loss": -0.0034492649137973785, "ratio/all_0": 0.03125, "ratio/all_2": 0.59375, "reward": 1.859375, "reward_std": 0.7871571332216263, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.765625, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8593750149011612, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 10.625000476837158, "epoch": 0.061832490163012926, "kl": 1.4775553047657013, "learning_rate": 9.993872962338393e-07, "loss": -0.004010038450360298, "ratio/all_0": 0.015625, "ratio/all_2": 0.5, "reward": 1.8609375655651093, "reward_std": 0.7292641997337341, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8609375059604645, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 10.553124904632568, "epoch": 0.06239460370994941, "kl": 1.537141591310501, "learning_rate": 9.993816750983698e-07, "loss": 0.012600191868841648, "ratio/all_0": 0.0625, "ratio/all_2": 0.515625, "reward": 1.8531250357627869, "reward_std": 0.7582390755414963, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.875, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8531250059604645, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 10.557812452316284, "epoch": 0.0629567172568859, "kl": 1.4744674563407898, "learning_rate": 9.993760539629005e-07, "loss": 0.017602285370230675, "ratio/all_0": 0.078125, "ratio/all_2": 0.609375, "reward": 1.8593750298023224, "reward_std": 0.8213162273168564, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.78125, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8593750149011612, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 10.635937452316284, "epoch": 0.06351883080382237, "kl": 1.5100781619548798, "learning_rate": 9.993704328274311e-07, "loss": -0.005892171524465084, "ratio/all_0": 0.046875, "ratio/all_2": 0.5, "reward": 1.8359375, "reward_std": 0.7448712140321732, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.765625, "rewards/avg_2": 1.75, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.765625, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8359375298023224, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 10.535937786102295, "epoch": 0.06408094435075885, "kl": 1.4870060682296753, "learning_rate": 9.993648116919618e-07, "loss": -0.0005512246862053871, "ratio/all_0": 0.046875, "ratio/all_2": 0.53125, "reward": 1.850000023841858, "reward_std": 0.7565291672945023, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.75, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8500000238418579, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 10.523437738418579, "epoch": 0.06464305789769534, "kl": 1.5237656235694885, "learning_rate": 9.993591905564923e-07, "loss": 0.0005614343099296093, "ratio/all_0": 0.03125, "ratio/all_2": 0.53125, "reward": 1.8578125536441803, "reward_std": 0.745839923620224, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8578124940395355, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 10.535937547683716, "epoch": 0.06520517144463181, "kl": 1.5184479653835297, "learning_rate": 9.99353569421023e-07, "loss": 0.018243275582790375, "ratio/all_0": 0.03125, "ratio/all_2": 0.625, "reward": 1.8921875357627869, "reward_std": 0.8082122951745987, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8921875059604645, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 10.553125381469727, "epoch": 0.0657672849915683, "kl": 1.5549254715442657, "learning_rate": 9.993479482855536e-07, "loss": 0.029085306450724602, "ratio/all_0": 0.03125, "ratio/all_2": 0.6875, "reward": 1.9109375178813934, "reward_std": 0.8369801193475723, "rewards/avg_0": 1.875, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9109375178813934, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 10.506250143051147, "epoch": 0.06632939853850478, "kl": 1.5414323210716248, "learning_rate": 9.993423271500843e-07, "loss": 0.011565025895833969, "ratio/all_0": 0.046875, "ratio/all_2": 0.578125, "reward": 1.8671875298023224, "reward_std": 0.7845959216356277, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8671875, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 10.417187690734863, "epoch": 0.06689151208544127, "kl": 1.533017873764038, "learning_rate": 9.99336706014615e-07, "loss": 0.0029391534626483917, "ratio/all_0": 0.03125, "ratio/all_2": 0.625, "reward": 1.865625023841858, "reward_std": 0.8084775656461716, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.75, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8656250089406967, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 10.514062643051147, "epoch": 0.06745362563237774, "kl": 1.5064394176006317, "learning_rate": 9.993310848791455e-07, "loss": -0.0017734202556312084, "ratio/all_0": 0.03125, "ratio/all_2": 0.53125, "reward": 1.865625023841858, "reward_std": 0.7416293770074844, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.875, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8656250089406967, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 10.545312643051147, "epoch": 0.06801573917931422, "kl": 1.5602343082427979, "learning_rate": 9.993254637436761e-07, "loss": -0.006767038721591234, "ratio/all_0": 0.0625, "ratio/all_2": 0.5, "reward": 1.8250000178813934, "reward_std": 0.7424818426370621, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.765625, "rewards/avg_2": 1.765625, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.734375, "rewards/avg_6": 1.765625, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8250000029802322, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 10.515625, "epoch": 0.06857785272625071, "kl": 1.5196544229984283, "learning_rate": 9.993198426082068e-07, "loss": -0.00423224875703454, "ratio/all_0": 0.0625, "ratio/all_2": 0.53125, "reward": 1.8250000178813934, "reward_std": 0.7697820663452148, "rewards/avg_0": 1.75, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.71875, "rewards/avg_7": 1.765625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8250000178813934, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 10.515625238418579, "epoch": 0.06913996627318718, "kl": 1.453963041305542, "learning_rate": 9.993142214727375e-07, "loss": -0.01648056134581566, "ratio/all_0": 0.03125, "ratio/all_2": 0.515625, "reward": 1.8375000357627869, "reward_std": 0.7505858242511749, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.734375, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.75, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8375000059604645, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 10.528125047683716, "epoch": 0.06970207982012366, "kl": 1.4716617166996002, "learning_rate": 9.99308600337268e-07, "loss": 0.02104857563972473, "ratio/all_0": 0.078125, "ratio/all_2": 0.53125, "reward": 1.857812523841858, "reward_std": 0.7756710201501846, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.765625, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8578125089406967, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 10.478125095367432, "epoch": 0.07026419336706015, "kl": 1.5574918985366821, "learning_rate": 9.993029792017988e-07, "loss": -0.0042773825116455555, "ratio/all_0": 0.046875, "ratio/all_2": 0.53125, "reward": 1.8406250178813934, "reward_std": 0.7628335356712341, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.765625, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.765625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.75, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8406250178813934, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 10.543750286102295, "epoch": 0.07082630691399663, "kl": 1.4953092634677887, "learning_rate": 9.992973580663293e-07, "loss": 0.021830586716532707, "ratio/all_0": 0.09375, "ratio/all_2": 0.53125, "reward": 1.8656249940395355, "reward_std": 0.7679849863052368, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8671875149011612, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 10.440625190734863, "epoch": 0.0713884204609331, "kl": 1.5018656253814697, "learning_rate": 9.9929173693086e-07, "loss": -0.0003001997247338295, "ratio/all_0": 0.03125, "ratio/all_2": 0.609375, "reward": 1.8640625178813934, "reward_std": 0.7929854989051819, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8640625178813934, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 10.370312690734863, "epoch": 0.07195053400786959, "kl": 1.4831855297088623, "learning_rate": 9.992861157953907e-07, "loss": 0.0010036500170826912, "ratio/all_0": 0.015625, "ratio/all_2": 0.609375, "reward": 1.8750000298023224, "reward_std": 0.79169100522995, "rewards/avg_0": 1.75, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.875, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.875, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8750000149011612, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 10.362500190734863, "epoch": 0.07251264755480608, "kl": 1.5014575123786926, "learning_rate": 9.992804946599213e-07, "loss": -0.005141148809343576, "ratio/all_0": 0.078125, "ratio/all_2": 0.515625, "reward": 1.8125, "reward_std": 0.7820057570934296, "rewards/avg_0": 1.765625, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.65625, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.765625, "rewards/avg_5": 1.765625, "rewards/avg_6": 1.734375, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8125000298023224, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 10.350000143051147, "epoch": 0.07307476110174255, "kl": 1.5373378098011017, "learning_rate": 9.99274873524452e-07, "loss": 0.018027551472187042, "ratio/all_0": 0.015625, "ratio/all_2": 0.625, "reward": 1.9062500298023224, "reward_std": 0.7975836396217346, "rewards/avg_0": 1.9375, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9062500149011612, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 10.303125381469727, "epoch": 0.07363687464867903, "kl": 1.5327981412410736, "learning_rate": 9.992692523889825e-07, "loss": -0.019817432388663292, "ratio/all_0": 0.03125, "ratio/all_2": 0.53125, "reward": 1.8281250298023224, "reward_std": 0.7495806366205215, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.765625, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.6875, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8281250149011612, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 10.450000047683716, "epoch": 0.07419898819561552, "kl": 1.5207752287387848, "learning_rate": 9.992636312535132e-07, "loss": 0.024002227932214737, "ratio/all_0": 0.0, "ratio/all_2": 0.6875, "reward": 1.9218750298023224, "reward_std": 0.8269690722227097, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.96875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9218750149011612, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 10.307812452316284, "epoch": 0.07476110174255199, "kl": 1.5719003975391388, "learning_rate": 9.992580101180438e-07, "loss": 0.026019521057605743, "ratio/all_0": 0.0625, "ratio/all_2": 0.640625, "reward": 1.8765625059604645, "reward_std": 0.830683246254921, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.875, "rewards/avg_4": 1.875, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.878125011920929, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 10.370312929153442, "epoch": 0.07532321528948847, "kl": 1.516474723815918, "learning_rate": 9.992523889825745e-07, "loss": -0.010382582433521748, "ratio/all_0": 0.03125, "ratio/all_2": 0.53125, "reward": 1.8406250476837158, "reward_std": 0.7487256675958633, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.75, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.75, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8406250029802322, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 10.26718783378601, "epoch": 0.07588532883642496, "kl": 1.586272120475769, "learning_rate": 9.99246767847105e-07, "loss": 0.007272321730852127, "ratio/all_0": 0.0625, "ratio/all_2": 0.59375, "reward": 1.846875011920929, "reward_std": 0.8077567517757416, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.75, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8484375178813934, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 10.437500238418579, "epoch": 0.07644744238336144, "kl": 1.5889216363430023, "learning_rate": 9.992411467116357e-07, "loss": 0.00854005478322506, "ratio/all_0": 0.015625, "ratio/all_2": 0.609375, "reward": 1.8828125298023224, "reward_std": 0.7845934629440308, "rewards/avg_0": 1.765625, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.875, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8828125149011612, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 10.453125238418579, "epoch": 0.07700955593029792, "kl": 1.5352471768856049, "learning_rate": 9.992355255761663e-07, "loss": 0.01773301512002945, "ratio/all_0": 0.03125, "ratio/all_2": 0.515625, "reward": 1.8843750059604645, "reward_std": 0.7357997745275497, "rewards/avg_0": 1.875, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8843750208616257, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 10.485937595367432, "epoch": 0.0775716694772344, "kl": 1.5447623431682587, "learning_rate": 9.99229904440697e-07, "loss": 0.012218187563121319, "ratio/all_0": 0.015625, "ratio/all_2": 0.625, "reward": 1.893750011920929, "reward_std": 0.79325070977211, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.875, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.893750011920929, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 10.45468783378601, "epoch": 0.07813378302417089, "kl": 1.6122995018959045, "learning_rate": 9.992242833052275e-07, "loss": -0.013333135284483433, "ratio/all_0": 0.046875, "ratio/all_2": 0.53125, "reward": 1.826562523841858, "reward_std": 0.7633627504110336, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.734375, "rewards/avg_5": 1.765625, "rewards/avg_6": 1.71875, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8265625089406967, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 10.484375, "epoch": 0.07869589657110736, "kl": 1.593853771686554, "learning_rate": 9.992186621697581e-07, "loss": 0.020144429057836533, "ratio/all_0": 0.078125, "ratio/all_2": 0.546875, "reward": 1.854687511920929, "reward_std": 0.7729583382606506, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.75, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8546874970197678, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 10.33750033378601, "epoch": 0.07925801011804384, "kl": 1.5863473117351532, "learning_rate": 9.992130410342888e-07, "loss": 0.015076573938131332, "ratio/all_0": 0.078125, "ratio/all_2": 0.5625, "reward": 1.8484375178813934, "reward_std": 0.7912780493497849, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8484375178813934, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 10.457812547683716, "epoch": 0.07982012366498033, "kl": 1.5253994762897491, "learning_rate": 9.992074198988195e-07, "loss": 0.02953893318772316, "ratio/all_0": 0.0625, "ratio/all_2": 0.671875, "reward": 1.8859375417232513, "reward_std": 0.8484649807214737, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.885937511920929, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 10.367187976837158, "epoch": 0.08038223721191681, "kl": 1.569258987903595, "learning_rate": 9.992017987633502e-07, "loss": 0.033501721918582916, "ratio/all_0": 0.03125, "ratio/all_2": 0.734375, "reward": 1.9156250357627869, "reward_std": 0.8672823309898376, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9156250059604645, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 10.559375286102295, "epoch": 0.08094435075885328, "kl": 1.5631862878799438, "learning_rate": 9.991961776278809e-07, "loss": -0.0034087556414306164, "ratio/all_0": 0.046875, "ratio/all_2": 0.546875, "reward": 1.8421874940395355, "reward_std": 0.7658355087041855, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.703125, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8421875238418579, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 10.410937786102295, "epoch": 0.08150646430578977, "kl": 1.4907237887382507, "learning_rate": 9.991905564924115e-07, "loss": 0.00786613579839468, "ratio/all_0": 0.03125, "ratio/all_2": 0.53125, "reward": 1.8671875298023224, "reward_std": 0.7537936717271805, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.765625, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8671875, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 10.439062595367432, "epoch": 0.08206857785272625, "kl": 1.4968520998954773, "learning_rate": 9.99184935356942e-07, "loss": 0.01350158266723156, "ratio/all_0": 0.03125, "ratio/all_2": 0.625, "reward": 1.9000000357627869, "reward_std": 0.7897035330533981, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9968750178813934, "rewards/point_reward": 0.9031250029802322, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 10.484375238418579, "epoch": 0.08263069139966273, "kl": 1.5990287363529205, "learning_rate": 9.991793142214727e-07, "loss": 0.031508635729551315, "ratio/all_0": 0.046875, "ratio/all_2": 0.625, "reward": 1.893750011920929, "reward_std": 0.8107470273971558, "rewards/avg_0": 1.875, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8953125029802322, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 10.526562929153442, "epoch": 0.08319280494659921, "kl": 1.5559196174144745, "learning_rate": 9.991736930860034e-07, "loss": 0.011043448001146317, "ratio/all_0": 0.046875, "ratio/all_2": 0.640625, "reward": 1.8718750178813934, "reward_std": 0.8150699585676193, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9968750029802322, "rewards/point_reward": 0.8750000149011612, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 10.415625095367432, "epoch": 0.0837549184935357, "kl": 1.5151457488536835, "learning_rate": 9.99168071950534e-07, "loss": 0.013916858471930027, "ratio/all_0": 0.046875, "ratio/all_2": 0.546875, "reward": 1.8671875, "reward_std": 0.7652469575405121, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8671875149011612, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 10.537499904632568, "epoch": 0.08431703204047218, "kl": 1.5213910639286041, "learning_rate": 9.991624508150645e-07, "loss": 0.02209000289440155, "ratio/all_0": 0.046875, "ratio/all_2": 0.609375, "reward": 1.8812500536441803, "reward_std": 0.806503638625145, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.875, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8812500089406967, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 10.379687786102295, "epoch": 0.08487914558740865, "kl": 1.5504529476165771, "learning_rate": 9.991568296795952e-07, "loss": 0.02314211055636406, "ratio/all_0": 0.078125, "ratio/all_2": 0.546875, "reward": 1.854687511920929, "reward_std": 0.7863009423017502, "rewards/avg_0": 1.765625, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.854687511920929, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 10.426562547683716, "epoch": 0.08544125913434514, "kl": 1.493497610092163, "learning_rate": 9.991512085441259e-07, "loss": 0.016514841467142105, "ratio/all_0": 0.015625, "ratio/all_2": 0.625, "reward": 1.9015625417232513, "reward_std": 0.7979805618524551, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.9375, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.9031250029802322, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 10.464062690734863, "epoch": 0.08600337268128162, "kl": 1.5301005244255066, "learning_rate": 9.991455874086565e-07, "loss": 0.019722461700439453, "ratio/all_0": 0.046875, "ratio/all_2": 0.640625, "reward": 1.8796875178813934, "reward_std": 0.8272948712110519, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8796875178813934, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 10.364062547683716, "epoch": 0.0865654862282181, "kl": 1.5509155094623566, "learning_rate": 9.99139966273187e-07, "loss": 0.010571800172328949, "ratio/all_0": 0.03125, "ratio/all_2": 0.59375, "reward": 1.8718750178813934, "reward_std": 0.7905745208263397, "rewards/avg_0": 1.875, "rewards/avg_1": 1.875, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8718750029802322, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 10.395312786102295, "epoch": 0.08712759977515458, "kl": 1.5481449365615845, "learning_rate": 9.991343451377177e-07, "loss": -0.004659176804125309, "ratio/all_0": 0.03125, "ratio/all_2": 0.5, "reward": 1.850000023841858, "reward_std": 0.7373934984207153, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.765625, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8500000089406967, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 10.460937738418579, "epoch": 0.08768971332209106, "kl": 1.5249327719211578, "learning_rate": 9.991287240022486e-07, "loss": 0.0059776403941214085, "ratio/all_0": 0.046875, "ratio/all_2": 0.53125, "reward": 1.8531250357627869, "reward_std": 0.7579726427793503, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.765625, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8531250208616257, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 10.409374952316284, "epoch": 0.08825182686902755, "kl": 1.5869796872138977, "learning_rate": 9.99123102866779e-07, "loss": 0.008219568058848381, "ratio/all_0": 0.0625, "ratio/all_2": 0.515625, "reward": 1.8406250476837158, "reward_std": 0.7582390904426575, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.734375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8406250178813934, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 10.471874952316284, "epoch": 0.08881394041596402, "kl": 1.4864901006221771, "learning_rate": 9.991174817313097e-07, "loss": 0.019063614308834076, "ratio/all_0": 0.078125, "ratio/all_2": 0.734375, "reward": 1.8781250417232513, "reward_std": 0.880060464143753, "rewards/avg_0": 1.875, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8796875029802322, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 10.4296875, "epoch": 0.0893760539629005, "kl": 1.5587861239910126, "learning_rate": 9.991118605958404e-07, "loss": 0.005818033590912819, "ratio/all_0": 0.046875, "ratio/all_2": 0.609375, "reward": 1.8562500178813934, "reward_std": 0.8070340156555176, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.765625, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8562500029802322, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 10.453125238418579, "epoch": 0.08993816750983699, "kl": 1.5292396545410156, "learning_rate": 9.99106239460371e-07, "loss": 0.03895500302314758, "ratio/all_0": 0.0625, "ratio/all_2": 0.734375, "reward": 1.904687523841858, "reward_std": 0.886305496096611, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9046875089406967, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 10.471874952316284, "epoch": 0.09050028105677346, "kl": 1.5067280530929565, "learning_rate": 9.991006183249015e-07, "loss": 0.012101344764232635, "ratio/all_0": 0.046875, "ratio/all_2": 0.578125, "reward": 1.8718750476837158, "reward_std": 0.7817658483982086, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8718750029802322, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 10.331250190734863, "epoch": 0.09106239460370995, "kl": 1.6083339750766754, "learning_rate": 9.990949971894322e-07, "loss": 0.014146502129733562, "ratio/all_0": 0.078125, "ratio/all_2": 0.5625, "reward": 1.8437500298023224, "reward_std": 0.7921317666769028, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.765625, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.75, "rewards/avg_5": 1.765625, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8437500149011612, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 10.457812786102295, "epoch": 0.09162450815064643, "kl": 1.5441269874572754, "learning_rate": 9.990893760539629e-07, "loss": 0.0075755673460662365, "ratio/all_0": 0.0625, "ratio/all_2": 0.625, "reward": 1.8453125357627869, "reward_std": 0.8307097852230072, "rewards/avg_0": 1.765625, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.765625, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8453125059604645, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 10.354687690734863, "epoch": 0.09218662169758292, "kl": 1.6081570386886597, "learning_rate": 9.990837549184936e-07, "loss": 0.028595969080924988, "ratio/all_0": 0.015625, "ratio/all_2": 0.65625, "reward": 1.9156250059604645, "reward_std": 0.8074723035097122, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.9375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9156250059604645, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 10.518750190734863, "epoch": 0.09274873524451939, "kl": 1.5291050374507904, "learning_rate": 9.99078133783024e-07, "loss": 0.008415000513195992, "ratio/all_0": 0.015625, "ratio/all_2": 0.578125, "reward": 1.889062523841858, "reward_std": 0.7609771192073822, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8890625089406967, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 10.365625143051147, "epoch": 0.09331084879145587, "kl": 1.6111627221107483, "learning_rate": 9.990725126475547e-07, "loss": 0.00948229618370533, "ratio/all_0": 0.03125, "ratio/all_2": 0.640625, "reward": 1.870312511920929, "reward_std": 0.8168683350086212, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.765625, "rewards/avg_4": 1.765625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8703124970197678, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 10.459375143051147, "epoch": 0.09387296233839236, "kl": 1.5411999225616455, "learning_rate": 9.990668915120854e-07, "loss": 0.018182864412665367, "ratio/all_0": 0.078125, "ratio/all_2": 0.640625, "reward": 1.854687511920929, "reward_std": 0.8434903770685196, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.854687511920929, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 10.379687786102295, "epoch": 0.09443507588532883, "kl": 1.5454858243465424, "learning_rate": 9.99061270376616e-07, "loss": 0.010270864702761173, "ratio/all_0": 0.03125, "ratio/all_2": 0.671875, "reward": 1.8828125, "reward_std": 0.8237272053956985, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8828125149011612, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 10.451562643051147, "epoch": 0.09499718943226532, "kl": 1.509293019771576, "learning_rate": 9.990556492411467e-07, "loss": 0.029691193252801895, "ratio/all_0": 0.0, "ratio/all_2": 0.71875, "reward": 1.9359374940395355, "reward_std": 0.8380990624427795, "rewards/avg_0": 1.96875, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.9375, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.9375, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9359375238418579, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 10.370312690734863, "epoch": 0.0955593029792018, "kl": 1.6071103811264038, "learning_rate": 9.990500281056772e-07, "loss": 0.004686681553721428, "ratio/all_0": 0.046875, "ratio/all_2": 0.640625, "reward": 1.8687500059604645, "reward_std": 0.8097708225250244, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.870312511920929, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 10.392187595367432, "epoch": 0.09612141652613827, "kl": 1.6090731918811798, "learning_rate": 9.99044406970208e-07, "loss": 0.011083896271884441, "ratio/all_0": 0.015625, "ratio/all_2": 0.59375, "reward": 1.885937511920929, "reward_std": 0.772784024477005, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8859375268220901, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 10.264062643051147, "epoch": 0.09668353007307476, "kl": 1.5098300874233246, "learning_rate": 9.990387858347385e-07, "loss": 0.013532341457903385, "ratio/all_0": 0.046875, "ratio/all_2": 0.65625, "reward": 1.8750000298023224, "reward_std": 0.8310911804437637, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.875, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8750000149011612, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 10.426562786102295, "epoch": 0.09724564362001124, "kl": 1.5602307319641113, "learning_rate": 9.990331646992692e-07, "loss": 0.01641331985592842, "ratio/all_0": 0.03125, "ratio/all_2": 0.515625, "reward": 1.881250023841858, "reward_std": 0.7340317219495773, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8812500089406967, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 10.332812786102295, "epoch": 0.09780775716694773, "kl": 1.5638643205165863, "learning_rate": 9.990275435638e-07, "loss": 0.005013991147279739, "ratio/all_0": 0.046875, "ratio/all_2": 0.640625, "reward": 1.8578124940395355, "reward_std": 0.8247323632240295, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8578125238418579, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 10.34531283378601, "epoch": 0.0983698707138842, "kl": 1.5222603976726532, "learning_rate": 9.990219224283306e-07, "loss": 0.0225371066480875, "ratio/all_0": 0.046875, "ratio/all_2": 0.65625, "reward": 1.8875000476837158, "reward_std": 0.8316797167062759, "rewards/avg_0": 1.875, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8875000178813934, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 10.446875095367432, "epoch": 0.09893198426082069, "kl": 1.522368609905243, "learning_rate": 9.99016301292861e-07, "loss": 0.026052303612232208, "ratio/all_0": 0.03125, "ratio/all_2": 0.625, "reward": 1.9000000059604645, "reward_std": 0.8042051494121552, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.9375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.875, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9000000208616257, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 10.289062738418579, "epoch": 0.09949409780775717, "kl": 1.5553119480609894, "learning_rate": 9.990106801573917e-07, "loss": 0.03195918723940849, "ratio/all_0": 0.03125, "ratio/all_2": 0.640625, "reward": 1.9109375178813934, "reward_std": 0.8066186159849167, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9109375178813934, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 10.404687881469727, "epoch": 0.10005621135469364, "kl": 1.5360974669456482, "learning_rate": 9.990050590219224e-07, "loss": 0.014459017664194107, "ratio/all_0": 0.046875, "ratio/all_2": 0.546875, "reward": 1.873437523841858, "reward_std": 0.7592671066522598, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.875, "rewards/avg_6": 1.78125, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8750000149011612, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 10.451562643051147, "epoch": 0.10061832490163013, "kl": 1.5627336502075195, "learning_rate": 9.98999437886453e-07, "loss": 0.0008577615953981876, "ratio/all_0": 0.015625, "ratio/all_2": 0.53125, "reward": 1.870312511920929, "reward_std": 0.7360649853944778, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.870312511920929, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 10.4296875, "epoch": 0.10118043844856661, "kl": 1.5364422798156738, "learning_rate": 9.989938167509838e-07, "loss": 0.013105844147503376, "ratio/all_0": 0.015625, "ratio/all_2": 0.546875, "reward": 1.885937511920929, "reward_std": 0.7499039620161057, "rewards/avg_0": 1.9375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8859374970197678, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 10.578125, "epoch": 0.1017425519955031, "kl": 1.5224189162254333, "learning_rate": 9.989881956155142e-07, "loss": 0.006834621541202068, "ratio/all_0": 0.078125, "ratio/all_2": 0.5625, "reward": 1.8328125178813934, "reward_std": 0.7984373569488525, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.75, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.765625, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8328125178813934, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 10.600000381469727, "epoch": 0.10230466554243957, "kl": 1.5363523662090302, "learning_rate": 9.989825744800449e-07, "loss": 0.002911627758294344, "ratio/all_0": 0.015625, "ratio/all_2": 0.546875, "reward": 1.8765625059604645, "reward_std": 0.7439265847206116, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.875, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8765625208616257, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 10.567187547683716, "epoch": 0.10286677908937605, "kl": 1.5349900424480438, "learning_rate": 9.989769533445756e-07, "loss": 0.020761406049132347, "ratio/all_0": 0.0625, "ratio/all_2": 0.625, "reward": 1.8718750476837158, "reward_std": 0.8221687376499176, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8718750029802322, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 10.546875, "epoch": 0.10342889263631254, "kl": 1.5807002484798431, "learning_rate": 9.989713322091062e-07, "loss": 0.001111285062506795, "ratio/all_0": 0.03125, "ratio/all_2": 0.4375, "reward": 1.8453125357627869, "reward_std": 0.6977837383747101, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.71875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8453125059604645, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 10.664062976837158, "epoch": 0.10399100618324901, "kl": 1.558085858821869, "learning_rate": 9.989657110736367e-07, "loss": 0.015460247173905373, "ratio/all_0": 0.0625, "ratio/all_2": 0.53125, "reward": 1.8562500178813934, "reward_std": 0.762683317065239, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8562500178813934, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 10.529687881469727, "epoch": 0.1045531197301855, "kl": 1.5969407558441162, "learning_rate": 9.989600899381674e-07, "loss": 0.012307515367865562, "ratio/all_0": 0.046875, "ratio/all_2": 0.578125, "reward": 1.8578124940395355, "reward_std": 0.7931357175111771, "rewards/avg_0": 1.875, "rewards/avg_1": 1.765625, "rewards/avg_2": 1.765625, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8578125238418579, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 10.448437929153442, "epoch": 0.10511523327712198, "kl": 1.5586650669574738, "learning_rate": 9.98954468802698e-07, "loss": 0.020341139286756516, "ratio/all_0": 0.046875, "ratio/all_2": 0.609375, "reward": 1.8765625059604645, "reward_std": 0.8082123398780823, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8765625059604645, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 10.440625190734863, "epoch": 0.10567734682405847, "kl": 1.5379924774169922, "learning_rate": 9.989488476672287e-07, "loss": 0.011121533811092377, "ratio/all_0": 0.0625, "ratio/all_2": 0.59375, "reward": 1.857812523841858, "reward_std": 0.8036759942770004, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.765625, "rewards/avg_7": 1.71875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8578125089406967, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 10.321875095367432, "epoch": 0.10623946037099494, "kl": 1.6033551394939423, "learning_rate": 9.989432265317594e-07, "loss": 0.009739244356751442, "ratio/all_0": 0.015625, "ratio/all_2": 0.546875, "reward": 1.878125011920929, "reward_std": 0.7489920854568481, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.875, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.878125011920929, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 10.498437404632568, "epoch": 0.10680157391793142, "kl": 1.5498007535934448, "learning_rate": 9.9893760539629e-07, "loss": 0.03089289367198944, "ratio/all_0": 0.0625, "ratio/all_2": 0.65625, "reward": 1.8843750655651093, "reward_std": 0.8420456349849701, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.875, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8843750059604645, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 10.40000033378601, "epoch": 0.10736368746486791, "kl": 1.5609517395496368, "learning_rate": 9.989319842608206e-07, "loss": 0.026409577578306198, "ratio/all_0": 0.046875, "ratio/all_2": 0.671875, "reward": 1.896875023841858, "reward_std": 0.8327089995145798, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8968749940395355, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 10.593750238418579, "epoch": 0.10792580101180438, "kl": 1.5071128010749817, "learning_rate": 9.989263631253512e-07, "loss": 0.007572017144411802, "ratio/all_0": 0.03125, "ratio/all_2": 0.578125, "reward": 1.8796875178813934, "reward_std": 0.770546168088913, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.875, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8796875178813934, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 10.456250190734863, "epoch": 0.10848791455874086, "kl": 1.519468516111374, "learning_rate": 9.98920741989882e-07, "loss": 0.020838668569922447, "ratio/all_0": 0.03125, "ratio/all_2": 0.65625, "reward": 1.9031250476837158, "reward_std": 0.81875379383564, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.875, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9031250029802322, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 10.401562690734863, "epoch": 0.10905002810567735, "kl": 1.6071525514125824, "learning_rate": 9.989151208544126e-07, "loss": 0.01021914929151535, "ratio/all_0": 0.015625, "ratio/all_2": 0.5, "reward": 1.8750000298023224, "reward_std": 0.7244045436382294, "rewards/avg_0": 1.875, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.75, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8750000149011612, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 10.456250429153442, "epoch": 0.10961214165261383, "kl": 1.5413286685943604, "learning_rate": 9.989094997189433e-07, "loss": 0.025953728705644608, "ratio/all_0": 0.03125, "ratio/all_2": 0.609375, "reward": 1.901562511920929, "reward_std": 0.7901591062545776, "rewards/avg_0": 1.875, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.875, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.901562511920929, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 10.44687533378601, "epoch": 0.1101742551995503, "kl": 1.528151273727417, "learning_rate": 9.989038785834737e-07, "loss": 0.03157404437661171, "ratio/all_0": 0.015625, "ratio/all_2": 0.703125, "reward": 1.9249999821186066, "reward_std": 0.8380978256464005, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9250000268220901, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 10.5, "epoch": 0.11073636874648679, "kl": 1.5467852056026459, "learning_rate": 9.988982574480044e-07, "loss": 0.010904987342655659, "ratio/all_0": 0.015625, "ratio/all_2": 0.625, "reward": 1.8843750357627869, "reward_std": 0.8058620393276215, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.885937511920929, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 10.365625143051147, "epoch": 0.11129848229342328, "kl": 1.5382420718669891, "learning_rate": 9.98892636312535e-07, "loss": 0.01803477108478546, "ratio/all_0": 0.03125, "ratio/all_2": 0.625, "reward": 1.8890624940395355, "reward_std": 0.8067688345909119, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.875, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8890625238418579, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 10.318750143051147, "epoch": 0.11186059584035975, "kl": 1.6021558046340942, "learning_rate": 9.988870151770658e-07, "loss": 0.004164498299360275, "ratio/all_0": 0.09375, "ratio/all_2": 0.515625, "reward": 1.810937523841858, "reward_std": 0.7804713249206543, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.75, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.75, "rewards/avg_5": 1.71875, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.6875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8109375089406967, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 10.292187452316284, "epoch": 0.11242270938729623, "kl": 1.6093721091747284, "learning_rate": 9.988813940415962e-07, "loss": 0.024522338062524796, "ratio/all_0": 0.046875, "ratio/all_2": 0.65625, "reward": 1.889062523841858, "reward_std": 0.8257010728120804, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.875, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8890625089406967, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 10.400000095367432, "epoch": 0.11298482293423272, "kl": 1.527627944946289, "learning_rate": 9.98875772906127e-07, "loss": 0.015955956652760506, "ratio/all_0": 0.03125, "ratio/all_2": 0.671875, "reward": 1.8953125178813934, "reward_std": 0.8191908448934555, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8953125178813934, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 10.30625057220459, "epoch": 0.1135469364811692, "kl": 1.6383016109466553, "learning_rate": 9.988701517706576e-07, "loss": 0.035872433334589005, "ratio/all_0": 0.015625, "ratio/all_2": 0.625, "reward": 1.9234375357627869, "reward_std": 0.7858867347240448, "rewards/avg_0": 1.96875, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.953125, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9234375059604645, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 10.281250238418579, "epoch": 0.11410905002810567, "kl": 1.553186446428299, "learning_rate": 9.988645306351883e-07, "loss": 0.007708997465670109, "ratio/all_0": 0.046875, "ratio/all_2": 0.5625, "reward": 1.8515625298023224, "reward_std": 0.7847436964511871, "rewards/avg_0": 1.875, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.765625, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.734375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8515625, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 10.273437738418579, "epoch": 0.11467116357504216, "kl": 1.5798857808113098, "learning_rate": 9.98858909499719e-07, "loss": 0.03088054247200489, "ratio/all_0": 0.046875, "ratio/all_2": 0.6875, "reward": 1.8953125178813934, "reward_std": 0.8464910984039307, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8953125178813934, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 10.371875286102295, "epoch": 0.11523327712197864, "kl": 1.6130147278308868, "learning_rate": 9.988532883642496e-07, "loss": 0.012614361010491848, "ratio/all_0": 0.015625, "ratio/all_2": 0.609375, "reward": 1.8843750357627869, "reward_std": 0.7893950045108795, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8843750059604645, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 10.323437690734863, "epoch": 0.11579539066891512, "kl": 1.5593442618846893, "learning_rate": 9.988476672287803e-07, "loss": 0.016722215339541435, "ratio/all_0": 0.03125, "ratio/all_2": 0.609375, "reward": 1.8843750059604645, "reward_std": 0.8005313277244568, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8859375268220901, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 10.379688024520874, "epoch": 0.1163575042158516, "kl": 1.5896699726581573, "learning_rate": 9.988420460933108e-07, "loss": 0.010505123063921928, "ratio/all_0": 0.078125, "ratio/all_2": 0.546875, "reward": 1.834375023841858, "reward_std": 0.7868907004594803, "rewards/avg_0": 1.75, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.75, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8343750089406967, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 10.426562547683716, "epoch": 0.11691961776278809, "kl": 1.5763377845287323, "learning_rate": 9.988364249578414e-07, "loss": 0.008979510515928268, "ratio/all_0": 0.046875, "ratio/all_2": 0.609375, "reward": 1.8640625178813934, "reward_std": 0.7982265949249268, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8640625178813934, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 10.460937738418579, "epoch": 0.11748173130972456, "kl": 1.5620132386684418, "learning_rate": 9.988308038223721e-07, "loss": 0.015451719984412193, "ratio/all_0": 0.0, "ratio/all_2": 0.59375, "reward": 1.9062500298023224, "reward_std": 0.769840195775032, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9062500149011612, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 10.368750095367432, "epoch": 0.11804384485666104, "kl": 1.602480798959732, "learning_rate": 9.988251826869028e-07, "loss": 0.005209244322031736, "ratio/all_0": 0.03125, "ratio/all_2": 0.484375, "reward": 1.8562500178813934, "reward_std": 0.7441752403974533, "rewards/avg_0": 1.875, "rewards/avg_1": 1.703125, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8578125089406967, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 10.512500286102295, "epoch": 0.11860595840359753, "kl": 1.5443048775196075, "learning_rate": 9.988195615514333e-07, "loss": 0.023446325212717056, "ratio/all_0": 0.03125, "ratio/all_2": 0.6875, "reward": 1.9015625417232513, "reward_std": 0.8415164798498154, "rewards/avg_0": 1.875, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9015624970197678, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 10.526562690734863, "epoch": 0.11916807195053401, "kl": 1.4916993379592896, "learning_rate": 9.98813940415964e-07, "loss": 0.007108633406460285, "ratio/all_0": 0.0625, "ratio/all_2": 0.453125, "reward": 1.8437500298023224, "reward_std": 0.7147433012723923, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8453125208616257, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 10.500000238418579, "epoch": 0.11973018549747048, "kl": 1.5132508873939514, "learning_rate": 9.988083192804946e-07, "loss": 0.009618530049920082, "ratio/all_0": 0.015625, "ratio/all_2": 0.59375, "reward": 1.8921875059604645, "reward_std": 0.7719315439462662, "rewards/avg_0": 1.875, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8921875059604645, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 10.487500190734863, "epoch": 0.12029229904440697, "kl": 1.5387333631515503, "learning_rate": 9.988026981450253e-07, "loss": 0.016562655568122864, "ratio/all_0": 0.0, "ratio/all_2": 0.625, "reward": 1.9156250059604645, "reward_std": 0.7766409516334534, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.953125, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9156250208616257, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 10.473437786102295, "epoch": 0.12085441259134345, "kl": 1.6194742918014526, "learning_rate": 9.987970770095558e-07, "loss": 0.023598913103342056, "ratio/all_0": 0.0, "ratio/all_2": 0.5625, "reward": 1.9140625298023224, "reward_std": 0.7459004819393158, "rewards/avg_0": 1.9375, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9140625, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 10.490625143051147, "epoch": 0.12141652613827993, "kl": 1.5474883019924164, "learning_rate": 9.987914558740864e-07, "loss": -0.01752345636487007, "ratio/all_0": 0.03125, "ratio/all_2": 0.546875, "reward": 1.8328125178813934, "reward_std": 0.7599477916955948, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.765625, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.765625, "rewards/avg_6": 1.75, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8328125178813934, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 10.504687786102295, "epoch": 0.12197863968521641, "kl": 1.5022177398204803, "learning_rate": 9.987858347386171e-07, "loss": 0.027818644419312477, "ratio/all_0": 0.03125, "ratio/all_2": 0.625, "reward": 1.9078125357627869, "reward_std": 0.805649921298027, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9078125208616257, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 10.465625286102295, "epoch": 0.1225407532321529, "kl": 1.5558375120162964, "learning_rate": 9.987802136031478e-07, "loss": 0.010633164085447788, "ratio/all_0": 0.0625, "ratio/all_2": 0.5625, "reward": 1.8484375476837158, "reward_std": 0.7891297936439514, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.75, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8484375178813934, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 10.471875190734863, "epoch": 0.12310286677908938, "kl": 1.5038470327854156, "learning_rate": 9.987745924676785e-07, "loss": -0.008047796785831451, "ratio/all_0": 0.0, "ratio/all_2": 0.578125, "reward": 1.865625023841858, "reward_std": 0.7684888392686844, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.765625, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.765625, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8656249940395355, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 10.456250190734863, "epoch": 0.12366498032602585, "kl": 1.557765245437622, "learning_rate": 9.987689713322091e-07, "loss": 0.0018783528357744217, "ratio/all_0": 0.03125, "ratio/all_2": 0.59375, "reward": 1.862500011920929, "reward_std": 0.7834757715463638, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.862500011920929, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 10.362500190734863, "epoch": 0.12422709387296234, "kl": 1.5848875045776367, "learning_rate": 9.987633501967398e-07, "loss": -0.004130351357161999, "ratio/all_0": 0.015625, "ratio/all_2": 0.515625, "reward": 1.854687511920929, "reward_std": 0.7420046925544739, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.734375, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9968750178813934, "rewards/point_reward": 0.8578125238418579, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 10.387500047683716, "epoch": 0.12478920741989882, "kl": 1.5096738934516907, "learning_rate": 9.987577290612703e-07, "loss": 0.017767585813999176, "ratio/all_0": 0.0625, "ratio/all_2": 0.609375, "reward": 1.870312511920929, "reward_std": 0.8075316995382309, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.78125, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.870312511920929, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 10.434375047683716, "epoch": 0.1253513209668353, "kl": 1.586805284023285, "learning_rate": 9.98752107925801e-07, "loss": 0.028075214475393295, "ratio/all_0": 0.03125, "ratio/all_2": 0.65625, "reward": 1.9109375178813934, "reward_std": 0.8107988387346268, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.953125, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9109375178813934, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 10.417187690734863, "epoch": 0.1259134345137718, "kl": 1.5914437770843506, "learning_rate": 9.987464867903316e-07, "loss": 0.022167479619383812, "ratio/all_0": 0.046875, "ratio/all_2": 0.640625, "reward": 1.8843750357627869, "reward_std": 0.8190177977085114, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8843750208616257, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 10.428125143051147, "epoch": 0.12647554806070826, "kl": 1.5555630326271057, "learning_rate": 9.987408656548623e-07, "loss": 0.027459319680929184, "ratio/all_0": 0.015625, "ratio/all_2": 0.609375, "reward": 1.9140625298023224, "reward_std": 0.7828859984874725, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.9375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.875, "rewards/avg_7": 1.9375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9140625149011612, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 10.518750190734863, "epoch": 0.12703766160764474, "kl": 1.540842980146408, "learning_rate": 9.987352445193928e-07, "loss": -0.01165848784148693, "ratio/all_0": 0.046875, "ratio/all_2": 0.484375, "reward": 1.8125000596046448, "reward_std": 0.7399534285068512, "rewards/avg_0": 1.734375, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.734375, "rewards/avg_4": 1.734375, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.75, "rewards/avg_7": 1.734375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8125, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 10.38437533378601, "epoch": 0.12759977515458124, "kl": 1.5715409219264984, "learning_rate": 9.987296233839235e-07, "loss": 0.0172878485172987, "ratio/all_0": 0.03125, "ratio/all_2": 0.640625, "reward": 1.8828125, "reward_std": 0.8174593150615692, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8828125149011612, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 10.49375033378601, "epoch": 0.1281618887015177, "kl": 1.44660022854805, "learning_rate": 9.987240022484541e-07, "loss": 0.004869728349149227, "ratio/all_0": 0.0, "ratio/all_2": 0.5625, "reward": 1.8906250298023224, "reward_std": 0.7453107237815857, "rewards/avg_0": 1.875, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.9375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8906250149011612, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 10.456250429153442, "epoch": 0.12872400224845418, "kl": 1.543150782585144, "learning_rate": 9.987183811129848e-07, "loss": 0.021657878533005714, "ratio/all_0": 0.0, "ratio/all_2": 0.671875, "reward": 1.9218750298023224, "reward_std": 0.8066779375076294, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.96875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9218750149011612, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 10.453125238418579, "epoch": 0.12928611579539068, "kl": 1.5754946768283844, "learning_rate": 9.987127599775153e-07, "loss": 0.007582786493003368, "ratio/all_0": 0.015625, "ratio/all_2": 0.609375, "reward": 1.8843750357627869, "reward_std": 0.7794698178768158, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8843750059604645, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 10.390625238418579, "epoch": 0.12984822934232715, "kl": 1.5105478763580322, "learning_rate": 9.98707138842046e-07, "loss": 0.012311361730098724, "ratio/all_0": 0.0625, "ratio/all_2": 0.625, "reward": 1.854687511920929, "reward_std": 0.833659902215004, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.875, "rewards/avg_6": 1.765625, "rewards/avg_7": 1.765625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8562500178813934, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 10.39218783378601, "epoch": 0.13041034288926362, "kl": 1.5355972051620483, "learning_rate": 9.987015177065766e-07, "loss": 0.012546722777187824, "ratio/all_0": 0.0, "ratio/all_2": 0.71875, "reward": 1.9078125357627869, "reward_std": 0.8449338972568512, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9078125208616257, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 10.446875095367432, "epoch": 0.13097245643620012, "kl": 1.5727481544017792, "learning_rate": 9.986958965711073e-07, "loss": 0.004496240522712469, "ratio/all_0": 0.015625, "ratio/all_2": 0.640625, "reward": 1.8906250298023224, "reward_std": 0.8004075884819031, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.875, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8921875208616257, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 10.396875381469727, "epoch": 0.1315345699831366, "kl": 1.5587262213230133, "learning_rate": 9.98690275435638e-07, "loss": -0.0068831006065011024, "ratio/all_0": 0.0, "ratio/all_2": 0.5625, "reward": 1.875, "reward_std": 0.7379455119371414, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8750000149011612, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 10.4609375, "epoch": 0.13209668353007306, "kl": 1.5907757580280304, "learning_rate": 9.986846543001687e-07, "loss": 0.03402874246239662, "ratio/all_0": 0.109375, "ratio/all_2": 0.578125, "reward": 1.850000023841858, "reward_std": 0.8150699734687805, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8499999940395355, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 10.531250238418579, "epoch": 0.13265879707700956, "kl": 1.5824647843837738, "learning_rate": 9.986790331646993e-07, "loss": 0.021441660821437836, "ratio/all_0": 0.03125, "ratio/all_2": 0.65625, "reward": 1.901562511920929, "reward_std": 0.8082363605499268, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.901562511920929, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 10.535937547683716, "epoch": 0.13322091062394603, "kl": 1.5818893015384674, "learning_rate": 9.986734120292298e-07, "loss": 0.0017864266410470009, "ratio/all_0": 0.015625, "ratio/all_2": 0.578125, "reward": 1.873437523841858, "reward_std": 0.7641280144453049, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.875, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8734375089406967, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 10.562500238418579, "epoch": 0.13378302417088253, "kl": 1.5562168657779694, "learning_rate": 9.986677908937605e-07, "loss": 0.015783432871103287, "ratio/all_0": 0.03125, "ratio/all_2": 0.578125, "reward": 1.8796875178813934, "reward_std": 0.7744939774274826, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8796875178813934, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 10.482812643051147, "epoch": 0.134345137717819, "kl": 1.5716504752635956, "learning_rate": 9.986621697582912e-07, "loss": 0.026386769488453865, "ratio/all_0": 0.03125, "ratio/all_2": 0.6875, "reward": 1.9031250178813934, "reward_std": 0.8459218442440033, "rewards/avg_0": 1.875, "rewards/avg_1": 1.875, "rewards/avg_2": 1.953125, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.9046875238418579, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 10.529687643051147, "epoch": 0.13490725126475547, "kl": 1.5155498683452606, "learning_rate": 9.986565486228218e-07, "loss": 0.028578244149684906, "ratio/all_0": 0.046875, "ratio/all_2": 0.625, "reward": 1.8953125476837158, "reward_std": 0.8043807148933411, "rewards/avg_0": 1.875, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8968750238418579, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 10.468750238418579, "epoch": 0.13546936481169197, "kl": 1.5957022309303284, "learning_rate": 9.986509274873523e-07, "loss": 0.030983472242951393, "ratio/all_0": 0.015625, "ratio/all_2": 0.625, "reward": 1.9171875417232513, "reward_std": 0.7883910536766052, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.917187511920929, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 10.460937738418579, "epoch": 0.13603147835862844, "kl": 1.5496326088905334, "learning_rate": 9.98645306351883e-07, "loss": 0.019616173580288887, "ratio/all_0": 0.0, "ratio/all_2": 0.578125, "reward": 1.9156250059604645, "reward_std": 0.7500213980674744, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9156250208616257, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 10.590625286102295, "epoch": 0.13659359190556492, "kl": 1.5903447270393372, "learning_rate": 9.986396852164137e-07, "loss": 0.005906625185161829, "ratio/all_0": 0.03125, "ratio/all_2": 0.5, "reward": 1.8656249940395355, "reward_std": 0.7259642332792282, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8656250238418579, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 10.357812643051147, "epoch": 0.13715570545250141, "kl": 1.5682716965675354, "learning_rate": 9.986340640809443e-07, "loss": 0.027652684599161148, "ratio/all_0": 0.09375, "ratio/all_2": 0.578125, "reward": 1.8531250357627869, "reward_std": 0.8177826255559921, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8531250059604645, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 10.401562690734863, "epoch": 0.13771781899943789, "kl": 1.5091069638729095, "learning_rate": 9.98628442945475e-07, "loss": 0.00808423850685358, "ratio/all_0": 0.046875, "ratio/all_2": 0.671875, "reward": 1.870312511920929, "reward_std": 0.83954256772995, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.875, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8703125268220901, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 10.432812690734863, "epoch": 0.13827993254637436, "kl": 1.5255979001522064, "learning_rate": 9.986228218100055e-07, "loss": 0.013790331780910492, "ratio/all_0": 0.03125, "ratio/all_2": 0.59375, "reward": 1.8796875178813934, "reward_std": 0.7863009124994278, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8796875178813934, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 10.234375238418579, "epoch": 0.13884204609331086, "kl": 1.5829472541809082, "learning_rate": 9.986172006745362e-07, "loss": 0.02624700777232647, "ratio/all_0": 0.03125, "ratio/all_2": 0.671875, "reward": 1.904687523841858, "reward_std": 0.8320054858922958, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9046875089406967, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 10.339062452316284, "epoch": 0.13940415964024733, "kl": 1.5977038741111755, "learning_rate": 9.986115795390668e-07, "loss": 0.006217719055712223, "ratio/all_0": 0.046875, "ratio/all_2": 0.5625, "reward": 1.8515625, "reward_std": 0.7779113799333572, "rewards/avg_0": 1.75, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8515625298023224, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 10.41562533378601, "epoch": 0.1399662731871838, "kl": 1.5816867351531982, "learning_rate": 9.986059584035975e-07, "loss": 0.014478843659162521, "ratio/all_0": 0.046875, "ratio/all_2": 0.578125, "reward": 1.8656249940395355, "reward_std": 0.7894555926322937, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.765625, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8656250238418579, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 10.537500143051147, "epoch": 0.1405283867341203, "kl": 1.564496248960495, "learning_rate": 9.986003372681282e-07, "loss": 0.0180707685649395, "ratio/all_0": 0.078125, "ratio/all_2": 0.578125, "reward": 1.8421875536441803, "reward_std": 0.8131468445062637, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.765625, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8437500149011612, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 10.4375, "epoch": 0.14109050028105677, "kl": 1.5500454604625702, "learning_rate": 9.985947161326589e-07, "loss": 0.02213384583592415, "ratio/all_0": 0.046875, "ratio/all_2": 0.609375, "reward": 1.8765625357627869, "reward_std": 0.8081517666578293, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8765625208616257, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 10.276562690734863, "epoch": 0.14165261382799327, "kl": 1.6070986092090607, "learning_rate": 9.985890949971893e-07, "loss": 0.02211572788655758, "ratio/all_0": 0.046875, "ratio/all_2": 0.625, "reward": 1.881250023841858, "reward_std": 0.8086518347263336, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8812500089406967, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 10.381250143051147, "epoch": 0.14221472737492974, "kl": 1.4865486919879913, "learning_rate": 9.9858347386172e-07, "loss": 0.013803028501570225, "ratio/all_0": 0.0625, "ratio/all_2": 0.546875, "reward": 1.8609375357627869, "reward_std": 0.7731085568666458, "rewards/avg_0": 1.75, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.78125, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8609375208616257, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 10.46250033378601, "epoch": 0.1427768409218662, "kl": 1.562472254037857, "learning_rate": 9.985778527262507e-07, "loss": 0.0062060109339654446, "ratio/all_0": 0.03125, "ratio/all_2": 0.5625, "reward": 1.8687500059604645, "reward_std": 0.763007864356041, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8687500208616257, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 10.40781283378601, "epoch": 0.1433389544688027, "kl": 1.6123566925525665, "learning_rate": 9.985722315907814e-07, "loss": 0.01719619520008564, "ratio/all_0": 0.0625, "ratio/all_2": 0.59375, "reward": 1.8640625476837158, "reward_std": 0.797432154417038, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8640625029802322, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 10.40781283378601, "epoch": 0.14390106801573918, "kl": 1.5859326422214508, "learning_rate": 9.985666104553118e-07, "loss": 0.02508179470896721, "ratio/all_0": 0.015625, "ratio/all_2": 0.609375, "reward": 1.912500023841858, "reward_std": 0.7811773121356964, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.875, "rewards/avg_7": 1.9375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9125000089406967, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 10.42343783378601, "epoch": 0.14446318156267565, "kl": 1.5853928923606873, "learning_rate": 9.985609893198425e-07, "loss": 0.029612088575959206, "ratio/all_0": 0.015625, "ratio/all_2": 0.671875, "reward": 1.921875, "reward_std": 0.8175742924213409, "rewards/avg_0": 1.953125, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.9375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.9234375208616257, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 10.448437929153442, "epoch": 0.14502529510961215, "kl": 1.5741258263587952, "learning_rate": 9.985553681843732e-07, "loss": 0.0034783557057380676, "ratio/all_0": 0.03125, "ratio/all_2": 0.546875, "reward": 1.8625000417232513, "reward_std": 0.7571201622486115, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.875, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.862500011920929, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 10.471875190734863, "epoch": 0.14558740865654862, "kl": 1.4908913373947144, "learning_rate": 9.985497470489039e-07, "loss": 0.0013852245174348354, "ratio/all_0": 0.03125, "ratio/all_2": 0.546875, "reward": 1.8765625357627869, "reward_std": 0.7487268894910812, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.734375, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.878125011920929, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 10.364062786102295, "epoch": 0.1461495222034851, "kl": 1.5390149652957916, "learning_rate": 9.985441259134345e-07, "loss": 0.011133184656500816, "ratio/all_0": 0.03125, "ratio/all_2": 0.53125, "reward": 1.8687500357627869, "reward_std": 0.7598303556442261, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.78125, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8687500059604645, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 10.467187643051147, "epoch": 0.1467116357504216, "kl": 1.6012259721755981, "learning_rate": 9.98538504777965e-07, "loss": 0.006842806003987789, "ratio/all_0": 0.015625, "ratio/all_2": 0.546875, "reward": 1.8734375536441803, "reward_std": 0.745634064078331, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8734374940395355, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 10.485937595367432, "epoch": 0.14727374929735806, "kl": 1.5561028718948364, "learning_rate": 9.985328836424957e-07, "loss": 0.0026737055741250515, "ratio/all_0": 0.03125, "ratio/all_2": 0.53125, "reward": 1.8609375655651093, "reward_std": 0.7467529475688934, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.875, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.71875, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8609375059604645, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 10.328125238418579, "epoch": 0.14783586284429454, "kl": 1.61723193526268, "learning_rate": 9.985272625070264e-07, "loss": 0.03143702447414398, "ratio/all_0": 0.046875, "ratio/all_2": 0.65625, "reward": 1.8937500417232513, "reward_std": 0.8296476751565933, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8937499970197678, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 10.489062786102295, "epoch": 0.14839797639123103, "kl": 1.558815211057663, "learning_rate": 9.98521641371557e-07, "loss": 0.014333357103168964, "ratio/all_0": 0.046875, "ratio/all_2": 0.625, "reward": 1.8765625357627869, "reward_std": 0.8043806999921799, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8765625059604645, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 10.493750095367432, "epoch": 0.1489600899381675, "kl": 1.5746314227581024, "learning_rate": 9.985160202360877e-07, "loss": 0.04017938673496246, "ratio/all_0": 0.078125, "ratio/all_2": 0.6875, "reward": 1.8921875357627869, "reward_std": 0.8675475120544434, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8921875059604645, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 10.496875286102295, "epoch": 0.14952220348510398, "kl": 1.532246470451355, "learning_rate": 9.985103991006184e-07, "loss": -0.004921544808894396, "ratio/all_0": 0.046875, "ratio/all_2": 0.578125, "reward": 1.8421874940395355, "reward_std": 0.7891298532485962, "rewards/avg_0": 1.765625, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.765625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8421875238418579, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 10.459375143051147, "epoch": 0.15008431703204048, "kl": 1.5063121914863586, "learning_rate": 9.985047779651489e-07, "loss": 0.02832210063934326, "ratio/all_0": 0.046875, "ratio/all_2": 0.703125, "reward": 1.9000000357627869, "reward_std": 0.8565918058156967, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9000000059604645, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 10.609375238418579, "epoch": 0.15064643057897695, "kl": 1.541852205991745, "learning_rate": 9.984991568296795e-07, "loss": 0.00951352994889021, "ratio/all_0": 0.03125, "ratio/all_2": 0.640625, "reward": 1.8765625357627869, "reward_std": 0.8181691318750381, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.878125011920929, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 10.385937452316284, "epoch": 0.15120854412591345, "kl": 1.6570470035076141, "learning_rate": 9.984935356942102e-07, "loss": 0.027027206495404243, "ratio/all_0": 0.015625, "ratio/all_2": 0.609375, "reward": 1.904687523841858, "reward_std": 0.7799990177154541, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.953125, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9046875238418579, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 10.460937738418579, "epoch": 0.15177065767284992, "kl": 1.6034429669380188, "learning_rate": 9.984879145587409e-07, "loss": 0.015010214410722256, "ratio/all_0": 0.078125, "ratio/all_2": 0.609375, "reward": 1.842187523841858, "reward_std": 0.8224339187145233, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.765625, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8421875089406967, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 10.409375190734863, "epoch": 0.1523327712197864, "kl": 1.6018818020820618, "learning_rate": 9.984822934232716e-07, "loss": 0.033464521169662476, "ratio/all_0": 0.046875, "ratio/all_2": 0.609375, "reward": 1.901562511920929, "reward_std": 0.7854724824428558, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.901562511920929, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 10.456249952316284, "epoch": 0.1528948847667229, "kl": 1.558174967765808, "learning_rate": 9.98476672287802e-07, "loss": 0.02480011060833931, "ratio/all_0": 0.09375, "ratio/all_2": 0.640625, "reward": 1.850000023841858, "reward_std": 0.8561535179615021, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8500000238418579, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 10.542187452316284, "epoch": 0.15345699831365936, "kl": 1.4964491426944733, "learning_rate": 9.984710511523327e-07, "loss": 0.037152573466300964, "ratio/all_0": 0.03125, "ratio/all_2": 0.75, "reward": 1.926562488079071, "reward_std": 0.8781774640083313, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.9375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9265625327825546, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 10.478125095367432, "epoch": 0.15401911186059583, "kl": 1.6227607131004333, "learning_rate": 9.984654300168634e-07, "loss": 0.04696275666356087, "ratio/all_0": 0.046875, "ratio/all_2": 0.703125, "reward": 1.9234375357627869, "reward_std": 0.8446333706378937, "rewards/avg_0": 1.9375, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9234375208616257, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 10.473437786102295, "epoch": 0.15458122540753233, "kl": 1.5785993933677673, "learning_rate": 9.98459808881394e-07, "loss": 0.032174110412597656, "ratio/all_0": 0.0625, "ratio/all_2": 0.671875, "reward": 1.889062523841858, "reward_std": 0.8476100713014603, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.875, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8890625089406967, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 10.475000381469727, "epoch": 0.1551433389544688, "kl": 1.6346567571163177, "learning_rate": 9.984541877459245e-07, "loss": 0.02995876967906952, "ratio/all_0": 0.03125, "ratio/all_2": 0.578125, "reward": 1.8984375, "reward_std": 0.7702215760946274, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8984375149011612, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 10.375000238418579, "epoch": 0.15570545250140527, "kl": 1.5467197000980377, "learning_rate": 9.984485666104552e-07, "loss": 0.007414577528834343, "ratio/all_0": 0.03125, "ratio/all_2": 0.609375, "reward": 1.871874988079071, "reward_std": 0.7875954359769821, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.875, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8718750327825546, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 10.318750381469727, "epoch": 0.15626756604834177, "kl": 1.605959415435791, "learning_rate": 9.984429454749859e-07, "loss": 0.016059426590800285, "ratio/all_0": 0.046875, "ratio/all_2": 0.578125, "reward": 1.8718750476837158, "reward_std": 0.7766409367322922, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.875, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8718750178813934, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 10.562500238418579, "epoch": 0.15682967959527824, "kl": 1.5902847647666931, "learning_rate": 9.984373243395166e-07, "loss": 0.023069556802511215, "ratio/all_0": 0.046875, "ratio/all_2": 0.640625, "reward": 1.8875000476837158, "reward_std": 0.8167774081230164, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8875000029802322, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 10.340625047683716, "epoch": 0.15739179314221471, "kl": 1.5816633105278015, "learning_rate": 9.984317032040472e-07, "loss": 0.0046473303809762, "ratio/all_0": 0.015625, "ratio/all_2": 0.640625, "reward": 1.881250023841858, "reward_std": 0.8019672930240631, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8812500089406967, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 10.440625190734863, "epoch": 0.1579539066891512, "kl": 1.6009697616100311, "learning_rate": 9.98426082068578e-07, "loss": 0.02072141319513321, "ratio/all_0": 0.015625, "ratio/all_2": 0.640625, "reward": 1.9000000357627869, "reward_std": 0.8073585629463196, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.9375, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.875, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9000000208616257, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 10.429687738418579, "epoch": 0.15851602023608768, "kl": 1.6120758056640625, "learning_rate": 9.984204609331084e-07, "loss": 0.004281133413314819, "ratio/all_0": 0.03125, "ratio/all_2": 0.53125, "reward": 1.857812523841858, "reward_std": 0.7453094720840454, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8578125238418579, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 10.454687595367432, "epoch": 0.15907813378302418, "kl": 1.5999435186386108, "learning_rate": 9.98414839797639e-07, "loss": 0.017133023589849472, "ratio/all_0": 0.015625, "ratio/all_2": 0.59375, "reward": 1.8953125178813934, "reward_std": 0.7781753242015839, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.875, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8953125178813934, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 10.448437690734863, "epoch": 0.15964024732996066, "kl": 1.5763249397277832, "learning_rate": 9.984092186621697e-07, "loss": 0.012010748498141766, "ratio/all_0": 0.0, "ratio/all_2": 0.609375, "reward": 1.9031250476837158, "reward_std": 0.7713418155908585, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.9375, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9031250178813934, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 10.237499952316284, "epoch": 0.16020236087689713, "kl": 1.6282682716846466, "learning_rate": 9.984035975267004e-07, "loss": 0.025732139125466347, "ratio/all_0": 0.015625, "ratio/all_2": 0.65625, "reward": 1.909375011920929, "reward_std": 0.8114782422780991, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9093750268220901, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 10.395312786102295, "epoch": 0.16076447442383363, "kl": 1.6078749597072601, "learning_rate": 9.98397976391231e-07, "loss": 0.002353701274842024, "ratio/all_0": 0.078125, "ratio/all_2": 0.453125, "reward": 1.8125000298023224, "reward_std": 0.7375993728637695, "rewards/avg_0": 1.734375, "rewards/avg_1": 1.765625, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.734375, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.75, "rewards/avg_7": 1.734375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8125, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 10.382812738418579, "epoch": 0.1613265879707701, "kl": 1.615327775478363, "learning_rate": 9.983923552557615e-07, "loss": 0.000707289669662714, "ratio/all_0": 0.0625, "ratio/all_2": 0.453125, "reward": 1.8218750059604645, "reward_std": 0.729204848408699, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.734375, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.765625, "rewards/avg_6": 1.765625, "rewards/avg_7": 1.75, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8218750208616257, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 10.442187786102295, "epoch": 0.16188870151770657, "kl": 1.5665834844112396, "learning_rate": 9.983867341202922e-07, "loss": 0.018719196319580078, "ratio/all_0": 0.015625, "ratio/all_2": 0.609375, "reward": 1.901562511920929, "reward_std": 0.7854484617710114, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.875, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9015625268220901, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 10.479687452316284, "epoch": 0.16245081506464307, "kl": 1.5476877391338348, "learning_rate": 9.98381112984823e-07, "loss": 0.029271548613905907, "ratio/all_0": 0.015625, "ratio/all_2": 0.65625, "reward": 1.9218750298023224, "reward_std": 0.805764839053154, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9218750149011612, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 10.376562595367432, "epoch": 0.16301292861157954, "kl": 1.5733246803283691, "learning_rate": 9.983754918493536e-07, "loss": 0.032239705324172974, "ratio/all_0": 0.0, "ratio/all_2": 0.671875, "reward": 1.934374988079071, "reward_std": 0.8063534200191498, "rewards/avg_0": 1.953125, "rewards/avg_1": 1.875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.953125, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.953125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9343750178813934, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 10.378124952316284, "epoch": 0.163575042158516, "kl": 1.6404075622558594, "learning_rate": 9.98369870713884e-07, "loss": 0.029129423201084137, "ratio/all_0": 0.0625, "ratio/all_2": 0.6875, "reward": 1.8843750357627869, "reward_std": 0.8548843264579773, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8843750059604645, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 10.44687533378601, "epoch": 0.1641371557054525, "kl": 1.5501161813735962, "learning_rate": 9.983642495784147e-07, "loss": 0.017258845269680023, "ratio/all_0": 0.015625, "ratio/all_2": 0.59375, "reward": 1.8921875357627869, "reward_std": 0.7841527462005615, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8921875059604645, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 10.470312595367432, "epoch": 0.16469926925238898, "kl": 1.6193291246891022, "learning_rate": 9.983586284429454e-07, "loss": 0.004365533124655485, "ratio/all_0": 0.015625, "ratio/all_2": 0.671875, "reward": 1.8843750059604645, "reward_std": 0.8215802311897278, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.875, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8843750208616257, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 10.409375190734863, "epoch": 0.16526138279932545, "kl": 1.625065267086029, "learning_rate": 9.98353007307476e-07, "loss": 0.031236734241247177, "ratio/all_0": 0.015625, "ratio/all_2": 0.625, "reward": 1.9171875417232513, "reward_std": 0.7906895130872726, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.9375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.917187511920929, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 10.48593783378601, "epoch": 0.16582349634626195, "kl": 1.6580379605293274, "learning_rate": 9.983473861720065e-07, "loss": -0.010036187246441841, "ratio/all_0": 0.0625, "ratio/all_2": 0.53125, "reward": 1.8171875178813934, "reward_std": 0.7649829834699631, "rewards/avg_0": 1.75, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.75, "rewards/avg_6": 1.734375, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8171875029802322, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 10.325000047683716, "epoch": 0.16638560989319842, "kl": 1.6484183073043823, "learning_rate": 9.983417650365374e-07, "loss": 0.011245986446738243, "ratio/all_0": 0.046875, "ratio/all_2": 0.5625, "reward": 1.857812523841858, "reward_std": 0.7721955329179764, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8578125089406967, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 10.548437595367432, "epoch": 0.16694772344013492, "kl": 1.6036081910133362, "learning_rate": 9.983361439010681e-07, "loss": 0.02160033956170082, "ratio/all_0": 0.03125, "ratio/all_2": 0.5625, "reward": 1.885937511920929, "reward_std": 0.7658355087041855, "rewards/avg_0": 1.875, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.885937511920929, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 10.30625033378601, "epoch": 0.1675098369870714, "kl": 1.5906179249286652, "learning_rate": 9.983305227655986e-07, "loss": 0.023189380764961243, "ratio/all_0": 0.046875, "ratio/all_2": 0.625, "reward": 1.8828125298023224, "reward_std": 0.8100359886884689, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8828125149011612, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 10.362500429153442, "epoch": 0.16807195053400786, "kl": 1.57911616563797, "learning_rate": 9.983249016301292e-07, "loss": 0.024665620177984238, "ratio/all_0": 0.03125, "ratio/all_2": 0.640625, "reward": 1.8953125476837158, "reward_std": 0.813511535525322, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.9375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8953125029802322, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 10.420312643051147, "epoch": 0.16863406408094436, "kl": 1.5858178734779358, "learning_rate": 9.9831928049466e-07, "loss": 0.019302021712064743, "ratio/all_0": 0.0, "ratio/all_2": 0.5625, "reward": 1.9109375178813934, "reward_std": 0.7481989115476608, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.875, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9109375178813934, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 10.367187738418579, "epoch": 0.16919617762788083, "kl": 1.5977429747581482, "learning_rate": 9.983136593591906e-07, "loss": 0.03787188231945038, "ratio/all_0": 0.0625, "ratio/all_2": 0.625, "reward": 1.8937499821186066, "reward_std": 0.8181628286838531, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.875, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8937500268220901, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 10.29062533378601, "epoch": 0.1697582911748173, "kl": 1.5648789703845978, "learning_rate": 9.98308038223721e-07, "loss": 0.011258106678724289, "ratio/all_0": 0.046875, "ratio/all_2": 0.59375, "reward": 1.865625023841858, "reward_std": 0.7881258726119995, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8656250089406967, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 10.531250238418579, "epoch": 0.1703204047217538, "kl": 1.592081367969513, "learning_rate": 9.983024170882517e-07, "loss": 0.009078697301447392, "ratio/all_0": 0.078125, "ratio/all_2": 0.625, "reward": 1.8406250178813934, "reward_std": 0.8274097889661789, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.75, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.78125, "rewards/avg_7": 1.75, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8406250178813934, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 10.617187738418579, "epoch": 0.17088251826869028, "kl": 1.5056523084640503, "learning_rate": 9.982967959527824e-07, "loss": 0.014645126648247242, "ratio/all_0": 0.015625, "ratio/all_2": 0.703125, "reward": 1.9046874940395355, "reward_std": 0.8363915532827377, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.875, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.875, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9046875238418579, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 10.526562690734863, "epoch": 0.17144463181562675, "kl": 1.6216383576393127, "learning_rate": 9.98291174817313e-07, "loss": 0.018544694408774376, "ratio/all_0": 0.03125, "ratio/all_2": 0.65625, "reward": 1.8843750059604645, "reward_std": 0.822757288813591, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8843750208616257, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 10.337500095367432, "epoch": 0.17200674536256325, "kl": 1.5746294260025024, "learning_rate": 9.982855536818436e-07, "loss": 0.029133716598153114, "ratio/all_0": 0.015625, "ratio/all_2": 0.75, "reward": 1.9234375357627869, "reward_std": 0.8675475269556046, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9234375059604645, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 10.406250238418579, "epoch": 0.17256885890949972, "kl": 1.6034705936908722, "learning_rate": 9.982799325463742e-07, "loss": 0.022716794162988663, "ratio/all_0": 0.046875, "ratio/all_2": 0.609375, "reward": 1.8796875178813934, "reward_std": 0.8130151182413101, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8796875029802322, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 10.504687547683716, "epoch": 0.1731309724564362, "kl": 1.6118988692760468, "learning_rate": 9.98274311410905e-07, "loss": 0.025835514068603516, "ratio/all_0": 0.015625, "ratio/all_2": 0.65625, "reward": 1.9109375476837158, "reward_std": 0.8112130612134933, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9109375029802322, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 10.340625286102295, "epoch": 0.1736930860033727, "kl": 1.5647907555103302, "learning_rate": 9.982686902754356e-07, "loss": -0.005121363326907158, "ratio/all_0": 0.0, "ratio/all_2": 0.515625, "reward": 1.8703125417232513, "reward_std": 0.7209883630275726, "rewards/avg_0": 1.765625, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8703124970197678, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 10.40625, "epoch": 0.17425519955030916, "kl": 1.6206236481666565, "learning_rate": 9.982630691399663e-07, "loss": 0.014173053205013275, "ratio/all_0": 0.0625, "ratio/all_2": 0.59375, "reward": 1.854687511920929, "reward_std": 0.8075632303953171, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.75, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.78125, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.854687511920929, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 10.479687690734863, "epoch": 0.17481731309724563, "kl": 1.4940095245838165, "learning_rate": 9.98257448004497e-07, "loss": -0.020530520007014275, "ratio/all_0": 0.03125, "ratio/all_2": 0.515625, "reward": 1.8250000476837158, "reward_std": 0.7503218501806259, "rewards/avg_0": 1.75, "rewards/avg_1": 1.765625, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.71875, "rewards/avg_7": 1.765625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8250000029802322, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 10.342187643051147, "epoch": 0.17537942664418213, "kl": 1.6300634145736694, "learning_rate": 9.982518268690276e-07, "loss": 0.021781207993626595, "ratio/all_0": 0.03125, "ratio/all_2": 0.609375, "reward": 1.889062523841858, "reward_std": 0.7929854989051819, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8890625238418579, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 10.31406283378601, "epoch": 0.1759415401911186, "kl": 1.5964065194129944, "learning_rate": 9.98246205733558e-07, "loss": 0.019931331276893616, "ratio/all_0": 0.046875, "ratio/all_2": 0.640625, "reward": 1.8781250417232513, "reward_std": 0.8184268027544022, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.878125011920929, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 10.473437786102295, "epoch": 0.1765036537380551, "kl": 1.5781501233577728, "learning_rate": 9.982405845980888e-07, "loss": 0.014711914584040642, "ratio/all_0": 0.03125, "ratio/all_2": 0.609375, "reward": 1.8828125298023224, "reward_std": 0.7929855138063431, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8828125, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 10.487500190734863, "epoch": 0.17706576728499157, "kl": 1.5839548408985138, "learning_rate": 9.982349634626194e-07, "loss": 0.007520720828324556, "ratio/all_0": 0.015625, "ratio/all_2": 0.5625, "reward": 1.885937511920929, "reward_std": 0.7497561424970627, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.885937511920929, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 10.245312690734863, "epoch": 0.17762788083192804, "kl": 1.586982786655426, "learning_rate": 9.982293423271501e-07, "loss": 0.00036374107003211975, "ratio/all_0": 0.03125, "ratio/all_2": 0.546875, "reward": 1.854687511920929, "reward_std": 0.7707860469818115, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.765625, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.854687511920929, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 10.434375047683716, "epoch": 0.17818999437886454, "kl": 1.5729843974113464, "learning_rate": 9.982237211916806e-07, "loss": 0.007898079231381416, "ratio/all_0": 0.0, "ratio/all_2": 0.625, "reward": 1.8953125476837158, "reward_std": 0.7891903966665268, "rewards/avg_0": 1.875, "rewards/avg_1": 1.875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.875, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8953125029802322, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 10.385937690734863, "epoch": 0.178752107925801, "kl": 1.5544759631156921, "learning_rate": 9.982181000562113e-07, "loss": 0.006270549260079861, "ratio/all_0": 0.046875, "ratio/all_2": 0.515625, "reward": 1.857812523841858, "reward_std": 0.7436625808477402, "rewards/avg_0": 1.75, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.75, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8578125089406967, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 10.350000381469727, "epoch": 0.17931422147273748, "kl": 1.5743531286716461, "learning_rate": 9.98212478920742e-07, "loss": 0.023270972073078156, "ratio/all_0": 0.03125, "ratio/all_2": 0.59375, "reward": 1.890625, "reward_std": 0.7828847467899323, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8906250149011612, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 10.340625286102295, "epoch": 0.17987633501967398, "kl": 1.5790450274944305, "learning_rate": 9.982068577852726e-07, "loss": 0.02013951539993286, "ratio/all_0": 0.0625, "ratio/all_2": 0.515625, "reward": 1.865625023841858, "reward_std": 0.7560012191534042, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.765625, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8656250089406967, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 10.237500190734863, "epoch": 0.18043844856661045, "kl": 1.616362452507019, "learning_rate": 9.982012366498033e-07, "loss": 0.0052131060510873795, "ratio/all_0": 0.046875, "ratio/all_2": 0.46875, "reward": 1.8421875536441803, "reward_std": 0.7244033068418503, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.75, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8421875089406967, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 10.382812976837158, "epoch": 0.18100056211354693, "kl": 1.5818106830120087, "learning_rate": 9.981956155143338e-07, "loss": 0.014903640374541283, "ratio/all_0": 0.078125, "ratio/all_2": 0.5625, "reward": 1.842187523841858, "reward_std": 0.7995538413524628, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.75, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.75, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8421875238418579, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 10.398437738418579, "epoch": 0.18156267566048342, "kl": 1.5807396471500397, "learning_rate": 9.981899943788644e-07, "loss": 0.0053629628382623196, "ratio/all_0": 0.046875, "ratio/all_2": 0.53125, "reward": 1.8484375476837158, "reward_std": 0.7667220234870911, "rewards/avg_0": 1.875, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.75, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8484375029802322, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 10.39218783378601, "epoch": 0.1821247892074199, "kl": 1.5583379864692688, "learning_rate": 9.981843732433951e-07, "loss": -0.00255510862916708, "ratio/all_0": 0.0625, "ratio/all_2": 0.484375, "reward": 1.8250000178813934, "reward_std": 0.7457249462604523, "rewards/avg_0": 1.71875, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.75, "rewards/avg_3": 1.765625, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.765625, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8250000029802322, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 10.398437738418579, "epoch": 0.18268690275435637, "kl": 1.5831120014190674, "learning_rate": 9.981787521079258e-07, "loss": -0.011474616825580597, "ratio/all_0": 0.0625, "ratio/all_2": 0.5, "reward": 1.8171875178813934, "reward_std": 0.7433380484580994, "rewards/avg_0": 1.765625, "rewards/avg_1": 1.75, "rewards/avg_2": 1.765625, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.71875, "rewards/avg_6": 1.765625, "rewards/avg_7": 1.765625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8171875178813934, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 10.418750286102295, "epoch": 0.18324901630129287, "kl": 1.5725690126419067, "learning_rate": 9.981731309724565e-07, "loss": 0.016651010140776634, "ratio/all_0": 0.015625, "ratio/all_2": 0.59375, "reward": 1.8953125178813934, "reward_std": 0.7747579514980316, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8953125029802322, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 10.435937643051147, "epoch": 0.18381112984822934, "kl": 1.591273546218872, "learning_rate": 9.981675098369872e-07, "loss": 0.013820771127939224, "ratio/all_0": 0.015625, "ratio/all_2": 0.578125, "reward": 1.8906250298023224, "reward_std": 0.7669556438922882, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8906250149011612, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 10.481250047683716, "epoch": 0.18437324339516584, "kl": 1.6049018502235413, "learning_rate": 9.981618887015176e-07, "loss": -0.015562339685857296, "ratio/all_0": 0.015625, "ratio/all_2": 0.46875, "reward": 1.8421875536441803, "reward_std": 0.6979580372571945, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.71875, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8421875089406967, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 10.570312738418579, "epoch": 0.1849353569421023, "kl": 1.596179574728012, "learning_rate": 9.981562675660483e-07, "loss": 0.015992822125554085, "ratio/all_0": 0.0625, "ratio/all_2": 0.640625, "reward": 1.8562500476837158, "reward_std": 0.8342421501874924, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.765625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8562500029802322, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 10.570312738418579, "epoch": 0.18549747048903878, "kl": 1.5825116634368896, "learning_rate": 9.98150646430579e-07, "loss": -0.016910620033740997, "ratio/all_0": 0.0625, "ratio/all_2": 0.484375, "reward": 1.8078125417232513, "reward_std": 0.7369780391454697, "rewards/avg_0": 1.71875, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.75, "rewards/avg_4": 1.765625, "rewards/avg_5": 1.75, "rewards/avg_6": 1.75, "rewards/avg_7": 1.6875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.807812511920929, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 10.49375033378601, "epoch": 0.18605958403597528, "kl": 1.592354953289032, "learning_rate": 9.981450252951096e-07, "loss": 0.02186637371778488, "ratio/all_0": 0.03125, "ratio/all_2": 0.59375, "reward": 1.8921875357627869, "reward_std": 0.7789381444454193, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.875, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8921875059604645, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 10.5546875, "epoch": 0.18662169758291175, "kl": 1.5267528891563416, "learning_rate": 9.981394041596401e-07, "loss": 0.032644666731357574, "ratio/all_0": 0.0625, "ratio/all_2": 0.640625, "reward": 1.889062523841858, "reward_std": 0.8288520723581314, "rewards/avg_0": 1.875, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8890625238418579, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 10.526562452316284, "epoch": 0.18718381112984822, "kl": 1.5329179763793945, "learning_rate": 9.981337830241708e-07, "loss": 0.0030208320822566748, "ratio/all_0": 0.03125, "ratio/all_2": 0.5, "reward": 1.862500011920929, "reward_std": 0.7397526204586029, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.875, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.734375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8640625029802322, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 10.589062690734863, "epoch": 0.18774592467678472, "kl": 1.6342924535274506, "learning_rate": 9.981281618887015e-07, "loss": 0.0077019971795380116, "ratio/all_0": 0.03125, "ratio/all_2": 0.5625, "reward": 1.8656249940395355, "reward_std": 0.7672802358865738, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.875, "rewards/avg_7": 1.765625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8656250238418579, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 10.593750238418579, "epoch": 0.1883080382237212, "kl": 1.6173183023929596, "learning_rate": 9.981225407532321e-07, "loss": 0.03500716760754585, "ratio/all_0": 0.046875, "ratio/all_2": 0.609375, "reward": 1.901562511920929, "reward_std": 0.7957247048616409, "rewards/avg_0": 1.875, "rewards/avg_1": 1.875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.901562511920929, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 10.456250190734863, "epoch": 0.18887015177065766, "kl": 1.5762000679969788, "learning_rate": 9.981169196177628e-07, "loss": 0.02564232610166073, "ratio/all_0": 0.03125, "ratio/all_2": 0.671875, "reward": 1.9000000059604645, "reward_std": 0.8302361965179443, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.953125, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9000000208616257, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 10.412500381469727, "epoch": 0.18943226531759416, "kl": 1.6118149757385254, "learning_rate": 9.981112984822933e-07, "loss": -0.015139142982661724, "ratio/all_0": 0.078125, "ratio/all_2": 0.515625, "reward": 1.792187511920929, "reward_std": 0.780357614159584, "rewards/avg_0": 1.765625, "rewards/avg_1": 1.734375, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.71875, "rewards/avg_4": 1.734375, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.6875, "rewards/avg_7": 1.703125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.792187511920929, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 10.465625286102295, "epoch": 0.18999437886453063, "kl": 1.5685740411281586, "learning_rate": 9.98105677346824e-07, "loss": 0.013548189774155617, "ratio/all_0": 0.046875, "ratio/all_2": 0.546875, "reward": 1.8656249940395355, "reward_std": 0.7624193429946899, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8656250089406967, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 10.489062547683716, "epoch": 0.1905564924114671, "kl": 1.6768341958522797, "learning_rate": 9.981000562113546e-07, "loss": 0.027506785467267036, "ratio/all_0": 0.015625, "ratio/all_2": 0.609375, "reward": 1.9078125357627869, "reward_std": 0.7817645967006683, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.9375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9078125059604645, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 10.407812595367432, "epoch": 0.1911186059584036, "kl": 1.5220493972301483, "learning_rate": 9.980944350758853e-07, "loss": 0.014555826783180237, "ratio/all_0": 0.0, "ratio/all_2": 0.59375, "reward": 1.90625, "reward_std": 0.763274297118187, "rewards/avg_0": 1.9375, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.90625, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 10.437500238418579, "epoch": 0.19168071950534007, "kl": 1.5265831649303436, "learning_rate": 9.980888139404158e-07, "loss": 0.010628137737512589, "ratio/all_0": 0.015625, "ratio/all_2": 0.6875, "reward": 1.8937500417232513, "reward_std": 0.8350971043109894, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.893750011920929, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 10.521875143051147, "epoch": 0.19224283305227655, "kl": 1.488110363483429, "learning_rate": 9.980831928049467e-07, "loss": 0.006948402151465416, "ratio/all_0": 0.015625, "ratio/all_2": 0.59375, "reward": 1.889062523841858, "reward_std": 0.774493932723999, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.875, "rewards/avg_3": 1.875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8890625089406967, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 10.481250286102295, "epoch": 0.19280494659921305, "kl": 1.5184958577156067, "learning_rate": 9.980775716694771e-07, "loss": 0.01944681815803051, "ratio/all_0": 0.0, "ratio/all_2": 0.671875, "reward": 1.9156250357627869, "reward_std": 0.8254118114709854, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.9375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.9375, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9156250059604645, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 10.668750286102295, "epoch": 0.19336706014614952, "kl": 1.4792628586292267, "learning_rate": 9.980719505340078e-07, "loss": 0.010150870308279991, "ratio/all_0": 0.015625, "ratio/all_2": 0.59375, "reward": 1.8953125476837158, "reward_std": 0.7756129056215286, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.895312488079071, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 10.492187738418579, "epoch": 0.19392917369308602, "kl": 1.5462858974933624, "learning_rate": 9.980663293985385e-07, "loss": 0.006519682239741087, "ratio/all_0": 0.015625, "ratio/all_2": 0.5625, "reward": 1.8796875476837158, "reward_std": 0.7582378536462784, "rewards/avg_0": 1.875, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8796875029802322, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 10.500000238418579, "epoch": 0.1944912872400225, "kl": 1.6059279441833496, "learning_rate": 9.980607082630692e-07, "loss": -0.009531073272228241, "ratio/all_0": 0.03125, "ratio/all_2": 0.53125, "reward": 1.8375000357627869, "reward_std": 0.7587139010429382, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.734375, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.75, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8375000059604645, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 10.500000238418579, "epoch": 0.19505340078695896, "kl": 1.581993728876114, "learning_rate": 9.980550871275998e-07, "loss": 0.027156025171279907, "ratio/all_0": 0.03125, "ratio/all_2": 0.609375, "reward": 1.8984375, "reward_std": 0.7981103658676147, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.875, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8984375149011612, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 10.546875, "epoch": 0.19561551433389546, "kl": 1.5765255689620972, "learning_rate": 9.980494659921303e-07, "loss": 0.008464457467198372, "ratio/all_0": 0.03125, "ratio/all_2": 0.5625, "reward": 1.8718750476837158, "reward_std": 0.7553205490112305, "rewards/avg_0": 1.875, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8718750178813934, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 10.450000286102295, "epoch": 0.19617762788083193, "kl": 1.5645933151245117, "learning_rate": 9.98043844856661e-07, "loss": 0.024853838607668877, "ratio/all_0": 0.0625, "ratio/all_2": 0.640625, "reward": 1.8718750178813934, "reward_std": 0.8353610932826996, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8718750029802322, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 10.345312595367432, "epoch": 0.1967397414277684, "kl": 1.6021571457386017, "learning_rate": 9.980382237211917e-07, "loss": 0.014802216552197933, "ratio/all_0": 0.0625, "ratio/all_2": 0.640625, "reward": 1.8609375059604645, "reward_std": 0.8342433571815491, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8609375208616257, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 10.448437929153442, "epoch": 0.1973018549747049, "kl": 1.5465485453605652, "learning_rate": 9.980326025857223e-07, "loss": 0.015489017590880394, "ratio/all_0": 0.03125, "ratio/all_2": 0.6875, "reward": 1.8843750059604645, "reward_std": 0.8451384902000427, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8843750208616257, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 10.354687690734863, "epoch": 0.19786396852164137, "kl": 1.5961918532848358, "learning_rate": 9.980269814502528e-07, "loss": 0.0025184075348079205, "ratio/all_0": 0.015625, "ratio/all_2": 0.65625, "reward": 1.8828125596046448, "reward_std": 0.8035257160663605, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8828125149011612, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 10.414062738418579, "epoch": 0.19842608206857784, "kl": 1.6324600279331207, "learning_rate": 9.980213603147835e-07, "loss": 0.03646010905504227, "ratio/all_0": 0.0625, "ratio/all_2": 0.671875, "reward": 1.896875023841858, "reward_std": 0.8411010056734085, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.875, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8968750089406967, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 10.350000143051147, "epoch": 0.19898819561551434, "kl": 1.555162936449051, "learning_rate": 9.980157391793142e-07, "loss": 0.03622834011912346, "ratio/all_0": 0.015625, "ratio/all_2": 0.703125, "reward": 1.932812511920929, "reward_std": 0.8318552523851395, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.9375, "rewards/avg_3": 1.9375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.96875, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.932812511920929, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 10.462500095367432, "epoch": 0.1995503091624508, "kl": 1.6028335988521576, "learning_rate": 9.980101180438448e-07, "loss": 0.017967145889997482, "ratio/all_0": 0.046875, "ratio/all_2": 0.65625, "reward": 1.873437523841858, "reward_std": 0.8339188247919083, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8734375089406967, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 10.381250381469727, "epoch": 0.20011242270938728, "kl": 1.5512114763259888, "learning_rate": 9.980044969083753e-07, "loss": 0.028043627738952637, "ratio/all_0": 0.015625, "ratio/all_2": 0.671875, "reward": 1.920312523841858, "reward_std": 0.8178975880146027, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.9218750149011612, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 10.398437738418579, "epoch": 0.20067453625632378, "kl": 1.64331915974617, "learning_rate": 9.979988757729062e-07, "loss": 0.01075400784611702, "ratio/all_0": 0.0625, "ratio/all_2": 0.53125, "reward": 1.8484375476837158, "reward_std": 0.7601196467876434, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.734375, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8484375178813934, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 10.457812547683716, "epoch": 0.20123664980326025, "kl": 1.6044352352619171, "learning_rate": 9.979932546374367e-07, "loss": 0.01619352400302887, "ratio/all_0": 0.03125, "ratio/all_2": 0.578125, "reward": 1.8796875476837158, "reward_std": 0.7670706510543823, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8796875029802322, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 10.553125143051147, "epoch": 0.20179876335019675, "kl": 1.579399436712265, "learning_rate": 9.979876335019673e-07, "loss": 0.017630768939852715, "ratio/all_0": 0.046875, "ratio/all_2": 0.5625, "reward": 1.8687500059604645, "reward_std": 0.7773216217756271, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8687500208616257, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 10.58750033378601, "epoch": 0.20236087689713322, "kl": 1.5743523836135864, "learning_rate": 9.97982012366498e-07, "loss": 0.028611905872821808, "ratio/all_0": 0.046875, "ratio/all_2": 0.6875, "reward": 1.8953125178813934, "reward_std": 0.8501724600791931, "rewards/avg_0": 1.875, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8953125178813934, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 10.462500095367432, "epoch": 0.2029229904440697, "kl": 1.5823079943656921, "learning_rate": 9.979763912310287e-07, "loss": 0.018070163205266, "ratio/all_0": 0.03125, "ratio/all_2": 0.515625, "reward": 1.889062523841858, "reward_std": 0.7269922941923141, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8890625089406967, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 10.409375190734863, "epoch": 0.2034851039910062, "kl": 1.5337883830070496, "learning_rate": 9.979707700955594e-07, "loss": 0.02222316712141037, "ratio/all_0": 0.015625, "ratio/all_2": 0.625, "reward": 1.9124999940395355, "reward_std": 0.7893053442239761, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.875, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9125000089406967, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 10.546875, "epoch": 0.20404721753794267, "kl": 1.4978066980838776, "learning_rate": 9.979651489600898e-07, "loss": 0.028549140319228172, "ratio/all_0": 0.03125, "ratio/all_2": 0.59375, "reward": 1.9078125357627869, "reward_std": 0.7738133370876312, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9078125059604645, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 10.428125143051147, "epoch": 0.20460933108487914, "kl": 1.5625019669532776, "learning_rate": 9.979595278246205e-07, "loss": 0.022949017584323883, "ratio/all_0": 0.03125, "ratio/all_2": 0.59375, "reward": 1.893750011920929, "reward_std": 0.7820322662591934, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.893750011920929, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 10.537500143051147, "epoch": 0.20517144463181564, "kl": 1.6440396308898926, "learning_rate": 9.979539066891512e-07, "loss": 0.012113522738218307, "ratio/all_0": 0.046875, "ratio/all_2": 0.578125, "reward": 1.8531250059604645, "reward_std": 0.7996447384357452, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8531250059604645, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 10.534375190734863, "epoch": 0.2057335581787521, "kl": 1.5494819581508636, "learning_rate": 9.979482855536819e-07, "loss": 0.038545746356248856, "ratio/all_0": 0.0, "ratio/all_2": 0.75, "reward": 1.9484375417232513, "reward_std": 0.8534396290779114, "rewards/avg_0": 1.9375, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.953125, "rewards/avg_3": 1.9375, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.953125, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.9375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.948437511920929, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 10.525000095367432, "epoch": 0.20629567172568858, "kl": 1.5757357478141785, "learning_rate": 9.979426644182123e-07, "loss": 0.015849340707063675, "ratio/all_0": 0.046875, "ratio/all_2": 0.578125, "reward": 1.8718750178813934, "reward_std": 0.7840642780065536, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8718750178813934, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 10.434375047683716, "epoch": 0.20685778527262508, "kl": 1.6400718986988068, "learning_rate": 9.97937043282743e-07, "loss": 0.0333583801984787, "ratio/all_0": 0.03125, "ratio/all_2": 0.671875, "reward": 1.909375011920929, "reward_std": 0.8277343511581421, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.9375, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.909375011920929, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 10.59375, "epoch": 0.20741989881956155, "kl": 1.5465018153190613, "learning_rate": 9.979314221472737e-07, "loss": 0.015456119552254677, "ratio/all_0": 0.03125, "ratio/all_2": 0.59375, "reward": 1.8828125596046448, "reward_std": 0.7825614660978317, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8828125, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 10.57968783378601, "epoch": 0.20798201236649802, "kl": 1.5840701758861542, "learning_rate": 9.979258010118044e-07, "loss": 0.013504572212696075, "ratio/all_0": 0.03125, "ratio/all_2": 0.625, "reward": 1.8812500536441803, "reward_std": 0.8036747425794601, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8812500089406967, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 10.364062547683716, "epoch": 0.20854412591343452, "kl": 1.5801215767860413, "learning_rate": 9.979201798763348e-07, "loss": 0.007882913574576378, "ratio/all_0": 0.03125, "ratio/all_2": 0.640625, "reward": 1.8703125417232513, "reward_std": 0.8143058866262436, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.875, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.870312511920929, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 10.595312595367432, "epoch": 0.209106239460371, "kl": 1.5764483511447906, "learning_rate": 9.979145587408655e-07, "loss": 0.02095562405884266, "ratio/all_0": 0.0625, "ratio/all_2": 0.609375, "reward": 1.8687500357627869, "reward_std": 0.8148956596851349, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.875, "rewards/avg_3": 1.875, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8687500059604645, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 10.579687595367432, "epoch": 0.2096683530073075, "kl": 1.5262295305728912, "learning_rate": 9.979089376053964e-07, "loss": -0.005769479088485241, "ratio/all_0": 0.015625, "ratio/all_2": 0.53125, "reward": 1.8609375059604645, "reward_std": 0.7386855036020279, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.875, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.78125, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8609375208616257, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 10.607812404632568, "epoch": 0.21023046655424396, "kl": 1.5864124596118927, "learning_rate": 9.979033164699269e-07, "loss": 0.017187146469950676, "ratio/all_0": 0.09375, "ratio/all_2": 0.5, "reward": 1.8375000357627869, "reward_std": 0.7572944611310959, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.734375, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.78125, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8375000059604645, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 10.489062786102295, "epoch": 0.21079258010118043, "kl": 1.592396318912506, "learning_rate": 9.978976953344575e-07, "loss": 0.012976432219147682, "ratio/all_0": 0.015625, "ratio/all_2": 0.5625, "reward": 1.8859375417232513, "reward_std": 0.7594173699617386, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.875, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.885937511920929, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 10.604687929153442, "epoch": 0.21135469364811693, "kl": 1.5425201952457428, "learning_rate": 9.978920741989882e-07, "loss": 0.02891225554049015, "ratio/all_0": 0.03125, "ratio/all_2": 0.671875, "reward": 1.9078125059604645, "reward_std": 0.8302794843912125, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.909375011920929, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 10.482812881469727, "epoch": 0.2119168071950534, "kl": 1.5657863020896912, "learning_rate": 9.978864530635189e-07, "loss": 0.025767218321561813, "ratio/all_0": 0.03125, "ratio/all_2": 0.671875, "reward": 1.9015625417232513, "reward_std": 0.8274085968732834, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.901562511920929, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 10.575000286102295, "epoch": 0.21247892074198987, "kl": 1.5857770144939423, "learning_rate": 9.978808319280494e-07, "loss": 0.01667657122015953, "ratio/all_0": 0.046875, "ratio/all_2": 0.625, "reward": 1.8765625357627869, "reward_std": 0.8095056265592575, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8765625208616257, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 10.503125190734863, "epoch": 0.21304103428892637, "kl": 1.5959461033344269, "learning_rate": 9.9787521079258e-07, "loss": 0.02422156184911728, "ratio/all_0": 0.046875, "ratio/all_2": 0.609375, "reward": 1.8828125, "reward_std": 0.79684117436409, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8828125149011612, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 10.410937547683716, "epoch": 0.21360314783586284, "kl": 1.606498122215271, "learning_rate": 9.978695896571107e-07, "loss": 0.031145939603447914, "ratio/all_0": 0.078125, "ratio/all_2": 0.625, "reward": 1.873437523841858, "reward_std": 0.8260256797075272, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.765625, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8734375238418579, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 10.540625095367432, "epoch": 0.21416526138279932, "kl": 1.6109911799430847, "learning_rate": 9.978639685216414e-07, "loss": 0.027983110398054123, "ratio/all_0": 0.046875, "ratio/all_2": 0.671875, "reward": 1.893750011920929, "reward_std": 0.8392192274332047, "rewards/avg_0": 1.875, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.893750011920929, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 10.517187595367432, "epoch": 0.21472737492973581, "kl": 1.5722034871578217, "learning_rate": 9.978583473861719e-07, "loss": 0.008846685290336609, "ratio/all_0": 0.03125, "ratio/all_2": 0.65625, "reward": 1.8687500357627869, "reward_std": 0.8315659910440445, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.875, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8687500059604645, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 10.415625095367432, "epoch": 0.21528948847667229, "kl": 1.5834924280643463, "learning_rate": 9.978527262507025e-07, "loss": 0.017309188842773438, "ratio/all_0": 0.015625, "ratio/all_2": 0.640625, "reward": 1.895312488079071, "reward_std": 0.8090067058801651, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8953125327825546, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 10.556250095367432, "epoch": 0.21585160202360876, "kl": 1.5703161358833313, "learning_rate": 9.978471051152332e-07, "loss": 0.019084349274635315, "ratio/all_0": 0.03125, "ratio/all_2": 0.671875, "reward": 1.893750011920929, "reward_std": 0.8291173279285431, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.893750011920929, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 10.362500190734863, "epoch": 0.21641371557054526, "kl": 1.5886655449867249, "learning_rate": 9.978414839797639e-07, "loss": 0.028999771922826767, "ratio/all_0": 0.015625, "ratio/all_2": 0.625, "reward": 1.9187500178813934, "reward_std": 0.7858879417181015, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.9375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9187500029802322, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 10.295312643051147, "epoch": 0.21697582911748173, "kl": 1.6783213019371033, "learning_rate": 9.978358628442946e-07, "loss": 0.04459211230278015, "ratio/all_0": 0.0625, "ratio/all_2": 0.6875, "reward": 1.9031250178813934, "reward_std": 0.8525859266519547, "rewards/avg_0": 1.875, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9031250178813934, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 10.375, "epoch": 0.2175379426644182, "kl": 1.597625732421875, "learning_rate": 9.97830241708825e-07, "loss": 0.012688029557466507, "ratio/all_0": 0.03125, "ratio/all_2": 0.546875, "reward": 1.8765625357627869, "reward_std": 0.7545565068721771, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.765625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8765625059604645, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 10.442187786102295, "epoch": 0.2181000562113547, "kl": 1.5851397812366486, "learning_rate": 9.97824620573356e-07, "loss": 0.03079412877559662, "ratio/all_0": 0.046875, "ratio/all_2": 0.625, "reward": 1.8984375298023224, "reward_std": 0.8054997026920319, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8984375149011612, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 10.317187547683716, "epoch": 0.21866216975829117, "kl": 1.564811259508133, "learning_rate": 9.978189994378864e-07, "loss": 0.013478105887770653, "ratio/all_0": 0.0625, "ratio/all_2": 0.609375, "reward": 1.857812523841858, "reward_std": 0.814572349190712, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.875, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.765625, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8578125089406967, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 10.317187786102295, "epoch": 0.21922428330522767, "kl": 1.582626074552536, "learning_rate": 9.97813378302417e-07, "loss": 0.030684249475598335, "ratio/all_0": 0.078125, "ratio/all_2": 0.59375, "reward": 1.8734374940395355, "reward_std": 0.7998443990945816, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.875, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8734375238418579, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 10.339062690734863, "epoch": 0.21978639685216414, "kl": 1.6444810032844543, "learning_rate": 9.978077571669477e-07, "loss": 0.017499804496765137, "ratio/all_0": 0.015625, "ratio/all_2": 0.65625, "reward": 1.8984375596046448, "reward_std": 0.8140418827533722, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8984375, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 10.404687881469727, "epoch": 0.2203485103991006, "kl": 1.6334162950515747, "learning_rate": 9.978021360314784e-07, "loss": 0.020157240331172943, "ratio/all_0": 0.046875, "ratio/all_2": 0.578125, "reward": 1.8687500059604645, "reward_std": 0.7964091897010803, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.765625, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8703125268220901, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 10.318750381469727, "epoch": 0.2209106239460371, "kl": 1.6211105585098267, "learning_rate": 9.977965148960089e-07, "loss": 0.017307277768850327, "ratio/all_0": 0.078125, "ratio/all_2": 0.578125, "reward": 1.850000023841858, "reward_std": 0.7988163083791733, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.765625, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8500000238418579, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 10.357812881469727, "epoch": 0.22147273749297358, "kl": 1.5798417031764984, "learning_rate": 9.977908937605396e-07, "loss": 0.019197724759578705, "ratio/all_0": 0.03125, "ratio/all_2": 0.65625, "reward": 1.8921875357627869, "reward_std": 0.8184860795736313, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8921875059604645, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 10.396874904632568, "epoch": 0.22203485103991005, "kl": 1.5937089622020721, "learning_rate": 9.977852726250702e-07, "loss": -0.003021673299372196, "ratio/all_0": 0.0, "ratio/all_2": 0.578125, "reward": 1.8718750476837158, "reward_std": 0.766514927148819, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8718750178813934, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 10.267187595367432, "epoch": 0.22259696458684655, "kl": 1.5950027704238892, "learning_rate": 9.97779651489601e-07, "loss": 0.03185337409377098, "ratio/all_0": 0.046875, "ratio/all_2": 0.703125, "reward": 1.9000000059604645, "reward_std": 0.8525859266519547, "rewards/avg_0": 1.875, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9000000059604645, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 10.409375190734863, "epoch": 0.22315907813378302, "kl": 1.5446783006191254, "learning_rate": 9.977740303541314e-07, "loss": 0.020665908232331276, "ratio/all_0": 0.015625, "ratio/all_2": 0.625, "reward": 1.9015625417232513, "reward_std": 0.7969914227724075, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.901562511920929, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 10.395312786102295, "epoch": 0.2237211916807195, "kl": 1.5845154523849487, "learning_rate": 9.97768409218662e-07, "loss": 0.0206165611743927, "ratio/all_0": 0.078125, "ratio/all_2": 0.609375, "reward": 1.8484375476837158, "reward_std": 0.8286777585744858, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8484375029802322, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 10.417187690734863, "epoch": 0.224283305227656, "kl": 1.6193600594997406, "learning_rate": 9.977627880831927e-07, "loss": 0.030626408755779266, "ratio/all_0": 0.0, "ratio/all_2": 0.71875, "reward": 1.932812511920929, "reward_std": 0.8446698635816574, "rewards/avg_0": 1.96875, "rewards/avg_1": 1.96875, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.9375, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.9343750178813934, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 10.428125143051147, "epoch": 0.22484541877459246, "kl": 1.623527467250824, "learning_rate": 9.977571669477234e-07, "loss": 0.024629278108477592, "ratio/all_0": 0.046875, "ratio/all_2": 0.71875, "reward": 1.8828125298023224, "reward_std": 0.8720838725566864, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8828125149011612, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 10.539062738418579, "epoch": 0.22540753232152894, "kl": 1.56095489859581, "learning_rate": 9.97751545812254e-07, "loss": -0.004245146177709103, "ratio/all_0": 0.0625, "ratio/all_2": 0.46875, "reward": 1.8296875357627869, "reward_std": 0.7254326045513153, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.75, "rewards/avg_7": 1.765625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8296875208616257, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 10.514062881469727, "epoch": 0.22596964586846544, "kl": 1.557588368654251, "learning_rate": 9.977459246767845e-07, "loss": 0.0035413671284914017, "ratio/all_0": 0.015625, "ratio/all_2": 0.671875, "reward": 1.885937511920929, "reward_std": 0.8230249434709549, "rewards/avg_0": 1.875, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.875, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8859375268220901, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 10.495312690734863, "epoch": 0.2265317594154019, "kl": 1.5131984651088715, "learning_rate": 9.977403035413154e-07, "loss": 0.02268044278025627, "ratio/all_0": 0.015625, "ratio/all_2": 0.625, "reward": 1.9093749821186066, "reward_std": 0.793575257062912, "rewards/avg_0": 1.875, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.953125, "rewards/avg_5": 1.875, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9093750268220901, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 10.500000238418579, "epoch": 0.2270938729623384, "kl": 1.5539019405841827, "learning_rate": 9.97734682405846e-07, "loss": 0.006285407580435276, "ratio/all_0": 0.09375, "ratio/all_2": 0.53125, "reward": 1.8171875178813934, "reward_std": 0.7920168191194534, "rewards/avg_0": 1.734375, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.734375, "rewards/avg_3": 1.765625, "rewards/avg_4": 1.75, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.765625, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8171875178813934, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 10.506250143051147, "epoch": 0.22765598650927488, "kl": 1.6138789355754852, "learning_rate": 9.977290612703766e-07, "loss": 0.011725908145308495, "ratio/all_0": 0.015625, "ratio/all_2": 0.671875, "reward": 1.8921875357627869, "reward_std": 0.8255873620510101, "rewards/avg_0": 1.875, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8921875208616257, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 10.471875190734863, "epoch": 0.22821810005621135, "kl": 1.526683360338211, "learning_rate": 9.977234401349073e-07, "loss": 0.027411654591560364, "ratio/all_0": 0.03125, "ratio/all_2": 0.65625, "reward": 1.9093750417232513, "reward_std": 0.8125075399875641, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.953125, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.909375011920929, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 10.47812533378601, "epoch": 0.22878021360314785, "kl": 1.6264634728431702, "learning_rate": 9.97717818999438e-07, "loss": 0.01731857657432556, "ratio/all_0": 0.015625, "ratio/all_2": 0.625, "reward": 1.889062523841858, "reward_std": 0.8020582050085068, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8890625089406967, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 10.4765625, "epoch": 0.22934232715008432, "kl": 1.6295714378356934, "learning_rate": 9.977121978639684e-07, "loss": 0.017600316554307938, "ratio/all_0": 0.09375, "ratio/all_2": 0.5625, "reward": 1.8375000059604645, "reward_std": 0.8005238026380539, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.765625, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8375000059604645, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 10.534375190734863, "epoch": 0.2299044406970208, "kl": 1.6157631576061249, "learning_rate": 9.97706576728499e-07, "loss": 0.015536559745669365, "ratio/all_0": 0.015625, "ratio/all_2": 0.625, "reward": 1.8968750536441803, "reward_std": 0.7858879566192627, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8968749940395355, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 10.476562738418579, "epoch": 0.2304665542439573, "kl": 1.5186310112476349, "learning_rate": 9.977009555930298e-07, "loss": 0.020370997488498688, "ratio/all_0": 0.078125, "ratio/all_2": 0.546875, "reward": 1.8531250059604645, "reward_std": 0.7863021939992905, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.875, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.75, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8531250208616257, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 10.484375238418579, "epoch": 0.23102866779089376, "kl": 1.554486483335495, "learning_rate": 9.976953344575604e-07, "loss": 0.016492923721671104, "ratio/all_0": 0.046875, "ratio/all_2": 0.59375, "reward": 1.8765625059604645, "reward_std": 0.7910115867853165, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.875, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8765625059604645, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 10.554687738418579, "epoch": 0.23159078133783023, "kl": 1.5660745203495026, "learning_rate": 9.976897133220911e-07, "loss": 0.015180633403360844, "ratio/all_0": 0.03125, "ratio/all_2": 0.53125, "reward": 1.878125011920929, "reward_std": 0.7512881010770798, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8781250268220901, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 10.562500238418579, "epoch": 0.23215289488476673, "kl": 1.5972300469875336, "learning_rate": 9.976840921866216e-07, "loss": -0.0018402566201984882, "ratio/all_0": 0.03125, "ratio/all_2": 0.65625, "reward": 1.859375, "reward_std": 0.819339856505394, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8593750298023224, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 10.426562547683716, "epoch": 0.2327150084317032, "kl": 1.564846783876419, "learning_rate": 9.976784710511522e-07, "loss": 0.011204143986105919, "ratio/all_0": 0.015625, "ratio/all_2": 0.484375, "reward": 1.8796875476837158, "reward_std": 0.7146098762750626, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8812500089406967, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 10.589062690734863, "epoch": 0.23327712197863967, "kl": 1.5199431777000427, "learning_rate": 9.97672849915683e-07, "loss": 0.01307937502861023, "ratio/all_0": 0.015625, "ratio/all_2": 0.578125, "reward": 1.8921875357627869, "reward_std": 0.7638034969568253, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.875, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8921875059604645, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 10.439062595367432, "epoch": 0.23383923552557617, "kl": 1.6180415451526642, "learning_rate": 9.976672287802136e-07, "loss": 0.022075064480304718, "ratio/all_0": 0.015625, "ratio/all_2": 0.625, "reward": 1.9078125357627869, "reward_std": 0.7796428948640823, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9078125208616257, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 10.46875, "epoch": 0.23440134907251264, "kl": 1.6099268794059753, "learning_rate": 9.97661607644744e-07, "loss": 0.03135301545262337, "ratio/all_0": 0.046875, "ratio/all_2": 0.703125, "reward": 1.8953125178813934, "reward_std": 0.8648688793182373, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8953125029802322, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 10.514062643051147, "epoch": 0.23496346261944911, "kl": 1.589457243680954, "learning_rate": 9.976559865092747e-07, "loss": 0.007210549898445606, "ratio/all_0": 0.015625, "ratio/all_2": 0.5625, "reward": 1.8828125298023224, "reward_std": 0.7537620812654495, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8828125149011612, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 10.415624856948853, "epoch": 0.23552557616638561, "kl": 1.6951603591442108, "learning_rate": 9.976503653738054e-07, "loss": 0.03695042431354523, "ratio/all_0": 0.046875, "ratio/all_2": 0.609375, "reward": 1.9000000357627869, "reward_std": 0.7999933958053589, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9000000059604645, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 10.485937595367432, "epoch": 0.23608768971332209, "kl": 1.585318922996521, "learning_rate": 9.97644744238336e-07, "loss": 0.026742789894342422, "ratio/all_0": 0.03125, "ratio/all_2": 0.640625, "reward": 1.9031250178813934, "reward_std": 0.8052344620227814, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.875, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9031250178813934, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 10.468750238418579, "epoch": 0.23664980326025858, "kl": 1.5903295278549194, "learning_rate": 9.976391231028668e-07, "loss": 0.015053427778184414, "ratio/all_0": 0.03125, "ratio/all_2": 0.609375, "reward": 1.8875000476837158, "reward_std": 0.7833255380392075, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.875, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8875000029802322, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 10.546875238418579, "epoch": 0.23721191680719506, "kl": 1.6383749842643738, "learning_rate": 9.976335019673975e-07, "loss": -0.012720446102321148, "ratio/all_0": 0.0625, "ratio/all_2": 0.515625, "reward": 1.810937523841858, "reward_std": 0.7563245743513107, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.765625, "rewards/avg_2": 1.765625, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.703125, "rewards/avg_6": 1.765625, "rewards/avg_7": 1.734375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8109375089406967, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 10.335937738418579, "epoch": 0.23777403035413153, "kl": 1.6610612869262695, "learning_rate": 9.976278808319281e-07, "loss": 0.023902423679828644, "ratio/all_0": 0.046875, "ratio/all_2": 0.640625, "reward": 1.8828125, "reward_std": 0.8249382674694061, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8828125149011612, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 10.459375143051147, "epoch": 0.23833614390106803, "kl": 1.5190238654613495, "learning_rate": 9.976222596964586e-07, "loss": 0.028971953317523003, "ratio/all_0": 0.078125, "ratio/all_2": 0.640625, "reward": 1.873437523841858, "reward_std": 0.8383654952049255, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8734375089406967, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 10.475000381469727, "epoch": 0.2388982574480045, "kl": 1.6028160750865936, "learning_rate": 9.976166385609893e-07, "loss": 0.009334921836853027, "ratio/all_0": 0.0625, "ratio/all_2": 0.5625, "reward": 1.846875011920929, "reward_std": 0.7874211072921753, "rewards/avg_0": 1.765625, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.765625, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.846875011920929, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 10.36093783378601, "epoch": 0.23946037099494097, "kl": 1.523706704378128, "learning_rate": 9.9761101742552e-07, "loss": -0.0016508856788277626, "ratio/all_0": 0.015625, "ratio/all_2": 0.703125, "reward": 1.8828125298023224, "reward_std": 0.8340931683778763, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8828125149011612, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 10.381250381469727, "epoch": 0.24002248454187747, "kl": 1.5795050263404846, "learning_rate": 9.976053962900506e-07, "loss": 0.010042035952210426, "ratio/all_0": 0.0, "ratio/all_2": 0.609375, "reward": 1.9000000357627869, "reward_std": 0.7759362161159515, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9000000059604645, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 10.395312786102295, "epoch": 0.24058459808881394, "kl": 1.561740130186081, "learning_rate": 9.97599775154581e-07, "loss": 0.010532597079873085, "ratio/all_0": 0.03125, "ratio/all_2": 0.6875, "reward": 1.878125011920929, "reward_std": 0.8440195620059967, "rewards/avg_0": 1.875, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.878125011920929, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 10.446875095367432, "epoch": 0.2411467116357504, "kl": 1.5893949568271637, "learning_rate": 9.975941540191118e-07, "loss": 0.018870336934924126, "ratio/all_0": 0.046875, "ratio/all_2": 0.625, "reward": 1.873437523841858, "reward_std": 0.8117434531450272, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8734375089406967, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 10.371875286102295, "epoch": 0.2417088251826869, "kl": 1.6396957039833069, "learning_rate": 9.975885328836424e-07, "loss": 0.015891946852207184, "ratio/all_0": 0.03125, "ratio/all_2": 0.546875, "reward": 1.8750000298023224, "reward_std": 0.7589618265628815, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8765625059604645, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 10.46250033378601, "epoch": 0.24227093872962338, "kl": 1.5293635129928589, "learning_rate": 9.975829117481731e-07, "loss": 0.02730741538107395, "ratio/all_0": 0.0625, "ratio/all_2": 0.625, "reward": 1.8796875476837158, "reward_std": 0.8261153250932693, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8796875178813934, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 10.426562786102295, "epoch": 0.24283305227655985, "kl": 1.5679107904434204, "learning_rate": 9.975772906127036e-07, "loss": 0.027399998158216476, "ratio/all_0": 0.015625, "ratio/all_2": 0.65625, "reward": 1.9109375178813934, "reward_std": 0.8114770352840424, "rewards/avg_0": 1.875, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.9375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9109375178813934, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 10.462500095367432, "epoch": 0.24339516582349635, "kl": 1.5263629853725433, "learning_rate": 9.975716694772343e-07, "loss": 0.028345463797450066, "ratio/all_0": 0.03125, "ratio/all_2": 0.609375, "reward": 1.9078125059604645, "reward_std": 0.781880795955658, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.875, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.875, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.875, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9078125059604645, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 10.364062547683716, "epoch": 0.24395727937043282, "kl": 1.6106272339820862, "learning_rate": 9.97566048341765e-07, "loss": 0.027817612513899803, "ratio/all_0": 0.046875, "ratio/all_2": 0.625, "reward": 1.8875000178813934, "reward_std": 0.8168696016073227, "rewards/avg_0": 1.875, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8875000178813934, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 10.362500429153442, "epoch": 0.24451939291736932, "kl": 1.698000580072403, "learning_rate": 9.975604272062956e-07, "loss": 0.025879831984639168, "ratio/all_0": 0.03125, "ratio/all_2": 0.578125, "reward": 1.885937511920929, "reward_std": 0.7702215611934662, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.885937511920929, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 10.412500143051147, "epoch": 0.2450815064643058, "kl": 1.6197902262210846, "learning_rate": 9.975548060708263e-07, "loss": 0.02497243694961071, "ratio/all_0": 0.0625, "ratio/all_2": 0.625, "reward": 1.870312511920929, "reward_std": 0.8267063200473785, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.875, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.870312511920929, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 10.443750381469727, "epoch": 0.24564362001124226, "kl": 1.4857002794742584, "learning_rate": 9.97549184935357e-07, "loss": 0.011337099596858025, "ratio/all_0": 0.015625, "ratio/all_2": 0.6875, "reward": 1.9015625417232513, "reward_std": 0.8217532634735107, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.875, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.901562511920929, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 10.474999904632568, "epoch": 0.24620573355817876, "kl": 1.6131562292575836, "learning_rate": 9.975435637998877e-07, "loss": 0.023611659184098244, "ratio/all_0": 0.015625, "ratio/all_2": 0.59375, "reward": 1.9078125059604645, "reward_std": 0.7654212862253189, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9078125208616257, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 10.414062738418579, "epoch": 0.24676784710511523, "kl": 1.5450926423072815, "learning_rate": 9.975379426644181e-07, "loss": 0.006920941174030304, "ratio/all_0": 0.015625, "ratio/all_2": 0.5625, "reward": 1.881250023841858, "reward_std": 0.7529083490371704, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8812500089406967, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 10.464062690734863, "epoch": 0.2473299606520517, "kl": 1.5010447204113007, "learning_rate": 9.975323215289488e-07, "loss": 0.03678865730762482, "ratio/all_0": 0.03125, "ratio/all_2": 0.71875, "reward": 1.925000011920929, "reward_std": 0.8583438247442245, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.9265625178813934, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 10.3125, "epoch": 0.2478920741989882, "kl": 1.600469321012497, "learning_rate": 9.975267003934795e-07, "loss": 0.02798604406416416, "ratio/all_0": 0.046875, "ratio/all_2": 0.671875, "reward": 1.889062523841858, "reward_std": 0.84578637778759, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8890625089406967, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 10.450000047683716, "epoch": 0.24845418774592468, "kl": 1.568362832069397, "learning_rate": 9.975210792580102e-07, "loss": 0.021247677505016327, "ratio/all_0": 0.03125, "ratio/all_2": 0.59375, "reward": 1.8921875357627869, "reward_std": 0.7840049564838409, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.875, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8921875059604645, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 10.414062738418579, "epoch": 0.24901630129286115, "kl": 1.5653727054595947, "learning_rate": 9.975154581225406e-07, "loss": 0.00297177373431623, "ratio/all_0": 0.09375, "ratio/all_2": 0.546875, "reward": 1.8359375298023224, "reward_std": 0.7760512083768845, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.734375, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.78125, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8375000208616257, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 10.399999856948853, "epoch": 0.24957841483979765, "kl": 1.6329529285430908, "learning_rate": 9.975098369870713e-07, "loss": 0.023191925138235092, "ratio/all_0": 0.0, "ratio/all_2": 0.703125, "reward": 1.9124999940395355, "reward_std": 0.8387785255908966, "rewards/avg_0": 1.953125, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9125000089406967, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 10.535937786102295, "epoch": 0.2501405283867341, "kl": 1.5339020490646362, "learning_rate": 9.97504215851602e-07, "loss": 0.04132520407438278, "ratio/all_0": 0.0, "ratio/all_2": 0.75, "reward": 1.9562500417232513, "reward_std": 0.8514669239521027, "rewards/avg_0": 1.9375, "rewards/avg_1": 1.96875, "rewards/avg_2": 1.9375, "rewards/avg_3": 1.9375, "rewards/avg_4": 1.953125, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.953125, "rewards/avg_7": 1.953125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.956250011920929, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 10.375000238418579, "epoch": 0.2507026419336706, "kl": 1.5444872379302979, "learning_rate": 9.974985947161326e-07, "loss": 0.032432883977890015, "ratio/all_0": 0.03125, "ratio/all_2": 0.71875, "reward": 1.9140625596046448, "reward_std": 0.861393392086029, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.9375, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9140625149011612, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 10.546875238418579, "epoch": 0.25126475548060706, "kl": 1.5955162942409515, "learning_rate": 9.974929735806631e-07, "loss": 0.028477732092142105, "ratio/all_0": 0.046875, "ratio/all_2": 0.671875, "reward": 1.8906250298023224, "reward_std": 0.8443440943956375, "rewards/avg_0": 1.875, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8906250149011612, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 10.315625190734863, "epoch": 0.2518268690275436, "kl": 1.578765332698822, "learning_rate": 9.974873524451938e-07, "loss": 0.019804082810878754, "ratio/all_0": 0.015625, "ratio/all_2": 0.703125, "reward": 1.9078125357627869, "reward_std": 0.8340931385755539, "rewards/avg_0": 1.875, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9078125059604645, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 10.485937356948853, "epoch": 0.25238898257448006, "kl": 1.5700731873512268, "learning_rate": 9.974817313097245e-07, "loss": 0.030755460262298584, "ratio/all_0": 0.03125, "ratio/all_2": 0.734375, "reward": 1.9171875417232513, "reward_std": 0.8595938086509705, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.917187511920929, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 10.506249904632568, "epoch": 0.25295109612141653, "kl": 1.6172282099723816, "learning_rate": 9.974761101742551e-07, "loss": 0.025174066424369812, "ratio/all_0": 0.03125, "ratio/all_2": 0.625, "reward": 1.8968749940395355, "reward_std": 0.8019672930240631, "rewards/avg_0": 1.875, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.875, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8968750089406967, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 10.470312595367432, "epoch": 0.253513209668353, "kl": 1.5827001631259918, "learning_rate": 9.974704890387858e-07, "loss": 0.027356263250112534, "ratio/all_0": 0.046875, "ratio/all_2": 0.6875, "reward": 1.8984375298023224, "reward_std": 0.842809721827507, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8984375, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 10.407812595367432, "epoch": 0.2540753232152895, "kl": 1.5556872487068176, "learning_rate": 9.974648679033165e-07, "loss": 0.028104005381464958, "ratio/all_0": 0.03125, "ratio/all_2": 0.625, "reward": 1.9062500298023224, "reward_std": 0.8070946335792542, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.90625, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 10.442187786102295, "epoch": 0.25463743676222594, "kl": 1.5716063380241394, "learning_rate": 9.974592467678472e-07, "loss": 0.02172784134745598, "ratio/all_0": 0.046875, "ratio/all_2": 0.640625, "reward": 1.8796875178813934, "reward_std": 0.8272948265075684, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8796875178813934, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 10.418750286102295, "epoch": 0.25519955030916247, "kl": 1.5538042485713959, "learning_rate": 9.974536256323776e-07, "loss": 0.0027528153732419014, "ratio/all_0": 0.046875, "ratio/all_2": 0.578125, "reward": 1.8515625298023224, "reward_std": 0.7925472110509872, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8515625149011612, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 10.464062929153442, "epoch": 0.25576166385609894, "kl": 1.5341682434082031, "learning_rate": 9.974480044969083e-07, "loss": 0.027387108653783798, "ratio/all_0": 0.0, "ratio/all_2": 0.734375, "reward": 1.9328125417232513, "reward_std": 0.8470215350389481, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.9375, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.953125, "rewards/avg_5": 1.9375, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.932812511920929, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 10.554687738418579, "epoch": 0.2563237774030354, "kl": 1.5187903046607971, "learning_rate": 9.97442383361439e-07, "loss": 0.00495085958391428, "ratio/all_0": 0.015625, "ratio/all_2": 0.578125, "reward": 1.878125011920929, "reward_std": 0.7772054225206375, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8781250268220901, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 10.518750190734863, "epoch": 0.2568858909499719, "kl": 1.5951554477214813, "learning_rate": 9.974367622259697e-07, "loss": 0.007676597684621811, "ratio/all_0": 0.0, "ratio/all_2": 0.640625, "reward": 1.8953125178813934, "reward_std": 0.7952839285135269, "rewards/avg_0": 1.875, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8953125178813934, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 10.503125190734863, "epoch": 0.25744800449690836, "kl": 1.6238690912723541, "learning_rate": 9.974311410905001e-07, "loss": 0.031223241239786148, "ratio/all_0": 0.03125, "ratio/all_2": 0.6875, "reward": 1.90625, "reward_std": 0.8409267216920853, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.875, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9062500298023224, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 10.607812881469727, "epoch": 0.2580101180438449, "kl": 1.5360244810581207, "learning_rate": 9.974255199550308e-07, "loss": 0.009246742352843285, "ratio/all_0": 0.0, "ratio/all_2": 0.640625, "reward": 1.896875023841858, "reward_std": 0.8043554276227951, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8968750089406967, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 10.387500047683716, "epoch": 0.25857223159078135, "kl": 1.5769808888435364, "learning_rate": 9.974198988195615e-07, "loss": 0.008776286616921425, "ratio/all_0": 0.046875, "ratio/all_2": 0.625, "reward": 1.862500011920929, "reward_std": 0.8146316558122635, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.78125, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8625000268220901, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 10.512500047683716, "epoch": 0.2591343451377178, "kl": 1.588375449180603, "learning_rate": 9.974142776840922e-07, "loss": 0.023817773908376694, "ratio/all_0": 0.046875, "ratio/all_2": 0.625, "reward": 1.8843750357627869, "reward_std": 0.8069419264793396, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8843750059604645, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 10.506250143051147, "epoch": 0.2596964586846543, "kl": 1.561067909002304, "learning_rate": 9.974086565486228e-07, "loss": 0.014013110660016537, "ratio/all_0": 0.046875, "ratio/all_2": 0.578125, "reward": 1.8671875298023224, "reward_std": 0.7865673452615738, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.875, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8671875, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 10.548437595367432, "epoch": 0.26025857223159077, "kl": 1.557085394859314, "learning_rate": 9.974030354131533e-07, "loss": 0.031534142792224884, "ratio/all_0": 0.015625, "ratio/all_2": 0.78125, "reward": 1.9296875298023224, "reward_std": 0.8877490013837814, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.953125, "rewards/avg_3": 1.875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.9375, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9296875149011612, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 10.304687738418579, "epoch": 0.26082068577852724, "kl": 1.5971083343029022, "learning_rate": 9.97397414277684e-07, "loss": 0.015254084020853043, "ratio/all_0": 0.0625, "ratio/all_2": 0.625, "reward": 1.8546875417232513, "reward_std": 0.8286777883768082, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.854687511920929, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 10.368750095367432, "epoch": 0.26138279932546377, "kl": 1.6045811474323273, "learning_rate": 9.973917931422147e-07, "loss": 0.022352047264575958, "ratio/all_0": 0.015625, "ratio/all_2": 0.625, "reward": 1.9046875536441803, "reward_std": 0.7893041223287582, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.9375, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9046874940395355, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 10.520312547683716, "epoch": 0.26194491287240024, "kl": 1.5685151815414429, "learning_rate": 9.973861720067453e-07, "loss": 0.006927984766662121, "ratio/all_0": 0.015625, "ratio/all_2": 0.578125, "reward": 1.875, "reward_std": 0.7674835920333862, "rewards/avg_0": 1.875, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.875, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8750000149011612, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 10.440625190734863, "epoch": 0.2625070264193367, "kl": 1.6096564829349518, "learning_rate": 9.97380550871276e-07, "loss": 0.0011199398431926966, "ratio/all_0": 0.0, "ratio/all_2": 0.640625, "reward": 1.8843750357627869, "reward_std": 0.7997609376907349, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8843750059604645, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 10.407812595367432, "epoch": 0.2630691399662732, "kl": 1.6002353727817535, "learning_rate": 9.973749297358067e-07, "loss": 0.0033882353454828262, "ratio/all_0": 0.046875, "ratio/all_2": 0.53125, "reward": 1.8484375178813934, "reward_std": 0.7542925029993057, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.78125, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8484375178813934, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 10.417187690734863, "epoch": 0.26363125351320965, "kl": 1.6339455544948578, "learning_rate": 9.973693086003372e-07, "loss": 0.02932353876531124, "ratio/all_0": 0.03125, "ratio/all_2": 0.625, "reward": 1.9000000059604645, "reward_std": 0.8019672483205795, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9000000208616257, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 10.435937881469727, "epoch": 0.2641933670601461, "kl": 1.6218569576740265, "learning_rate": 9.973636874648678e-07, "loss": 0.03466630354523659, "ratio/all_0": 0.015625, "ratio/all_2": 0.65625, "reward": 1.9265625178813934, "reward_std": 0.8086530715227127, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.953125, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9265625178813934, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 10.546875, "epoch": 0.26475548060708265, "kl": 1.5886335372924805, "learning_rate": 9.973580663293985e-07, "loss": 0.030788417905569077, "ratio/all_0": 0.0, "ratio/all_2": 0.625, "reward": 1.9343750178813934, "reward_std": 0.7689536660909653, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.9375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.9375, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.9375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9343750029802322, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 10.364062786102295, "epoch": 0.2653175941540191, "kl": 1.619934231042862, "learning_rate": 9.973524451939292e-07, "loss": 0.04304520785808563, "ratio/all_0": 0.0625, "ratio/all_2": 0.703125, "reward": 1.9062500298023224, "reward_std": 0.8607139140367508, "rewards/avg_0": 1.875, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.875, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.90625, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 10.498437881469727, "epoch": 0.2658797077009556, "kl": 1.579685002565384, "learning_rate": 9.973468240584597e-07, "loss": 0.02157140150666237, "ratio/all_0": 0.03125, "ratio/all_2": 0.640625, "reward": 1.8906250298023224, "reward_std": 0.8131882101297379, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.9375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8906250149011612, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 10.5859375, "epoch": 0.26644182124789206, "kl": 1.4751878082752228, "learning_rate": 9.973412029229903e-07, "loss": 0.02198803797364235, "ratio/all_0": 0.046875, "ratio/all_2": 0.609375, "reward": 1.8859375417232513, "reward_std": 0.8028210699558258, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.885937511920929, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 10.456249952316284, "epoch": 0.26700393479482853, "kl": 1.5577751994132996, "learning_rate": 9.97335581787521e-07, "loss": 0.018937459215521812, "ratio/all_0": 0.03125, "ratio/all_2": 0.6875, "reward": 1.8968750536441803, "reward_std": 0.8412512391805649, "rewards/avg_0": 1.9375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8968750089406967, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 10.490625143051147, "epoch": 0.26756604834176506, "kl": 1.5770387947559357, "learning_rate": 9.973299606520517e-07, "loss": 0.02208469621837139, "ratio/all_0": 0.078125, "ratio/all_2": 0.578125, "reward": 1.856249988079071, "reward_std": 0.8088020831346512, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8562500327825546, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 10.36875033378601, "epoch": 0.26812816188870153, "kl": 1.6067821681499481, "learning_rate": 9.973243395165824e-07, "loss": 0.0007646908052265644, "ratio/all_0": 0.015625, "ratio/all_2": 0.578125, "reward": 1.870312511920929, "reward_std": 0.7674848735332489, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8718750178813934, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 10.517187595367432, "epoch": 0.268690275435638, "kl": 1.5556055009365082, "learning_rate": 9.973187183811128e-07, "loss": 0.019349299371242523, "ratio/all_0": 0.03125, "ratio/all_2": 0.609375, "reward": 1.890625, "reward_std": 0.794694185256958, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8906250149011612, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 10.41562533378601, "epoch": 0.2692523889825745, "kl": 1.5935364365577698, "learning_rate": 9.973130972456435e-07, "loss": 0.023987704887986183, "ratio/all_0": 0.015625, "ratio/all_2": 0.625, "reward": 1.9109375178813934, "reward_std": 0.7916025668382645, "rewards/avg_0": 1.875, "rewards/avg_1": 1.875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9109375029802322, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 10.3984375, "epoch": 0.26981450252951095, "kl": 1.6000608801841736, "learning_rate": 9.973074761101742e-07, "loss": 0.0358889140188694, "ratio/all_0": 0.015625, "ratio/all_2": 0.625, "reward": 1.9296875298023224, "reward_std": 0.7762255221605301, "rewards/avg_0": 1.875, "rewards/avg_1": 1.9375, "rewards/avg_2": 1.953125, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9296875149011612, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 10.443750381469727, "epoch": 0.2703766160764474, "kl": 1.5882815420627594, "learning_rate": 9.973018549747049e-07, "loss": 0.03253752738237381, "ratio/all_0": 0.0, "ratio/all_2": 0.671875, "reward": 1.9328125417232513, "reward_std": 0.8097695410251617, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.953125, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.9375, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.932812511920929, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 10.576562643051147, "epoch": 0.27093872962338394, "kl": 1.5390391051769257, "learning_rate": 9.972962338392355e-07, "loss": 0.030903009697794914, "ratio/all_0": 0.046875, "ratio/all_2": 0.65625, "reward": 1.9031250178813934, "reward_std": 0.824316993355751, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9031250178813934, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 10.390625476837158, "epoch": 0.2715008431703204, "kl": 1.5586816370487213, "learning_rate": 9.972906127037662e-07, "loss": 0.012824834324419498, "ratio/all_0": 0.0625, "ratio/all_2": 0.5625, "reward": 1.8546875417232513, "reward_std": 0.785448431968689, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.854687511920929, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 10.442187786102295, "epoch": 0.2720629567172569, "kl": 1.5961687564849854, "learning_rate": 9.972849915682967e-07, "loss": 0.021679479628801346, "ratio/all_0": 0.015625, "ratio/all_2": 0.609375, "reward": 1.8984375, "reward_std": 0.78426893055439, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8984375298023224, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 10.471875190734863, "epoch": 0.27262507026419336, "kl": 1.5359916687011719, "learning_rate": 9.972793704328274e-07, "loss": 0.02804931253194809, "ratio/all_0": 0.0625, "ratio/all_2": 0.640625, "reward": 1.8828125298023224, "reward_std": 0.829443022608757, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8828125149011612, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 10.435937881469727, "epoch": 0.27318718381112983, "kl": 1.5890219509601593, "learning_rate": 9.97273749297358e-07, "loss": 0.016086315736174583, "ratio/all_0": 0.03125, "ratio/all_2": 0.5625, "reward": 1.881250023841858, "reward_std": 0.7686631232500076, "rewards/avg_0": 1.9375, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.734375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8812500089406967, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 10.521875143051147, "epoch": 0.27374929735806636, "kl": 1.567686676979065, "learning_rate": 9.972681281618887e-07, "loss": 0.02141323685646057, "ratio/all_0": 0.0, "ratio/all_2": 0.671875, "reward": 1.921875, "reward_std": 0.8018170297145844, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.9234375208616257, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 10.406250238418579, "epoch": 0.27431141090500283, "kl": 1.607559710741043, "learning_rate": 9.972625070264194e-07, "loss": 0.026632366701960564, "ratio/all_0": 0.015625, "ratio/all_2": 0.6875, "reward": 1.9093750417232513, "reward_std": 0.8379234969615936, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.909375011920929, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 10.351562738418579, "epoch": 0.2748735244519393, "kl": 1.5950010418891907, "learning_rate": 9.972568858909499e-07, "loss": 0.03235364705324173, "ratio/all_0": 0.03125, "ratio/all_2": 0.703125, "reward": 1.9078125059604645, "reward_std": 0.8532653301954269, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9078125059604645, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 10.367187738418579, "epoch": 0.27543563799887577, "kl": 1.6210860908031464, "learning_rate": 9.972512647554805e-07, "loss": 0.016208132728934288, "ratio/all_0": 0.03125, "ratio/all_2": 0.578125, "reward": 1.8765625357627869, "reward_std": 0.7815902531147003, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8765624910593033, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 10.451562643051147, "epoch": 0.27599775154581224, "kl": 1.5678941011428833, "learning_rate": 9.972456436200112e-07, "loss": 0.009668991900980473, "ratio/all_0": 0.015625, "ratio/all_2": 0.65625, "reward": 1.8875000178813934, "reward_std": 0.8114782571792603, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.875, "rewards/avg_5": 1.9375, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8875000029802322, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 10.390625238418579, "epoch": 0.2765598650927487, "kl": 1.5274541079998016, "learning_rate": 9.972400224845419e-07, "loss": 0.030864091590046883, "ratio/all_0": 0.03125, "ratio/all_2": 0.6875, "reward": 1.912500023841858, "reward_std": 0.8409267067909241, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.953125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9125000089406967, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 10.442187786102295, "epoch": 0.27712197863968524, "kl": 1.6061663329601288, "learning_rate": 9.972344013490724e-07, "loss": 0.017122022807598114, "ratio/all_0": 0.046875, "ratio/all_2": 0.640625, "reward": 1.8718750178813934, "reward_std": 0.8296681493520737, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8734375238418579, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 10.246875047683716, "epoch": 0.2776840921866217, "kl": 1.5769842863082886, "learning_rate": 9.97228780213603e-07, "loss": 0.00853213481605053, "ratio/all_0": 0.03125, "ratio/all_2": 0.609375, "reward": 1.8718750476837158, "reward_std": 0.7921317666769028, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8718750029802322, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 10.370312929153442, "epoch": 0.2782462057335582, "kl": 1.6459482312202454, "learning_rate": 9.972231590781337e-07, "loss": 0.034749262034893036, "ratio/all_0": 0.015625, "ratio/all_2": 0.6875, "reward": 1.928125023841858, "reward_std": 0.8268794566392899, "rewards/avg_0": 1.9375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.9375, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9281250089406967, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 10.273437738418579, "epoch": 0.27880831928049465, "kl": 1.5733148455619812, "learning_rate": 9.972175379426644e-07, "loss": 0.009545298293232918, "ratio/all_0": 0.015625, "ratio/all_2": 0.65625, "reward": 1.8875000178813934, "reward_std": 0.8128636181354523, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8875000178813934, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 10.364062786102295, "epoch": 0.2793704328274311, "kl": 1.6750578880310059, "learning_rate": 9.97211916807195e-07, "loss": 0.035452038049697876, "ratio/all_0": 0.015625, "ratio/all_2": 0.71875, "reward": 1.9234375357627869, "reward_std": 0.8499085307121277, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.875, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.953125, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.9375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9234375059604645, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 10.317187547683716, "epoch": 0.2799325463743676, "kl": 1.5531387031078339, "learning_rate": 9.972062956717257e-07, "loss": 0.03181811794638634, "ratio/all_0": 0.015625, "ratio/all_2": 0.75, "reward": 1.9265625178813934, "reward_std": 0.86952143907547, "rewards/avg_0": 1.9375, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.953125, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.9375, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9265625178813934, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 10.400000095367432, "epoch": 0.2804946599213041, "kl": 1.5031760036945343, "learning_rate": 9.972006745362562e-07, "loss": 0.0034638706129044294, "ratio/all_0": 0.0, "ratio/all_2": 0.625, "reward": 1.8937500417232513, "reward_std": 0.7857717573642731, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.893750011920929, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 10.295312643051147, "epoch": 0.2810567734682406, "kl": 1.576939582824707, "learning_rate": 9.971950534007869e-07, "loss": 0.034424349665641785, "ratio/all_0": 0.03125, "ratio/all_2": 0.609375, "reward": 1.9125000536441803, "reward_std": 0.7881839871406555, "rewards/avg_0": 1.9375, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9125000089406967, "step": 500 }, { "clip_ratio": 0.0, "completion_length": 10.325000286102295, "epoch": 0.28161888701517707, "kl": 1.6134376525878906, "learning_rate": 9.971894322653176e-07, "loss": 0.01385501679033041, "ratio/all_0": 0.0, "ratio/all_2": 0.625, "reward": 1.8984375298023224, "reward_std": 0.78426893055439, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8984375, "step": 501 }, { "clip_ratio": 0.0, "completion_length": 10.582812309265137, "epoch": 0.28218100056211354, "kl": 1.587764173746109, "learning_rate": 9.971838111298482e-07, "loss": 0.01922702230513096, "ratio/all_0": 0.015625, "ratio/all_2": 0.609375, "reward": 1.9000000357627869, "reward_std": 0.783739760518074, "rewards/avg_0": 1.875, "rewards/avg_1": 1.875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9000000059604645, "step": 502 }, { "clip_ratio": 0.0, "completion_length": 10.487500190734863, "epoch": 0.28274311410905, "kl": 1.5082274377346039, "learning_rate": 9.97178189994379e-07, "loss": 0.009739737957715988, "ratio/all_0": 0.015625, "ratio/all_2": 0.5625, "reward": 1.890625, "reward_std": 0.753702774643898, "rewards/avg_0": 1.875, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8906250149011612, "step": 503 }, { "clip_ratio": 0.0, "completion_length": 10.554687738418579, "epoch": 0.28330522765598654, "kl": 1.5581303536891937, "learning_rate": 9.971725688589094e-07, "loss": 0.028458768501877785, "ratio/all_0": 0.015625, "ratio/all_2": 0.703125, "reward": 1.9218750298023224, "reward_std": 0.835271418094635, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.9375, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.921875, "step": 504 }, { "clip_ratio": 0.0, "completion_length": 10.362500429153442, "epoch": 0.283867341202923, "kl": 1.585391253232956, "learning_rate": 9.9716694772344e-07, "loss": 0.02503221668303013, "ratio/all_0": 0.0, "ratio/all_2": 0.734375, "reward": 1.9296875298023224, "reward_std": 0.8459025174379349, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.9375, "rewards/avg_3": 1.9375, "rewards/avg_4": 1.96875, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9296875149011612, "step": 505 }, { "clip_ratio": 0.0, "completion_length": 10.503124952316284, "epoch": 0.2844294547498595, "kl": 1.5686596930027008, "learning_rate": 9.971613265879707e-07, "loss": 0.027785908430814743, "ratio/all_0": 0.015625, "ratio/all_2": 0.75, "reward": 1.925000011920929, "reward_std": 0.8607139587402344, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9250000268220901, "step": 506 }, { "clip_ratio": 0.0, "completion_length": 10.531250238418579, "epoch": 0.28499156829679595, "kl": 1.5021850764751434, "learning_rate": 9.971557054525014e-07, "loss": 0.0024004708975553513, "ratio/all_0": 0.015625, "ratio/all_2": 0.59375, "reward": 1.8781250417232513, "reward_std": 0.7744927406311035, "rewards/avg_0": 1.875, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8781249970197678, "step": 507 }, { "clip_ratio": 0.0, "completion_length": 10.490625143051147, "epoch": 0.2855536818437324, "kl": 1.5643205046653748, "learning_rate": 9.971500843170319e-07, "loss": 0.03464096784591675, "ratio/all_0": 0.03125, "ratio/all_2": 0.75, "reward": 1.921875, "reward_std": 0.8716684281826019, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.953125, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.921875, "step": 508 }, { "clip_ratio": 0.0, "completion_length": 10.617187738418579, "epoch": 0.2861157953906689, "kl": 1.5092883110046387, "learning_rate": 9.971444631815626e-07, "loss": 0.021037127822637558, "ratio/all_0": 0.03125, "ratio/all_2": 0.5625, "reward": 1.893750011920929, "reward_std": 0.7613003849983215, "rewards/avg_0": 1.875, "rewards/avg_1": 1.875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.893750011920929, "step": 509 }, { "clip_ratio": 0.0, "completion_length": 10.392187595367432, "epoch": 0.2866779089376054, "kl": 1.5385313928127289, "learning_rate": 9.971388420460932e-07, "loss": 0.023627443239092827, "ratio/all_0": 0.015625, "ratio/all_2": 0.71875, "reward": 1.9218750298023224, "reward_std": 0.837095096707344, "rewards/avg_0": 1.875, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.9234375059604645, "step": 510 }, { "clip_ratio": 0.0, "completion_length": 10.495312690734863, "epoch": 0.2872400224845419, "kl": 1.5302962362766266, "learning_rate": 9.97133220910624e-07, "loss": 0.0014047394506633282, "ratio/all_0": 0.0625, "ratio/all_2": 0.53125, "reward": 1.8359375298023224, "reward_std": 0.7675429880619049, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.765625, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.71875, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8359375, "step": 511 }, { "clip_ratio": 0.0, "completion_length": 10.453125238418579, "epoch": 0.28780213603147836, "kl": 1.5516058504581451, "learning_rate": 9.971275997751546e-07, "loss": 0.023453330621123314, "ratio/all_0": 0.09375, "ratio/all_2": 0.625, "reward": 1.8562500476837158, "reward_std": 0.8335639387369156, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8562500029802322, "step": 512 }, { "clip_ratio": 0.0, "completion_length": 10.467187881469727, "epoch": 0.28836424957841483, "kl": 1.5807332396507263, "learning_rate": 9.971219786396853e-07, "loss": 0.002808466088026762, "ratio/all_0": 0.0, "ratio/all_2": 0.515625, "reward": 1.885937511920929, "reward_std": 0.7218433618545532, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.9375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8859375268220901, "step": 513 }, { "clip_ratio": 0.0, "completion_length": 10.543750286102295, "epoch": 0.2889263631253513, "kl": 1.5733287930488586, "learning_rate": 9.97116357504216e-07, "loss": 0.02668760158121586, "ratio/all_0": 0.0625, "ratio/all_2": 0.625, "reward": 1.873437523841858, "reward_std": 0.8334064036607742, "rewards/avg_0": 1.875, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.875, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8750000298023224, "step": 514 }, { "clip_ratio": 0.0, "completion_length": 10.467187881469727, "epoch": 0.2894884766722878, "kl": 1.6143159568309784, "learning_rate": 9.971107363687464e-07, "loss": 0.025657769292593002, "ratio/all_0": 0.03125, "ratio/all_2": 0.671875, "reward": 1.8953125178813934, "reward_std": 0.8407426327466965, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9968750178813934, "rewards/point_reward": 0.8984375298023224, "step": 515 }, { "clip_ratio": 0.0, "completion_length": 10.417187929153442, "epoch": 0.2900505902192243, "kl": 1.5857263207435608, "learning_rate": 9.97105115233277e-07, "loss": 0.01410939171910286, "ratio/all_0": 0.03125, "ratio/all_2": 0.625, "reward": 1.878125011920929, "reward_std": 0.8141909092664719, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.878125011920929, "step": 516 }, { "clip_ratio": 0.0, "completion_length": 10.425000429153442, "epoch": 0.2906127037661608, "kl": 1.6421043276786804, "learning_rate": 9.970994940978078e-07, "loss": 0.019229963421821594, "ratio/all_0": 0.0, "ratio/all_2": 0.65625, "reward": 1.907812476158142, "reward_std": 0.8077450841665268, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.9093750268220901, "step": 517 }, { "clip_ratio": 0.0, "completion_length": 10.395312547683716, "epoch": 0.29117481731309725, "kl": 1.5684965550899506, "learning_rate": 9.970938729623384e-07, "loss": 0.025036733597517014, "ratio/all_0": 0.046875, "ratio/all_2": 0.59375, "reward": 1.8843750059604645, "reward_std": 0.7915432304143906, "rewards/avg_0": 1.875, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.875, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8843750208616257, "step": 518 }, { "clip_ratio": 0.0, "completion_length": 10.353125095367432, "epoch": 0.2917369308600337, "kl": 1.5684770345687866, "learning_rate": 9.97088251826869e-07, "loss": 0.02313578687608242, "ratio/all_0": 0.0, "ratio/all_2": 0.703125, "reward": 1.9265625178813934, "reward_std": 0.8191908597946167, "rewards/avg_0": 1.9375, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9265625178813934, "step": 519 }, { "clip_ratio": 0.0, "completion_length": 10.453125, "epoch": 0.2922990444069702, "kl": 1.5355479717254639, "learning_rate": 9.970826306913996e-07, "loss": 0.014254674315452576, "ratio/all_0": 0.03125, "ratio/all_2": 0.59375, "reward": 1.878125011920929, "reward_std": 0.7888064831495285, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.878125011920929, "step": 520 }, { "clip_ratio": 0.0, "completion_length": 10.4453125, "epoch": 0.2928611579539067, "kl": 1.6413032114505768, "learning_rate": 9.970770095559303e-07, "loss": 0.0015225689858198166, "ratio/all_0": 0.0625, "ratio/all_2": 0.609375, "reward": 1.8484375178813934, "reward_std": 0.8030874729156494, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.765625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8500000089406967, "step": 521 }, { "clip_ratio": 0.0, "completion_length": 10.562500238418579, "epoch": 0.2934232715008432, "kl": 1.550758719444275, "learning_rate": 9.97071388420461e-07, "loss": 0.002582995221018791, "ratio/all_0": 0.015625, "ratio/all_2": 0.609375, "reward": 1.8828125298023224, "reward_std": 0.7766397297382355, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8828125, "step": 522 }, { "clip_ratio": 0.0, "completion_length": 10.451562643051147, "epoch": 0.29398538504777966, "kl": 1.594634473323822, "learning_rate": 9.970657672849914e-07, "loss": 0.016852285712957382, "ratio/all_0": 0.03125, "ratio/all_2": 0.578125, "reward": 1.879687488079071, "reward_std": 0.77764493227005, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.875, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8796875327825546, "step": 523 }, { "clip_ratio": 0.0, "completion_length": 10.415625095367432, "epoch": 0.29454749859471613, "kl": 1.5855125486850739, "learning_rate": 9.97060146149522e-07, "loss": 0.011170834302902222, "ratio/all_0": 0.046875, "ratio/all_2": 0.53125, "reward": 1.8578124940395355, "reward_std": 0.7613887935876846, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8578125238418579, "step": 524 }, { "clip_ratio": 0.0, "completion_length": 10.465625286102295, "epoch": 0.2951096121416526, "kl": 1.5837196707725525, "learning_rate": 9.970545250140528e-07, "loss": 0.02334693819284439, "ratio/all_0": 0.078125, "ratio/all_2": 0.578125, "reward": 1.8593750298023224, "reward_std": 0.8002597838640213, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.734375, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8593750149011612, "step": 525 }, { "clip_ratio": 0.0, "completion_length": 10.393750429153442, "epoch": 0.29567172568858907, "kl": 1.5401901602745056, "learning_rate": 9.970489038785834e-07, "loss": 0.016585461795330048, "ratio/all_0": 0.0, "ratio/all_2": 0.65625, "reward": 1.909375011920929, "reward_std": 0.8099210113286972, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.909375011920929, "step": 526 }, { "clip_ratio": 0.0, "completion_length": 10.382812738418579, "epoch": 0.2962338392355256, "kl": 1.5949873328208923, "learning_rate": 9.970432827431141e-07, "loss": 0.025830864906311035, "ratio/all_0": 0.03125, "ratio/all_2": 0.671875, "reward": 1.896875023841858, "reward_std": 0.8379149883985519, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.875, "rewards/avg_4": 1.765625, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9968750178813934, "rewards/point_reward": 0.9000000059604645, "step": 527 }, { "clip_ratio": 0.0, "completion_length": 10.221875190734863, "epoch": 0.29679595278246207, "kl": 1.5679085850715637, "learning_rate": 9.970376616076448e-07, "loss": 0.04403117299079895, "ratio/all_0": 0.0, "ratio/all_2": 0.75, "reward": 1.959375023841858, "reward_std": 0.850878432393074, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.984375, "rewards/avg_2": 1.9375, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.953125, "rewards/avg_5": 1.9375, "rewards/avg_6": 1.984375, "rewards/avg_7": 1.953125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9593750089406967, "step": 528 }, { "clip_ratio": 0.0, "completion_length": 10.404687643051147, "epoch": 0.29735806632939854, "kl": 1.5711309611797333, "learning_rate": 9.970320404721755e-07, "loss": 0.03724472224712372, "ratio/all_0": 0.015625, "ratio/all_2": 0.734375, "reward": 1.9390625059604645, "reward_std": 0.8469318300485611, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.953125, "rewards/avg_5": 1.96875, "rewards/avg_6": 1.953125, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9390625208616257, "step": 529 }, { "clip_ratio": 0.0, "completion_length": 10.362500429153442, "epoch": 0.297920179876335, "kl": 1.5913471281528473, "learning_rate": 9.97026419336706e-07, "loss": 0.014495437033474445, "ratio/all_0": 0.015625, "ratio/all_2": 0.640625, "reward": 1.8828125298023224, "reward_std": 0.8245996832847595, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.953125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.995312511920929, "rewards/point_reward": 0.8875000029802322, "step": 530 }, { "clip_ratio": 0.0, "completion_length": 10.245312929153442, "epoch": 0.2984822934232715, "kl": 1.5916757583618164, "learning_rate": 9.970207982012366e-07, "loss": 0.04658622294664383, "ratio/all_0": 0.046875, "ratio/all_2": 0.75, "reward": 1.9250000417232513, "reward_std": 0.8800604343414307, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.9375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.925000011920929, "step": 531 }, { "clip_ratio": 0.0, "completion_length": 10.359375476837158, "epoch": 0.29904440697020795, "kl": 1.4983412623405457, "learning_rate": 9.970151770657673e-07, "loss": 0.023762550204992294, "ratio/all_0": 0.046875, "ratio/all_2": 0.65625, "reward": 1.890625, "reward_std": 0.8271434009075165, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8906250149011612, "step": 532 }, { "clip_ratio": 0.0, "completion_length": 10.406250238418579, "epoch": 0.2996065205171445, "kl": 1.6456592977046967, "learning_rate": 9.97009555930298e-07, "loss": 0.02334306575357914, "ratio/all_0": 0.03125, "ratio/all_2": 0.5, "reward": 1.8828125298023224, "reward_std": 0.7357584089040756, "rewards/avg_0": 1.875, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8843750059604645, "step": 533 }, { "clip_ratio": 0.0, "completion_length": 10.381250381469727, "epoch": 0.30016863406408095, "kl": 1.5607287883758545, "learning_rate": 9.970039347948284e-07, "loss": 0.0018885387107729912, "ratio/all_0": 0.03125, "ratio/all_2": 0.640625, "reward": 1.873437523841858, "reward_std": 0.8119989484548569, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.995312511920929, "rewards/point_reward": 0.878125011920929, "step": 534 }, { "clip_ratio": 0.0, "completion_length": 10.375, "epoch": 0.3007307476110174, "kl": 1.6651538908481598, "learning_rate": 9.96998313659359e-07, "loss": 0.02762552909553051, "ratio/all_0": 0.03125, "ratio/all_2": 0.625, "reward": 1.8953125476837158, "reward_std": 0.8060480654239655, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8968750089406967, "step": 535 }, { "clip_ratio": 0.0, "completion_length": 10.467187643051147, "epoch": 0.3012928611579539, "kl": 1.5861826539039612, "learning_rate": 9.969926925238898e-07, "loss": 0.011971995234489441, "ratio/all_0": 0.015625, "ratio/all_2": 0.59375, "reward": 1.878125011920929, "reward_std": 0.7851677536964417, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9968750178813934, "rewards/point_reward": 0.8812500089406967, "step": 536 }, { "clip_ratio": 0.0, "completion_length": 10.456250190734863, "epoch": 0.30185497470489037, "kl": 1.6371749639511108, "learning_rate": 9.969870713884205e-07, "loss": 0.023145480081439018, "ratio/all_0": 0.03125, "ratio/all_2": 0.65625, "reward": 1.885937511920929, "reward_std": 0.8331510424613953, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9937500059604645, "rewards/point_reward": 0.8921875208616257, "step": 537 }, { "clip_ratio": 0.0, "completion_length": 10.387500286102295, "epoch": 0.3024170882518269, "kl": 1.6015640199184418, "learning_rate": 9.969814502529511e-07, "loss": 0.009054386988282204, "ratio/all_0": 0.046875, "ratio/all_2": 0.6875, "reward": 1.8765625357627869, "reward_std": 0.8440788686275482, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.878125011920929, "step": 538 }, { "clip_ratio": 0.0, "completion_length": 10.440625190734863, "epoch": 0.30297920179876336, "kl": 1.582722783088684, "learning_rate": 9.969758291174816e-07, "loss": 0.015762681141495705, "ratio/all_0": 0.03125, "ratio/all_2": 0.578125, "reward": 1.8796875178813934, "reward_std": 0.7894563376903534, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9968750029802322, "rewards/point_reward": 0.8828125149011612, "step": 539 }, { "clip_ratio": 0.0, "completion_length": 10.434375286102295, "epoch": 0.30354131534569984, "kl": 1.558344304561615, "learning_rate": 9.969702079820123e-07, "loss": 0.011870039626955986, "ratio/all_0": 0.03125, "ratio/all_2": 0.484375, "reward": 1.8765625357627869, "reward_std": 0.7154543548822403, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.875, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.78125, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8781250268220901, "step": 540 }, { "clip_ratio": 0.0, "completion_length": 10.453125, "epoch": 0.3041034288926363, "kl": 1.5748683512210846, "learning_rate": 9.96964586846543e-07, "loss": 0.03740207105875015, "ratio/all_0": 0.015625, "ratio/all_2": 0.75, "reward": 1.9312500357627869, "reward_std": 0.8683542609214783, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.953125, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.9375, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.9375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.9328124970197678, "step": 541 }, { "clip_ratio": 0.0, "completion_length": 10.41562533378601, "epoch": 0.3046655424395728, "kl": 1.5355166792869568, "learning_rate": 9.969589657110736e-07, "loss": 0.04280543699860573, "ratio/all_0": 0.015625, "ratio/all_2": 0.765625, "reward": 1.9453125298023224, "reward_std": 0.8763377517461777, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.9375, "rewards/avg_3": 1.9375, "rewards/avg_4": 1.96875, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.9375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.9468750059604645, "step": 542 }, { "clip_ratio": 0.0, "completion_length": 10.500000476837158, "epoch": 0.30522765598650925, "kl": 1.543639749288559, "learning_rate": 9.969533445756043e-07, "loss": 0.015493150800466537, "ratio/all_0": 0.046875, "ratio/all_2": 0.5625, "reward": 1.8750000298023224, "reward_std": 0.7713418304920197, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8750000149011612, "step": 543 }, { "clip_ratio": 0.0, "completion_length": 10.548437595367432, "epoch": 0.3057897695334458, "kl": 1.5717388689517975, "learning_rate": 9.96947723440135e-07, "loss": 0.01888006553053856, "ratio/all_0": 0.015625, "ratio/all_2": 0.65625, "reward": 1.9000000357627869, "reward_std": 0.8184342980384827, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.875, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.901562511920929, "step": 544 }, { "clip_ratio": 0.0, "completion_length": 10.504687547683716, "epoch": 0.30635188308038225, "kl": 1.5470468699932098, "learning_rate": 9.969421023046654e-07, "loss": 0.020793110132217407, "ratio/all_0": 0.046875, "ratio/all_2": 0.578125, "reward": 1.878125011920929, "reward_std": 0.7924162298440933, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.875, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8796875178813934, "step": 545 }, { "clip_ratio": 0.0, "completion_length": 10.40000033378601, "epoch": 0.3069139966273187, "kl": 1.5460689961910248, "learning_rate": 9.969364811691961e-07, "loss": 0.03830792009830475, "ratio/all_0": 0.015625, "ratio/all_2": 0.71875, "reward": 1.9375000298023224, "reward_std": 0.8405124992132187, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.9375, "rewards/avg_2": 1.953125, "rewards/avg_3": 1.9375, "rewards/avg_4": 1.953125, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9375, "step": 546 }, { "clip_ratio": 0.0, "completion_length": 10.550000429153442, "epoch": 0.3074761101742552, "kl": 1.5856299698352814, "learning_rate": 9.969308600337268e-07, "loss": 0.01524635311216116, "ratio/all_0": 0.03125, "ratio/all_2": 0.65625, "reward": 1.885937511920929, "reward_std": 0.8167786598205566, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.875, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.885937511920929, "step": 547 }, { "clip_ratio": 0.0, "completion_length": 10.48593783378601, "epoch": 0.30803822372119166, "kl": 1.5868599712848663, "learning_rate": 9.969252388982575e-07, "loss": 0.029491964727640152, "ratio/all_0": 0.015625, "ratio/all_2": 0.6875, "reward": 1.9156250059604645, "reward_std": 0.839767649769783, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.953125, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.9171875268220901, "step": 548 }, { "clip_ratio": 0.0, "completion_length": 10.570312738418579, "epoch": 0.3086003372681282, "kl": 1.582980364561081, "learning_rate": 9.96919617762788e-07, "loss": 0.017688706517219543, "ratio/all_0": 0.015625, "ratio/all_2": 0.640625, "reward": 1.896875023841858, "reward_std": 0.8135938942432404, "rewards/avg_0": 1.875, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9968750178813934, "rewards/point_reward": 0.9000000208616257, "step": 549 }, { "clip_ratio": 0.0, "completion_length": 10.457812786102295, "epoch": 0.30916245081506466, "kl": 1.5583482682704926, "learning_rate": 9.969139966273186e-07, "loss": 0.009958229959011078, "ratio/all_0": 0.03125, "ratio/all_2": 0.6875, "reward": 1.896875023841858, "reward_std": 0.8217545002698898, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8984375149011612, "step": 550 }, { "clip_ratio": 0.0, "completion_length": 10.673437595367432, "epoch": 0.30972456436200113, "kl": 1.573817253112793, "learning_rate": 9.969083754918493e-07, "loss": 0.027634086087346077, "ratio/all_0": 0.0, "ratio/all_2": 0.671875, "reward": 1.9203125536441803, "reward_std": 0.8161921203136444, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.995312511920929, "rewards/point_reward": 0.925000011920929, "step": 551 }, { "clip_ratio": 0.0, "completion_length": 10.523437738418579, "epoch": 0.3102866779089376, "kl": 1.537708729505539, "learning_rate": 9.9690275435638e-07, "loss": 0.017526382580399513, "ratio/all_0": 0.03125, "ratio/all_2": 0.640625, "reward": 1.8875000178813934, "reward_std": 0.8181226849555969, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8890625238418579, "step": 552 }, { "clip_ratio": 0.0, "completion_length": 10.4609375, "epoch": 0.3108487914558741, "kl": 1.6289916336536407, "learning_rate": 9.968971332209107e-07, "loss": 0.01097562350332737, "ratio/all_0": 0.015625, "ratio/all_2": 0.59375, "reward": 1.8828125298023224, "reward_std": 0.7881049513816833, "rewards/avg_0": 1.875, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9937500208616257, "rewards/point_reward": 0.8890624940395355, "step": 553 }, { "clip_ratio": 0.0, "completion_length": 10.576562643051147, "epoch": 0.31141090500281055, "kl": 1.5423737466335297, "learning_rate": 9.968915120854411e-07, "loss": 0.021829504519701004, "ratio/all_0": 0.046875, "ratio/all_2": 0.59375, "reward": 1.8750000298023224, "reward_std": 0.7977400124073029, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9968750029802322, "rewards/point_reward": 0.878125011920929, "step": 554 }, { "clip_ratio": 0.0, "completion_length": 10.65781283378601, "epoch": 0.31197301854974707, "kl": 1.5545784831047058, "learning_rate": 9.968858909499718e-07, "loss": 0.030509337782859802, "ratio/all_0": 0.0, "ratio/all_2": 0.75, "reward": 1.9375, "reward_std": 0.8586998879909515, "rewards/avg_0": 1.953125, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.953125, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.9375, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.9375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.9390625059604645, "step": 555 }, { "clip_ratio": 0.0, "completion_length": 10.456249952316284, "epoch": 0.31253513209668354, "kl": 1.5957116782665253, "learning_rate": 9.968802698145025e-07, "loss": 0.023539943620562553, "ratio/all_0": 0.0, "ratio/all_2": 0.65625, "reward": 1.9171875417232513, "reward_std": 0.8016439527273178, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.917187511920929, "step": 556 }, { "clip_ratio": 0.0, "completion_length": 10.578125238418579, "epoch": 0.31309724564362, "kl": 1.5839785635471344, "learning_rate": 9.968746486790332e-07, "loss": 0.020525125786662102, "ratio/all_0": 0.046875, "ratio/all_2": 0.609375, "reward": 1.8859375417232513, "reward_std": 0.7854725420475006, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.885937511920929, "step": 557 }, { "clip_ratio": 0.0, "completion_length": 10.565625190734863, "epoch": 0.3136593591905565, "kl": 1.6725057363510132, "learning_rate": 9.968690275435636e-07, "loss": 0.01900116726756096, "ratio/all_0": 0.0625, "ratio/all_2": 0.625, "reward": 1.862500011920929, "reward_std": 0.8175742924213409, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.862500011920929, "step": 558 }, { "clip_ratio": 0.0, "completion_length": 10.484375238418579, "epoch": 0.31422147273749296, "kl": 1.5269731283187866, "learning_rate": 9.968634064080945e-07, "loss": 0.0200035460293293, "ratio/all_0": 0.0625, "ratio/all_2": 0.5625, "reward": 1.8640625178813934, "reward_std": 0.7826195657253265, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8640625029802322, "step": 559 }, { "clip_ratio": 0.0, "completion_length": 10.370312690734863, "epoch": 0.31478358628442943, "kl": 1.5648876428604126, "learning_rate": 9.96857785272625e-07, "loss": 0.024630874395370483, "ratio/all_0": 0.03125, "ratio/all_2": 0.75, "reward": 1.9093750417232513, "reward_std": 0.8682510405778885, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.909375011920929, "step": 560 }, { "clip_ratio": 0.0, "completion_length": 10.45468783378601, "epoch": 0.31534569983136596, "kl": 1.5348423719406128, "learning_rate": 9.968521641371556e-07, "loss": 0.01418773178011179, "ratio/all_0": 0.015625, "ratio/all_2": 0.640625, "reward": 1.896875023841858, "reward_std": 0.8042657375335693, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8968750089406967, "step": 561 }, { "clip_ratio": 0.0, "completion_length": 10.44687533378601, "epoch": 0.3159078133783024, "kl": 1.5903120338916779, "learning_rate": 9.968465430016863e-07, "loss": 0.01568039134144783, "ratio/all_0": 0.046875, "ratio/all_2": 0.609375, "reward": 1.8859374821186066, "reward_std": 0.7972578704357147, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8875000178813934, "step": 562 }, { "clip_ratio": 0.0, "completion_length": 10.389062881469727, "epoch": 0.3164699269252389, "kl": 1.5598368048667908, "learning_rate": 9.96840921866217e-07, "loss": 0.015399058349430561, "ratio/all_0": 0.0625, "ratio/all_2": 0.765625, "reward": 1.8984375, "reward_std": 0.8830222487449646, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9953125268220901, "rewards/point_reward": 0.9031250029802322, "step": 563 }, { "clip_ratio": 0.0, "completion_length": 10.385937452316284, "epoch": 0.31703204047217537, "kl": 1.5837746560573578, "learning_rate": 9.968353007307477e-07, "loss": 0.0344664491713047, "ratio/all_0": 0.046875, "ratio/all_2": 0.71875, "reward": 1.9031250178813934, "reward_std": 0.8720826357603073, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.875, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9031250029802322, "step": 564 }, { "clip_ratio": 0.0, "completion_length": 10.389062643051147, "epoch": 0.31759415401911184, "kl": 1.5606088936328888, "learning_rate": 9.968296795952781e-07, "loss": 0.01733417436480522, "ratio/all_0": 0.015625, "ratio/all_2": 0.5625, "reward": 1.8921875357627869, "reward_std": 0.7611248642206192, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8921875059604645, "step": 565 }, { "clip_ratio": 0.0, "completion_length": 10.4375, "epoch": 0.31815626756604837, "kl": 1.586754322052002, "learning_rate": 9.968240584598088e-07, "loss": 0.019815683364868164, "ratio/all_0": 0.046875, "ratio/all_2": 0.578125, "reward": 1.8765625059604645, "reward_std": 0.7880108803510666, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.765625, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8765625208616257, "step": 566 }, { "clip_ratio": 0.0, "completion_length": 10.346875190734863, "epoch": 0.31871838111298484, "kl": 1.628895491361618, "learning_rate": 9.968184373243395e-07, "loss": 0.0360635444521904, "ratio/all_0": 0.03125, "ratio/all_2": 0.703125, "reward": 1.9140625298023224, "reward_std": 0.8494412302970886, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.875, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.875, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.9156250208616257, "step": 567 }, { "clip_ratio": 0.0, "completion_length": 10.350000143051147, "epoch": 0.3192804946599213, "kl": 1.6824475228786469, "learning_rate": 9.968128161888702e-07, "loss": 0.028761250898241997, "ratio/all_0": 0.03125, "ratio/all_2": 0.640625, "reward": 1.8953125476837158, "reward_std": 0.8177232891321182, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8953125178813934, "step": 568 }, { "clip_ratio": 0.0, "completion_length": 10.35312533378601, "epoch": 0.3198426082068578, "kl": 1.5600067675113678, "learning_rate": 9.968071950534006e-07, "loss": 0.027938317507505417, "ratio/all_0": 0.015625, "ratio/all_2": 0.765625, "reward": 1.9187500178813934, "reward_std": 0.8827731013298035, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9187500178813934, "step": 569 }, { "clip_ratio": 0.0, "completion_length": 10.323437690734863, "epoch": 0.32040472175379425, "kl": 1.5906377136707306, "learning_rate": 9.968015739179313e-07, "loss": 0.0034601669758558273, "ratio/all_0": 0.015625, "ratio/all_2": 0.640625, "reward": 1.8765625059604645, "reward_std": 0.8050589561462402, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8765625208616257, "step": 570 }, { "clip_ratio": 0.0, "completion_length": 10.2578125, "epoch": 0.3209668353007307, "kl": 1.598751276731491, "learning_rate": 9.96795952782462e-07, "loss": 0.0312412828207016, "ratio/all_0": 0.015625, "ratio/all_2": 0.703125, "reward": 1.9187500476837158, "reward_std": 0.8491444438695908, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9187500178813934, "step": 571 }, { "clip_ratio": 0.0, "completion_length": 10.268750190734863, "epoch": 0.32152894884766725, "kl": 1.614193320274353, "learning_rate": 9.967903316469927e-07, "loss": 0.03290371596813202, "ratio/all_0": 0.0625, "ratio/all_2": 0.75, "reward": 1.9046874940395355, "reward_std": 0.881769135594368, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.9062500149011612, "step": 572 }, { "clip_ratio": 0.0, "completion_length": 10.412500143051147, "epoch": 0.3220910623946037, "kl": 1.5136121213436127, "learning_rate": 9.967847105115231e-07, "loss": 0.021257780492305756, "ratio/all_0": 0.015625, "ratio/all_2": 0.65625, "reward": 1.9078125059604645, "reward_std": 0.816282257437706, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9078125208616257, "step": 573 }, { "clip_ratio": 0.0, "completion_length": 10.325000286102295, "epoch": 0.3226531759415402, "kl": 1.5708222687244415, "learning_rate": 9.96779089376054e-07, "loss": 0.03907536342740059, "ratio/all_0": 0.03125, "ratio/all_2": 0.703125, "reward": 1.9250000417232513, "reward_std": 0.84419384598732, "rewards/avg_0": 1.875, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.953125, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.953125, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.9265625178813934, "step": 574 }, { "clip_ratio": 0.0, "completion_length": 10.3359375, "epoch": 0.32321528948847666, "kl": 1.5484612584114075, "learning_rate": 9.967734682405845e-07, "loss": 0.011735038831830025, "ratio/all_0": 0.046875, "ratio/all_2": 0.609375, "reward": 1.870312511920929, "reward_std": 0.8025570511817932, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.870312511920929, "step": 575 }, { "clip_ratio": 0.0, "completion_length": 10.39218783378601, "epoch": 0.32377740303541314, "kl": 1.5881067514419556, "learning_rate": 9.967678471051152e-07, "loss": 0.01938185654580593, "ratio/all_0": 0.03125, "ratio/all_2": 0.625, "reward": 1.890625, "reward_std": 0.7996688187122345, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8906250149011612, "step": 576 }, { "clip_ratio": 0.0, "completion_length": 10.467187881469727, "epoch": 0.3243395165823496, "kl": 1.5919342041015625, "learning_rate": 9.967622259696458e-07, "loss": 0.03571542724967003, "ratio/all_0": 0.0625, "ratio/all_2": 0.65625, "reward": 1.8921875059604645, "reward_std": 0.8346816897392273, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8921875208616257, "step": 577 }, { "clip_ratio": 0.0, "completion_length": 10.492187738418579, "epoch": 0.32490163012928613, "kl": 1.517667442560196, "learning_rate": 9.967566048341765e-07, "loss": 0.012189960107207298, "ratio/all_0": 0.015625, "ratio/all_2": 0.6875, "reward": 1.896875023841858, "reward_std": 0.8305607587099075, "rewards/avg_0": 1.875, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.9375, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.875, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8968750089406967, "step": 578 }, { "clip_ratio": 0.0, "completion_length": 10.50156283378601, "epoch": 0.3254637436762226, "kl": 1.6046421825885773, "learning_rate": 9.967509836987072e-07, "loss": 0.01817990466952324, "ratio/all_0": 0.0, "ratio/all_2": 0.625, "reward": 1.9124999940395355, "reward_std": 0.7751974761486053, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9125000238418579, "step": 579 }, { "clip_ratio": 0.0, "completion_length": 10.565624952316284, "epoch": 0.3260258572231591, "kl": 1.532376229763031, "learning_rate": 9.967453625632377e-07, "loss": 0.009712206199765205, "ratio/all_0": 0.03125, "ratio/all_2": 0.65625, "reward": 1.878125011920929, "reward_std": 0.825791984796524, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.875, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.878125011920929, "step": 580 }, { "clip_ratio": 0.0, "completion_length": 10.450000286102295, "epoch": 0.32658797077009555, "kl": 1.5661653578281403, "learning_rate": 9.967397414277683e-07, "loss": 0.021667201071977615, "ratio/all_0": 0.0, "ratio/all_2": 0.578125, "reward": 1.9140625298023224, "reward_std": 0.7548810392618179, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9140625149011612, "step": 581 }, { "clip_ratio": 0.0, "completion_length": 10.564062595367432, "epoch": 0.327150084317032, "kl": 1.586315929889679, "learning_rate": 9.96734120292299e-07, "loss": 0.01577918976545334, "ratio/all_0": 0.015625, "ratio/all_2": 0.6875, "reward": 1.8953125178813934, "reward_std": 0.8373362272977829, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8953125029802322, "step": 582 }, { "clip_ratio": 0.0, "completion_length": 10.425000190734863, "epoch": 0.32771219786396855, "kl": 1.5910922586917877, "learning_rate": 9.967284991568297e-07, "loss": 0.022063709795475006, "ratio/all_0": 0.015625, "ratio/all_2": 0.65625, "reward": 1.9109375178813934, "reward_std": 0.8032617419958115, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.9375, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9109375178813934, "step": 583 }, { "clip_ratio": 0.0, "completion_length": 10.426562547683716, "epoch": 0.328274311410905, "kl": 1.5398958623409271, "learning_rate": 9.967228780213602e-07, "loss": 0.010586312040686607, "ratio/all_0": 0.03125, "ratio/all_2": 0.65625, "reward": 1.884374976158142, "reward_std": 0.8187513500452042, "rewards/avg_0": 1.875, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.875, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8843750357627869, "step": 584 }, { "clip_ratio": 0.0, "completion_length": 10.479687690734863, "epoch": 0.3288364249578415, "kl": 1.6160912215709686, "learning_rate": 9.967172568858908e-07, "loss": 0.02243340015411377, "ratio/all_0": 0.03125, "ratio/all_2": 0.59375, "reward": 1.8937500417232513, "reward_std": 0.778084471821785, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.9375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.893750011920929, "step": 585 }, { "clip_ratio": 0.0, "completion_length": 10.428125143051147, "epoch": 0.32939853850477796, "kl": 1.5488245785236359, "learning_rate": 9.967116357504215e-07, "loss": 0.026065057143568993, "ratio/all_0": 0.0, "ratio/all_2": 0.671875, "reward": 1.9281249940395355, "reward_std": 0.8089158087968826, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.984375, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9281250238418579, "step": 586 }, { "clip_ratio": 0.0, "completion_length": 10.481250047683716, "epoch": 0.32996065205171443, "kl": 1.566488653421402, "learning_rate": 9.967060146149522e-07, "loss": 0.02423604018986225, "ratio/all_0": 0.046875, "ratio/all_2": 0.671875, "reward": 1.8921875059604645, "reward_std": 0.8361251354217529, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8921875208616257, "step": 587 }, { "clip_ratio": 0.0, "completion_length": 10.476562738418579, "epoch": 0.3305227655986509, "kl": 1.5282692313194275, "learning_rate": 9.967003934794827e-07, "loss": 0.024469148367643356, "ratio/all_0": 0.015625, "ratio/all_2": 0.625, "reward": 1.9125000536441803, "reward_std": 0.7933112531900406, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9125000089406967, "step": 588 }, { "clip_ratio": 0.0, "completion_length": 10.512500047683716, "epoch": 0.33108487914558743, "kl": 1.6268229484558105, "learning_rate": 9.966947723440135e-07, "loss": 0.028700783848762512, "ratio/all_0": 0.03125, "ratio/all_2": 0.671875, "reward": 1.904687523841858, "reward_std": 0.8226082473993301, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9046875089406967, "step": 589 }, { "clip_ratio": 0.0, "completion_length": 10.434375286102295, "epoch": 0.3316469926925239, "kl": 1.5528870820999146, "learning_rate": 9.966891512085442e-07, "loss": 0.031011313199996948, "ratio/all_0": 0.03125, "ratio/all_2": 0.703125, "reward": 1.910937488079071, "reward_std": 0.8530013412237167, "rewards/avg_0": 1.875, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.875, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9109375327825546, "step": 590 }, { "clip_ratio": 0.0, "completion_length": 10.526562690734863, "epoch": 0.33220910623946037, "kl": 1.521255999803543, "learning_rate": 9.966835300730747e-07, "loss": 0.013384701684117317, "ratio/all_0": 0.03125, "ratio/all_2": 0.59375, "reward": 1.8906250298023224, "reward_std": 0.7729595750570297, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8906250149011612, "step": 591 }, { "clip_ratio": 0.0, "completion_length": 10.656250238418579, "epoch": 0.33277121978639684, "kl": 1.5627491772174835, "learning_rate": 9.966779089376054e-07, "loss": 0.01156577654182911, "ratio/all_0": 0.03125, "ratio/all_2": 0.5625, "reward": 1.878125011920929, "reward_std": 0.7626833021640778, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.875, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8781250268220901, "step": 592 }, { "clip_ratio": 0.0, "completion_length": 10.473437547683716, "epoch": 0.3333333333333333, "kl": 1.6590400338172913, "learning_rate": 9.96672287802136e-07, "loss": 0.03726842254400253, "ratio/all_0": 0.046875, "ratio/all_2": 0.65625, "reward": 1.9000000059604645, "reward_std": 0.8307802230119705, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.901562511920929, "step": 593 }, { "clip_ratio": 0.0, "completion_length": 10.389062404632568, "epoch": 0.33389544688026984, "kl": 1.6278806328773499, "learning_rate": 9.966666666666667e-07, "loss": 0.010789453983306885, "ratio/all_0": 0.03125, "ratio/all_2": 0.609375, "reward": 1.875, "reward_std": 0.7950026988983154, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.8765625208616257, "step": 594 }, { "clip_ratio": 0.0, "completion_length": 10.534374952316284, "epoch": 0.3344575604272063, "kl": 1.603445678949356, "learning_rate": 9.966610455311972e-07, "loss": 0.02332155592739582, "ratio/all_0": 0.0625, "ratio/all_2": 0.671875, "reward": 1.8765625059604645, "reward_std": 0.84195476770401, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.875, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8765625059604645, "step": 595 }, { "clip_ratio": 0.0, "completion_length": 10.4765625, "epoch": 0.3350196739741428, "kl": 1.5592808425426483, "learning_rate": 9.966554243957279e-07, "loss": 0.02063816972076893, "ratio/all_0": 0.046875, "ratio/all_2": 0.625, "reward": 1.8781249821186066, "reward_std": 0.8197565823793411, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8781250268220901, "step": 596 }, { "clip_ratio": 0.0, "completion_length": 10.306249856948853, "epoch": 0.33558178752107926, "kl": 1.6125241816043854, "learning_rate": 9.966498032602585e-07, "loss": 0.020839476957917213, "ratio/all_0": 0.0, "ratio/all_2": 0.640625, "reward": 1.9109375178813934, "reward_std": 0.7973314970731735, "rewards/avg_0": 1.9375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375089406967, "rewards/point_reward": 0.9125000089406967, "step": 597 }, { "clip_ratio": 0.0, "completion_length": 10.382812738418579, "epoch": 0.3361439010680157, "kl": 1.6878159642219543, "learning_rate": 9.966441821247892e-07, "loss": 0.024872442707419395, "ratio/all_0": 0.015625, "ratio/all_2": 0.625, "reward": 1.8984375, "reward_std": 0.800084263086319, "rewards/avg_0": 1.875, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.875, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8984375149011612, "step": 598 }, { "clip_ratio": 0.0, "completion_length": 10.443750143051147, "epoch": 0.3367060146149522, "kl": 1.5578432381153107, "learning_rate": 9.966385609893197e-07, "loss": 0.022587155923247337, "ratio/all_0": 0.03125, "ratio/all_2": 0.65625, "reward": 1.8921875059604645, "reward_std": 0.829266294836998, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.875, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8921875208616257, "step": 599 }, { "clip_ratio": 0.0, "completion_length": 10.32187533378601, "epoch": 0.3372681281618887, "kl": 1.5858546495437622, "learning_rate": 9.966329398538504e-07, "loss": 0.01929290220141411, "ratio/all_0": 0.03125, "ratio/all_2": 0.65625, "reward": 1.8906250596046448, "reward_std": 0.8204612880945206, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.890625, "step": 600 } ], "logging_steps": 1.0, "max_steps": 177900, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }