{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.44969083754918493, "eval_steps": 500, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 10.445312976837158, "epoch": 0.0011242270938729624, "kl": 0.0, "learning_rate": 1e-06, "loss": -0.05997609347105026, "ratio/all_0": 0.0859375, "ratio/all_2": 0.5234375, "reward": 1.8171875476837158, "reward_std": 0.78060582280159, "rewards/avg_0": 1.8359375, "rewards/avg_1": 1.7578125, "rewards/avg_2": 1.75, "rewards/avg_3": 1.7421875, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.7578125, "rewards/avg_7": 1.734375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8171875178813934, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 10.516406536102295, "epoch": 0.0022484541877459247, "kl": 0.19082757085561752, "learning_rate": 9.999887640449438e-07, "loss": -0.05867362394928932, "ratio/all_0": 0.1171875, "ratio/all_2": 0.4609375, "reward": 1.77734375, "reward_std": 0.7686226665973663, "rewards/avg_0": 1.7109375, "rewards/avg_1": 1.640625, "rewards/avg_2": 1.7578125, "rewards/avg_3": 1.7265625, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.6875, "rewards/avg_6": 1.703125, "rewards/avg_7": 1.75, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.7773437798023224, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 10.396093845367432, "epoch": 0.003372681281618887, "kl": 0.8681274950504303, "learning_rate": 9.999775280898875e-07, "loss": -0.03326645493507385, "ratio/all_0": 0.09375, "ratio/all_2": 0.484375, "reward": 1.796093761920929, "reward_std": 0.7614105343818665, "rewards/avg_0": 1.7421875, "rewards/avg_1": 1.7109375, "rewards/avg_2": 1.7734375, "rewards/avg_3": 1.7265625, "rewards/avg_4": 1.71875, "rewards/avg_5": 1.7734375, "rewards/avg_6": 1.75, "rewards/avg_7": 1.765625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.796875, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 10.425781726837158, "epoch": 0.004496908375491849, "kl": 1.0842570066452026, "learning_rate": 9.999662921348314e-07, "loss": -0.01579746976494789, "ratio/all_0": 0.0390625, "ratio/all_2": 0.5546875, "reward": 1.8515625, "reward_std": 0.779606819152832, "rewards/avg_0": 1.8046875, "rewards/avg_1": 1.7890625, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.8203125, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.8046875, "rewards/avg_6": 1.8515625, "rewards/avg_7": 1.8046875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8515625298023224, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 10.410937786102295, "epoch": 0.005621135469364812, "kl": 1.4418902397155762, "learning_rate": 9.999550561797753e-07, "loss": -0.01998813822865486, "ratio/all_0": 0.1484375, "ratio/all_2": 0.3671875, "reward": 1.736718773841858, "reward_std": 0.7228272557258606, "rewards/avg_0": 1.6875, "rewards/avg_1": 1.6640625, "rewards/avg_2": 1.640625, "rewards/avg_3": 1.6796875, "rewards/avg_4": 1.6796875, "rewards/avg_5": 1.6328125, "rewards/avg_6": 1.6953125, "rewards/avg_7": 1.6875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.7367187440395355, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 10.425000190734863, "epoch": 0.006745362563237774, "kl": 1.5545793771743774, "learning_rate": 9.99943820224719e-07, "loss": -0.014625937677919865, "ratio/all_0": 0.0546875, "ratio/all_2": 0.484375, "reward": 1.8125, "reward_std": 0.7375520765781403, "rewards/avg_0": 1.75, "rewards/avg_1": 1.765625, "rewards/avg_2": 1.75, "rewards/avg_3": 1.75, "rewards/avg_4": 1.765625, "rewards/avg_5": 1.8046875, "rewards/avg_6": 1.7890625, "rewards/avg_7": 1.75, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8125000298023224, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 10.370312690734863, "epoch": 0.007869589657110737, "kl": 1.6209379434585571, "learning_rate": 9.999325842696629e-07, "loss": -0.026230724528431892, "ratio/all_0": 0.0859375, "ratio/all_2": 0.4296875, "reward": 1.762499988079071, "reward_std": 0.7292351722717285, "rewards/avg_0": 1.671875, "rewards/avg_1": 1.6953125, "rewards/avg_2": 1.6953125, "rewards/avg_3": 1.71875, "rewards/avg_4": 1.7265625, "rewards/avg_5": 1.71875, "rewards/avg_6": 1.7265625, "rewards/avg_7": 1.671875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.7625000178813934, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 10.485937595367432, "epoch": 0.008993816750983699, "kl": 1.5760462284088135, "learning_rate": 9.999213483146068e-07, "loss": -0.013463463634252548, "ratio/all_0": 0.0546875, "ratio/all_2": 0.5625, "reward": 1.822656273841858, "reward_std": 0.7794111371040344, "rewards/avg_0": 1.7890625, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.796875, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.78125, "rewards/avg_5": 1.7734375, "rewards/avg_6": 1.7578125, "rewards/avg_7": 1.765625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8226562440395355, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 10.241406440734863, "epoch": 0.01011804384485666, "kl": 1.6185383796691895, "learning_rate": 9.999101123595504e-07, "loss": -0.002992298686876893, "ratio/all_0": 0.0703125, "ratio/all_2": 0.4921875, "reward": 1.8101562857627869, "reward_std": 0.7595367431640625, "rewards/avg_0": 1.7265625, "rewards/avg_1": 1.7421875, "rewards/avg_2": 1.7734375, "rewards/avg_3": 1.7734375, "rewards/avg_4": 1.75, "rewards/avg_5": 1.7578125, "rewards/avg_6": 1.8046875, "rewards/avg_7": 1.7734375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8101562559604645, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 10.352344036102295, "epoch": 0.011242270938729624, "kl": 1.6257511377334595, "learning_rate": 9.998988764044943e-07, "loss": 0.0020705917850136757, "ratio/all_0": 0.0625, "ratio/all_2": 0.5546875, "reward": 1.8320313096046448, "reward_std": 0.785227507352829, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.765625, "rewards/avg_2": 1.7890625, "rewards/avg_3": 1.8046875, "rewards/avg_4": 1.8203125, "rewards/avg_5": 1.765625, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.78125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.83203125, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 10.372656345367432, "epoch": 0.012366498032602586, "kl": 1.6162938475608826, "learning_rate": 9.998876404494382e-07, "loss": 0.007467743009328842, "ratio/all_0": 0.0703125, "ratio/all_2": 0.515625, "reward": 1.830468773841858, "reward_std": 0.7664113640785217, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.78125, "rewards/avg_3": 1.7890625, "rewards/avg_4": 1.7578125, "rewards/avg_5": 1.8046875, "rewards/avg_6": 1.7734375, "rewards/avg_7": 1.8046875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8304687738418579, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 10.474218845367432, "epoch": 0.013490725126475547, "kl": 1.6269961595535278, "learning_rate": 9.99876404494382e-07, "loss": 0.004299253225326538, "ratio/all_0": 0.0859375, "ratio/all_2": 0.5390625, "reward": 1.8203125, "reward_std": 0.7797641456127167, "rewards/avg_0": 1.765625, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.7578125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.7578125, "rewards/avg_5": 1.765625, "rewards/avg_6": 1.7734375, "rewards/avg_7": 1.7578125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8203125298023224, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 10.396093845367432, "epoch": 0.01461495222034851, "kl": 1.6182974576950073, "learning_rate": 9.998651685393258e-07, "loss": -0.004290747921913862, "ratio/all_0": 0.0703125, "ratio/all_2": 0.4921875, "reward": 1.813281238079071, "reward_std": 0.7540134191513062, "rewards/avg_0": 1.7734375, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.765625, "rewards/avg_3": 1.765625, "rewards/avg_4": 1.7578125, "rewards/avg_5": 1.7890625, "rewards/avg_6": 1.7265625, "rewards/avg_7": 1.7734375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8132812678813934, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 10.291406154632568, "epoch": 0.015739179314221474, "kl": 1.652035653591156, "learning_rate": 9.998539325842697e-07, "loss": -0.010208970867097378, "ratio/all_0": 0.0546875, "ratio/all_2": 0.515625, "reward": 1.8179687857627869, "reward_std": 0.7553539872169495, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.734375, "rewards/avg_2": 1.765625, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.8046875, "rewards/avg_5": 1.7421875, "rewards/avg_6": 1.7890625, "rewards/avg_7": 1.765625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8179687559604645, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 10.327343940734863, "epoch": 0.016863406408094434, "kl": 1.6964421272277832, "learning_rate": 9.998426966292134e-07, "loss": -0.003030479419976473, "ratio/all_0": 0.0234375, "ratio/all_2": 0.5078125, "reward": 1.84765625, "reward_std": 0.73407843708992, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.8046875, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.7890625, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.8359375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8476562798023224, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 10.395312786102295, "epoch": 0.017987633501967398, "kl": 1.6439769864082336, "learning_rate": 9.998314606741573e-07, "loss": 0.0077795363031327724, "ratio/all_0": 0.109375, "ratio/all_2": 0.5546875, "reward": 1.807031273841858, "reward_std": 0.8117163181304932, "rewards/avg_0": 1.75, "rewards/avg_1": 1.7421875, "rewards/avg_2": 1.7421875, "rewards/avg_3": 1.734375, "rewards/avg_4": 1.7578125, "rewards/avg_5": 1.7734375, "rewards/avg_6": 1.765625, "rewards/avg_7": 1.8046875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8070312440395355, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 10.37656307220459, "epoch": 0.01911186059584036, "kl": 1.606972336769104, "learning_rate": 9.998202247191011e-07, "loss": -0.013812951743602753, "ratio/all_0": 0.046875, "ratio/all_2": 0.5, "reward": 1.819531261920929, "reward_std": 0.7394805550575256, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.7421875, "rewards/avg_2": 1.75, "rewards/avg_3": 1.765625, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.765625, "rewards/avg_6": 1.7734375, "rewards/avg_7": 1.8046875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.819531261920929, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 10.347656726837158, "epoch": 0.02023608768971332, "kl": 1.6440010070800781, "learning_rate": 9.998089887640448e-07, "loss": 0.002852785401046276, "ratio/all_0": 0.0234375, "ratio/all_2": 0.6171875, "reward": 1.865625023841858, "reward_std": 0.7961074411869049, "rewards/avg_0": 1.8359375, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.8515625, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.8046875, "rewards/avg_5": 1.8515625, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8656250238418579, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 10.438281059265137, "epoch": 0.021360314783586284, "kl": 1.5873689651489258, "learning_rate": 9.997977528089887e-07, "loss": 0.009698862209916115, "ratio/all_0": 0.0234375, "ratio/all_2": 0.625, "reward": 1.872656226158142, "reward_std": 0.8082565665245056, "rewards/avg_0": 1.8515625, "rewards/avg_1": 1.8515625, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.8515625, "rewards/avg_5": 1.8203125, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.8515625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8726562857627869, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 10.417187690734863, "epoch": 0.022484541877459248, "kl": 1.6128740310668945, "learning_rate": 9.997865168539326e-07, "loss": -0.008634892292320728, "ratio/all_0": 0.0390625, "ratio/all_2": 0.5234375, "reward": 1.8335937857627869, "reward_std": 0.748685210943222, "rewards/avg_0": 1.7734375, "rewards/avg_1": 1.8203125, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.7734375, "rewards/avg_4": 1.7578125, "rewards/avg_5": 1.796875, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.7890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.8343749940395355, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 10.411718845367432, "epoch": 0.023608768971332208, "kl": 1.613295078277588, "learning_rate": 9.997752808988763e-07, "loss": 0.009994986467063427, "ratio/all_0": 0.0546875, "ratio/all_2": 0.5859375, "reward": 1.8492187857627869, "reward_std": 0.7984052896499634, "rewards/avg_0": 1.8046875, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.8203125, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.8515625, "rewards/avg_5": 1.8046875, "rewards/avg_6": 1.7890625, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8492187559604645, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 10.517968654632568, "epoch": 0.02473299606520517, "kl": 1.6258622407913208, "learning_rate": 9.997640449438202e-07, "loss": 0.005634765140712261, "ratio/all_0": 0.0390625, "ratio/all_2": 0.59375, "reward": 1.853906273841858, "reward_std": 0.7958876490592957, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.8359375, "rewards/avg_2": 1.8046875, "rewards/avg_3": 1.8203125, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.8203125, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.8203125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8539062738418579, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 10.526562690734863, "epoch": 0.025857223159078135, "kl": 1.6079289317131042, "learning_rate": 9.99752808988764e-07, "loss": 0.00109954085201025, "ratio/all_0": 0.0546875, "ratio/all_2": 0.546875, "reward": 1.8312500715255737, "reward_std": 0.7803563475608826, "rewards/avg_0": 1.7421875, "rewards/avg_1": 1.8046875, "rewards/avg_2": 1.7734375, "rewards/avg_3": 1.796875, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.8046875, "rewards/avg_6": 1.8046875, "rewards/avg_7": 1.7734375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.831250011920929, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 10.396093845367432, "epoch": 0.026981450252951095, "kl": 1.6150875091552734, "learning_rate": 9.99741573033708e-07, "loss": 0.016750413924455643, "ratio/all_0": 0.0625, "ratio/all_2": 0.6015625, "reward": 1.8609375357627869, "reward_std": 0.8065908253192902, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.8359375, "rewards/avg_2": 1.7890625, "rewards/avg_3": 1.8515625, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.8046875, "rewards/avg_6": 1.8515625, "rewards/avg_7": 1.8515625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.8617187738418579, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 10.32421875, "epoch": 0.028105677346824058, "kl": 1.6833326816558838, "learning_rate": 9.997303370786516e-07, "loss": 0.017063738778233528, "ratio/all_0": 0.078125, "ratio/all_2": 0.578125, "reward": 1.83984375, "reward_std": 0.8057522475719452, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.7734375, "rewards/avg_2": 1.8046875, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.765625, "rewards/avg_5": 1.8203125, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8398437798023224, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 10.313281536102295, "epoch": 0.02922990444069702, "kl": 1.6673230528831482, "learning_rate": 9.997191011235955e-07, "loss": -0.007147204130887985, "ratio/all_0": 0.046875, "ratio/all_2": 0.5390625, "reward": 1.819531261920929, "reward_std": 0.7743638455867767, "rewards/avg_0": 1.7578125, "rewards/avg_1": 1.78125, "rewards/avg_2": 1.765625, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.8203125, "rewards/avg_5": 1.7890625, "rewards/avg_6": 1.7578125, "rewards/avg_7": 1.7421875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.819531261920929, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 10.321875095367432, "epoch": 0.03035413153456998, "kl": 1.7122045159339905, "learning_rate": 9.997078651685394e-07, "loss": 0.017641516402363777, "ratio/all_0": 0.015625, "ratio/all_2": 0.5859375, "reward": 1.889062523841858, "reward_std": 0.7684561014175415, "rewards/avg_0": 1.875, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.8515625, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.875, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.8515625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8890625238418579, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 10.247656345367432, "epoch": 0.03147835862844295, "kl": 1.7543825507164001, "learning_rate": 9.99696629213483e-07, "loss": 0.01977822184562683, "ratio/all_0": 0.0625, "ratio/all_2": 0.6015625, "reward": 1.850000023841858, "reward_std": 0.8139838874340057, "rewards/avg_0": 1.765625, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.8046875, "rewards/avg_4": 1.8515625, "rewards/avg_5": 1.8046875, "rewards/avg_6": 1.8046875, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8500000238418579, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 10.428906440734863, "epoch": 0.032602585722315905, "kl": 1.6742790341377258, "learning_rate": 9.99685393258427e-07, "loss": 0.025930162519216537, "ratio/all_0": 0.03125, "ratio/all_2": 0.59375, "reward": 1.892968773841858, "reward_std": 0.782163679599762, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.8828125, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8929687440395355, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 10.371094226837158, "epoch": 0.03372681281618887, "kl": 1.6981696486473083, "learning_rate": 9.996741573033709e-07, "loss": 0.01707715541124344, "ratio/all_0": 0.0234375, "ratio/all_2": 0.6328125, "reward": 1.885156273841858, "reward_std": 0.8080797493457794, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.8515625, "rewards/avg_2": 1.8515625, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.8671875, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8851562738418579, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 10.41796875, "epoch": 0.03485103991006183, "kl": 1.666592299938202, "learning_rate": 9.996629213483146e-07, "loss": 0.0028374078683555126, "ratio/all_0": 0.03125, "ratio/all_2": 0.5859375, "reward": 1.859375, "reward_std": 0.7776727378368378, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.8515625, "rewards/avg_4": 1.8515625, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.8203125, "rewards/avg_7": 1.8046875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8593750298023224, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 10.405468940734863, "epoch": 0.035975267003934795, "kl": 1.6903700828552246, "learning_rate": 9.996516853932585e-07, "loss": 0.017414983361959457, "ratio/all_0": 0.03125, "ratio/all_2": 0.65625, "reward": 1.8796875476837158, "reward_std": 0.8306158185005188, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.8515625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.8359375, "rewards/avg_5": 1.8359375, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.8804687559604645, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 10.460156440734863, "epoch": 0.03709949409780776, "kl": 1.6858877539634705, "learning_rate": 9.996404494382023e-07, "loss": 0.015359701588749886, "ratio/all_0": 0.0390625, "ratio/all_2": 0.609375, "reward": 1.867968738079071, "reward_std": 0.8056941330432892, "rewards/avg_0": 1.8203125, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.8515625, "rewards/avg_3": 1.8515625, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.8359375, "rewards/avg_7": 1.7890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8679687678813934, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 10.491406440734863, "epoch": 0.03822372119168072, "kl": 1.652479112148285, "learning_rate": 9.99629213483146e-07, "loss": 0.012801921926438808, "ratio/all_0": 0.0625, "ratio/all_2": 0.6171875, "reward": 1.848437488079071, "reward_std": 0.8226411044597626, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.8046875, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.78125, "rewards/avg_6": 1.8515625, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8484375178813934, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 10.46875, "epoch": 0.03934794828555368, "kl": 1.6815390586853027, "learning_rate": 9.9961797752809e-07, "loss": 0.01791820488870144, "ratio/all_0": 0.0390625, "ratio/all_2": 0.6328125, "reward": 1.874218761920929, "reward_std": 0.811478853225708, "rewards/avg_0": 1.8203125, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.8515625, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.8359375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.8359375, "rewards/avg_7": 1.8671875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.874218761920929, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 10.484375476837158, "epoch": 0.04047217537942664, "kl": 1.6642175912857056, "learning_rate": 9.996067415730338e-07, "loss": 0.01151614636182785, "ratio/all_0": 0.0546875, "ratio/all_2": 0.6171875, "reward": 1.85546875, "reward_std": 0.8131150305271149, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.8046875, "rewards/avg_3": 1.8046875, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.8203125, "rewards/avg_7": 1.796875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.8562500178813934, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 10.414062976837158, "epoch": 0.041596402473299605, "kl": 1.6078103184700012, "learning_rate": 9.995955056179775e-07, "loss": 0.01336689293384552, "ratio/all_0": 0.03125, "ratio/all_2": 0.6875, "reward": 1.8781250715255737, "reward_std": 0.8475979864597321, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.8515625, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.8671875, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.8515625, "rewards/avg_7": 1.8515625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.878125011920929, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 10.53281307220459, "epoch": 0.04272062956717257, "kl": 1.6236757636070251, "learning_rate": 9.995842696629214e-07, "loss": 0.0206887386739254, "ratio/all_0": 0.03125, "ratio/all_2": 0.65625, "reward": 1.8882812857627869, "reward_std": 0.823714554309845, "rewards/avg_0": 1.875, "rewards/avg_1": 1.8359375, "rewards/avg_2": 1.8515625, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.8671875, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8882812559604645, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 10.517187595367432, "epoch": 0.04384485666104553, "kl": 1.6272926926612854, "learning_rate": 9.995730337078653e-07, "loss": -0.0022770659998059273, "ratio/all_0": 0.0390625, "ratio/all_2": 0.5390625, "reward": 1.8445312976837158, "reward_std": 0.7606980204582214, "rewards/avg_0": 1.7890625, "rewards/avg_1": 1.796875, "rewards/avg_2": 1.7890625, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.8046875, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.8046875, "rewards/avg_7": 1.8046875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8445312678813934, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 10.602344036102295, "epoch": 0.044969083754918496, "kl": 1.587278664112091, "learning_rate": 9.99561797752809e-07, "loss": 0.022644348442554474, "ratio/all_0": 0.046875, "ratio/all_2": 0.6875, "reward": 1.885156273841858, "reward_std": 0.8540178835391998, "rewards/avg_0": 1.8515625, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.8671875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.8671875, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8851562738418579, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 10.587500095367432, "epoch": 0.04609331084879146, "kl": 1.6034030318260193, "learning_rate": 9.995505617977528e-07, "loss": 0.009336121380329132, "ratio/all_0": 0.015625, "ratio/all_2": 0.5703125, "reward": 1.8820313215255737, "reward_std": 0.7586211562156677, "rewards/avg_0": 1.8515625, "rewards/avg_1": 1.8515625, "rewards/avg_2": 1.8359375, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.8671875, "rewards/avg_5": 1.8359375, "rewards/avg_6": 1.8515625, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.882031261920929, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 10.473437786102295, "epoch": 0.047217537942664416, "kl": 1.5840712189674377, "learning_rate": 9.995393258426967e-07, "loss": 0.018098287284374237, "ratio/all_0": 0.03125, "ratio/all_2": 0.609375, "reward": 1.889062523841858, "reward_std": 0.7948274314403534, "rewards/avg_0": 1.875, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.8515625, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.875, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.889843761920929, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 10.46484375, "epoch": 0.04834176503653738, "kl": 1.6059932112693787, "learning_rate": 9.995280898876404e-07, "loss": 0.02952752448618412, "ratio/all_0": 0.03125, "ratio/all_2": 0.7109375, "reward": 1.9078125357627869, "reward_std": 0.8541045188903809, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.875, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9078125059604645, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 10.481250286102295, "epoch": 0.04946599213041034, "kl": 1.5430679321289062, "learning_rate": 9.995168539325843e-07, "loss": 0.013610566034913063, "ratio/all_0": 0.0390625, "ratio/all_2": 0.6328125, "reward": 1.8796875476837158, "reward_std": 0.8104925453662872, "rewards/avg_0": 1.8359375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.8046875, "rewards/avg_5": 1.8671875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.8046875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8796875178813934, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 10.551562786102295, "epoch": 0.050590219224283306, "kl": 1.5598188638687134, "learning_rate": 9.995056179775282e-07, "loss": 0.02287764474749565, "ratio/all_0": 0.03125, "ratio/all_2": 0.6796875, "reward": 1.897656261920929, "reward_std": 0.8357208371162415, "rewards/avg_0": 1.875, "rewards/avg_1": 1.8671875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.8671875, "rewards/avg_6": 1.8515625, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.8984375298023224, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 10.471875190734863, "epoch": 0.05171444631815627, "kl": 1.538119375705719, "learning_rate": 9.994943820224719e-07, "loss": 0.01902620680630207, "ratio/all_0": 0.015625, "ratio/all_2": 0.625, "reward": 1.9039062857627869, "reward_std": 0.7881555557250977, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.8671875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.9296875, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9046875238418579, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 10.503125190734863, "epoch": 0.05283867341202923, "kl": 1.500148355960846, "learning_rate": 9.994831460674158e-07, "loss": 0.03803616017103195, "ratio/all_0": 0.0390625, "ratio/all_2": 0.734375, "reward": 1.9226562976837158, "reward_std": 0.8660586476325989, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.9453125, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9226562678813934, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 10.578125476837158, "epoch": 0.05396290050590219, "kl": 1.4877179265022278, "learning_rate": 9.994719101123596e-07, "loss": 0.016881819814443588, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7109375, "reward": 1.8992187976837158, "reward_std": 0.8531339764595032, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.8671875, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8992187678813934, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 10.573437690734863, "epoch": 0.05508712759977515, "kl": 1.4212284088134766, "learning_rate": 9.994606741573033e-07, "loss": 0.010123580694198608, "ratio/all_0": 0.03125, "ratio/all_2": 0.625, "reward": 1.8835937976837158, "reward_std": 0.8060767948627472, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.8671875, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.8515625, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.8359375, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.8671875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.8843750059604645, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 10.537500381469727, "epoch": 0.056211354693648116, "kl": 1.405593991279602, "learning_rate": 9.994494382022472e-07, "loss": 0.012005077674984932, "ratio/all_0": 0.0390625, "ratio/all_2": 0.6171875, "reward": 1.8796875476837158, "reward_std": 0.8047949373722076, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.8359375, "rewards/avg_2": 1.8828125, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.8203125, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8796875178813934, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 10.54843807220459, "epoch": 0.05733558178752108, "kl": 1.379647970199585, "learning_rate": 9.994382022471909e-07, "loss": 0.023727476596832275, "ratio/all_0": 0.0625, "ratio/all_2": 0.671875, "reward": 1.8875000476837158, "reward_std": 0.8443561792373657, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.887499988079071, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 10.575781345367432, "epoch": 0.05845980888139404, "kl": 1.4058558344841003, "learning_rate": 9.994269662921348e-07, "loss": 0.022184893488883972, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7109375, "reward": 1.920312523841858, "reward_std": 0.8409400582313538, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.9375, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.8671875, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9203125238418579, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 10.510937690734863, "epoch": 0.059584035975267007, "kl": 1.3978136777877808, "learning_rate": 9.994157303370787e-07, "loss": 0.025388304144144058, "ratio/all_0": 0.03125, "ratio/all_2": 0.703125, "reward": 1.914843738079071, "reward_std": 0.847051203250885, "rewards/avg_0": 1.8671875, "rewards/avg_1": 1.9375, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.9296875, "rewards/avg_4": 1.8515625, "rewards/avg_5": 1.9140625, "rewards/avg_6": 1.8671875, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9148437678813934, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 10.57187557220459, "epoch": 0.06070826306913996, "kl": 1.4011179208755493, "learning_rate": 9.994044943820224e-07, "loss": 0.011839143931865692, "ratio/all_0": 0.03125, "ratio/all_2": 0.6875, "reward": 1.8914062976837158, "reward_std": 0.8390854001045227, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.8671875, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8914062678813934, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 10.611719131469727, "epoch": 0.061832490163012926, "kl": 1.4960259199142456, "learning_rate": 9.993932584269662e-07, "loss": 0.02140812575817108, "ratio/all_0": 0.046875, "ratio/all_2": 0.578125, "reward": 1.8828125, "reward_std": 0.7827528119087219, "rewards/avg_0": 1.8671875, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.8671875, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.78125, "rewards/avg_7": 1.8671875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8828125298023224, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 10.598437786102295, "epoch": 0.0629567172568859, "kl": 1.549820065498352, "learning_rate": 9.993820224719101e-07, "loss": 0.018605422228574753, "ratio/all_0": 0.046875, "ratio/all_2": 0.65625, "reward": 1.885937511920929, "reward_std": 0.823595255613327, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.8359375, "rewards/avg_3": 1.8359375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.8515625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.885937511920929, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 10.602344036102295, "epoch": 0.06408094435075885, "kl": 1.5655131936073303, "learning_rate": 9.993707865168538e-07, "loss": 0.01846824586391449, "ratio/all_0": 0.0390625, "ratio/all_2": 0.6328125, "reward": 1.887499988079071, "reward_std": 0.8110823035240173, "rewards/avg_0": 1.8515625, "rewards/avg_1": 1.8359375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.875, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.875, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8875000178813934, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 10.494531154632568, "epoch": 0.06520517144463181, "kl": 1.5266385078430176, "learning_rate": 9.993595505617977e-07, "loss": 0.0141812264919281, "ratio/all_0": 0.0078125, "ratio/all_2": 0.71875, "reward": 1.9031250476837158, "reward_std": 0.8471269309520721, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.8515625, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9031250178813934, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 10.503906726837158, "epoch": 0.06632939853850478, "kl": 1.5263578295707703, "learning_rate": 9.993483146067416e-07, "loss": 0.014608601108193398, "ratio/all_0": 0.0234375, "ratio/all_2": 0.6640625, "reward": 1.8960937857627869, "reward_std": 0.8199012279510498, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.8515625, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.875, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8960937559604645, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 10.405468940734863, "epoch": 0.06745362563237774, "kl": 1.4636675119400024, "learning_rate": 9.993370786516853e-07, "loss": 0.017880389466881752, "ratio/all_0": 0.015625, "ratio/all_2": 0.6640625, "reward": 1.9070312976837158, "reward_std": 0.8169863820075989, "rewards/avg_0": 1.8671875, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.8828125, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.8671875, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.907031238079071, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 10.500781059265137, "epoch": 0.06857785272625071, "kl": 1.4090477228164673, "learning_rate": 9.993258426966292e-07, "loss": 0.006813580170273781, "ratio/all_0": 0.0234375, "ratio/all_2": 0.6796875, "reward": 1.893750011920929, "reward_std": 0.8272766470909119, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.875, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.875, "rewards/avg_5": 1.8515625, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.8671875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.893750011920929, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 10.535937786102295, "epoch": 0.06970207982012366, "kl": 1.3941453099250793, "learning_rate": 9.99314606741573e-07, "loss": 0.028155988082289696, "ratio/all_0": 0.0234375, "ratio/all_2": 0.6875, "reward": 1.9257813096046448, "reward_std": 0.8283512890338898, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.9296875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.92578125, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 10.515625, "epoch": 0.07082630691399663, "kl": 1.4108628034591675, "learning_rate": 9.993033707865167e-07, "loss": 0.01993025653064251, "ratio/all_0": 0.0390625, "ratio/all_2": 0.6484375, "reward": 1.8960937857627869, "reward_std": 0.8207258880138397, "rewards/avg_0": 1.8671875, "rewards/avg_1": 1.8671875, "rewards/avg_2": 1.8203125, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.875, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8960937559604645, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 10.526562690734863, "epoch": 0.07195053400786959, "kl": 1.4183465838432312, "learning_rate": 9.992921348314606e-07, "loss": 0.02124522626399994, "ratio/all_0": 0.0390625, "ratio/all_2": 0.7109375, "reward": 1.8984375, "reward_std": 0.8578440248966217, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.8828125, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.8359375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8984375298023224, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 10.477344036102295, "epoch": 0.07307476110174255, "kl": 1.4451609253883362, "learning_rate": 9.992808988764045e-07, "loss": 0.015427444130182266, "ratio/all_0": 0.0546875, "ratio/all_2": 0.640625, "reward": 1.8750000596046448, "reward_std": 0.8293256461620331, "rewards/avg_0": 1.796875, "rewards/avg_1": 1.875, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.8515625, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.875, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 10.509375095367432, "epoch": 0.07419898819561552, "kl": 1.483718454837799, "learning_rate": 9.992696629213482e-07, "loss": 0.024618471041321754, "ratio/all_0": 0.015625, "ratio/all_2": 0.6875, "reward": 1.917187511920929, "reward_std": 0.8308260142803192, "rewards/avg_0": 1.875, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.9375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.917187511920929, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 10.473437309265137, "epoch": 0.07532321528948847, "kl": 1.4856454133987427, "learning_rate": 9.99258426966292e-07, "loss": 0.01140589639544487, "ratio/all_0": 0.046875, "ratio/all_2": 0.6328125, "reward": 1.869531273841858, "reward_std": 0.8212272822856903, "rewards/avg_0": 1.875, "rewards/avg_1": 1.8046875, "rewards/avg_2": 1.8046875, "rewards/avg_3": 1.875, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.8671875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8695312440395355, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 10.454687595367432, "epoch": 0.07644744238336144, "kl": 1.4795401096343994, "learning_rate": 9.99247191011236e-07, "loss": 0.014171874150633812, "ratio/all_0": 0.0546875, "ratio/all_2": 0.6171875, "reward": 1.867968738079071, "reward_std": 0.8124658763408661, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.8046875, "rewards/avg_2": 1.8359375, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.8203125, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8679687678813934, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 10.560937881469727, "epoch": 0.0775716694772344, "kl": 1.4359744787216187, "learning_rate": 9.992359550561797e-07, "loss": 0.022417651489377022, "ratio/all_0": 0.015625, "ratio/all_2": 0.734375, "reward": 1.9195312857627869, "reward_std": 0.857976645231247, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.8671875, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9195312559604645, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 10.557031154632568, "epoch": 0.07869589657110736, "kl": 1.480742871761322, "learning_rate": 9.992247191011235e-07, "loss": 0.019886016845703125, "ratio/all_0": 0.0703125, "ratio/all_2": 0.609375, "reward": 1.865625023841858, "reward_std": 0.8209626972675323, "rewards/avg_0": 1.8359375, "rewards/avg_1": 1.7734375, "rewards/avg_2": 1.8671875, "rewards/avg_3": 1.8359375, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.8359375, "rewards/avg_6": 1.8359375, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8656249940395355, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 10.470312595367432, "epoch": 0.07982012366498033, "kl": 1.382175326347351, "learning_rate": 9.992134831460674e-07, "loss": 0.013301249593496323, "ratio/all_0": 0.0546875, "ratio/all_2": 0.6875, "reward": 1.8789063096046448, "reward_std": 0.8519689440727234, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.8515625, "rewards/avg_3": 1.8203125, "rewards/avg_4": 1.8671875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.87890625, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 10.515625, "epoch": 0.08094435075885328, "kl": 1.3844139575958252, "learning_rate": 9.992022471910111e-07, "loss": 0.00932213943451643, "ratio/all_0": 0.03125, "ratio/all_2": 0.671875, "reward": 1.8875000476837158, "reward_std": 0.8321071863174438, "rewards/avg_0": 1.8515625, "rewards/avg_1": 1.8671875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.8671875, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.887499988079071, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 10.482031345367432, "epoch": 0.08206857785272625, "kl": 1.3557974100112915, "learning_rate": 9.99191011235955e-07, "loss": 0.025301288813352585, "ratio/all_0": 0.015625, "ratio/all_2": 0.703125, "reward": 1.9273437857627869, "reward_std": 0.8360967040061951, "rewards/avg_0": 1.9296875, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9273437559604645, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 10.583593845367432, "epoch": 0.08319280494659921, "kl": 1.3512595295906067, "learning_rate": 9.99179775280899e-07, "loss": 0.015560301020741463, "ratio/all_0": 0.0390625, "ratio/all_2": 0.6875, "reward": 1.895312488079071, "reward_std": 0.8442690074443817, "rewards/avg_0": 1.8671875, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8953125178813934, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 10.58203125, "epoch": 0.08431703204047218, "kl": 1.3721051812171936, "learning_rate": 9.991685393258426e-07, "loss": 0.025348057970404625, "ratio/all_0": 0.03125, "ratio/all_2": 0.6953125, "reward": 1.916406273841858, "reward_std": 0.8428552448749542, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9164062440395355, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 10.527344226837158, "epoch": 0.08544125913434514, "kl": 1.4204409718513489, "learning_rate": 9.991573033707865e-07, "loss": 0.008842497132718563, "ratio/all_0": 0.0234375, "ratio/all_2": 0.703125, "reward": 1.892187476158142, "reward_std": 0.8469344079494476, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.875, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.875, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8921875357627869, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 10.510156154632568, "epoch": 0.0865654862282181, "kl": 1.4739211201667786, "learning_rate": 9.991460674157304e-07, "loss": 0.013035193085670471, "ratio/all_0": 0.015625, "ratio/all_2": 0.6484375, "reward": 1.8992187976837158, "reward_std": 0.8030118048191071, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.8828125, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8992187678813934, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 10.525000095367432, "epoch": 0.08768971332209106, "kl": 1.540924608707428, "learning_rate": 9.99134831460674e-07, "loss": 0.010562058538198471, "ratio/all_0": 0.0390625, "ratio/all_2": 0.625, "reward": 1.87109375, "reward_std": 0.8110974431037903, "rewards/avg_0": 1.8359375, "rewards/avg_1": 1.8203125, "rewards/avg_2": 1.8515625, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.8515625, "rewards/avg_5": 1.8359375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.87109375, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 10.564844131469727, "epoch": 0.08881394041596402, "kl": 1.5210090279579163, "learning_rate": 9.99123595505618e-07, "loss": 0.023617252707481384, "ratio/all_0": 0.0234375, "ratio/all_2": 0.703125, "reward": 1.908593773841858, "reward_std": 0.8432826697826385, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.9140625, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.8671875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9085937738418579, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 10.55859375, "epoch": 0.08993816750983699, "kl": 1.5450841188430786, "learning_rate": 9.991123595505618e-07, "loss": 0.030371172353625298, "ratio/all_0": 0.03125, "ratio/all_2": 0.6953125, "reward": 1.91796875, "reward_std": 0.8362286984920502, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.9296875, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9187500178813934, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 10.520312786102295, "epoch": 0.09106239460370995, "kl": 1.5201088786125183, "learning_rate": 9.991011235955055e-07, "loss": 0.01299591176211834, "ratio/all_0": 0.0390625, "ratio/all_2": 0.6796875, "reward": 1.884374976158142, "reward_std": 0.8359851539134979, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.8851562738418579, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 10.455468654632568, "epoch": 0.09218662169758292, "kl": 1.4778642058372498, "learning_rate": 9.990898876404494e-07, "loss": 0.026645315811038017, "ratio/all_0": 0.03125, "ratio/all_2": 0.7109375, "reward": 1.909375011920929, "reward_std": 0.8547311127185822, "rewards/avg_0": 1.9296875, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.875, "rewards/avg_3": 1.875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.9140625, "rewards/avg_6": 1.8671875, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.91015625, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 10.520312786102295, "epoch": 0.09331084879145587, "kl": 1.4318110346794128, "learning_rate": 9.990786516853933e-07, "loss": 0.02681383118033409, "ratio/all_0": 0.03125, "ratio/all_2": 0.7421875, "reward": 1.9156250357627869, "reward_std": 0.8724653422832489, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9156250059604645, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 10.535937786102295, "epoch": 0.09443507588532883, "kl": 1.3569493889808655, "learning_rate": 9.99067415730337e-07, "loss": 0.012078452855348587, "ratio/all_0": 0.015625, "ratio/all_2": 0.734375, "reward": 1.9101563096046448, "reward_std": 0.8564302027225494, "rewards/avg_0": 1.8671875, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.8515625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.91015625, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 10.544531345367432, "epoch": 0.0955593029792018, "kl": 1.42191481590271, "learning_rate": 9.990561797752808e-07, "loss": 0.02844950556755066, "ratio/all_0": 0.0078125, "ratio/all_2": 0.7421875, "reward": 1.934374988079071, "reward_std": 0.8588309288024902, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.9375, "rewards/avg_3": 1.9375, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.9140625, "rewards/avg_6": 1.9296875, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9343750178813934, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 10.442968845367432, "epoch": 0.09668353007307476, "kl": 1.3797348737716675, "learning_rate": 9.990449438202247e-07, "loss": 0.02336825057864189, "ratio/all_0": 0.015625, "ratio/all_2": 0.765625, "reward": 1.927343726158142, "reward_std": 0.875806987285614, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9273437857627869, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 10.498437404632568, "epoch": 0.09780775716694773, "kl": 1.3973508477210999, "learning_rate": 9.990337078651684e-07, "loss": 0.016720149666070938, "ratio/all_0": 0.015625, "ratio/all_2": 0.6796875, "reward": 1.9101563096046448, "reward_std": 0.8276163339614868, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.8828125, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.91015625, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 10.514062881469727, "epoch": 0.09893198426082069, "kl": 1.412993311882019, "learning_rate": 9.990224719101123e-07, "loss": 0.03139276057481766, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7734375, "reward": 1.932031273841858, "reward_std": 0.8831829130649567, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9320312440395355, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 10.442968845367432, "epoch": 0.10005621135469364, "kl": 1.3898016214370728, "learning_rate": 9.990112359550562e-07, "loss": 0.024522768333554268, "ratio/all_0": 0.0, "ratio/all_2": 0.7890625, "reward": 1.9406250715255737, "reward_std": 0.8826980292797089, "rewards/avg_0": 1.9296875, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.9375, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.9375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.940625011920929, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 10.457812786102295, "epoch": 0.10118043844856661, "kl": 1.413306713104248, "learning_rate": 9.989999999999999e-07, "loss": 0.020547227934002876, "ratio/all_0": 0.015625, "ratio/all_2": 0.6796875, "reward": 1.917187511920929, "reward_std": 0.8212399482727051, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.8671875, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.91796875, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 10.63671875, "epoch": 0.10230466554243957, "kl": 1.4090213179588318, "learning_rate": 9.989887640449438e-07, "loss": 0.016374479979276657, "ratio/all_0": 0.0625, "ratio/all_2": 0.6796875, "reward": 1.874218761920929, "reward_std": 0.8512181341648102, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.8359375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.874218761920929, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 10.546875, "epoch": 0.10342889263631254, "kl": 1.3873103857040405, "learning_rate": 9.989775280898877e-07, "loss": 0.007781813386827707, "ratio/all_0": 0.0390625, "ratio/all_2": 0.671875, "reward": 1.881250023841858, "reward_std": 0.8377176225185394, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.8203125, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.8671875, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.875, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8812499940395355, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 10.610156536102295, "epoch": 0.1045531197301855, "kl": 1.4300039410591125, "learning_rate": 9.989662921348313e-07, "loss": 0.012583991512656212, "ratio/all_0": 0.046875, "ratio/all_2": 0.6328125, "reward": 1.875, "reward_std": 0.8191207945346832, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.8359375, "rewards/avg_2": 1.8515625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.8203125, "rewards/avg_6": 1.8515625, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8750000298023224, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 10.410156726837158, "epoch": 0.10567734682405847, "kl": 1.4425511360168457, "learning_rate": 9.989550561797752e-07, "loss": 0.030106104910373688, "ratio/all_0": 0.046875, "ratio/all_2": 0.703125, "reward": 1.9054688215255737, "reward_std": 0.8589926362037659, "rewards/avg_0": 1.8515625, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.8671875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.905468761920929, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 10.420312881469727, "epoch": 0.10680157391793142, "kl": 1.45794278383255, "learning_rate": 9.98943820224719e-07, "loss": 0.027616795152425766, "ratio/all_0": 0.0234375, "ratio/all_2": 0.71875, "reward": 1.9187500476837158, "reward_std": 0.8559161722660065, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.875, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.918749988079071, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 10.521874904632568, "epoch": 0.10792580101180438, "kl": 1.465618371963501, "learning_rate": 9.989325842696628e-07, "loss": 0.02687779814004898, "ratio/all_0": 0.03125, "ratio/all_2": 0.703125, "reward": 1.9140625596046448, "reward_std": 0.8488925099372864, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.9375, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.8671875, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9140625, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 10.46484375, "epoch": 0.10905002810567735, "kl": 1.4335193037986755, "learning_rate": 9.989213483146067e-07, "loss": 0.02943534404039383, "ratio/all_0": 0.03125, "ratio/all_2": 0.6953125, "reward": 1.9195312857627869, "reward_std": 0.8390406370162964, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.875, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9195312559604645, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 10.596875190734863, "epoch": 0.1101742551995503, "kl": 1.3374882340431213, "learning_rate": 9.989101123595504e-07, "loss": 0.0237848162651062, "ratio/all_0": 0.0078125, "ratio/all_2": 0.75, "reward": 1.9343750476837158, "reward_std": 0.8592123687267303, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.9453125, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.9296875, "rewards/avg_6": 1.9453125, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9343750178813934, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 10.535937786102295, "epoch": 0.11129848229342328, "kl": 1.3550831079483032, "learning_rate": 9.988988764044943e-07, "loss": 0.024313587695360184, "ratio/all_0": 0.03125, "ratio/all_2": 0.7109375, "reward": 1.916406273841858, "reward_std": 0.848257303237915, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.8671875, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.9296875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9164062738418579, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 10.461719036102295, "epoch": 0.11242270938729623, "kl": 1.4057585000991821, "learning_rate": 9.988876404494382e-07, "loss": 0.007561494130641222, "ratio/all_0": 0.0546875, "ratio/all_2": 0.65625, "reward": 1.8687500357627869, "reward_std": 0.8348900377750397, "rewards/avg_0": 1.875, "rewards/avg_1": 1.828125, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.8359375, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.8515625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8687500059604645, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 10.51171875, "epoch": 0.1135469364811692, "kl": 1.315906286239624, "learning_rate": 9.988764044943818e-07, "loss": 0.013327661901712418, "ratio/all_0": 0.0078125, "ratio/all_2": 0.7421875, "reward": 1.920312523841858, "reward_std": 0.8560033440589905, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9203124940395355, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 10.456250190734863, "epoch": 0.11467116357504216, "kl": 1.409066081047058, "learning_rate": 9.988651685393257e-07, "loss": 0.014456999488174915, "ratio/all_0": 0.046875, "ratio/all_2": 0.6640625, "reward": 1.882031261920929, "reward_std": 0.835316926240921, "rewards/avg_0": 1.875, "rewards/avg_1": 1.8359375, "rewards/avg_2": 1.8515625, "rewards/avg_3": 1.8515625, "rewards/avg_4": 1.8671875, "rewards/avg_5": 1.8515625, "rewards/avg_6": 1.828125, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.8828125, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 10.49765682220459, "epoch": 0.11579539066891512, "kl": 1.4170143604278564, "learning_rate": 9.988539325842696e-07, "loss": 0.016626989468932152, "ratio/all_0": 0.0390625, "ratio/all_2": 0.65625, "reward": 1.889843761920929, "reward_std": 0.829914778470993, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.8359375, "rewards/avg_2": 1.8671875, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.8671875, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.8671875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.889843761920929, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 10.4921875, "epoch": 0.11691961776278809, "kl": 1.4739782810211182, "learning_rate": 9.988426966292133e-07, "loss": 0.018051698803901672, "ratio/all_0": 0.046875, "ratio/all_2": 0.6484375, "reward": 1.8843750357627869, "reward_std": 0.8223588466644287, "rewards/avg_0": 1.8515625, "rewards/avg_1": 1.8671875, "rewards/avg_2": 1.8671875, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.8515625, "rewards/avg_5": 1.8515625, "rewards/avg_6": 1.8515625, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8843750059604645, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 10.5546875, "epoch": 0.11804384485666104, "kl": 1.4696778655052185, "learning_rate": 9.988314606741572e-07, "loss": 0.02195235900580883, "ratio/all_0": 0.0078125, "ratio/all_2": 0.703125, "reward": 1.920312523841858, "reward_std": 0.8361712098121643, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.9375, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.875, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9203125238418579, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 10.5859375, "epoch": 0.11916807195053401, "kl": 1.4029964208602905, "learning_rate": 9.98820224719101e-07, "loss": 0.018382327631115913, "ratio/all_0": 0.015625, "ratio/all_2": 0.6953125, "reward": 1.916406273841858, "reward_std": 0.8307805061340332, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.9296875, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9164062738418579, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 10.575000286102295, "epoch": 0.12029229904440697, "kl": 1.4130637645721436, "learning_rate": 9.988089887640448e-07, "loss": 0.020427603274583817, "ratio/all_0": 0.0, "ratio/all_2": 0.7578125, "reward": 1.932812511920929, "reward_std": 0.861215353012085, "rewards/avg_0": 1.9296875, "rewards/avg_1": 1.9375, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.9453125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.932812511920929, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 10.534375190734863, "epoch": 0.12141652613827993, "kl": 1.3674690127372742, "learning_rate": 9.987977528089886e-07, "loss": 0.011559784412384033, "ratio/all_0": 0.046875, "ratio/all_2": 0.71875, "reward": 1.8882812857627869, "reward_std": 0.862981528043747, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.875, "rewards/avg_2": 1.8515625, "rewards/avg_3": 1.8515625, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.8515625, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8882812559604645, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 10.5390625, "epoch": 0.1225407532321529, "kl": 1.3945258855819702, "learning_rate": 9.987865168539325e-07, "loss": 0.03130093589425087, "ratio/all_0": 0.03125, "ratio/all_2": 0.7578125, "reward": 1.927343726158142, "reward_std": 0.8774406015872955, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.9296875, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9273437857627869, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 10.492969036102295, "epoch": 0.12366498032602585, "kl": 1.3998687267303467, "learning_rate": 9.987752808988762e-07, "loss": 0.021291855722665787, "ratio/all_0": 0.03125, "ratio/all_2": 0.6640625, "reward": 1.90625, "reward_std": 0.8211067318916321, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9062500298023224, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 10.427343845367432, "epoch": 0.12478920741989882, "kl": 1.3501304984092712, "learning_rate": 9.9876404494382e-07, "loss": 0.011759497225284576, "ratio/all_0": 0.015625, "ratio/all_2": 0.734375, "reward": 1.9078125357627869, "reward_std": 0.8611572980880737, "rewards/avg_0": 1.8671875, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.875, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.8671875, "rewards/avg_7": 1.8671875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9078125059604645, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 10.4375, "epoch": 0.1259134345137718, "kl": 1.4030646681785583, "learning_rate": 9.98752808988764e-07, "loss": 0.029689906165003777, "ratio/all_0": 0.0390625, "ratio/all_2": 0.734375, "reward": 1.916406273841858, "reward_std": 0.8714778125286102, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.9140625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.917187511920929, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 10.514062881469727, "epoch": 0.12703766160764474, "kl": 1.4460313320159912, "learning_rate": 9.987415730337079e-07, "loss": 0.012388680130243301, "ratio/all_0": 0.0078125, "ratio/all_2": 0.65625, "reward": 1.9078125357627869, "reward_std": 0.8054125308990479, "rewards/avg_0": 1.8671875, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.8515625, "rewards/avg_6": 1.875, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9078125059604645, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 10.461719036102295, "epoch": 0.1281618887015177, "kl": 1.487072765827179, "learning_rate": 9.987303370786516e-07, "loss": 0.017670128494501114, "ratio/all_0": 0.03125, "ratio/all_2": 0.6328125, "reward": 1.892968773841858, "reward_std": 0.8094190359115601, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.8359375, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.8515625, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8929687738418579, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 10.473437309265137, "epoch": 0.12928611579539068, "kl": 1.5192728638648987, "learning_rate": 9.987191011235955e-07, "loss": 0.02703724056482315, "ratio/all_0": 0.0078125, "ratio/all_2": 0.75, "reward": 1.9304687976837158, "reward_std": 0.8568115830421448, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.9375, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.9296875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9304687678813934, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 10.421875476837158, "epoch": 0.13041034288926362, "kl": 1.5342833995819092, "learning_rate": 9.987078651685393e-07, "loss": 0.03299251198768616, "ratio/all_0": 0.0234375, "ratio/all_2": 0.71875, "reward": 1.9234375357627869, "reward_std": 0.8558132350444794, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9234375059604645, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 10.435156345367432, "epoch": 0.1315345699831366, "kl": 1.4555895328521729, "learning_rate": 9.98696629213483e-07, "loss": 0.027663029730319977, "ratio/all_0": 0.015625, "ratio/all_2": 0.75, "reward": 1.9265625476837158, "reward_std": 0.8661331236362457, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.926562488079071, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 10.500781059265137, "epoch": 0.13265879707700956, "kl": 1.3940675258636475, "learning_rate": 9.98685393258427e-07, "loss": 0.03329860046505928, "ratio/all_0": 0.03125, "ratio/all_2": 0.8046875, "reward": 1.9312500357627869, "reward_std": 0.9077420830726624, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.9296875, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9312500059604645, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 10.550000190734863, "epoch": 0.13378302417088253, "kl": 1.4418742656707764, "learning_rate": 9.986741573033708e-07, "loss": 0.011464491486549377, "ratio/all_0": 0.0234375, "ratio/all_2": 0.703125, "reward": 1.8992187976837158, "reward_std": 0.842133492231369, "rewards/avg_0": 1.8671875, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.8515625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.8515625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.899218738079071, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 10.529687881469727, "epoch": 0.13490725126475547, "kl": 1.3481199145317078, "learning_rate": 9.986629213483145e-07, "loss": 0.02446187473833561, "ratio/all_0": 0.03125, "ratio/all_2": 0.7734375, "reward": 1.9187500476837158, "reward_std": 0.8890580832958221, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9187500178813934, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 10.51171875, "epoch": 0.13603147835862844, "kl": 1.3256433010101318, "learning_rate": 9.986516853932584e-07, "loss": 0.03311225771903992, "ratio/all_0": 0.015625, "ratio/all_2": 0.7109375, "reward": 1.942968726158142, "reward_std": 0.8395049273967743, "rewards/avg_0": 1.9296875, "rewards/avg_1": 1.9375, "rewards/avg_2": 1.953125, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.953125, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9437500238418579, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 10.56796932220459, "epoch": 0.13715570545250141, "kl": 1.4150030016899109, "learning_rate": 9.986404494382023e-07, "loss": 0.019649198278784752, "ratio/all_0": 0.046875, "ratio/all_2": 0.71875, "reward": 1.893750011920929, "reward_std": 0.8653823733329773, "rewards/avg_0": 1.875, "rewards/avg_1": 1.875, "rewards/avg_2": 1.8671875, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.8671875, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.8515625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.893750011920929, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 10.499218940734863, "epoch": 0.13827993254637436, "kl": 1.4379026293754578, "learning_rate": 9.98629213483146e-07, "loss": 0.024776604026556015, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7265625, "reward": 1.91796875, "reward_std": 0.855737566947937, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9187500178813934, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 10.416406631469727, "epoch": 0.13940415964024733, "kl": 1.5306707620620728, "learning_rate": 9.986179775280898e-07, "loss": 0.04271028935909271, "ratio/all_0": 0.0234375, "ratio/all_2": 0.84375, "reward": 1.944531261920929, "reward_std": 0.9272224605083466, "rewards/avg_0": 1.953125, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.9609375, "rewards/avg_4": 1.9296875, "rewards/avg_5": 1.9296875, "rewards/avg_6": 1.9609375, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.944531261920929, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 10.608593940734863, "epoch": 0.1405283867341203, "kl": 1.485119104385376, "learning_rate": 9.986067415730337e-07, "loss": 0.015734048560261726, "ratio/all_0": 0.046875, "ratio/all_2": 0.703125, "reward": 1.8828125, "reward_std": 0.8565918803215027, "rewards/avg_0": 1.8359375, "rewards/avg_1": 1.8203125, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.8671875, "rewards/avg_5": 1.8359375, "rewards/avg_6": 1.8671875, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8828125298023224, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 10.492969036102295, "epoch": 0.14165261382799327, "kl": 1.4620481133460999, "learning_rate": 9.985955056179774e-07, "loss": 0.03640381991863251, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7890625, "reward": 1.9382812976837158, "reward_std": 0.8856255412101746, "rewards/avg_0": 1.9296875, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.9453125, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.9375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9382812678813934, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 10.580468654632568, "epoch": 0.1427768409218662, "kl": 1.4295486211776733, "learning_rate": 9.985842696629213e-07, "loss": 0.02829963155090809, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7421875, "reward": 1.925000011920929, "reward_std": 0.8635706603527069, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.9296875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.925000011920929, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 10.490624904632568, "epoch": 0.14390106801573918, "kl": 1.3682859539985657, "learning_rate": 9.985730337078652e-07, "loss": 0.026628486812114716, "ratio/all_0": 0.015625, "ratio/all_2": 0.71875, "reward": 1.932031273841858, "reward_std": 0.8417931199073792, "rewards/avg_0": 1.8671875, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.9296875, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.953125, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.9296875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9320312440395355, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 10.538281440734863, "epoch": 0.14502529510961215, "kl": 1.341777503490448, "learning_rate": 9.985617977528089e-07, "loss": 0.02736259251832962, "ratio/all_0": 0.0078125, "ratio/all_2": 0.7421875, "reward": 1.94140625, "reward_std": 0.8543242514133453, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.9296875, "rewards/avg_2": 1.9453125, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.9296875, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.9453125, "rewards/avg_7": 1.9375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9414062798023224, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 10.537499904632568, "epoch": 0.1461495222034851, "kl": 1.3316110372543335, "learning_rate": 9.985505617977528e-07, "loss": 0.02057470940053463, "ratio/all_0": 0.0078125, "ratio/all_2": 0.75, "reward": 1.93359375, "reward_std": 0.8592420816421509, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.9296875, "rewards/avg_3": 1.9296875, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.9609375, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9335937798023224, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 10.550000190734863, "epoch": 0.14727374929735806, "kl": 1.3152986764907837, "learning_rate": 9.985393258426966e-07, "loss": 0.019020576030015945, "ratio/all_0": 0.0078125, "ratio/all_2": 0.71875, "reward": 1.928906261920929, "reward_std": 0.8400276005268097, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.9453125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.928906261920929, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 10.494531154632568, "epoch": 0.14839797639123103, "kl": 1.3366295099258423, "learning_rate": 9.985280898876403e-07, "loss": 0.027881566435098648, "ratio/all_0": 0.046875, "ratio/all_2": 0.7265625, "reward": 1.9117187857627869, "reward_std": 0.8693137466907501, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.8671875, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9117187559604645, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 10.592187881469727, "epoch": 0.14952220348510398, "kl": 1.3480384349822998, "learning_rate": 9.985168539325842e-07, "loss": 0.013358078896999359, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7109375, "reward": 1.903124988079071, "reward_std": 0.8528032302856445, "rewards/avg_0": 1.875, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9039062857627869, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 10.603125095367432, "epoch": 0.15064643057897695, "kl": 1.384570598602295, "learning_rate": 9.985056179775281e-07, "loss": 0.016048742458224297, "ratio/all_0": 0.0234375, "ratio/all_2": 0.703125, "reward": 1.909375011920929, "reward_std": 0.8402921855449677, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.8515625, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.909375011920929, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 10.504687786102295, "epoch": 0.15177065767284992, "kl": 1.4957668781280518, "learning_rate": 9.984943820224718e-07, "loss": 0.022608522325754166, "ratio/all_0": 0.0234375, "ratio/all_2": 0.75, "reward": 1.9148437976837158, "reward_std": 0.8707560300827026, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9148437678813934, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 10.506250381469727, "epoch": 0.1528948847667229, "kl": 1.49778151512146, "learning_rate": 9.984831460674157e-07, "loss": 0.025695206597447395, "ratio/all_0": 0.046875, "ratio/all_2": 0.7265625, "reward": 1.9015625715255737, "reward_std": 0.8657637238502502, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.9140625, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.901562511920929, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 10.603906631469727, "epoch": 0.15401911186059583, "kl": 1.4697266817092896, "learning_rate": 9.984719101123596e-07, "loss": 0.03311506658792496, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7109375, "reward": 1.928906261920929, "reward_std": 0.841498851776123, "rewards/avg_0": 1.9375, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.9296875, "rewards/avg_6": 1.8671875, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.928906261920929, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 10.595312595367432, "epoch": 0.1551433389544688, "kl": 1.5000845193862915, "learning_rate": 9.984606741573032e-07, "loss": 0.03179492428898811, "ratio/all_0": 0.0390625, "ratio/all_2": 0.7109375, "reward": 1.910937488079071, "reward_std": 0.8583005368709564, "rewards/avg_0": 1.8671875, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9109375178813934, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 10.467187881469727, "epoch": 0.15626756604834177, "kl": 1.492616891860962, "learning_rate": 9.984494382022471e-07, "loss": 0.0324878990650177, "ratio/all_0": 0.0390625, "ratio/all_2": 0.6796875, "reward": 1.9132813215255737, "reward_std": 0.8348826766014099, "rewards/avg_0": 1.875, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9140625, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 10.553906440734863, "epoch": 0.15739179314221471, "kl": 1.486423134803772, "learning_rate": 9.98438202247191e-07, "loss": 0.02975376322865486, "ratio/all_0": 0.0234375, "ratio/all_2": 0.734375, "reward": 1.924218773841858, "reward_std": 0.8577986359596252, "rewards/avg_0": 1.9296875, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.8828125, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9242187738418579, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 10.591406345367432, "epoch": 0.15851602023608768, "kl": 1.4460753798484802, "learning_rate": 9.984269662921347e-07, "loss": 0.01627442240715027, "ratio/all_0": 0.015625, "ratio/all_2": 0.65625, "reward": 1.91015625, "reward_std": 0.8073700666427612, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9101562798023224, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 10.568749904632568, "epoch": 0.15964024732996066, "kl": 1.4809756875038147, "learning_rate": 9.984157303370786e-07, "loss": 0.0247793085873127, "ratio/all_0": 0.015625, "ratio/all_2": 0.7265625, "reward": 1.923437476158142, "reward_std": 0.8487512767314911, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.9140625, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9242187738418579, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 10.45703125, "epoch": 0.16076447442383363, "kl": 1.5039972066879272, "learning_rate": 9.984044943820225e-07, "loss": 0.03990985080599785, "ratio/all_0": 0.046875, "ratio/all_2": 0.71875, "reward": 1.920312523841858, "reward_std": 0.862554669380188, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.9375, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9203124940395355, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 10.50546932220459, "epoch": 0.16188870151770657, "kl": 1.4829599261283875, "learning_rate": 9.983932584269662e-07, "loss": 0.023315520957112312, "ratio/all_0": 0.0390625, "ratio/all_2": 0.6875, "reward": 1.8984375596046448, "reward_std": 0.8472589552402496, "rewards/avg_0": 1.8671875, "rewards/avg_1": 1.8515625, "rewards/avg_2": 1.8671875, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8984375, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 10.517969131469727, "epoch": 0.16301292861157954, "kl": 1.439881980419159, "learning_rate": 9.9838202247191e-07, "loss": 0.0387931689620018, "ratio/all_0": 0.0078125, "ratio/all_2": 0.796875, "reward": 1.9531250596046448, "reward_std": 0.8867897987365723, "rewards/avg_0": 1.9453125, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.953125, "rewards/avg_3": 1.953125, "rewards/avg_4": 1.9453125, "rewards/avg_5": 1.9453125, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.9453125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.953125, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 10.51171875, "epoch": 0.1641371557054525, "kl": 1.4910458326339722, "learning_rate": 9.98370786516854e-07, "loss": 0.034083664417266846, "ratio/all_0": 0.0546875, "ratio/all_2": 0.734375, "reward": 1.907812476158142, "reward_std": 0.8765862584114075, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.8671875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9078125357627869, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 10.510156154632568, "epoch": 0.16526138279932545, "kl": 1.507016360759735, "learning_rate": 9.983595505617976e-07, "loss": 0.02047543413937092, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7109375, "reward": 1.9078125357627869, "reward_std": 0.8427807092666626, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.9140625, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9078125059604645, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 10.460156440734863, "epoch": 0.16638560989319842, "kl": 1.5077899098396301, "learning_rate": 9.983483146067415e-07, "loss": 0.012134671211242676, "ratio/all_0": 0.046875, "ratio/all_2": 0.6328125, "reward": 1.8726562857627869, "reward_std": 0.8212587535381317, "rewards/avg_0": 1.8671875, "rewards/avg_1": 1.8203125, "rewards/avg_2": 1.8359375, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.8125, "rewards/avg_5": 1.8671875, "rewards/avg_6": 1.8203125, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8726562559604645, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 10.539843559265137, "epoch": 0.1675098369870714, "kl": 1.411302089691162, "learning_rate": 9.983370786516854e-07, "loss": 0.02341485023498535, "ratio/all_0": 0.0078125, "ratio/all_2": 0.765625, "reward": 1.9312500357627869, "reward_std": 0.8712870478630066, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9312500059604645, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 10.496875286102295, "epoch": 0.16863406408094436, "kl": 1.4035818576812744, "learning_rate": 9.98325842696629e-07, "loss": 0.01982831582427025, "ratio/all_0": 0.0078125, "ratio/all_2": 0.7109375, "reward": 1.925000011920929, "reward_std": 0.8388589024543762, "rewards/avg_0": 1.9296875, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.92578125, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 10.400000095367432, "epoch": 0.1697582911748173, "kl": 1.44032484292984, "learning_rate": 9.98314606741573e-07, "loss": 0.023583440110087395, "ratio/all_0": 0.046875, "ratio/all_2": 0.7421875, "reward": 1.90234375, "reward_std": 0.8744505941867828, "rewards/avg_0": 1.875, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.8671875, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.8359375, "rewards/avg_7": 1.8671875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9023437798023224, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 10.638281345367432, "epoch": 0.17088251826869028, "kl": 1.4179207682609558, "learning_rate": 9.983033707865169e-07, "loss": 0.028874140232801437, "ratio/all_0": 0.0234375, "ratio/all_2": 0.734375, "reward": 1.926562488079071, "reward_std": 0.8614144921302795, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.9140625, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9273437857627869, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 10.497656345367432, "epoch": 0.17200674536256325, "kl": 1.4119802713394165, "learning_rate": 9.982921348314606e-07, "loss": 0.02332199364900589, "ratio/all_0": 0.0078125, "ratio/all_2": 0.71875, "reward": 1.9234375357627869, "reward_std": 0.8475228846073151, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.9296875, "rewards/avg_3": 1.9453125, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.8671875, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9242187440395355, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 10.483593940734863, "epoch": 0.1731309724564362, "kl": 1.447148084640503, "learning_rate": 9.982808988764044e-07, "loss": 0.030631529167294502, "ratio/all_0": 0.015625, "ratio/all_2": 0.734375, "reward": 1.9351562857627869, "reward_std": 0.8493313789367676, "rewards/avg_0": 1.9375, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.9375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9351562559604645, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 10.414844036102295, "epoch": 0.17425519955030916, "kl": 1.4554988145828247, "learning_rate": 9.982696629213483e-07, "loss": 0.023993976414203644, "ratio/all_0": 0.0234375, "ratio/all_2": 0.71875, "reward": 1.9156250357627869, "reward_std": 0.852101594209671, "rewards/avg_0": 1.8671875, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.9296875, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.875, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9156250059604645, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 10.483593940734863, "epoch": 0.17537942664418213, "kl": 1.433959722518921, "learning_rate": 9.982584269662922e-07, "loss": 0.026601165533065796, "ratio/all_0": 0.0234375, "ratio/all_2": 0.6953125, "reward": 1.921093761920929, "reward_std": 0.833401083946228, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.921093761920929, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 10.534375190734863, "epoch": 0.1765036537380551, "kl": 1.4553956389427185, "learning_rate": 9.98247191011236e-07, "loss": 0.03449885919690132, "ratio/all_0": 0.0234375, "ratio/all_2": 0.78125, "reward": 1.935937523841858, "reward_std": 0.8834026753902435, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.9296875, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.9296875, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.9375, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9359374940395355, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 10.535937786102295, "epoch": 0.17762788083192804, "kl": 1.4700213074684143, "learning_rate": 9.982359550561798e-07, "loss": 0.015104904770851135, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7265625, "reward": 1.9039062857627869, "reward_std": 0.8533373475074768, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.8515625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9039062559604645, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 10.586719036102295, "epoch": 0.178752107925801, "kl": 1.4438948035240173, "learning_rate": 9.982247191011237e-07, "loss": 0.02373446524143219, "ratio/all_0": 0.0390625, "ratio/all_2": 0.7265625, "reward": 1.9078125357627869, "reward_std": 0.8605807721614838, "rewards/avg_0": 1.875, "rewards/avg_1": 1.875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.8671875, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9078125059604645, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 10.5390625, "epoch": 0.17987633501967398, "kl": 1.459674894809723, "learning_rate": 9.982134831460674e-07, "loss": 0.025322623550891876, "ratio/all_0": 0.015625, "ratio/all_2": 0.7421875, "reward": 1.9218750596046448, "reward_std": 0.863748699426651, "rewards/avg_0": 1.8671875, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.9296875, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.921875, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 10.46640682220459, "epoch": 0.18100056211354693, "kl": 1.4958550333976746, "learning_rate": 9.982022471910113e-07, "loss": 0.015952421352267265, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7109375, "reward": 1.900781273841858, "reward_std": 0.8476106822490692, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.8515625, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.8515625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9007812738418579, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 10.486719131469727, "epoch": 0.1821247892074199, "kl": 1.4854081273078918, "learning_rate": 9.981910112359551e-07, "loss": 0.008778873831033707, "ratio/all_0": 0.0234375, "ratio/all_2": 0.609375, "reward": 1.8882812857627869, "reward_std": 0.7877383232116699, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.8515625, "rewards/avg_2": 1.8515625, "rewards/avg_3": 1.8203125, "rewards/avg_4": 1.8359375, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.8515625, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.8890624940395355, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 10.478906631469727, "epoch": 0.18324901630129287, "kl": 1.5270226001739502, "learning_rate": 9.981797752808988e-07, "loss": 0.025792792439460754, "ratio/all_0": 0.03125, "ratio/all_2": 0.75, "reward": 1.9078125357627869, "reward_std": 0.8755120933055878, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.8828125, "rewards/avg_3": 1.875, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9078125059604645, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 10.463281154632568, "epoch": 0.18437324339516584, "kl": 1.5529326796531677, "learning_rate": 9.981685393258427e-07, "loss": 0.030110936611890793, "ratio/all_0": 0.0078125, "ratio/all_2": 0.7734375, "reward": 1.9296875, "reward_std": 0.8803807497024536, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.875, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.9296875, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.9296875, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9304687678813934, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 10.582031726837158, "epoch": 0.18549747048903878, "kl": 1.6017058491706848, "learning_rate": 9.981573033707866e-07, "loss": 0.018820036202669144, "ratio/all_0": 0.0625, "ratio/all_2": 0.6953125, "reward": 1.8632813096046448, "reward_std": 0.8700221478939056, "rewards/avg_0": 1.8046875, "rewards/avg_1": 1.8359375, "rewards/avg_2": 1.8671875, "rewards/avg_3": 1.8203125, "rewards/avg_4": 1.8046875, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.86328125, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 10.49609375, "epoch": 0.18662169758291175, "kl": 1.5901198387145996, "learning_rate": 9.981460674157303e-07, "loss": 0.01749960146844387, "ratio/all_0": 0.015625, "ratio/all_2": 0.6796875, "reward": 1.9039062857627869, "reward_std": 0.8250192999839783, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.875, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.9140625, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9046874940395355, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 10.495312690734863, "epoch": 0.18774592467678472, "kl": 1.6122857332229614, "learning_rate": 9.981348314606742e-07, "loss": 0.024158619344234467, "ratio/all_0": 0.03125, "ratio/all_2": 0.6953125, "reward": 1.899999976158142, "reward_std": 0.8434443175792694, "rewards/avg_0": 1.8515625, "rewards/avg_1": 1.875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9000000357627869, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 10.47265625, "epoch": 0.18887015177065766, "kl": 1.7152747511863708, "learning_rate": 9.98123595505618e-07, "loss": 0.048164140433073044, "ratio/all_0": 0.0234375, "ratio/all_2": 0.765625, "reward": 1.938281238079071, "reward_std": 0.8790540397167206, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.9453125, "rewards/avg_2": 1.9453125, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9390625059604645, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 10.390625, "epoch": 0.18999437886453063, "kl": 1.6773305535316467, "learning_rate": 9.981123595505617e-07, "loss": 0.010047555901110172, "ratio/all_0": 0.015625, "ratio/all_2": 0.609375, "reward": 1.877343773841858, "reward_std": 0.788997232913971, "rewards/avg_0": 1.8671875, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.8359375, "rewards/avg_3": 1.78125, "rewards/avg_4": 1.8515625, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.8515625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8773437738418579, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 10.450781345367432, "epoch": 0.1911186059584036, "kl": 1.6623247861862183, "learning_rate": 9.981011235955056e-07, "loss": 0.03203898295760155, "ratio/all_0": 0.015625, "ratio/all_2": 0.7421875, "reward": 1.9226562976837158, "reward_std": 0.8614811599254608, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.9296875, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.9296875, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9226562678813934, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 10.44921875, "epoch": 0.19224283305227655, "kl": 1.6179015636444092, "learning_rate": 9.980898876404495e-07, "loss": 0.036546383053064346, "ratio/all_0": 0.015625, "ratio/all_2": 0.7421875, "reward": 1.9312500357627869, "reward_std": 0.8603616058826447, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.9296875, "rewards/avg_4": 1.9296875, "rewards/avg_5": 1.9140625, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9320312738418579, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 10.635937690734863, "epoch": 0.19336706014614952, "kl": 1.5302655696868896, "learning_rate": 9.980786516853932e-07, "loss": 0.03663820028305054, "ratio/all_0": 0.0078125, "ratio/all_2": 0.7421875, "reward": 1.9429687857627869, "reward_std": 0.8540288507938385, "rewards/avg_0": 1.9296875, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.9453125, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.9140625, "rewards/avg_6": 1.9609375, "rewards/avg_7": 1.9453125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9429687559604645, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 10.55078125, "epoch": 0.1944912872400225, "kl": 1.6461214423179626, "learning_rate": 9.98067415730337e-07, "loss": 0.029548870399594307, "ratio/all_0": 0.015625, "ratio/all_2": 0.703125, "reward": 1.9140625596046448, "reward_std": 0.8425402939319611, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.8671875, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9148437678813934, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 10.571094036102295, "epoch": 0.19561551433389546, "kl": 1.5366535782814026, "learning_rate": 9.98056179775281e-07, "loss": 0.030620839446783066, "ratio/all_0": 0.0234375, "ratio/all_2": 0.71875, "reward": 1.918749988079071, "reward_std": 0.8569993078708649, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9195312857627869, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 10.46875, "epoch": 0.1967397414277684, "kl": 1.5700982213020325, "learning_rate": 9.980449438202247e-07, "loss": 0.02145414985716343, "ratio/all_0": 0.0390625, "ratio/all_2": 0.7109375, "reward": 1.892968773841858, "reward_std": 0.8565924763679504, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.8515625, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.8671875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8929687440395355, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 10.450000286102295, "epoch": 0.19786396852164137, "kl": 1.6610383987426758, "learning_rate": 9.980337078651686e-07, "loss": 0.028878550976514816, "ratio/all_0": 0.0390625, "ratio/all_2": 0.6796875, "reward": 1.8984375596046448, "reward_std": 0.8399778306484222, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.8671875, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.875, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.8992187678813934, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 10.414844036102295, "epoch": 0.19898819561551434, "kl": 1.5835221409797668, "learning_rate": 9.980224719101124e-07, "loss": 0.03403393179178238, "ratio/all_0": 0.015625, "ratio/all_2": 0.7421875, "reward": 1.9312500357627869, "reward_std": 0.8549413084983826, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.9375, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.9296875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9320312738418579, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 10.460156440734863, "epoch": 0.20011242270938728, "kl": 1.5369529128074646, "learning_rate": 9.980112359550561e-07, "loss": 0.03677598387002945, "ratio/all_0": 0.0390625, "ratio/all_2": 0.75, "reward": 1.918749988079071, "reward_std": 0.8851003050804138, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984374940395355, "rewards/point_reward": 0.9203125238418579, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 10.396875381469727, "epoch": 0.20123664980326025, "kl": 1.5669890642166138, "learning_rate": 9.98e-07, "loss": 0.022701524198055267, "ratio/all_0": 0.046875, "ratio/all_2": 0.6640625, "reward": 1.885937511920929, "reward_std": 0.8360760807991028, "rewards/avg_0": 1.8515625, "rewards/avg_1": 1.8515625, "rewards/avg_2": 1.8125, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.875, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.875, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.88671875, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 10.587500095367432, "epoch": 0.20236087689713322, "kl": 1.5295946598052979, "learning_rate": 9.97988764044944e-07, "loss": 0.029509663581848145, "ratio/all_0": 0.03125, "ratio/all_2": 0.7265625, "reward": 1.913281261920929, "reward_std": 0.8641042709350586, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9140625, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 10.464844226837158, "epoch": 0.2034851039910062, "kl": 1.4037991166114807, "learning_rate": 9.979775280898876e-07, "loss": 0.02910599112510681, "ratio/all_0": 0.0, "ratio/all_2": 0.7421875, "reward": 1.9453125, "reward_std": 0.8485521972179413, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.9453125, "rewards/avg_3": 1.9453125, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.9453125, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.9453125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9453125, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 10.485156536102295, "epoch": 0.20460933108487914, "kl": 1.392296850681305, "learning_rate": 9.979662921348315e-07, "loss": 0.03196254372596741, "ratio/all_0": 0.0078125, "ratio/all_2": 0.7109375, "reward": 1.9421875476837158, "reward_std": 0.8366577923297882, "rewards/avg_0": 1.9296875, "rewards/avg_1": 1.9375, "rewards/avg_2": 1.9296875, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.9296875, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9976562559604645, "rewards/point_reward": 0.944531261920929, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 10.508593559265137, "epoch": 0.2057335581787521, "kl": 1.431138515472412, "learning_rate": 9.979550561797754e-07, "loss": 0.020911136642098427, "ratio/all_0": 0.0234375, "ratio/all_2": 0.734375, "reward": 1.9109375476837158, "reward_std": 0.8656867742538452, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.8828125, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.8671875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9117187559604645, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 10.397656440734863, "epoch": 0.20685778527262508, "kl": 1.4381036162376404, "learning_rate": 9.97943820224719e-07, "loss": 0.026089351624250412, "ratio/all_0": 0.0234375, "ratio/all_2": 0.6875, "reward": 1.9179688096046448, "reward_std": 0.8316961228847504, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.8671875, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.918749988079071, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 10.519531726837158, "epoch": 0.20798201236649802, "kl": 1.5049086213111877, "learning_rate": 9.97932584269663e-07, "loss": 0.020233262330293655, "ratio/all_0": 0.0078125, "ratio/all_2": 0.6796875, "reward": 1.9179688096046448, "reward_std": 0.8203654885292053, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.8671875, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.875, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9187500178813934, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 10.432812690734863, "epoch": 0.209106239460371, "kl": 1.4891046285629272, "learning_rate": 9.979213483146068e-07, "loss": 0.015585238113999367, "ratio/all_0": 0.015625, "ratio/all_2": 0.7578125, "reward": 1.9117187857627869, "reward_std": 0.8686578571796417, "rewards/avg_0": 1.875, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.8671875, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9124999940395355, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 10.551562786102295, "epoch": 0.21023046655424396, "kl": 1.4672995805740356, "learning_rate": 9.979101123595505e-07, "loss": 0.02163076400756836, "ratio/all_0": 0.0390625, "ratio/all_2": 0.6640625, "reward": 1.8953125476837158, "reward_std": 0.8324027061462402, "rewards/avg_0": 1.8515625, "rewards/avg_1": 1.8671875, "rewards/avg_2": 1.8828125, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.8671875, "rewards/avg_7": 1.8671875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.895312488079071, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 10.550000190734863, "epoch": 0.21135469364811693, "kl": 1.4560156464576721, "learning_rate": 9.978988764044944e-07, "loss": 0.027444355189800262, "ratio/all_0": 0.015625, "ratio/all_2": 0.71875, "reward": 1.928125023841858, "reward_std": 0.8456373810768127, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.9453125, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9281249940395355, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 10.555469036102295, "epoch": 0.21247892074198987, "kl": 1.3982338905334473, "learning_rate": 9.978876404494383e-07, "loss": 0.03762711584568024, "ratio/all_0": 0.0390625, "ratio/all_2": 0.7578125, "reward": 1.930468738079071, "reward_std": 0.881901741027832, "rewards/avg_0": 1.9375, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.9375, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9304687678813934, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 10.504687786102295, "epoch": 0.21360314783586284, "kl": 1.4209675788879395, "learning_rate": 9.97876404494382e-07, "loss": 0.019087618216872215, "ratio/all_0": 0.03125, "ratio/all_2": 0.7109375, "reward": 1.90234375, "reward_std": 0.8551211357116699, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.875, "rewards/avg_5": 1.8515625, "rewards/avg_6": 1.875, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.90234375, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 10.585156440734863, "epoch": 0.21472737492973581, "kl": 1.372999668121338, "learning_rate": 9.978651685393259e-07, "loss": 0.027694687247276306, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7578125, "reward": 1.927343726158142, "reward_std": 0.876830667257309, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.9140625, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9281250238418579, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 10.588281154632568, "epoch": 0.21585160202360876, "kl": 1.462668538093567, "learning_rate": 9.978539325842695e-07, "loss": 0.02637871727347374, "ratio/all_0": 0.015625, "ratio/all_2": 0.75, "reward": 1.9296875, "reward_std": 0.8622894585132599, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9296875298023224, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 10.403125286102295, "epoch": 0.21697582911748173, "kl": 1.453848421573639, "learning_rate": 9.978426966292134e-07, "loss": 0.04470495507121086, "ratio/all_0": 0.015625, "ratio/all_2": 0.8359375, "reward": 1.9585937857627869, "reward_std": 0.9128505885601044, "rewards/avg_0": 1.9296875, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.9609375, "rewards/avg_3": 1.9609375, "rewards/avg_4": 1.9609375, "rewards/avg_5": 1.9609375, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.953125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9585937559604645, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 10.55859375, "epoch": 0.2181000562113547, "kl": 1.548885464668274, "learning_rate": 9.978314606741573e-07, "loss": 0.03591137379407883, "ratio/all_0": 0.046875, "ratio/all_2": 0.734375, "reward": 1.9117187857627869, "reward_std": 0.8763015270233154, "rewards/avg_0": 1.875, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.8671875, "rewards/avg_7": 1.8671875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9124999940395355, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 10.505468845367432, "epoch": 0.21922428330522767, "kl": 1.6363706588745117, "learning_rate": 9.97820224719101e-07, "loss": 0.03188881278038025, "ratio/all_0": 0.03125, "ratio/all_2": 0.734375, "reward": 1.9171875715255737, "reward_std": 0.8619976937770844, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984374940395355, "rewards/point_reward": 0.9187500178813934, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 10.568749904632568, "epoch": 0.2203485103991006, "kl": 1.5860892534255981, "learning_rate": 9.978089887640449e-07, "loss": 0.028511447831988335, "ratio/all_0": 0.03125, "ratio/all_2": 0.6796875, "reward": 1.9078125357627869, "reward_std": 0.8347568511962891, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.8828125, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.8671875, "rewards/avg_5": 1.8671875, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9078125059604645, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 10.498437881469727, "epoch": 0.22147273749297358, "kl": 1.5308297872543335, "learning_rate": 9.977977528089888e-07, "loss": 0.026905380189418793, "ratio/all_0": 0.0390625, "ratio/all_2": 0.71875, "reward": 1.907031238079071, "reward_std": 0.8604941964149475, "rewards/avg_0": 1.875, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.8671875, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9070312678813934, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 10.496875286102295, "epoch": 0.22259696458684655, "kl": 1.5878414511680603, "learning_rate": 9.977865168539325e-07, "loss": 0.03801161050796509, "ratio/all_0": 0.03125, "ratio/all_2": 0.7578125, "reward": 1.925000011920929, "reward_std": 0.8770124614238739, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.9296875, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.92578125, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 10.521093845367432, "epoch": 0.2237211916807195, "kl": 1.5883166193962097, "learning_rate": 9.977752808988763e-07, "loss": 0.023211535066366196, "ratio/all_0": 0.015625, "ratio/all_2": 0.6953125, "reward": 1.908593773841858, "reward_std": 0.8363709449768066, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.909375011920929, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 10.541406154632568, "epoch": 0.22484541877459246, "kl": 1.5315197706222534, "learning_rate": 9.977640449438202e-07, "loss": 0.023549862205982208, "ratio/all_0": 0.0078125, "ratio/all_2": 0.71875, "reward": 1.920312523841858, "reward_std": 0.8471402525901794, "rewards/avg_0": 1.9375, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.8671875, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984374940395355, "rewards/point_reward": 0.921875, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 10.565625190734863, "epoch": 0.22596964586846544, "kl": 1.5974794626235962, "learning_rate": 9.97752808988764e-07, "loss": 0.024372313171625137, "ratio/all_0": 0.03125, "ratio/all_2": 0.6640625, "reward": 1.9000000357627869, "reward_std": 0.8229183256626129, "rewards/avg_0": 1.8671875, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.8515625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.8515625, "rewards/avg_5": 1.8671875, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.8671875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9000000059604645, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 10.517969131469727, "epoch": 0.2270938729623384, "kl": 1.539478600025177, "learning_rate": 9.977415730337078e-07, "loss": 0.012446017935872078, "ratio/all_0": 0.0078125, "ratio/all_2": 0.6953125, "reward": 1.904687523841858, "reward_std": 0.8295447826385498, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.8515625, "rewards/avg_6": 1.8515625, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9046874940395355, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 10.535156726837158, "epoch": 0.22821810005621135, "kl": 1.6015858054161072, "learning_rate": 9.977303370786517e-07, "loss": 0.03816835209727287, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7578125, "reward": 1.930468738079071, "reward_std": 0.8757118582725525, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.9296875, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.9296875, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9312500059604645, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 10.500000476837158, "epoch": 0.22934232715008432, "kl": 1.6012290716171265, "learning_rate": 9.977191011235954e-07, "loss": 0.035076532512903214, "ratio/all_0": 0.0390625, "ratio/all_2": 0.6953125, "reward": 1.9117187857627869, "reward_std": 0.8442235589027405, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.875, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9117187559604645, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 10.500000476837158, "epoch": 0.2304665542439573, "kl": 1.6269680857658386, "learning_rate": 9.977078651685393e-07, "loss": 0.02707146294414997, "ratio/all_0": 0.0234375, "ratio/all_2": 0.71875, "reward": 1.909375011920929, "reward_std": 0.8505551815032959, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.8828125, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.875, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.909375011920929, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 10.557031631469727, "epoch": 0.23159078133783023, "kl": 1.6032747626304626, "learning_rate": 9.976966292134832e-07, "loss": 0.029255583882331848, "ratio/all_0": 0.015625, "ratio/all_2": 0.6953125, "reward": 1.9195312857627869, "reward_std": 0.8345302641391754, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.875, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9203125238418579, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 10.522656440734863, "epoch": 0.2327150084317032, "kl": 1.5486992001533508, "learning_rate": 9.976853932584268e-07, "loss": 0.014459997415542603, "ratio/all_0": 0.0, "ratio/all_2": 0.65625, "reward": 1.908593773841858, "reward_std": 0.8039405941963196, "rewards/avg_0": 1.8671875, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.8359375, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9085937738418579, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 10.563281536102295, "epoch": 0.23383923552557617, "kl": 1.5740108489990234, "learning_rate": 9.976741573033707e-07, "loss": 0.0257696695625782, "ratio/all_0": 0.015625, "ratio/all_2": 0.7421875, "reward": 1.918749988079071, "reward_std": 0.8626956641674042, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.9296875, "rewards/avg_6": 1.8671875, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9195312559604645, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 10.550000190734863, "epoch": 0.23496346261944911, "kl": 1.5769821405410767, "learning_rate": 9.976629213483146e-07, "loss": 0.03247952461242676, "ratio/all_0": 0.03125, "ratio/all_2": 0.7109375, "reward": 1.913281261920929, "reward_std": 0.8521692454814911, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.9296875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.8671875, "rewards/avg_5": 1.9140625, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.8515625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9140625, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 10.535937786102295, "epoch": 0.23608768971332209, "kl": 1.536140501499176, "learning_rate": 9.976516853932583e-07, "loss": 0.03490378335118294, "ratio/all_0": 0.0234375, "ratio/all_2": 0.6953125, "reward": 1.92578125, "reward_std": 0.8375056982040405, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.9296875, "rewards/avg_7": 1.9375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9976562559604645, "rewards/point_reward": 0.9281249940395355, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 10.572656631469727, "epoch": 0.23721191680719506, "kl": 1.613707423210144, "learning_rate": 9.976404494382022e-07, "loss": 0.0240375567227602, "ratio/all_0": 0.0625, "ratio/all_2": 0.6875, "reward": 1.8757812976837158, "reward_std": 0.8590532541275024, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.8671875, "rewards/avg_3": 1.8515625, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.8515625, "rewards/avg_6": 1.8203125, "rewards/avg_7": 1.8515625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.8765625059604645, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 10.489062786102295, "epoch": 0.23833614390106803, "kl": 1.7040197253227234, "learning_rate": 9.97629213483146e-07, "loss": 0.02793644554913044, "ratio/all_0": 0.046875, "ratio/all_2": 0.6640625, "reward": 1.8882812857627869, "reward_std": 0.8288446962833405, "rewards/avg_0": 1.8515625, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.8671875, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.8359375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.8890625238418579, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 10.516406536102295, "epoch": 0.23946037099494097, "kl": 1.6352243423461914, "learning_rate": 9.976179775280898e-07, "loss": 0.018058663234114647, "ratio/all_0": 0.03125, "ratio/all_2": 0.71875, "reward": 1.890625, "reward_std": 0.8580062985420227, "rewards/avg_0": 1.8515625, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.8671875, "rewards/avg_3": 1.875, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.8515625, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8906250298023224, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 10.466406345367432, "epoch": 0.24058459808881394, "kl": 1.6236803531646729, "learning_rate": 9.976067415730337e-07, "loss": 0.03151131793856621, "ratio/all_0": 0.0, "ratio/all_2": 0.703125, "reward": 1.9312500357627869, "reward_std": 0.832555741071701, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9320312440395355, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 10.541406154632568, "epoch": 0.2417088251826869, "kl": 1.6875005960464478, "learning_rate": 9.975955056179775e-07, "loss": 0.031967081129550934, "ratio/all_0": 0.015625, "ratio/all_2": 0.703125, "reward": 1.919531226158142, "reward_std": 0.8395129144191742, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9195312857627869, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 10.545312881469727, "epoch": 0.24283305227655985, "kl": 1.674071729183197, "learning_rate": 9.975842696629212e-07, "loss": 0.03784000501036644, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7578125, "reward": 1.9257813096046448, "reward_std": 0.8713735342025757, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.9140625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.92578125, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 10.56406307220459, "epoch": 0.24395727937043282, "kl": 1.6124215126037598, "learning_rate": 9.975730337078651e-07, "loss": 0.03100433573126793, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7265625, "reward": 1.920312523841858, "reward_std": 0.8564988374710083, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.9296875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9976562559604645, "rewards/point_reward": 0.922656238079071, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 10.494531631469727, "epoch": 0.2450815064643058, "kl": 1.7857028245925903, "learning_rate": 9.97561797752809e-07, "loss": 0.03976079821586609, "ratio/all_0": 0.015625, "ratio/all_2": 0.75, "reward": 1.928906261920929, "reward_std": 0.8637703955173492, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.9296875, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9296875298023224, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 10.584375381469727, "epoch": 0.24620573355817876, "kl": 1.6516713500022888, "learning_rate": 9.975505617977527e-07, "loss": 0.02732301503419876, "ratio/all_0": 0.015625, "ratio/all_2": 0.6953125, "reward": 1.914843738079071, "reward_std": 0.8281229138374329, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9156250059604645, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 10.557812690734863, "epoch": 0.2473299606520517, "kl": 1.6257204413414001, "learning_rate": 9.975393258426966e-07, "loss": 0.03726682811975479, "ratio/all_0": 0.015625, "ratio/all_2": 0.734375, "reward": 1.9296875, "reward_std": 0.8575507402420044, "rewards/avg_0": 1.9296875, "rewards/avg_1": 1.875, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.953125, "rewards/avg_6": 1.9296875, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9968750178813934, "rewards/point_reward": 0.932812511920929, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 10.563281059265137, "epoch": 0.24845418774592468, "kl": 1.6728870272636414, "learning_rate": 9.975280898876405e-07, "loss": 0.03137045353651047, "ratio/all_0": 0.0234375, "ratio/all_2": 0.734375, "reward": 1.913281261920929, "reward_std": 0.8622699677944183, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.8671875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984374940395355, "rewards/point_reward": 0.9148437678813934, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 10.493750095367432, "epoch": 0.24957841483979765, "kl": 1.6135631203651428, "learning_rate": 9.975168539325841e-07, "loss": 0.02609895169734955, "ratio/all_0": 0.015625, "ratio/all_2": 0.75, "reward": 1.920312523841858, "reward_std": 0.8621271550655365, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9203125238418579, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 10.526562690734863, "epoch": 0.2507026419336706, "kl": 1.6434112787246704, "learning_rate": 9.97505617977528e-07, "loss": 0.029833676293492317, "ratio/all_0": 0.0, "ratio/all_2": 0.671875, "reward": 1.928906261920929, "reward_std": 0.8060888051986694, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.9296875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.9375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.928906261920929, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 10.494531631469727, "epoch": 0.2518268690275436, "kl": 1.5851457118988037, "learning_rate": 9.97494382022472e-07, "loss": 0.031208906322717667, "ratio/all_0": 0.015625, "ratio/all_2": 0.703125, "reward": 1.924218773841858, "reward_std": 0.8377038240432739, "rewards/avg_0": 1.8671875, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.925000011920929, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 10.5546875, "epoch": 0.25295109612141653, "kl": 1.5501510500907898, "learning_rate": 9.974831460674156e-07, "loss": 0.02604903280735016, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7109375, "reward": 1.9140625, "reward_std": 0.846492350101471, "rewards/avg_0": 1.8515625, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9140625298023224, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 10.51718807220459, "epoch": 0.2540753232152895, "kl": 1.5336772799491882, "learning_rate": 9.974719101123595e-07, "loss": 0.03907974064350128, "ratio/all_0": 0.03125, "ratio/all_2": 0.78125, "reward": 1.9304687976837158, "reward_std": 0.8920327723026276, "rewards/avg_0": 1.9375, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.9140625, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.9296875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9304687678813934, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 10.474218845367432, "epoch": 0.25519955030916247, "kl": 1.6211896538734436, "learning_rate": 9.974606741573034e-07, "loss": 0.03476298600435257, "ratio/all_0": 0.03125, "ratio/all_2": 0.71875, "reward": 1.912500023841858, "reward_std": 0.8594194948673248, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9125000238418579, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 10.568749904632568, "epoch": 0.2563237774030354, "kl": 1.5672905445098877, "learning_rate": 9.97449438202247e-07, "loss": 0.027938904240727425, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7109375, "reward": 1.9093750715255737, "reward_std": 0.8576548993587494, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984374940395355, "rewards/point_reward": 0.9109375178813934, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 10.542187690734863, "epoch": 0.25744800449690836, "kl": 1.562992513179779, "learning_rate": 9.97438202247191e-07, "loss": 0.03218929469585419, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7265625, "reward": 1.9226562976837158, "reward_std": 0.856164962053299, "rewards/avg_0": 1.875, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.8828125, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.9453125, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375238418579, "rewards/point_reward": 0.9242187440395355, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 10.540625095367432, "epoch": 0.25857223159078135, "kl": 1.708520531654358, "learning_rate": 9.974269662921348e-07, "loss": 0.025115033611655235, "ratio/all_0": 0.0234375, "ratio/all_2": 0.671875, "reward": 1.899218738079071, "reward_std": 0.8306452631950378, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.8515625, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9968750178813934, "rewards/point_reward": 0.90234375, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 10.558594226837158, "epoch": 0.2596964586846543, "kl": 1.8696183562278748, "learning_rate": 9.974157303370785e-07, "loss": 0.041371747851371765, "ratio/all_0": 0.0234375, "ratio/all_2": 0.6640625, "reward": 1.912500023841858, "reward_std": 0.8245647549629211, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.875, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.8671875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.9296875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9976562559604645, "rewards/point_reward": 0.9148437678813934, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 10.474218845367432, "epoch": 0.26082068577852724, "kl": 1.932099997997284, "learning_rate": 9.974044943820224e-07, "loss": 0.04683533310890198, "ratio/all_0": 0.0234375, "ratio/all_2": 0.765625, "reward": 1.9195312857627869, "reward_std": 0.8856737911701202, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.875, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.9296875, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9921875, "rewards/point_reward": 0.9273437559604645, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 10.432812690734863, "epoch": 0.26194491287240024, "kl": 1.7457298040390015, "learning_rate": 9.973932584269663e-07, "loss": 0.03385300561785698, "ratio/all_0": 0.0078125, "ratio/all_2": 0.6953125, "reward": 1.918749988079071, "reward_std": 0.8396010994911194, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.995312511920929, "rewards/point_reward": 0.9234375059604645, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 10.432031631469727, "epoch": 0.2630691399662732, "kl": 1.8611086010932922, "learning_rate": 9.9738202247191e-07, "loss": 0.04638049378991127, "ratio/all_0": 0.046875, "ratio/all_2": 0.703125, "reward": 1.90234375, "reward_std": 0.8645775318145752, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.8046875, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.996874988079071, "rewards/point_reward": 0.905468761920929, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 10.385156154632568, "epoch": 0.2641933670601461, "kl": 1.723706603050232, "learning_rate": 9.973707865168539e-07, "loss": 0.034559160470962524, "ratio/all_0": 0.0078125, "ratio/all_2": 0.7109375, "reward": 1.9234375357627869, "reward_std": 0.8410924971103668, "rewards/avg_0": 1.8515625, "rewards/avg_1": 1.875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.9296875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.9140625, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.9453125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.996874988079071, "rewards/point_reward": 0.9265625178813934, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 10.43125057220459, "epoch": 0.2653175941540191, "kl": 1.6914669275283813, "learning_rate": 9.973595505617976e-07, "loss": 0.0417524054646492, "ratio/all_0": 0.0078125, "ratio/all_2": 0.78125, "reward": 1.942187488079071, "reward_std": 0.8815446197986603, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.9375, "rewards/avg_3": 1.984375, "rewards/avg_4": 1.9453125, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.9296875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9976562559604645, "rewards/point_reward": 0.944531261920929, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 10.531250476837158, "epoch": 0.26644182124789206, "kl": 1.6563090085983276, "learning_rate": 9.973483146067414e-07, "loss": 0.031678296625614166, "ratio/all_0": 0.0078125, "ratio/all_2": 0.734375, "reward": 1.92578125, "reward_std": 0.8512683510780334, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.9296875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984374940395355, "rewards/point_reward": 0.9273437559604645, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 10.491406440734863, "epoch": 0.26756604834176506, "kl": 1.7366485595703125, "learning_rate": 9.973370786516853e-07, "loss": 0.02747419849038124, "ratio/all_0": 0.0234375, "ratio/all_2": 0.671875, "reward": 1.9000000357627869, "reward_std": 0.8285984098911285, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.8671875, "rewards/avg_2": 1.8671875, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.875, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.875, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9976562559604645, "rewards/point_reward": 0.9023437798023224, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 10.403906345367432, "epoch": 0.268690275435638, "kl": 1.775952935218811, "learning_rate": 9.97325842696629e-07, "loss": 0.03478237986564636, "ratio/all_0": 0.0234375, "ratio/all_2": 0.6796875, "reward": 1.9062500596046448, "reward_std": 0.8357247114181519, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9968750178813934, "rewards/point_reward": 0.909375011920929, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 10.389062404632568, "epoch": 0.26981450252951095, "kl": 1.7484761476516724, "learning_rate": 9.97314606741573e-07, "loss": 0.04855723679065704, "ratio/all_0": 0.0234375, "ratio/all_2": 0.78125, "reward": 1.9351562857627869, "reward_std": 0.8961874544620514, "rewards/avg_0": 1.9375, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.9296875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9976562559604645, "rewards/point_reward": 0.9375, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 10.538281440734863, "epoch": 0.27093872962338394, "kl": 1.6602862477302551, "learning_rate": 9.973033707865168e-07, "loss": 0.04181479662656784, "ratio/all_0": 0.015625, "ratio/all_2": 0.75, "reward": 1.934374988079071, "reward_std": 0.8723446130752563, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.9296875, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.9296875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.995312511920929, "rewards/point_reward": 0.9390625059604645, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 10.399219036102295, "epoch": 0.2720629567172569, "kl": 1.8109837770462036, "learning_rate": 9.972921348314605e-07, "loss": 0.032406628131866455, "ratio/all_0": 0.015625, "ratio/all_2": 0.703125, "reward": 1.916406273841858, "reward_std": 0.8395773470401764, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9976562559604645, "rewards/point_reward": 0.9187500178813934, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 10.46484375, "epoch": 0.27318718381112983, "kl": 1.7080157399177551, "learning_rate": 9.972808988764044e-07, "loss": 0.02838246151804924, "ratio/all_0": 0.0078125, "ratio/all_2": 0.7265625, "reward": 1.911718726158142, "reward_std": 0.8578216135501862, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.8828125, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9960937798023224, "rewards/point_reward": 0.9156250059604645, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 10.516406536102295, "epoch": 0.27431141090500283, "kl": 1.8099465370178223, "learning_rate": 9.972696629213483e-07, "loss": 0.04237574338912964, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7109375, "reward": 1.916406273841858, "reward_std": 0.8517789244651794, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.8671875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9976562559604645, "rewards/point_reward": 0.9187500178813934, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 10.339062690734863, "epoch": 0.27543563799887577, "kl": 1.8641058206558228, "learning_rate": 9.972584269662921e-07, "loss": 0.04015839472413063, "ratio/all_0": 0.0078125, "ratio/all_2": 0.6953125, "reward": 1.92578125, "reward_std": 0.827897310256958, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9265625178813934, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 10.48046875, "epoch": 0.2765598650927487, "kl": 1.6710260510444641, "learning_rate": 9.972471910112358e-07, "loss": 0.0362420380115509, "ratio/all_0": 0.0234375, "ratio/all_2": 0.75, "reward": 1.9210938215255737, "reward_std": 0.872996062040329, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.8671875, "rewards/avg_2": 1.9296875, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9976562559604645, "rewards/point_reward": 0.9234375059604645, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 10.430469036102295, "epoch": 0.2776840921866217, "kl": 1.6808490753173828, "learning_rate": 9.972359550561797e-07, "loss": 0.031743235886096954, "ratio/all_0": 0.03125, "ratio/all_2": 0.6875, "reward": 1.90234375, "reward_std": 0.8524691760540009, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.8125, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.99609375, "rewards/point_reward": 0.90625, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 10.414844036102295, "epoch": 0.27880831928049465, "kl": 1.6693540811538696, "learning_rate": 9.972247191011236e-07, "loss": 0.035806022584438324, "ratio/all_0": 0.03125, "ratio/all_2": 0.703125, "reward": 1.905468761920929, "reward_std": 0.8627266883850098, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.8671875, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.992968738079071, "rewards/point_reward": 0.9125000238418579, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 10.3984375, "epoch": 0.2799325463743676, "kl": 1.5856854319572449, "learning_rate": 9.972134831460673e-07, "loss": 0.04258953034877777, "ratio/all_0": 0.0078125, "ratio/all_2": 0.828125, "reward": 1.952343761920929, "reward_std": 0.9072423577308655, "rewards/avg_0": 1.9453125, "rewards/avg_1": 1.9296875, "rewards/avg_2": 1.9375, "rewards/avg_3": 1.9296875, "rewards/avg_4": 1.96875, "rewards/avg_5": 1.953125, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.9375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984374940395355, "rewards/point_reward": 0.953906238079071, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 10.359375476837158, "epoch": 0.2810567734682406, "kl": 1.5875438451766968, "learning_rate": 9.972022471910112e-07, "loss": 0.04035266861319542, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7890625, "reward": 1.9343750476837158, "reward_std": 0.8995476365089417, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.9375, "rewards/avg_2": 1.9375, "rewards/avg_3": 1.9296875, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.9296875, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984374940395355, "rewards/point_reward": 0.9359374940395355, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 10.521093845367432, "epoch": 0.28218100056211354, "kl": 1.642164707183838, "learning_rate": 9.97191011235955e-07, "loss": 0.03470568358898163, "ratio/all_0": 0.03125, "ratio/all_2": 0.671875, "reward": 1.9078125357627869, "reward_std": 0.8443655967712402, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.8515625, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.8671875, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.995312511920929, "rewards/point_reward": 0.9124999940395355, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 10.52734375, "epoch": 0.28330522765598654, "kl": 1.558402955532074, "learning_rate": 9.971797752808987e-07, "loss": 0.0262003056704998, "ratio/all_0": 0.0, "ratio/all_2": 0.6875, "reward": 1.9226562976837158, "reward_std": 0.8312846720218658, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.9296875, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9929687678813934, "rewards/point_reward": 0.9296875, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 10.449219226837158, "epoch": 0.2844294547498595, "kl": 1.532906413078308, "learning_rate": 9.971685393258426e-07, "loss": 0.042840905487537384, "ratio/all_0": 0.0390625, "ratio/all_2": 0.7890625, "reward": 1.9296875, "reward_std": 0.9091970920562744, "rewards/avg_0": 1.9375, "rewards/avg_1": 1.9296875, "rewards/avg_2": 1.9296875, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9976562559604645, "rewards/point_reward": 0.9320312738418579, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 10.514843940734863, "epoch": 0.2855536818437324, "kl": 1.6184453964233398, "learning_rate": 9.971573033707865e-07, "loss": 0.029775146394968033, "ratio/all_0": 0.03125, "ratio/all_2": 0.734375, "reward": 1.91796875, "reward_std": 0.8635019361972809, "rewards/avg_0": 1.875, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.9296875, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9921875298023224, "rewards/point_reward": 0.9257812798023224, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 10.499218940734863, "epoch": 0.2866779089376054, "kl": 1.5478734374046326, "learning_rate": 9.971460674157302e-07, "loss": 0.036088164895772934, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7890625, "reward": 1.9312500357627869, "reward_std": 0.8939096629619598, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.9453125, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.9375, "rewards/avg_4": 1.9296875, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.9375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9968750178813934, "rewards/point_reward": 0.9343750178813934, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 10.510156631469727, "epoch": 0.28780213603147836, "kl": 1.6254317164421082, "learning_rate": 9.97134831460674e-07, "loss": 0.022765733301639557, "ratio/all_0": 0.03125, "ratio/all_2": 0.7265625, "reward": 1.904687523841858, "reward_std": 0.8665963411331177, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.8671875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9921875298023224, "rewards/point_reward": 0.9125000238418579, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 10.535937786102295, "epoch": 0.2889263631253513, "kl": 1.5246264338493347, "learning_rate": 9.97123595505618e-07, "loss": 0.017598390579223633, "ratio/all_0": 0.0234375, "ratio/all_2": 0.625, "reward": 1.8984375596046448, "reward_std": 0.8018450438976288, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.8515625, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9937500059604645, "rewards/point_reward": 0.9046875238418579, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 10.46875, "epoch": 0.2900505902192243, "kl": 1.503324806690216, "learning_rate": 9.971123595505617e-07, "loss": 0.02756604552268982, "ratio/all_0": 0.03125, "ratio/all_2": 0.7421875, "reward": 1.916406273841858, "reward_std": 0.869181752204895, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.875, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.9296875, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.917187511920929, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 10.486719131469727, "epoch": 0.29117481731309725, "kl": 1.6142412424087524, "learning_rate": 9.971011235955056e-07, "loss": 0.03882833570241928, "ratio/all_0": 0.046875, "ratio/all_2": 0.703125, "reward": 1.908593773841858, "reward_std": 0.8646510541439056, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.875, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.99609375, "rewards/point_reward": 0.9125000238418579, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 10.471094131469727, "epoch": 0.2922990444069702, "kl": 1.557334840297699, "learning_rate": 9.970898876404495e-07, "loss": 0.03715989738702774, "ratio/all_0": 0.015625, "ratio/all_2": 0.7734375, "reward": 1.9335938096046448, "reward_std": 0.8902114033699036, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.9375, "rewards/avg_2": 1.9453125, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9976562559604645, "rewards/point_reward": 0.9359374940395355, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 10.576562881469727, "epoch": 0.2934232715008432, "kl": 1.6058542132377625, "learning_rate": 9.970786516853931e-07, "loss": 0.02706632763147354, "ratio/all_0": 0.0234375, "ratio/all_2": 0.71875, "reward": 1.913281261920929, "reward_std": 0.8577883243560791, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.9296875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.875, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.875, "rewards/avg_7": 1.9296875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9937500357627869, "rewards/point_reward": 0.9195312559604645, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 10.518750190734863, "epoch": 0.29454749859471613, "kl": 1.631466269493103, "learning_rate": 9.97067415730337e-07, "loss": 0.029847048223018646, "ratio/all_0": 0.03125, "ratio/all_2": 0.7265625, "reward": 1.920312523841858, "reward_std": 0.8593951761722565, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.8828125, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.9296875, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.995312511920929, "rewards/point_reward": 0.925000011920929, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 10.529687404632568, "epoch": 0.29567172568858907, "kl": 1.6081148386001587, "learning_rate": 9.97056179775281e-07, "loss": 0.03809971362352371, "ratio/all_0": 0.0234375, "ratio/all_2": 0.671875, "reward": 1.921875, "reward_std": 0.8334166705608368, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.8671875, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9929687678813934, "rewards/point_reward": 0.928906261920929, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 10.452343940734863, "epoch": 0.29679595278246207, "kl": 1.687977135181427, "learning_rate": 9.970449438202246e-07, "loss": 0.03532067686319351, "ratio/all_0": 0.0078125, "ratio/all_2": 0.734375, "reward": 1.928125023841858, "reward_std": 0.8631133139133453, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.953125, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.995312511920929, "rewards/point_reward": 0.932812511920929, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 10.543750286102295, "epoch": 0.297920179876335, "kl": 1.62973290681839, "learning_rate": 9.970337078651685e-07, "loss": 0.04487081244587898, "ratio/all_0": 0.015625, "ratio/all_2": 0.7734375, "reward": 1.932031273841858, "reward_std": 0.9043321311473846, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.9375, "rewards/avg_2": 1.9453125, "rewards/avg_3": 1.9453125, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.875, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9859375059604645, "rewards/point_reward": 0.9460937678813934, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 10.403125286102295, "epoch": 0.29904440697020795, "kl": 1.6182299852371216, "learning_rate": 9.970224719101124e-07, "loss": 0.03904420882463455, "ratio/all_0": 0.03125, "ratio/all_2": 0.71875, "reward": 1.9187500476837158, "reward_std": 0.8686837255954742, "rewards/avg_0": 1.875, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.9296875, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.989062488079071, "rewards/point_reward": 0.9296875, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 10.600781440734863, "epoch": 0.30016863406408095, "kl": 1.6868852376937866, "learning_rate": 9.97011235955056e-07, "loss": 0.028194734826683998, "ratio/all_0": 0.0390625, "ratio/all_2": 0.6484375, "reward": 1.8882812857627869, "reward_std": 0.8412781953811646, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.8671875, "rewards/avg_7": 1.8515625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9867187738418579, "rewards/point_reward": 0.901562511920929, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 10.592968940734863, "epoch": 0.3012928611579539, "kl": 1.6456496119499207, "learning_rate": 9.97e-07, "loss": 0.033215299248695374, "ratio/all_0": 0.0078125, "ratio/all_2": 0.71875, "reward": 1.924218773841858, "reward_std": 0.853204607963562, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9921875, "rewards/point_reward": 0.9320312738418579, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 10.525781154632568, "epoch": 0.3024170882518269, "kl": 1.8199474811553955, "learning_rate": 9.969887640449438e-07, "loss": 0.02363828755915165, "ratio/all_0": 0.015625, "ratio/all_2": 0.6875, "reward": 1.9078125357627869, "reward_std": 0.8277927041053772, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.8828125, "rewards/avg_3": 1.875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.98828125, "rewards/point_reward": 0.9195312559604645, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 10.553906440734863, "epoch": 0.30354131534569984, "kl": 1.738243043422699, "learning_rate": 9.969775280898875e-07, "loss": 0.03283867612481117, "ratio/all_0": 0.0234375, "ratio/all_2": 0.6796875, "reward": 1.8992187976837158, "reward_std": 0.8493183255195618, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.8515625, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.828125, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9843750298023224, "rewards/point_reward": 0.914843738079071, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 10.518750190734863, "epoch": 0.3046655424395728, "kl": 1.7232638597488403, "learning_rate": 9.969662921348314e-07, "loss": 0.039549570530653, "ratio/all_0": 0.015625, "ratio/all_2": 0.734375, "reward": 1.927343726158142, "reward_std": 0.8627557754516602, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.9296875, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9828125238418579, "rewards/point_reward": 0.944531261920929, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 10.567187786102295, "epoch": 0.3057897695334458, "kl": 1.806475043296814, "learning_rate": 9.969550561797753e-07, "loss": 0.03781045228242874, "ratio/all_0": 0.0234375, "ratio/all_2": 0.703125, "reward": 1.916406273841858, "reward_std": 0.8432705104351044, "rewards/avg_0": 1.9296875, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.8671875, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.9375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.987500011920929, "rewards/point_reward": 0.928906261920929, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 10.459374904632568, "epoch": 0.3069139966273187, "kl": 1.7418119311332703, "learning_rate": 9.96943820224719e-07, "loss": 0.02658051624894142, "ratio/all_0": 0.0234375, "ratio/all_2": 0.640625, "reward": 1.896875023841858, "reward_std": 0.8112240135669708, "rewards/avg_0": 1.875, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.796875, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9828124940395355, "rewards/point_reward": 0.9140625298023224, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 10.614062786102295, "epoch": 0.30803822372119166, "kl": 1.7741380333900452, "learning_rate": 9.969325842696629e-07, "loss": 0.038595620542764664, "ratio/all_0": 0.0078125, "ratio/all_2": 0.6171875, "reward": 1.9195312857627869, "reward_std": 0.7923027276992798, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.9375, "rewards/avg_2": 1.8671875, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9742187559604645, "rewards/point_reward": 0.9453125, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 10.54843807220459, "epoch": 0.30916245081506466, "kl": 1.8489094972610474, "learning_rate": 9.969213483146068e-07, "loss": 0.03723674640059471, "ratio/all_0": 0.0234375, "ratio/all_2": 0.65625, "reward": 1.908593773841858, "reward_std": 0.8213367164134979, "rewards/avg_0": 1.875, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.8515625, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.8515625, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9750000238418579, "rewards/point_reward": 0.93359375, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 10.652344226837158, "epoch": 0.3102866779089376, "kl": 1.8202200531959534, "learning_rate": 9.969101123595504e-07, "loss": 0.025497809052467346, "ratio/all_0": 0.0078125, "ratio/all_2": 0.5546875, "reward": 1.88671875, "reward_std": 0.7714699506759644, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.8515625, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.8359375, "rewards/avg_4": 1.8359375, "rewards/avg_5": 1.8125, "rewards/avg_6": 1.8359375, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.96484375, "rewards/point_reward": 0.9218750298023224, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 10.56406307220459, "epoch": 0.31141090500281055, "kl": 1.928186297416687, "learning_rate": 9.968988764044943e-07, "loss": 0.02249237895011902, "ratio/all_0": 0.0234375, "ratio/all_2": 0.5859375, "reward": 1.875, "reward_std": 0.787726491689682, "rewards/avg_0": 1.8359375, "rewards/avg_1": 1.8671875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.796875, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.8515625, "rewards/avg_7": 1.8671875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.971875011920929, "rewards/point_reward": 0.9031250178813934, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 10.594531059265137, "epoch": 0.31253513209668354, "kl": 1.8774849772453308, "learning_rate": 9.968876404494382e-07, "loss": 0.040980830788612366, "ratio/all_0": 0.0078125, "ratio/all_2": 0.65625, "reward": 1.9195312857627869, "reward_std": 0.8190862238407135, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.9296875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.973437488079071, "rewards/point_reward": 0.9460937678813934, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 10.603125095367432, "epoch": 0.3136593591905565, "kl": 1.9386613965034485, "learning_rate": 9.968764044943819e-07, "loss": 0.023881088942289352, "ratio/all_0": 0.0390625, "ratio/all_2": 0.6015625, "reward": 1.8671875, "reward_std": 0.8201425671577454, "rewards/avg_0": 1.78125, "rewards/avg_1": 1.875, "rewards/avg_2": 1.8203125, "rewards/avg_3": 1.8515625, "rewards/avg_4": 1.8359375, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.8515625, "rewards/avg_7": 1.8125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.975781261920929, "rewards/point_reward": 0.8914062678813934, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 10.528906345367432, "epoch": 0.31478358628442943, "kl": 1.9172906279563904, "learning_rate": 9.968651685393258e-07, "loss": 0.0323064848780632, "ratio/all_0": 0.015625, "ratio/all_2": 0.625, "reward": 1.901562511920929, "reward_std": 0.8001732230186462, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.8359375, "rewards/avg_2": 1.828125, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9671875238418579, "rewards/point_reward": 0.9343750178813934, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 10.539062976837158, "epoch": 0.3159078133783024, "kl": 1.888903796672821, "learning_rate": 9.968539325842697e-07, "loss": 0.038979463279247284, "ratio/all_0": 0.015625, "ratio/all_2": 0.59375, "reward": 1.908593773841858, "reward_std": 0.7810782790184021, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.8671875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9750000238418579, "rewards/point_reward": 0.9335937798023224, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 10.551562786102295, "epoch": 0.31703204047217537, "kl": 1.9569891095161438, "learning_rate": 9.968426966292134e-07, "loss": 0.03946223855018616, "ratio/all_0": 0.0234375, "ratio/all_2": 0.578125, "reward": 1.8921875357627869, "reward_std": 0.7819642722606659, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.8671875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.875, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.8671875, "rewards/avg_7": 1.8359375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9695312678813934, "rewards/point_reward": 0.9226562678813934, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 10.463281631469727, "epoch": 0.31815626756604837, "kl": 1.9771220684051514, "learning_rate": 9.968314606741572e-07, "loss": 0.029817869886755943, "ratio/all_0": 0.0234375, "ratio/all_2": 0.59375, "reward": 1.8835937976837158, "reward_std": 0.791181355714798, "rewards/avg_0": 1.8359375, "rewards/avg_1": 1.8515625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.875, "rewards/avg_4": 1.84375, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.8515625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9781250059604645, "rewards/point_reward": 0.905468761920929, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 10.390625, "epoch": 0.3192804946599213, "kl": 1.9137450456619263, "learning_rate": 9.968202247191011e-07, "loss": 0.05185917764902115, "ratio/all_0": 0.0078125, "ratio/all_2": 0.734375, "reward": 1.9382812976837158, "reward_std": 0.8540681302547455, "rewards/avg_0": 1.9296875, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.9453125, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.9296875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9890625178813934, "rewards/point_reward": 0.9492187798023224, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 10.33750057220459, "epoch": 0.32040472175379425, "kl": 1.8866459131240845, "learning_rate": 9.968089887640448e-07, "loss": 0.04224765673279762, "ratio/all_0": 0.03125, "ratio/all_2": 0.7109375, "reward": 1.909375011920929, "reward_std": 0.8564162254333496, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.8515625, "rewards/avg_6": 1.8671875, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.99609375, "rewards/point_reward": 0.913281261920929, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 10.255468845367432, "epoch": 0.32152894884766725, "kl": 1.8018646836280823, "learning_rate": 9.967977528089887e-07, "loss": 0.043330300599336624, "ratio/all_0": 0.0, "ratio/all_2": 0.7890625, "reward": 1.9390625357627869, "reward_std": 0.8851653337478638, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.9375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.953125, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.9375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9398437738418579, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 10.403125286102295, "epoch": 0.3226531759415402, "kl": 1.8123514652252197, "learning_rate": 9.967865168539326e-07, "loss": 0.049471862614154816, "ratio/all_0": 0.0, "ratio/all_2": 0.7734375, "reward": 1.947656273841858, "reward_std": 0.8822700083255768, "rewards/avg_0": 1.9609375, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.9765625, "rewards/avg_3": 1.9375, "rewards/avg_4": 1.953125, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.995312511920929, "rewards/point_reward": 0.952343761920929, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 10.360937595367432, "epoch": 0.32377740303541314, "kl": 1.7900111675262451, "learning_rate": 9.967752808988765e-07, "loss": 0.035676054656505585, "ratio/all_0": 0.0390625, "ratio/all_2": 0.6953125, "reward": 1.90625, "reward_std": 0.842722624540329, "rewards/avg_0": 1.875, "rewards/avg_1": 1.8671875, "rewards/avg_2": 1.875, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9070312678813934, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 10.370312690734863, "epoch": 0.32490163012928613, "kl": 1.8105761408805847, "learning_rate": 9.967640449438202e-07, "loss": 0.040825922042131424, "ratio/all_0": 0.0078125, "ratio/all_2": 0.75, "reward": 1.9312500357627869, "reward_std": 0.8617748022079468, "rewards/avg_0": 1.8671875, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.9453125, "rewards/avg_5": 1.9140625, "rewards/avg_6": 1.9296875, "rewards/avg_7": 1.953125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9312500059604645, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 10.416406631469727, "epoch": 0.3260258572231591, "kl": 1.8610867261886597, "learning_rate": 9.96752808988764e-07, "loss": 0.042752377688884735, "ratio/all_0": 0.03125, "ratio/all_2": 0.734375, "reward": 1.913281261920929, "reward_std": 0.8681075572967529, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.8828125, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.8671875, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.9296875, "rewards/avg_7": 1.8671875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.913281261920929, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 10.275000095367432, "epoch": 0.327150084317032, "kl": 1.8451239466667175, "learning_rate": 9.96741573033708e-07, "loss": 0.04314912483096123, "ratio/all_0": 0.015625, "ratio/all_2": 0.6796875, "reward": 1.92578125, "reward_std": 0.8235087096691132, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.9375, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9257812798023224, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 10.250000476837158, "epoch": 0.328274311410905, "kl": 1.79464590549469, "learning_rate": 9.967303370786516e-07, "loss": 0.03976229578256607, "ratio/all_0": 0.0078125, "ratio/all_2": 0.75, "reward": 1.928906261920929, "reward_std": 0.8630857169628143, "rewards/avg_0": 1.9296875, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.9453125, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.9296875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.928906261920929, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 10.306250095367432, "epoch": 0.32939853850477796, "kl": 1.8493717908859253, "learning_rate": 9.967191011235955e-07, "loss": 0.048004940152168274, "ratio/all_0": 0.0234375, "ratio/all_2": 0.703125, "reward": 1.928125023841858, "reward_std": 0.8397326469421387, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.9296875, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9281250238418579, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 10.316406726837158, "epoch": 0.3305227655986509, "kl": 1.8398056626319885, "learning_rate": 9.967078651685394e-07, "loss": 0.041067518293857574, "ratio/all_0": 0.03125, "ratio/all_2": 0.734375, "reward": 1.91796875, "reward_std": 0.8646991848945618, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375238418579, "rewards/point_reward": 0.9195312559604645, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 10.385156631469727, "epoch": 0.3316469926925239, "kl": 1.7265257239341736, "learning_rate": 9.96696629213483e-07, "loss": 0.036400072276592255, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7265625, "reward": 1.9234375357627869, "reward_std": 0.8537377119064331, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.875, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.9375, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375238418579, "rewards/point_reward": 0.925000011920929, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 10.515625476837158, "epoch": 0.33277121978639684, "kl": 1.7181497812271118, "learning_rate": 9.96685393258427e-07, "loss": 0.035892877727746964, "ratio/all_0": 0.03125, "ratio/all_2": 0.6875, "reward": 1.905468761920929, "reward_std": 0.8431965708732605, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.8828125, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.8671875, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984374940395355, "rewards/point_reward": 0.9070312678813934, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 10.43359375, "epoch": 0.33389544688026984, "kl": 1.7789562344551086, "learning_rate": 9.966741573033709e-07, "loss": 0.038690704852342606, "ratio/all_0": 0.0234375, "ratio/all_2": 0.71875, "reward": 1.917187511920929, "reward_std": 0.8528233170509338, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.9296875, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.8671875, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.917187511920929, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 10.446875095367432, "epoch": 0.3350196739741428, "kl": 1.7555699944496155, "learning_rate": 9.966629213483145e-07, "loss": 0.043629299849271774, "ratio/all_0": 0.03125, "ratio/all_2": 0.7109375, "reward": 1.920312523841858, "reward_std": 0.8546639978885651, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.9296875, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.9296875, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9203125238418579, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 10.271874904632568, "epoch": 0.3361439010680157, "kl": 1.8395432829856873, "learning_rate": 9.966516853932584e-07, "loss": 0.05077920854091644, "ratio/all_0": 0.015625, "ratio/all_2": 0.75, "reward": 1.9382812976837158, "reward_std": 0.8653816878795624, "rewards/avg_0": 1.9453125, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.9296875, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.9296875, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9382812678813934, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 10.353906154632568, "epoch": 0.3372681281618887, "kl": 1.8199289441108704, "learning_rate": 9.966404494382023e-07, "loss": 0.04438919201493263, "ratio/all_0": 0.0234375, "ratio/all_2": 0.734375, "reward": 1.924218773841858, "reward_std": 0.8613188862800598, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.953125, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9242187440395355, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 10.187500476837158, "epoch": 0.33839235525576167, "kl": 2.1463682055473328, "learning_rate": 9.96629213483146e-07, "loss": 0.055228251963853836, "ratio/all_0": 0.03125, "ratio/all_2": 0.703125, "reward": 1.9148437976837158, "reward_std": 0.8456367552280426, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9148437678813934, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 10.175000190734863, "epoch": 0.3395165823496346, "kl": 2.1115158200263977, "learning_rate": 9.9661797752809e-07, "loss": 0.05504034459590912, "ratio/all_0": 0.03125, "ratio/all_2": 0.765625, "reward": 1.9226562976837158, "reward_std": 0.884088397026062, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375238418579, "rewards/point_reward": 0.9242187738418579, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 10.18359375, "epoch": 0.3406408094435076, "kl": 1.990248441696167, "learning_rate": 9.966067415730338e-07, "loss": 0.052734263241291046, "ratio/all_0": 0.015625, "ratio/all_2": 0.828125, "reward": 1.940625011920929, "reward_std": 0.9085220098495483, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.9453125, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.9375, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.940625011920929, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 10.145312309265137, "epoch": 0.34176503653738055, "kl": 1.8839709758758545, "learning_rate": 9.965955056179775e-07, "loss": 0.04460742324590683, "ratio/all_0": 0.0078125, "ratio/all_2": 0.734375, "reward": 1.932031273841858, "reward_std": 0.857248067855835, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.9453125, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.932812511920929, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 10.23046875, "epoch": 0.3428892636312535, "kl": 1.853227436542511, "learning_rate": 9.965842696629214e-07, "loss": 0.04168041795492172, "ratio/all_0": 0.0234375, "ratio/all_2": 0.703125, "reward": 1.916406273841858, "reward_std": 0.8461984395980835, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375238418579, "rewards/point_reward": 0.91796875, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 10.2265625, "epoch": 0.3440134907251265, "kl": 1.869143784046173, "learning_rate": 9.965730337078652e-07, "loss": 0.0380750373005867, "ratio/all_0": 0.0078125, "ratio/all_2": 0.703125, "reward": 1.9195312857627869, "reward_std": 0.8381736278533936, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9195312559604645, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 10.282031059265137, "epoch": 0.34513771781899943, "kl": 1.8878865242004395, "learning_rate": 9.96561797752809e-07, "loss": 0.046272002160549164, "ratio/all_0": 0.0390625, "ratio/all_2": 0.78125, "reward": 1.916406273841858, "reward_std": 0.9037257432937622, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375238418579, "rewards/point_reward": 0.91796875, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 10.348437786102295, "epoch": 0.3462619449128724, "kl": 1.7901397943496704, "learning_rate": 9.965505617977528e-07, "loss": 0.03747926652431488, "ratio/all_0": 0.015625, "ratio/all_2": 0.6796875, "reward": 1.918749988079071, "reward_std": 0.8264937102794647, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375238418579, "rewards/point_reward": 0.9203125238418579, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 10.301562786102295, "epoch": 0.3473861720067454, "kl": 1.82473886013031, "learning_rate": 9.965393258426967e-07, "loss": 0.041796181350946426, "ratio/all_0": 0.0078125, "ratio/all_2": 0.7109375, "reward": 1.928125023841858, "reward_std": 0.8394474387168884, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.928906261920929, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 10.310937881469727, "epoch": 0.3485103991006183, "kl": 1.8468487858772278, "learning_rate": 9.965280898876404e-07, "loss": 0.04869385436177254, "ratio/all_0": 0.015625, "ratio/all_2": 0.7421875, "reward": 1.932812511920929, "reward_std": 0.8740152418613434, "rewards/avg_0": 1.9453125, "rewards/avg_1": 1.875, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.9453125, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.9375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9976562559604645, "rewards/point_reward": 0.9351562559604645, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 10.325000286102295, "epoch": 0.34963462619449126, "kl": 1.824562668800354, "learning_rate": 9.965168539325843e-07, "loss": 0.04901933670043945, "ratio/all_0": 0.0234375, "ratio/all_2": 0.8046875, "reward": 1.93359375, "reward_std": 0.9041942656040192, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.9375, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984374940395355, "rewards/point_reward": 0.9351562559604645, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 10.381250381469727, "epoch": 0.35075885328836426, "kl": 1.77147775888443, "learning_rate": 9.965056179775282e-07, "loss": 0.04348636791110039, "ratio/all_0": 0.015625, "ratio/all_2": 0.7734375, "reward": 1.935937523841858, "reward_std": 0.8781629204750061, "rewards/avg_0": 1.9296875, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.9296875, "rewards/avg_3": 1.9375, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.9140625, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9359375238418579, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 10.47421932220459, "epoch": 0.3518830803822372, "kl": 1.7815687656402588, "learning_rate": 9.964943820224718e-07, "loss": 0.04413023591041565, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7265625, "reward": 1.924218773841858, "reward_std": 0.8591809868812561, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.953125, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.9140625, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9976562559604645, "rewards/point_reward": 0.9265625178813934, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 10.405468940734863, "epoch": 0.3530073074761102, "kl": 1.8134968280792236, "learning_rate": 9.964831460674157e-07, "loss": 0.046355605125427246, "ratio/all_0": 0.0, "ratio/all_2": 0.796875, "reward": 1.94921875, "reward_std": 0.8853201568126678, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.9453125, "rewards/avg_2": 1.9296875, "rewards/avg_3": 1.9609375, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.9453125, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.9296875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375238418579, "rewards/point_reward": 0.9507812559604645, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 10.400781154632568, "epoch": 0.35413153456998314, "kl": 1.8659851551055908, "learning_rate": 9.964719101123596e-07, "loss": 0.04664193093776703, "ratio/all_0": 0.03125, "ratio/all_2": 0.734375, "reward": 1.9195312857627869, "reward_std": 0.8666932284832001, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.875, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9195312559604645, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 10.532031536102295, "epoch": 0.3552557616638561, "kl": 1.852257788181305, "learning_rate": 9.964606741573033e-07, "loss": 0.0502714179456234, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7421875, "reward": 1.932812511920929, "reward_std": 0.8658345639705658, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.9296875, "rewards/avg_7": 1.9453125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375238418579, "rewards/point_reward": 0.9343750178813934, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 10.500781536102295, "epoch": 0.3563799887577291, "kl": 1.9854409098625183, "learning_rate": 9.964494382022472e-07, "loss": 0.051377031952142715, "ratio/all_0": 0.0234375, "ratio/all_2": 0.71875, "reward": 1.921875, "reward_std": 0.858985036611557, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.9453125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.995312511920929, "rewards/point_reward": 0.926562488079071, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 10.478906154632568, "epoch": 0.357504215851602, "kl": 1.9958835244178772, "learning_rate": 9.96438202247191e-07, "loss": 0.05269791930913925, "ratio/all_0": 0.0234375, "ratio/all_2": 0.75, "reward": 1.9250000715255737, "reward_std": 0.8795750141143799, "rewards/avg_0": 1.8671875, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.9375, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.995312511920929, "rewards/point_reward": 0.9296875298023224, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 10.481250286102295, "epoch": 0.35862844294547497, "kl": 2.0202487111091614, "learning_rate": 9.964269662921348e-07, "loss": 0.05175580829381943, "ratio/all_0": 0.0234375, "ratio/all_2": 0.6953125, "reward": 1.9187500476837158, "reward_std": 0.8507504165172577, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9945312738418579, "rewards/point_reward": 0.9242187738418579, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 10.442187786102295, "epoch": 0.35975267003934797, "kl": 2.0611737966537476, "learning_rate": 9.964157303370787e-07, "loss": 0.0645291656255722, "ratio/all_0": 0.0234375, "ratio/all_2": 0.796875, "reward": 1.9429687857627869, "reward_std": 0.902433842420578, "rewards/avg_0": 1.9375, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.9375, "rewards/avg_3": 1.9375, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.9375, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.9453125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.99609375, "rewards/point_reward": 0.9468750059604645, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 10.372656345367432, "epoch": 0.3608768971332209, "kl": 2.1989023685455322, "learning_rate": 9.964044943820226e-07, "loss": 0.06326432526111603, "ratio/all_0": 0.015625, "ratio/all_2": 0.765625, "reward": 1.9335938096046448, "reward_std": 0.8833630681037903, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.995312511920929, "rewards/point_reward": 0.9382812678813934, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 10.446875095367432, "epoch": 0.36200112422709385, "kl": 2.1978585720062256, "learning_rate": 9.963932584269662e-07, "loss": 0.046397365629673004, "ratio/all_0": 0.015625, "ratio/all_2": 0.7265625, "reward": 1.91015625, "reward_std": 0.8547484576702118, "rewards/avg_0": 1.8671875, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.8828125, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9937500357627869, "rewards/point_reward": 0.9164062440395355, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 10.348437309265137, "epoch": 0.36312535132096685, "kl": 2.1940536499023438, "learning_rate": 9.963820224719101e-07, "loss": 0.05930664390325546, "ratio/all_0": 0.015625, "ratio/all_2": 0.734375, "reward": 1.9296875, "reward_std": 0.8640153706073761, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.9375, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.9296875, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9890625178813934, "rewards/point_reward": 0.940625011920929, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 10.446094036102295, "epoch": 0.3642495784148398, "kl": 2.160746693611145, "learning_rate": 9.96370786516854e-07, "loss": 0.0425700843334198, "ratio/all_0": 0.03125, "ratio/all_2": 0.6875, "reward": 1.90234375, "reward_std": 0.8400704264640808, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.875, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.987500011920929, "rewards/point_reward": 0.9148437678813934, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 10.309375286102295, "epoch": 0.36537380550871273, "kl": 2.020435869693756, "learning_rate": 9.963595505617977e-07, "loss": 0.05405348166823387, "ratio/all_0": 0.0234375, "ratio/all_2": 0.75, "reward": 1.9265625476837158, "reward_std": 0.8764021694660187, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.99609375, "rewards/point_reward": 0.9304687678813934, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 10.322656631469727, "epoch": 0.36649803260258573, "kl": 2.034945547580719, "learning_rate": 9.963483146067416e-07, "loss": 0.056264981627464294, "ratio/all_0": 0.0078125, "ratio/all_2": 0.765625, "reward": 1.943750023841858, "reward_std": 0.8759823739528656, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.9375, "rewards/avg_2": 1.9296875, "rewards/avg_3": 1.9453125, "rewards/avg_4": 1.9296875, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.9453125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.995312511920929, "rewards/point_reward": 0.948437511920929, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 10.303906440734863, "epoch": 0.3676222596964587, "kl": 2.0568439960479736, "learning_rate": 9.963370786516855e-07, "loss": 0.04515969753265381, "ratio/all_0": 0.03125, "ratio/all_2": 0.671875, "reward": 1.899218738079071, "reward_std": 0.8356598913669586, "rewards/avg_0": 1.8125, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.8515625, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.875, "rewards/avg_7": 1.84375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9882812798023224, "rewards/point_reward": 0.9109375178813934, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 10.306250095367432, "epoch": 0.3687464867903317, "kl": 1.9643288254737854, "learning_rate": 9.963258426966292e-07, "loss": 0.053189441561698914, "ratio/all_0": 0.015625, "ratio/all_2": 0.78125, "reward": 1.939843773841858, "reward_std": 0.8839648067951202, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.9375, "rewards/avg_3": 1.9296875, "rewards/avg_4": 1.9296875, "rewards/avg_5": 1.953125, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.99609375, "rewards/point_reward": 0.9437499940395355, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 10.207812786102295, "epoch": 0.3698707138842046, "kl": 2.013589084148407, "learning_rate": 9.96314606741573e-07, "loss": 0.04469640925526619, "ratio/all_0": 0.0078125, "ratio/all_2": 0.7109375, "reward": 1.919531226158142, "reward_std": 0.8481138348579407, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.9140625, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9945312440395355, "rewards/point_reward": 0.925000011920929, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 10.314844131469727, "epoch": 0.37099494097807756, "kl": 1.8831562399864197, "learning_rate": 9.96303370786517e-07, "loss": 0.043579135090112686, "ratio/all_0": 0.015625, "ratio/all_2": 0.7421875, "reward": 1.9296875596046448, "reward_std": 0.8622469007968903, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.9375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.9453125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.995312511920929, "rewards/point_reward": 0.9343750178813934, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 10.259375095367432, "epoch": 0.37211916807195056, "kl": 1.9401150941848755, "learning_rate": 9.962921348314606e-07, "loss": 0.034515127539634705, "ratio/all_0": 0.015625, "ratio/all_2": 0.65625, "reward": 1.90234375, "reward_std": 0.8185981810092926, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.875, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9929687678813934, "rewards/point_reward": 0.909375011920929, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 10.256249904632568, "epoch": 0.3732433951658235, "kl": 1.9391788840293884, "learning_rate": 9.962808988764045e-07, "loss": 0.04044695198535919, "ratio/all_0": 0.015625, "ratio/all_2": 0.6875, "reward": 1.9109375476837158, "reward_std": 0.8457313776016235, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.875, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.9296875, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.995312511920929, "rewards/point_reward": 0.9156250059604645, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 10.24375057220459, "epoch": 0.37436762225969644, "kl": 1.934513509273529, "learning_rate": 9.962696629213482e-07, "loss": 0.040316130965948105, "ratio/all_0": 0.03125, "ratio/all_2": 0.7421875, "reward": 1.917187511920929, "reward_std": 0.8681433200836182, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.875, "rewards/avg_5": 1.9140625, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.995312511920929, "rewards/point_reward": 0.9218750298023224, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 10.280468940734863, "epoch": 0.37549184935356944, "kl": 1.921375572681427, "learning_rate": 9.96258426966292e-07, "loss": 0.04540445655584335, "ratio/all_0": 0.0078125, "ratio/all_2": 0.7109375, "reward": 1.924218773841858, "reward_std": 0.855448454618454, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.9296875, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9929687678813934, "rewards/point_reward": 0.9312500357627869, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 10.259375095367432, "epoch": 0.3766160764474424, "kl": 1.8859546780586243, "learning_rate": 9.96247191011236e-07, "loss": 0.04824208840727806, "ratio/all_0": 0.0078125, "ratio/all_2": 0.71875, "reward": 1.930468738079071, "reward_std": 0.8533749580383301, "rewards/avg_0": 1.9296875, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.8828125, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.9296875, "rewards/avg_6": 1.9296875, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.995312511920929, "rewards/point_reward": 0.9351562559604645, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 10.177343845367432, "epoch": 0.3777403035413153, "kl": 1.868826985359192, "learning_rate": 9.962359550561796e-07, "loss": 0.0528462678194046, "ratio/all_0": 0.0078125, "ratio/all_2": 0.7890625, "reward": 1.94921875, "reward_std": 0.8829784393310547, "rewards/avg_0": 1.953125, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.9296875, "rewards/avg_3": 1.9453125, "rewards/avg_4": 1.9453125, "rewards/avg_5": 1.9296875, "rewards/avg_6": 1.9296875, "rewards/avg_7": 1.9453125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9500000178813934, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 10.29453182220459, "epoch": 0.3788645306351883, "kl": 1.7946856617927551, "learning_rate": 9.962247191011235e-07, "loss": 0.04495403170585632, "ratio/all_0": 0.0390625, "ratio/all_2": 0.6953125, "reward": 1.9117187857627869, "reward_std": 0.8583216667175293, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9976562559604645, "rewards/point_reward": 0.9140625, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 10.392187595367432, "epoch": 0.37998875772906127, "kl": 1.770771324634552, "learning_rate": 9.962134831460674e-07, "loss": 0.03466241434216499, "ratio/all_0": 0.0234375, "ratio/all_2": 0.6328125, "reward": 1.9062500596046448, "reward_std": 0.804929107427597, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.8515625, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375238418579, "rewards/point_reward": 0.9078125059604645, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 10.415625095367432, "epoch": 0.3811129848229342, "kl": 1.7638018727302551, "learning_rate": 9.96202247191011e-07, "loss": 0.038400210440158844, "ratio/all_0": 0.0234375, "ratio/all_2": 0.75, "reward": 1.917187511920929, "reward_std": 0.8736513555049896, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.91796875, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 10.450000286102295, "epoch": 0.3822372119168072, "kl": 1.7452391982078552, "learning_rate": 9.96191011235955e-07, "loss": 0.031908392906188965, "ratio/all_0": 0.0234375, "ratio/all_2": 0.734375, "reward": 1.90625, "reward_std": 0.8690169453620911, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.8671875, "rewards/avg_2": 1.8671875, "rewards/avg_3": 1.8515625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9976562559604645, "rewards/point_reward": 0.9085937738418579, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 10.489843845367432, "epoch": 0.38336143901068015, "kl": 1.7394488453865051, "learning_rate": 9.961797752808989e-07, "loss": 0.03782523423433304, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7109375, "reward": 1.9148437976837158, "reward_std": 0.8547420799732208, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9968750178813934, "rewards/point_reward": 0.91796875, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 10.541406631469727, "epoch": 0.3844856661045531, "kl": 1.6925815343856812, "learning_rate": 9.961685393258426e-07, "loss": 0.031927358359098434, "ratio/all_0": 0.015625, "ratio/all_2": 0.75, "reward": 1.9226562976837158, "reward_std": 0.8643298447132111, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.9296875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9234375059604645, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 10.5390625, "epoch": 0.3856098931984261, "kl": 1.673849105834961, "learning_rate": 9.961573033707865e-07, "loss": 0.02241407334804535, "ratio/all_0": 0.046875, "ratio/all_2": 0.6640625, "reward": 1.8765625357627869, "reward_std": 0.8381451368331909, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.8203125, "rewards/avg_3": 1.828125, "rewards/avg_4": 1.8359375, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.8203125, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984374940395355, "rewards/point_reward": 0.878125011920929, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 10.5078125, "epoch": 0.38673412029229903, "kl": 1.6821864247322083, "learning_rate": 9.961460674157303e-07, "loss": 0.04435833916068077, "ratio/all_0": 0.015625, "ratio/all_2": 0.7734375, "reward": 1.940625011920929, "reward_std": 0.8782646358013153, "rewards/avg_0": 1.9296875, "rewards/avg_1": 1.9375, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.9140625, "rewards/avg_6": 1.9453125, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.940625011920929, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 10.510937690734863, "epoch": 0.38785834738617203, "kl": 1.7037270069122314, "learning_rate": 9.96134831460674e-07, "loss": 0.02727324329316616, "ratio/all_0": 0.0078125, "ratio/all_2": 0.703125, "reward": 1.9140625596046448, "reward_std": 0.8372543752193451, "rewards/avg_0": 1.8671875, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9148437678813934, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 10.547656536102295, "epoch": 0.388982574480045, "kl": 1.6825117468833923, "learning_rate": 9.96123595505618e-07, "loss": 0.037956684827804565, "ratio/all_0": 0.015625, "ratio/all_2": 0.765625, "reward": 1.928906261920929, "reward_std": 0.8790275156497955, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.9296875, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.9296875, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.9296875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375238418579, "rewards/point_reward": 0.9304687678813934, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 10.546875, "epoch": 0.3901068015739179, "kl": 1.700487732887268, "learning_rate": 9.961123595505618e-07, "loss": 0.03412896394729614, "ratio/all_0": 0.015625, "ratio/all_2": 0.71875, "reward": 1.921875, "reward_std": 0.8527751564979553, "rewards/avg_0": 1.9453125, "rewards/avg_1": 1.875, "rewards/avg_2": 1.8828125, "rewards/avg_3": 1.9453125, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.8515625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9976562559604645, "rewards/point_reward": 0.9242187738418579, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 10.485937595367432, "epoch": 0.3912310286677909, "kl": 1.7351809740066528, "learning_rate": 9.961011235955055e-07, "loss": 0.024284057319164276, "ratio/all_0": 0.015625, "ratio/all_2": 0.71875, "reward": 1.90234375, "reward_std": 0.8538567125797272, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.8671875, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375238418579, "rewards/point_reward": 0.9039062857627869, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 10.400781631469727, "epoch": 0.39235525576166386, "kl": 1.7726843357086182, "learning_rate": 9.960898876404494e-07, "loss": 0.04192570224404335, "ratio/all_0": 0.0390625, "ratio/all_2": 0.6875, "reward": 1.9101563096046448, "reward_std": 0.8402937352657318, "rewards/avg_0": 1.8515625, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.8671875, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984374940395355, "rewards/point_reward": 0.9117187559604645, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 10.570312976837158, "epoch": 0.3934794828555368, "kl": 1.7546494603157043, "learning_rate": 9.960786516853933e-07, "loss": 0.02213025651872158, "ratio/all_0": 0.0390625, "ratio/all_2": 0.609375, "reward": 1.872656226158142, "reward_std": 0.8090003728866577, "rewards/avg_0": 1.84375, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.8046875, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.8515625, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.8359375, "rewards/avg_7": 1.8515625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9976562559604645, "rewards/point_reward": 0.875, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 10.430468559265137, "epoch": 0.3946037099494098, "kl": 1.813601315021515, "learning_rate": 9.96067415730337e-07, "loss": 0.037648413330316544, "ratio/all_0": 0.0078125, "ratio/all_2": 0.6796875, "reward": 1.9265625476837158, "reward_std": 0.8125456273555756, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.9296875, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9273437559604645, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 10.321094036102295, "epoch": 0.39572793704328274, "kl": 1.8753434419631958, "learning_rate": 9.960561797752808e-07, "loss": 0.039298996329307556, "ratio/all_0": 0.0234375, "ratio/all_2": 0.671875, "reward": 1.91015625, "reward_std": 0.8232802748680115, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.8671875, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9109375178813934, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 10.334375381469727, "epoch": 0.3968521641371557, "kl": 1.867890477180481, "learning_rate": 9.960449438202247e-07, "loss": 0.04137108474969864, "ratio/all_0": 0.03125, "ratio/all_2": 0.71875, "reward": 1.909375011920929, "reward_std": 0.8602083325386047, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.875, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.91015625, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 10.3515625, "epoch": 0.3979763912310287, "kl": 1.7816566228866577, "learning_rate": 9.960337078651684e-07, "loss": 0.031370680779218674, "ratio/all_0": 0.03125, "ratio/all_2": 0.703125, "reward": 1.900781273841858, "reward_std": 0.8489590287208557, "rewards/avg_0": 1.875, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.901562511920929, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 10.286718845367432, "epoch": 0.3991006183249016, "kl": 1.8434048295021057, "learning_rate": 9.960224719101123e-07, "loss": 0.03749451786279678, "ratio/all_0": 0.046875, "ratio/all_2": 0.6796875, "reward": 1.8921875357627869, "reward_std": 0.8432820439338684, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.84375, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.875, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.8671875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.8921875059604645, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 10.339062690734863, "epoch": 0.40022484541877457, "kl": 1.7783136367797852, "learning_rate": 9.960112359550562e-07, "loss": 0.03067711368203163, "ratio/all_0": 0.046875, "ratio/all_2": 0.6328125, "reward": 1.8835937976837158, "reward_std": 0.8181053400039673, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.8671875, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.8359375, "rewards/avg_5": 1.8671875, "rewards/avg_6": 1.84375, "rewards/avg_7": 1.8671875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.883593738079071, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 10.340625286102295, "epoch": 0.40134907251264756, "kl": 1.7568607330322266, "learning_rate": 9.959999999999999e-07, "loss": 0.03994826227426529, "ratio/all_0": 0.0546875, "ratio/all_2": 0.703125, "reward": 1.8953125476837158, "reward_std": 0.8587200343608856, "rewards/avg_0": 1.8359375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.8671875, "rewards/avg_5": 1.875, "rewards/avg_6": 1.8671875, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.8960937559604645, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 10.377344131469727, "epoch": 0.4024732996065205, "kl": 1.7589952945709229, "learning_rate": 9.959887640449438e-07, "loss": 0.035448405891656876, "ratio/all_0": 0.0078125, "ratio/all_2": 0.640625, "reward": 1.9195312857627869, "reward_std": 0.8011971116065979, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.875, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.9296875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9203125238418579, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 10.403125286102295, "epoch": 0.4035975267003935, "kl": 2.400500178337097, "learning_rate": 9.959775280898876e-07, "loss": 0.0631265640258789, "ratio/all_0": 0.0078125, "ratio/all_2": 0.6953125, "reward": 1.928125023841858, "reward_std": 0.8244198858737946, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.9296875, "rewards/avg_7": 1.9296875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9281249940395355, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 10.425781726837158, "epoch": 0.40472175379426645, "kl": 1.7974015474319458, "learning_rate": 9.959662921348313e-07, "loss": 0.040458083152770996, "ratio/all_0": 0.015625, "ratio/all_2": 0.7421875, "reward": 1.9289063215255737, "reward_std": 0.8593026697635651, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9296875, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 10.502344131469727, "epoch": 0.4058459808881394, "kl": 1.714927613735199, "learning_rate": 9.959550561797752e-07, "loss": 0.016150854527950287, "ratio/all_0": 0.0234375, "ratio/all_2": 0.6484375, "reward": 1.8867188096046448, "reward_std": 0.8087844252586365, "rewards/avg_0": 1.8515625, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.8671875, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.8515625, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.88671875, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 10.498437404632568, "epoch": 0.4069702079820124, "kl": 1.7653551697731018, "learning_rate": 9.959438202247191e-07, "loss": 0.038302332162857056, "ratio/all_0": 0.046875, "ratio/all_2": 0.671875, "reward": 1.897656261920929, "reward_std": 0.8373755216598511, "rewards/avg_0": 1.8671875, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.8671875, "rewards/avg_3": 1.875, "rewards/avg_4": 1.8515625, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375238418579, "rewards/point_reward": 0.8992187678813934, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 10.546875, "epoch": 0.40809443507588533, "kl": 1.7473861575126648, "learning_rate": 9.959325842696628e-07, "loss": 0.031032182276248932, "ratio/all_0": 0.0546875, "ratio/all_2": 0.6484375, "reward": 1.880468726158142, "reward_std": 0.8342385292053223, "rewards/avg_0": 1.8359375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.8203125, "rewards/avg_5": 1.8515625, "rewards/avg_6": 1.8515625, "rewards/avg_7": 1.8515625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375238418579, "rewards/point_reward": 0.882031261920929, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 10.524219036102295, "epoch": 0.4092186621697583, "kl": 1.738921344280243, "learning_rate": 9.959213483146067e-07, "loss": 0.033890679478645325, "ratio/all_0": 0.03125, "ratio/all_2": 0.71875, "reward": 1.909375011920929, "reward_std": 0.8538974523544312, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.8671875, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.91015625, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 10.576562404632568, "epoch": 0.41034288926363127, "kl": 1.743700623512268, "learning_rate": 9.959101123595506e-07, "loss": 0.03802679851651192, "ratio/all_0": 0.0390625, "ratio/all_2": 0.6875, "reward": 1.90625, "reward_std": 0.8454999029636383, "rewards/avg_0": 1.8515625, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.8828125, "rewards/avg_6": 1.890625, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9976562559604645, "rewards/point_reward": 0.9085937738418579, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 10.540625095367432, "epoch": 0.4114671163575042, "kl": 1.8171001076698303, "learning_rate": 9.958988764044942e-07, "loss": 0.029331211000680923, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7109375, "reward": 1.899218738079071, "reward_std": 0.8547923564910889, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.8828125, "rewards/avg_3": 1.875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.8515625, "rewards/avg_7": 1.8359375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9976562559604645, "rewards/point_reward": 0.901562511920929, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 10.59765625, "epoch": 0.41259134345137716, "kl": 1.8153007626533508, "learning_rate": 9.958876404494381e-07, "loss": 0.031290844082832336, "ratio/all_0": 0.03125, "ratio/all_2": 0.6875, "reward": 1.9117187857627869, "reward_std": 0.8251713514328003, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.8671875, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.8671875, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375238418579, "rewards/point_reward": 0.913281261920929, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 10.586718559265137, "epoch": 0.41371557054525016, "kl": 1.7499891519546509, "learning_rate": 9.95876404494382e-07, "loss": 0.045041367411613464, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7578125, "reward": 1.928125023841858, "reward_std": 0.8811922073364258, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.9375, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.921875, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375238418579, "rewards/point_reward": 0.9296875298023224, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 10.546093940734863, "epoch": 0.4148397976391231, "kl": 1.7742969393730164, "learning_rate": 9.958651685393257e-07, "loss": 0.03312449902296066, "ratio/all_0": 0.0234375, "ratio/all_2": 0.65625, "reward": 1.9078125357627869, "reward_std": 0.8149179220199585, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.859375, "rewards/avg_2": 1.8828125, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.875, "rewards/avg_5": 1.9140625, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9085937738418579, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 10.528125286102295, "epoch": 0.41596402473299604, "kl": 1.8458644151687622, "learning_rate": 9.958539325842696e-07, "loss": 0.04537215456366539, "ratio/all_0": 0.0078125, "ratio/all_2": 0.703125, "reward": 1.932812511920929, "reward_std": 0.8349111080169678, "rewards/avg_0": 1.953125, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.9296875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.9296875, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.93359375, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 10.56796932220459, "epoch": 0.41708825182686904, "kl": 1.7916463613510132, "learning_rate": 9.958426966292135e-07, "loss": 0.027775993570685387, "ratio/all_0": 0.015625, "ratio/all_2": 0.6484375, "reward": 1.899999976158142, "reward_std": 0.8068560361862183, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.8671875, "rewards/avg_2": 1.8515625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.875, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 1.0, "rewards/point_reward": 0.9000000357627869, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 10.571094036102295, "epoch": 0.418212478920742, "kl": 1.8079155087471008, "learning_rate": 9.958314606741572e-07, "loss": 0.0378127321600914, "ratio/all_0": 0.0, "ratio/all_2": 0.6640625, "reward": 1.928906261920929, "reward_std": 0.8060687780380249, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.953125, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.90625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9296875298023224, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 10.447656631469727, "epoch": 0.419336706014615, "kl": 2.0055981278419495, "learning_rate": 9.95820224719101e-07, "loss": 0.05942863970994949, "ratio/all_0": 0.0234375, "ratio/all_2": 0.8203125, "reward": 1.939843773841858, "reward_std": 0.9128240644931793, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.9296875, "rewards/avg_2": 1.9453125, "rewards/avg_3": 1.9296875, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.9296875, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.940625011920929, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 10.457812786102295, "epoch": 0.4204609331084879, "kl": 2.0407444834709167, "learning_rate": 9.958089887640447e-07, "loss": 0.047153279185295105, "ratio/all_0": 0.015625, "ratio/all_2": 0.6796875, "reward": 1.9148437976837158, "reward_std": 0.8315493166446686, "rewards/avg_0": 1.9296875, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.921875, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.8515625, "rewards/avg_6": 1.8671875, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9976562559604645, "rewards/point_reward": 0.917187511920929, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 10.54453182220459, "epoch": 0.42158516020236086, "kl": 2.314019560813904, "learning_rate": 9.957977528089886e-07, "loss": 0.058852873742580414, "ratio/all_0": 0.0234375, "ratio/all_2": 0.6484375, "reward": 1.9054688215255737, "reward_std": 0.8244183659553528, "rewards/avg_0": 1.8515625, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.875, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9921875, "rewards/point_reward": 0.913281261920929, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 10.529687881469727, "epoch": 0.42270938729623386, "kl": 2.133104085922241, "learning_rate": 9.957865168539325e-07, "loss": 0.04806741327047348, "ratio/all_0": 0.03125, "ratio/all_2": 0.6953125, "reward": 1.907031238079071, "reward_std": 0.8460875153541565, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.875, "rewards/avg_3": 1.875, "rewards/avg_4": 1.8515625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.890625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9968750178813934, "rewards/point_reward": 0.91015625, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 10.531250476837158, "epoch": 0.4238336143901068, "kl": 2.215003252029419, "learning_rate": 9.957752808988764e-07, "loss": 0.05868378281593323, "ratio/all_0": 0.0390625, "ratio/all_2": 0.703125, "reward": 1.905468761920929, "reward_std": 0.8680282831192017, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.8671875, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9945312440395355, "rewards/point_reward": 0.9109375178813934, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 10.579687595367432, "epoch": 0.42495784148397975, "kl": 1.8949918150901794, "learning_rate": 9.9576404494382e-07, "loss": 0.05246419459581375, "ratio/all_0": 0.0078125, "ratio/all_2": 0.765625, "reward": 1.9468750357627869, "reward_std": 0.8695424795150757, "rewards/avg_0": 1.921875, "rewards/avg_1": 1.9375, "rewards/avg_2": 1.9375, "rewards/avg_3": 1.9453125, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.9609375, "rewards/avg_6": 1.9296875, "rewards/avg_7": 1.9296875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9476562738418579, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 10.537499904632568, "epoch": 0.42608206857785275, "kl": 2.0464513897895813, "learning_rate": 9.95752808988764e-07, "loss": 0.046930864453315735, "ratio/all_0": 0.015625, "ratio/all_2": 0.6796875, "reward": 1.9250000715255737, "reward_std": 0.8235551118850708, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.9296875, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9976562559604645, "rewards/point_reward": 0.9273437559604645, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 10.576562404632568, "epoch": 0.4272062956717257, "kl": 2.3635306358337402, "learning_rate": 9.957415730337079e-07, "loss": 0.06351582705974579, "ratio/all_0": 0.0234375, "ratio/all_2": 0.6875, "reward": 1.91015625, "reward_std": 0.8465672135353088, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.875, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.8671875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9945312738418579, "rewards/point_reward": 0.9156250059604645, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 10.589062690734863, "epoch": 0.42833052276559863, "kl": 2.1300774812698364, "learning_rate": 9.957303370786516e-07, "loss": 0.04272787272930145, "ratio/all_0": 0.03125, "ratio/all_2": 0.671875, "reward": 1.896875023841858, "reward_std": 0.8363301753997803, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.84375, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.8671875, "rewards/avg_5": 1.8515625, "rewards/avg_6": 1.875, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.996874988079071, "rewards/point_reward": 0.9000000059604645, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 10.495312690734863, "epoch": 0.42945474985947163, "kl": 1.9909522533416748, "learning_rate": 9.957191011235954e-07, "loss": 0.050648171454668045, "ratio/all_0": 0.03125, "ratio/all_2": 0.6875, "reward": 1.9156250357627869, "reward_std": 0.8425585627555847, "rewards/avg_0": 1.875, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.8359375, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9976562559604645, "rewards/point_reward": 0.91796875, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 10.517187595367432, "epoch": 0.43057897695334457, "kl": 1.9220994114875793, "learning_rate": 9.957078651685393e-07, "loss": 0.046620436012744904, "ratio/all_0": 0.0078125, "ratio/all_2": 0.6796875, "reward": 1.9304687976837158, "reward_std": 0.8176408112049103, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.9375, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.9312500059604645, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 10.509375095367432, "epoch": 0.4317032040472175, "kl": 2.019084930419922, "learning_rate": 9.95696629213483e-07, "loss": 0.052892833948135376, "ratio/all_0": 0.03125, "ratio/all_2": 0.6953125, "reward": 1.9148437976837158, "reward_std": 0.8502573072910309, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.859375, "rewards/avg_5": 1.8671875, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.9375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.99609375, "rewards/point_reward": 0.9187500178813934, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 10.425781726837158, "epoch": 0.4328274311410905, "kl": 1.8879412412643433, "learning_rate": 9.95685393258427e-07, "loss": 0.031065676361322403, "ratio/all_0": 0.0078125, "ratio/all_2": 0.6640625, "reward": 1.909375011920929, "reward_std": 0.8127116858959198, "rewards/avg_0": 1.890625, "rewards/avg_1": 1.8984375, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.99609375, "rewards/point_reward": 0.913281261920929, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 10.498437881469727, "epoch": 0.43395165823496346, "kl": 1.8726175427436829, "learning_rate": 9.956741573033708e-07, "loss": 0.049964789301157, "ratio/all_0": 0.0078125, "ratio/all_2": 0.7421875, "reward": 1.944531261920929, "reward_std": 0.8532433807849884, "rewards/avg_0": 1.9296875, "rewards/avg_1": 1.9140625, "rewards/avg_2": 1.8984375, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.9375, "rewards/avg_6": 1.9453125, "rewards/avg_7": 1.96875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984375238418579, "rewards/point_reward": 0.9460937678813934, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 10.471875190734863, "epoch": 0.4350758853288364, "kl": 1.9985505938529968, "learning_rate": 9.956629213483145e-07, "loss": 0.04716179892420769, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7109375, "reward": 1.913281261920929, "reward_std": 0.8565007448196411, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.9140625, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.9140625, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.859375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.995312511920929, "rewards/point_reward": 0.9179687798023224, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 10.591406345367432, "epoch": 0.4362001124227094, "kl": 1.9754056334495544, "learning_rate": 9.956516853932584e-07, "loss": 0.04457426816225052, "ratio/all_0": 0.03125, "ratio/all_2": 0.7109375, "reward": 1.9148437976837158, "reward_std": 0.8505724966526031, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.875, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9976562559604645, "rewards/point_reward": 0.917187511920929, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 10.512500286102295, "epoch": 0.43732433951658234, "kl": 1.8616729974746704, "learning_rate": 9.956404494382023e-07, "loss": 0.04564625769853592, "ratio/all_0": 0.015625, "ratio/all_2": 0.7265625, "reward": 1.928125023841858, "reward_std": 0.8594686686992645, "rewards/avg_0": 1.90625, "rewards/avg_1": 1.8828125, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.9453125, "rewards/avg_4": 1.8828125, "rewards/avg_5": 1.9453125, "rewards/avg_6": 1.9140625, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.996874988079071, "rewards/point_reward": 0.9312500059604645, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 10.460937976837158, "epoch": 0.43844856661045534, "kl": 1.9120779633522034, "learning_rate": 9.95629213483146e-07, "loss": 0.05435582995414734, "ratio/all_0": 0.015625, "ratio/all_2": 0.796875, "reward": 1.947656273841858, "reward_std": 0.8881878852844238, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.9453125, "rewards/avg_2": 1.9453125, "rewards/avg_3": 1.9453125, "rewards/avg_4": 1.9375, "rewards/avg_5": 1.9296875, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.921875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.999218761920929, "rewards/point_reward": 0.948437511920929, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 10.542187690734863, "epoch": 0.4395727937043283, "kl": 2.190659284591675, "learning_rate": 9.956179775280898e-07, "loss": 0.0439864918589592, "ratio/all_0": 0.0234375, "ratio/all_2": 0.6484375, "reward": 1.8921875357627869, "reward_std": 0.8169780969619751, "rewards/avg_0": 1.859375, "rewards/avg_1": 1.875, "rewards/avg_2": 1.8828125, "rewards/avg_3": 1.8671875, "rewards/avg_4": 1.875, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.8359375, "rewards/avg_7": 1.8828125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9984374940395355, "rewards/point_reward": 0.893750011920929, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 10.589844226837158, "epoch": 0.4406970207982012, "kl": 2.186584234237671, "learning_rate": 9.956067415730337e-07, "loss": 0.04994940757751465, "ratio/all_0": 0.0234375, "ratio/all_2": 0.6484375, "reward": 1.901562511920929, "reward_std": 0.823219507932663, "rewards/avg_0": 1.828125, "rewards/avg_1": 1.8671875, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.890625, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.875, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9968750178813934, "rewards/point_reward": 0.9046875238418579, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 10.546875, "epoch": 0.4418212478920742, "kl": 2.2678062915802, "learning_rate": 9.955955056179774e-07, "loss": 0.06236354634165764, "ratio/all_0": 0.0234375, "ratio/all_2": 0.6875, "reward": 1.9226562976837158, "reward_std": 0.837425947189331, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.921875, "rewards/avg_2": 1.8828125, "rewards/avg_3": 1.9140625, "rewards/avg_4": 1.9140625, "rewards/avg_5": 1.9296875, "rewards/avg_6": 1.8828125, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9945312738418579, "rewards/point_reward": 0.9281249940395355, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 10.487500190734863, "epoch": 0.44294547498594716, "kl": 2.0774426460266113, "learning_rate": 9.955842696629213e-07, "loss": 0.05997675657272339, "ratio/all_0": 0.015625, "ratio/all_2": 0.71875, "reward": 1.935937523841858, "reward_std": 0.8445488810539246, "rewards/avg_0": 1.9296875, "rewards/avg_1": 1.9453125, "rewards/avg_2": 1.859375, "rewards/avg_3": 1.9296875, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.9375, "rewards/avg_7": 1.9453125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9976562559604645, "rewards/point_reward": 0.938281238079071, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 10.522656440734863, "epoch": 0.4440697020798201, "kl": 2.3769845962524414, "learning_rate": 9.955730337078652e-07, "loss": 0.05876045674085617, "ratio/all_0": 0.0234375, "ratio/all_2": 0.7421875, "reward": 1.9148437976837158, "reward_std": 0.8725523352622986, "rewards/avg_0": 1.8828125, "rewards/avg_1": 1.8515625, "rewards/avg_2": 1.890625, "rewards/avg_3": 1.90625, "rewards/avg_4": 1.90625, "rewards/avg_5": 1.890625, "rewards/avg_6": 1.921875, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.995312511920929, "rewards/point_reward": 0.9195312559604645, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 10.563281536102295, "epoch": 0.4451939291736931, "kl": 2.1915425062179565, "learning_rate": 9.955617977528089e-07, "loss": 0.05748950317502022, "ratio/all_0": 0.0234375, "ratio/all_2": 0.703125, "reward": 1.916406273841858, "reward_std": 0.8571533858776093, "rewards/avg_0": 1.9140625, "rewards/avg_1": 1.90625, "rewards/avg_2": 1.8828125, "rewards/avg_3": 1.8984375, "rewards/avg_4": 1.921875, "rewards/avg_5": 1.859375, "rewards/avg_6": 1.90625, "rewards/avg_7": 1.875, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.992968738079071, "rewards/point_reward": 0.9234375059604645, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 10.535937786102295, "epoch": 0.44631815626756605, "kl": 2.4020025730133057, "learning_rate": 9.955505617977527e-07, "loss": 0.041312672197818756, "ratio/all_0": 0.0234375, "ratio/all_2": 0.5625, "reward": 1.8679687976837158, "reward_std": 0.7768263518810272, "rewards/avg_0": 1.8515625, "rewards/avg_1": 1.8125, "rewards/avg_2": 1.84375, "rewards/avg_3": 1.859375, "rewards/avg_4": 1.8515625, "rewards/avg_5": 1.828125, "rewards/avg_6": 1.8125, "rewards/avg_7": 1.8203125, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9921875, "rewards/point_reward": 0.8757812678813934, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 10.42578125, "epoch": 0.447442383361439, "kl": 2.278535842895508, "learning_rate": 9.955393258426966e-07, "loss": 0.05333054065704346, "ratio/all_0": 0.0078125, "ratio/all_2": 0.7109375, "reward": 1.9179688096046448, "reward_std": 0.8447178602218628, "rewards/avg_0": 1.8984375, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.90625, "rewards/avg_3": 1.8828125, "rewards/avg_4": 1.8984375, "rewards/avg_5": 1.90625, "rewards/avg_6": 1.8984375, "rewards/avg_7": 1.8984375, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.991406261920929, "rewards/point_reward": 0.9265625178813934, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 10.431250095367432, "epoch": 0.448566610455312, "kl": 2.0164719820022583, "learning_rate": 9.955280898876403e-07, "loss": 0.05713193863630295, "ratio/all_0": 0.015625, "ratio/all_2": 0.71875, "reward": 1.930468738079071, "reward_std": 0.8625682890415192, "rewards/avg_0": 1.9296875, "rewards/avg_1": 1.890625, "rewards/avg_2": 1.921875, "rewards/avg_3": 1.9296875, "rewards/avg_4": 1.890625, "rewards/avg_5": 1.8984375, "rewards/avg_6": 1.9296875, "rewards/avg_7": 1.9140625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.9929687678813934, "rewards/point_reward": 0.9375000298023224, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 10.560156345367432, "epoch": 0.44969083754918493, "kl": 2.090118169784546, "learning_rate": 9.955168539325842e-07, "loss": 0.04012680798768997, "ratio/all_0": 0.0390625, "ratio/all_2": 0.609375, "reward": 1.88671875, "reward_std": 0.7999500036239624, "rewards/avg_0": 1.8671875, "rewards/avg_1": 1.8515625, "rewards/avg_2": 1.875, "rewards/avg_3": 1.8515625, "rewards/avg_4": 1.8671875, "rewards/avg_5": 1.84375, "rewards/avg_6": 1.859375, "rewards/avg_7": 1.8515625, "rewards/avg_8": 2.0, "rewards/avg_9": 2.0, "rewards/format_reward": 0.991406261920929, "rewards/point_reward": 0.8953125178813934, "step": 400 } ], "logging_steps": 1.0, "max_steps": 89000, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }