VISTA-9B / trainer_state.json
m1ngcheng's picture
Add files using upload-large-folder tool
82e4012 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.44969083754918493,
"eval_steps": 500,
"global_step": 400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 10.445312976837158,
"epoch": 0.0011242270938729624,
"kl": 0.0,
"learning_rate": 1e-06,
"loss": -0.05997609347105026,
"ratio/all_0": 0.0859375,
"ratio/all_2": 0.5234375,
"reward": 1.8171875476837158,
"reward_std": 0.78060582280159,
"rewards/avg_0": 1.8359375,
"rewards/avg_1": 1.7578125,
"rewards/avg_2": 1.75,
"rewards/avg_3": 1.7421875,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.7578125,
"rewards/avg_7": 1.734375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8171875178813934,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 10.516406536102295,
"epoch": 0.0022484541877459247,
"kl": 0.19082757085561752,
"learning_rate": 9.999887640449438e-07,
"loss": -0.05867362394928932,
"ratio/all_0": 0.1171875,
"ratio/all_2": 0.4609375,
"reward": 1.77734375,
"reward_std": 0.7686226665973663,
"rewards/avg_0": 1.7109375,
"rewards/avg_1": 1.640625,
"rewards/avg_2": 1.7578125,
"rewards/avg_3": 1.7265625,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.6875,
"rewards/avg_6": 1.703125,
"rewards/avg_7": 1.75,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.7773437798023224,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 10.396093845367432,
"epoch": 0.003372681281618887,
"kl": 0.8681274950504303,
"learning_rate": 9.999775280898875e-07,
"loss": -0.03326645493507385,
"ratio/all_0": 0.09375,
"ratio/all_2": 0.484375,
"reward": 1.796093761920929,
"reward_std": 0.7614105343818665,
"rewards/avg_0": 1.7421875,
"rewards/avg_1": 1.7109375,
"rewards/avg_2": 1.7734375,
"rewards/avg_3": 1.7265625,
"rewards/avg_4": 1.71875,
"rewards/avg_5": 1.7734375,
"rewards/avg_6": 1.75,
"rewards/avg_7": 1.765625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.796875,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 10.425781726837158,
"epoch": 0.004496908375491849,
"kl": 1.0842570066452026,
"learning_rate": 9.999662921348314e-07,
"loss": -0.01579746976494789,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.5546875,
"reward": 1.8515625,
"reward_std": 0.779606819152832,
"rewards/avg_0": 1.8046875,
"rewards/avg_1": 1.7890625,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.8203125,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.8046875,
"rewards/avg_6": 1.8515625,
"rewards/avg_7": 1.8046875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8515625298023224,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 10.410937786102295,
"epoch": 0.005621135469364812,
"kl": 1.4418902397155762,
"learning_rate": 9.999550561797753e-07,
"loss": -0.01998813822865486,
"ratio/all_0": 0.1484375,
"ratio/all_2": 0.3671875,
"reward": 1.736718773841858,
"reward_std": 0.7228272557258606,
"rewards/avg_0": 1.6875,
"rewards/avg_1": 1.6640625,
"rewards/avg_2": 1.640625,
"rewards/avg_3": 1.6796875,
"rewards/avg_4": 1.6796875,
"rewards/avg_5": 1.6328125,
"rewards/avg_6": 1.6953125,
"rewards/avg_7": 1.6875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.7367187440395355,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 10.425000190734863,
"epoch": 0.006745362563237774,
"kl": 1.5545793771743774,
"learning_rate": 9.99943820224719e-07,
"loss": -0.014625937677919865,
"ratio/all_0": 0.0546875,
"ratio/all_2": 0.484375,
"reward": 1.8125,
"reward_std": 0.7375520765781403,
"rewards/avg_0": 1.75,
"rewards/avg_1": 1.765625,
"rewards/avg_2": 1.75,
"rewards/avg_3": 1.75,
"rewards/avg_4": 1.765625,
"rewards/avg_5": 1.8046875,
"rewards/avg_6": 1.7890625,
"rewards/avg_7": 1.75,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8125000298023224,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 10.370312690734863,
"epoch": 0.007869589657110737,
"kl": 1.6209379434585571,
"learning_rate": 9.999325842696629e-07,
"loss": -0.026230724528431892,
"ratio/all_0": 0.0859375,
"ratio/all_2": 0.4296875,
"reward": 1.762499988079071,
"reward_std": 0.7292351722717285,
"rewards/avg_0": 1.671875,
"rewards/avg_1": 1.6953125,
"rewards/avg_2": 1.6953125,
"rewards/avg_3": 1.71875,
"rewards/avg_4": 1.7265625,
"rewards/avg_5": 1.71875,
"rewards/avg_6": 1.7265625,
"rewards/avg_7": 1.671875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.7625000178813934,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 10.485937595367432,
"epoch": 0.008993816750983699,
"kl": 1.5760462284088135,
"learning_rate": 9.999213483146068e-07,
"loss": -0.013463463634252548,
"ratio/all_0": 0.0546875,
"ratio/all_2": 0.5625,
"reward": 1.822656273841858,
"reward_std": 0.7794111371040344,
"rewards/avg_0": 1.7890625,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.796875,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.78125,
"rewards/avg_5": 1.7734375,
"rewards/avg_6": 1.7578125,
"rewards/avg_7": 1.765625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8226562440395355,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 10.241406440734863,
"epoch": 0.01011804384485666,
"kl": 1.6185383796691895,
"learning_rate": 9.999101123595504e-07,
"loss": -0.002992298686876893,
"ratio/all_0": 0.0703125,
"ratio/all_2": 0.4921875,
"reward": 1.8101562857627869,
"reward_std": 0.7595367431640625,
"rewards/avg_0": 1.7265625,
"rewards/avg_1": 1.7421875,
"rewards/avg_2": 1.7734375,
"rewards/avg_3": 1.7734375,
"rewards/avg_4": 1.75,
"rewards/avg_5": 1.7578125,
"rewards/avg_6": 1.8046875,
"rewards/avg_7": 1.7734375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8101562559604645,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 10.352344036102295,
"epoch": 0.011242270938729624,
"kl": 1.6257511377334595,
"learning_rate": 9.998988764044943e-07,
"loss": 0.0020705917850136757,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.5546875,
"reward": 1.8320313096046448,
"reward_std": 0.785227507352829,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.765625,
"rewards/avg_2": 1.7890625,
"rewards/avg_3": 1.8046875,
"rewards/avg_4": 1.8203125,
"rewards/avg_5": 1.765625,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.78125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.83203125,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 10.372656345367432,
"epoch": 0.012366498032602586,
"kl": 1.6162938475608826,
"learning_rate": 9.998876404494382e-07,
"loss": 0.007467743009328842,
"ratio/all_0": 0.0703125,
"ratio/all_2": 0.515625,
"reward": 1.830468773841858,
"reward_std": 0.7664113640785217,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.78125,
"rewards/avg_3": 1.7890625,
"rewards/avg_4": 1.7578125,
"rewards/avg_5": 1.8046875,
"rewards/avg_6": 1.7734375,
"rewards/avg_7": 1.8046875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8304687738418579,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 10.474218845367432,
"epoch": 0.013490725126475547,
"kl": 1.6269961595535278,
"learning_rate": 9.99876404494382e-07,
"loss": 0.004299253225326538,
"ratio/all_0": 0.0859375,
"ratio/all_2": 0.5390625,
"reward": 1.8203125,
"reward_std": 0.7797641456127167,
"rewards/avg_0": 1.765625,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.7578125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.7578125,
"rewards/avg_5": 1.765625,
"rewards/avg_6": 1.7734375,
"rewards/avg_7": 1.7578125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8203125298023224,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 10.396093845367432,
"epoch": 0.01461495222034851,
"kl": 1.6182974576950073,
"learning_rate": 9.998651685393258e-07,
"loss": -0.004290747921913862,
"ratio/all_0": 0.0703125,
"ratio/all_2": 0.4921875,
"reward": 1.813281238079071,
"reward_std": 0.7540134191513062,
"rewards/avg_0": 1.7734375,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.765625,
"rewards/avg_3": 1.765625,
"rewards/avg_4": 1.7578125,
"rewards/avg_5": 1.7890625,
"rewards/avg_6": 1.7265625,
"rewards/avg_7": 1.7734375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8132812678813934,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 10.291406154632568,
"epoch": 0.015739179314221474,
"kl": 1.652035653591156,
"learning_rate": 9.998539325842697e-07,
"loss": -0.010208970867097378,
"ratio/all_0": 0.0546875,
"ratio/all_2": 0.515625,
"reward": 1.8179687857627869,
"reward_std": 0.7553539872169495,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.734375,
"rewards/avg_2": 1.765625,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.8046875,
"rewards/avg_5": 1.7421875,
"rewards/avg_6": 1.7890625,
"rewards/avg_7": 1.765625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8179687559604645,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 10.327343940734863,
"epoch": 0.016863406408094434,
"kl": 1.6964421272277832,
"learning_rate": 9.998426966292134e-07,
"loss": -0.003030479419976473,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.5078125,
"reward": 1.84765625,
"reward_std": 0.73407843708992,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.8046875,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.7890625,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.8359375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8476562798023224,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 10.395312786102295,
"epoch": 0.017987633501967398,
"kl": 1.6439769864082336,
"learning_rate": 9.998314606741573e-07,
"loss": 0.0077795363031327724,
"ratio/all_0": 0.109375,
"ratio/all_2": 0.5546875,
"reward": 1.807031273841858,
"reward_std": 0.8117163181304932,
"rewards/avg_0": 1.75,
"rewards/avg_1": 1.7421875,
"rewards/avg_2": 1.7421875,
"rewards/avg_3": 1.734375,
"rewards/avg_4": 1.7578125,
"rewards/avg_5": 1.7734375,
"rewards/avg_6": 1.765625,
"rewards/avg_7": 1.8046875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8070312440395355,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 10.37656307220459,
"epoch": 0.01911186059584036,
"kl": 1.606972336769104,
"learning_rate": 9.998202247191011e-07,
"loss": -0.013812951743602753,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.5,
"reward": 1.819531261920929,
"reward_std": 0.7394805550575256,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.7421875,
"rewards/avg_2": 1.75,
"rewards/avg_3": 1.765625,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.765625,
"rewards/avg_6": 1.7734375,
"rewards/avg_7": 1.8046875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.819531261920929,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 10.347656726837158,
"epoch": 0.02023608768971332,
"kl": 1.6440010070800781,
"learning_rate": 9.998089887640448e-07,
"loss": 0.002852785401046276,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.6171875,
"reward": 1.865625023841858,
"reward_std": 0.7961074411869049,
"rewards/avg_0": 1.8359375,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.8515625,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.8046875,
"rewards/avg_5": 1.8515625,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8656250238418579,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 10.438281059265137,
"epoch": 0.021360314783586284,
"kl": 1.5873689651489258,
"learning_rate": 9.997977528089887e-07,
"loss": 0.009698862209916115,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.625,
"reward": 1.872656226158142,
"reward_std": 0.8082565665245056,
"rewards/avg_0": 1.8515625,
"rewards/avg_1": 1.8515625,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.8515625,
"rewards/avg_5": 1.8203125,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.8515625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8726562857627869,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 10.417187690734863,
"epoch": 0.022484541877459248,
"kl": 1.6128740310668945,
"learning_rate": 9.997865168539326e-07,
"loss": -0.008634892292320728,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.5234375,
"reward": 1.8335937857627869,
"reward_std": 0.748685210943222,
"rewards/avg_0": 1.7734375,
"rewards/avg_1": 1.8203125,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.7734375,
"rewards/avg_4": 1.7578125,
"rewards/avg_5": 1.796875,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.7890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.8343749940395355,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 10.411718845367432,
"epoch": 0.023608768971332208,
"kl": 1.613295078277588,
"learning_rate": 9.997752808988763e-07,
"loss": 0.009994986467063427,
"ratio/all_0": 0.0546875,
"ratio/all_2": 0.5859375,
"reward": 1.8492187857627869,
"reward_std": 0.7984052896499634,
"rewards/avg_0": 1.8046875,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.8203125,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.8515625,
"rewards/avg_5": 1.8046875,
"rewards/avg_6": 1.7890625,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8492187559604645,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 10.517968654632568,
"epoch": 0.02473299606520517,
"kl": 1.6258622407913208,
"learning_rate": 9.997640449438202e-07,
"loss": 0.005634765140712261,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.59375,
"reward": 1.853906273841858,
"reward_std": 0.7958876490592957,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.8359375,
"rewards/avg_2": 1.8046875,
"rewards/avg_3": 1.8203125,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.8203125,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.8203125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8539062738418579,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 10.526562690734863,
"epoch": 0.025857223159078135,
"kl": 1.6079289317131042,
"learning_rate": 9.99752808988764e-07,
"loss": 0.00109954085201025,
"ratio/all_0": 0.0546875,
"ratio/all_2": 0.546875,
"reward": 1.8312500715255737,
"reward_std": 0.7803563475608826,
"rewards/avg_0": 1.7421875,
"rewards/avg_1": 1.8046875,
"rewards/avg_2": 1.7734375,
"rewards/avg_3": 1.796875,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.8046875,
"rewards/avg_6": 1.8046875,
"rewards/avg_7": 1.7734375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.831250011920929,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 10.396093845367432,
"epoch": 0.026981450252951095,
"kl": 1.6150875091552734,
"learning_rate": 9.99741573033708e-07,
"loss": 0.016750413924455643,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.6015625,
"reward": 1.8609375357627869,
"reward_std": 0.8065908253192902,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.8359375,
"rewards/avg_2": 1.7890625,
"rewards/avg_3": 1.8515625,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.8046875,
"rewards/avg_6": 1.8515625,
"rewards/avg_7": 1.8515625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.8617187738418579,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 10.32421875,
"epoch": 0.028105677346824058,
"kl": 1.6833326816558838,
"learning_rate": 9.997303370786516e-07,
"loss": 0.017063738778233528,
"ratio/all_0": 0.078125,
"ratio/all_2": 0.578125,
"reward": 1.83984375,
"reward_std": 0.8057522475719452,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.7734375,
"rewards/avg_2": 1.8046875,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.765625,
"rewards/avg_5": 1.8203125,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8398437798023224,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 10.313281536102295,
"epoch": 0.02922990444069702,
"kl": 1.6673230528831482,
"learning_rate": 9.997191011235955e-07,
"loss": -0.007147204130887985,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.5390625,
"reward": 1.819531261920929,
"reward_std": 0.7743638455867767,
"rewards/avg_0": 1.7578125,
"rewards/avg_1": 1.78125,
"rewards/avg_2": 1.765625,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.8203125,
"rewards/avg_5": 1.7890625,
"rewards/avg_6": 1.7578125,
"rewards/avg_7": 1.7421875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.819531261920929,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 10.321875095367432,
"epoch": 0.03035413153456998,
"kl": 1.7122045159339905,
"learning_rate": 9.997078651685394e-07,
"loss": 0.017641516402363777,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.5859375,
"reward": 1.889062523841858,
"reward_std": 0.7684561014175415,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.8515625,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.8515625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8890625238418579,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 10.247656345367432,
"epoch": 0.03147835862844295,
"kl": 1.7543825507164001,
"learning_rate": 9.99696629213483e-07,
"loss": 0.01977822184562683,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.6015625,
"reward": 1.850000023841858,
"reward_std": 0.8139838874340057,
"rewards/avg_0": 1.765625,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.8046875,
"rewards/avg_4": 1.8515625,
"rewards/avg_5": 1.8046875,
"rewards/avg_6": 1.8046875,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8500000238418579,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 10.428906440734863,
"epoch": 0.032602585722315905,
"kl": 1.6742790341377258,
"learning_rate": 9.99685393258427e-07,
"loss": 0.025930162519216537,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.59375,
"reward": 1.892968773841858,
"reward_std": 0.782163679599762,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.8828125,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8929687440395355,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 10.371094226837158,
"epoch": 0.03372681281618887,
"kl": 1.6981696486473083,
"learning_rate": 9.996741573033709e-07,
"loss": 0.01707715541124344,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.6328125,
"reward": 1.885156273841858,
"reward_std": 0.8080797493457794,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.8515625,
"rewards/avg_2": 1.8515625,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.8671875,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8851562738418579,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 10.41796875,
"epoch": 0.03485103991006183,
"kl": 1.666592299938202,
"learning_rate": 9.996629213483146e-07,
"loss": 0.0028374078683555126,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.5859375,
"reward": 1.859375,
"reward_std": 0.7776727378368378,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.8515625,
"rewards/avg_4": 1.8515625,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.8203125,
"rewards/avg_7": 1.8046875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8593750298023224,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 10.405468940734863,
"epoch": 0.035975267003934795,
"kl": 1.6903700828552246,
"learning_rate": 9.996516853932585e-07,
"loss": 0.017414983361959457,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.65625,
"reward": 1.8796875476837158,
"reward_std": 0.8306158185005188,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.8515625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.8359375,
"rewards/avg_5": 1.8359375,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.8804687559604645,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 10.460156440734863,
"epoch": 0.03709949409780776,
"kl": 1.6858877539634705,
"learning_rate": 9.996404494382023e-07,
"loss": 0.015359701588749886,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.609375,
"reward": 1.867968738079071,
"reward_std": 0.8056941330432892,
"rewards/avg_0": 1.8203125,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.8515625,
"rewards/avg_3": 1.8515625,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.8359375,
"rewards/avg_7": 1.7890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8679687678813934,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 10.491406440734863,
"epoch": 0.03822372119168072,
"kl": 1.652479112148285,
"learning_rate": 9.99629213483146e-07,
"loss": 0.012801921926438808,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.6171875,
"reward": 1.848437488079071,
"reward_std": 0.8226411044597626,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.8046875,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.78125,
"rewards/avg_6": 1.8515625,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8484375178813934,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 10.46875,
"epoch": 0.03934794828555368,
"kl": 1.6815390586853027,
"learning_rate": 9.9961797752809e-07,
"loss": 0.01791820488870144,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.6328125,
"reward": 1.874218761920929,
"reward_std": 0.811478853225708,
"rewards/avg_0": 1.8203125,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.8515625,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.8359375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.8359375,
"rewards/avg_7": 1.8671875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.874218761920929,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 10.484375476837158,
"epoch": 0.04047217537942664,
"kl": 1.6642175912857056,
"learning_rate": 9.996067415730338e-07,
"loss": 0.01151614636182785,
"ratio/all_0": 0.0546875,
"ratio/all_2": 0.6171875,
"reward": 1.85546875,
"reward_std": 0.8131150305271149,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.8046875,
"rewards/avg_3": 1.8046875,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.8203125,
"rewards/avg_7": 1.796875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.8562500178813934,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 10.414062976837158,
"epoch": 0.041596402473299605,
"kl": 1.6078103184700012,
"learning_rate": 9.995955056179775e-07,
"loss": 0.01336689293384552,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6875,
"reward": 1.8781250715255737,
"reward_std": 0.8475979864597321,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.8515625,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.8671875,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.8515625,
"rewards/avg_7": 1.8515625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.878125011920929,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 10.53281307220459,
"epoch": 0.04272062956717257,
"kl": 1.6236757636070251,
"learning_rate": 9.995842696629214e-07,
"loss": 0.0206887386739254,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.65625,
"reward": 1.8882812857627869,
"reward_std": 0.823714554309845,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.8359375,
"rewards/avg_2": 1.8515625,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.8671875,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8882812559604645,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 10.517187595367432,
"epoch": 0.04384485666104553,
"kl": 1.6272926926612854,
"learning_rate": 9.995730337078653e-07,
"loss": -0.0022770659998059273,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.5390625,
"reward": 1.8445312976837158,
"reward_std": 0.7606980204582214,
"rewards/avg_0": 1.7890625,
"rewards/avg_1": 1.796875,
"rewards/avg_2": 1.7890625,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.8046875,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.8046875,
"rewards/avg_7": 1.8046875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8445312678813934,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 10.602344036102295,
"epoch": 0.044969083754918496,
"kl": 1.587278664112091,
"learning_rate": 9.99561797752809e-07,
"loss": 0.022644348442554474,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.6875,
"reward": 1.885156273841858,
"reward_std": 0.8540178835391998,
"rewards/avg_0": 1.8515625,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.8671875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.8671875,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8851562738418579,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 10.587500095367432,
"epoch": 0.04609331084879146,
"kl": 1.6034030318260193,
"learning_rate": 9.995505617977528e-07,
"loss": 0.009336121380329132,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.5703125,
"reward": 1.8820313215255737,
"reward_std": 0.7586211562156677,
"rewards/avg_0": 1.8515625,
"rewards/avg_1": 1.8515625,
"rewards/avg_2": 1.8359375,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.8671875,
"rewards/avg_5": 1.8359375,
"rewards/avg_6": 1.8515625,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.882031261920929,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 10.473437786102295,
"epoch": 0.047217537942664416,
"kl": 1.5840712189674377,
"learning_rate": 9.995393258426967e-07,
"loss": 0.018098287284374237,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.609375,
"reward": 1.889062523841858,
"reward_std": 0.7948274314403534,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.8515625,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.889843761920929,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 10.46484375,
"epoch": 0.04834176503653738,
"kl": 1.6059932112693787,
"learning_rate": 9.995280898876404e-07,
"loss": 0.02952752448618412,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.7109375,
"reward": 1.9078125357627869,
"reward_std": 0.8541045188903809,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9078125059604645,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 10.481250286102295,
"epoch": 0.04946599213041034,
"kl": 1.5430679321289062,
"learning_rate": 9.995168539325843e-07,
"loss": 0.013610566034913063,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.6328125,
"reward": 1.8796875476837158,
"reward_std": 0.8104925453662872,
"rewards/avg_0": 1.8359375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.8046875,
"rewards/avg_5": 1.8671875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.8046875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8796875178813934,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 10.551562786102295,
"epoch": 0.050590219224283306,
"kl": 1.5598188638687134,
"learning_rate": 9.995056179775282e-07,
"loss": 0.02287764474749565,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6796875,
"reward": 1.897656261920929,
"reward_std": 0.8357208371162415,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.8671875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.8671875,
"rewards/avg_6": 1.8515625,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.8984375298023224,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 10.471875190734863,
"epoch": 0.05171444631815627,
"kl": 1.538119375705719,
"learning_rate": 9.994943820224719e-07,
"loss": 0.01902620680630207,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.625,
"reward": 1.9039062857627869,
"reward_std": 0.7881555557250977,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.8671875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.9296875,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9046875238418579,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 10.503125190734863,
"epoch": 0.05283867341202923,
"kl": 1.500148355960846,
"learning_rate": 9.994831460674158e-07,
"loss": 0.03803616017103195,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.734375,
"reward": 1.9226562976837158,
"reward_std": 0.8660586476325989,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.9453125,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9226562678813934,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 10.578125476837158,
"epoch": 0.05396290050590219,
"kl": 1.4877179265022278,
"learning_rate": 9.994719101123596e-07,
"loss": 0.016881819814443588,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7109375,
"reward": 1.8992187976837158,
"reward_std": 0.8531339764595032,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.8671875,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8992187678813934,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 10.573437690734863,
"epoch": 0.05508712759977515,
"kl": 1.4212284088134766,
"learning_rate": 9.994606741573033e-07,
"loss": 0.010123580694198608,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.625,
"reward": 1.8835937976837158,
"reward_std": 0.8060767948627472,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.8671875,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.8515625,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.8359375,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.8671875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.8843750059604645,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 10.537500381469727,
"epoch": 0.056211354693648116,
"kl": 1.405593991279602,
"learning_rate": 9.994494382022472e-07,
"loss": 0.012005077674984932,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.6171875,
"reward": 1.8796875476837158,
"reward_std": 0.8047949373722076,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.8359375,
"rewards/avg_2": 1.8828125,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.8203125,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8796875178813934,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 10.54843807220459,
"epoch": 0.05733558178752108,
"kl": 1.379647970199585,
"learning_rate": 9.994382022471909e-07,
"loss": 0.023727476596832275,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.671875,
"reward": 1.8875000476837158,
"reward_std": 0.8443561792373657,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.887499988079071,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 10.575781345367432,
"epoch": 0.05845980888139404,
"kl": 1.4058558344841003,
"learning_rate": 9.994269662921348e-07,
"loss": 0.022184893488883972,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7109375,
"reward": 1.920312523841858,
"reward_std": 0.8409400582313538,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.9375,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.8671875,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9203125238418579,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 10.510937690734863,
"epoch": 0.059584035975267007,
"kl": 1.3978136777877808,
"learning_rate": 9.994157303370787e-07,
"loss": 0.025388304144144058,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.703125,
"reward": 1.914843738079071,
"reward_std": 0.847051203250885,
"rewards/avg_0": 1.8671875,
"rewards/avg_1": 1.9375,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.9296875,
"rewards/avg_4": 1.8515625,
"rewards/avg_5": 1.9140625,
"rewards/avg_6": 1.8671875,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9148437678813934,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 10.57187557220459,
"epoch": 0.06070826306913996,
"kl": 1.4011179208755493,
"learning_rate": 9.994044943820224e-07,
"loss": 0.011839143931865692,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6875,
"reward": 1.8914062976837158,
"reward_std": 0.8390854001045227,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.8671875,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8914062678813934,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 10.611719131469727,
"epoch": 0.061832490163012926,
"kl": 1.4960259199142456,
"learning_rate": 9.993932584269662e-07,
"loss": 0.02140812575817108,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.578125,
"reward": 1.8828125,
"reward_std": 0.7827528119087219,
"rewards/avg_0": 1.8671875,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.8671875,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.78125,
"rewards/avg_7": 1.8671875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8828125298023224,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 10.598437786102295,
"epoch": 0.0629567172568859,
"kl": 1.549820065498352,
"learning_rate": 9.993820224719101e-07,
"loss": 0.018605422228574753,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.65625,
"reward": 1.885937511920929,
"reward_std": 0.823595255613327,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.8359375,
"rewards/avg_3": 1.8359375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.8515625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.885937511920929,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 10.602344036102295,
"epoch": 0.06408094435075885,
"kl": 1.5655131936073303,
"learning_rate": 9.993707865168538e-07,
"loss": 0.01846824586391449,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.6328125,
"reward": 1.887499988079071,
"reward_std": 0.8110823035240173,
"rewards/avg_0": 1.8515625,
"rewards/avg_1": 1.8359375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8875000178813934,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 10.494531154632568,
"epoch": 0.06520517144463181,
"kl": 1.5266385078430176,
"learning_rate": 9.993595505617977e-07,
"loss": 0.0141812264919281,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.71875,
"reward": 1.9031250476837158,
"reward_std": 0.8471269309520721,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.8515625,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9031250178813934,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 10.503906726837158,
"epoch": 0.06632939853850478,
"kl": 1.5263578295707703,
"learning_rate": 9.993483146067416e-07,
"loss": 0.014608601108193398,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.6640625,
"reward": 1.8960937857627869,
"reward_std": 0.8199012279510498,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.8515625,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8960937559604645,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 10.405468940734863,
"epoch": 0.06745362563237774,
"kl": 1.4636675119400024,
"learning_rate": 9.993370786516853e-07,
"loss": 0.017880389466881752,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.6640625,
"reward": 1.9070312976837158,
"reward_std": 0.8169863820075989,
"rewards/avg_0": 1.8671875,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.8828125,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.8671875,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.907031238079071,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 10.500781059265137,
"epoch": 0.06857785272625071,
"kl": 1.4090477228164673,
"learning_rate": 9.993258426966292e-07,
"loss": 0.006813580170273781,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.6796875,
"reward": 1.893750011920929,
"reward_std": 0.8272766470909119,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.8515625,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.8671875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.893750011920929,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 10.535937786102295,
"epoch": 0.06970207982012366,
"kl": 1.3941453099250793,
"learning_rate": 9.99314606741573e-07,
"loss": 0.028155988082289696,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.6875,
"reward": 1.9257813096046448,
"reward_std": 0.8283512890338898,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.9296875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.92578125,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 10.515625,
"epoch": 0.07082630691399663,
"kl": 1.4108628034591675,
"learning_rate": 9.993033707865167e-07,
"loss": 0.01993025653064251,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.6484375,
"reward": 1.8960937857627869,
"reward_std": 0.8207258880138397,
"rewards/avg_0": 1.8671875,
"rewards/avg_1": 1.8671875,
"rewards/avg_2": 1.8203125,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8960937559604645,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 10.526562690734863,
"epoch": 0.07195053400786959,
"kl": 1.4183465838432312,
"learning_rate": 9.992921348314606e-07,
"loss": 0.02124522626399994,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.7109375,
"reward": 1.8984375,
"reward_std": 0.8578440248966217,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.8828125,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.8359375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8984375298023224,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 10.477344036102295,
"epoch": 0.07307476110174255,
"kl": 1.4451609253883362,
"learning_rate": 9.992808988764045e-07,
"loss": 0.015427444130182266,
"ratio/all_0": 0.0546875,
"ratio/all_2": 0.640625,
"reward": 1.8750000596046448,
"reward_std": 0.8293256461620331,
"rewards/avg_0": 1.796875,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.8515625,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.875,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 10.509375095367432,
"epoch": 0.07419898819561552,
"kl": 1.483718454837799,
"learning_rate": 9.992696629213482e-07,
"loss": 0.024618471041321754,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.6875,
"reward": 1.917187511920929,
"reward_std": 0.8308260142803192,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.9375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.917187511920929,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 10.473437309265137,
"epoch": 0.07532321528948847,
"kl": 1.4856454133987427,
"learning_rate": 9.99258426966292e-07,
"loss": 0.01140589639544487,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.6328125,
"reward": 1.869531273841858,
"reward_std": 0.8212272822856903,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.8046875,
"rewards/avg_2": 1.8046875,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.8671875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8695312440395355,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 10.454687595367432,
"epoch": 0.07644744238336144,
"kl": 1.4795401096343994,
"learning_rate": 9.99247191011236e-07,
"loss": 0.014171874150633812,
"ratio/all_0": 0.0546875,
"ratio/all_2": 0.6171875,
"reward": 1.867968738079071,
"reward_std": 0.8124658763408661,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.8046875,
"rewards/avg_2": 1.8359375,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.8203125,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8679687678813934,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 10.560937881469727,
"epoch": 0.0775716694772344,
"kl": 1.4359744787216187,
"learning_rate": 9.992359550561797e-07,
"loss": 0.022417651489377022,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.734375,
"reward": 1.9195312857627869,
"reward_std": 0.857976645231247,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.8671875,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9195312559604645,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 10.557031154632568,
"epoch": 0.07869589657110736,
"kl": 1.480742871761322,
"learning_rate": 9.992247191011235e-07,
"loss": 0.019886016845703125,
"ratio/all_0": 0.0703125,
"ratio/all_2": 0.609375,
"reward": 1.865625023841858,
"reward_std": 0.8209626972675323,
"rewards/avg_0": 1.8359375,
"rewards/avg_1": 1.7734375,
"rewards/avg_2": 1.8671875,
"rewards/avg_3": 1.8359375,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.8359375,
"rewards/avg_6": 1.8359375,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8656249940395355,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 10.470312595367432,
"epoch": 0.07982012366498033,
"kl": 1.382175326347351,
"learning_rate": 9.992134831460674e-07,
"loss": 0.013301249593496323,
"ratio/all_0": 0.0546875,
"ratio/all_2": 0.6875,
"reward": 1.8789063096046448,
"reward_std": 0.8519689440727234,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.8515625,
"rewards/avg_3": 1.8203125,
"rewards/avg_4": 1.8671875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.87890625,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 10.515625,
"epoch": 0.08094435075885328,
"kl": 1.3844139575958252,
"learning_rate": 9.992022471910111e-07,
"loss": 0.00932213943451643,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.671875,
"reward": 1.8875000476837158,
"reward_std": 0.8321071863174438,
"rewards/avg_0": 1.8515625,
"rewards/avg_1": 1.8671875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.8671875,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.887499988079071,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 10.482031345367432,
"epoch": 0.08206857785272625,
"kl": 1.3557974100112915,
"learning_rate": 9.99191011235955e-07,
"loss": 0.025301288813352585,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.703125,
"reward": 1.9273437857627869,
"reward_std": 0.8360967040061951,
"rewards/avg_0": 1.9296875,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9273437559604645,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 10.583593845367432,
"epoch": 0.08319280494659921,
"kl": 1.3512595295906067,
"learning_rate": 9.99179775280899e-07,
"loss": 0.015560301020741463,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.6875,
"reward": 1.895312488079071,
"reward_std": 0.8442690074443817,
"rewards/avg_0": 1.8671875,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8953125178813934,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 10.58203125,
"epoch": 0.08431703204047218,
"kl": 1.3721051812171936,
"learning_rate": 9.991685393258426e-07,
"loss": 0.025348057970404625,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6953125,
"reward": 1.916406273841858,
"reward_std": 0.8428552448749542,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9164062440395355,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 10.527344226837158,
"epoch": 0.08544125913434514,
"kl": 1.4204409718513489,
"learning_rate": 9.991573033707865e-07,
"loss": 0.008842497132718563,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.703125,
"reward": 1.892187476158142,
"reward_std": 0.8469344079494476,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8921875357627869,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 10.510156154632568,
"epoch": 0.0865654862282181,
"kl": 1.4739211201667786,
"learning_rate": 9.991460674157304e-07,
"loss": 0.013035193085670471,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.6484375,
"reward": 1.8992187976837158,
"reward_std": 0.8030118048191071,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.8828125,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8992187678813934,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 10.525000095367432,
"epoch": 0.08768971332209106,
"kl": 1.540924608707428,
"learning_rate": 9.99134831460674e-07,
"loss": 0.010562058538198471,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.625,
"reward": 1.87109375,
"reward_std": 0.8110974431037903,
"rewards/avg_0": 1.8359375,
"rewards/avg_1": 1.8203125,
"rewards/avg_2": 1.8515625,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.8515625,
"rewards/avg_5": 1.8359375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.87109375,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 10.564844131469727,
"epoch": 0.08881394041596402,
"kl": 1.5210090279579163,
"learning_rate": 9.99123595505618e-07,
"loss": 0.023617252707481384,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.703125,
"reward": 1.908593773841858,
"reward_std": 0.8432826697826385,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.9140625,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.8671875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9085937738418579,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 10.55859375,
"epoch": 0.08993816750983699,
"kl": 1.5450841188430786,
"learning_rate": 9.991123595505618e-07,
"loss": 0.030371172353625298,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6953125,
"reward": 1.91796875,
"reward_std": 0.8362286984920502,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.9296875,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9187500178813934,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 10.520312786102295,
"epoch": 0.09106239460370995,
"kl": 1.5201088786125183,
"learning_rate": 9.991011235955055e-07,
"loss": 0.01299591176211834,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.6796875,
"reward": 1.884374976158142,
"reward_std": 0.8359851539134979,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.8851562738418579,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 10.455468654632568,
"epoch": 0.09218662169758292,
"kl": 1.4778642058372498,
"learning_rate": 9.990898876404494e-07,
"loss": 0.026645315811038017,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.7109375,
"reward": 1.909375011920929,
"reward_std": 0.8547311127185822,
"rewards/avg_0": 1.9296875,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.9140625,
"rewards/avg_6": 1.8671875,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.91015625,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 10.520312786102295,
"epoch": 0.09331084879145587,
"kl": 1.4318110346794128,
"learning_rate": 9.990786516853933e-07,
"loss": 0.02681383118033409,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.7421875,
"reward": 1.9156250357627869,
"reward_std": 0.8724653422832489,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9156250059604645,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 10.535937786102295,
"epoch": 0.09443507588532883,
"kl": 1.3569493889808655,
"learning_rate": 9.99067415730337e-07,
"loss": 0.012078452855348587,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.734375,
"reward": 1.9101563096046448,
"reward_std": 0.8564302027225494,
"rewards/avg_0": 1.8671875,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.8515625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.91015625,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 10.544531345367432,
"epoch": 0.0955593029792018,
"kl": 1.42191481590271,
"learning_rate": 9.990561797752808e-07,
"loss": 0.02844950556755066,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.7421875,
"reward": 1.934374988079071,
"reward_std": 0.8588309288024902,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.9375,
"rewards/avg_3": 1.9375,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.9140625,
"rewards/avg_6": 1.9296875,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9343750178813934,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 10.442968845367432,
"epoch": 0.09668353007307476,
"kl": 1.3797348737716675,
"learning_rate": 9.990449438202247e-07,
"loss": 0.02336825057864189,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.765625,
"reward": 1.927343726158142,
"reward_std": 0.875806987285614,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9273437857627869,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 10.498437404632568,
"epoch": 0.09780775716694773,
"kl": 1.3973508477210999,
"learning_rate": 9.990337078651684e-07,
"loss": 0.016720149666070938,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.6796875,
"reward": 1.9101563096046448,
"reward_std": 0.8276163339614868,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.8828125,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.91015625,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 10.514062881469727,
"epoch": 0.09893198426082069,
"kl": 1.412993311882019,
"learning_rate": 9.990224719101123e-07,
"loss": 0.03139276057481766,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7734375,
"reward": 1.932031273841858,
"reward_std": 0.8831829130649567,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9320312440395355,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 10.442968845367432,
"epoch": 0.10005621135469364,
"kl": 1.3898016214370728,
"learning_rate": 9.990112359550562e-07,
"loss": 0.024522768333554268,
"ratio/all_0": 0.0,
"ratio/all_2": 0.7890625,
"reward": 1.9406250715255737,
"reward_std": 0.8826980292797089,
"rewards/avg_0": 1.9296875,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.9375,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.9375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.940625011920929,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 10.457812786102295,
"epoch": 0.10118043844856661,
"kl": 1.413306713104248,
"learning_rate": 9.989999999999999e-07,
"loss": 0.020547227934002876,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.6796875,
"reward": 1.917187511920929,
"reward_std": 0.8212399482727051,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.8671875,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.91796875,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 10.63671875,
"epoch": 0.10230466554243957,
"kl": 1.4090213179588318,
"learning_rate": 9.989887640449438e-07,
"loss": 0.016374479979276657,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.6796875,
"reward": 1.874218761920929,
"reward_std": 0.8512181341648102,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.8359375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.874218761920929,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 10.546875,
"epoch": 0.10342889263631254,
"kl": 1.3873103857040405,
"learning_rate": 9.989775280898877e-07,
"loss": 0.007781813386827707,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.671875,
"reward": 1.881250023841858,
"reward_std": 0.8377176225185394,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.8203125,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.8671875,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8812499940395355,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 10.610156536102295,
"epoch": 0.1045531197301855,
"kl": 1.4300039410591125,
"learning_rate": 9.989662921348313e-07,
"loss": 0.012583991512656212,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.6328125,
"reward": 1.875,
"reward_std": 0.8191207945346832,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.8359375,
"rewards/avg_2": 1.8515625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.8203125,
"rewards/avg_6": 1.8515625,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8750000298023224,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 10.410156726837158,
"epoch": 0.10567734682405847,
"kl": 1.4425511360168457,
"learning_rate": 9.989550561797752e-07,
"loss": 0.030106104910373688,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.703125,
"reward": 1.9054688215255737,
"reward_std": 0.8589926362037659,
"rewards/avg_0": 1.8515625,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.8671875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.905468761920929,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 10.420312881469727,
"epoch": 0.10680157391793142,
"kl": 1.45794278383255,
"learning_rate": 9.98943820224719e-07,
"loss": 0.027616795152425766,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.71875,
"reward": 1.9187500476837158,
"reward_std": 0.8559161722660065,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.918749988079071,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 10.521874904632568,
"epoch": 0.10792580101180438,
"kl": 1.465618371963501,
"learning_rate": 9.989325842696628e-07,
"loss": 0.02687779814004898,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.703125,
"reward": 1.9140625596046448,
"reward_std": 0.8488925099372864,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.9375,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.8671875,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9140625,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 10.46484375,
"epoch": 0.10905002810567735,
"kl": 1.4335193037986755,
"learning_rate": 9.989213483146067e-07,
"loss": 0.02943534404039383,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6953125,
"reward": 1.9195312857627869,
"reward_std": 0.8390406370162964,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9195312559604645,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 10.596875190734863,
"epoch": 0.1101742551995503,
"kl": 1.3374882340431213,
"learning_rate": 9.989101123595504e-07,
"loss": 0.0237848162651062,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.75,
"reward": 1.9343750476837158,
"reward_std": 0.8592123687267303,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.9453125,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.9296875,
"rewards/avg_6": 1.9453125,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9343750178813934,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 10.535937786102295,
"epoch": 0.11129848229342328,
"kl": 1.3550831079483032,
"learning_rate": 9.988988764044943e-07,
"loss": 0.024313587695360184,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.7109375,
"reward": 1.916406273841858,
"reward_std": 0.848257303237915,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.8671875,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.9296875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9164062738418579,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 10.461719036102295,
"epoch": 0.11242270938729623,
"kl": 1.4057585000991821,
"learning_rate": 9.988876404494382e-07,
"loss": 0.007561494130641222,
"ratio/all_0": 0.0546875,
"ratio/all_2": 0.65625,
"reward": 1.8687500357627869,
"reward_std": 0.8348900377750397,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.828125,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.8359375,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.8515625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8687500059604645,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 10.51171875,
"epoch": 0.1135469364811692,
"kl": 1.315906286239624,
"learning_rate": 9.988764044943818e-07,
"loss": 0.013327661901712418,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.7421875,
"reward": 1.920312523841858,
"reward_std": 0.8560033440589905,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9203124940395355,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 10.456250190734863,
"epoch": 0.11467116357504216,
"kl": 1.409066081047058,
"learning_rate": 9.988651685393257e-07,
"loss": 0.014456999488174915,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.6640625,
"reward": 1.882031261920929,
"reward_std": 0.835316926240921,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.8359375,
"rewards/avg_2": 1.8515625,
"rewards/avg_3": 1.8515625,
"rewards/avg_4": 1.8671875,
"rewards/avg_5": 1.8515625,
"rewards/avg_6": 1.828125,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.8828125,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 10.49765682220459,
"epoch": 0.11579539066891512,
"kl": 1.4170143604278564,
"learning_rate": 9.988539325842696e-07,
"loss": 0.016626989468932152,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.65625,
"reward": 1.889843761920929,
"reward_std": 0.829914778470993,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.8359375,
"rewards/avg_2": 1.8671875,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.8671875,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.8671875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.889843761920929,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 10.4921875,
"epoch": 0.11691961776278809,
"kl": 1.4739782810211182,
"learning_rate": 9.988426966292133e-07,
"loss": 0.018051698803901672,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.6484375,
"reward": 1.8843750357627869,
"reward_std": 0.8223588466644287,
"rewards/avg_0": 1.8515625,
"rewards/avg_1": 1.8671875,
"rewards/avg_2": 1.8671875,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.8515625,
"rewards/avg_5": 1.8515625,
"rewards/avg_6": 1.8515625,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8843750059604645,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 10.5546875,
"epoch": 0.11804384485666104,
"kl": 1.4696778655052185,
"learning_rate": 9.988314606741572e-07,
"loss": 0.02195235900580883,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.703125,
"reward": 1.920312523841858,
"reward_std": 0.8361712098121643,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.9375,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9203125238418579,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 10.5859375,
"epoch": 0.11916807195053401,
"kl": 1.4029964208602905,
"learning_rate": 9.98820224719101e-07,
"loss": 0.018382327631115913,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.6953125,
"reward": 1.916406273841858,
"reward_std": 0.8307805061340332,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.9296875,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9164062738418579,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 10.575000286102295,
"epoch": 0.12029229904440697,
"kl": 1.4130637645721436,
"learning_rate": 9.988089887640448e-07,
"loss": 0.020427603274583817,
"ratio/all_0": 0.0,
"ratio/all_2": 0.7578125,
"reward": 1.932812511920929,
"reward_std": 0.861215353012085,
"rewards/avg_0": 1.9296875,
"rewards/avg_1": 1.9375,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.9453125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.932812511920929,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 10.534375190734863,
"epoch": 0.12141652613827993,
"kl": 1.3674690127372742,
"learning_rate": 9.987977528089886e-07,
"loss": 0.011559784412384033,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.71875,
"reward": 1.8882812857627869,
"reward_std": 0.862981528043747,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.8515625,
"rewards/avg_3": 1.8515625,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.8515625,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8882812559604645,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 10.5390625,
"epoch": 0.1225407532321529,
"kl": 1.3945258855819702,
"learning_rate": 9.987865168539325e-07,
"loss": 0.03130093589425087,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.7578125,
"reward": 1.927343726158142,
"reward_std": 0.8774406015872955,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.9296875,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9273437857627869,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 10.492969036102295,
"epoch": 0.12366498032602585,
"kl": 1.3998687267303467,
"learning_rate": 9.987752808988762e-07,
"loss": 0.021291855722665787,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6640625,
"reward": 1.90625,
"reward_std": 0.8211067318916321,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9062500298023224,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 10.427343845367432,
"epoch": 0.12478920741989882,
"kl": 1.3501304984092712,
"learning_rate": 9.9876404494382e-07,
"loss": 0.011759497225284576,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.734375,
"reward": 1.9078125357627869,
"reward_std": 0.8611572980880737,
"rewards/avg_0": 1.8671875,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.8671875,
"rewards/avg_7": 1.8671875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9078125059604645,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 10.4375,
"epoch": 0.1259134345137718,
"kl": 1.4030646681785583,
"learning_rate": 9.98752808988764e-07,
"loss": 0.029689906165003777,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.734375,
"reward": 1.916406273841858,
"reward_std": 0.8714778125286102,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.9140625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.917187511920929,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 10.514062881469727,
"epoch": 0.12703766160764474,
"kl": 1.4460313320159912,
"learning_rate": 9.987415730337079e-07,
"loss": 0.012388680130243301,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.65625,
"reward": 1.9078125357627869,
"reward_std": 0.8054125308990479,
"rewards/avg_0": 1.8671875,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.8515625,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9078125059604645,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 10.461719036102295,
"epoch": 0.1281618887015177,
"kl": 1.487072765827179,
"learning_rate": 9.987303370786516e-07,
"loss": 0.017670128494501114,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6328125,
"reward": 1.892968773841858,
"reward_std": 0.8094190359115601,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.8359375,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.8515625,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8929687738418579,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 10.473437309265137,
"epoch": 0.12928611579539068,
"kl": 1.5192728638648987,
"learning_rate": 9.987191011235955e-07,
"loss": 0.02703724056482315,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.75,
"reward": 1.9304687976837158,
"reward_std": 0.8568115830421448,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.9375,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.9296875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9304687678813934,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 10.421875476837158,
"epoch": 0.13041034288926362,
"kl": 1.5342833995819092,
"learning_rate": 9.987078651685393e-07,
"loss": 0.03299251198768616,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.71875,
"reward": 1.9234375357627869,
"reward_std": 0.8558132350444794,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9234375059604645,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 10.435156345367432,
"epoch": 0.1315345699831366,
"kl": 1.4555895328521729,
"learning_rate": 9.98696629213483e-07,
"loss": 0.027663029730319977,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.75,
"reward": 1.9265625476837158,
"reward_std": 0.8661331236362457,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.926562488079071,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 10.500781059265137,
"epoch": 0.13265879707700956,
"kl": 1.3940675258636475,
"learning_rate": 9.98685393258427e-07,
"loss": 0.03329860046505928,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.8046875,
"reward": 1.9312500357627869,
"reward_std": 0.9077420830726624,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.9296875,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9312500059604645,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 10.550000190734863,
"epoch": 0.13378302417088253,
"kl": 1.4418742656707764,
"learning_rate": 9.986741573033708e-07,
"loss": 0.011464491486549377,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.703125,
"reward": 1.8992187976837158,
"reward_std": 0.842133492231369,
"rewards/avg_0": 1.8671875,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.8515625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.8515625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.899218738079071,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 10.529687881469727,
"epoch": 0.13490725126475547,
"kl": 1.3481199145317078,
"learning_rate": 9.986629213483145e-07,
"loss": 0.02446187473833561,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.7734375,
"reward": 1.9187500476837158,
"reward_std": 0.8890580832958221,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9187500178813934,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 10.51171875,
"epoch": 0.13603147835862844,
"kl": 1.3256433010101318,
"learning_rate": 9.986516853932584e-07,
"loss": 0.03311225771903992,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.7109375,
"reward": 1.942968726158142,
"reward_std": 0.8395049273967743,
"rewards/avg_0": 1.9296875,
"rewards/avg_1": 1.9375,
"rewards/avg_2": 1.953125,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.953125,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9437500238418579,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 10.56796932220459,
"epoch": 0.13715570545250141,
"kl": 1.4150030016899109,
"learning_rate": 9.986404494382023e-07,
"loss": 0.019649198278784752,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.71875,
"reward": 1.893750011920929,
"reward_std": 0.8653823733329773,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.8671875,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.8671875,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.8515625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.893750011920929,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 10.499218940734863,
"epoch": 0.13827993254637436,
"kl": 1.4379026293754578,
"learning_rate": 9.98629213483146e-07,
"loss": 0.024776604026556015,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7265625,
"reward": 1.91796875,
"reward_std": 0.855737566947937,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9187500178813934,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 10.416406631469727,
"epoch": 0.13940415964024733,
"kl": 1.5306707620620728,
"learning_rate": 9.986179775280898e-07,
"loss": 0.04271028935909271,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.84375,
"reward": 1.944531261920929,
"reward_std": 0.9272224605083466,
"rewards/avg_0": 1.953125,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.9609375,
"rewards/avg_4": 1.9296875,
"rewards/avg_5": 1.9296875,
"rewards/avg_6": 1.9609375,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.944531261920929,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 10.608593940734863,
"epoch": 0.1405283867341203,
"kl": 1.485119104385376,
"learning_rate": 9.986067415730337e-07,
"loss": 0.015734048560261726,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.703125,
"reward": 1.8828125,
"reward_std": 0.8565918803215027,
"rewards/avg_0": 1.8359375,
"rewards/avg_1": 1.8203125,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.8671875,
"rewards/avg_5": 1.8359375,
"rewards/avg_6": 1.8671875,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8828125298023224,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 10.492969036102295,
"epoch": 0.14165261382799327,
"kl": 1.4620481133460999,
"learning_rate": 9.985955056179774e-07,
"loss": 0.03640381991863251,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7890625,
"reward": 1.9382812976837158,
"reward_std": 0.8856255412101746,
"rewards/avg_0": 1.9296875,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.9453125,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.9375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9382812678813934,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 10.580468654632568,
"epoch": 0.1427768409218662,
"kl": 1.4295486211776733,
"learning_rate": 9.985842696629213e-07,
"loss": 0.02829963155090809,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7421875,
"reward": 1.925000011920929,
"reward_std": 0.8635706603527069,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.9296875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.925000011920929,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 10.490624904632568,
"epoch": 0.14390106801573918,
"kl": 1.3682859539985657,
"learning_rate": 9.985730337078652e-07,
"loss": 0.026628486812114716,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.71875,
"reward": 1.932031273841858,
"reward_std": 0.8417931199073792,
"rewards/avg_0": 1.8671875,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.9296875,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.953125,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.9296875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9320312440395355,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 10.538281440734863,
"epoch": 0.14502529510961215,
"kl": 1.341777503490448,
"learning_rate": 9.985617977528089e-07,
"loss": 0.02736259251832962,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.7421875,
"reward": 1.94140625,
"reward_std": 0.8543242514133453,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.9296875,
"rewards/avg_2": 1.9453125,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.9296875,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.9453125,
"rewards/avg_7": 1.9375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9414062798023224,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 10.537499904632568,
"epoch": 0.1461495222034851,
"kl": 1.3316110372543335,
"learning_rate": 9.985505617977528e-07,
"loss": 0.02057470940053463,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.75,
"reward": 1.93359375,
"reward_std": 0.8592420816421509,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.9296875,
"rewards/avg_3": 1.9296875,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.9609375,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9335937798023224,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 10.550000190734863,
"epoch": 0.14727374929735806,
"kl": 1.3152986764907837,
"learning_rate": 9.985393258426966e-07,
"loss": 0.019020576030015945,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.71875,
"reward": 1.928906261920929,
"reward_std": 0.8400276005268097,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.9453125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.928906261920929,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 10.494531154632568,
"epoch": 0.14839797639123103,
"kl": 1.3366295099258423,
"learning_rate": 9.985280898876403e-07,
"loss": 0.027881566435098648,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.7265625,
"reward": 1.9117187857627869,
"reward_std": 0.8693137466907501,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.8671875,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9117187559604645,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 10.592187881469727,
"epoch": 0.14952220348510398,
"kl": 1.3480384349822998,
"learning_rate": 9.985168539325842e-07,
"loss": 0.013358078896999359,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7109375,
"reward": 1.903124988079071,
"reward_std": 0.8528032302856445,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9039062857627869,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 10.603125095367432,
"epoch": 0.15064643057897695,
"kl": 1.384570598602295,
"learning_rate": 9.985056179775281e-07,
"loss": 0.016048742458224297,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.703125,
"reward": 1.909375011920929,
"reward_std": 0.8402921855449677,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.8515625,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.909375011920929,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 10.504687786102295,
"epoch": 0.15177065767284992,
"kl": 1.4957668781280518,
"learning_rate": 9.984943820224718e-07,
"loss": 0.022608522325754166,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.75,
"reward": 1.9148437976837158,
"reward_std": 0.8707560300827026,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9148437678813934,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 10.506250381469727,
"epoch": 0.1528948847667229,
"kl": 1.49778151512146,
"learning_rate": 9.984831460674157e-07,
"loss": 0.025695206597447395,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.7265625,
"reward": 1.9015625715255737,
"reward_std": 0.8657637238502502,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.9140625,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.901562511920929,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 10.603906631469727,
"epoch": 0.15401911186059583,
"kl": 1.4697266817092896,
"learning_rate": 9.984719101123596e-07,
"loss": 0.03311506658792496,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7109375,
"reward": 1.928906261920929,
"reward_std": 0.841498851776123,
"rewards/avg_0": 1.9375,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.9296875,
"rewards/avg_6": 1.8671875,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.928906261920929,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 10.595312595367432,
"epoch": 0.1551433389544688,
"kl": 1.5000845193862915,
"learning_rate": 9.984606741573032e-07,
"loss": 0.03179492428898811,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.7109375,
"reward": 1.910937488079071,
"reward_std": 0.8583005368709564,
"rewards/avg_0": 1.8671875,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9109375178813934,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 10.467187881469727,
"epoch": 0.15626756604834177,
"kl": 1.492616891860962,
"learning_rate": 9.984494382022471e-07,
"loss": 0.0324878990650177,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.6796875,
"reward": 1.9132813215255737,
"reward_std": 0.8348826766014099,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9140625,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 10.553906440734863,
"epoch": 0.15739179314221471,
"kl": 1.486423134803772,
"learning_rate": 9.98438202247191e-07,
"loss": 0.02975376322865486,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.734375,
"reward": 1.924218773841858,
"reward_std": 0.8577986359596252,
"rewards/avg_0": 1.9296875,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.8828125,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9242187738418579,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 10.591406345367432,
"epoch": 0.15851602023608768,
"kl": 1.4460753798484802,
"learning_rate": 9.984269662921347e-07,
"loss": 0.01627442240715027,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.65625,
"reward": 1.91015625,
"reward_std": 0.8073700666427612,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9101562798023224,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 10.568749904632568,
"epoch": 0.15964024732996066,
"kl": 1.4809756875038147,
"learning_rate": 9.984157303370786e-07,
"loss": 0.0247793085873127,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.7265625,
"reward": 1.923437476158142,
"reward_std": 0.8487512767314911,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.9140625,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9242187738418579,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 10.45703125,
"epoch": 0.16076447442383363,
"kl": 1.5039972066879272,
"learning_rate": 9.984044943820225e-07,
"loss": 0.03990985080599785,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.71875,
"reward": 1.920312523841858,
"reward_std": 0.862554669380188,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.9375,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9203124940395355,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 10.50546932220459,
"epoch": 0.16188870151770657,
"kl": 1.4829599261283875,
"learning_rate": 9.983932584269662e-07,
"loss": 0.023315520957112312,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.6875,
"reward": 1.8984375596046448,
"reward_std": 0.8472589552402496,
"rewards/avg_0": 1.8671875,
"rewards/avg_1": 1.8515625,
"rewards/avg_2": 1.8671875,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8984375,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 10.517969131469727,
"epoch": 0.16301292861157954,
"kl": 1.439881980419159,
"learning_rate": 9.9838202247191e-07,
"loss": 0.0387931689620018,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.796875,
"reward": 1.9531250596046448,
"reward_std": 0.8867897987365723,
"rewards/avg_0": 1.9453125,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.953125,
"rewards/avg_3": 1.953125,
"rewards/avg_4": 1.9453125,
"rewards/avg_5": 1.9453125,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.9453125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.953125,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 10.51171875,
"epoch": 0.1641371557054525,
"kl": 1.4910458326339722,
"learning_rate": 9.98370786516854e-07,
"loss": 0.034083664417266846,
"ratio/all_0": 0.0546875,
"ratio/all_2": 0.734375,
"reward": 1.907812476158142,
"reward_std": 0.8765862584114075,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.8671875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9078125357627869,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 10.510156154632568,
"epoch": 0.16526138279932545,
"kl": 1.507016360759735,
"learning_rate": 9.983595505617976e-07,
"loss": 0.02047543413937092,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7109375,
"reward": 1.9078125357627869,
"reward_std": 0.8427807092666626,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.9140625,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9078125059604645,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 10.460156440734863,
"epoch": 0.16638560989319842,
"kl": 1.5077899098396301,
"learning_rate": 9.983483146067415e-07,
"loss": 0.012134671211242676,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.6328125,
"reward": 1.8726562857627869,
"reward_std": 0.8212587535381317,
"rewards/avg_0": 1.8671875,
"rewards/avg_1": 1.8203125,
"rewards/avg_2": 1.8359375,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.8125,
"rewards/avg_5": 1.8671875,
"rewards/avg_6": 1.8203125,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8726562559604645,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 10.539843559265137,
"epoch": 0.1675098369870714,
"kl": 1.411302089691162,
"learning_rate": 9.983370786516854e-07,
"loss": 0.02341485023498535,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.765625,
"reward": 1.9312500357627869,
"reward_std": 0.8712870478630066,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9312500059604645,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 10.496875286102295,
"epoch": 0.16863406408094436,
"kl": 1.4035818576812744,
"learning_rate": 9.98325842696629e-07,
"loss": 0.01982831582427025,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.7109375,
"reward": 1.925000011920929,
"reward_std": 0.8388589024543762,
"rewards/avg_0": 1.9296875,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.92578125,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 10.400000095367432,
"epoch": 0.1697582911748173,
"kl": 1.44032484292984,
"learning_rate": 9.98314606741573e-07,
"loss": 0.023583440110087395,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.7421875,
"reward": 1.90234375,
"reward_std": 0.8744505941867828,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.8671875,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.8359375,
"rewards/avg_7": 1.8671875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9023437798023224,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 10.638281345367432,
"epoch": 0.17088251826869028,
"kl": 1.4179207682609558,
"learning_rate": 9.983033707865169e-07,
"loss": 0.028874140232801437,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.734375,
"reward": 1.926562488079071,
"reward_std": 0.8614144921302795,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.9140625,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9273437857627869,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 10.497656345367432,
"epoch": 0.17200674536256325,
"kl": 1.4119802713394165,
"learning_rate": 9.982921348314606e-07,
"loss": 0.02332199364900589,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.71875,
"reward": 1.9234375357627869,
"reward_std": 0.8475228846073151,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.9296875,
"rewards/avg_3": 1.9453125,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.8671875,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9242187440395355,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 10.483593940734863,
"epoch": 0.1731309724564362,
"kl": 1.447148084640503,
"learning_rate": 9.982808988764044e-07,
"loss": 0.030631529167294502,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.734375,
"reward": 1.9351562857627869,
"reward_std": 0.8493313789367676,
"rewards/avg_0": 1.9375,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.9375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9351562559604645,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 10.414844036102295,
"epoch": 0.17425519955030916,
"kl": 1.4554988145828247,
"learning_rate": 9.982696629213483e-07,
"loss": 0.023993976414203644,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.71875,
"reward": 1.9156250357627869,
"reward_std": 0.852101594209671,
"rewards/avg_0": 1.8671875,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.9296875,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9156250059604645,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 10.483593940734863,
"epoch": 0.17537942664418213,
"kl": 1.433959722518921,
"learning_rate": 9.982584269662922e-07,
"loss": 0.026601165533065796,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.6953125,
"reward": 1.921093761920929,
"reward_std": 0.833401083946228,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.921093761920929,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 10.534375190734863,
"epoch": 0.1765036537380551,
"kl": 1.4553956389427185,
"learning_rate": 9.98247191011236e-07,
"loss": 0.03449885919690132,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.78125,
"reward": 1.935937523841858,
"reward_std": 0.8834026753902435,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.9296875,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.9296875,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.9375,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9359374940395355,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 10.535937786102295,
"epoch": 0.17762788083192804,
"kl": 1.4700213074684143,
"learning_rate": 9.982359550561798e-07,
"loss": 0.015104904770851135,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7265625,
"reward": 1.9039062857627869,
"reward_std": 0.8533373475074768,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.8515625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9039062559604645,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 10.586719036102295,
"epoch": 0.178752107925801,
"kl": 1.4438948035240173,
"learning_rate": 9.982247191011237e-07,
"loss": 0.02373446524143219,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.7265625,
"reward": 1.9078125357627869,
"reward_std": 0.8605807721614838,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.8671875,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9078125059604645,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 10.5390625,
"epoch": 0.17987633501967398,
"kl": 1.459674894809723,
"learning_rate": 9.982134831460674e-07,
"loss": 0.025322623550891876,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.7421875,
"reward": 1.9218750596046448,
"reward_std": 0.863748699426651,
"rewards/avg_0": 1.8671875,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.9296875,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.921875,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 10.46640682220459,
"epoch": 0.18100056211354693,
"kl": 1.4958550333976746,
"learning_rate": 9.982022471910113e-07,
"loss": 0.015952421352267265,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7109375,
"reward": 1.900781273841858,
"reward_std": 0.8476106822490692,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.8515625,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.8515625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9007812738418579,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 10.486719131469727,
"epoch": 0.1821247892074199,
"kl": 1.4854081273078918,
"learning_rate": 9.981910112359551e-07,
"loss": 0.008778873831033707,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.609375,
"reward": 1.8882812857627869,
"reward_std": 0.7877383232116699,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.8515625,
"rewards/avg_2": 1.8515625,
"rewards/avg_3": 1.8203125,
"rewards/avg_4": 1.8359375,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.8515625,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.8890624940395355,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 10.478906631469727,
"epoch": 0.18324901630129287,
"kl": 1.5270226001739502,
"learning_rate": 9.981797752808988e-07,
"loss": 0.025792792439460754,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.75,
"reward": 1.9078125357627869,
"reward_std": 0.8755120933055878,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.8828125,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9078125059604645,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 10.463281154632568,
"epoch": 0.18437324339516584,
"kl": 1.5529326796531677,
"learning_rate": 9.981685393258427e-07,
"loss": 0.030110936611890793,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.7734375,
"reward": 1.9296875,
"reward_std": 0.8803807497024536,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.9296875,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.9296875,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9304687678813934,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 10.582031726837158,
"epoch": 0.18549747048903878,
"kl": 1.6017058491706848,
"learning_rate": 9.981573033707866e-07,
"loss": 0.018820036202669144,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.6953125,
"reward": 1.8632813096046448,
"reward_std": 0.8700221478939056,
"rewards/avg_0": 1.8046875,
"rewards/avg_1": 1.8359375,
"rewards/avg_2": 1.8671875,
"rewards/avg_3": 1.8203125,
"rewards/avg_4": 1.8046875,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.86328125,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 10.49609375,
"epoch": 0.18662169758291175,
"kl": 1.5901198387145996,
"learning_rate": 9.981460674157303e-07,
"loss": 0.01749960146844387,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.6796875,
"reward": 1.9039062857627869,
"reward_std": 0.8250192999839783,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.9140625,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9046874940395355,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 10.495312690734863,
"epoch": 0.18774592467678472,
"kl": 1.6122857332229614,
"learning_rate": 9.981348314606742e-07,
"loss": 0.024158619344234467,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6953125,
"reward": 1.899999976158142,
"reward_std": 0.8434443175792694,
"rewards/avg_0": 1.8515625,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9000000357627869,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 10.47265625,
"epoch": 0.18887015177065766,
"kl": 1.7152747511863708,
"learning_rate": 9.98123595505618e-07,
"loss": 0.048164140433073044,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.765625,
"reward": 1.938281238079071,
"reward_std": 0.8790540397167206,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.9453125,
"rewards/avg_2": 1.9453125,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9390625059604645,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 10.390625,
"epoch": 0.18999437886453063,
"kl": 1.6773305535316467,
"learning_rate": 9.981123595505617e-07,
"loss": 0.010047555901110172,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.609375,
"reward": 1.877343773841858,
"reward_std": 0.788997232913971,
"rewards/avg_0": 1.8671875,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.8359375,
"rewards/avg_3": 1.78125,
"rewards/avg_4": 1.8515625,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.8515625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8773437738418579,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 10.450781345367432,
"epoch": 0.1911186059584036,
"kl": 1.6623247861862183,
"learning_rate": 9.981011235955056e-07,
"loss": 0.03203898295760155,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.7421875,
"reward": 1.9226562976837158,
"reward_std": 0.8614811599254608,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.9296875,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.9296875,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9226562678813934,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 10.44921875,
"epoch": 0.19224283305227655,
"kl": 1.6179015636444092,
"learning_rate": 9.980898876404495e-07,
"loss": 0.036546383053064346,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.7421875,
"reward": 1.9312500357627869,
"reward_std": 0.8603616058826447,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.9296875,
"rewards/avg_4": 1.9296875,
"rewards/avg_5": 1.9140625,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9320312738418579,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 10.635937690734863,
"epoch": 0.19336706014614952,
"kl": 1.5302655696868896,
"learning_rate": 9.980786516853932e-07,
"loss": 0.03663820028305054,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.7421875,
"reward": 1.9429687857627869,
"reward_std": 0.8540288507938385,
"rewards/avg_0": 1.9296875,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.9453125,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.9140625,
"rewards/avg_6": 1.9609375,
"rewards/avg_7": 1.9453125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9429687559604645,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 10.55078125,
"epoch": 0.1944912872400225,
"kl": 1.6461214423179626,
"learning_rate": 9.98067415730337e-07,
"loss": 0.029548870399594307,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.703125,
"reward": 1.9140625596046448,
"reward_std": 0.8425402939319611,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.8671875,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9148437678813934,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 10.571094036102295,
"epoch": 0.19561551433389546,
"kl": 1.5366535782814026,
"learning_rate": 9.98056179775281e-07,
"loss": 0.030620839446783066,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.71875,
"reward": 1.918749988079071,
"reward_std": 0.8569993078708649,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9195312857627869,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 10.46875,
"epoch": 0.1967397414277684,
"kl": 1.5700982213020325,
"learning_rate": 9.980449438202247e-07,
"loss": 0.02145414985716343,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.7109375,
"reward": 1.892968773841858,
"reward_std": 0.8565924763679504,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.8515625,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.8671875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8929687440395355,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 10.450000286102295,
"epoch": 0.19786396852164137,
"kl": 1.6610383987426758,
"learning_rate": 9.980337078651686e-07,
"loss": 0.028878550976514816,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.6796875,
"reward": 1.8984375596046448,
"reward_std": 0.8399778306484222,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.8671875,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.8992187678813934,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 10.414844036102295,
"epoch": 0.19898819561551434,
"kl": 1.5835221409797668,
"learning_rate": 9.980224719101124e-07,
"loss": 0.03403393179178238,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.7421875,
"reward": 1.9312500357627869,
"reward_std": 0.8549413084983826,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.9375,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.9296875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9320312738418579,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 10.460156440734863,
"epoch": 0.20011242270938728,
"kl": 1.5369529128074646,
"learning_rate": 9.980112359550561e-07,
"loss": 0.03677598387002945,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.75,
"reward": 1.918749988079071,
"reward_std": 0.8851003050804138,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984374940395355,
"rewards/point_reward": 0.9203125238418579,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 10.396875381469727,
"epoch": 0.20123664980326025,
"kl": 1.5669890642166138,
"learning_rate": 9.98e-07,
"loss": 0.022701524198055267,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.6640625,
"reward": 1.885937511920929,
"reward_std": 0.8360760807991028,
"rewards/avg_0": 1.8515625,
"rewards/avg_1": 1.8515625,
"rewards/avg_2": 1.8125,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.88671875,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 10.587500095367432,
"epoch": 0.20236087689713322,
"kl": 1.5295946598052979,
"learning_rate": 9.97988764044944e-07,
"loss": 0.029509663581848145,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.7265625,
"reward": 1.913281261920929,
"reward_std": 0.8641042709350586,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9140625,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 10.464844226837158,
"epoch": 0.2034851039910062,
"kl": 1.4037991166114807,
"learning_rate": 9.979775280898876e-07,
"loss": 0.02910599112510681,
"ratio/all_0": 0.0,
"ratio/all_2": 0.7421875,
"reward": 1.9453125,
"reward_std": 0.8485521972179413,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.9453125,
"rewards/avg_3": 1.9453125,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.9453125,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.9453125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9453125,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 10.485156536102295,
"epoch": 0.20460933108487914,
"kl": 1.392296850681305,
"learning_rate": 9.979662921348315e-07,
"loss": 0.03196254372596741,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.7109375,
"reward": 1.9421875476837158,
"reward_std": 0.8366577923297882,
"rewards/avg_0": 1.9296875,
"rewards/avg_1": 1.9375,
"rewards/avg_2": 1.9296875,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.9296875,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9976562559604645,
"rewards/point_reward": 0.944531261920929,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 10.508593559265137,
"epoch": 0.2057335581787521,
"kl": 1.431138515472412,
"learning_rate": 9.979550561797754e-07,
"loss": 0.020911136642098427,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.734375,
"reward": 1.9109375476837158,
"reward_std": 0.8656867742538452,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.8828125,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.8671875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9117187559604645,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 10.397656440734863,
"epoch": 0.20685778527262508,
"kl": 1.4381036162376404,
"learning_rate": 9.97943820224719e-07,
"loss": 0.026089351624250412,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.6875,
"reward": 1.9179688096046448,
"reward_std": 0.8316961228847504,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.8671875,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.918749988079071,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 10.519531726837158,
"epoch": 0.20798201236649802,
"kl": 1.5049086213111877,
"learning_rate": 9.97932584269663e-07,
"loss": 0.020233262330293655,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.6796875,
"reward": 1.9179688096046448,
"reward_std": 0.8203654885292053,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.8671875,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9187500178813934,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 10.432812690734863,
"epoch": 0.209106239460371,
"kl": 1.4891046285629272,
"learning_rate": 9.979213483146068e-07,
"loss": 0.015585238113999367,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.7578125,
"reward": 1.9117187857627869,
"reward_std": 0.8686578571796417,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.8671875,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9124999940395355,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 10.551562786102295,
"epoch": 0.21023046655424396,
"kl": 1.4672995805740356,
"learning_rate": 9.979101123595505e-07,
"loss": 0.02163076400756836,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.6640625,
"reward": 1.8953125476837158,
"reward_std": 0.8324027061462402,
"rewards/avg_0": 1.8515625,
"rewards/avg_1": 1.8671875,
"rewards/avg_2": 1.8828125,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.8671875,
"rewards/avg_7": 1.8671875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.895312488079071,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 10.550000190734863,
"epoch": 0.21135469364811693,
"kl": 1.4560156464576721,
"learning_rate": 9.978988764044944e-07,
"loss": 0.027444355189800262,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.71875,
"reward": 1.928125023841858,
"reward_std": 0.8456373810768127,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.9453125,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9281249940395355,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 10.555469036102295,
"epoch": 0.21247892074198987,
"kl": 1.3982338905334473,
"learning_rate": 9.978876404494383e-07,
"loss": 0.03762711584568024,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.7578125,
"reward": 1.930468738079071,
"reward_std": 0.881901741027832,
"rewards/avg_0": 1.9375,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.9375,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9304687678813934,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 10.504687786102295,
"epoch": 0.21360314783586284,
"kl": 1.4209675788879395,
"learning_rate": 9.97876404494382e-07,
"loss": 0.019087618216872215,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.7109375,
"reward": 1.90234375,
"reward_std": 0.8551211357116699,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.8515625,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.90234375,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 10.585156440734863,
"epoch": 0.21472737492973581,
"kl": 1.372999668121338,
"learning_rate": 9.978651685393259e-07,
"loss": 0.027694687247276306,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7578125,
"reward": 1.927343726158142,
"reward_std": 0.876830667257309,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.9140625,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9281250238418579,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 10.588281154632568,
"epoch": 0.21585160202360876,
"kl": 1.462668538093567,
"learning_rate": 9.978539325842695e-07,
"loss": 0.02637871727347374,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.75,
"reward": 1.9296875,
"reward_std": 0.8622894585132599,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9296875298023224,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 10.403125286102295,
"epoch": 0.21697582911748173,
"kl": 1.453848421573639,
"learning_rate": 9.978426966292134e-07,
"loss": 0.04470495507121086,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.8359375,
"reward": 1.9585937857627869,
"reward_std": 0.9128505885601044,
"rewards/avg_0": 1.9296875,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.9609375,
"rewards/avg_3": 1.9609375,
"rewards/avg_4": 1.9609375,
"rewards/avg_5": 1.9609375,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.953125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9585937559604645,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 10.55859375,
"epoch": 0.2181000562113547,
"kl": 1.548885464668274,
"learning_rate": 9.978314606741573e-07,
"loss": 0.03591137379407883,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.734375,
"reward": 1.9117187857627869,
"reward_std": 0.8763015270233154,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.8671875,
"rewards/avg_7": 1.8671875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9124999940395355,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 10.505468845367432,
"epoch": 0.21922428330522767,
"kl": 1.6363706588745117,
"learning_rate": 9.97820224719101e-07,
"loss": 0.03188881278038025,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.734375,
"reward": 1.9171875715255737,
"reward_std": 0.8619976937770844,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984374940395355,
"rewards/point_reward": 0.9187500178813934,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 10.568749904632568,
"epoch": 0.2203485103991006,
"kl": 1.5860892534255981,
"learning_rate": 9.978089887640449e-07,
"loss": 0.028511447831988335,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6796875,
"reward": 1.9078125357627869,
"reward_std": 0.8347568511962891,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.8828125,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.8671875,
"rewards/avg_5": 1.8671875,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9078125059604645,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 10.498437881469727,
"epoch": 0.22147273749297358,
"kl": 1.5308297872543335,
"learning_rate": 9.977977528089888e-07,
"loss": 0.026905380189418793,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.71875,
"reward": 1.907031238079071,
"reward_std": 0.8604941964149475,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.8671875,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9070312678813934,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 10.496875286102295,
"epoch": 0.22259696458684655,
"kl": 1.5878414511680603,
"learning_rate": 9.977865168539325e-07,
"loss": 0.03801161050796509,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.7578125,
"reward": 1.925000011920929,
"reward_std": 0.8770124614238739,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.9296875,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.92578125,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 10.521093845367432,
"epoch": 0.2237211916807195,
"kl": 1.5883166193962097,
"learning_rate": 9.977752808988763e-07,
"loss": 0.023211535066366196,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.6953125,
"reward": 1.908593773841858,
"reward_std": 0.8363709449768066,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.909375011920929,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 10.541406154632568,
"epoch": 0.22484541877459246,
"kl": 1.5315197706222534,
"learning_rate": 9.977640449438202e-07,
"loss": 0.023549862205982208,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.71875,
"reward": 1.920312523841858,
"reward_std": 0.8471402525901794,
"rewards/avg_0": 1.9375,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.8671875,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984374940395355,
"rewards/point_reward": 0.921875,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 10.565625190734863,
"epoch": 0.22596964586846544,
"kl": 1.5974794626235962,
"learning_rate": 9.97752808988764e-07,
"loss": 0.024372313171625137,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6640625,
"reward": 1.9000000357627869,
"reward_std": 0.8229183256626129,
"rewards/avg_0": 1.8671875,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.8515625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.8515625,
"rewards/avg_5": 1.8671875,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.8671875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9000000059604645,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 10.517969131469727,
"epoch": 0.2270938729623384,
"kl": 1.539478600025177,
"learning_rate": 9.977415730337078e-07,
"loss": 0.012446017935872078,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.6953125,
"reward": 1.904687523841858,
"reward_std": 0.8295447826385498,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.8515625,
"rewards/avg_6": 1.8515625,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9046874940395355,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 10.535156726837158,
"epoch": 0.22821810005621135,
"kl": 1.6015858054161072,
"learning_rate": 9.977303370786517e-07,
"loss": 0.03816835209727287,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7578125,
"reward": 1.930468738079071,
"reward_std": 0.8757118582725525,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.9296875,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.9296875,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9312500059604645,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 10.500000476837158,
"epoch": 0.22934232715008432,
"kl": 1.6012290716171265,
"learning_rate": 9.977191011235954e-07,
"loss": 0.035076532512903214,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.6953125,
"reward": 1.9117187857627869,
"reward_std": 0.8442235589027405,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9117187559604645,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 10.500000476837158,
"epoch": 0.2304665542439573,
"kl": 1.6269680857658386,
"learning_rate": 9.977078651685393e-07,
"loss": 0.02707146294414997,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.71875,
"reward": 1.909375011920929,
"reward_std": 0.8505551815032959,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.8828125,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.909375011920929,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 10.557031631469727,
"epoch": 0.23159078133783023,
"kl": 1.6032747626304626,
"learning_rate": 9.976966292134832e-07,
"loss": 0.029255583882331848,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.6953125,
"reward": 1.9195312857627869,
"reward_std": 0.8345302641391754,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9203125238418579,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 10.522656440734863,
"epoch": 0.2327150084317032,
"kl": 1.5486992001533508,
"learning_rate": 9.976853932584268e-07,
"loss": 0.014459997415542603,
"ratio/all_0": 0.0,
"ratio/all_2": 0.65625,
"reward": 1.908593773841858,
"reward_std": 0.8039405941963196,
"rewards/avg_0": 1.8671875,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.8359375,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9085937738418579,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 10.563281536102295,
"epoch": 0.23383923552557617,
"kl": 1.5740108489990234,
"learning_rate": 9.976741573033707e-07,
"loss": 0.0257696695625782,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.7421875,
"reward": 1.918749988079071,
"reward_std": 0.8626956641674042,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.9296875,
"rewards/avg_6": 1.8671875,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9195312559604645,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 10.550000190734863,
"epoch": 0.23496346261944911,
"kl": 1.5769821405410767,
"learning_rate": 9.976629213483146e-07,
"loss": 0.03247952461242676,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.7109375,
"reward": 1.913281261920929,
"reward_std": 0.8521692454814911,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.9296875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.8671875,
"rewards/avg_5": 1.9140625,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.8515625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9140625,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 10.535937786102295,
"epoch": 0.23608768971332209,
"kl": 1.536140501499176,
"learning_rate": 9.976516853932583e-07,
"loss": 0.03490378335118294,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.6953125,
"reward": 1.92578125,
"reward_std": 0.8375056982040405,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.9296875,
"rewards/avg_7": 1.9375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9976562559604645,
"rewards/point_reward": 0.9281249940395355,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 10.572656631469727,
"epoch": 0.23721191680719506,
"kl": 1.613707423210144,
"learning_rate": 9.976404494382022e-07,
"loss": 0.0240375567227602,
"ratio/all_0": 0.0625,
"ratio/all_2": 0.6875,
"reward": 1.8757812976837158,
"reward_std": 0.8590532541275024,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.8671875,
"rewards/avg_3": 1.8515625,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.8515625,
"rewards/avg_6": 1.8203125,
"rewards/avg_7": 1.8515625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.8765625059604645,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 10.489062786102295,
"epoch": 0.23833614390106803,
"kl": 1.7040197253227234,
"learning_rate": 9.97629213483146e-07,
"loss": 0.02793644554913044,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.6640625,
"reward": 1.8882812857627869,
"reward_std": 0.8288446962833405,
"rewards/avg_0": 1.8515625,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.8671875,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.8359375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.8890625238418579,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 10.516406536102295,
"epoch": 0.23946037099494097,
"kl": 1.6352243423461914,
"learning_rate": 9.976179775280898e-07,
"loss": 0.018058663234114647,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.71875,
"reward": 1.890625,
"reward_std": 0.8580062985420227,
"rewards/avg_0": 1.8515625,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.8671875,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.8515625,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8906250298023224,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 10.466406345367432,
"epoch": 0.24058459808881394,
"kl": 1.6236803531646729,
"learning_rate": 9.976067415730337e-07,
"loss": 0.03151131793856621,
"ratio/all_0": 0.0,
"ratio/all_2": 0.703125,
"reward": 1.9312500357627869,
"reward_std": 0.832555741071701,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9320312440395355,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 10.541406154632568,
"epoch": 0.2417088251826869,
"kl": 1.6875005960464478,
"learning_rate": 9.975955056179775e-07,
"loss": 0.031967081129550934,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.703125,
"reward": 1.919531226158142,
"reward_std": 0.8395129144191742,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9195312857627869,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 10.545312881469727,
"epoch": 0.24283305227655985,
"kl": 1.674071729183197,
"learning_rate": 9.975842696629212e-07,
"loss": 0.03784000501036644,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7578125,
"reward": 1.9257813096046448,
"reward_std": 0.8713735342025757,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.9140625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.92578125,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 10.56406307220459,
"epoch": 0.24395727937043282,
"kl": 1.6124215126037598,
"learning_rate": 9.975730337078651e-07,
"loss": 0.03100433573126793,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7265625,
"reward": 1.920312523841858,
"reward_std": 0.8564988374710083,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.9296875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9976562559604645,
"rewards/point_reward": 0.922656238079071,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 10.494531631469727,
"epoch": 0.2450815064643058,
"kl": 1.7857028245925903,
"learning_rate": 9.97561797752809e-07,
"loss": 0.03976079821586609,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.75,
"reward": 1.928906261920929,
"reward_std": 0.8637703955173492,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.9296875,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9296875298023224,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 10.584375381469727,
"epoch": 0.24620573355817876,
"kl": 1.6516713500022888,
"learning_rate": 9.975505617977527e-07,
"loss": 0.02732301503419876,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.6953125,
"reward": 1.914843738079071,
"reward_std": 0.8281229138374329,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9156250059604645,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 10.557812690734863,
"epoch": 0.2473299606520517,
"kl": 1.6257204413414001,
"learning_rate": 9.975393258426966e-07,
"loss": 0.03726682811975479,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.734375,
"reward": 1.9296875,
"reward_std": 0.8575507402420044,
"rewards/avg_0": 1.9296875,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.953125,
"rewards/avg_6": 1.9296875,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9968750178813934,
"rewards/point_reward": 0.932812511920929,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 10.563281059265137,
"epoch": 0.24845418774592468,
"kl": 1.6728870272636414,
"learning_rate": 9.975280898876405e-07,
"loss": 0.03137045353651047,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.734375,
"reward": 1.913281261920929,
"reward_std": 0.8622699677944183,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.8671875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984374940395355,
"rewards/point_reward": 0.9148437678813934,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 10.493750095367432,
"epoch": 0.24957841483979765,
"kl": 1.6135631203651428,
"learning_rate": 9.975168539325841e-07,
"loss": 0.02609895169734955,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.75,
"reward": 1.920312523841858,
"reward_std": 0.8621271550655365,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9203125238418579,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 10.526562690734863,
"epoch": 0.2507026419336706,
"kl": 1.6434112787246704,
"learning_rate": 9.97505617977528e-07,
"loss": 0.029833676293492317,
"ratio/all_0": 0.0,
"ratio/all_2": 0.671875,
"reward": 1.928906261920929,
"reward_std": 0.8060888051986694,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.9296875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.9375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.928906261920929,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 10.494531631469727,
"epoch": 0.2518268690275436,
"kl": 1.5851457118988037,
"learning_rate": 9.97494382022472e-07,
"loss": 0.031208906322717667,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.703125,
"reward": 1.924218773841858,
"reward_std": 0.8377038240432739,
"rewards/avg_0": 1.8671875,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.925000011920929,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 10.5546875,
"epoch": 0.25295109612141653,
"kl": 1.5501510500907898,
"learning_rate": 9.974831460674156e-07,
"loss": 0.02604903280735016,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7109375,
"reward": 1.9140625,
"reward_std": 0.846492350101471,
"rewards/avg_0": 1.8515625,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9140625298023224,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 10.51718807220459,
"epoch": 0.2540753232152895,
"kl": 1.5336772799491882,
"learning_rate": 9.974719101123595e-07,
"loss": 0.03907974064350128,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.78125,
"reward": 1.9304687976837158,
"reward_std": 0.8920327723026276,
"rewards/avg_0": 1.9375,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.9140625,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.9296875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9304687678813934,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 10.474218845367432,
"epoch": 0.25519955030916247,
"kl": 1.6211896538734436,
"learning_rate": 9.974606741573034e-07,
"loss": 0.03476298600435257,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.71875,
"reward": 1.912500023841858,
"reward_std": 0.8594194948673248,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9125000238418579,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 10.568749904632568,
"epoch": 0.2563237774030354,
"kl": 1.5672905445098877,
"learning_rate": 9.97449438202247e-07,
"loss": 0.027938904240727425,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7109375,
"reward": 1.9093750715255737,
"reward_std": 0.8576548993587494,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984374940395355,
"rewards/point_reward": 0.9109375178813934,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 10.542187690734863,
"epoch": 0.25744800449690836,
"kl": 1.562992513179779,
"learning_rate": 9.97438202247191e-07,
"loss": 0.03218929469585419,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7265625,
"reward": 1.9226562976837158,
"reward_std": 0.856164962053299,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.8828125,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.9453125,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375238418579,
"rewards/point_reward": 0.9242187440395355,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 10.540625095367432,
"epoch": 0.25857223159078135,
"kl": 1.708520531654358,
"learning_rate": 9.974269662921348e-07,
"loss": 0.025115033611655235,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.671875,
"reward": 1.899218738079071,
"reward_std": 0.8306452631950378,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.8515625,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9968750178813934,
"rewards/point_reward": 0.90234375,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 10.558594226837158,
"epoch": 0.2596964586846543,
"kl": 1.8696183562278748,
"learning_rate": 9.974157303370785e-07,
"loss": 0.041371747851371765,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.6640625,
"reward": 1.912500023841858,
"reward_std": 0.8245647549629211,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.8671875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.9296875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9976562559604645,
"rewards/point_reward": 0.9148437678813934,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 10.474218845367432,
"epoch": 0.26082068577852724,
"kl": 1.932099997997284,
"learning_rate": 9.974044943820224e-07,
"loss": 0.04683533310890198,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.765625,
"reward": 1.9195312857627869,
"reward_std": 0.8856737911701202,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.9296875,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9921875,
"rewards/point_reward": 0.9273437559604645,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 10.432812690734863,
"epoch": 0.26194491287240024,
"kl": 1.7457298040390015,
"learning_rate": 9.973932584269663e-07,
"loss": 0.03385300561785698,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.6953125,
"reward": 1.918749988079071,
"reward_std": 0.8396010994911194,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.995312511920929,
"rewards/point_reward": 0.9234375059604645,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 10.432031631469727,
"epoch": 0.2630691399662732,
"kl": 1.8611086010932922,
"learning_rate": 9.9738202247191e-07,
"loss": 0.04638049378991127,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.703125,
"reward": 1.90234375,
"reward_std": 0.8645775318145752,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.8046875,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.996874988079071,
"rewards/point_reward": 0.905468761920929,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 10.385156154632568,
"epoch": 0.2641933670601461,
"kl": 1.723706603050232,
"learning_rate": 9.973707865168539e-07,
"loss": 0.034559160470962524,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.7109375,
"reward": 1.9234375357627869,
"reward_std": 0.8410924971103668,
"rewards/avg_0": 1.8515625,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.9296875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.9140625,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.9453125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.996874988079071,
"rewards/point_reward": 0.9265625178813934,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 10.43125057220459,
"epoch": 0.2653175941540191,
"kl": 1.6914669275283813,
"learning_rate": 9.973595505617976e-07,
"loss": 0.0417524054646492,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.78125,
"reward": 1.942187488079071,
"reward_std": 0.8815446197986603,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.9375,
"rewards/avg_3": 1.984375,
"rewards/avg_4": 1.9453125,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.9296875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9976562559604645,
"rewards/point_reward": 0.944531261920929,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 10.531250476837158,
"epoch": 0.26644182124789206,
"kl": 1.6563090085983276,
"learning_rate": 9.973483146067414e-07,
"loss": 0.031678296625614166,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.734375,
"reward": 1.92578125,
"reward_std": 0.8512683510780334,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.9296875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984374940395355,
"rewards/point_reward": 0.9273437559604645,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 10.491406440734863,
"epoch": 0.26756604834176506,
"kl": 1.7366485595703125,
"learning_rate": 9.973370786516853e-07,
"loss": 0.02747419849038124,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.671875,
"reward": 1.9000000357627869,
"reward_std": 0.8285984098911285,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.8671875,
"rewards/avg_2": 1.8671875,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9976562559604645,
"rewards/point_reward": 0.9023437798023224,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 10.403906345367432,
"epoch": 0.268690275435638,
"kl": 1.775952935218811,
"learning_rate": 9.97325842696629e-07,
"loss": 0.03478237986564636,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.6796875,
"reward": 1.9062500596046448,
"reward_std": 0.8357247114181519,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9968750178813934,
"rewards/point_reward": 0.909375011920929,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 10.389062404632568,
"epoch": 0.26981450252951095,
"kl": 1.7484761476516724,
"learning_rate": 9.97314606741573e-07,
"loss": 0.04855723679065704,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.78125,
"reward": 1.9351562857627869,
"reward_std": 0.8961874544620514,
"rewards/avg_0": 1.9375,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.9296875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9976562559604645,
"rewards/point_reward": 0.9375,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 10.538281440734863,
"epoch": 0.27093872962338394,
"kl": 1.6602862477302551,
"learning_rate": 9.973033707865168e-07,
"loss": 0.04181479662656784,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.75,
"reward": 1.934374988079071,
"reward_std": 0.8723446130752563,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.9296875,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.9296875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.995312511920929,
"rewards/point_reward": 0.9390625059604645,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 10.399219036102295,
"epoch": 0.2720629567172569,
"kl": 1.8109837770462036,
"learning_rate": 9.972921348314605e-07,
"loss": 0.032406628131866455,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.703125,
"reward": 1.916406273841858,
"reward_std": 0.8395773470401764,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9976562559604645,
"rewards/point_reward": 0.9187500178813934,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 10.46484375,
"epoch": 0.27318718381112983,
"kl": 1.7080157399177551,
"learning_rate": 9.972808988764044e-07,
"loss": 0.02838246151804924,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.7265625,
"reward": 1.911718726158142,
"reward_std": 0.8578216135501862,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.8828125,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9960937798023224,
"rewards/point_reward": 0.9156250059604645,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 10.516406536102295,
"epoch": 0.27431141090500283,
"kl": 1.8099465370178223,
"learning_rate": 9.972696629213483e-07,
"loss": 0.04237574338912964,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7109375,
"reward": 1.916406273841858,
"reward_std": 0.8517789244651794,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.8671875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9976562559604645,
"rewards/point_reward": 0.9187500178813934,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 10.339062690734863,
"epoch": 0.27543563799887577,
"kl": 1.8641058206558228,
"learning_rate": 9.972584269662921e-07,
"loss": 0.04015839472413063,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.6953125,
"reward": 1.92578125,
"reward_std": 0.827897310256958,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9265625178813934,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 10.48046875,
"epoch": 0.2765598650927487,
"kl": 1.6710260510444641,
"learning_rate": 9.972471910112358e-07,
"loss": 0.0362420380115509,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.75,
"reward": 1.9210938215255737,
"reward_std": 0.872996062040329,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.8671875,
"rewards/avg_2": 1.9296875,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9976562559604645,
"rewards/point_reward": 0.9234375059604645,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 10.430469036102295,
"epoch": 0.2776840921866217,
"kl": 1.6808490753173828,
"learning_rate": 9.972359550561797e-07,
"loss": 0.031743235886096954,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6875,
"reward": 1.90234375,
"reward_std": 0.8524691760540009,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.8125,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.99609375,
"rewards/point_reward": 0.90625,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 10.414844036102295,
"epoch": 0.27880831928049465,
"kl": 1.6693540811538696,
"learning_rate": 9.972247191011236e-07,
"loss": 0.035806022584438324,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.703125,
"reward": 1.905468761920929,
"reward_std": 0.8627266883850098,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.8671875,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.992968738079071,
"rewards/point_reward": 0.9125000238418579,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 10.3984375,
"epoch": 0.2799325463743676,
"kl": 1.5856854319572449,
"learning_rate": 9.972134831460673e-07,
"loss": 0.04258953034877777,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.828125,
"reward": 1.952343761920929,
"reward_std": 0.9072423577308655,
"rewards/avg_0": 1.9453125,
"rewards/avg_1": 1.9296875,
"rewards/avg_2": 1.9375,
"rewards/avg_3": 1.9296875,
"rewards/avg_4": 1.96875,
"rewards/avg_5": 1.953125,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.9375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984374940395355,
"rewards/point_reward": 0.953906238079071,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 10.359375476837158,
"epoch": 0.2810567734682406,
"kl": 1.5875438451766968,
"learning_rate": 9.972022471910112e-07,
"loss": 0.04035266861319542,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7890625,
"reward": 1.9343750476837158,
"reward_std": 0.8995476365089417,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.9375,
"rewards/avg_2": 1.9375,
"rewards/avg_3": 1.9296875,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.9296875,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984374940395355,
"rewards/point_reward": 0.9359374940395355,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 10.521093845367432,
"epoch": 0.28218100056211354,
"kl": 1.642164707183838,
"learning_rate": 9.97191011235955e-07,
"loss": 0.03470568358898163,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.671875,
"reward": 1.9078125357627869,
"reward_std": 0.8443655967712402,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.8515625,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.8671875,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.995312511920929,
"rewards/point_reward": 0.9124999940395355,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 10.52734375,
"epoch": 0.28330522765598654,
"kl": 1.558402955532074,
"learning_rate": 9.971797752808987e-07,
"loss": 0.0262003056704998,
"ratio/all_0": 0.0,
"ratio/all_2": 0.6875,
"reward": 1.9226562976837158,
"reward_std": 0.8312846720218658,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.9296875,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9929687678813934,
"rewards/point_reward": 0.9296875,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 10.449219226837158,
"epoch": 0.2844294547498595,
"kl": 1.532906413078308,
"learning_rate": 9.971685393258426e-07,
"loss": 0.042840905487537384,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.7890625,
"reward": 1.9296875,
"reward_std": 0.9091970920562744,
"rewards/avg_0": 1.9375,
"rewards/avg_1": 1.9296875,
"rewards/avg_2": 1.9296875,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9976562559604645,
"rewards/point_reward": 0.9320312738418579,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 10.514843940734863,
"epoch": 0.2855536818437324,
"kl": 1.6184453964233398,
"learning_rate": 9.971573033707865e-07,
"loss": 0.029775146394968033,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.734375,
"reward": 1.91796875,
"reward_std": 0.8635019361972809,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.9296875,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9921875298023224,
"rewards/point_reward": 0.9257812798023224,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 10.499218940734863,
"epoch": 0.2866779089376054,
"kl": 1.5478734374046326,
"learning_rate": 9.971460674157302e-07,
"loss": 0.036088164895772934,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7890625,
"reward": 1.9312500357627869,
"reward_std": 0.8939096629619598,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.9453125,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.9375,
"rewards/avg_4": 1.9296875,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.9375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9968750178813934,
"rewards/point_reward": 0.9343750178813934,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 10.510156631469727,
"epoch": 0.28780213603147836,
"kl": 1.6254317164421082,
"learning_rate": 9.97134831460674e-07,
"loss": 0.022765733301639557,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.7265625,
"reward": 1.904687523841858,
"reward_std": 0.8665963411331177,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.8671875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9921875298023224,
"rewards/point_reward": 0.9125000238418579,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 10.535937786102295,
"epoch": 0.2889263631253513,
"kl": 1.5246264338493347,
"learning_rate": 9.97123595505618e-07,
"loss": 0.017598390579223633,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.625,
"reward": 1.8984375596046448,
"reward_std": 0.8018450438976288,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.8515625,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9937500059604645,
"rewards/point_reward": 0.9046875238418579,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 10.46875,
"epoch": 0.2900505902192243,
"kl": 1.503324806690216,
"learning_rate": 9.971123595505617e-07,
"loss": 0.02756604552268982,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.7421875,
"reward": 1.916406273841858,
"reward_std": 0.869181752204895,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.9296875,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.917187511920929,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 10.486719131469727,
"epoch": 0.29117481731309725,
"kl": 1.6142412424087524,
"learning_rate": 9.971011235955056e-07,
"loss": 0.03882833570241928,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.703125,
"reward": 1.908593773841858,
"reward_std": 0.8646510541439056,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.99609375,
"rewards/point_reward": 0.9125000238418579,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 10.471094131469727,
"epoch": 0.2922990444069702,
"kl": 1.557334840297699,
"learning_rate": 9.970898876404495e-07,
"loss": 0.03715989738702774,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.7734375,
"reward": 1.9335938096046448,
"reward_std": 0.8902114033699036,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.9375,
"rewards/avg_2": 1.9453125,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9976562559604645,
"rewards/point_reward": 0.9359374940395355,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 10.576562881469727,
"epoch": 0.2934232715008432,
"kl": 1.6058542132377625,
"learning_rate": 9.970786516853931e-07,
"loss": 0.02706632763147354,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.71875,
"reward": 1.913281261920929,
"reward_std": 0.8577883243560791,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.9296875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.9296875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9937500357627869,
"rewards/point_reward": 0.9195312559604645,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 10.518750190734863,
"epoch": 0.29454749859471613,
"kl": 1.631466269493103,
"learning_rate": 9.97067415730337e-07,
"loss": 0.029847048223018646,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.7265625,
"reward": 1.920312523841858,
"reward_std": 0.8593951761722565,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.8828125,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.9296875,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.995312511920929,
"rewards/point_reward": 0.925000011920929,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 10.529687404632568,
"epoch": 0.29567172568858907,
"kl": 1.6081148386001587,
"learning_rate": 9.97056179775281e-07,
"loss": 0.03809971362352371,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.671875,
"reward": 1.921875,
"reward_std": 0.8334166705608368,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.8671875,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9929687678813934,
"rewards/point_reward": 0.928906261920929,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 10.452343940734863,
"epoch": 0.29679595278246207,
"kl": 1.687977135181427,
"learning_rate": 9.970449438202246e-07,
"loss": 0.03532067686319351,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.734375,
"reward": 1.928125023841858,
"reward_std": 0.8631133139133453,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.953125,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.995312511920929,
"rewards/point_reward": 0.932812511920929,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 10.543750286102295,
"epoch": 0.297920179876335,
"kl": 1.62973290681839,
"learning_rate": 9.970337078651685e-07,
"loss": 0.04487081244587898,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.7734375,
"reward": 1.932031273841858,
"reward_std": 0.9043321311473846,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.9375,
"rewards/avg_2": 1.9453125,
"rewards/avg_3": 1.9453125,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9859375059604645,
"rewards/point_reward": 0.9460937678813934,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 10.403125286102295,
"epoch": 0.29904440697020795,
"kl": 1.6182299852371216,
"learning_rate": 9.970224719101124e-07,
"loss": 0.03904420882463455,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.71875,
"reward": 1.9187500476837158,
"reward_std": 0.8686837255954742,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.9296875,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.989062488079071,
"rewards/point_reward": 0.9296875,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 10.600781440734863,
"epoch": 0.30016863406408095,
"kl": 1.6868852376937866,
"learning_rate": 9.97011235955056e-07,
"loss": 0.028194734826683998,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.6484375,
"reward": 1.8882812857627869,
"reward_std": 0.8412781953811646,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.8671875,
"rewards/avg_7": 1.8515625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9867187738418579,
"rewards/point_reward": 0.901562511920929,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 10.592968940734863,
"epoch": 0.3012928611579539,
"kl": 1.6456496119499207,
"learning_rate": 9.97e-07,
"loss": 0.033215299248695374,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.71875,
"reward": 1.924218773841858,
"reward_std": 0.853204607963562,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9921875,
"rewards/point_reward": 0.9320312738418579,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 10.525781154632568,
"epoch": 0.3024170882518269,
"kl": 1.8199474811553955,
"learning_rate": 9.969887640449438e-07,
"loss": 0.02363828755915165,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.6875,
"reward": 1.9078125357627869,
"reward_std": 0.8277927041053772,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.8828125,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.98828125,
"rewards/point_reward": 0.9195312559604645,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 10.553906440734863,
"epoch": 0.30354131534569984,
"kl": 1.738243043422699,
"learning_rate": 9.969775280898875e-07,
"loss": 0.03283867612481117,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.6796875,
"reward": 1.8992187976837158,
"reward_std": 0.8493183255195618,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.8515625,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.828125,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9843750298023224,
"rewards/point_reward": 0.914843738079071,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 10.518750190734863,
"epoch": 0.3046655424395728,
"kl": 1.7232638597488403,
"learning_rate": 9.969662921348314e-07,
"loss": 0.039549570530653,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.734375,
"reward": 1.927343726158142,
"reward_std": 0.8627557754516602,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.9296875,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9828125238418579,
"rewards/point_reward": 0.944531261920929,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 10.567187786102295,
"epoch": 0.3057897695334458,
"kl": 1.806475043296814,
"learning_rate": 9.969550561797753e-07,
"loss": 0.03781045228242874,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.703125,
"reward": 1.916406273841858,
"reward_std": 0.8432705104351044,
"rewards/avg_0": 1.9296875,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.8671875,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.9375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.987500011920929,
"rewards/point_reward": 0.928906261920929,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 10.459374904632568,
"epoch": 0.3069139966273187,
"kl": 1.7418119311332703,
"learning_rate": 9.96943820224719e-07,
"loss": 0.02658051624894142,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.640625,
"reward": 1.896875023841858,
"reward_std": 0.8112240135669708,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.796875,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9828124940395355,
"rewards/point_reward": 0.9140625298023224,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 10.614062786102295,
"epoch": 0.30803822372119166,
"kl": 1.7741380333900452,
"learning_rate": 9.969325842696629e-07,
"loss": 0.038595620542764664,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.6171875,
"reward": 1.9195312857627869,
"reward_std": 0.7923027276992798,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.9375,
"rewards/avg_2": 1.8671875,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9742187559604645,
"rewards/point_reward": 0.9453125,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 10.54843807220459,
"epoch": 0.30916245081506466,
"kl": 1.8489094972610474,
"learning_rate": 9.969213483146068e-07,
"loss": 0.03723674640059471,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.65625,
"reward": 1.908593773841858,
"reward_std": 0.8213367164134979,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.8515625,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.8515625,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9750000238418579,
"rewards/point_reward": 0.93359375,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 10.652344226837158,
"epoch": 0.3102866779089376,
"kl": 1.8202200531959534,
"learning_rate": 9.969101123595504e-07,
"loss": 0.025497809052467346,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.5546875,
"reward": 1.88671875,
"reward_std": 0.7714699506759644,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.8515625,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.8359375,
"rewards/avg_4": 1.8359375,
"rewards/avg_5": 1.8125,
"rewards/avg_6": 1.8359375,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.96484375,
"rewards/point_reward": 0.9218750298023224,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 10.56406307220459,
"epoch": 0.31141090500281055,
"kl": 1.928186297416687,
"learning_rate": 9.968988764044943e-07,
"loss": 0.02249237895011902,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.5859375,
"reward": 1.875,
"reward_std": 0.787726491689682,
"rewards/avg_0": 1.8359375,
"rewards/avg_1": 1.8671875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.796875,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.8515625,
"rewards/avg_7": 1.8671875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.971875011920929,
"rewards/point_reward": 0.9031250178813934,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 10.594531059265137,
"epoch": 0.31253513209668354,
"kl": 1.8774849772453308,
"learning_rate": 9.968876404494382e-07,
"loss": 0.040980830788612366,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.65625,
"reward": 1.9195312857627869,
"reward_std": 0.8190862238407135,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.9296875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.973437488079071,
"rewards/point_reward": 0.9460937678813934,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 10.603125095367432,
"epoch": 0.3136593591905565,
"kl": 1.9386613965034485,
"learning_rate": 9.968764044943819e-07,
"loss": 0.023881088942289352,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.6015625,
"reward": 1.8671875,
"reward_std": 0.8201425671577454,
"rewards/avg_0": 1.78125,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.8203125,
"rewards/avg_3": 1.8515625,
"rewards/avg_4": 1.8359375,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.8515625,
"rewards/avg_7": 1.8125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.975781261920929,
"rewards/point_reward": 0.8914062678813934,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 10.528906345367432,
"epoch": 0.31478358628442943,
"kl": 1.9172906279563904,
"learning_rate": 9.968651685393258e-07,
"loss": 0.0323064848780632,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.625,
"reward": 1.901562511920929,
"reward_std": 0.8001732230186462,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.8359375,
"rewards/avg_2": 1.828125,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9671875238418579,
"rewards/point_reward": 0.9343750178813934,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 10.539062976837158,
"epoch": 0.3159078133783024,
"kl": 1.888903796672821,
"learning_rate": 9.968539325842697e-07,
"loss": 0.038979463279247284,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.59375,
"reward": 1.908593773841858,
"reward_std": 0.7810782790184021,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.8671875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9750000238418579,
"rewards/point_reward": 0.9335937798023224,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 10.551562786102295,
"epoch": 0.31703204047217537,
"kl": 1.9569891095161438,
"learning_rate": 9.968426966292134e-07,
"loss": 0.03946223855018616,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.578125,
"reward": 1.8921875357627869,
"reward_std": 0.7819642722606659,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.8671875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.8671875,
"rewards/avg_7": 1.8359375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9695312678813934,
"rewards/point_reward": 0.9226562678813934,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 10.463281631469727,
"epoch": 0.31815626756604837,
"kl": 1.9771220684051514,
"learning_rate": 9.968314606741572e-07,
"loss": 0.029817869886755943,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.59375,
"reward": 1.8835937976837158,
"reward_std": 0.791181355714798,
"rewards/avg_0": 1.8359375,
"rewards/avg_1": 1.8515625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.84375,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.8515625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9781250059604645,
"rewards/point_reward": 0.905468761920929,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 10.390625,
"epoch": 0.3192804946599213,
"kl": 1.9137450456619263,
"learning_rate": 9.968202247191011e-07,
"loss": 0.05185917764902115,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.734375,
"reward": 1.9382812976837158,
"reward_std": 0.8540681302547455,
"rewards/avg_0": 1.9296875,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.9453125,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.9296875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9890625178813934,
"rewards/point_reward": 0.9492187798023224,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 10.33750057220459,
"epoch": 0.32040472175379425,
"kl": 1.8866459131240845,
"learning_rate": 9.968089887640448e-07,
"loss": 0.04224765673279762,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.7109375,
"reward": 1.909375011920929,
"reward_std": 0.8564162254333496,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.8515625,
"rewards/avg_6": 1.8671875,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.99609375,
"rewards/point_reward": 0.913281261920929,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 10.255468845367432,
"epoch": 0.32152894884766725,
"kl": 1.8018646836280823,
"learning_rate": 9.967977528089887e-07,
"loss": 0.043330300599336624,
"ratio/all_0": 0.0,
"ratio/all_2": 0.7890625,
"reward": 1.9390625357627869,
"reward_std": 0.8851653337478638,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.9375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.953125,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.9375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9398437738418579,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 10.403125286102295,
"epoch": 0.3226531759415402,
"kl": 1.8123514652252197,
"learning_rate": 9.967865168539326e-07,
"loss": 0.049471862614154816,
"ratio/all_0": 0.0,
"ratio/all_2": 0.7734375,
"reward": 1.947656273841858,
"reward_std": 0.8822700083255768,
"rewards/avg_0": 1.9609375,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.9765625,
"rewards/avg_3": 1.9375,
"rewards/avg_4": 1.953125,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.995312511920929,
"rewards/point_reward": 0.952343761920929,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 10.360937595367432,
"epoch": 0.32377740303541314,
"kl": 1.7900111675262451,
"learning_rate": 9.967752808988765e-07,
"loss": 0.035676054656505585,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.6953125,
"reward": 1.90625,
"reward_std": 0.842722624540329,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.8671875,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9070312678813934,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 10.370312690734863,
"epoch": 0.32490163012928613,
"kl": 1.8105761408805847,
"learning_rate": 9.967640449438202e-07,
"loss": 0.040825922042131424,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.75,
"reward": 1.9312500357627869,
"reward_std": 0.8617748022079468,
"rewards/avg_0": 1.8671875,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.9453125,
"rewards/avg_5": 1.9140625,
"rewards/avg_6": 1.9296875,
"rewards/avg_7": 1.953125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9312500059604645,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 10.416406631469727,
"epoch": 0.3260258572231591,
"kl": 1.8610867261886597,
"learning_rate": 9.96752808988764e-07,
"loss": 0.042752377688884735,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.734375,
"reward": 1.913281261920929,
"reward_std": 0.8681075572967529,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.8828125,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.8671875,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.9296875,
"rewards/avg_7": 1.8671875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.913281261920929,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 10.275000095367432,
"epoch": 0.327150084317032,
"kl": 1.8451239466667175,
"learning_rate": 9.96741573033708e-07,
"loss": 0.04314912483096123,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.6796875,
"reward": 1.92578125,
"reward_std": 0.8235087096691132,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.9375,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9257812798023224,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 10.250000476837158,
"epoch": 0.328274311410905,
"kl": 1.79464590549469,
"learning_rate": 9.967303370786516e-07,
"loss": 0.03976229578256607,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.75,
"reward": 1.928906261920929,
"reward_std": 0.8630857169628143,
"rewards/avg_0": 1.9296875,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.9453125,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.9296875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.928906261920929,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 10.306250095367432,
"epoch": 0.32939853850477796,
"kl": 1.8493717908859253,
"learning_rate": 9.967191011235955e-07,
"loss": 0.048004940152168274,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.703125,
"reward": 1.928125023841858,
"reward_std": 0.8397326469421387,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.9296875,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9281250238418579,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 10.316406726837158,
"epoch": 0.3305227655986509,
"kl": 1.8398056626319885,
"learning_rate": 9.967078651685394e-07,
"loss": 0.041067518293857574,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.734375,
"reward": 1.91796875,
"reward_std": 0.8646991848945618,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375238418579,
"rewards/point_reward": 0.9195312559604645,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 10.385156631469727,
"epoch": 0.3316469926925239,
"kl": 1.7265257239341736,
"learning_rate": 9.96696629213483e-07,
"loss": 0.036400072276592255,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7265625,
"reward": 1.9234375357627869,
"reward_std": 0.8537377119064331,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.9375,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375238418579,
"rewards/point_reward": 0.925000011920929,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 10.515625476837158,
"epoch": 0.33277121978639684,
"kl": 1.7181497812271118,
"learning_rate": 9.96685393258427e-07,
"loss": 0.035892877727746964,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6875,
"reward": 1.905468761920929,
"reward_std": 0.8431965708732605,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.8828125,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.8671875,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984374940395355,
"rewards/point_reward": 0.9070312678813934,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 10.43359375,
"epoch": 0.33389544688026984,
"kl": 1.7789562344551086,
"learning_rate": 9.966741573033709e-07,
"loss": 0.038690704852342606,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.71875,
"reward": 1.917187511920929,
"reward_std": 0.8528233170509338,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.9296875,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.8671875,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.917187511920929,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 10.446875095367432,
"epoch": 0.3350196739741428,
"kl": 1.7555699944496155,
"learning_rate": 9.966629213483145e-07,
"loss": 0.043629299849271774,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.7109375,
"reward": 1.920312523841858,
"reward_std": 0.8546639978885651,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.9296875,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.9296875,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9203125238418579,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 10.271874904632568,
"epoch": 0.3361439010680157,
"kl": 1.8395432829856873,
"learning_rate": 9.966516853932584e-07,
"loss": 0.05077920854091644,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.75,
"reward": 1.9382812976837158,
"reward_std": 0.8653816878795624,
"rewards/avg_0": 1.9453125,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.9296875,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.9296875,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9382812678813934,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 10.353906154632568,
"epoch": 0.3372681281618887,
"kl": 1.8199289441108704,
"learning_rate": 9.966404494382023e-07,
"loss": 0.04438919201493263,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.734375,
"reward": 1.924218773841858,
"reward_std": 0.8613188862800598,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.953125,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9242187440395355,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 10.187500476837158,
"epoch": 0.33839235525576167,
"kl": 2.1463682055473328,
"learning_rate": 9.96629213483146e-07,
"loss": 0.055228251963853836,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.703125,
"reward": 1.9148437976837158,
"reward_std": 0.8456367552280426,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9148437678813934,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 10.175000190734863,
"epoch": 0.3395165823496346,
"kl": 2.1115158200263977,
"learning_rate": 9.9661797752809e-07,
"loss": 0.05504034459590912,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.765625,
"reward": 1.9226562976837158,
"reward_std": 0.884088397026062,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375238418579,
"rewards/point_reward": 0.9242187738418579,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 10.18359375,
"epoch": 0.3406408094435076,
"kl": 1.990248441696167,
"learning_rate": 9.966067415730338e-07,
"loss": 0.052734263241291046,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.828125,
"reward": 1.940625011920929,
"reward_std": 0.9085220098495483,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.9453125,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.9375,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.940625011920929,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 10.145312309265137,
"epoch": 0.34176503653738055,
"kl": 1.8839709758758545,
"learning_rate": 9.965955056179775e-07,
"loss": 0.04460742324590683,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.734375,
"reward": 1.932031273841858,
"reward_std": 0.857248067855835,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.9453125,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.932812511920929,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 10.23046875,
"epoch": 0.3428892636312535,
"kl": 1.853227436542511,
"learning_rate": 9.965842696629214e-07,
"loss": 0.04168041795492172,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.703125,
"reward": 1.916406273841858,
"reward_std": 0.8461984395980835,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375238418579,
"rewards/point_reward": 0.91796875,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 10.2265625,
"epoch": 0.3440134907251265,
"kl": 1.869143784046173,
"learning_rate": 9.965730337078652e-07,
"loss": 0.0380750373005867,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.703125,
"reward": 1.9195312857627869,
"reward_std": 0.8381736278533936,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9195312559604645,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 10.282031059265137,
"epoch": 0.34513771781899943,
"kl": 1.8878865242004395,
"learning_rate": 9.96561797752809e-07,
"loss": 0.046272002160549164,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.78125,
"reward": 1.916406273841858,
"reward_std": 0.9037257432937622,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375238418579,
"rewards/point_reward": 0.91796875,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 10.348437786102295,
"epoch": 0.3462619449128724,
"kl": 1.7901397943496704,
"learning_rate": 9.965505617977528e-07,
"loss": 0.03747926652431488,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.6796875,
"reward": 1.918749988079071,
"reward_std": 0.8264937102794647,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375238418579,
"rewards/point_reward": 0.9203125238418579,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 10.301562786102295,
"epoch": 0.3473861720067454,
"kl": 1.82473886013031,
"learning_rate": 9.965393258426967e-07,
"loss": 0.041796181350946426,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.7109375,
"reward": 1.928125023841858,
"reward_std": 0.8394474387168884,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.928906261920929,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 10.310937881469727,
"epoch": 0.3485103991006183,
"kl": 1.8468487858772278,
"learning_rate": 9.965280898876404e-07,
"loss": 0.04869385436177254,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.7421875,
"reward": 1.932812511920929,
"reward_std": 0.8740152418613434,
"rewards/avg_0": 1.9453125,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.9453125,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.9375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9976562559604645,
"rewards/point_reward": 0.9351562559604645,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 10.325000286102295,
"epoch": 0.34963462619449126,
"kl": 1.824562668800354,
"learning_rate": 9.965168539325843e-07,
"loss": 0.04901933670043945,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.8046875,
"reward": 1.93359375,
"reward_std": 0.9041942656040192,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.9375,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984374940395355,
"rewards/point_reward": 0.9351562559604645,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 10.381250381469727,
"epoch": 0.35075885328836426,
"kl": 1.77147775888443,
"learning_rate": 9.965056179775282e-07,
"loss": 0.04348636791110039,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.7734375,
"reward": 1.935937523841858,
"reward_std": 0.8781629204750061,
"rewards/avg_0": 1.9296875,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.9296875,
"rewards/avg_3": 1.9375,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.9140625,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9359375238418579,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 10.47421932220459,
"epoch": 0.3518830803822372,
"kl": 1.7815687656402588,
"learning_rate": 9.964943820224718e-07,
"loss": 0.04413023591041565,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7265625,
"reward": 1.924218773841858,
"reward_std": 0.8591809868812561,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.953125,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.9140625,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9976562559604645,
"rewards/point_reward": 0.9265625178813934,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 10.405468940734863,
"epoch": 0.3530073074761102,
"kl": 1.8134968280792236,
"learning_rate": 9.964831460674157e-07,
"loss": 0.046355605125427246,
"ratio/all_0": 0.0,
"ratio/all_2": 0.796875,
"reward": 1.94921875,
"reward_std": 0.8853201568126678,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.9453125,
"rewards/avg_2": 1.9296875,
"rewards/avg_3": 1.9609375,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.9453125,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.9296875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375238418579,
"rewards/point_reward": 0.9507812559604645,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 10.400781154632568,
"epoch": 0.35413153456998314,
"kl": 1.8659851551055908,
"learning_rate": 9.964719101123596e-07,
"loss": 0.04664193093776703,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.734375,
"reward": 1.9195312857627869,
"reward_std": 0.8666932284832001,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9195312559604645,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 10.532031536102295,
"epoch": 0.3552557616638561,
"kl": 1.852257788181305,
"learning_rate": 9.964606741573033e-07,
"loss": 0.0502714179456234,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7421875,
"reward": 1.932812511920929,
"reward_std": 0.8658345639705658,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.9296875,
"rewards/avg_7": 1.9453125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375238418579,
"rewards/point_reward": 0.9343750178813934,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 10.500781536102295,
"epoch": 0.3563799887577291,
"kl": 1.9854409098625183,
"learning_rate": 9.964494382022472e-07,
"loss": 0.051377031952142715,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.71875,
"reward": 1.921875,
"reward_std": 0.858985036611557,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.9453125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.995312511920929,
"rewards/point_reward": 0.926562488079071,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 10.478906154632568,
"epoch": 0.357504215851602,
"kl": 1.9958835244178772,
"learning_rate": 9.96438202247191e-07,
"loss": 0.05269791930913925,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.75,
"reward": 1.9250000715255737,
"reward_std": 0.8795750141143799,
"rewards/avg_0": 1.8671875,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.9375,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.995312511920929,
"rewards/point_reward": 0.9296875298023224,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 10.481250286102295,
"epoch": 0.35862844294547497,
"kl": 2.0202487111091614,
"learning_rate": 9.964269662921348e-07,
"loss": 0.05175580829381943,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.6953125,
"reward": 1.9187500476837158,
"reward_std": 0.8507504165172577,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9945312738418579,
"rewards/point_reward": 0.9242187738418579,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 10.442187786102295,
"epoch": 0.35975267003934797,
"kl": 2.0611737966537476,
"learning_rate": 9.964157303370787e-07,
"loss": 0.0645291656255722,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.796875,
"reward": 1.9429687857627869,
"reward_std": 0.902433842420578,
"rewards/avg_0": 1.9375,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.9375,
"rewards/avg_3": 1.9375,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.9375,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.9453125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.99609375,
"rewards/point_reward": 0.9468750059604645,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 10.372656345367432,
"epoch": 0.3608768971332209,
"kl": 2.1989023685455322,
"learning_rate": 9.964044943820226e-07,
"loss": 0.06326432526111603,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.765625,
"reward": 1.9335938096046448,
"reward_std": 0.8833630681037903,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.995312511920929,
"rewards/point_reward": 0.9382812678813934,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 10.446875095367432,
"epoch": 0.36200112422709385,
"kl": 2.1978585720062256,
"learning_rate": 9.963932584269662e-07,
"loss": 0.046397365629673004,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.7265625,
"reward": 1.91015625,
"reward_std": 0.8547484576702118,
"rewards/avg_0": 1.8671875,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.8828125,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9937500357627869,
"rewards/point_reward": 0.9164062440395355,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 10.348437309265137,
"epoch": 0.36312535132096685,
"kl": 2.1940536499023438,
"learning_rate": 9.963820224719101e-07,
"loss": 0.05930664390325546,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.734375,
"reward": 1.9296875,
"reward_std": 0.8640153706073761,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.9375,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.9296875,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9890625178813934,
"rewards/point_reward": 0.940625011920929,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 10.446094036102295,
"epoch": 0.3642495784148398,
"kl": 2.160746693611145,
"learning_rate": 9.96370786516854e-07,
"loss": 0.0425700843334198,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6875,
"reward": 1.90234375,
"reward_std": 0.8400704264640808,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.987500011920929,
"rewards/point_reward": 0.9148437678813934,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 10.309375286102295,
"epoch": 0.36537380550871273,
"kl": 2.020435869693756,
"learning_rate": 9.963595505617977e-07,
"loss": 0.05405348166823387,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.75,
"reward": 1.9265625476837158,
"reward_std": 0.8764021694660187,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.99609375,
"rewards/point_reward": 0.9304687678813934,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 10.322656631469727,
"epoch": 0.36649803260258573,
"kl": 2.034945547580719,
"learning_rate": 9.963483146067416e-07,
"loss": 0.056264981627464294,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.765625,
"reward": 1.943750023841858,
"reward_std": 0.8759823739528656,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.9375,
"rewards/avg_2": 1.9296875,
"rewards/avg_3": 1.9453125,
"rewards/avg_4": 1.9296875,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.9453125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.995312511920929,
"rewards/point_reward": 0.948437511920929,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 10.303906440734863,
"epoch": 0.3676222596964587,
"kl": 2.0568439960479736,
"learning_rate": 9.963370786516855e-07,
"loss": 0.04515969753265381,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.671875,
"reward": 1.899218738079071,
"reward_std": 0.8356598913669586,
"rewards/avg_0": 1.8125,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.8515625,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.84375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9882812798023224,
"rewards/point_reward": 0.9109375178813934,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 10.306250095367432,
"epoch": 0.3687464867903317,
"kl": 1.9643288254737854,
"learning_rate": 9.963258426966292e-07,
"loss": 0.053189441561698914,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.78125,
"reward": 1.939843773841858,
"reward_std": 0.8839648067951202,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.9375,
"rewards/avg_3": 1.9296875,
"rewards/avg_4": 1.9296875,
"rewards/avg_5": 1.953125,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.99609375,
"rewards/point_reward": 0.9437499940395355,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 10.207812786102295,
"epoch": 0.3698707138842046,
"kl": 2.013589084148407,
"learning_rate": 9.96314606741573e-07,
"loss": 0.04469640925526619,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.7109375,
"reward": 1.919531226158142,
"reward_std": 0.8481138348579407,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.9140625,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9945312440395355,
"rewards/point_reward": 0.925000011920929,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 10.314844131469727,
"epoch": 0.37099494097807756,
"kl": 1.8831562399864197,
"learning_rate": 9.96303370786517e-07,
"loss": 0.043579135090112686,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.7421875,
"reward": 1.9296875596046448,
"reward_std": 0.8622469007968903,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.9375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.9453125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.995312511920929,
"rewards/point_reward": 0.9343750178813934,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 10.259375095367432,
"epoch": 0.37211916807195056,
"kl": 1.9401150941848755,
"learning_rate": 9.962921348314606e-07,
"loss": 0.034515127539634705,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.65625,
"reward": 1.90234375,
"reward_std": 0.8185981810092926,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9929687678813934,
"rewards/point_reward": 0.909375011920929,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 10.256249904632568,
"epoch": 0.3732433951658235,
"kl": 1.9391788840293884,
"learning_rate": 9.962808988764045e-07,
"loss": 0.04044695198535919,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.6875,
"reward": 1.9109375476837158,
"reward_std": 0.8457313776016235,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.9296875,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.995312511920929,
"rewards/point_reward": 0.9156250059604645,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 10.24375057220459,
"epoch": 0.37436762225969644,
"kl": 1.934513509273529,
"learning_rate": 9.962696629213482e-07,
"loss": 0.040316130965948105,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.7421875,
"reward": 1.917187511920929,
"reward_std": 0.8681433200836182,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.9140625,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.995312511920929,
"rewards/point_reward": 0.9218750298023224,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 10.280468940734863,
"epoch": 0.37549184935356944,
"kl": 1.921375572681427,
"learning_rate": 9.96258426966292e-07,
"loss": 0.04540445655584335,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.7109375,
"reward": 1.924218773841858,
"reward_std": 0.855448454618454,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.9296875,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9929687678813934,
"rewards/point_reward": 0.9312500357627869,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 10.259375095367432,
"epoch": 0.3766160764474424,
"kl": 1.8859546780586243,
"learning_rate": 9.96247191011236e-07,
"loss": 0.04824208840727806,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.71875,
"reward": 1.930468738079071,
"reward_std": 0.8533749580383301,
"rewards/avg_0": 1.9296875,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.8828125,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.9296875,
"rewards/avg_6": 1.9296875,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.995312511920929,
"rewards/point_reward": 0.9351562559604645,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 10.177343845367432,
"epoch": 0.3777403035413153,
"kl": 1.868826985359192,
"learning_rate": 9.962359550561796e-07,
"loss": 0.0528462678194046,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.7890625,
"reward": 1.94921875,
"reward_std": 0.8829784393310547,
"rewards/avg_0": 1.953125,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.9296875,
"rewards/avg_3": 1.9453125,
"rewards/avg_4": 1.9453125,
"rewards/avg_5": 1.9296875,
"rewards/avg_6": 1.9296875,
"rewards/avg_7": 1.9453125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9500000178813934,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 10.29453182220459,
"epoch": 0.3788645306351883,
"kl": 1.7946856617927551,
"learning_rate": 9.962247191011235e-07,
"loss": 0.04495403170585632,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.6953125,
"reward": 1.9117187857627869,
"reward_std": 0.8583216667175293,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9976562559604645,
"rewards/point_reward": 0.9140625,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 10.392187595367432,
"epoch": 0.37998875772906127,
"kl": 1.770771324634552,
"learning_rate": 9.962134831460674e-07,
"loss": 0.03466241434216499,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.6328125,
"reward": 1.9062500596046448,
"reward_std": 0.804929107427597,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.8515625,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375238418579,
"rewards/point_reward": 0.9078125059604645,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 10.415625095367432,
"epoch": 0.3811129848229342,
"kl": 1.7638018727302551,
"learning_rate": 9.96202247191011e-07,
"loss": 0.038400210440158844,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.75,
"reward": 1.917187511920929,
"reward_std": 0.8736513555049896,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.91796875,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 10.450000286102295,
"epoch": 0.3822372119168072,
"kl": 1.7452391982078552,
"learning_rate": 9.96191011235955e-07,
"loss": 0.031908392906188965,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.734375,
"reward": 1.90625,
"reward_std": 0.8690169453620911,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.8671875,
"rewards/avg_2": 1.8671875,
"rewards/avg_3": 1.8515625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9976562559604645,
"rewards/point_reward": 0.9085937738418579,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 10.489843845367432,
"epoch": 0.38336143901068015,
"kl": 1.7394488453865051,
"learning_rate": 9.961797752808989e-07,
"loss": 0.03782523423433304,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7109375,
"reward": 1.9148437976837158,
"reward_std": 0.8547420799732208,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9968750178813934,
"rewards/point_reward": 0.91796875,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 10.541406631469727,
"epoch": 0.3844856661045531,
"kl": 1.6925815343856812,
"learning_rate": 9.961685393258426e-07,
"loss": 0.031927358359098434,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.75,
"reward": 1.9226562976837158,
"reward_std": 0.8643298447132111,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.9296875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9234375059604645,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 10.5390625,
"epoch": 0.3856098931984261,
"kl": 1.673849105834961,
"learning_rate": 9.961573033707865e-07,
"loss": 0.02241407334804535,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.6640625,
"reward": 1.8765625357627869,
"reward_std": 0.8381451368331909,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.8203125,
"rewards/avg_3": 1.828125,
"rewards/avg_4": 1.8359375,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.8203125,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984374940395355,
"rewards/point_reward": 0.878125011920929,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 10.5078125,
"epoch": 0.38673412029229903,
"kl": 1.6821864247322083,
"learning_rate": 9.961460674157303e-07,
"loss": 0.04435833916068077,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.7734375,
"reward": 1.940625011920929,
"reward_std": 0.8782646358013153,
"rewards/avg_0": 1.9296875,
"rewards/avg_1": 1.9375,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.9140625,
"rewards/avg_6": 1.9453125,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.940625011920929,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 10.510937690734863,
"epoch": 0.38785834738617203,
"kl": 1.7037270069122314,
"learning_rate": 9.96134831460674e-07,
"loss": 0.02727324329316616,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.703125,
"reward": 1.9140625596046448,
"reward_std": 0.8372543752193451,
"rewards/avg_0": 1.8671875,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9148437678813934,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 10.547656536102295,
"epoch": 0.388982574480045,
"kl": 1.6825117468833923,
"learning_rate": 9.96123595505618e-07,
"loss": 0.037956684827804565,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.765625,
"reward": 1.928906261920929,
"reward_std": 0.8790275156497955,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.9296875,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.9296875,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.9296875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375238418579,
"rewards/point_reward": 0.9304687678813934,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 10.546875,
"epoch": 0.3901068015739179,
"kl": 1.700487732887268,
"learning_rate": 9.961123595505618e-07,
"loss": 0.03412896394729614,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.71875,
"reward": 1.921875,
"reward_std": 0.8527751564979553,
"rewards/avg_0": 1.9453125,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.8828125,
"rewards/avg_3": 1.9453125,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.8515625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9976562559604645,
"rewards/point_reward": 0.9242187738418579,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 10.485937595367432,
"epoch": 0.3912310286677909,
"kl": 1.7351809740066528,
"learning_rate": 9.961011235955055e-07,
"loss": 0.024284057319164276,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.71875,
"reward": 1.90234375,
"reward_std": 0.8538567125797272,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.8671875,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375238418579,
"rewards/point_reward": 0.9039062857627869,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 10.400781631469727,
"epoch": 0.39235525576166386,
"kl": 1.7726843357086182,
"learning_rate": 9.960898876404494e-07,
"loss": 0.04192570224404335,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.6875,
"reward": 1.9101563096046448,
"reward_std": 0.8402937352657318,
"rewards/avg_0": 1.8515625,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.8671875,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984374940395355,
"rewards/point_reward": 0.9117187559604645,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 10.570312976837158,
"epoch": 0.3934794828555368,
"kl": 1.7546494603157043,
"learning_rate": 9.960786516853933e-07,
"loss": 0.02213025651872158,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.609375,
"reward": 1.872656226158142,
"reward_std": 0.8090003728866577,
"rewards/avg_0": 1.84375,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.8046875,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.8515625,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.8359375,
"rewards/avg_7": 1.8515625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9976562559604645,
"rewards/point_reward": 0.875,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 10.430468559265137,
"epoch": 0.3946037099494098,
"kl": 1.813601315021515,
"learning_rate": 9.96067415730337e-07,
"loss": 0.037648413330316544,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.6796875,
"reward": 1.9265625476837158,
"reward_std": 0.8125456273555756,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.9296875,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9273437559604645,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 10.321094036102295,
"epoch": 0.39572793704328274,
"kl": 1.8753434419631958,
"learning_rate": 9.960561797752808e-07,
"loss": 0.039298996329307556,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.671875,
"reward": 1.91015625,
"reward_std": 0.8232802748680115,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.8671875,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9109375178813934,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 10.334375381469727,
"epoch": 0.3968521641371557,
"kl": 1.867890477180481,
"learning_rate": 9.960449438202247e-07,
"loss": 0.04137108474969864,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.71875,
"reward": 1.909375011920929,
"reward_std": 0.8602083325386047,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.91015625,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 10.3515625,
"epoch": 0.3979763912310287,
"kl": 1.7816566228866577,
"learning_rate": 9.960337078651684e-07,
"loss": 0.031370680779218674,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.703125,
"reward": 1.900781273841858,
"reward_std": 0.8489590287208557,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.901562511920929,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 10.286718845367432,
"epoch": 0.3991006183249016,
"kl": 1.8434048295021057,
"learning_rate": 9.960224719101123e-07,
"loss": 0.03749451786279678,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.6796875,
"reward": 1.8921875357627869,
"reward_std": 0.8432820439338684,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.84375,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.8671875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.8921875059604645,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 10.339062690734863,
"epoch": 0.40022484541877457,
"kl": 1.7783136367797852,
"learning_rate": 9.960112359550562e-07,
"loss": 0.03067711368203163,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.6328125,
"reward": 1.8835937976837158,
"reward_std": 0.8181053400039673,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.8671875,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.8359375,
"rewards/avg_5": 1.8671875,
"rewards/avg_6": 1.84375,
"rewards/avg_7": 1.8671875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.883593738079071,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 10.340625286102295,
"epoch": 0.40134907251264756,
"kl": 1.7568607330322266,
"learning_rate": 9.959999999999999e-07,
"loss": 0.03994826227426529,
"ratio/all_0": 0.0546875,
"ratio/all_2": 0.703125,
"reward": 1.8953125476837158,
"reward_std": 0.8587200343608856,
"rewards/avg_0": 1.8359375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.8671875,
"rewards/avg_5": 1.875,
"rewards/avg_6": 1.8671875,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.8960937559604645,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 10.377344131469727,
"epoch": 0.4024732996065205,
"kl": 1.7589952945709229,
"learning_rate": 9.959887640449438e-07,
"loss": 0.035448405891656876,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.640625,
"reward": 1.9195312857627869,
"reward_std": 0.8011971116065979,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.9296875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9203125238418579,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 10.403125286102295,
"epoch": 0.4035975267003935,
"kl": 2.400500178337097,
"learning_rate": 9.959775280898876e-07,
"loss": 0.0631265640258789,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.6953125,
"reward": 1.928125023841858,
"reward_std": 0.8244198858737946,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.9296875,
"rewards/avg_7": 1.9296875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9281249940395355,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 10.425781726837158,
"epoch": 0.40472175379426645,
"kl": 1.7974015474319458,
"learning_rate": 9.959662921348313e-07,
"loss": 0.040458083152770996,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.7421875,
"reward": 1.9289063215255737,
"reward_std": 0.8593026697635651,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9296875,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 10.502344131469727,
"epoch": 0.4058459808881394,
"kl": 1.714927613735199,
"learning_rate": 9.959550561797752e-07,
"loss": 0.016150854527950287,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.6484375,
"reward": 1.8867188096046448,
"reward_std": 0.8087844252586365,
"rewards/avg_0": 1.8515625,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.8671875,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.8515625,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.88671875,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 10.498437404632568,
"epoch": 0.4069702079820124,
"kl": 1.7653551697731018,
"learning_rate": 9.959438202247191e-07,
"loss": 0.038302332162857056,
"ratio/all_0": 0.046875,
"ratio/all_2": 0.671875,
"reward": 1.897656261920929,
"reward_std": 0.8373755216598511,
"rewards/avg_0": 1.8671875,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.8671875,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.8515625,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375238418579,
"rewards/point_reward": 0.8992187678813934,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 10.546875,
"epoch": 0.40809443507588533,
"kl": 1.7473861575126648,
"learning_rate": 9.959325842696628e-07,
"loss": 0.031032182276248932,
"ratio/all_0": 0.0546875,
"ratio/all_2": 0.6484375,
"reward": 1.880468726158142,
"reward_std": 0.8342385292053223,
"rewards/avg_0": 1.8359375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.8203125,
"rewards/avg_5": 1.8515625,
"rewards/avg_6": 1.8515625,
"rewards/avg_7": 1.8515625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375238418579,
"rewards/point_reward": 0.882031261920929,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 10.524219036102295,
"epoch": 0.4092186621697583,
"kl": 1.738921344280243,
"learning_rate": 9.959213483146067e-07,
"loss": 0.033890679478645325,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.71875,
"reward": 1.909375011920929,
"reward_std": 0.8538974523544312,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.8671875,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.91015625,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 10.576562404632568,
"epoch": 0.41034288926363127,
"kl": 1.743700623512268,
"learning_rate": 9.959101123595506e-07,
"loss": 0.03802679851651192,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.6875,
"reward": 1.90625,
"reward_std": 0.8454999029636383,
"rewards/avg_0": 1.8515625,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.8828125,
"rewards/avg_6": 1.890625,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9976562559604645,
"rewards/point_reward": 0.9085937738418579,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 10.540625095367432,
"epoch": 0.4114671163575042,
"kl": 1.8171001076698303,
"learning_rate": 9.958988764044942e-07,
"loss": 0.029331211000680923,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7109375,
"reward": 1.899218738079071,
"reward_std": 0.8547923564910889,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.8828125,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.8515625,
"rewards/avg_7": 1.8359375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9976562559604645,
"rewards/point_reward": 0.901562511920929,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 10.59765625,
"epoch": 0.41259134345137716,
"kl": 1.8153007626533508,
"learning_rate": 9.958876404494381e-07,
"loss": 0.031290844082832336,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6875,
"reward": 1.9117187857627869,
"reward_std": 0.8251713514328003,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.8671875,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.8671875,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375238418579,
"rewards/point_reward": 0.913281261920929,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 10.586718559265137,
"epoch": 0.41371557054525016,
"kl": 1.7499891519546509,
"learning_rate": 9.95876404494382e-07,
"loss": 0.045041367411613464,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7578125,
"reward": 1.928125023841858,
"reward_std": 0.8811922073364258,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.9375,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.921875,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375238418579,
"rewards/point_reward": 0.9296875298023224,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 10.546093940734863,
"epoch": 0.4148397976391231,
"kl": 1.7742969393730164,
"learning_rate": 9.958651685393257e-07,
"loss": 0.03312449902296066,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.65625,
"reward": 1.9078125357627869,
"reward_std": 0.8149179220199585,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.859375,
"rewards/avg_2": 1.8828125,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.9140625,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9085937738418579,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 10.528125286102295,
"epoch": 0.41596402473299604,
"kl": 1.8458644151687622,
"learning_rate": 9.958539325842696e-07,
"loss": 0.04537215456366539,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.703125,
"reward": 1.932812511920929,
"reward_std": 0.8349111080169678,
"rewards/avg_0": 1.953125,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.9296875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.9296875,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.93359375,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 10.56796932220459,
"epoch": 0.41708825182686904,
"kl": 1.7916463613510132,
"learning_rate": 9.958426966292135e-07,
"loss": 0.027775993570685387,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.6484375,
"reward": 1.899999976158142,
"reward_std": 0.8068560361862183,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.8671875,
"rewards/avg_2": 1.8515625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 1.0,
"rewards/point_reward": 0.9000000357627869,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 10.571094036102295,
"epoch": 0.418212478920742,
"kl": 1.8079155087471008,
"learning_rate": 9.958314606741572e-07,
"loss": 0.0378127321600914,
"ratio/all_0": 0.0,
"ratio/all_2": 0.6640625,
"reward": 1.928906261920929,
"reward_std": 0.8060687780380249,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.953125,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.90625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9296875298023224,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 10.447656631469727,
"epoch": 0.419336706014615,
"kl": 2.0055981278419495,
"learning_rate": 9.95820224719101e-07,
"loss": 0.05942863970994949,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.8203125,
"reward": 1.939843773841858,
"reward_std": 0.9128240644931793,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.9296875,
"rewards/avg_2": 1.9453125,
"rewards/avg_3": 1.9296875,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.9296875,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.940625011920929,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 10.457812786102295,
"epoch": 0.4204609331084879,
"kl": 2.0407444834709167,
"learning_rate": 9.958089887640447e-07,
"loss": 0.047153279185295105,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.6796875,
"reward": 1.9148437976837158,
"reward_std": 0.8315493166446686,
"rewards/avg_0": 1.9296875,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.921875,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.8515625,
"rewards/avg_6": 1.8671875,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9976562559604645,
"rewards/point_reward": 0.917187511920929,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 10.54453182220459,
"epoch": 0.42158516020236086,
"kl": 2.314019560813904,
"learning_rate": 9.957977528089886e-07,
"loss": 0.058852873742580414,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.6484375,
"reward": 1.9054688215255737,
"reward_std": 0.8244183659553528,
"rewards/avg_0": 1.8515625,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9921875,
"rewards/point_reward": 0.913281261920929,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 10.529687881469727,
"epoch": 0.42270938729623386,
"kl": 2.133104085922241,
"learning_rate": 9.957865168539325e-07,
"loss": 0.04806741327047348,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6953125,
"reward": 1.907031238079071,
"reward_std": 0.8460875153541565,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.8515625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.890625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9968750178813934,
"rewards/point_reward": 0.91015625,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 10.531250476837158,
"epoch": 0.4238336143901068,
"kl": 2.215003252029419,
"learning_rate": 9.957752808988764e-07,
"loss": 0.05868378281593323,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.703125,
"reward": 1.905468761920929,
"reward_std": 0.8680282831192017,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.8671875,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9945312440395355,
"rewards/point_reward": 0.9109375178813934,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 10.579687595367432,
"epoch": 0.42495784148397975,
"kl": 1.8949918150901794,
"learning_rate": 9.9576404494382e-07,
"loss": 0.05246419459581375,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.765625,
"reward": 1.9468750357627869,
"reward_std": 0.8695424795150757,
"rewards/avg_0": 1.921875,
"rewards/avg_1": 1.9375,
"rewards/avg_2": 1.9375,
"rewards/avg_3": 1.9453125,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.9609375,
"rewards/avg_6": 1.9296875,
"rewards/avg_7": 1.9296875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9476562738418579,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 10.537499904632568,
"epoch": 0.42608206857785275,
"kl": 2.0464513897895813,
"learning_rate": 9.95752808988764e-07,
"loss": 0.046930864453315735,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.6796875,
"reward": 1.9250000715255737,
"reward_std": 0.8235551118850708,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.9296875,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9976562559604645,
"rewards/point_reward": 0.9273437559604645,
"step": 379
},
{
"clip_ratio": 0.0,
"completion_length": 10.576562404632568,
"epoch": 0.4272062956717257,
"kl": 2.3635306358337402,
"learning_rate": 9.957415730337079e-07,
"loss": 0.06351582705974579,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.6875,
"reward": 1.91015625,
"reward_std": 0.8465672135353088,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.875,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.8671875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9945312738418579,
"rewards/point_reward": 0.9156250059604645,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 10.589062690734863,
"epoch": 0.42833052276559863,
"kl": 2.1300774812698364,
"learning_rate": 9.957303370786516e-07,
"loss": 0.04272787272930145,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.671875,
"reward": 1.896875023841858,
"reward_std": 0.8363301753997803,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.84375,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.8671875,
"rewards/avg_5": 1.8515625,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.996874988079071,
"rewards/point_reward": 0.9000000059604645,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 10.495312690734863,
"epoch": 0.42945474985947163,
"kl": 1.9909522533416748,
"learning_rate": 9.957191011235954e-07,
"loss": 0.050648171454668045,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6875,
"reward": 1.9156250357627869,
"reward_std": 0.8425585627555847,
"rewards/avg_0": 1.875,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.8359375,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9976562559604645,
"rewards/point_reward": 0.91796875,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 10.517187595367432,
"epoch": 0.43057897695334457,
"kl": 1.9220994114875793,
"learning_rate": 9.957078651685393e-07,
"loss": 0.046620436012744904,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.6796875,
"reward": 1.9304687976837158,
"reward_std": 0.8176408112049103,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.9375,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.9312500059604645,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 10.509375095367432,
"epoch": 0.4317032040472175,
"kl": 2.019084930419922,
"learning_rate": 9.95696629213483e-07,
"loss": 0.052892833948135376,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.6953125,
"reward": 1.9148437976837158,
"reward_std": 0.8502573072910309,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.859375,
"rewards/avg_5": 1.8671875,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.9375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.99609375,
"rewards/point_reward": 0.9187500178813934,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 10.425781726837158,
"epoch": 0.4328274311410905,
"kl": 1.8879412412643433,
"learning_rate": 9.95685393258427e-07,
"loss": 0.031065676361322403,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.6640625,
"reward": 1.909375011920929,
"reward_std": 0.8127116858959198,
"rewards/avg_0": 1.890625,
"rewards/avg_1": 1.8984375,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.99609375,
"rewards/point_reward": 0.913281261920929,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 10.498437881469727,
"epoch": 0.43395165823496346,
"kl": 1.8726175427436829,
"learning_rate": 9.956741573033708e-07,
"loss": 0.049964789301157,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.7421875,
"reward": 1.944531261920929,
"reward_std": 0.8532433807849884,
"rewards/avg_0": 1.9296875,
"rewards/avg_1": 1.9140625,
"rewards/avg_2": 1.8984375,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.9375,
"rewards/avg_6": 1.9453125,
"rewards/avg_7": 1.96875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984375238418579,
"rewards/point_reward": 0.9460937678813934,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 10.471875190734863,
"epoch": 0.4350758853288364,
"kl": 1.9985505938529968,
"learning_rate": 9.956629213483145e-07,
"loss": 0.04716179892420769,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7109375,
"reward": 1.913281261920929,
"reward_std": 0.8565007448196411,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.9140625,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.9140625,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.859375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.995312511920929,
"rewards/point_reward": 0.9179687798023224,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 10.591406345367432,
"epoch": 0.4362001124227094,
"kl": 1.9754056334495544,
"learning_rate": 9.956516853932584e-07,
"loss": 0.04457426816225052,
"ratio/all_0": 0.03125,
"ratio/all_2": 0.7109375,
"reward": 1.9148437976837158,
"reward_std": 0.8505724966526031,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9976562559604645,
"rewards/point_reward": 0.917187511920929,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 10.512500286102295,
"epoch": 0.43732433951658234,
"kl": 1.8616729974746704,
"learning_rate": 9.956404494382023e-07,
"loss": 0.04564625769853592,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.7265625,
"reward": 1.928125023841858,
"reward_std": 0.8594686686992645,
"rewards/avg_0": 1.90625,
"rewards/avg_1": 1.8828125,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.9453125,
"rewards/avg_4": 1.8828125,
"rewards/avg_5": 1.9453125,
"rewards/avg_6": 1.9140625,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.996874988079071,
"rewards/point_reward": 0.9312500059604645,
"step": 389
},
{
"clip_ratio": 0.0,
"completion_length": 10.460937976837158,
"epoch": 0.43844856661045534,
"kl": 1.9120779633522034,
"learning_rate": 9.95629213483146e-07,
"loss": 0.05435582995414734,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.796875,
"reward": 1.947656273841858,
"reward_std": 0.8881878852844238,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.9453125,
"rewards/avg_2": 1.9453125,
"rewards/avg_3": 1.9453125,
"rewards/avg_4": 1.9375,
"rewards/avg_5": 1.9296875,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.921875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.999218761920929,
"rewards/point_reward": 0.948437511920929,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 10.542187690734863,
"epoch": 0.4395727937043283,
"kl": 2.190659284591675,
"learning_rate": 9.956179775280898e-07,
"loss": 0.0439864918589592,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.6484375,
"reward": 1.8921875357627869,
"reward_std": 0.8169780969619751,
"rewards/avg_0": 1.859375,
"rewards/avg_1": 1.875,
"rewards/avg_2": 1.8828125,
"rewards/avg_3": 1.8671875,
"rewards/avg_4": 1.875,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.8359375,
"rewards/avg_7": 1.8828125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9984374940395355,
"rewards/point_reward": 0.893750011920929,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 10.589844226837158,
"epoch": 0.4406970207982012,
"kl": 2.186584234237671,
"learning_rate": 9.956067415730337e-07,
"loss": 0.04994940757751465,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.6484375,
"reward": 1.901562511920929,
"reward_std": 0.823219507932663,
"rewards/avg_0": 1.828125,
"rewards/avg_1": 1.8671875,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.890625,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.875,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9968750178813934,
"rewards/point_reward": 0.9046875238418579,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 10.546875,
"epoch": 0.4418212478920742,
"kl": 2.2678062915802,
"learning_rate": 9.955955056179774e-07,
"loss": 0.06236354634165764,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.6875,
"reward": 1.9226562976837158,
"reward_std": 0.837425947189331,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.921875,
"rewards/avg_2": 1.8828125,
"rewards/avg_3": 1.9140625,
"rewards/avg_4": 1.9140625,
"rewards/avg_5": 1.9296875,
"rewards/avg_6": 1.8828125,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9945312738418579,
"rewards/point_reward": 0.9281249940395355,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 10.487500190734863,
"epoch": 0.44294547498594716,
"kl": 2.0774426460266113,
"learning_rate": 9.955842696629213e-07,
"loss": 0.05997675657272339,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.71875,
"reward": 1.935937523841858,
"reward_std": 0.8445488810539246,
"rewards/avg_0": 1.9296875,
"rewards/avg_1": 1.9453125,
"rewards/avg_2": 1.859375,
"rewards/avg_3": 1.9296875,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.9375,
"rewards/avg_7": 1.9453125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9976562559604645,
"rewards/point_reward": 0.938281238079071,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 10.522656440734863,
"epoch": 0.4440697020798201,
"kl": 2.3769845962524414,
"learning_rate": 9.955730337078652e-07,
"loss": 0.05876045674085617,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.7421875,
"reward": 1.9148437976837158,
"reward_std": 0.8725523352622986,
"rewards/avg_0": 1.8828125,
"rewards/avg_1": 1.8515625,
"rewards/avg_2": 1.890625,
"rewards/avg_3": 1.90625,
"rewards/avg_4": 1.90625,
"rewards/avg_5": 1.890625,
"rewards/avg_6": 1.921875,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.995312511920929,
"rewards/point_reward": 0.9195312559604645,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 10.563281536102295,
"epoch": 0.4451939291736931,
"kl": 2.1915425062179565,
"learning_rate": 9.955617977528089e-07,
"loss": 0.05748950317502022,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.703125,
"reward": 1.916406273841858,
"reward_std": 0.8571533858776093,
"rewards/avg_0": 1.9140625,
"rewards/avg_1": 1.90625,
"rewards/avg_2": 1.8828125,
"rewards/avg_3": 1.8984375,
"rewards/avg_4": 1.921875,
"rewards/avg_5": 1.859375,
"rewards/avg_6": 1.90625,
"rewards/avg_7": 1.875,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.992968738079071,
"rewards/point_reward": 0.9234375059604645,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 10.535937786102295,
"epoch": 0.44631815626756605,
"kl": 2.4020025730133057,
"learning_rate": 9.955505617977527e-07,
"loss": 0.041312672197818756,
"ratio/all_0": 0.0234375,
"ratio/all_2": 0.5625,
"reward": 1.8679687976837158,
"reward_std": 0.7768263518810272,
"rewards/avg_0": 1.8515625,
"rewards/avg_1": 1.8125,
"rewards/avg_2": 1.84375,
"rewards/avg_3": 1.859375,
"rewards/avg_4": 1.8515625,
"rewards/avg_5": 1.828125,
"rewards/avg_6": 1.8125,
"rewards/avg_7": 1.8203125,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9921875,
"rewards/point_reward": 0.8757812678813934,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 10.42578125,
"epoch": 0.447442383361439,
"kl": 2.278535842895508,
"learning_rate": 9.955393258426966e-07,
"loss": 0.05333054065704346,
"ratio/all_0": 0.0078125,
"ratio/all_2": 0.7109375,
"reward": 1.9179688096046448,
"reward_std": 0.8447178602218628,
"rewards/avg_0": 1.8984375,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.90625,
"rewards/avg_3": 1.8828125,
"rewards/avg_4": 1.8984375,
"rewards/avg_5": 1.90625,
"rewards/avg_6": 1.8984375,
"rewards/avg_7": 1.8984375,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.991406261920929,
"rewards/point_reward": 0.9265625178813934,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 10.431250095367432,
"epoch": 0.448566610455312,
"kl": 2.0164719820022583,
"learning_rate": 9.955280898876403e-07,
"loss": 0.05713193863630295,
"ratio/all_0": 0.015625,
"ratio/all_2": 0.71875,
"reward": 1.930468738079071,
"reward_std": 0.8625682890415192,
"rewards/avg_0": 1.9296875,
"rewards/avg_1": 1.890625,
"rewards/avg_2": 1.921875,
"rewards/avg_3": 1.9296875,
"rewards/avg_4": 1.890625,
"rewards/avg_5": 1.8984375,
"rewards/avg_6": 1.9296875,
"rewards/avg_7": 1.9140625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.9929687678813934,
"rewards/point_reward": 0.9375000298023224,
"step": 399
},
{
"clip_ratio": 0.0,
"completion_length": 10.560156345367432,
"epoch": 0.44969083754918493,
"kl": 2.090118169784546,
"learning_rate": 9.955168539325842e-07,
"loss": 0.04012680798768997,
"ratio/all_0": 0.0390625,
"ratio/all_2": 0.609375,
"reward": 1.88671875,
"reward_std": 0.7999500036239624,
"rewards/avg_0": 1.8671875,
"rewards/avg_1": 1.8515625,
"rewards/avg_2": 1.875,
"rewards/avg_3": 1.8515625,
"rewards/avg_4": 1.8671875,
"rewards/avg_5": 1.84375,
"rewards/avg_6": 1.859375,
"rewards/avg_7": 1.8515625,
"rewards/avg_8": 2.0,
"rewards/avg_9": 2.0,
"rewards/format_reward": 0.991406261920929,
"rewards/point_reward": 0.8953125178813934,
"step": 400
}
],
"logging_steps": 1.0,
"max_steps": 89000,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}