MMR-Sigmoid-DAPO-8B / trainer_state.json
kangdawei's picture
Model save
f2ec6e6 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5714285714285714,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_fraction": 0.0,
"completion_length": 2523.270866394043,
"epoch": 0.001142857142857143,
"grad_norm": 0.06654668599367142,
"kl": 0.0,
"lambda_div_used": 0.6164634302258492,
"learning_rate": 0.0,
"loss": -0.0258,
"reward": -0.14551701629534364,
"reward_after_mean": -0.14551701629534364,
"reward_after_std": 0.6225011153146625,
"reward_before_mean": 0.17862090840935707,
"reward_before_std": 0.5394803490489721,
"reward_change_max": 0.0,
"reward_change_mean": -0.3241379093378782,
"reward_change_min": -0.5330121107399464,
"reward_change_std": 0.19750467501580715,
"reward_std": 0.622501116245985,
"rewards/accuracy_reward": 0.2500000037252903,
"rewards/cosine_scaled_reward": -0.0713790925219655,
"step": 1
},
{
"clip_fraction": 0.0,
"completion_length": 2684.583366394043,
"epoch": 0.002285714285714286,
"grad_norm": 0.07638121396303177,
"kl": 0.0,
"lambda_div_used": 0.5910675376653671,
"learning_rate": 5e-08,
"loss": 0.001,
"reward": -0.07136534340679646,
"reward_after_mean": -0.07136534340679646,
"reward_after_std": 0.5026816055178642,
"reward_before_mean": 0.33918463438749313,
"reward_before_std": 0.41114553064107895,
"reward_change_max": 0.0,
"reward_change_mean": -0.4105500001460314,
"reward_change_min": -0.619081187993288,
"reward_change_std": 0.23483010660856962,
"reward_std": 0.5026816166937351,
"rewards/accuracy_reward": 0.29166667349636555,
"rewards/cosine_scaled_reward": 0.047517990693449974,
"step": 2
},
{
"clip_fraction": 0.0,
"completion_length": 2966.9166870117188,
"epoch": 0.0034285714285714284,
"grad_norm": 0.07826078683137894,
"kl": 5.805492401123047e-05,
"lambda_div_used": 0.5747007578611374,
"learning_rate": 1e-07,
"loss": 0.0118,
"reward": -0.3506452329456806,
"reward_after_mean": -0.3506452329456806,
"reward_after_std": 0.40795029513537884,
"reward_before_mean": -0.06986081041395664,
"reward_before_std": 0.3391748256981373,
"reward_change_max": 0.0,
"reward_change_mean": -0.280784422531724,
"reward_change_min": -0.45085202157497406,
"reward_change_std": 0.16326917707920074,
"reward_std": 0.4079503044486046,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.1323608043603599,
"step": 3
},
{
"clip_fraction": 0.0,
"completion_length": 1377.000015258789,
"epoch": 0.004571428571428572,
"grad_norm": 0.10993114858865738,
"kl": 3.951042890548706e-05,
"lambda_div_used": 0.5788158848881721,
"learning_rate": 1.5e-07,
"loss": 0.0214,
"reward": -0.24558139964938164,
"reward_after_mean": -0.24558139964938164,
"reward_after_std": 0.3998244144022465,
"reward_before_mean": 0.06915237568318844,
"reward_before_std": 0.36195528600364923,
"reward_change_max": 0.0,
"reward_change_mean": -0.3147337753325701,
"reward_change_min": -0.4869973622262478,
"reward_change_std": 0.1927295122295618,
"reward_std": 0.3998244162648916,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/cosine_scaled_reward": -0.09751429967582226,
"step": 4
},
{
"clip_fraction": 0.0,
"completion_length": 3310.4583587646484,
"epoch": 0.005714285714285714,
"grad_norm": 0.06377790123224258,
"kl": 5.4255127906799316e-05,
"lambda_div_used": 0.61885916441679,
"learning_rate": 2e-07,
"loss": 0.0428,
"reward": -0.25358792673796415,
"reward_after_mean": -0.25358792673796415,
"reward_after_std": 0.5858059301972389,
"reward_before_mean": 0.0003254720941185951,
"reward_before_std": 0.5446656532585621,
"reward_change_max": 0.0,
"reward_change_mean": -0.25391339510679245,
"reward_change_min": -0.45586229115724564,
"reward_change_std": 0.16313681472092867,
"reward_std": 0.5858059376478195,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/cosine_scaled_reward": -0.12467453628778458,
"step": 5
},
{
"clip_fraction": 0.0,
"completion_length": 2887.8541717529297,
"epoch": 0.006857142857142857,
"grad_norm": 0.06263996660709381,
"kl": 5.511939525604248e-05,
"lambda_div_used": 0.5606320649385452,
"learning_rate": 2.5e-07,
"loss": 0.0618,
"reward": -0.4456054698675871,
"reward_after_mean": -0.4456054698675871,
"reward_after_std": 0.3509902711957693,
"reward_before_mean": -0.1859958479180932,
"reward_before_std": 0.2702419590204954,
"reward_change_max": 0.0,
"reward_change_mean": -0.2596096061170101,
"reward_change_min": -0.38109203428030014,
"reward_change_std": 0.14125030301511288,
"reward_std": 0.3509902749210596,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.2276625158265233,
"step": 6
},
{
"clip_fraction": 0.0,
"completion_length": 2572.3542098999023,
"epoch": 0.008,
"grad_norm": 0.06695970892906189,
"kl": 4.0218234062194824e-05,
"lambda_div_used": 0.6162339821457863,
"learning_rate": 3e-07,
"loss": 0.0453,
"reward": -0.10290078446269035,
"reward_after_mean": -0.10290078446269035,
"reward_after_std": 0.566169198602438,
"reward_before_mean": 0.22045390354469419,
"reward_before_std": 0.5333282891660929,
"reward_change_max": 0.0,
"reward_change_mean": -0.32335469499230385,
"reward_change_min": -0.5540991388261318,
"reward_change_std": 0.20453903079032898,
"reward_std": 0.566169211640954,
"rewards/accuracy_reward": 0.20833334140479565,
"rewards/cosine_scaled_reward": 0.012120556086301804,
"step": 7
},
{
"clip_fraction": 0.0,
"completion_length": 2173.7500228881836,
"epoch": 0.009142857142857144,
"grad_norm": 0.07779484242200851,
"kl": 3.185681998729706e-05,
"lambda_div_used": 0.63105358928442,
"learning_rate": 3.5e-07,
"loss": 0.0071,
"reward": 0.09723488846793771,
"reward_after_mean": 0.09723488846793771,
"reward_after_std": 0.6893859151750803,
"reward_before_mean": 0.5079077246482484,
"reward_before_std": 0.6077435212209821,
"reward_change_max": 0.0,
"reward_change_mean": -0.410672839730978,
"reward_change_min": -0.638349361717701,
"reward_change_std": 0.2494220733642578,
"reward_std": 0.6893859468400478,
"rewards/accuracy_reward": 0.3750000074505806,
"rewards/cosine_scaled_reward": 0.1329077403061092,
"step": 8
},
{
"clip_fraction": 0.0,
"completion_length": 2828.229179382324,
"epoch": 0.010285714285714285,
"grad_norm": 0.09342571347951889,
"kl": 4.9620866775512695e-05,
"lambda_div_used": 0.6150719821453094,
"learning_rate": 4e-07,
"loss": -0.0132,
"reward": -0.18431122601032257,
"reward_after_mean": -0.18431122601032257,
"reward_after_std": 0.5804070886224508,
"reward_before_mean": 0.09753156686201692,
"reward_before_std": 0.5302032623440027,
"reward_change_max": 0.0,
"reward_change_mean": -0.2818427961319685,
"reward_change_min": -0.46669338643550873,
"reward_change_std": 0.18001185078173876,
"reward_std": 0.5804071053862572,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.0691350968554616,
"step": 9
},
{
"clip_fraction": 0.0,
"completion_length": 2469.7083587646484,
"epoch": 0.011428571428571429,
"grad_norm": 0.09024360775947571,
"kl": 3.8331374526023865e-05,
"lambda_div_used": 0.63355503231287,
"learning_rate": 4.5e-07,
"loss": -0.001,
"reward": -0.04430920258164406,
"reward_after_mean": -0.04430920258164406,
"reward_after_std": 0.6170141901820898,
"reward_before_mean": 0.2625937759876251,
"reward_before_std": 0.6259458847343922,
"reward_change_max": 0.0,
"reward_change_mean": -0.3069029748439789,
"reward_change_min": -0.5734602250158787,
"reward_change_std": 0.22167105227708817,
"reward_std": 0.6170141994953156,
"rewards/accuracy_reward": 0.2500000074505806,
"rewards/cosine_scaled_reward": 0.012593759223818779,
"step": 10
},
{
"clip_fraction": 0.0,
"completion_length": 3367.500015258789,
"epoch": 0.012571428571428572,
"grad_norm": 0.05533275753259659,
"kl": 5.245208740234375e-05,
"lambda_div_used": 0.5749180987477303,
"learning_rate": 5e-07,
"loss": -0.0467,
"reward": -0.35878527723252773,
"reward_after_mean": -0.35878527723252773,
"reward_after_std": 0.4030665699392557,
"reward_before_mean": -0.09556051343679428,
"reward_before_std": 0.3429161449894309,
"reward_change_max": 0.0,
"reward_change_mean": -0.2632247470319271,
"reward_change_min": -0.4096112735569477,
"reward_change_std": 0.1540710162371397,
"reward_std": 0.40306657925248146,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.19972719065845013,
"step": 11
},
{
"clip_fraction": 0.0,
"completion_length": 2016.958351135254,
"epoch": 0.013714285714285714,
"grad_norm": 0.08992662280797958,
"kl": 3.9637088775634766e-05,
"lambda_div_used": 0.6247389540076256,
"learning_rate": 5.5e-07,
"loss": 0.0429,
"reward": -0.10162727534770966,
"reward_after_mean": -0.10162727534770966,
"reward_after_std": 0.594907833263278,
"reward_before_mean": 0.19102710485458374,
"reward_before_std": 0.5805315412580967,
"reward_change_max": 0.0,
"reward_change_mean": -0.292654387652874,
"reward_change_min": -0.5117304362356663,
"reward_change_std": 0.20085694547742605,
"reward_std": 0.5949078388512135,
"rewards/accuracy_reward": 0.22916666977107525,
"rewards/cosine_scaled_reward": -0.03813957702368498,
"step": 12
},
{
"clip_fraction": 0.0,
"completion_length": 2905.0833587646484,
"epoch": 0.014857142857142857,
"grad_norm": 0.0641385093331337,
"kl": 4.194676876068115e-05,
"lambda_div_used": 0.5766140297055244,
"learning_rate": 6e-07,
"loss": 0.0437,
"reward": -0.3046809285879135,
"reward_after_mean": -0.3046809285879135,
"reward_after_std": 0.399059085175395,
"reward_before_mean": -0.018187658861279488,
"reward_before_std": 0.35123275220394135,
"reward_change_max": 0.0,
"reward_change_mean": -0.28649328649044037,
"reward_change_min": -0.4807186797261238,
"reward_change_std": 0.17643271200358868,
"reward_std": 0.399059085175395,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/cosine_scaled_reward": -0.1431876514106989,
"step": 13
},
{
"clip_fraction": 0.0,
"completion_length": 2520.312530517578,
"epoch": 0.016,
"grad_norm": 0.07475250959396362,
"kl": 3.432855010032654e-05,
"lambda_div_used": 0.5991310104727745,
"learning_rate": 6.5e-07,
"loss": -0.0332,
"reward": -0.22936711832880974,
"reward_after_mean": -0.22936711832880974,
"reward_after_std": 0.5019008349627256,
"reward_before_mean": 0.04907496925443411,
"reward_before_std": 0.4585955021902919,
"reward_change_max": 0.0,
"reward_change_mean": -0.27844210527837276,
"reward_change_min": -0.4685846194624901,
"reward_change_std": 0.17394172679632902,
"reward_std": 0.5019008629024029,
"rewards/accuracy_reward": 0.14583333767950535,
"rewards/cosine_scaled_reward": -0.09675835724920034,
"step": 14
},
{
"clip_fraction": 0.0,
"completion_length": 2717.0416984558105,
"epoch": 0.017142857142857144,
"grad_norm": 0.08986662328243256,
"kl": 4.3585896492004395e-05,
"lambda_div_used": 0.5765259638428688,
"learning_rate": 7e-07,
"loss": 0.0355,
"reward": -0.16787780448794365,
"reward_after_mean": -0.16787780448794365,
"reward_after_std": 0.4601633083075285,
"reward_before_mean": 0.22437161579728127,
"reward_before_std": 0.3496173685416579,
"reward_change_max": 0.0,
"reward_change_mean": -0.3922494389116764,
"reward_change_min": -0.5602193549275398,
"reward_change_std": 0.2197399353608489,
"reward_std": 0.46016331762075424,
"rewards/accuracy_reward": 0.20833333395421505,
"rewards/cosine_scaled_reward": 0.016038289293646812,
"step": 15
},
{
"clip_fraction": 0.0,
"completion_length": 3395.1875,
"epoch": 0.018285714285714287,
"grad_norm": 0.05402011796832085,
"kl": 4.659593105316162e-05,
"lambda_div_used": 0.5638558194041252,
"learning_rate": 7.5e-07,
"loss": 0.032,
"reward": -0.432980858720839,
"reward_after_mean": -0.432980858720839,
"reward_after_std": 0.3415633924305439,
"reward_before_mean": -0.16444075386971235,
"reward_before_std": 0.2898183651268482,
"reward_change_max": 0.0,
"reward_change_mean": -0.26854010485112667,
"reward_change_min": -0.45487307757139206,
"reward_change_std": 0.16323526203632355,
"reward_std": 0.34156340546905994,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.20610742270946503,
"step": 16
},
{
"clip_fraction": 0.0,
"completion_length": 2164.020854949951,
"epoch": 0.019428571428571427,
"grad_norm": 0.1185457855463028,
"kl": 4.2147934436798096e-05,
"lambda_div_used": 0.5763917565345764,
"learning_rate": 8e-07,
"loss": 0.0302,
"reward": -0.2335935328155756,
"reward_after_mean": -0.2335935328155756,
"reward_after_std": 0.4716028142720461,
"reward_before_mean": 0.13111361488699913,
"reward_before_std": 0.3423871146515012,
"reward_change_max": 0.0,
"reward_change_mean": -0.36470715142786503,
"reward_change_min": -0.5119686089456081,
"reward_change_std": 0.19370271731168032,
"reward_std": 0.471602825447917,
"rewards/accuracy_reward": 0.18750000186264515,
"rewards/cosine_scaled_reward": -0.05638639978133142,
"step": 17
},
{
"clip_fraction": 0.0,
"completion_length": 3079.4583740234375,
"epoch": 0.02057142857142857,
"grad_norm": 0.05137661099433899,
"kl": 3.637373447418213e-05,
"lambda_div_used": 0.6073044687509537,
"learning_rate": 8.499999999999999e-07,
"loss": 0.0478,
"reward": -0.057250723242759705,
"reward_after_mean": -0.057250723242759705,
"reward_after_std": 0.5715843215584755,
"reward_before_mean": 0.3262214660644531,
"reward_before_std": 0.5018989769741893,
"reward_change_max": 0.0,
"reward_change_mean": -0.38347217813134193,
"reward_change_min": -0.6260374560952187,
"reward_change_std": 0.24400013033300638,
"reward_std": 0.571584340184927,
"rewards/accuracy_reward": 0.2916666679084301,
"rewards/cosine_scaled_reward": 0.034554785932414234,
"step": 18
},
{
"clip_fraction": 0.0,
"completion_length": 2889.8542289733887,
"epoch": 0.021714285714285714,
"grad_norm": 0.07495337724685669,
"kl": 4.096329212188721e-05,
"lambda_div_used": 0.6219106838107109,
"learning_rate": 9e-07,
"loss": 0.036,
"reward": 0.1660416293889284,
"reward_after_mean": 0.1660416293889284,
"reward_after_std": 0.6783627849072218,
"reward_before_mean": 0.6544073540717363,
"reward_before_std": 0.5640754774212837,
"reward_change_max": 0.0,
"reward_change_mean": -0.488365713506937,
"reward_change_min": -0.758969146758318,
"reward_change_std": 0.2900321548804641,
"reward_std": 0.6783628333359957,
"rewards/accuracy_reward": 0.45833334140479565,
"rewards/cosine_scaled_reward": 0.19607400865061209,
"step": 19
},
{
"clip_fraction": 0.0,
"completion_length": 2114.8333587646484,
"epoch": 0.022857142857142857,
"grad_norm": 0.0912039652466774,
"kl": 2.7514994144439697e-05,
"lambda_div_used": 0.6171177923679352,
"learning_rate": 9.499999999999999e-07,
"loss": -0.0338,
"reward": -0.12753646727651358,
"reward_after_mean": -0.12753646727651358,
"reward_after_std": 0.6479649767279625,
"reward_before_mean": 0.2073977841064334,
"reward_before_std": 0.5446468549780548,
"reward_change_max": 0.0,
"reward_change_mean": -0.33493425138294697,
"reward_change_min": -0.5443090125918388,
"reward_change_std": 0.20240973494946957,
"reward_std": 0.6479649972170591,
"rewards/accuracy_reward": 0.2500000037252903,
"rewards/cosine_scaled_reward": -0.0426022089086473,
"step": 20
},
{
"clip_fraction": 0.0,
"completion_length": 2578.3125,
"epoch": 0.024,
"grad_norm": 0.1021808609366417,
"kl": 4.0903687477111816e-05,
"lambda_div_used": 0.5696183741092682,
"learning_rate": 1e-06,
"loss": 0.0243,
"reward": -0.16086186096072197,
"reward_after_mean": -0.16086186096072197,
"reward_after_std": 0.4245347697287798,
"reward_before_mean": 0.23232585442019626,
"reward_before_std": 0.3138282438740134,
"reward_change_max": 0.0,
"reward_change_mean": -0.3931876849383116,
"reward_change_min": -0.5524330623447895,
"reward_change_std": 0.2125744568184018,
"reward_std": 0.424534784629941,
"rewards/accuracy_reward": 0.25000000558793545,
"rewards/cosine_scaled_reward": -0.01767415925860405,
"step": 21
},
{
"clip_fraction": 0.0,
"completion_length": 1644.437572479248,
"epoch": 0.025142857142857144,
"grad_norm": 0.11418648809194565,
"kl": 2.1685846149921417e-05,
"lambda_div_used": 0.5976943448185921,
"learning_rate": 9.99931462820376e-07,
"loss": -0.0304,
"reward": -0.10828415304422379,
"reward_after_mean": -0.10828415304422379,
"reward_after_std": 0.5476059578359127,
"reward_before_mean": 0.2575272931717336,
"reward_before_std": 0.4531834872905165,
"reward_change_max": 0.0,
"reward_change_mean": -0.36581144109368324,
"reward_change_min": -0.5498910807073116,
"reward_change_std": 0.21405170671641827,
"reward_std": 0.5476059839129448,
"rewards/accuracy_reward": 0.29166667349636555,
"rewards/cosine_scaled_reward": -0.03413938358426094,
"step": 22
},
{
"clip_fraction": 0.0,
"completion_length": 2188.687530517578,
"epoch": 0.026285714285714287,
"grad_norm": 0.0898994579911232,
"kl": 3.7088990211486816e-05,
"lambda_div_used": 0.6402537003159523,
"learning_rate": 9.997258721585931e-07,
"loss": -0.0037,
"reward": -0.13877022732049227,
"reward_after_mean": -0.13877022732049227,
"reward_after_std": 0.6969181355088949,
"reward_before_mean": 0.11387635703431442,
"reward_before_std": 0.6502415342256427,
"reward_change_max": 0.0,
"reward_change_mean": -0.2526465691626072,
"reward_change_min": -0.45175985619425774,
"reward_change_std": 0.1627071350812912,
"reward_std": 0.6969181522727013,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/cosine_scaled_reward": -0.05279031861573458,
"step": 23
},
{
"clip_fraction": 0.0,
"completion_length": 2394.3958740234375,
"epoch": 0.027428571428571427,
"grad_norm": 0.09361977875232697,
"kl": 2.0720064640045166e-05,
"lambda_div_used": 0.6309552267193794,
"learning_rate": 9.993832906395582e-07,
"loss": 0.0666,
"reward": 0.0798279655573424,
"reward_after_mean": 0.0798279655573424,
"reward_after_std": 0.6638341955840588,
"reward_before_mean": 0.4744788520038128,
"reward_before_std": 0.6099728401750326,
"reward_change_max": 0.0,
"reward_change_mean": -0.3946509025990963,
"reward_change_min": -0.6427228525280952,
"reward_change_std": 0.25018193013966084,
"reward_std": 0.6638342067599297,
"rewards/accuracy_reward": 0.35416667349636555,
"rewards/cosine_scaled_reward": 0.12031219294294715,
"step": 24
},
{
"clip_fraction": 0.0,
"completion_length": 2358.583366394043,
"epoch": 0.02857142857142857,
"grad_norm": 0.07725252956151962,
"kl": 4.092603921890259e-05,
"lambda_div_used": 0.6378427669405937,
"learning_rate": 9.989038226169207e-07,
"loss": -0.0557,
"reward": -0.15363326482474804,
"reward_after_mean": -0.15363326482474804,
"reward_after_std": 0.6477887704968452,
"reward_before_mean": 0.10259643197059631,
"reward_before_std": 0.6456613540649414,
"reward_change_max": 0.0,
"reward_change_mean": -0.25622970052063465,
"reward_change_min": -0.5558900497853756,
"reward_change_std": 0.19845533184707165,
"reward_std": 0.6477888077497482,
"rewards/accuracy_reward": 0.1875000037252903,
"rewards/cosine_scaled_reward": -0.08490357082337141,
"step": 25
},
{
"clip_fraction": 0.0,
"completion_length": 2850.937530517578,
"epoch": 0.029714285714285714,
"grad_norm": 0.06806578487157822,
"kl": 3.5643577575683594e-05,
"lambda_div_used": 0.5781113430857658,
"learning_rate": 9.982876141412855e-07,
"loss": 0.005,
"reward": -0.25517464708536863,
"reward_after_mean": -0.25517464708536863,
"reward_after_std": 0.47686139307916164,
"reward_before_mean": 0.09097394905984402,
"reward_before_std": 0.3564971052110195,
"reward_change_max": 0.0,
"reward_change_mean": -0.3461485952138901,
"reward_change_min": -0.4805891402065754,
"reward_change_std": 0.18107541371136904,
"reward_std": 0.476861409842968,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.07569271977990866,
"step": 26
},
{
"clip_fraction": 0.0,
"completion_length": 2795.3750534057617,
"epoch": 0.030857142857142857,
"grad_norm": 0.0670829713344574,
"kl": 5.0067901611328125e-05,
"lambda_div_used": 0.5909813195466995,
"learning_rate": 9.975348529157229e-07,
"loss": 0.038,
"reward": -0.30352935567498207,
"reward_after_mean": -0.30352935567498207,
"reward_after_std": 0.4782011900097132,
"reward_before_mean": -0.04106062464416027,
"reward_before_std": 0.42168071679770947,
"reward_change_max": 0.0,
"reward_change_mean": -0.2624687273055315,
"reward_change_min": -0.441840048879385,
"reward_change_std": 0.16586182732135057,
"reward_std": 0.47820119373500347,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/cosine_scaled_reward": -0.16606062231585383,
"step": 27
},
{
"clip_fraction": 0.0,
"completion_length": 2662.562530517578,
"epoch": 0.032,
"grad_norm": 0.07541470974683762,
"kl": 4.6879053115844727e-05,
"lambda_div_used": 0.6455363035202026,
"learning_rate": 9.96645768238595e-07,
"loss": -0.0548,
"reward": -0.01765006221830845,
"reward_after_mean": -0.01765006221830845,
"reward_after_std": 0.7017391249537468,
"reward_before_mean": 0.26933106034994125,
"reward_before_std": 0.6768321208655834,
"reward_change_max": 0.0,
"reward_change_mean": -0.2869811188429594,
"reward_change_min": -0.5125514604151249,
"reward_change_std": 0.19719962775707245,
"reward_std": 0.7017391547560692,
"rewards/accuracy_reward": 0.29166667349636555,
"rewards/cosine_scaled_reward": -0.022335614077746868,
"step": 28
},
{
"clip_fraction": 0.0,
"completion_length": 2988.1041717529297,
"epoch": 0.03314285714285714,
"grad_norm": 0.07968341559171677,
"kl": 4.7832727432250977e-05,
"lambda_div_used": 0.5851128473877907,
"learning_rate": 9.956206309337066e-07,
"loss": -0.1054,
"reward": -0.38509707152843475,
"reward_after_mean": -0.38509707152843475,
"reward_after_std": 0.4501145612448454,
"reward_before_mean": -0.14269733056426048,
"reward_before_std": 0.3910446595400572,
"reward_change_max": 0.0,
"reward_change_mean": -0.24239975214004517,
"reward_change_min": -0.4103321433067322,
"reward_change_std": 0.1474100910127163,
"reward_std": 0.4501145798712969,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.20519732707180083,
"step": 29
},
{
"clip_fraction": 0.0,
"completion_length": 2777.1042404174805,
"epoch": 0.03428571428571429,
"grad_norm": 0.0785035490989685,
"kl": 4.0784478187561035e-05,
"lambda_div_used": 0.616574227809906,
"learning_rate": 9.944597532678119e-07,
"loss": 0.0306,
"reward": 0.014547215774655342,
"reward_after_mean": 0.014547215774655342,
"reward_after_std": 0.6317574586719275,
"reward_before_mean": 0.40835407795384526,
"reward_before_std": 0.5376694360747933,
"reward_change_max": 0.0,
"reward_change_mean": -0.3938068598508835,
"reward_change_min": -0.5825920477509499,
"reward_change_std": 0.22909015510231256,
"reward_std": 0.631757466122508,
"rewards/accuracy_reward": 0.3125000037252903,
"rewards/cosine_scaled_reward": 0.09585406119003892,
"step": 30
},
{
"clip_fraction": 0.0,
"completion_length": 3100.9166870117188,
"epoch": 0.03542857142857143,
"grad_norm": 0.05876823514699936,
"kl": 5.622208118438721e-05,
"lambda_div_used": 0.5746422484517097,
"learning_rate": 9.931634888554935e-07,
"loss": -0.0301,
"reward": -0.373639321886003,
"reward_after_mean": -0.373639321886003,
"reward_after_std": 0.40832165256142616,
"reward_before_mean": -0.11681870371103287,
"reward_before_std": 0.3402246618643403,
"reward_change_max": 0.0,
"reward_change_mean": -0.2568206209689379,
"reward_change_min": -0.4000260457396507,
"reward_change_std": 0.149553001858294,
"reward_std": 0.4083216693252325,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/cosine_scaled_reward": -0.20015203207731247,
"step": 31
},
{
"clip_fraction": 0.0,
"completion_length": 2879.625030517578,
"epoch": 0.036571428571428574,
"grad_norm": 0.060988157987594604,
"kl": 4.863739013671875e-05,
"lambda_div_used": 0.583471342921257,
"learning_rate": 9.917322325514487e-07,
"loss": 0.0143,
"reward": -0.09911171346902847,
"reward_after_mean": -0.09911171346902847,
"reward_after_std": 0.46612042374908924,
"reward_before_mean": 0.2912287414073944,
"reward_before_std": 0.3850706424564123,
"reward_change_max": 0.0,
"reward_change_mean": -0.39034045673906803,
"reward_change_min": -0.5949138030409813,
"reward_change_std": 0.23070038296282291,
"reward_std": 0.46612043119966984,
"rewards/accuracy_reward": 0.2916666716337204,
"rewards/cosine_scaled_reward": -0.0004379265010356903,
"step": 32
},
{
"clip_fraction": 0.0,
"completion_length": 3136.750030517578,
"epoch": 0.037714285714285714,
"grad_norm": 0.07551854848861694,
"kl": 4.787743091583252e-05,
"lambda_div_used": 0.6242911070585251,
"learning_rate": 9.901664203302124e-07,
"loss": -0.0446,
"reward": -0.06627794913947582,
"reward_after_mean": -0.06627794913947582,
"reward_after_std": 0.5915219262242317,
"reward_before_mean": 0.24627447500824928,
"reward_before_std": 0.5734489392489195,
"reward_change_max": 0.0,
"reward_change_mean": -0.3125524502247572,
"reward_change_min": -0.5326853170990944,
"reward_change_std": 0.2051102453842759,
"reward_std": 0.5915219560265541,
"rewards/accuracy_reward": 0.25000000931322575,
"rewards/cosine_scaled_reward": -0.0037255147472023964,
"step": 33
},
{
"clip_fraction": 0.0,
"completion_length": 2379.541702270508,
"epoch": 0.038857142857142854,
"grad_norm": 0.07498405873775482,
"kl": 4.60892915725708e-05,
"lambda_div_used": 0.6475758850574493,
"learning_rate": 9.88466529153356e-07,
"loss": 0.0814,
"reward": 0.054736172780394554,
"reward_after_mean": 0.054736172780394554,
"reward_after_std": 0.7529329154640436,
"reward_before_mean": 0.4133173357695341,
"reward_before_std": 0.6894109938293695,
"reward_change_max": 0.0,
"reward_change_mean": -0.3585811499506235,
"reward_change_min": -0.6234904117882252,
"reward_change_std": 0.23517589084804058,
"reward_std": 0.7529329396784306,
"rewards/accuracy_reward": 0.3333333358168602,
"rewards/cosine_scaled_reward": 0.07998399529606104,
"step": 34
},
{
"clip_fraction": 0.0,
"completion_length": 3113.1875610351562,
"epoch": 0.04,
"grad_norm": 0.1044503003358841,
"kl": 5.8710575103759766e-05,
"lambda_div_used": 0.5754920393228531,
"learning_rate": 9.866330768241983e-07,
"loss": -0.0058,
"reward": -0.36803475581109524,
"reward_after_mean": -0.36803475581109524,
"reward_after_std": 0.4120363052934408,
"reward_before_mean": -0.1010703444480896,
"reward_before_std": 0.3434353759512305,
"reward_change_max": 0.0,
"reward_change_mean": -0.2669644057750702,
"reward_change_min": -0.42853060737252235,
"reward_change_std": 0.1540099997073412,
"reward_std": 0.41203631460666656,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.16357035003602505,
"step": 35
},
{
"clip_fraction": 0.0,
"completion_length": 3350.1666870117188,
"epoch": 0.04114285714285714,
"grad_norm": 0.07969332486391068,
"kl": 6.110966205596924e-05,
"lambda_div_used": 0.5744208693504333,
"learning_rate": 9.846666218300807e-07,
"loss": -0.0514,
"reward": -0.4380789175629616,
"reward_after_mean": -0.4380789175629616,
"reward_after_std": 0.41310197673738003,
"reward_before_mean": -0.20339234871789813,
"reward_before_std": 0.337322598323226,
"reward_change_max": 0.0,
"reward_change_mean": -0.23468656465411186,
"reward_change_min": -0.3448420464992523,
"reward_change_std": 0.12617942783981562,
"reward_std": 0.4131019860506058,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.24505900964140892,
"step": 36
},
{
"clip_fraction": 0.0,
"completion_length": 3394.3958435058594,
"epoch": 0.04228571428571429,
"grad_norm": 0.05401439592242241,
"kl": 4.9054622650146484e-05,
"lambda_div_used": 0.5599935948848724,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0086,
"reward": -0.3875775597989559,
"reward_after_mean": -0.3875775597989559,
"reward_after_std": 0.32731432281434536,
"reward_before_mean": -0.10551332868635654,
"reward_before_std": 0.2694389373064041,
"reward_change_max": 0.0,
"reward_change_mean": -0.28206423483788967,
"reward_change_min": -0.4370834305882454,
"reward_change_std": 0.16284553799778223,
"reward_std": 0.3273143321275711,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.1888466626405716,
"step": 37
},
{
"clip_fraction": 0.0,
"completion_length": 3311.0833435058594,
"epoch": 0.04342857142857143,
"grad_norm": 0.05246276035904884,
"kl": 4.945695400238037e-05,
"lambda_div_used": 0.5770035535097122,
"learning_rate": 9.80337140183366e-07,
"loss": -0.0117,
"reward": -0.3118477761745453,
"reward_after_mean": -0.3118477761745453,
"reward_after_std": 0.38849758356809616,
"reward_before_mean": -0.021045896457508206,
"reward_before_std": 0.3480216721072793,
"reward_change_max": 0.0,
"reward_change_mean": -0.2908018734306097,
"reward_change_min": -0.44894198328256607,
"reward_change_std": 0.17141664400696754,
"reward_std": 0.3884976040571928,
"rewards/accuracy_reward": 0.12500000558793545,
"rewards/cosine_scaled_reward": -0.14604590460658073,
"step": 38
},
{
"clip_fraction": 0.0,
"completion_length": 2919.604179382324,
"epoch": 0.044571428571428574,
"grad_norm": 0.08766351640224457,
"kl": 3.2648444175720215e-05,
"lambda_div_used": 0.5717808604240417,
"learning_rate": 9.779754323328192e-07,
"loss": 0.032,
"reward": -0.14589639008045197,
"reward_after_mean": -0.14589639008045197,
"reward_after_std": 0.4250806160271168,
"reward_before_mean": 0.26390516571700573,
"reward_before_std": 0.3228347860276699,
"reward_change_max": 0.0,
"reward_change_mean": -0.40980157628655434,
"reward_change_min": -0.5692479386925697,
"reward_change_std": 0.22337355464696884,
"reward_std": 0.42508063092827797,
"rewards/accuracy_reward": 0.25000000558793545,
"rewards/cosine_scaled_reward": 0.013905153144150972,
"step": 39
},
{
"clip_fraction": 0.0,
"completion_length": 2468.7916870117188,
"epoch": 0.045714285714285714,
"grad_norm": 0.07357986271381378,
"kl": 4.18052077293396e-05,
"lambda_div_used": 0.5706272348761559,
"learning_rate": 9.754833590196926e-07,
"loss": 0.0148,
"reward": -0.2578982161357999,
"reward_after_mean": -0.2578982161357999,
"reward_after_std": 0.4464099854230881,
"reward_before_mean": 0.1052470114082098,
"reward_before_std": 0.31700289947912097,
"reward_change_max": 0.0,
"reward_change_mean": -0.3631452303379774,
"reward_change_min": -0.5201860442757607,
"reward_change_std": 0.1936064399778843,
"reward_std": 0.4464099947363138,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.06141966814175248,
"step": 40
},
{
"clip_fraction": 0.0,
"completion_length": 3203.312515258789,
"epoch": 0.046857142857142854,
"grad_norm": 0.05290238931775093,
"kl": 5.0440430641174316e-05,
"lambda_div_used": 0.6480180770158768,
"learning_rate": 9.728616793536587e-07,
"loss": -0.0199,
"reward": -0.04493038635700941,
"reward_after_mean": -0.04493038635700941,
"reward_after_std": 0.7092454191297293,
"reward_before_mean": 0.23497815802693367,
"reward_before_std": 0.693053056485951,
"reward_change_max": 0.0,
"reward_change_mean": -0.2799085471779108,
"reward_change_min": -0.5064845345914364,
"reward_change_std": 0.19558908697217703,
"reward_std": 0.7092454340308905,
"rewards/accuracy_reward": 0.2291666716337204,
"rewards/cosine_scaled_reward": 0.005811510724015534,
"step": 41
},
{
"clip_fraction": 0.0,
"completion_length": 2890.9791927337646,
"epoch": 0.048,
"grad_norm": 0.09494666010141373,
"kl": 5.85019588470459e-05,
"lambda_div_used": 0.5329261645674706,
"learning_rate": 9.701111919237408e-07,
"loss": 0.0335,
"reward": -0.5650591850280762,
"reward_after_mean": -0.5650591850280762,
"reward_after_std": 0.225538931787014,
"reward_before_mean": -0.32472414150834084,
"reward_before_std": 0.14451794046908617,
"reward_change_max": 0.0,
"reward_change_mean": -0.24033503979444504,
"reward_change_min": -0.358134388923645,
"reward_change_std": 0.12634030357003212,
"reward_std": 0.2255389392375946,
"rewards/accuracy_reward": 0.0,
"rewards/cosine_scaled_reward": -0.32472414895892143,
"step": 42
},
{
"clip_fraction": 0.0,
"completion_length": 3027.9583587646484,
"epoch": 0.04914285714285714,
"grad_norm": 0.05899634212255478,
"kl": 4.252791404724121e-05,
"lambda_div_used": 0.595756284892559,
"learning_rate": 9.672327345550543e-07,
"loss": -0.017,
"reward": -0.31989429891109467,
"reward_after_mean": -0.31989429891109467,
"reward_after_std": 0.5033334400504827,
"reward_before_mean": -0.07388713955879211,
"reward_before_std": 0.43908379040658474,
"reward_change_max": 0.0,
"reward_change_mean": -0.24600715190172195,
"reward_change_min": -0.4015818126499653,
"reward_change_std": 0.143897395581007,
"reward_std": 0.5033334512263536,
"rewards/accuracy_reward": 0.1041666679084301,
"rewards/cosine_scaled_reward": -0.1780538223683834,
"step": 43
},
{
"clip_fraction": 0.0,
"completion_length": 2372.5833587646484,
"epoch": 0.05028571428571429,
"grad_norm": 0.08952542394399643,
"kl": 4.1332095861434937e-05,
"lambda_div_used": 0.5768283307552338,
"learning_rate": 9.64227184053598e-07,
"loss": 0.0238,
"reward": -0.09475147165358067,
"reward_after_mean": -0.09475147165358067,
"reward_after_std": 0.4600023180246353,
"reward_before_mean": 0.33489263616502285,
"reward_before_std": 0.35124383692163974,
"reward_change_max": 0.0,
"reward_change_mean": -0.4296441040933132,
"reward_change_min": -0.6358921378850937,
"reward_change_std": 0.24556122440844774,
"reward_std": 0.4600023254752159,
"rewards/accuracy_reward": 0.2708333395421505,
"rewards/cosine_scaled_reward": 0.06405930174514651,
"step": 44
},
{
"clip_fraction": 0.0,
"completion_length": 3178.125,
"epoch": 0.05142857142857143,
"grad_norm": 0.05793697386980057,
"kl": 4.953145980834961e-05,
"lambda_div_used": 0.6092750132083893,
"learning_rate": 9.610954559391704e-07,
"loss": 0.0375,
"reward": -0.24964951165020466,
"reward_after_mean": -0.24964951165020466,
"reward_after_std": 0.575963044539094,
"reward_before_mean": 0.010031561367213726,
"reward_before_std": 0.5040898034349084,
"reward_change_max": 0.0,
"reward_change_mean": -0.25968106649816036,
"reward_change_min": -0.41585399955511093,
"reward_change_std": 0.15605208091437817,
"reward_std": 0.5759630724787712,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/cosine_scaled_reward": -0.114968441426754,
"step": 45
},
{
"clip_fraction": 0.0,
"completion_length": 3160.895866394043,
"epoch": 0.052571428571428575,
"grad_norm": 0.06750793755054474,
"kl": 4.884600639343262e-05,
"lambda_div_used": 0.5543409436941147,
"learning_rate": 9.578385041664925e-07,
"loss": 0.0442,
"reward": -0.47163831628859043,
"reward_after_mean": -0.47163831628859043,
"reward_after_std": 0.32501144520938396,
"reward_before_mean": -0.21356616588309407,
"reward_before_std": 0.24316862598061562,
"reward_change_max": 0.0,
"reward_change_mean": -0.25807216577231884,
"reward_change_min": -0.3883417770266533,
"reward_change_std": 0.13798328768461943,
"reward_std": 0.3250114619731903,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.23439949192106724,
"step": 46
},
{
"clip_fraction": 0.0,
"completion_length": 2624.270881652832,
"epoch": 0.053714285714285714,
"grad_norm": 0.10712098330259323,
"kl": 5.3569674491882324e-05,
"lambda_div_used": 0.6219898834824562,
"learning_rate": 9.54457320834625e-07,
"loss": 0.0102,
"reward": -0.10781855694949627,
"reward_after_mean": -0.10781855694949627,
"reward_after_std": 0.5886904541403055,
"reward_before_mean": 0.19626505579799414,
"reward_before_std": 0.5641138087958097,
"reward_change_max": 0.0,
"reward_change_mean": -0.3040836229920387,
"reward_change_min": -0.5314439944922924,
"reward_change_std": 0.2001748401671648,
"reward_std": 0.5886904690414667,
"rewards/accuracy_reward": 0.22916667349636555,
"rewards/cosine_scaled_reward": -0.03290162514895201,
"step": 47
},
{
"clip_fraction": 0.0,
"completion_length": 2589.458396911621,
"epoch": 0.054857142857142854,
"grad_norm": 0.07933427393436432,
"kl": 4.4988468289375305e-05,
"lambda_div_used": 0.6262567341327667,
"learning_rate": 9.509529358847654e-07,
"loss": -0.0358,
"reward": -0.13860276085324585,
"reward_after_mean": -0.13860276085324585,
"reward_after_std": 0.5960155855864286,
"reward_before_mean": 0.13786590658128262,
"reward_before_std": 0.5872014760971069,
"reward_change_max": 0.0,
"reward_change_mean": -0.2764686793088913,
"reward_change_min": -0.5099791921675205,
"reward_change_std": 0.19576375279575586,
"reward_std": 0.5960155855864286,
"rewards/accuracy_reward": 0.20833333767950535,
"rewards/cosine_scaled_reward": -0.07046743389219046,
"step": 48
},
{
"clip_fraction": 0.0,
"completion_length": 1985.2291946411133,
"epoch": 0.056,
"grad_norm": 0.12161526083946228,
"kl": 4.811584949493408e-05,
"lambda_div_used": 0.5859075263142586,
"learning_rate": 9.473264167865171e-07,
"loss": 0.0065,
"reward": -0.21219712868332863,
"reward_after_mean": -0.21219712868332863,
"reward_after_std": 0.439599821344018,
"reward_before_mean": 0.10536342021077871,
"reward_before_std": 0.39430073963012546,
"reward_change_max": 0.0,
"reward_change_mean": -0.3175605833530426,
"reward_change_min": -0.4657192714512348,
"reward_change_std": 0.19229275174438953,
"reward_std": 0.4395998399704695,
"rewards/accuracy_reward": 0.20833334140479565,
"rewards/cosine_scaled_reward": -0.10296990955248475,
"step": 49
},
{
"clip_fraction": 0.0,
"completion_length": 2880.250011444092,
"epoch": 0.05714285714285714,
"grad_norm": 0.07723147422075272,
"kl": 3.2588839530944824e-05,
"lambda_div_used": 0.5741998106241226,
"learning_rate": 9.43578868212728e-07,
"loss": 0.0182,
"reward": -0.0913238637149334,
"reward_after_mean": -0.0913238637149334,
"reward_after_std": 0.42474013939499855,
"reward_before_mean": 0.3248649761080742,
"reward_before_std": 0.3397149038501084,
"reward_change_max": 0.0,
"reward_change_mean": -0.41618884168565273,
"reward_change_min": -0.6034708395600319,
"reward_change_std": 0.23950859624892473,
"reward_std": 0.4247401561588049,
"rewards/accuracy_reward": 0.3125000074505806,
"rewards/cosine_scaled_reward": 0.012364983558654785,
"step": 50
},
{
"clip_fraction": 0.0,
"completion_length": 2433.020854949951,
"epoch": 0.05828571428571429,
"grad_norm": 0.09581972658634186,
"kl": 5.4717063903808594e-05,
"lambda_div_used": 0.5894461125135422,
"learning_rate": 9.397114317029974e-07,
"loss": -0.0308,
"reward": -0.28266510320827365,
"reward_after_mean": -0.28266510320827365,
"reward_after_std": 0.44258156418800354,
"reward_before_mean": 0.012647990137338638,
"reward_before_std": 0.40818152111023664,
"reward_change_max": 0.0,
"reward_change_mean": -0.29531308077275753,
"reward_change_min": -0.4761137217283249,
"reward_change_std": 0.1856073010712862,
"reward_std": 0.4425815735012293,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/cosine_scaled_reward": -0.15401867777109146,
"step": 51
},
{
"clip_fraction": 0.0,
"completion_length": 2707.250030517578,
"epoch": 0.05942857142857143,
"grad_norm": 0.14702580869197845,
"kl": 4.030205309391022e-05,
"lambda_div_used": 0.6197419166564941,
"learning_rate": 9.357252853159505e-07,
"loss": 0.0298,
"reward": 0.17821598052978516,
"reward_after_mean": 0.17821598052978516,
"reward_after_std": 0.6063419748097658,
"reward_before_mean": 0.629100788384676,
"reward_before_std": 0.5578461596742272,
"reward_change_max": 0.0,
"reward_change_mean": -0.4508847985416651,
"reward_change_min": -0.6935725994408131,
"reward_change_std": 0.2830996550619602,
"reward_std": 0.6063419822603464,
"rewards/accuracy_reward": 0.4583333432674408,
"rewards/cosine_scaled_reward": 0.17076744884252548,
"step": 52
},
{
"clip_fraction": 0.0,
"completion_length": 2799.8125610351562,
"epoch": 0.060571428571428575,
"grad_norm": 0.07566332817077637,
"kl": 5.0321221351623535e-05,
"lambda_div_used": 0.6389462202787399,
"learning_rate": 9.316216432703916e-07,
"loss": 0.0088,
"reward": -0.0043678306974470615,
"reward_after_mean": -0.0043678306974470615,
"reward_after_std": 0.6600480955094099,
"reward_before_mean": 0.31353550031781197,
"reward_before_std": 0.6461421558633447,
"reward_change_max": 0.0,
"reward_change_mean": -0.3179033286869526,
"reward_change_min": -0.5801984183490276,
"reward_change_std": 0.22101250104606152,
"reward_std": 0.6600481104105711,
"rewards/accuracy_reward": 0.29166667722165585,
"rewards/cosine_scaled_reward": 0.0218688128516078,
"step": 53
},
{
"clip_fraction": 0.0,
"completion_length": 2297.9583778381348,
"epoch": 0.061714285714285715,
"grad_norm": 0.09625467658042908,
"kl": 4.2378902435302734e-05,
"lambda_div_used": 0.6194662973284721,
"learning_rate": 9.274017555754407e-07,
"loss": 0.0224,
"reward": 0.0802488662302494,
"reward_after_mean": 0.0802488662302494,
"reward_after_std": 0.6323203574866056,
"reward_before_mean": 0.4949483387172222,
"reward_before_std": 0.5504263024777174,
"reward_change_max": 0.0,
"reward_change_mean": -0.41469944082200527,
"reward_change_min": -0.6165112145245075,
"reward_change_std": 0.2416615542024374,
"reward_std": 0.632320374250412,
"rewards/accuracy_reward": 0.3541666753590107,
"rewards/cosine_scaled_reward": 0.1407816605642438,
"step": 54
},
{
"clip_fraction": 0.0,
"completion_length": 2896.416702270508,
"epoch": 0.06285714285714286,
"grad_norm": 0.05907125398516655,
"kl": 3.5256147384643555e-05,
"lambda_div_used": 0.602072462439537,
"learning_rate": 9.230669076497687e-07,
"loss": 0.0467,
"reward": -0.13429339788854122,
"reward_after_mean": -0.13429339788854122,
"reward_after_std": 0.57946902140975,
"reward_before_mean": 0.22700263070873916,
"reward_before_std": 0.4767341245897114,
"reward_change_max": 0.0,
"reward_change_mean": -0.36129603534936905,
"reward_change_min": -0.5782174952328205,
"reward_change_std": 0.21661545429378748,
"reward_std": 0.5794690307229757,
"rewards/accuracy_reward": 0.25000000186264515,
"rewards/cosine_scaled_reward": -0.02299738209694624,
"step": 55
},
{
"clip_fraction": 0.0,
"completion_length": 3075.187545776367,
"epoch": 0.064,
"grad_norm": 0.06310205906629562,
"kl": 4.1544437408447266e-05,
"lambda_div_used": 0.5549901723861694,
"learning_rate": 9.186184199300463e-07,
"loss": -0.0108,
"reward": -0.4740895018912852,
"reward_after_mean": -0.4740895018912852,
"reward_after_std": 0.32212498411536217,
"reward_before_mean": -0.2245362438261509,
"reward_before_std": 0.24502124171704054,
"reward_change_max": 0.0,
"reward_change_mean": -0.24955323711037636,
"reward_change_min": -0.3650141842663288,
"reward_change_std": 0.13405024446547031,
"reward_std": 0.3221249897032976,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.24536957405507565,
"step": 56
},
{
"clip_fraction": 0.0,
"completion_length": 3013.8541870117188,
"epoch": 0.06514285714285714,
"grad_norm": 0.052018001675605774,
"kl": 3.096461296081543e-05,
"lambda_div_used": 0.5674270242452621,
"learning_rate": 9.140576474687263e-07,
"loss": 0.0192,
"reward": -0.45193217881023884,
"reward_after_mean": -0.45193217881023884,
"reward_after_std": 0.37017360515892506,
"reward_before_mean": -0.21255622059106827,
"reward_before_std": 0.3090948835015297,
"reward_change_max": 0.0,
"reward_change_mean": -0.23937593773007393,
"reward_change_min": -0.42212119325995445,
"reward_change_std": 0.14615233521908522,
"reward_std": 0.3701736144721508,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.2542228833772242,
"step": 57
},
{
"clip_fraction": 0.0,
"completion_length": 2257.8750534057617,
"epoch": 0.06628571428571428,
"grad_norm": 0.09038142114877701,
"kl": 3.3486634492874146e-05,
"lambda_div_used": 0.6153116375207901,
"learning_rate": 9.093859795212817e-07,
"loss": 0.049,
"reward": -0.07369695231318474,
"reward_after_mean": -0.07369695231318474,
"reward_after_std": 0.5549236796796322,
"reward_before_mean": 0.2532845102250576,
"reward_before_std": 0.5286985114216805,
"reward_change_max": 0.0,
"reward_change_mean": -0.3269814867526293,
"reward_change_min": -0.537376407533884,
"reward_change_std": 0.20782853197306395,
"reward_std": 0.5549236983060837,
"rewards/accuracy_reward": 0.27083334513008595,
"rewards/cosine_scaled_reward": -0.017548808827996254,
"step": 58
},
{
"clip_fraction": 0.0,
"completion_length": 2807.6250381469727,
"epoch": 0.06742857142857143,
"grad_norm": 0.06863339245319366,
"kl": 3.577768802642822e-05,
"lambda_div_used": 0.6041718497872353,
"learning_rate": 9.046048391230247e-07,
"loss": -0.005,
"reward": -0.25026911310851574,
"reward_after_mean": -0.25026911310851574,
"reward_after_std": 0.5205795764923096,
"reward_before_mean": 0.026796480640769005,
"reward_before_std": 0.4837551396340132,
"reward_change_max": 0.0,
"reward_change_mean": -0.27706561237573624,
"reward_change_min": -0.5239802338182926,
"reward_change_std": 0.18853904772549868,
"reward_std": 0.5205795876681805,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.11903684702701867,
"step": 59
},
{
"clip_fraction": 0.0,
"completion_length": 2797.5833740234375,
"epoch": 0.06857142857142857,
"grad_norm": 0.07598927617073059,
"kl": 3.738701343536377e-05,
"lambda_div_used": 0.6269034072756767,
"learning_rate": 8.997156826556369e-07,
"loss": 0.0313,
"reward": -0.2369612492620945,
"reward_after_mean": -0.2369612492620945,
"reward_after_std": 0.6222166065126657,
"reward_before_mean": 0.005125788040459156,
"reward_before_std": 0.5912308068946004,
"reward_change_max": 0.0,
"reward_change_mean": -0.24208704754710197,
"reward_change_min": -0.48471353203058243,
"reward_change_std": 0.17278119549155235,
"reward_std": 0.6222166288644075,
"rewards/accuracy_reward": 0.14583333767950535,
"rewards/cosine_scaled_reward": -0.14070754405111074,
"step": 60
},
{
"clip_fraction": 0.0,
"completion_length": 2917.812515258789,
"epoch": 0.06971428571428571,
"grad_norm": 0.06223488971590996,
"kl": 3.0603259801864624e-05,
"lambda_div_used": 0.6177623867988586,
"learning_rate": 8.9471999940354e-07,
"loss": 0.0259,
"reward": -0.1357121616601944,
"reward_after_mean": -0.1357121616601944,
"reward_after_std": 0.5694319568574429,
"reward_before_mean": 0.16569078899919987,
"reward_before_std": 0.5441586868837476,
"reward_change_max": 0.0,
"reward_change_mean": -0.3014029450714588,
"reward_change_min": -0.5209132842719555,
"reward_change_std": 0.20053375512361526,
"reward_std": 0.5694319736212492,
"rewards/accuracy_reward": 0.20833333767950535,
"rewards/cosine_scaled_reward": -0.042642548214644194,
"step": 61
},
{
"clip_fraction": 0.0,
"completion_length": 2507.8958854675293,
"epoch": 0.07085714285714285,
"grad_norm": 0.0747612789273262,
"kl": 2.6777386665344238e-05,
"lambda_div_used": 0.61732517182827,
"learning_rate": 8.896193111002475e-07,
"loss": 0.0366,
"reward": 0.010478481650352478,
"reward_after_mean": 0.010478481650352478,
"reward_after_std": 0.6334415413439274,
"reward_before_mean": 0.4018659461289644,
"reward_before_std": 0.5490978918969631,
"reward_change_max": 0.0,
"reward_change_mean": -0.39138743840157986,
"reward_change_min": -0.6315983533859253,
"reward_change_std": 0.24404066987335682,
"reward_std": 0.6334415581077337,
"rewards/accuracy_reward": 0.33333333767950535,
"rewards/cosine_scaled_reward": 0.06853258889168501,
"step": 62
},
{
"clip_fraction": 0.0,
"completion_length": 1902.5000228881836,
"epoch": 0.072,
"grad_norm": 0.07997458428144455,
"kl": 2.5197863578796387e-05,
"lambda_div_used": 0.6046535074710846,
"learning_rate": 8.844151714648274e-07,
"loss": 0.059,
"reward": 0.0217137411236763,
"reward_after_mean": 0.0217137411236763,
"reward_after_std": 0.5547807831317186,
"reward_before_mean": 0.42593196779489517,
"reward_before_std": 0.49076249497011304,
"reward_change_max": 0.0,
"reward_change_mean": -0.4042182229459286,
"reward_change_min": -0.6314646527171135,
"reward_change_std": 0.25159123074263334,
"reward_std": 0.5547807849943638,
"rewards/accuracy_reward": 0.3541666716337204,
"rewards/cosine_scaled_reward": 0.07176528126001358,
"step": 63
},
{
"clip_fraction": 0.0,
"completion_length": 2790.979217529297,
"epoch": 0.07314285714285715,
"grad_norm": 0.07697053253650665,
"kl": 4.419684410095215e-05,
"lambda_div_used": 0.6203151121735573,
"learning_rate": 8.791091657286267e-07,
"loss": 0.061,
"reward": -0.1100537832826376,
"reward_after_mean": -0.1100537832826376,
"reward_after_std": 0.5783387050032616,
"reward_before_mean": 0.19534806534647942,
"reward_before_std": 0.5548522733151913,
"reward_change_max": 0.0,
"reward_change_mean": -0.3054018337279558,
"reward_change_min": -0.5362299680709839,
"reward_change_std": 0.2037976048886776,
"reward_std": 0.5783387236297131,
"rewards/accuracy_reward": 0.20833333767950535,
"rewards/cosine_scaled_reward": -0.012985273322556168,
"step": 64
},
{
"clip_fraction": 0.0,
"completion_length": 2605.2708587646484,
"epoch": 0.07428571428571429,
"grad_norm": 0.0772913321852684,
"kl": 3.089010715484619e-05,
"lambda_div_used": 0.5563548430800438,
"learning_rate": 8.737029101523929e-07,
"loss": -0.0861,
"reward": -0.33125742711126804,
"reward_after_mean": -0.33125742711126804,
"reward_after_std": 0.3626741226762533,
"reward_before_mean": 0.005530592054128647,
"reward_before_std": 0.2566492212936282,
"reward_change_max": 0.0,
"reward_change_mean": -0.33678802102804184,
"reward_change_min": -0.5241431556642056,
"reward_change_std": 0.18918895348906517,
"reward_std": 0.36267412453889847,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.161136067006737,
"step": 65
},
{
"clip_fraction": 0.0,
"completion_length": 2088.562511444092,
"epoch": 0.07542857142857143,
"grad_norm": 0.10951534658670425,
"kl": 2.690870314836502e-05,
"lambda_div_used": 0.5969364494085312,
"learning_rate": 8.681980515339463e-07,
"loss": 0.0287,
"reward": 0.012385480105876923,
"reward_after_mean": 0.012385480105876923,
"reward_after_std": 0.5128760654479265,
"reward_before_mean": 0.44090043008327484,
"reward_before_std": 0.4437296399846673,
"reward_change_max": 0.0,
"reward_change_mean": -0.4285149369388819,
"reward_change_min": -0.660201895982027,
"reward_change_std": 0.25701938942074776,
"reward_std": 0.512876084074378,
"rewards/accuracy_reward": 0.3750000111758709,
"rewards/cosine_scaled_reward": 0.0659004095941782,
"step": 66
},
{
"clip_fraction": 0.0,
"completion_length": 3402.8958740234375,
"epoch": 0.07657142857142857,
"grad_norm": 0.047973256558179855,
"kl": 3.771483898162842e-05,
"lambda_div_used": 0.5712955147027969,
"learning_rate": 8.625962667065487e-07,
"loss": -0.0027,
"reward": -0.4502652711234987,
"reward_after_mean": -0.4502652711234987,
"reward_after_std": 0.39950996078550816,
"reward_before_mean": -0.22067994717508554,
"reward_before_std": 0.3211147477850318,
"reward_change_max": 0.0,
"reward_change_mean": -0.229585325345397,
"reward_change_min": -0.3470626436173916,
"reward_change_std": 0.12443333957344294,
"reward_std": 0.39950997941195965,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.26234661415219307,
"step": 67
},
{
"clip_fraction": 0.0,
"completion_length": 1642.9166984558105,
"epoch": 0.07771428571428571,
"grad_norm": 0.11412619799375534,
"kl": 2.551823854446411e-05,
"lambda_div_used": 0.6189040914177895,
"learning_rate": 8.568992620281243e-07,
"loss": -0.0912,
"reward": -0.22925877509987913,
"reward_after_mean": -0.22925877509987913,
"reward_after_std": 0.6055262424051762,
"reward_before_mean": 0.03262739907950163,
"reward_before_std": 0.5506744375452399,
"reward_change_max": 0.0,
"reward_change_mean": -0.2618861813098192,
"reward_change_min": -0.4676270857453346,
"reward_change_std": 0.16872322466224432,
"reward_std": 0.6055262610316277,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/cosine_scaled_reward": -0.09237260185182095,
"step": 68
},
{
"clip_fraction": 0.0,
"completion_length": 2018.6666946411133,
"epoch": 0.07885714285714286,
"grad_norm": 0.11094633489847183,
"kl": 4.1466206312179565e-05,
"lambda_div_used": 0.5734386518597603,
"learning_rate": 8.511087728614862e-07,
"loss": -0.0618,
"reward": -0.4316765144467354,
"reward_after_mean": -0.4316765144467354,
"reward_after_std": 0.4006939698010683,
"reward_before_mean": -0.1882182292174548,
"reward_before_std": 0.331131674349308,
"reward_change_max": 0.0,
"reward_change_mean": -0.24345828033983707,
"reward_change_min": -0.3763142116367817,
"reward_change_std": 0.13501812983304262,
"reward_std": 0.40069398283958435,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.22988490015268326,
"step": 69
},
{
"clip_fraction": 0.0,
"completion_length": 2860.8958587646484,
"epoch": 0.08,
"grad_norm": 0.06797178089618683,
"kl": 4.0102750062942505e-05,
"lambda_div_used": 0.6089852601289749,
"learning_rate": 8.452265630457282e-07,
"loss": 0.0416,
"reward": -0.16451344639062881,
"reward_after_mean": -0.16451344639062881,
"reward_after_std": 0.5175492316484451,
"reward_before_mean": 0.1350960824638605,
"reward_before_std": 0.5058744940906763,
"reward_change_max": 0.0,
"reward_change_mean": -0.2996095381677151,
"reward_change_min": -0.5449608005583286,
"reward_change_std": 0.2059446070343256,
"reward_std": 0.5175492409616709,
"rewards/accuracy_reward": 0.1875000037252903,
"rewards/cosine_scaled_reward": -0.05240389332175255,
"step": 70
},
{
"clip_fraction": 0.0,
"completion_length": 2700.2708587646484,
"epoch": 0.08114285714285714,
"grad_norm": 0.07971946895122528,
"kl": 4.7519803047180176e-05,
"lambda_div_used": 0.6114047914743423,
"learning_rate": 8.392544243589427e-07,
"loss": 0.0248,
"reward": -0.17357509583234787,
"reward_after_mean": -0.17357509583234787,
"reward_after_std": 0.5260436423122883,
"reward_before_mean": 0.1154884397983551,
"reward_before_std": 0.5224206217098981,
"reward_change_max": 0.0,
"reward_change_mean": -0.2890635374933481,
"reward_change_min": -0.5293963067233562,
"reward_change_std": 0.20269155222922564,
"reward_std": 0.5260436479002237,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.051178233698010445,
"step": 71
},
{
"clip_fraction": 0.0,
"completion_length": 2651.1250610351562,
"epoch": 0.08228571428571428,
"grad_norm": 0.1173771321773529,
"kl": 5.626678466796875e-05,
"lambda_div_used": 0.5625430718064308,
"learning_rate": 8.331941759724268e-07,
"loss": -0.0294,
"reward": -0.4364648088812828,
"reward_after_mean": -0.4364648088812828,
"reward_after_std": 0.3423657324165106,
"reward_before_mean": -0.17360325902700424,
"reward_before_std": 0.28492841869592667,
"reward_change_max": 0.0,
"reward_change_mean": -0.2628615368157625,
"reward_change_min": -0.44537778943777084,
"reward_change_std": 0.15708798076957464,
"reward_std": 0.3423657491803169,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.2152699390426278,
"step": 72
},
{
"clip_fraction": 0.0,
"completion_length": 3339.9583740234375,
"epoch": 0.08342857142857144,
"grad_norm": 0.04790791496634483,
"kl": 4.526972770690918e-05,
"lambda_div_used": 0.6265770718455315,
"learning_rate": 8.270476638965461e-07,
"loss": 0.0626,
"reward": -0.12956268154084682,
"reward_after_mean": -0.12956268154084682,
"reward_after_std": 0.6143265012651682,
"reward_before_mean": 0.15906396182253957,
"reward_before_std": 0.5886987801641226,
"reward_change_max": 0.0,
"reward_change_mean": -0.288626654073596,
"reward_change_min": -0.5101466961205006,
"reward_change_std": 0.1920691430568695,
"reward_std": 0.6143265273422003,
"rewards/accuracy_reward": 0.20833333767950535,
"rewards/cosine_scaled_reward": -0.049269367940723896,
"step": 73
},
{
"clip_fraction": 0.0,
"completion_length": 2385.7500381469727,
"epoch": 0.08457142857142858,
"grad_norm": 0.06600002944469452,
"kl": 3.542378544807434e-05,
"lambda_div_used": 0.6220900639891624,
"learning_rate": 8.208167604184217e-07,
"loss": -0.0312,
"reward": -0.0998319232603535,
"reward_after_mean": -0.0998319232603535,
"reward_after_std": 0.5906735248863697,
"reward_before_mean": 0.20307225361466408,
"reward_before_std": 0.566557977348566,
"reward_change_max": 0.0,
"reward_change_mean": -0.30290416814386845,
"reward_change_min": -0.5291841961443424,
"reward_change_std": 0.20113296527415514,
"reward_std": 0.5906735453754663,
"rewards/accuracy_reward": 0.20833333767950535,
"rewards/cosine_scaled_reward": -0.005261080645141192,
"step": 74
},
{
"clip_fraction": 0.0,
"completion_length": 2732.458335876465,
"epoch": 0.08571428571428572,
"grad_norm": 0.059586018323898315,
"kl": 3.4804921597242355e-05,
"lambda_div_used": 0.5497717335820198,
"learning_rate": 8.145033635316128e-07,
"loss": -0.026,
"reward": -0.16143636964261532,
"reward_after_mean": -0.16143636964261532,
"reward_after_std": 0.41724423691630363,
"reward_before_mean": 0.31992355175316334,
"reward_before_std": 0.22394374571740627,
"reward_change_max": 0.0,
"reward_change_mean": -0.4813598971813917,
"reward_change_min": -0.6314556114375591,
"reward_change_std": 0.24685372970998287,
"reward_std": 0.4172442499548197,
"rewards/accuracy_reward": 0.27083333395421505,
"rewards/cosine_scaled_reward": 0.0490901879966259,
"step": 75
},
{
"clip_fraction": 0.0,
"completion_length": 2649.9791870117188,
"epoch": 0.08685714285714285,
"grad_norm": 0.07793173938989639,
"kl": 3.8176774978637695e-05,
"lambda_div_used": 0.5533522665500641,
"learning_rate": 8.081093963579707e-07,
"loss": 0.0329,
"reward": -0.4524771338328719,
"reward_after_mean": -0.4524771338328719,
"reward_after_std": 0.32186589390039444,
"reward_before_mean": -0.18054483737796545,
"reward_before_std": 0.23863591719418764,
"reward_change_max": 0.0,
"reward_change_mean": -0.2719323057681322,
"reward_change_min": -0.402019876986742,
"reward_change_std": 0.14489794615656137,
"reward_std": 0.32186589762568474,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.20137817226350307,
"step": 76
},
{
"clip_fraction": 0.0,
"completion_length": 2847.8333587646484,
"epoch": 0.088,
"grad_norm": 0.06544006615877151,
"kl": 4.89354133605957e-05,
"lambda_div_used": 0.5558431893587112,
"learning_rate": 8.01636806561836e-07,
"loss": -0.0184,
"reward": -0.46523131616413593,
"reward_after_mean": -0.46523131616413593,
"reward_after_std": 0.3304946720600128,
"reward_before_mean": -0.20099198445677757,
"reward_before_std": 0.2519041560590267,
"reward_change_max": 0.0,
"reward_change_mean": -0.26423933170735836,
"reward_change_min": -0.37857379391789436,
"reward_change_std": 0.140173084102571,
"reward_std": 0.3304946757853031,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.22182531282305717,
"step": 77
},
{
"clip_fraction": 0.0,
"completion_length": 3269.8333740234375,
"epoch": 0.08914285714285715,
"grad_norm": 0.05014213174581528,
"kl": 3.695487976074219e-05,
"lambda_div_used": 0.6150171384215355,
"learning_rate": 7.950875657567621e-07,
"loss": -0.0153,
"reward": -0.23963370453566313,
"reward_after_mean": -0.23963370453566313,
"reward_after_std": 0.5818531475961208,
"reward_before_mean": 0.01525909942574799,
"reward_before_std": 0.5286716222763062,
"reward_change_max": 0.0,
"reward_change_mean": -0.2548927925527096,
"reward_change_min": -0.411530327051878,
"reward_change_std": 0.1550313262268901,
"reward_std": 0.5818531513214111,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/cosine_scaled_reward": -0.10974090336821973,
"step": 78
},
{
"clip_fraction": 0.0,
"completion_length": 2262.062511444092,
"epoch": 0.09028571428571429,
"grad_norm": 0.06913956999778748,
"kl": 3.7103891372680664e-05,
"lambda_div_used": 0.5935603529214859,
"learning_rate": 7.884636689049422e-07,
"loss": 0.0312,
"reward": -0.23656679573468864,
"reward_after_mean": -0.23656679573468864,
"reward_after_std": 0.5612265523523092,
"reward_before_mean": 0.08522074297070503,
"reward_before_std": 0.4243400124832988,
"reward_change_max": 0.0,
"reward_change_mean": -0.3217875510454178,
"reward_change_min": -0.4353032112121582,
"reward_change_std": 0.16354462038725615,
"reward_std": 0.5612265765666962,
"rewards/accuracy_reward": 0.18750000186264515,
"rewards/cosine_scaled_reward": -0.10227925609797239,
"step": 79
},
{
"clip_fraction": 0.0,
"completion_length": 3292.3333587646484,
"epoch": 0.09142857142857143,
"grad_norm": 0.0538838729262352,
"kl": 4.382804036140442e-05,
"lambda_div_used": 0.5688970535993576,
"learning_rate": 7.817671337095244e-07,
"loss": 0.0288,
"reward": -0.3089722655713558,
"reward_after_mean": -0.3089722655713558,
"reward_after_std": 0.3733787778764963,
"reward_before_mean": 0.0013715587556362152,
"reward_before_std": 0.30914535373449326,
"reward_change_max": 0.0,
"reward_change_mean": -0.31034383550286293,
"reward_change_min": -0.46327832341194153,
"reward_change_std": 0.17456937301903963,
"reward_std": 0.3733787890523672,
"rewards/accuracy_reward": 0.12500000558793545,
"rewards/cosine_scaled_reward": -0.12362844927702099,
"step": 80
},
{
"clip_fraction": 0.0,
"completion_length": 3043.270866394043,
"epoch": 0.09257142857142857,
"grad_norm": 0.08238786458969116,
"kl": 5.1021575927734375e-05,
"lambda_div_used": 0.5923707559704781,
"learning_rate": 7.75e-07,
"loss": 0.0152,
"reward": -0.26676796935498714,
"reward_after_mean": -0.26676796935498714,
"reward_after_std": 0.4838532619178295,
"reward_before_mean": 0.01490369625389576,
"reward_before_std": 0.4254928780719638,
"reward_change_max": 0.0,
"reward_change_mean": -0.28167167864739895,
"reward_change_min": -0.4777062237262726,
"reward_change_std": 0.17371418420225382,
"reward_std": 0.4838532619178295,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/cosine_scaled_reward": -0.11009630188345909,
"step": 81
},
{
"clip_fraction": 0.0,
"completion_length": 2630.625030517578,
"epoch": 0.09371428571428571,
"grad_norm": 0.06656210124492645,
"kl": 4.544854164123535e-05,
"lambda_div_used": 0.6220841184258461,
"learning_rate": 7.681643291108517e-07,
"loss": 0.0658,
"reward": -0.2009956305846572,
"reward_after_mean": -0.2009956305846572,
"reward_after_std": 0.6119217481464148,
"reward_before_mean": 0.05983926076442003,
"reward_before_std": 0.5658847019076347,
"reward_change_max": 0.0,
"reward_change_mean": -0.2608348857611418,
"reward_change_min": -0.45365019887685776,
"reward_change_std": 0.16699018515646458,
"reward_std": 0.6119217481464148,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.08599409856833518,
"step": 82
},
{
"clip_fraction": 0.0,
"completion_length": 2535.208381652832,
"epoch": 0.09485714285714286,
"grad_norm": 0.09338229149580002,
"kl": 5.167722702026367e-05,
"lambda_div_used": 0.5914888307452202,
"learning_rate": 7.612622032536507e-07,
"loss": 0.0644,
"reward": -0.14509151689708233,
"reward_after_mean": -0.14509151689708233,
"reward_after_std": 0.47357647120952606,
"reward_before_mean": 0.1998078590258956,
"reward_before_std": 0.4206119291484356,
"reward_change_max": 0.0,
"reward_change_mean": -0.3448993805795908,
"reward_change_min": -0.5211166813969612,
"reward_change_std": 0.20530468598008156,
"reward_std": 0.4735764730721712,
"rewards/accuracy_reward": 0.18750000558793545,
"rewards/cosine_scaled_reward": 0.012307855300605297,
"step": 83
},
{
"clip_fraction": 0.0,
"completion_length": 2915.8750076293945,
"epoch": 0.096,
"grad_norm": 0.07025005668401718,
"kl": 4.2825937271118164e-05,
"lambda_div_used": 0.6353590413928032,
"learning_rate": 7.54295724882796e-07,
"loss": -0.0466,
"reward": -0.1503741154447198,
"reward_after_mean": -0.1503741154447198,
"reward_after_std": 0.6686306204646826,
"reward_before_mean": 0.10233869170770049,
"reward_before_std": 0.6240420090034604,
"reward_change_max": 0.0,
"reward_change_mean": -0.25271278992295265,
"reward_change_min": -0.43359045311808586,
"reward_change_std": 0.1594225950539112,
"reward_std": 0.6686306446790695,
"rewards/accuracy_reward": 0.20833334140479565,
"rewards/cosine_scaled_reward": -0.10599466459825635,
"step": 84
},
{
"clip_fraction": 0.0,
"completion_length": 2968.9375610351562,
"epoch": 0.09714285714285714,
"grad_norm": 0.05030859634280205,
"kl": 2.9653310775756836e-05,
"lambda_div_used": 0.6120947226881981,
"learning_rate": 7.472670160550848e-07,
"loss": 0.02,
"reward": -0.31006562570109963,
"reward_after_mean": -0.31006562570109963,
"reward_after_std": 0.5445140562951565,
"reward_before_mean": -0.0690727960318327,
"reward_before_std": 0.5210688021034002,
"reward_change_max": 0.0,
"reward_change_mean": -0.2409928422421217,
"reward_change_min": -0.46906592324376106,
"reward_change_std": 0.1683354014530778,
"reward_std": 0.5445140581578016,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.17323945881798863,
"step": 85
},
{
"clip_fraction": 0.0,
"completion_length": 2726.9167098999023,
"epoch": 0.09828571428571428,
"grad_norm": 0.08248579502105713,
"kl": 5.6862831115722656e-05,
"lambda_div_used": 0.5853299722075462,
"learning_rate": 7.401782177833147e-07,
"loss": -0.015,
"reward": -0.32714659720659256,
"reward_after_mean": -0.32714659720659256,
"reward_after_std": 0.4505470525473356,
"reward_before_mean": -0.05685961013659835,
"reward_before_std": 0.39618763769976795,
"reward_change_max": 0.0,
"reward_change_mean": -0.2702869772911072,
"reward_change_min": -0.49061961844563484,
"reward_change_std": 0.17353465128690004,
"reward_std": 0.4505470544099808,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.14019294315949082,
"step": 86
},
{
"clip_fraction": 0.0,
"completion_length": 2504.604232788086,
"epoch": 0.09942857142857142,
"grad_norm": 0.07946456968784332,
"kl": 5.137920379638672e-05,
"lambda_div_used": 0.5860441103577614,
"learning_rate": 7.330314893841101e-07,
"loss": -0.0658,
"reward": -0.25398197025060654,
"reward_after_mean": -0.25398197025060654,
"reward_after_std": 0.4263457953929901,
"reward_before_mean": 0.039198137819767,
"reward_before_std": 0.4029952948912978,
"reward_change_max": 0.0,
"reward_change_mean": -0.2931801360100508,
"reward_change_min": -0.46910280734300613,
"reward_change_std": 0.18813357036560774,
"reward_std": 0.4263457991182804,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.1066351905465126,
"step": 87
},
{
"clip_fraction": 0.0,
"completion_length": 2021.3958702087402,
"epoch": 0.10057142857142858,
"grad_norm": 0.08914105594158173,
"kl": 3.445148468017578e-05,
"lambda_div_used": 0.6275195479393005,
"learning_rate": 7.258290078201731e-07,
"loss": 0.0661,
"reward": -0.03901347843930125,
"reward_after_mean": -0.03901347843930125,
"reward_after_std": 0.6304188724607229,
"reward_before_mean": 0.28059265296906233,
"reward_before_std": 0.5838784109801054,
"reward_change_max": 0.0,
"reward_change_mean": -0.3196061383932829,
"reward_change_min": -0.4850176088511944,
"reward_change_std": 0.19270309899002314,
"reward_std": 0.6304188761860132,
"rewards/accuracy_reward": 0.25000000931322575,
"rewards/cosine_scaled_reward": 0.030592639930546284,
"step": 88
},
{
"clip_fraction": 0.0,
"completion_length": 2855.666702270508,
"epoch": 0.10171428571428572,
"grad_norm": 0.052225928753614426,
"kl": 3.810226917266846e-05,
"lambda_div_used": 0.603381521999836,
"learning_rate": 7.185729670371604e-07,
"loss": -0.0748,
"reward": -0.2931769546121359,
"reward_after_mean": -0.2931769546121359,
"reward_after_std": 0.5186727736145258,
"reward_before_mean": -0.04963091528043151,
"reward_before_std": 0.4795092437416315,
"reward_change_max": 0.0,
"reward_change_mean": -0.2435460388660431,
"reward_change_min": -0.4475689232349396,
"reward_change_std": 0.1609211042523384,
"reward_std": 0.5186727829277515,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/cosine_scaled_reward": -0.17463092133402824,
"step": 89
},
{
"clip_fraction": 0.0,
"completion_length": 2377.9375228881836,
"epoch": 0.10285714285714286,
"grad_norm": 0.1115579754114151,
"kl": 6.511807441711426e-05,
"lambda_div_used": 0.5806571692228317,
"learning_rate": 7.11265577295385e-07,
"loss": 0.1049,
"reward": -0.39063363266177475,
"reward_after_mean": -0.39063363266177475,
"reward_after_std": 0.42313366010785103,
"reward_before_mean": -0.14117828011512756,
"reward_before_std": 0.3631622865796089,
"reward_change_max": 0.0,
"reward_change_mean": -0.24945535697042942,
"reward_change_min": -0.38458868488669395,
"reward_change_std": 0.14130910206586123,
"reward_std": 0.4231336638331413,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.1828449461609125,
"step": 90
},
{
"clip_fraction": 0.0,
"completion_length": 2758.250015258789,
"epoch": 0.104,
"grad_norm": 0.0690622553229332,
"kl": 4.595518112182617e-05,
"lambda_div_used": 0.5834402665495872,
"learning_rate": 7.039090644965509e-07,
"loss": 0.0142,
"reward": -0.1535566644743085,
"reward_after_mean": -0.1535566644743085,
"reward_after_std": 0.4708296097815037,
"reward_before_mean": 0.22540673054754734,
"reward_before_std": 0.3833985608071089,
"reward_change_max": 0.0,
"reward_change_mean": -0.3789634220302105,
"reward_change_min": -0.6174517869949341,
"reward_change_std": 0.2261828240007162,
"reward_std": 0.4708296228200197,
"rewards/accuracy_reward": 0.2291666679084301,
"rewards/cosine_scaled_reward": -0.003759911749511957,
"step": 91
},
{
"clip_fraction": 0.0,
"completion_length": 2467.2083892822266,
"epoch": 0.10514285714285715,
"grad_norm": 0.07518215477466583,
"kl": 3.6522746086120605e-05,
"lambda_div_used": 0.5778974890708923,
"learning_rate": 6.965056695057204e-07,
"loss": 0.0134,
"reward": -0.324931837618351,
"reward_after_mean": -0.324931837618351,
"reward_after_std": 0.42975695990025997,
"reward_before_mean": -0.03748153988271952,
"reward_before_std": 0.35468481201678514,
"reward_change_max": 0.0,
"reward_change_mean": -0.2874503042548895,
"reward_change_min": -0.4382200054824352,
"reward_change_std": 0.1673103515058756,
"reward_std": 0.42975698225200176,
"rewards/accuracy_reward": 0.1041666679084301,
"rewards/cosine_scaled_reward": -0.14164821058511734,
"step": 92
},
{
"clip_fraction": 0.0,
"completion_length": 3584.0,
"epoch": 0.10628571428571429,
"grad_norm": 0.05293423309922218,
"kl": 5.128979682922363e-05,
"lambda_div_used": 0.5581478402018547,
"learning_rate": 6.890576474687263e-07,
"loss": -0.0,
"reward": -0.4109080731868744,
"reward_after_mean": -0.4109080731868744,
"reward_after_std": 0.33292335644364357,
"reward_before_mean": -0.12180843483656645,
"reward_before_std": 0.26414576172828674,
"reward_change_max": 0.0,
"reward_change_mean": -0.28909964114427567,
"reward_change_min": -0.45860791578888893,
"reward_change_std": 0.16918409056961536,
"reward_std": 0.332923362031579,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.16347510018385947,
"step": 93
},
{
"clip_fraction": 0.0,
"completion_length": 2337.0625076293945,
"epoch": 0.10742857142857143,
"grad_norm": 0.08120116591453552,
"kl": 4.356354475021362e-05,
"lambda_div_used": 0.5623572915792465,
"learning_rate": 6.815672671252315e-07,
"loss": 0.0672,
"reward": -0.2148991823196411,
"reward_after_mean": -0.2148991823196411,
"reward_after_std": 0.37764647975564003,
"reward_before_mean": 0.17423074319958687,
"reward_before_std": 0.2815079055726528,
"reward_change_max": 0.0,
"reward_change_mean": -0.3891299031674862,
"reward_change_min": -0.5794133953750134,
"reward_change_std": 0.21886237617582083,
"reward_std": 0.3776464983820915,
"rewards/accuracy_reward": 0.2083333358168602,
"rewards/cosine_scaled_reward": -0.034102603793144226,
"step": 94
},
{
"clip_fraction": 0.0,
"completion_length": 3268.7708587646484,
"epoch": 0.10857142857142857,
"grad_norm": 0.048412173986434937,
"kl": 3.3482909202575684e-05,
"lambda_div_used": 0.599761851131916,
"learning_rate": 6.740368101176495e-07,
"loss": 0.0138,
"reward": -0.28226344753056765,
"reward_after_mean": -0.28226344753056765,
"reward_after_std": 0.5015552807599306,
"reward_before_mean": -0.01817359635606408,
"reward_before_std": 0.4594444427639246,
"reward_change_max": 0.0,
"reward_change_mean": -0.2640898581594229,
"reward_change_min": -0.45775213465094566,
"reward_change_std": 0.170160255394876,
"reward_std": 0.5015552863478661,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/cosine_scaled_reward": -0.14317359682172537,
"step": 95
},
{
"clip_fraction": 0.0,
"completion_length": 2311.229202270508,
"epoch": 0.10971428571428571,
"grad_norm": 0.08286502212285995,
"kl": 3.9830803871154785e-05,
"lambda_div_used": 0.6152824014425278,
"learning_rate": 6.664685702961344e-07,
"loss": 0.0394,
"reward": -0.1798182651400566,
"reward_after_mean": -0.1798182651400566,
"reward_after_std": 0.5684184953570366,
"reward_before_mean": 0.09630595671478659,
"reward_before_std": 0.5319525888189673,
"reward_change_max": 0.0,
"reward_change_mean": -0.2761242166161537,
"reward_change_min": -0.4500977620482445,
"reward_change_std": 0.17272682767361403,
"reward_std": 0.5684185232967138,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/cosine_scaled_reward": -0.11202737595885992,
"step": 96
},
{
"clip_fraction": 0.0,
"completion_length": 3088.104202270508,
"epoch": 0.11085714285714286,
"grad_norm": 0.05773423984646797,
"kl": 4.525482654571533e-05,
"lambda_div_used": 0.5830484703183174,
"learning_rate": 6.588648530198504e-07,
"loss": 0.012,
"reward": -0.24631217122077942,
"reward_after_mean": -0.24631217122077942,
"reward_after_std": 0.4177880808711052,
"reward_before_mean": 0.052551381289958954,
"reward_before_std": 0.3777043428272009,
"reward_change_max": 0.0,
"reward_change_mean": -0.2988635450601578,
"reward_change_min": -0.4822593107819557,
"reward_change_std": 0.18344944156706333,
"reward_std": 0.4177880957722664,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/cosine_scaled_reward": -0.11411529779434204,
"step": 97
},
{
"clip_fraction": 0.0,
"completion_length": 3099.6666870117188,
"epoch": 0.112,
"grad_norm": 0.05776653811335564,
"kl": 4.242360591888428e-05,
"lambda_div_used": 0.5548205152153969,
"learning_rate": 6.512279744547392e-07,
"loss": 0.0853,
"reward": -0.30739316577091813,
"reward_after_mean": -0.30739316577091813,
"reward_after_std": 0.3838734310120344,
"reward_before_mean": 0.044590696692466736,
"reward_before_std": 0.2453754236921668,
"reward_change_max": 0.0,
"reward_change_mean": -0.3519838694483042,
"reward_change_min": -0.49566248431801796,
"reward_change_std": 0.18073826655745506,
"reward_std": 0.38387343287467957,
"rewards/accuracy_reward": 0.14583333395421505,
"rewards/cosine_scaled_reward": -0.10124263912439346,
"step": 98
},
{
"clip_fraction": 0.0,
"completion_length": 2814.0416679382324,
"epoch": 0.11314285714285714,
"grad_norm": 0.10099756717681885,
"kl": 3.9711594581604004e-05,
"lambda_div_used": 0.589689776301384,
"learning_rate": 6.435602608679916e-07,
"loss": -0.0116,
"reward": -0.24916072934865952,
"reward_after_mean": -0.24916072934865952,
"reward_after_std": 0.5363867282867432,
"reward_before_mean": 0.07555947656510398,
"reward_before_std": 0.4080575914122164,
"reward_change_max": 0.0,
"reward_change_mean": -0.32472023367881775,
"reward_change_min": -0.46621018648147583,
"reward_change_std": 0.16967851482331753,
"reward_std": 0.5363867320120335,
"rewards/accuracy_reward": 0.18750000186264515,
"rewards/cosine_scaled_reward": -0.11194050963968039,
"step": 99
},
{
"clip_fraction": 0.0,
"completion_length": 2585.208366394043,
"epoch": 0.11428571428571428,
"grad_norm": 0.08013861626386642,
"kl": 3.557652235031128e-05,
"lambda_div_used": 0.6440299674868584,
"learning_rate": 6.358640479194451e-07,
"loss": 0.0233,
"reward": -0.017059004865586758,
"reward_after_mean": -0.017059004865586758,
"reward_after_std": 0.6788044832646847,
"reward_before_mean": 0.27609538938850164,
"reward_before_std": 0.672226045280695,
"reward_change_max": 0.0,
"reward_change_mean": -0.29315441474318504,
"reward_change_min": -0.560445386916399,
"reward_change_std": 0.2121435971930623,
"reward_std": 0.6788045018911362,
"rewards/accuracy_reward": 0.25000000186264515,
"rewards/cosine_scaled_reward": 0.02609538659453392,
"step": 100
},
{
"clip_fraction": 0.0,
"completion_length": 2698.687515258789,
"epoch": 0.11542857142857142,
"grad_norm": 0.08196399360895157,
"kl": 4.9740076065063477e-05,
"lambda_div_used": 0.5603137612342834,
"learning_rate": 6.281416799501187e-07,
"loss": -0.0552,
"reward": -0.2113272361457348,
"reward_after_mean": -0.2113272361457348,
"reward_after_std": 0.3802106771618128,
"reward_before_mean": 0.176470085978508,
"reward_before_std": 0.27200379874557257,
"reward_change_max": 0.0,
"reward_change_mean": -0.3877973351627588,
"reward_change_min": -0.5599614717066288,
"reward_change_std": 0.21361587569117546,
"reward_std": 0.38021068647503853,
"rewards/accuracy_reward": 0.2083333358168602,
"rewards/cosine_scaled_reward": -0.03186323493719101,
"step": 101
},
{
"clip_fraction": 0.0,
"completion_length": 2008.1875457763672,
"epoch": 0.11657142857142858,
"grad_norm": 0.09427309036254883,
"kl": 3.542006015777588e-05,
"lambda_div_used": 0.5826256647706032,
"learning_rate": 6.203955092681039e-07,
"loss": -0.0045,
"reward": -0.2004177300259471,
"reward_after_mean": -0.2004177300259471,
"reward_after_std": 0.498240664601326,
"reward_before_mean": 0.16466995794326067,
"reward_before_std": 0.37351171765476465,
"reward_change_max": 0.0,
"reward_change_mean": -0.36508770659565926,
"reward_change_min": -0.5195483043789864,
"reward_change_std": 0.19531819131225348,
"reward_std": 0.4982406720519066,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.001996707171201706,
"step": 102
},
{
"clip_fraction": 0.0,
"completion_length": 2895.250020980835,
"epoch": 0.11771428571428572,
"grad_norm": 0.06802447140216827,
"kl": 4.501640796661377e-05,
"lambda_div_used": 0.5506640374660492,
"learning_rate": 6.126278954320294e-07,
"loss": 0.0391,
"reward": -0.3214366286993027,
"reward_after_mean": -0.3214366286993027,
"reward_after_std": 0.3532369527965784,
"reward_before_mean": 0.050119780004024506,
"reward_before_std": 0.22584082814864814,
"reward_change_max": 0.0,
"reward_change_mean": -0.3715564049780369,
"reward_change_min": -0.520975548774004,
"reward_change_std": 0.19540261384099722,
"reward_std": 0.35323695838451385,
"rewards/accuracy_reward": 0.14583333395421505,
"rewards/cosine_scaled_reward": -0.09571357164531946,
"step": 103
},
{
"clip_fraction": 0.0,
"completion_length": 2521.3333435058594,
"epoch": 0.11885714285714286,
"grad_norm": 0.096355140209198,
"kl": 4.947185516357422e-05,
"lambda_div_used": 0.5742901861667633,
"learning_rate": 6.048412045323164e-07,
"loss": 0.0001,
"reward": -0.32155255414545536,
"reward_after_mean": -0.32155255414545536,
"reward_after_std": 0.39902236871421337,
"reward_before_mean": -0.03546402044594288,
"reward_before_std": 0.3418318200856447,
"reward_change_max": 0.0,
"reward_change_mean": -0.2860885336995125,
"reward_change_min": -0.4318212755024433,
"reward_change_std": 0.16667354479432106,
"reward_std": 0.39902237243950367,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.13963068462908268,
"step": 104
},
{
"clip_fraction": 0.0,
"completion_length": 2526.000030517578,
"epoch": 0.12,
"grad_norm": 0.07842111587524414,
"kl": 4.8801302909851074e-05,
"lambda_div_used": 0.6022165417671204,
"learning_rate": 5.97037808470444e-07,
"loss": 0.0569,
"reward": -0.12751667387783527,
"reward_after_mean": -0.12751667387783527,
"reward_after_std": 0.582199590280652,
"reward_before_mean": 0.24656505044549704,
"reward_before_std": 0.4756303254980594,
"reward_change_max": 0.0,
"reward_change_mean": -0.37408171594142914,
"reward_change_min": -0.5777854695916176,
"reward_change_std": 0.22329542227089405,
"reward_std": 0.5821996051818132,
"rewards/accuracy_reward": 0.25000000186264515,
"rewards/cosine_scaled_reward": -0.0034349607303738594,
"step": 105
},
{
"clip_fraction": 0.0,
"completion_length": 2396.291702270508,
"epoch": 0.12114285714285715,
"grad_norm": 0.07387516647577286,
"kl": 2.6203226298093796e-05,
"lambda_div_used": 0.6128493994474411,
"learning_rate": 5.892200842364462e-07,
"loss": -0.0293,
"reward": 0.159407502040267,
"reward_after_mean": 0.159407502040267,
"reward_after_std": 0.5759131647646427,
"reward_before_mean": 0.6274341251701117,
"reward_before_std": 0.5208892030641437,
"reward_change_max": 0.0,
"reward_change_mean": -0.46802657656371593,
"reward_change_min": -0.7112628631293774,
"reward_change_std": 0.2871347274631262,
"reward_std": 0.5759131908416748,
"rewards/accuracy_reward": 0.4583333469927311,
"rewards/cosine_scaled_reward": 0.16910075349733233,
"step": 106
},
{
"clip_fraction": 0.0,
"completion_length": 2806.875045776367,
"epoch": 0.12228571428571429,
"grad_norm": 0.08797896653413773,
"kl": 4.5686960220336914e-05,
"lambda_div_used": 0.5551519840955734,
"learning_rate": 5.813904131848564e-07,
"loss": -0.0206,
"reward": -0.20906101167201996,
"reward_after_mean": -0.20906101167201996,
"reward_after_std": 0.35940215550363064,
"reward_before_mean": 0.19015613943338394,
"reward_before_std": 0.24773720651865005,
"reward_change_max": 0.0,
"reward_change_mean": -0.39921717159450054,
"reward_change_min": -0.5645803362131119,
"reward_change_std": 0.21728475205600262,
"reward_std": 0.35940215922892094,
"rewards/accuracy_reward": 0.2291666716337204,
"rewards/cosine_scaled_reward": -0.03901051543653011,
"step": 107
},
{
"clip_fraction": 0.0,
"completion_length": 2676.3125381469727,
"epoch": 0.12342857142857143,
"grad_norm": 0.08117339015007019,
"kl": 4.683062434196472e-05,
"lambda_div_used": 0.5782058760523796,
"learning_rate": 5.735511803093248e-07,
"loss": 0.0222,
"reward": -0.33698799335979857,
"reward_after_mean": -0.33698799335979857,
"reward_after_std": 0.4150846730917692,
"reward_before_mean": -0.06898907572031021,
"reward_before_std": 0.3580573983490467,
"reward_change_max": 0.0,
"reward_change_mean": -0.2679989282041788,
"reward_change_min": -0.39731478318572044,
"reward_change_std": 0.15669311955571175,
"reward_std": 0.41508468240499496,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.17315573990345,
"step": 108
},
{
"clip_fraction": 0.0,
"completion_length": 3091.4166870117188,
"epoch": 0.12457142857142857,
"grad_norm": 0.06771776080131531,
"kl": 4.081428050994873e-05,
"lambda_div_used": 0.5580320879817009,
"learning_rate": 5.657047735161255e-07,
"loss": -0.0162,
"reward": -0.30369802191853523,
"reward_after_mean": -0.30369802191853523,
"reward_after_std": 0.3783009462058544,
"reward_before_mean": 0.060508192516863346,
"reward_before_std": 0.2597539462149143,
"reward_change_max": 0.0,
"reward_change_mean": -0.36420623771846294,
"reward_change_min": -0.5151935815811157,
"reward_change_std": 0.19158014561980963,
"reward_std": 0.37830095551908016,
"rewards/accuracy_reward": 0.14583333395421505,
"rewards/cosine_scaled_reward": -0.08532514423131943,
"step": 109
},
{
"clip_fraction": 0.0,
"completion_length": 2727.875045776367,
"epoch": 0.12571428571428572,
"grad_norm": 0.07848876714706421,
"kl": 3.8996338844299316e-05,
"lambda_div_used": 0.5898331105709076,
"learning_rate": 5.578535828967777e-07,
"loss": 0.0059,
"reward": -0.27789773186668754,
"reward_after_mean": -0.27789773186668754,
"reward_after_std": 0.48964778520166874,
"reward_before_mean": 0.006700331810861826,
"reward_before_std": 0.40999170672148466,
"reward_change_max": 0.0,
"reward_change_mean": -0.28459805622696877,
"reward_change_min": -0.4541938379406929,
"reward_change_std": 0.16897330060601234,
"reward_std": 0.48964780382812023,
"rewards/accuracy_reward": 0.16666667349636555,
"rewards/cosine_scaled_reward": -0.15996634401381016,
"step": 110
},
{
"clip_fraction": 0.0,
"completion_length": 3152.3958587646484,
"epoch": 0.12685714285714286,
"grad_norm": 0.06026454642415047,
"kl": 5.206465721130371e-05,
"lambda_div_used": 0.6035187616944313,
"learning_rate": 5.5e-07,
"loss": 0.0004,
"reward": -0.22968922927975655,
"reward_after_mean": -0.22968922927975655,
"reward_after_std": 0.5089491438120604,
"reward_before_mean": 0.050578076392412186,
"reward_before_std": 0.47721442952752113,
"reward_change_max": 0.0,
"reward_change_mean": -0.28026728704571724,
"reward_change_min": -0.4855511710047722,
"reward_change_std": 0.1824399819597602,
"reward_std": 0.5089491568505764,
"rewards/accuracy_reward": 0.14583333395421505,
"rewards/cosine_scaled_reward": -0.09525526221841574,
"step": 111
},
{
"clip_fraction": 0.0,
"completion_length": 3253.937530517578,
"epoch": 0.128,
"grad_norm": 0.054045893251895905,
"kl": 4.832446575164795e-05,
"lambda_div_used": 0.6144762486219406,
"learning_rate": 5.421464171032224e-07,
"loss": 0.0014,
"reward": -0.07707784557715058,
"reward_after_mean": -0.07707784557715058,
"reward_after_std": 0.5871799997985363,
"reward_before_mean": 0.2822006791830063,
"reward_before_std": 0.5333117246627808,
"reward_change_max": 0.0,
"reward_change_mean": -0.35927851870656013,
"reward_change_min": -0.6481843888759613,
"reward_change_std": 0.23641589283943176,
"reward_std": 0.5871800072491169,
"rewards/accuracy_reward": 0.2708333358168602,
"rewards/cosine_scaled_reward": 0.01136732567101717,
"step": 112
},
{
"clip_fraction": 0.0,
"completion_length": 2541.812545776367,
"epoch": 0.12914285714285714,
"grad_norm": 0.09499017894268036,
"kl": 4.720315337181091e-05,
"lambda_div_used": 0.5564066097140312,
"learning_rate": 5.342952264838747e-07,
"loss": 0.0471,
"reward": -0.3626333475112915,
"reward_after_mean": -0.3626333475112915,
"reward_after_std": 0.3075137473642826,
"reward_before_mean": -0.05370184499770403,
"reward_before_std": 0.25310691073536873,
"reward_change_max": 0.0,
"reward_change_mean": -0.30893150344491005,
"reward_change_min": -0.4691920429468155,
"reward_change_std": 0.1770205283537507,
"reward_std": 0.3075137585401535,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.13703518453985453,
"step": 113
},
{
"clip_fraction": 0.0,
"completion_length": 2547.5416984558105,
"epoch": 0.13028571428571428,
"grad_norm": 0.06982935965061188,
"kl": 2.650544047355652e-05,
"lambda_div_used": 0.5948992818593979,
"learning_rate": 5.264488196906752e-07,
"loss": 0.0506,
"reward": -0.33033538423478603,
"reward_after_mean": -0.33033538423478603,
"reward_after_std": 0.49076898768544197,
"reward_before_mean": -0.08067071554251015,
"reward_before_std": 0.4387869122438133,
"reward_change_max": 0.0,
"reward_change_mean": -0.2496646661311388,
"reward_change_min": -0.38466524705290794,
"reward_change_std": 0.15147447120398283,
"reward_std": 0.4907689895480871,
"rewards/accuracy_reward": 0.1041666679084301,
"rewards/cosine_scaled_reward": -0.18483738787472248,
"step": 114
},
{
"clip_fraction": 0.0,
"completion_length": 2926.3125228881836,
"epoch": 0.13142857142857142,
"grad_norm": 0.10055308789014816,
"kl": 4.2185187339782715e-05,
"lambda_div_used": 0.5866656303405762,
"learning_rate": 5.186095868151436e-07,
"loss": 0.0219,
"reward": -0.16912975907325745,
"reward_after_mean": -0.16912975907325745,
"reward_after_std": 0.4218728318810463,
"reward_before_mean": 0.17230269685387611,
"reward_before_std": 0.39482294395565987,
"reward_change_max": 0.0,
"reward_change_mean": -0.34143244847655296,
"reward_change_min": -0.5322228632867336,
"reward_change_std": 0.21214590221643448,
"reward_std": 0.4218728430569172,
"rewards/accuracy_reward": 0.2708333432674408,
"rewards/cosine_scaled_reward": -0.09853065386414528,
"step": 115
},
{
"clip_fraction": 0.0,
"completion_length": 3185.895835876465,
"epoch": 0.13257142857142856,
"grad_norm": 0.06827546656131744,
"kl": 4.4345855712890625e-05,
"lambda_div_used": 0.579861506819725,
"learning_rate": 5.107799157635538e-07,
"loss": -0.017,
"reward": -0.36347829084843397,
"reward_after_mean": -0.36347829084843397,
"reward_after_std": 0.425207169726491,
"reward_before_mean": -0.10740169882774353,
"reward_before_std": 0.36591998394578695,
"reward_change_max": 0.0,
"reward_change_mean": -0.25607660599052906,
"reward_change_min": -0.4011174105107784,
"reward_change_std": 0.15214112866669893,
"reward_std": 0.4252071734517813,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/cosine_scaled_reward": -0.19073502300307155,
"step": 116
},
{
"clip_fraction": 0.0,
"completion_length": 3265.666702270508,
"epoch": 0.1337142857142857,
"grad_norm": 0.058839015662670135,
"kl": 5.224347114562988e-05,
"lambda_div_used": 0.5589891448616982,
"learning_rate": 5.02962191529556e-07,
"loss": 0.0144,
"reward": -0.47565145045518875,
"reward_after_mean": -0.47565145045518875,
"reward_after_std": 0.34672038443386555,
"reward_before_mean": -0.2341720014810562,
"reward_before_std": 0.2644944768399,
"reward_change_max": 0.0,
"reward_change_mean": -0.2414794433861971,
"reward_change_min": -0.3730311393737793,
"reward_change_std": 0.13014927878975868,
"reward_std": 0.3467204011976719,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.2550053410232067,
"step": 117
},
{
"clip_fraction": 0.0,
"completion_length": 3087.9375610351562,
"epoch": 0.13485714285714287,
"grad_norm": 0.05542779713869095,
"kl": 3.71783971786499e-05,
"lambda_div_used": 0.6410617902874947,
"learning_rate": 4.951587954676837e-07,
"loss": 0.0602,
"reward": 0.02640039217658341,
"reward_after_mean": 0.02640039217658341,
"reward_after_std": 0.6670121420174837,
"reward_before_mean": 0.34903212962672114,
"reward_before_std": 0.6545964349061251,
"reward_change_max": 0.0,
"reward_change_mean": -0.322631748393178,
"reward_change_min": -0.5322613082826138,
"reward_change_std": 0.21465477347373962,
"reward_std": 0.66701215878129,
"rewards/accuracy_reward": 0.2708333395421505,
"rewards/cosine_scaled_reward": 0.07819879939779639,
"step": 118
},
{
"clip_fraction": 0.0,
"completion_length": 2036.8542251586914,
"epoch": 0.136,
"grad_norm": 0.09512177854776382,
"kl": 4.920363426208496e-05,
"lambda_div_used": 0.5765073597431183,
"learning_rate": 4.873721045679706e-07,
"loss": 0.0968,
"reward": -0.1327025555074215,
"reward_after_mean": -0.1327025555074215,
"reward_after_std": 0.4383635278791189,
"reward_before_mean": 0.2736157886683941,
"reward_before_std": 0.34712607227265835,
"reward_change_max": 0.0,
"reward_change_mean": -0.4063183292746544,
"reward_change_min": -0.5770336836576462,
"reward_change_std": 0.22668993193656206,
"reward_std": 0.43836353346705437,
"rewards/accuracy_reward": 0.22916666977107525,
"rewards/cosine_scaled_reward": 0.0444490984082222,
"step": 119
},
{
"clip_fraction": 0.0,
"completion_length": 2693.6458854675293,
"epoch": 0.13714285714285715,
"grad_norm": 0.0801864042878151,
"kl": 5.1856040954589844e-05,
"lambda_div_used": 0.5954036563634872,
"learning_rate": 4.79604490731896e-07,
"loss": -0.0416,
"reward": -0.10674675926566124,
"reward_after_mean": -0.10674675926566124,
"reward_after_std": 0.5345237273722887,
"reward_before_mean": 0.26713848020881414,
"reward_before_std": 0.44027570402249694,
"reward_change_max": 0.0,
"reward_change_mean": -0.37388524040579796,
"reward_change_min": -0.5998699106276035,
"reward_change_std": 0.22230207175016403,
"reward_std": 0.5345237515866756,
"rewards/accuracy_reward": 0.25000000186264515,
"rewards/cosine_scaled_reward": 0.017138468101620674,
"step": 120
},
{
"clip_fraction": 0.0,
"completion_length": 1960.8958587646484,
"epoch": 0.1382857142857143,
"grad_norm": 0.09297072887420654,
"kl": 3.6925775930285454e-05,
"lambda_div_used": 0.5758452340960503,
"learning_rate": 4.7185832004988133e-07,
"loss": -0.0275,
"reward": -0.15540813654661179,
"reward_after_mean": -0.15540813654661179,
"reward_after_std": 0.3896722886711359,
"reward_before_mean": 0.20818629674613476,
"reward_before_std": 0.34600438084453344,
"reward_change_max": 0.0,
"reward_change_mean": -0.3635944165289402,
"reward_change_min": -0.5350156500935555,
"reward_change_std": 0.21344942972064018,
"reward_std": 0.38967230543494225,
"rewards/accuracy_reward": 0.1875000074505806,
"rewards/cosine_scaled_reward": 0.020686281844973564,
"step": 121
},
{
"clip_fraction": 0.0,
"completion_length": 2996.7917289733887,
"epoch": 0.13942857142857143,
"grad_norm": 0.07053450495004654,
"kl": 5.537271499633789e-05,
"lambda_div_used": 0.604009747505188,
"learning_rate": 4.641359520805548e-07,
"loss": 0.0244,
"reward": -0.18336665583774447,
"reward_after_mean": -0.18336665583774447,
"reward_after_std": 0.5550395585596561,
"reward_before_mean": 0.11408489104360342,
"reward_before_std": 0.4755072835832834,
"reward_change_max": 0.0,
"reward_change_mean": -0.29745154455304146,
"reward_change_min": -0.4127044528722763,
"reward_change_std": 0.16168679296970367,
"reward_std": 0.5550395771861076,
"rewards/accuracy_reward": 0.16666667349636555,
"rewards/cosine_scaled_reward": -0.05258178571239114,
"step": 122
},
{
"clip_fraction": 0.0,
"completion_length": 2965.687530517578,
"epoch": 0.14057142857142857,
"grad_norm": 0.061138082295656204,
"kl": 4.2922794818878174e-05,
"lambda_div_used": 0.5864768177270889,
"learning_rate": 4.5643973913200837e-07,
"loss": 0.0629,
"reward": -0.24433407932519913,
"reward_after_mean": -0.24433407932519913,
"reward_after_std": 0.43522679433226585,
"reward_before_mean": 0.05285754054784775,
"reward_before_std": 0.4003843404352665,
"reward_change_max": 0.0,
"reward_change_mean": -0.2971916198730469,
"reward_change_min": -0.4781832844018936,
"reward_change_std": 0.18817945942282677,
"reward_std": 0.435226796194911,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.09297579154372215,
"step": 123
},
{
"clip_fraction": 0.0,
"completion_length": 2273.895881652832,
"epoch": 0.1417142857142857,
"grad_norm": 0.06792795658111572,
"kl": 2.843700349330902e-05,
"lambda_div_used": 0.5992666333913803,
"learning_rate": 4.4877202554526084e-07,
"loss": 0.0724,
"reward": -0.030117375776171684,
"reward_after_mean": -0.030117375776171684,
"reward_after_std": 0.6018249355256557,
"reward_before_mean": 0.40752510842867196,
"reward_before_std": 0.4566116305068135,
"reward_change_max": 0.0,
"reward_change_mean": -0.4376424718648195,
"reward_change_min": -0.6283221691846848,
"reward_change_std": 0.2420077919960022,
"reward_std": 0.601824939250946,
"rewards/accuracy_reward": 0.3541666679084301,
"rewards/cosine_scaled_reward": 0.05335840582847595,
"step": 124
},
{
"clip_fraction": 0.0,
"completion_length": 2841.687515258789,
"epoch": 0.14285714285714285,
"grad_norm": 0.057179540395736694,
"kl": 3.587547689676285e-05,
"lambda_div_used": 0.5674227699637413,
"learning_rate": 4.4113514698014953e-07,
"loss": 0.0092,
"reward": -0.09743379801511765,
"reward_after_mean": -0.09743379801511765,
"reward_after_std": 0.4501750059425831,
"reward_before_mean": 0.3561172317713499,
"reward_before_std": 0.3082335013896227,
"reward_change_max": 0.0,
"reward_change_mean": -0.453551035374403,
"reward_change_min": -0.6685109585523605,
"reward_change_std": 0.25046134926378727,
"reward_std": 0.4501750282943249,
"rewards/accuracy_reward": 0.3125,
"rewards/cosine_scaled_reward": 0.043617233633995056,
"step": 125
},
{
"clip_fraction": 0.0,
"completion_length": 2814.3333892822266,
"epoch": 0.144,
"grad_norm": 0.06912728399038315,
"kl": 3.851950168609619e-05,
"lambda_div_used": 0.6011649072170258,
"learning_rate": 4.3353142970386557e-07,
"loss": 0.0079,
"reward": -0.2701383363455534,
"reward_after_mean": -0.2701383363455534,
"reward_after_std": 0.517305538058281,
"reward_before_mean": -0.00174633227288723,
"reward_before_std": 0.46759936958551407,
"reward_change_max": 0.0,
"reward_change_mean": -0.2683920059353113,
"reward_change_min": -0.42111679539084435,
"reward_change_std": 0.16303524654358625,
"reward_std": 0.5173055771738291,
"rewards/accuracy_reward": 0.1041666679084301,
"rewards/cosine_scaled_reward": -0.10591300774831325,
"step": 126
},
{
"clip_fraction": 0.0,
"completion_length": 3520.375,
"epoch": 0.14514285714285713,
"grad_norm": 0.05083320662379265,
"kl": 3.781914710998535e-05,
"lambda_div_used": 0.5308314934372902,
"learning_rate": 4.2596318988235037e-07,
"loss": 0.0094,
"reward": -0.5137354172766209,
"reward_after_mean": -0.5137354172766209,
"reward_after_std": 0.23104721494019032,
"reward_before_mean": -0.23012915067374706,
"reward_before_std": 0.13533379370346665,
"reward_change_max": 0.0,
"reward_change_mean": -0.28360624983906746,
"reward_change_min": -0.4074147306382656,
"reward_change_std": 0.14828919060528278,
"reward_std": 0.2310472223907709,
"rewards/accuracy_reward": 0.0,
"rewards/cosine_scaled_reward": -0.2301291599869728,
"step": 127
},
{
"clip_fraction": 0.0,
"completion_length": 2622.6041946411133,
"epoch": 0.1462857142857143,
"grad_norm": 0.07047850638628006,
"kl": 3.9443373680114746e-05,
"lambda_div_used": 0.5977307558059692,
"learning_rate": 4.1843273287476854e-07,
"loss": 0.0245,
"reward": 0.14617427624762058,
"reward_after_mean": 0.14617427624762058,
"reward_after_std": 0.5826170947402716,
"reward_before_mean": 0.6620007424353389,
"reward_before_std": 0.45144926803186536,
"reward_change_max": 0.0,
"reward_change_mean": -0.5158264562487602,
"reward_change_min": -0.7390152402222157,
"reward_change_std": 0.29064416885375977,
"reward_std": 0.5826170966029167,
"rewards/accuracy_reward": 0.41666666977107525,
"rewards/cosine_scaled_reward": 0.2453340534120798,
"step": 128
},
{
"clip_fraction": 0.0,
"completion_length": 3211.166702270508,
"epoch": 0.14742857142857144,
"grad_norm": 0.06908967345952988,
"kl": 4.245340824127197e-05,
"lambda_div_used": 0.579024501144886,
"learning_rate": 4.1094235253127374e-07,
"loss": -0.0068,
"reward": -0.41220802813768387,
"reward_after_mean": -0.41220802813768387,
"reward_after_std": 0.4235076569020748,
"reward_before_mean": -0.1792316001956351,
"reward_before_std": 0.3573149349540472,
"reward_change_max": 0.0,
"reward_change_mean": -0.2329764310270548,
"reward_change_min": -0.38812074810266495,
"reward_change_std": 0.13700235076248646,
"reward_std": 0.423507671803236,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.24173159897327423,
"step": 129
},
{
"clip_fraction": 0.0,
"completion_length": 3416.375030517578,
"epoch": 0.14857142857142858,
"grad_norm": 0.05305078998208046,
"kl": 3.8370490074157715e-05,
"lambda_div_used": 0.5585792362689972,
"learning_rate": 4.034943304942796e-07,
"loss": -0.0353,
"reward": -0.324653722345829,
"reward_after_mean": -0.324653722345829,
"reward_after_std": 0.30889566242694855,
"reward_before_mean": 0.0034640291705727577,
"reward_before_std": 0.2620142959058285,
"reward_change_max": 0.0,
"reward_change_mean": -0.32811774499714375,
"reward_change_min": -0.4815611355006695,
"reward_change_std": 0.18737321346998215,
"reward_std": 0.30889566615223885,
"rewards/accuracy_reward": 0.1041666716337204,
"rewards/cosine_scaled_reward": -0.10070264618843794,
"step": 130
},
{
"clip_fraction": 0.0,
"completion_length": 2672.8541984558105,
"epoch": 0.14971428571428572,
"grad_norm": 0.08398960530757904,
"kl": 2.8876587748527527e-05,
"lambda_div_used": 0.6004800871014595,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.0142,
"reward": 0.04825104773044586,
"reward_after_mean": 0.04825104773044586,
"reward_after_std": 0.537496130913496,
"reward_before_mean": 0.48470486514270306,
"reward_before_std": 0.4661959493532777,
"reward_change_max": 0.0,
"reward_change_mean": -0.43645381927490234,
"reward_change_min": -0.650093249976635,
"reward_change_std": 0.2640222804620862,
"reward_std": 0.5374961327761412,
"rewards/accuracy_reward": 0.3750000074505806,
"rewards/cosine_scaled_reward": 0.10970486886799335,
"step": 131
},
{
"clip_fraction": 0.0,
"completion_length": 2745.8541870117188,
"epoch": 0.15085714285714286,
"grad_norm": 0.10217073559761047,
"kl": 3.91155481338501e-05,
"lambda_div_used": 0.5872602090239525,
"learning_rate": 3.8873442270461485e-07,
"loss": -0.0282,
"reward": -0.18162552546709776,
"reward_after_mean": -0.18162552546709776,
"reward_after_std": 0.4464551955461502,
"reward_before_mean": 0.14102918095886707,
"reward_before_std": 0.3960893382318318,
"reward_change_max": 0.0,
"reward_change_mean": -0.32265469804406166,
"reward_change_min": -0.477376826107502,
"reward_change_std": 0.18945662677288055,
"reward_std": 0.4464551992714405,
"rewards/accuracy_reward": 0.22916667722165585,
"rewards/cosine_scaled_reward": -0.08813749923137948,
"step": 132
},
{
"clip_fraction": 0.0,
"completion_length": 3396.625,
"epoch": 0.152,
"grad_norm": 0.05338770151138306,
"kl": 4.1738152503967285e-05,
"lambda_div_used": 0.5715877488255501,
"learning_rate": 3.8142703296283953e-07,
"loss": -0.0083,
"reward": -0.3019852191209793,
"reward_after_mean": -0.3019852191209793,
"reward_after_std": 0.394625848159194,
"reward_before_mean": 0.015070955269038677,
"reward_before_std": 0.3251611590385437,
"reward_change_max": 0.0,
"reward_change_mean": -0.3170561697334051,
"reward_change_min": -0.5240310952067375,
"reward_change_std": 0.18560713436454535,
"reward_std": 0.39462585374712944,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.15159572381526232,
"step": 133
},
{
"clip_fraction": 0.0,
"completion_length": 2655.7708435058594,
"epoch": 0.15314285714285714,
"grad_norm": 0.08679015934467316,
"kl": 8.308887481689453e-05,
"lambda_div_used": 0.5542216002941132,
"learning_rate": 3.7417099217982686e-07,
"loss": 0.0208,
"reward": -0.32178156822919846,
"reward_after_mean": -0.32178156822919846,
"reward_after_std": 0.2957470379769802,
"reward_before_mean": 0.012201536446809769,
"reward_before_std": 0.24192149471491575,
"reward_change_max": 0.0,
"reward_change_mean": -0.3339830953627825,
"reward_change_min": -0.4839101508259773,
"reward_change_std": 0.18742438219487667,
"reward_std": 0.2957470417022705,
"rewards/accuracy_reward": 0.1041666716337204,
"rewards/cosine_scaled_reward": -0.09196514077484608,
"step": 134
},
{
"clip_fraction": 0.0,
"completion_length": 2014.9583587646484,
"epoch": 0.15428571428571428,
"grad_norm": 0.07728467881679535,
"kl": 2.7412548661231995e-05,
"lambda_div_used": 0.6218282878398895,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.0739,
"reward": 0.15841423906385899,
"reward_after_mean": 0.15841423906385899,
"reward_after_std": 0.6740392297506332,
"reward_before_mean": 0.6407645158469677,
"reward_before_std": 0.5697397403419018,
"reward_change_max": 0.0,
"reward_change_mean": -0.48235028609633446,
"reward_change_min": -0.7837426699697971,
"reward_change_std": 0.3034245353192091,
"reward_std": 0.6740392409265041,
"rewards/accuracy_reward": 0.4791666716337204,
"rewards/cosine_scaled_reward": 0.161597837228328,
"step": 135
},
{
"clip_fraction": 0.0,
"completion_length": 2602.416717529297,
"epoch": 0.15542857142857142,
"grad_norm": 0.06872163712978363,
"kl": 2.8759241104125977e-05,
"lambda_div_used": 0.6318690255284309,
"learning_rate": 3.5982178221668533e-07,
"loss": 0.0344,
"reward": 0.06710329907946289,
"reward_after_mean": 0.06710329907946289,
"reward_after_std": 0.6661366745829582,
"reward_before_mean": 0.454304663464427,
"reward_before_std": 0.6177845690399408,
"reward_change_max": 0.0,
"reward_change_mean": -0.38720136508345604,
"reward_change_min": -0.6763367056846619,
"reward_change_std": 0.258380358107388,
"reward_std": 0.6661366857588291,
"rewards/accuracy_reward": 0.3541666716337204,
"rewards/cosine_scaled_reward": 0.10013799648731947,
"step": 136
},
{
"clip_fraction": 0.0,
"completion_length": 3239.666679382324,
"epoch": 0.15657142857142858,
"grad_norm": 0.0593891404569149,
"kl": 3.842264413833618e-05,
"lambda_div_used": 0.5779730677604675,
"learning_rate": 3.5273298394491515e-07,
"loss": 0.0124,
"reward": -0.3537064976990223,
"reward_after_mean": -0.3537064976990223,
"reward_after_std": 0.41078851372003555,
"reward_before_mean": -0.08445308171212673,
"reward_before_std": 0.35645375214517117,
"reward_change_max": 0.0,
"reward_change_mean": -0.2692534364759922,
"reward_change_min": -0.4356107972562313,
"reward_change_std": 0.15999021660536528,
"reward_std": 0.4107885267585516,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/cosine_scaled_reward": -0.16778641194105148,
"step": 137
},
{
"clip_fraction": 0.0,
"completion_length": 2760.687526702881,
"epoch": 0.15771428571428572,
"grad_norm": 0.0790812149643898,
"kl": 2.8595328330993652e-05,
"lambda_div_used": 0.5799111127853394,
"learning_rate": 3.45704275117204e-07,
"loss": -0.0362,
"reward": -0.27829512720927596,
"reward_after_mean": -0.27829512720927596,
"reward_after_std": 0.41231589019298553,
"reward_before_mean": 0.02187724970281124,
"reward_before_std": 0.362152349203825,
"reward_change_max": 0.0,
"reward_change_mean": -0.3001723885536194,
"reward_change_min": -0.45827802270650864,
"reward_change_std": 0.17282870691269636,
"reward_std": 0.412315895780921,
"rewards/accuracy_reward": 0.12500000558793545,
"rewards/cosine_scaled_reward": -0.10312275495380163,
"step": 138
},
{
"clip_fraction": 0.0,
"completion_length": 2984.4583587646484,
"epoch": 0.15885714285714286,
"grad_norm": 0.06229131296277046,
"kl": 3.9517879486083984e-05,
"lambda_div_used": 0.5838895812630653,
"learning_rate": 3.387377967463493e-07,
"loss": -0.0589,
"reward": -0.3416835393290967,
"reward_after_mean": -0.3416835393290967,
"reward_after_std": 0.4420221708714962,
"reward_before_mean": -0.08248884417116642,
"reward_before_std": 0.384817186743021,
"reward_change_max": 0.0,
"reward_change_mean": -0.259194690734148,
"reward_change_min": -0.42579447478055954,
"reward_change_std": 0.15513746719807386,
"reward_std": 0.44202217273414135,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/cosine_scaled_reward": -0.16582217533141375,
"step": 139
},
{
"clip_fraction": 0.0,
"completion_length": 2881.1250228881836,
"epoch": 0.16,
"grad_norm": 0.071082703769207,
"kl": 4.310905933380127e-05,
"lambda_div_used": 0.5740059092640877,
"learning_rate": 3.3183567088914833e-07,
"loss": 0.0019,
"reward": -0.25879082828760147,
"reward_after_mean": -0.25879082828760147,
"reward_after_std": 0.4811950568109751,
"reward_before_mean": 0.09872006997466087,
"reward_before_std": 0.3372529884800315,
"reward_change_max": 0.0,
"reward_change_mean": -0.35751091688871384,
"reward_change_min": -0.4904663935303688,
"reward_change_std": 0.18400432635098696,
"reward_std": 0.481195081025362,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.06794659808656434,
"step": 140
},
{
"clip_fraction": 0.0,
"completion_length": 3180.187515258789,
"epoch": 0.16114285714285714,
"grad_norm": 0.05953420698642731,
"kl": 3.790855407714844e-05,
"lambda_div_used": 0.5660471692681313,
"learning_rate": 3.250000000000001e-07,
"loss": -0.0557,
"reward": -0.3696493972092867,
"reward_after_mean": -0.3696493972092867,
"reward_after_std": 0.37020369805395603,
"reward_before_mean": -0.1031316639855504,
"reward_before_std": 0.300534725189209,
"reward_change_max": 0.0,
"reward_change_mean": -0.2665177509188652,
"reward_change_min": -0.3948013670742512,
"reward_change_std": 0.151769301854074,
"reward_std": 0.3702037110924721,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.20729831932112575,
"step": 141
},
{
"clip_fraction": 0.0,
"completion_length": 2633.291732788086,
"epoch": 0.16228571428571428,
"grad_norm": 0.06993869692087173,
"kl": 4.523620009422302e-05,
"lambda_div_used": 0.643665611743927,
"learning_rate": 3.182328662904756e-07,
"loss": 0.0607,
"reward": -0.11501466785557568,
"reward_after_mean": -0.11501466785557568,
"reward_after_std": 0.7061873953789473,
"reward_before_mean": 0.1335596082135453,
"reward_before_std": 0.6679177191108465,
"reward_change_max": 0.0,
"reward_change_mean": -0.2485742662101984,
"reward_change_min": -0.438314501196146,
"reward_change_std": 0.16557594947516918,
"reward_std": 0.7061873972415924,
"rewards/accuracy_reward": 0.18750000186264515,
"rewards/cosine_scaled_reward": -0.05394040001556277,
"step": 142
},
{
"clip_fraction": 0.0,
"completion_length": 2543.854202270508,
"epoch": 0.16342857142857142,
"grad_norm": 0.09050207585096359,
"kl": 4.5418739318847656e-05,
"lambda_div_used": 0.5568482205271721,
"learning_rate": 3.115363310950578e-07,
"loss": 0.0559,
"reward": -0.44561293721199036,
"reward_after_mean": -0.44561293721199036,
"reward_after_std": 0.34061737172305584,
"reward_before_mean": -0.17733613029122353,
"reward_before_std": 0.2554386807605624,
"reward_change_max": 0.0,
"reward_change_mean": -0.26827680319547653,
"reward_change_min": -0.39999108389019966,
"reward_change_std": 0.14368562400341034,
"reward_std": 0.3406173773109913,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.19816946546779945,
"step": 143
},
{
"clip_fraction": 0.0,
"completion_length": 3062.2291984558105,
"epoch": 0.16457142857142856,
"grad_norm": 0.08773668855428696,
"kl": 4.3526291847229004e-05,
"lambda_div_used": 0.5946916490793228,
"learning_rate": 3.0491243424323783e-07,
"loss": 0.0214,
"reward": -0.1852840557694435,
"reward_after_mean": -0.1852840557694435,
"reward_after_std": 0.4624195992946625,
"reward_before_mean": 0.13713636994361877,
"reward_before_std": 0.4314339701086283,
"reward_change_max": 0.0,
"reward_change_mean": -0.3224204182624817,
"reward_change_min": -0.502110667526722,
"reward_change_std": 0.19803205784410238,
"reward_std": 0.4624196030199528,
"rewards/accuracy_reward": 0.18750000558793545,
"rewards/cosine_scaled_reward": -0.05036364169791341,
"step": 144
},
{
"clip_fraction": 0.0,
"completion_length": 2343.708381652832,
"epoch": 0.1657142857142857,
"grad_norm": 0.09618457406759262,
"kl": 3.914535045623779e-05,
"lambda_div_used": 0.6065893918275833,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.0239,
"reward": -0.08643046766519547,
"reward_after_mean": -0.08643046766519547,
"reward_after_std": 0.565411014482379,
"reward_before_mean": 0.2700228439643979,
"reward_before_std": 0.4942969996482134,
"reward_change_max": 0.0,
"reward_change_mean": -0.35645330883562565,
"reward_change_min": -0.5621693357825279,
"reward_change_std": 0.21702316030859947,
"reward_std": 0.565411014482379,
"rewards/accuracy_reward": 0.29166666977107525,
"rewards/cosine_scaled_reward": -0.021643826737999916,
"step": 145
},
{
"clip_fraction": 0.0,
"completion_length": 2643.750030517578,
"epoch": 0.16685714285714287,
"grad_norm": 0.06749647855758667,
"kl": 3.0444934964179993e-05,
"lambda_div_used": 0.5759064182639122,
"learning_rate": 2.918906036420294e-07,
"loss": -0.0525,
"reward": -0.3986330684274435,
"reward_after_mean": -0.3986330684274435,
"reward_after_std": 0.4206914007663727,
"reward_before_mean": -0.1481835450977087,
"reward_before_std": 0.3465117560699582,
"reward_change_max": 0.0,
"reward_change_mean": -0.2504495307803154,
"reward_change_min": -0.3944549411535263,
"reward_change_std": 0.14348686579614878,
"reward_std": 0.42069140635430813,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.21068353950977325,
"step": 146
},
{
"clip_fraction": 0.0,
"completion_length": 3530.8958435058594,
"epoch": 0.168,
"grad_norm": 0.05082716792821884,
"kl": 4.493445158004761e-05,
"lambda_div_used": 0.6231074929237366,
"learning_rate": 2.854966364683872e-07,
"loss": 0.0064,
"reward": -0.17444764077663422,
"reward_after_mean": -0.17444764077663422,
"reward_after_std": 0.5966980569064617,
"reward_before_mean": 0.08711316343396902,
"reward_before_std": 0.5713155549019575,
"reward_change_max": 0.0,
"reward_change_mean": -0.26156080327928066,
"reward_change_min": -0.42396802455186844,
"reward_change_std": 0.16937424894422293,
"reward_std": 0.5966980736702681,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/cosine_scaled_reward": -0.12122016958892345,
"step": 147
},
{
"clip_fraction": 0.0,
"completion_length": 2726.0625228881836,
"epoch": 0.16914285714285715,
"grad_norm": 0.05811236426234245,
"kl": 2.9481947422027588e-05,
"lambda_div_used": 0.5576904863119125,
"learning_rate": 2.791832395815782e-07,
"loss": -0.0067,
"reward": -0.3092116080224514,
"reward_after_mean": -0.3092116080224514,
"reward_after_std": 0.3737166114151478,
"reward_before_mean": 0.04292176803573966,
"reward_before_std": 0.2566564744338393,
"reward_change_max": 0.0,
"reward_change_mean": -0.35213335789740086,
"reward_change_min": -0.5105132721364498,
"reward_change_std": 0.18572800233960152,
"reward_std": 0.3737166281789541,
"rewards/accuracy_reward": 0.14583333395421505,
"rewards/cosine_scaled_reward": -0.10291159152984619,
"step": 148
},
{
"clip_fraction": 0.0,
"completion_length": 3018.4375228881836,
"epoch": 0.1702857142857143,
"grad_norm": 0.07346916198730469,
"kl": 3.395974636077881e-05,
"lambda_div_used": 0.5993617027997971,
"learning_rate": 2.729523361034538e-07,
"loss": 0.0316,
"reward": -0.23622393235564232,
"reward_after_mean": -0.23622393235564232,
"reward_after_std": 0.513044873252511,
"reward_before_mean": 0.05197374615818262,
"reward_before_std": 0.4586914679966867,
"reward_change_max": 0.0,
"reward_change_mean": -0.2881976682692766,
"reward_change_min": -0.4784680940210819,
"reward_change_std": 0.1794019928202033,
"reward_std": 0.5130448862910271,
"rewards/accuracy_reward": 0.14583333767950535,
"rewards/cosine_scaled_reward": -0.0938595961779356,
"step": 149
},
{
"clip_fraction": 0.0,
"completion_length": 2817.000030517578,
"epoch": 0.17142857142857143,
"grad_norm": 0.07110631465911865,
"kl": 4.3623149394989014e-05,
"lambda_div_used": 0.5749505385756493,
"learning_rate": 2.6680582402757324e-07,
"loss": 0.0143,
"reward": -0.4273622464388609,
"reward_after_mean": -0.4273622464388609,
"reward_after_std": 0.41951365023851395,
"reward_before_mean": -0.1882035918533802,
"reward_before_std": 0.34118228033185005,
"reward_change_max": 0.0,
"reward_change_mean": -0.23915864899754524,
"reward_change_min": -0.3481520377099514,
"reward_change_std": 0.1273820260539651,
"reward_std": 0.4195136558264494,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.22987026162445545,
"step": 150
},
{
"clip_fraction": 0.0,
"completion_length": 2473.5208892822266,
"epoch": 0.17257142857142857,
"grad_norm": 0.06608390063047409,
"kl": 3.757700324058533e-05,
"lambda_div_used": 0.6444682851433754,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.0674,
"reward": 0.013197226449847221,
"reward_after_mean": 0.013197226449847221,
"reward_after_std": 0.6638381816446781,
"reward_before_mean": 0.3273471943102777,
"reward_before_std": 0.6709178425371647,
"reward_change_max": 0.0,
"reward_change_mean": -0.31414996832609177,
"reward_change_min": -0.5566080771386623,
"reward_change_std": 0.22394540812820196,
"reward_std": 0.6638382077217102,
"rewards/accuracy_reward": 0.31250000558793545,
"rewards/cosine_scaled_reward": 0.014847185462713242,
"step": 151
},
{
"clip_fraction": 0.0,
"completion_length": 2980.312530517578,
"epoch": 0.1737142857142857,
"grad_norm": 0.09729959070682526,
"kl": 3.904849290847778e-05,
"lambda_div_used": 0.5731147676706314,
"learning_rate": 2.547734369542718e-07,
"loss": 0.036,
"reward": -0.42251406982541084,
"reward_after_mean": -0.42251406982541084,
"reward_after_std": 0.4107065536081791,
"reward_before_mean": -0.17358114942908287,
"reward_before_std": 0.3313267915509641,
"reward_change_max": 0.0,
"reward_change_mean": -0.24893292412161827,
"reward_change_min": -0.37377795577049255,
"reward_change_std": 0.13549237046390772,
"reward_std": 0.4107065834105015,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.21524782478809357,
"step": 152
},
{
"clip_fraction": 0.0,
"completion_length": 2980.4167098999023,
"epoch": 0.17485714285714285,
"grad_norm": 0.07872413098812103,
"kl": 5.060434341430664e-05,
"lambda_div_used": 0.5996033921837807,
"learning_rate": 2.488912271385139e-07,
"loss": 0.0468,
"reward": -0.25326492823660374,
"reward_after_mean": -0.25326492823660374,
"reward_after_std": 0.5015600807964802,
"reward_before_mean": 0.03005093801766634,
"reward_before_std": 0.45544715132564306,
"reward_change_max": 0.0,
"reward_change_mean": -0.2833158541470766,
"reward_change_min": -0.490575835108757,
"reward_change_std": 0.17807927820831537,
"reward_std": 0.5015600919723511,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/cosine_scaled_reward": -0.09494908014312387,
"step": 153
},
{
"clip_fraction": 0.0,
"completion_length": 3400.6250610351562,
"epoch": 0.176,
"grad_norm": 0.044398024678230286,
"kl": 3.533065319061279e-05,
"lambda_div_used": 0.6407897770404816,
"learning_rate": 2.4310073797187573e-07,
"loss": 0.001,
"reward": -0.08846104983240366,
"reward_after_mean": -0.08846104983240366,
"reward_after_std": 0.6875216029584408,
"reward_before_mean": 0.1883750823326409,
"reward_before_std": 0.6521897967904806,
"reward_change_max": 0.0,
"reward_change_mean": -0.27683611772954464,
"reward_change_min": -0.45937369018793106,
"reward_change_std": 0.17437504325062037,
"reward_std": 0.6875216346234083,
"rewards/accuracy_reward": 0.20833334140479565,
"rewards/cosine_scaled_reward": -0.019958254415541887,
"step": 154
},
{
"clip_fraction": 0.0,
"completion_length": 2453.5416717529297,
"epoch": 0.17714285714285713,
"grad_norm": 0.08257688581943512,
"kl": 4.6115368604660034e-05,
"lambda_div_used": 0.5891367048025131,
"learning_rate": 2.374037332934512e-07,
"loss": 0.0267,
"reward": -0.14017992746084929,
"reward_after_mean": -0.14017992746084929,
"reward_after_std": 0.5328723080456257,
"reward_before_mean": 0.24201755598187447,
"reward_before_std": 0.40873255487531424,
"reward_change_max": 0.0,
"reward_change_mean": -0.3821975253522396,
"reward_change_min": -0.5966118611395359,
"reward_change_std": 0.22055233176797628,
"reward_std": 0.532872324809432,
"rewards/accuracy_reward": 0.22916666977107525,
"rewards/cosine_scaled_reward": 0.012850900180637836,
"step": 155
},
{
"clip_fraction": 0.0,
"completion_length": 3058.145835876465,
"epoch": 0.1782857142857143,
"grad_norm": 0.07657067477703094,
"kl": 3.8407742977142334e-05,
"lambda_div_used": 0.5792115926742554,
"learning_rate": 2.3180194846605364e-07,
"loss": 0.0192,
"reward": -0.35964071936905384,
"reward_after_mean": -0.35964071936905384,
"reward_after_std": 0.41963592916727066,
"reward_before_mean": -0.10375695489346981,
"reward_before_std": 0.36636115331202745,
"reward_change_max": 0.0,
"reward_change_mean": -0.2558837812393904,
"reward_change_min": -0.40037716925144196,
"reward_change_std": 0.15252912789583206,
"reward_std": 0.41963593289256096,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/cosine_scaled_reward": -0.18709028977900743,
"step": 156
},
{
"clip_fraction": 0.0,
"completion_length": 3067.0833435058594,
"epoch": 0.17942857142857144,
"grad_norm": 0.0602630190551281,
"kl": 4.166364669799805e-05,
"lambda_div_used": 0.578017845749855,
"learning_rate": 2.2629708984760706e-07,
"loss": -0.0024,
"reward": -0.20843622065149248,
"reward_after_mean": -0.20843622065149248,
"reward_after_std": 0.46523306891322136,
"reward_before_mean": 0.15985593758523464,
"reward_before_std": 0.35562971234321594,
"reward_change_max": 0.0,
"reward_change_mean": -0.3682921752333641,
"reward_change_min": -0.5581395737826824,
"reward_change_std": 0.20453235507011414,
"reward_std": 0.46523308381438255,
"rewards/accuracy_reward": 0.18750000186264515,
"rewards/cosine_scaled_reward": -0.027644065208733082,
"step": 157
},
{
"clip_fraction": 0.0,
"completion_length": 2439.8125381469727,
"epoch": 0.18057142857142858,
"grad_norm": 0.07962260395288467,
"kl": 3.724917769432068e-05,
"lambda_div_used": 0.5700756460428238,
"learning_rate": 2.2089083427137329e-07,
"loss": 0.0427,
"reward": -0.09846613928675652,
"reward_after_mean": -0.09846613928675652,
"reward_after_std": 0.50458899512887,
"reward_before_mean": 0.36336963158100843,
"reward_before_std": 0.31559942476451397,
"reward_change_max": 0.0,
"reward_change_mean": -0.46183576062321663,
"reward_change_min": -0.6303520426154137,
"reward_change_std": 0.23663399182260036,
"reward_std": 0.5045890025794506,
"rewards/accuracy_reward": 0.2916666679084301,
"rewards/cosine_scaled_reward": 0.07170296460390091,
"step": 158
},
{
"clip_fraction": 0.0,
"completion_length": 3283.7916717529297,
"epoch": 0.18171428571428572,
"grad_norm": 0.05873558670282364,
"kl": 4.16487455368042e-05,
"lambda_div_used": 0.565159484744072,
"learning_rate": 2.1558482853517253e-07,
"loss": -0.0183,
"reward": -0.43611275404691696,
"reward_after_mean": -0.43611275404691696,
"reward_after_std": 0.34730882942676544,
"reward_before_mean": -0.18830033391714096,
"reward_before_std": 0.2990688029676676,
"reward_change_max": 0.0,
"reward_change_mean": -0.247812420129776,
"reward_change_min": -0.4116964153945446,
"reward_change_std": 0.15130725782364607,
"reward_std": 0.34730884805321693,
"rewards/accuracy_reward": 0.0625,
"rewards/cosine_scaled_reward": -0.2508003171533346,
"step": 159
},
{
"clip_fraction": 0.0,
"completion_length": 3132.4167098999023,
"epoch": 0.18285714285714286,
"grad_norm": 0.06507622450590134,
"kl": 5.078315734863281e-05,
"lambda_div_used": 0.5871393531560898,
"learning_rate": 2.1038068889975259e-07,
"loss": 0.0371,
"reward": -0.25536114536225796,
"reward_after_mean": -0.25536114536225796,
"reward_after_std": 0.4553165938705206,
"reward_before_mean": 0.0424603084102273,
"reward_before_std": 0.3940334524959326,
"reward_change_max": 0.0,
"reward_change_mean": -0.2978214379400015,
"reward_change_min": -0.43723243474960327,
"reward_change_std": 0.16810003202408552,
"reward_std": 0.45531659945845604,
"rewards/accuracy_reward": 0.1458333395421505,
"rewards/cosine_scaled_reward": -0.10337302926927805,
"step": 160
},
{
"clip_fraction": 0.0,
"completion_length": 1814.145866394043,
"epoch": 0.184,
"grad_norm": 0.09199656546115875,
"kl": 3.68654727935791e-05,
"lambda_div_used": 0.6221684664487839,
"learning_rate": 2.0528000059645995e-07,
"loss": -0.1313,
"reward": -0.14752347487956285,
"reward_after_mean": -0.14752347487956285,
"reward_after_std": 0.6091006584465504,
"reward_before_mean": 0.1306287944316864,
"reward_before_std": 0.5693164113909006,
"reward_change_max": 0.0,
"reward_change_mean": -0.2781522646546364,
"reward_change_min": -0.4651326909661293,
"reward_change_std": 0.17978444695472717,
"reward_std": 0.6091006807982922,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.03603787627071142,
"step": 161
},
{
"clip_fraction": 0.0,
"completion_length": 3445.2291870117188,
"epoch": 0.18514285714285714,
"grad_norm": 0.054613932967185974,
"kl": 4.4696033000946045e-05,
"lambda_div_used": 0.6093570217490196,
"learning_rate": 2.0028431734436308e-07,
"loss": 0.0111,
"reward": -0.21708323806524277,
"reward_after_mean": -0.21708323806524277,
"reward_after_std": 0.5340255293995142,
"reward_before_mean": 0.05781930312514305,
"reward_before_std": 0.5134020620025694,
"reward_change_max": 0.0,
"reward_change_mean": -0.27490255795419216,
"reward_change_min": -0.5003471188247204,
"reward_change_std": 0.1919182576239109,
"reward_std": 0.5340255443006754,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.10884736478328705,
"step": 162
},
{
"clip_fraction": 0.0,
"completion_length": 2758.5833435058594,
"epoch": 0.18628571428571428,
"grad_norm": 0.06869502365589142,
"kl": 3.579258918762207e-05,
"lambda_div_used": 0.5997234806418419,
"learning_rate": 1.9539516087697517e-07,
"loss": 0.025,
"reward": -0.0028839372098445892,
"reward_after_mean": -0.0028839372098445892,
"reward_after_std": 0.5558424014598131,
"reward_before_mean": 0.42010904336348176,
"reward_before_std": 0.46184782730415463,
"reward_change_max": 0.0,
"reward_change_mean": -0.42299298755824566,
"reward_change_min": -0.614623662084341,
"reward_change_std": 0.2444069180637598,
"reward_std": 0.5558424014598131,
"rewards/accuracy_reward": 0.33333334140479565,
"rewards/cosine_scaled_reward": 0.08677570521831512,
"step": 163
},
{
"clip_fraction": 0.0,
"completion_length": 2280.4583435058594,
"epoch": 0.18742857142857142,
"grad_norm": 0.08430740982294083,
"kl": 3.807246685028076e-05,
"lambda_div_used": 0.583802655339241,
"learning_rate": 1.9061402047871833e-07,
"loss": -0.0005,
"reward": -0.14323966577649117,
"reward_after_mean": -0.14323966577649117,
"reward_after_std": 0.48269340209662914,
"reward_before_mean": 0.24511565826833248,
"reward_before_std": 0.38433590345084667,
"reward_change_max": 0.0,
"reward_change_mean": -0.38835531286895275,
"reward_change_min": -0.6137315705418587,
"reward_change_std": 0.2298044739291072,
"reward_std": 0.48269341699779034,
"rewards/accuracy_reward": 0.2500000037252903,
"rewards/cosine_scaled_reward": -0.0048843612894415855,
"step": 164
},
{
"clip_fraction": 0.0,
"completion_length": 3350.2083435058594,
"epoch": 0.18857142857142858,
"grad_norm": 0.06055794283747673,
"kl": 4.6700239181518555e-05,
"lambda_div_used": 0.5597026646137238,
"learning_rate": 1.8594235253127372e-07,
"loss": -0.0239,
"reward": -0.383526224642992,
"reward_after_mean": -0.383526224642992,
"reward_after_std": 0.3388876337558031,
"reward_before_mean": -0.09019916784018278,
"reward_before_std": 0.27148230001330376,
"reward_change_max": 0.0,
"reward_change_mean": -0.2933270577341318,
"reward_change_min": -0.44717802107334137,
"reward_change_std": 0.16905023716390133,
"reward_std": 0.3388876374810934,
"rewards/accuracy_reward": 0.0625,
"rewards/cosine_scaled_reward": -0.15269917901605368,
"step": 165
},
{
"clip_fraction": 0.0,
"completion_length": 2993.7709045410156,
"epoch": 0.18971428571428572,
"grad_norm": 0.07070616632699966,
"kl": 3.575533628463745e-05,
"lambda_div_used": 0.6159055009484291,
"learning_rate": 1.8138158006995363e-07,
"loss": -0.1171,
"reward": -0.20591574627906084,
"reward_after_mean": -0.20591574627906084,
"reward_after_std": 0.568760309368372,
"reward_before_mean": 0.07225125166587532,
"reward_before_std": 0.5325462874025106,
"reward_change_max": 0.0,
"reward_change_mean": -0.2781669981777668,
"reward_change_min": -0.4841184914112091,
"reward_change_std": 0.17965693771839142,
"reward_std": 0.5687603335827589,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.0735820853151381,
"step": 166
},
{
"clip_fraction": 0.0,
"completion_length": 2243.4583740234375,
"epoch": 0.19085714285714286,
"grad_norm": 0.08954203128814697,
"kl": 3.8199592381715775e-05,
"lambda_div_used": 0.5807118713855743,
"learning_rate": 1.7693309235023127e-07,
"loss": -0.0227,
"reward": -0.12234261445701122,
"reward_after_mean": -0.12234261445701122,
"reward_after_std": 0.46423870138823986,
"reward_before_mean": 0.28887630719691515,
"reward_before_std": 0.3718760753981769,
"reward_change_max": 0.0,
"reward_change_mean": -0.41121890768408775,
"reward_change_min": -0.6246605962514877,
"reward_change_std": 0.24211041443049908,
"reward_std": 0.4642387069761753,
"rewards/accuracy_reward": 0.2291666679084301,
"rewards/cosine_scaled_reward": 0.0597096448764205,
"step": 167
},
{
"clip_fraction": 0.0,
"completion_length": 3159.604217529297,
"epoch": 0.192,
"grad_norm": 0.05736541375517845,
"kl": 4.6193599700927734e-05,
"lambda_div_used": 0.6025644987821579,
"learning_rate": 1.7259824442455923e-07,
"loss": 0.0616,
"reward": -0.1081604603677988,
"reward_after_mean": -0.1081604603677988,
"reward_after_std": 0.5043755322694778,
"reward_before_mean": 0.23938040900975466,
"reward_before_std": 0.4762058244086802,
"reward_change_max": 0.0,
"reward_change_mean": -0.347540894523263,
"reward_change_min": -0.5797867476940155,
"reward_change_std": 0.22277529910206795,
"reward_std": 0.504375534132123,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/cosine_scaled_reward": 0.031047106254845858,
"step": 168
},
{
"clip_fraction": 0.0,
"completion_length": 2691.166717529297,
"epoch": 0.19314285714285714,
"grad_norm": 0.05840221792459488,
"kl": 3.322819247841835e-05,
"lambda_div_used": 0.6123679727315903,
"learning_rate": 1.6837835672960831e-07,
"loss": -0.0221,
"reward": 0.15223310515284538,
"reward_after_mean": 0.15223310515284538,
"reward_after_std": 0.589836286380887,
"reward_before_mean": 0.622445510700345,
"reward_before_std": 0.5133083704859018,
"reward_change_max": 0.0,
"reward_change_mean": -0.4702123887836933,
"reward_change_min": -0.686709500849247,
"reward_change_std": 0.27231256756931543,
"reward_std": 0.5898362882435322,
"rewards/accuracy_reward": 0.45833334140479565,
"rewards/cosine_scaled_reward": 0.1641121432185173,
"step": 169
},
{
"clip_fraction": 0.0,
"completion_length": 2737.8542137145996,
"epoch": 0.19428571428571428,
"grad_norm": 0.08408840745687485,
"kl": 2.9318034648895264e-05,
"lambda_div_used": 0.5516072362661362,
"learning_rate": 1.6427471468404952e-07,
"loss": -0.0223,
"reward": -0.22534361481666565,
"reward_after_mean": -0.22534361481666565,
"reward_after_std": 0.34615582413971424,
"reward_before_mean": 0.17829649709165096,
"reward_before_std": 0.231270051561296,
"reward_change_max": 0.0,
"reward_change_mean": -0.4036401268094778,
"reward_change_min": -0.5518503561615944,
"reward_change_std": 0.21789801493287086,
"reward_std": 0.34615583159029484,
"rewards/accuracy_reward": 0.2291666716337204,
"rewards/cosine_scaled_reward": -0.05087016709148884,
"step": 170
},
{
"clip_fraction": 0.0,
"completion_length": 2810.208366394043,
"epoch": 0.19542857142857142,
"grad_norm": 0.07381287217140198,
"kl": 3.468245267868042e-05,
"lambda_div_used": 0.6072571501135826,
"learning_rate": 1.6028856829700258e-07,
"loss": 0.0763,
"reward": -0.0880913995206356,
"reward_after_mean": -0.0880913995206356,
"reward_after_std": 0.4982736185193062,
"reward_before_mean": 0.2392272837460041,
"reward_before_std": 0.4966685324907303,
"reward_change_max": 0.0,
"reward_change_mean": -0.3273186720907688,
"reward_change_min": -0.5419761650264263,
"reward_change_std": 0.21660141553729773,
"reward_std": 0.4982736259698868,
"rewards/accuracy_reward": 0.2291666716337204,
"rewards/cosine_scaled_reward": 0.010060586035251617,
"step": 171
},
{
"clip_fraction": 0.0,
"completion_length": 2782.833381652832,
"epoch": 0.19657142857142856,
"grad_norm": 0.08758535981178284,
"kl": 4.4949352741241455e-05,
"lambda_div_used": 0.5883520841598511,
"learning_rate": 1.5642113178727193e-07,
"loss": 0.0173,
"reward": -0.0421207002364099,
"reward_after_mean": -0.0421207002364099,
"reward_after_std": 0.5838102325797081,
"reward_before_mean": 0.41242535319179296,
"reward_before_std": 0.4052962730638683,
"reward_change_max": 0.0,
"reward_change_mean": -0.45454608276486397,
"reward_change_min": -0.6413811855018139,
"reward_change_std": 0.2437124690040946,
"reward_std": 0.5838102586567402,
"rewards/accuracy_reward": 0.3333333358168602,
"rewards/cosine_scaled_reward": 0.07909201784059405,
"step": 172
},
{
"clip_fraction": 0.0,
"completion_length": 2038.3125114440918,
"epoch": 0.1977142857142857,
"grad_norm": 0.10115820169448853,
"kl": 2.4806708097457886e-05,
"lambda_div_used": 0.5800783261656761,
"learning_rate": 1.5267358321348285e-07,
"loss": -0.0379,
"reward": -0.3465901352465153,
"reward_after_mean": -0.3465901352465153,
"reward_after_std": 0.41013904474675655,
"reward_before_mean": -0.07510924944654107,
"reward_before_std": 0.3657920900732279,
"reward_change_max": 0.0,
"reward_change_mean": -0.27148088812828064,
"reward_change_min": -0.41459682397544384,
"reward_change_std": 0.16112061124294996,
"reward_std": 0.4101390540599823,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.17927592061460018,
"step": 173
},
{
"clip_fraction": 0.0,
"completion_length": 2429.7083740234375,
"epoch": 0.19885714285714284,
"grad_norm": 0.08579805493354797,
"kl": 4.820525646209717e-05,
"lambda_div_used": 0.5971189364790916,
"learning_rate": 1.4904706411523448e-07,
"loss": -0.0681,
"reward": -0.16348301246762276,
"reward_after_mean": -0.16348301246762276,
"reward_after_std": 0.5588766317814589,
"reward_before_mean": 0.18685074103996158,
"reward_before_std": 0.44327013567090034,
"reward_change_max": 0.0,
"reward_change_mean": -0.3503337763249874,
"reward_change_min": -0.5374101847410202,
"reward_change_std": 0.19548403285443783,
"reward_std": 0.5588766410946846,
"rewards/accuracy_reward": 0.2083333358168602,
"rewards/cosine_scaled_reward": -0.02148258499801159,
"step": 174
},
{
"clip_fraction": 0.0,
"completion_length": 2815.1458587646484,
"epoch": 0.2,
"grad_norm": 0.06386198848485947,
"kl": 3.6529265344142914e-05,
"lambda_div_used": 0.5798816308379173,
"learning_rate": 1.4554267916537495e-07,
"loss": 0.0362,
"reward": -0.08284463733434677,
"reward_after_mean": -0.08284463733434677,
"reward_after_std": 0.46138195879757404,
"reward_before_mean": 0.33735317550599575,
"reward_before_std": 0.3691369164735079,
"reward_change_max": 0.0,
"reward_change_mean": -0.4201977960765362,
"reward_change_min": -0.615710511803627,
"reward_change_std": 0.2454231232404709,
"reward_std": 0.46138197369873524,
"rewards/accuracy_reward": 0.2708333358168602,
"rewards/cosine_scaled_reward": 0.06651980988681316,
"step": 175
},
{
"clip_fraction": 0.0,
"completion_length": 2699.916748046875,
"epoch": 0.20114285714285715,
"grad_norm": 0.1062018871307373,
"kl": 3.183633089065552e-05,
"lambda_div_used": 0.6652076914906502,
"learning_rate": 1.4216149583350755e-07,
"loss": -0.0096,
"reward": 0.01935443957336247,
"reward_after_mean": 0.01935443957336247,
"reward_after_std": 0.7740565538406372,
"reward_before_mean": 0.30571601539850235,
"reward_before_std": 0.7710783276706934,
"reward_change_max": 0.0,
"reward_change_mean": -0.2863615807145834,
"reward_change_min": -0.5516976863145828,
"reward_change_std": 0.20931501779705286,
"reward_std": 0.7740565687417984,
"rewards/accuracy_reward": 0.2708333395421505,
"rewards/cosine_scaled_reward": 0.03488267329521477,
"step": 176
},
{
"clip_fraction": 0.0,
"completion_length": 2837.062530517578,
"epoch": 0.2022857142857143,
"grad_norm": 0.10419050604104996,
"kl": 5.3942203521728516e-05,
"lambda_div_used": 0.5891516581177711,
"learning_rate": 1.3890454406082956e-07,
"loss": -0.094,
"reward": -0.275299109518528,
"reward_after_mean": -0.275299109518528,
"reward_after_std": 0.4370743464678526,
"reward_before_mean": 0.019260672852396965,
"reward_before_std": 0.4135833829641342,
"reward_change_max": 0.0,
"reward_change_mean": -0.294559795409441,
"reward_change_min": -0.49321193993091583,
"reward_change_std": 0.19294326566159725,
"reward_std": 0.43707435205578804,
"rewards/accuracy_reward": 0.125,
"rewards/cosine_scaled_reward": -0.10573932714760303,
"step": 177
},
{
"clip_fraction": 0.0,
"completion_length": 2239.645866394043,
"epoch": 0.20342857142857143,
"grad_norm": 0.08482369780540466,
"kl": 3.471970558166504e-05,
"lambda_div_used": 0.5743999034166336,
"learning_rate": 1.3577281594640182e-07,
"loss": -0.012,
"reward": -0.2907655192539096,
"reward_after_mean": -0.2907655192539096,
"reward_after_std": 0.3906351812183857,
"reward_before_mean": 0.020154590718448162,
"reward_before_std": 0.33835357427597046,
"reward_change_max": 0.0,
"reward_change_mean": -0.31092010997235775,
"reward_change_min": -0.4689374789595604,
"reward_change_std": 0.18138453178107738,
"reward_std": 0.3906352035701275,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.08401207532733679,
"step": 178
},
{
"clip_fraction": 0.0,
"completion_length": 2847.187530517578,
"epoch": 0.20457142857142857,
"grad_norm": 0.06261342018842697,
"kl": 3.7359073758125305e-05,
"lambda_div_used": 0.5993921086192131,
"learning_rate": 1.3276726544494571e-07,
"loss": 0.0003,
"reward": -0.3251352347433567,
"reward_after_mean": -0.3251352347433567,
"reward_after_std": 0.507962841540575,
"reward_before_mean": -0.08432525303214788,
"reward_before_std": 0.45724861416965723,
"reward_change_max": 0.0,
"reward_change_mean": -0.2408099938184023,
"reward_change_min": -0.39610618352890015,
"reward_change_std": 0.14655023906379938,
"reward_std": 0.5079628489911556,
"rewards/accuracy_reward": 0.1041666679084301,
"rewards/cosine_scaled_reward": -0.18849190324544907,
"step": 179
},
{
"clip_fraction": 0.0,
"completion_length": 2329.562515258789,
"epoch": 0.2057142857142857,
"grad_norm": 0.10947566479444504,
"kl": 3.916025161743164e-05,
"lambda_div_used": 0.6209289953112602,
"learning_rate": 1.2988880807625927e-07,
"loss": -0.0732,
"reward": 0.0392537759616971,
"reward_after_mean": 0.0392537759616971,
"reward_after_std": 0.6374544408172369,
"reward_before_mean": 0.4333432329003699,
"reward_before_std": 0.5579905398190022,
"reward_change_max": 0.0,
"reward_change_mean": -0.3940894529223442,
"reward_change_min": -0.5780177563428879,
"reward_change_std": 0.229821746237576,
"reward_std": 0.6374544575810432,
"rewards/accuracy_reward": 0.3541666753590107,
"rewards/cosine_scaled_reward": 0.07917657308280468,
"step": 180
},
{
"clip_fraction": 0.0,
"completion_length": 3174.9166870117188,
"epoch": 0.20685714285714285,
"grad_norm": 0.0638444796204567,
"kl": 4.488229751586914e-05,
"lambda_div_used": 0.5625592544674873,
"learning_rate": 1.2713832064634125e-07,
"loss": -0.0055,
"reward": -0.3669877387583256,
"reward_after_mean": -0.3669877387583256,
"reward_after_std": 0.3254594895988703,
"reward_before_mean": -0.07308395206928253,
"reward_before_std": 0.2811311110854149,
"reward_change_max": 0.0,
"reward_change_mean": -0.29390377551317215,
"reward_change_min": -0.47421957924962044,
"reward_change_std": 0.1733461432158947,
"reward_std": 0.3254595026373863,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.15641729161143303,
"step": 181
},
{
"clip_fraction": 0.0,
"completion_length": 2200.5000228881836,
"epoch": 0.208,
"grad_norm": 0.07948590070009232,
"kl": 3.180652856826782e-05,
"lambda_div_used": 0.5802313759922981,
"learning_rate": 1.2451664098030743e-07,
"loss": -0.0177,
"reward": -0.33155214530415833,
"reward_after_mean": -0.33155214530415833,
"reward_after_std": 0.4093964695930481,
"reward_before_mean": -0.04757622070610523,
"reward_before_std": 0.362399042584002,
"reward_change_max": 0.0,
"reward_change_mean": -0.2839759271591902,
"reward_change_min": -0.4600135274231434,
"reward_change_std": 0.1762974951416254,
"reward_std": 0.40939648635685444,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.19340955652296543,
"step": 182
},
{
"clip_fraction": 0.0,
"completion_length": 1919.0000381469727,
"epoch": 0.20914285714285713,
"grad_norm": 0.09855654090642929,
"kl": 3.127008676528931e-05,
"lambda_div_used": 0.5876503959298134,
"learning_rate": 1.220245676671809e-07,
"loss": 0.0354,
"reward": -0.29830155335366726,
"reward_after_mean": -0.29830155335366726,
"reward_after_std": 0.461369052529335,
"reward_before_mean": -0.01361087104305625,
"reward_before_std": 0.4003021940588951,
"reward_change_max": 0.0,
"reward_change_mean": -0.28469069860875607,
"reward_change_min": -0.4400797598063946,
"reward_change_std": 0.1702390005812049,
"reward_std": 0.4613690562546253,
"rewards/accuracy_reward": 0.1041666679084301,
"rewards/cosine_scaled_reward": -0.11777753569185734,
"step": 183
},
{
"clip_fraction": 0.0,
"completion_length": 2771.0416870117188,
"epoch": 0.2102857142857143,
"grad_norm": 0.09264298528432846,
"kl": 4.6037137508392334e-05,
"lambda_div_used": 0.5391388088464737,
"learning_rate": 1.1966285981663407e-07,
"loss": -0.001,
"reward": -0.3703397810459137,
"reward_after_mean": -0.3703397810459137,
"reward_after_std": 0.2937673106789589,
"reward_before_mean": -0.011352727189660072,
"reward_before_std": 0.17188004031777382,
"reward_change_max": 0.0,
"reward_change_mean": -0.35898703522980213,
"reward_change_min": -0.5206632278859615,
"reward_change_std": 0.18913730140775442,
"reward_std": 0.29376731254160404,
"rewards/accuracy_reward": 0.125,
"rewards/cosine_scaled_reward": -0.13635273277759552,
"step": 184
},
{
"clip_fraction": 0.0,
"completion_length": 2770.9792098999023,
"epoch": 0.21142857142857144,
"grad_norm": 0.09669750928878784,
"kl": 4.3720006942749023e-05,
"lambda_div_used": 0.5636111497879028,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.0651,
"reward": -0.3112166179344058,
"reward_after_mean": -0.3112166179344058,
"reward_after_std": 0.3695005215704441,
"reward_before_mean": -0.0006957156583666801,
"reward_before_std": 0.2874382403679192,
"reward_change_max": 0.0,
"reward_change_mean": -0.3105209097266197,
"reward_change_min": -0.43808290362358093,
"reward_change_std": 0.17091987561434507,
"reward_std": 0.369500532746315,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.10486237239092588,
"step": 185
},
{
"clip_fraction": 0.0,
"completion_length": 2965.375015258789,
"epoch": 0.21257142857142858,
"grad_norm": 0.05594424530863762,
"kl": 3.8955360651016235e-05,
"lambda_div_used": 0.608224630355835,
"learning_rate": 1.1533337816991931e-07,
"loss": -0.0681,
"reward": -0.20297073479741812,
"reward_after_mean": -0.20297073479741812,
"reward_after_std": 0.5205403883010149,
"reward_before_mean": 0.07951527182012796,
"reward_before_std": 0.502502404153347,
"reward_change_max": 0.0,
"reward_change_mean": -0.282486030831933,
"reward_change_min": -0.47177543863654137,
"reward_change_std": 0.18455488048493862,
"reward_std": 0.5205404032021761,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/cosine_scaled_reward": -0.08715140074491501,
"step": 186
},
{
"clip_fraction": 0.0,
"completion_length": 2639.291702270508,
"epoch": 0.21371428571428572,
"grad_norm": 0.08843009173870087,
"kl": 4.921853542327881e-05,
"lambda_div_used": 0.5908937901258469,
"learning_rate": 1.1336692317580158e-07,
"loss": -0.0048,
"reward": -0.25490788742899895,
"reward_after_mean": -0.25490788742899895,
"reward_after_std": 0.4807432759553194,
"reward_before_mean": 0.04253344633616507,
"reward_before_std": 0.4119319263845682,
"reward_change_max": 0.0,
"reward_change_mean": -0.29744134843349457,
"reward_change_min": -0.4517475329339504,
"reward_change_std": 0.16703799460083246,
"reward_std": 0.48074328526854515,
"rewards/accuracy_reward": 0.1458333395421505,
"rewards/cosine_scaled_reward": -0.10329987667500973,
"step": 187
},
{
"clip_fraction": 0.0,
"completion_length": 3541.9791870117188,
"epoch": 0.21485714285714286,
"grad_norm": 0.05120408907532692,
"kl": 4.652142524719238e-05,
"lambda_div_used": 0.5589732080698013,
"learning_rate": 1.1153347084664419e-07,
"loss": -0.0029,
"reward": -0.4253443591296673,
"reward_after_mean": -0.4253443591296673,
"reward_after_std": 0.3237063102424145,
"reward_before_mean": -0.15165027976036072,
"reward_before_std": 0.26862882915884256,
"reward_change_max": 0.0,
"reward_change_mean": -0.27369407564401627,
"reward_change_min": -0.437398936599493,
"reward_change_std": 0.16332428343594074,
"reward_std": 0.3237063158303499,
"rewards/accuracy_reward": 0.0625,
"rewards/cosine_scaled_reward": -0.2141502844169736,
"step": 188
},
{
"clip_fraction": 0.0,
"completion_length": 2377.2708892822266,
"epoch": 0.216,
"grad_norm": 0.07240846008062363,
"kl": 5.1606446504592896e-05,
"lambda_div_used": 0.5700332000851631,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.0342,
"reward": -0.3627132559195161,
"reward_after_mean": -0.3627132559195161,
"reward_after_std": 0.4038649797439575,
"reward_before_mean": -0.08412502333521843,
"reward_before_std": 0.3190039964392781,
"reward_change_max": 0.0,
"reward_change_mean": -0.2785882242023945,
"reward_change_min": -0.4183661602437496,
"reward_change_std": 0.15456592850387096,
"reward_std": 0.4038649834692478,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.1466250205412507,
"step": 189
},
{
"clip_fraction": 0.0,
"completion_length": 3024.041717529297,
"epoch": 0.21714285714285714,
"grad_norm": 0.0596420094370842,
"kl": 2.85319983959198e-05,
"lambda_div_used": 0.5832700356841087,
"learning_rate": 1.0826776744855121e-07,
"loss": -0.0126,
"reward": -0.17394864186644554,
"reward_after_mean": -0.17394864186644554,
"reward_after_std": 0.471806388348341,
"reward_before_mean": 0.198049274738878,
"reward_before_std": 0.38090503215789795,
"reward_change_max": 0.0,
"reward_change_mean": -0.37199792452156544,
"reward_change_min": -0.5356397405266762,
"reward_change_std": 0.2117150044068694,
"reward_std": 0.4718063995242119,
"rewards/accuracy_reward": 0.22916666977107525,
"rewards/cosine_scaled_reward": -0.031117402017116547,
"step": 190
},
{
"clip_fraction": 0.0,
"completion_length": 2440.3541870117188,
"epoch": 0.21828571428571428,
"grad_norm": 0.0756516084074974,
"kl": 3.930646926164627e-05,
"lambda_div_used": 0.5715140625834465,
"learning_rate": 1.068365111445064e-07,
"loss": 0.0003,
"reward": -0.284542384557426,
"reward_after_mean": -0.284542384557426,
"reward_after_std": 0.447479585185647,
"reward_before_mean": 0.05547440081136301,
"reward_before_std": 0.3240885529667139,
"reward_change_max": 0.0,
"reward_change_mean": -0.34001679718494415,
"reward_change_min": -0.4750482700765133,
"reward_change_std": 0.1778492433950305,
"reward_std": 0.4474795889109373,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.11119226738810539,
"step": 191
},
{
"clip_fraction": 0.0,
"completion_length": 3487.6041870117188,
"epoch": 0.21942857142857142,
"grad_norm": 0.048071179538965225,
"kl": 3.250688314437866e-05,
"lambda_div_used": 0.5586921945214272,
"learning_rate": 1.0554024673218806e-07,
"loss": 0.0206,
"reward": -0.3871428966522217,
"reward_after_mean": -0.3871428966522217,
"reward_after_std": 0.33264124393463135,
"reward_before_mean": -0.11114806681871414,
"reward_before_std": 0.2715215114876628,
"reward_change_max": 0.0,
"reward_change_mean": -0.27599484845995903,
"reward_change_min": -0.40478406473994255,
"reward_change_std": 0.15969175938516855,
"reward_std": 0.33264124765992165,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.19448141008615494,
"step": 192
},
{
"clip_fraction": 0.0,
"completion_length": 2929.937545776367,
"epoch": 0.22057142857142858,
"grad_norm": 0.06499005109071732,
"kl": 3.759562969207764e-05,
"lambda_div_used": 0.6119573265314102,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0198,
"reward": -0.24959510192275047,
"reward_after_mean": -0.24959510192275047,
"reward_after_std": 0.5340033005923033,
"reward_before_mean": 0.009953722357749939,
"reward_before_std": 0.520604582503438,
"reward_change_max": 0.0,
"reward_change_mean": -0.25954881869256496,
"reward_change_min": -0.5182105302810669,
"reward_change_std": 0.186925214715302,
"reward_std": 0.5340033229440451,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.1358796115964651,
"step": 193
},
{
"clip_fraction": 0.0,
"completion_length": 3132.875030517578,
"epoch": 0.22171428571428572,
"grad_norm": 0.06673065572977066,
"kl": 4.443526268005371e-05,
"lambda_div_used": 0.6207702159881592,
"learning_rate": 1.0335423176140511e-07,
"loss": 0.0719,
"reward": 0.2215312235057354,
"reward_after_mean": 0.2215312235057354,
"reward_after_std": 0.6115883849561214,
"reward_before_mean": 0.7030764240771532,
"reward_before_std": 0.5582148376852274,
"reward_change_max": 0.0,
"reward_change_mean": -0.48154521360993385,
"reward_change_min": -0.7418999075889587,
"reward_change_std": 0.29459451511502266,
"reward_std": 0.611588392406702,
"rewards/accuracy_reward": 0.4583333469927311,
"rewards/cosine_scaled_reward": 0.2447430812753737,
"step": 194
},
{
"clip_fraction": 0.0,
"completion_length": 2998.687530517578,
"epoch": 0.22285714285714286,
"grad_norm": 0.05692190304398537,
"kl": 4.340708255767822e-05,
"lambda_div_used": 0.5588866546750069,
"learning_rate": 1.0246514708427701e-07,
"loss": -0.0321,
"reward": -0.4250563494861126,
"reward_after_mean": -0.4250563494861126,
"reward_after_std": 0.3346366826444864,
"reward_before_mean": -0.16300050355494022,
"reward_before_std": 0.26834714552387595,
"reward_change_max": 0.0,
"reward_change_mean": -0.2620558477938175,
"reward_change_min": -0.4044179916381836,
"reward_change_std": 0.15184260439127684,
"reward_std": 0.3346366863697767,
"rewards/accuracy_reward": 0.0625,
"rewards/cosine_scaled_reward": -0.22550049051642418,
"step": 195
},
{
"clip_fraction": 0.0,
"completion_length": 3551.6041870117188,
"epoch": 0.224,
"grad_norm": 0.051314327865839005,
"kl": 4.8547983169555664e-05,
"lambda_div_used": 0.5673639252781868,
"learning_rate": 1.017123858587145e-07,
"loss": 0.0144,
"reward": -0.43661339208483696,
"reward_after_mean": -0.43661339208483696,
"reward_after_std": 0.3381017968058586,
"reward_before_mean": -0.18068170547485352,
"reward_before_std": 0.3060085875913501,
"reward_change_max": 0.0,
"reward_change_mean": -0.25593167915940285,
"reward_change_min": -0.436182364821434,
"reward_change_std": 0.16187659837305546,
"reward_std": 0.3381018117070198,
"rewards/accuracy_reward": 0.0625,
"rewards/cosine_scaled_reward": -0.24318172316998243,
"step": 196
},
{
"clip_fraction": 0.0,
"completion_length": 2789.125068664551,
"epoch": 0.22514285714285714,
"grad_norm": 0.08223231136798859,
"kl": 3.402773290872574e-05,
"lambda_div_used": 0.6149426028132439,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.0526,
"reward": 0.08269345387816429,
"reward_after_mean": 0.08269345387816429,
"reward_after_std": 0.5922380294650793,
"reward_before_mean": 0.4997589197009802,
"reward_before_std": 0.5276397680863738,
"reward_change_max": 0.0,
"reward_change_mean": -0.4170654658228159,
"reward_change_min": -0.6376415118575096,
"reward_change_std": 0.24999654106795788,
"reward_std": 0.592238049954176,
"rewards/accuracy_reward": 0.39583334140479565,
"rewards/cosine_scaled_reward": 0.10392560251057148,
"step": 197
},
{
"clip_fraction": 0.0,
"completion_length": 2770.2708435058594,
"epoch": 0.22628571428571428,
"grad_norm": 0.07601747661828995,
"kl": 3.542006015777588e-05,
"lambda_div_used": 0.5800714045763016,
"learning_rate": 1.0061670936044178e-07,
"loss": -0.0178,
"reward": -0.24220025539398193,
"reward_after_mean": -0.24220025539398193,
"reward_after_std": 0.4058863651007414,
"reward_before_mean": 0.06675046496093273,
"reward_before_std": 0.36683204025030136,
"reward_change_max": 0.0,
"reward_change_mean": -0.30895073898136616,
"reward_change_min": -0.4967211000621319,
"reward_change_std": 0.18911676667630672,
"reward_std": 0.40588637441396713,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/cosine_scaled_reward": -0.09991620294749737,
"step": 198
},
{
"clip_fraction": 0.0,
"completion_length": 3554.0416870117188,
"epoch": 0.22742857142857142,
"grad_norm": 0.04930044710636139,
"kl": 4.279613494873047e-05,
"lambda_div_used": 0.5560082867741585,
"learning_rate": 1.002741278414069e-07,
"loss": 0.0027,
"reward": -0.4229874052107334,
"reward_after_mean": -0.4229874052107334,
"reward_after_std": 0.31992789916694164,
"reward_before_mean": -0.14317837683483958,
"reward_before_std": 0.25433824164792895,
"reward_change_max": 0.0,
"reward_change_mean": -0.27980900928378105,
"reward_change_min": -0.4250169135630131,
"reward_change_std": 0.16277197189629078,
"reward_std": 0.31992790661752224,
"rewards/accuracy_reward": 0.0625,
"rewards/cosine_scaled_reward": -0.20567839220166206,
"step": 199
},
{
"clip_fraction": 0.0,
"completion_length": 2299.5209045410156,
"epoch": 0.22857142857142856,
"grad_norm": 0.0840829536318779,
"kl": 4.097074270248413e-05,
"lambda_div_used": 0.6226711273193359,
"learning_rate": 1.0006853717962393e-07,
"loss": 0.0203,
"reward": 0.062016794458031654,
"reward_after_mean": 0.062016794458031654,
"reward_after_std": 0.631916331127286,
"reward_before_mean": 0.4668920338153839,
"reward_before_std": 0.5674512181431055,
"reward_change_max": 0.0,
"reward_change_mean": -0.40487524308264256,
"reward_change_min": -0.6527138948440552,
"reward_change_std": 0.250981617718935,
"reward_std": 0.6319163534790277,
"rewards/accuracy_reward": 0.35416667349636555,
"rewards/cosine_scaled_reward": 0.11272535985335708,
"step": 200
},
{
"clip_fraction": 0.0,
"completion_length": 2760.7084197998047,
"epoch": 0.2297142857142857,
"grad_norm": 0.06477575749158859,
"kl": 1.7711427062749863e-05,
"lambda_div_used": 0.6429438292980194,
"learning_rate": 1e-07,
"loss": -0.0022,
"reward": 0.15019571036100388,
"reward_after_mean": 0.15019571036100388,
"reward_after_std": 0.7328120246529579,
"reward_before_mean": 0.5535202682949603,
"reward_before_std": 0.6601617820560932,
"reward_change_max": 0.0,
"reward_change_mean": -0.4033245462924242,
"reward_change_min": -0.6303750872612,
"reward_change_std": 0.24585676938295364,
"reward_std": 0.7328120358288288,
"rewards/accuracy_reward": 0.41666667349636555,
"rewards/cosine_scaled_reward": 0.13685358315706253,
"step": 201
},
{
"clip_fraction": 0.0,
"completion_length": 2511.937511444092,
"epoch": 0.23085714285714284,
"grad_norm": 0.08345558494329453,
"kl": 4.0203332901000977e-05,
"lambda_div_used": 0.5811712816357613,
"learning_rate": 7.72273839962904e-07,
"loss": 0.0587,
"reward": 0.03902309015393257,
"reward_after_mean": 0.03902309015393257,
"reward_after_std": 0.5094954669475555,
"reward_before_mean": 0.5408022310584784,
"reward_before_std": 0.37269798293709755,
"reward_change_max": 0.0,
"reward_change_mean": -0.5017791502177715,
"reward_change_min": -0.7236880213022232,
"reward_change_std": 0.28125342447310686,
"reward_std": 0.5094954781234264,
"rewards/accuracy_reward": 0.3958333395421505,
"rewards/cosine_scaled_reward": 0.14496888127177954,
"step": 202
},
{
"clip_fraction": 0.0,
"completion_length": 3065.3333435058594,
"epoch": 0.232,
"grad_norm": 0.05931547284126282,
"kl": 5.182623863220215e-05,
"lambda_div_used": 0.5537669658660889,
"learning_rate": 7.695368466124296e-07,
"loss": 0.0447,
"reward": -0.3224771413952112,
"reward_after_mean": -0.3224771413952112,
"reward_after_std": 0.36307925172150135,
"reward_before_mean": 0.029698201455175877,
"reward_before_std": 0.23980092909187078,
"reward_change_max": 0.0,
"reward_change_mean": -0.3521753493696451,
"reward_change_min": -0.49568963050842285,
"reward_change_std": 0.18385440576821566,
"reward_std": 0.36307926289737225,
"rewards/accuracy_reward": 0.14583333395421505,
"rewards/cosine_scaled_reward": -0.1161351166665554,
"step": 203
},
{
"clip_fraction": 0.0,
"completion_length": 2511.833366394043,
"epoch": 0.23314285714285715,
"grad_norm": 0.07267706841230392,
"kl": 5.4508447647094727e-05,
"lambda_div_used": 0.5887176766991615,
"learning_rate": 7.667891533457718e-07,
"loss": 0.0023,
"reward": -0.23160922899842262,
"reward_after_mean": -0.23160922899842262,
"reward_after_std": 0.430733734741807,
"reward_before_mean": 0.06992286071181297,
"reward_before_std": 0.4116028640419245,
"reward_change_max": 0.0,
"reward_change_mean": -0.301532082259655,
"reward_change_min": -0.49793890863657,
"reward_change_std": 0.19511268101632595,
"reward_std": 0.43073374405503273,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.07591048628091812,
"step": 204
},
{
"clip_fraction": 0.0,
"completion_length": 2967.1875915527344,
"epoch": 0.2342857142857143,
"grad_norm": 0.06776981800794601,
"kl": 4.439055919647217e-05,
"lambda_div_used": 0.653583250939846,
"learning_rate": 7.640308940816239e-07,
"loss": 0.1014,
"reward": -0.08090712130069733,
"reward_after_mean": -0.08090712130069733,
"reward_after_std": 0.7262772191315889,
"reward_before_mean": 0.17835846357047558,
"reward_before_std": 0.7181071005761623,
"reward_change_max": 0.0,
"reward_change_mean": -0.2592655848711729,
"reward_change_min": -0.4743462074548006,
"reward_change_std": 0.18475584778934717,
"reward_std": 0.7262772284448147,
"rewards/accuracy_reward": 0.2291666716337204,
"rewards/cosine_scaled_reward": -0.05080821271985769,
"step": 205
},
{
"clip_fraction": 0.0,
"completion_length": 3085.5208892822266,
"epoch": 0.23542857142857143,
"grad_norm": 0.06689873337745667,
"kl": 3.569573163986206e-05,
"lambda_div_used": 0.5862618833780289,
"learning_rate": 7.612622032536507e-07,
"loss": 0.0353,
"reward": -0.28365214727818966,
"reward_after_mean": -0.28365214727818966,
"reward_after_std": 0.44012872874736786,
"reward_before_mean": 0.004427256062626839,
"reward_before_std": 0.3988625044003129,
"reward_change_max": 0.0,
"reward_change_mean": -0.2880793921649456,
"reward_change_min": -0.49235254526138306,
"reward_change_std": 0.18395678792148829,
"reward_std": 0.44012873619794846,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/cosine_scaled_reward": -0.12057275045663118,
"step": 206
},
{
"clip_fraction": 0.0,
"completion_length": 3146.6458740234375,
"epoch": 0.23657142857142857,
"grad_norm": 0.06122450903058052,
"kl": 3.725104033946991e-05,
"lambda_div_used": 0.579835832118988,
"learning_rate": 7.584832158039378e-07,
"loss": 0.0182,
"reward": -0.28688727831467986,
"reward_after_mean": -0.28688727831467986,
"reward_after_std": 0.4150819983333349,
"reward_before_mean": 0.009919969365000725,
"reward_before_std": 0.3629662115126848,
"reward_change_max": 0.0,
"reward_change_mean": -0.29680725932121277,
"reward_change_min": -0.45919613167643547,
"reward_change_std": 0.17279880121350288,
"reward_std": 0.41508200392127037,
"rewards/accuracy_reward": 0.12500000558793545,
"rewards/cosine_scaled_reward": -0.11508002015762031,
"step": 207
},
{
"clip_fraction": 0.0,
"completion_length": 2780.291717529297,
"epoch": 0.2377142857142857,
"grad_norm": 0.06370716542005539,
"kl": 3.692507743835449e-05,
"lambda_div_used": 0.5807016789913177,
"learning_rate": 7.556940671764124e-07,
"loss": 0.0403,
"reward": -0.17094913870096207,
"reward_after_mean": -0.17094913870096207,
"reward_after_std": 0.46442117914557457,
"reward_before_mean": 0.20349296741187572,
"reward_before_std": 0.3671952560544014,
"reward_change_max": 0.0,
"reward_change_mean": -0.3744421415030956,
"reward_change_min": -0.5505415536463261,
"reward_change_std": 0.21210224367678165,
"reward_std": 0.46442119032144547,
"rewards/accuracy_reward": 0.20833333395421505,
"rewards/cosine_scaled_reward": -0.004840357229113579,
"step": 208
},
{
"clip_fraction": 0.0,
"completion_length": 2574.6042079925537,
"epoch": 0.23885714285714285,
"grad_norm": 0.08756011724472046,
"kl": 1.8547754734754562e-05,
"lambda_div_used": 0.6246868968009949,
"learning_rate": 7.528948933102438e-07,
"loss": 0.0837,
"reward": 0.028150439262390137,
"reward_after_mean": 0.028150439262390137,
"reward_after_std": 0.5636487938463688,
"reward_before_mean": 0.3838757500052452,
"reward_before_std": 0.576031070202589,
"reward_change_max": 0.0,
"reward_change_mean": -0.3557253200560808,
"reward_change_min": -0.5899170190095901,
"reward_change_std": 0.23946599010378122,
"reward_std": 0.5636488180607557,
"rewards/accuracy_reward": 0.3333333469927311,
"rewards/cosine_scaled_reward": 0.05054241791367531,
"step": 209
},
{
"clip_fraction": 0.0,
"completion_length": 2779.3333435058594,
"epoch": 0.24,
"grad_norm": 0.06111512333154678,
"kl": 3.0368566513061523e-05,
"lambda_div_used": 0.5897415727376938,
"learning_rate": 7.500858306332172e-07,
"loss": 0.0277,
"reward": -0.25500940857455134,
"reward_after_mean": -0.25500940857455134,
"reward_after_std": 0.47190882451832294,
"reward_before_mean": 0.03478116978658363,
"reward_before_std": 0.4098346810787916,
"reward_change_max": 0.0,
"reward_change_mean": -0.2897905595600605,
"reward_change_min": -0.42270801588892937,
"reward_change_std": 0.1620404813438654,
"reward_std": 0.4719088301062584,
"rewards/accuracy_reward": 0.1458333395421505,
"rewards/cosine_scaled_reward": -0.11105216934811324,
"step": 210
},
{
"clip_fraction": 0.0,
"completion_length": 2558.8541946411133,
"epoch": 0.24114285714285713,
"grad_norm": 0.06923027336597443,
"kl": 3.6522746086120605e-05,
"lambda_div_used": 0.5664671063423157,
"learning_rate": 7.472670160550848e-07,
"loss": 0.0729,
"reward": -0.06898702308535576,
"reward_after_mean": -0.06898702308535576,
"reward_after_std": 0.4354093614965677,
"reward_before_mean": 0.4075740482658148,
"reward_before_std": 0.30287738889455795,
"reward_change_max": 0.0,
"reward_change_mean": -0.4765610620379448,
"reward_change_min": -0.6883851811289787,
"reward_change_std": 0.262312775477767,
"reward_std": 0.4354093801230192,
"rewards/accuracy_reward": 0.3125,
"rewards/cosine_scaled_reward": 0.09507404454052448,
"step": 211
},
{
"clip_fraction": 0.0,
"completion_length": 2327.750015258789,
"epoch": 0.2422857142857143,
"grad_norm": 0.08028864860534668,
"kl": 2.8003007173538208e-05,
"lambda_div_used": 0.5712975114583969,
"learning_rate": 7.444385869608921e-07,
"loss": -0.0113,
"reward": -0.09517045877873898,
"reward_after_mean": -0.09517045877873898,
"reward_after_std": 0.49211281538009644,
"reward_before_mean": 0.3566615767776966,
"reward_before_std": 0.32028379291296005,
"reward_change_max": 0.0,
"reward_change_mean": -0.4518320318311453,
"reward_change_min": -0.6186705827713013,
"reward_change_std": 0.23167487233877182,
"reward_std": 0.4921128321439028,
"rewards/accuracy_reward": 0.2916666679084301,
"rewards/cosine_scaled_reward": 0.06499490264104679,
"step": 212
},
{
"clip_fraction": 0.0,
"completion_length": 2145.6041946411133,
"epoch": 0.24342857142857144,
"grad_norm": 0.09073800593614578,
"kl": 4.177819937467575e-05,
"lambda_div_used": 0.6168783828616142,
"learning_rate": 7.416006812042827e-07,
"loss": 0.027,
"reward": -0.08367926510982215,
"reward_after_mean": -0.08367926510982215,
"reward_after_std": 0.5790203902870417,
"reward_before_mean": 0.24448610469698906,
"reward_before_std": 0.5364782512187958,
"reward_change_max": 0.0,
"reward_change_mean": -0.32816537097096443,
"reward_change_min": -0.5068525895476341,
"reward_change_std": 0.19713077135384083,
"reward_std": 0.579020407050848,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/cosine_scaled_reward": 0.036152773071080446,
"step": 213
},
{
"clip_fraction": 0.0,
"completion_length": 2982.6458587646484,
"epoch": 0.24457142857142858,
"grad_norm": 0.06497277319431305,
"kl": 3.772228956222534e-05,
"lambda_div_used": 0.6168569102883339,
"learning_rate": 7.387534371007797e-07,
"loss": 0.0247,
"reward": -0.10639690980315208,
"reward_after_mean": -0.10639690980315208,
"reward_after_std": 0.563305439427495,
"reward_before_mean": 0.21648684330284595,
"reward_before_std": 0.5375220291316509,
"reward_change_max": 0.0,
"reward_change_mean": -0.3228837437927723,
"reward_change_min": -0.5641361065208912,
"reward_change_std": 0.20987980626523495,
"reward_std": 0.5633054543286562,
"rewards/accuracy_reward": 0.20833334140479565,
"rewards/cosine_scaled_reward": 0.00815348862670362,
"step": 214
},
{
"clip_fraction": 0.0,
"completion_length": 2382.2500610351562,
"epoch": 0.24571428571428572,
"grad_norm": 0.07877103984355927,
"kl": 4.485622048377991e-05,
"lambda_div_used": 0.5695274397730827,
"learning_rate": 7.358969934210438e-07,
"loss": 0.0351,
"reward": -0.39196348818950355,
"reward_after_mean": -0.39196348818950355,
"reward_after_std": 0.39862883277237415,
"reward_before_mean": -0.128106027841568,
"reward_before_std": 0.31178954988718033,
"reward_change_max": 0.0,
"reward_change_mean": -0.263857439160347,
"reward_change_min": -0.38021961972117424,
"reward_change_std": 0.13994430005550385,
"reward_std": 0.39862884022295475,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.1697727027349174,
"step": 215
},
{
"clip_fraction": 0.0,
"completion_length": 2156.3958740234375,
"epoch": 0.24685714285714286,
"grad_norm": 0.11318857222795486,
"kl": 4.297494888305664e-05,
"lambda_div_used": 0.5965342745184898,
"learning_rate": 7.330314893841101e-07,
"loss": 0.0186,
"reward": -0.22045887634158134,
"reward_after_mean": -0.22045887634158134,
"reward_after_std": 0.4805217906832695,
"reward_before_mean": 0.08258800266776234,
"reward_before_std": 0.4415160808712244,
"reward_change_max": 0.0,
"reward_change_mean": -0.3030468635261059,
"reward_change_min": -0.4999500457197428,
"reward_change_std": 0.185057258233428,
"reward_std": 0.4805217981338501,
"rewards/accuracy_reward": 0.16666667349636555,
"rewards/cosine_scaled_reward": -0.08407866954803467,
"step": 216
},
{
"clip_fraction": 0.0,
"completion_length": 2763.520866394043,
"epoch": 0.248,
"grad_norm": 0.067295141518116,
"kl": 3.0644237995147705e-05,
"lambda_div_used": 0.6201670467853546,
"learning_rate": 7.301570646506027e-07,
"loss": 0.0139,
"reward": -0.19884846359491348,
"reward_after_mean": -0.19884846359491348,
"reward_after_std": 0.5856006741523743,
"reward_before_mean": 0.06158541014883667,
"reward_before_std": 0.5545480605214834,
"reward_change_max": 0.0,
"reward_change_mean": -0.2604338899254799,
"reward_change_min": -0.503634799271822,
"reward_change_std": 0.1784855630248785,
"reward_std": 0.5856006946414709,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/cosine_scaled_reward": -0.10508125275373459,
"step": 217
},
{
"clip_fraction": 0.0,
"completion_length": 2880.7083587646484,
"epoch": 0.24914285714285714,
"grad_norm": 0.07706139236688614,
"kl": 3.7044286727905273e-05,
"lambda_div_used": 0.6074161231517792,
"learning_rate": 7.27273859315928e-07,
"loss": -0.0223,
"reward": -0.19500153325498104,
"reward_after_mean": -0.19500153325498104,
"reward_after_std": 0.5332945492118597,
"reward_before_mean": 0.08328226953744888,
"reward_before_std": 0.5006468072533607,
"reward_change_max": 0.0,
"reward_change_mean": -0.27828381210565567,
"reward_change_min": -0.5113113783299923,
"reward_change_std": 0.18696410488337278,
"reward_std": 0.5332945715636015,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/cosine_scaled_reward": -0.08338439278304577,
"step": 218
},
{
"clip_fraction": 0.0,
"completion_length": 2352.645854949951,
"epoch": 0.2502857142857143,
"grad_norm": 0.10948827862739563,
"kl": 3.1441450119018555e-05,
"lambda_div_used": 0.5561031624674797,
"learning_rate": 7.243820139034464e-07,
"loss": -0.0096,
"reward": -0.029067307710647583,
"reward_after_mean": -0.029067307710647583,
"reward_after_std": 0.41141366213560104,
"reward_before_mean": 0.4911606255918741,
"reward_before_std": 0.2519808644428849,
"reward_change_max": 0.0,
"reward_change_mean": -0.5202279426157475,
"reward_change_min": -0.7093416638672352,
"reward_change_std": 0.27748389169573784,
"reward_std": 0.41141367703676224,
"rewards/accuracy_reward": 0.3541666716337204,
"rewards/cosine_scaled_reward": 0.13699393905699253,
"step": 219
},
{
"clip_fraction": 0.0,
"completion_length": 2635.5833435058594,
"epoch": 0.25142857142857145,
"grad_norm": 0.08914919197559357,
"kl": 3.5672448575496674e-05,
"lambda_div_used": 0.5308618620038033,
"learning_rate": 7.214816693576234e-07,
"loss": -0.0362,
"reward": -0.5146691724658012,
"reward_after_mean": -0.5146691724658012,
"reward_after_std": 0.23347610421478748,
"reward_before_mean": -0.2369950506836176,
"reward_before_std": 0.13545648753643036,
"reward_change_max": 0.0,
"reward_change_mean": -0.27767411433160305,
"reward_change_min": -0.3908136747777462,
"reward_change_std": 0.14405533485114574,
"reward_std": 0.23347610607743263,
"rewards/accuracy_reward": 0.0,
"rewards/cosine_scaled_reward": -0.2369950469583273,
"step": 220
},
{
"clip_fraction": 0.0,
"completion_length": 2279.562530517578,
"epoch": 0.25257142857142856,
"grad_norm": 0.0954662561416626,
"kl": 2.4788081645965576e-05,
"lambda_div_used": 0.5764833092689514,
"learning_rate": 7.185729670371604e-07,
"loss": -0.0168,
"reward": 0.023251693695783615,
"reward_after_mean": 0.023251693695783615,
"reward_after_std": 0.4863298423588276,
"reward_before_mean": 0.5212067291140556,
"reward_before_std": 0.34889572812244296,
"reward_change_max": 0.0,
"reward_change_mean": -0.49795500561594963,
"reward_change_min": -0.7016883753240108,
"reward_change_std": 0.27911104913800955,
"reward_std": 0.48632985167205334,
"rewards/accuracy_reward": 0.4166666716337204,
"rewards/cosine_scaled_reward": 0.10454002395272255,
"step": 221
},
{
"clip_fraction": 0.0,
"completion_length": 2104.9583740234375,
"epoch": 0.2537142857142857,
"grad_norm": 0.0776790902018547,
"kl": 2.4726614356040955e-05,
"lambda_div_used": 0.5997953563928604,
"learning_rate": 7.156560487081051e-07,
"loss": -0.0148,
"reward": -0.04329463094472885,
"reward_after_mean": -0.04329463094472885,
"reward_after_std": 0.48106030002236366,
"reward_before_mean": 0.33003126084804535,
"reward_before_std": 0.46051184553653,
"reward_change_max": 0.0,
"reward_change_mean": -0.3733258917927742,
"reward_change_min": -0.5728014186024666,
"reward_change_std": 0.23287776950746775,
"reward_std": 0.48106031119823456,
"rewards/accuracy_reward": 0.2708333432674408,
"rewards/cosine_scaled_reward": 0.0591979268938303,
"step": 222
},
{
"clip_fraction": 0.0,
"completion_length": 2521.3333587646484,
"epoch": 0.25485714285714284,
"grad_norm": 0.07036174088716507,
"kl": 2.9304996132850647e-05,
"lambda_div_used": 0.5750112235546112,
"learning_rate": 7.127310565369415e-07,
"loss": 0.0216,
"reward": -0.1473498847335577,
"reward_after_mean": -0.1473498847335577,
"reward_after_std": 0.4359878208488226,
"reward_before_mean": 0.24288302287459373,
"reward_before_std": 0.3438769578933716,
"reward_change_max": 0.0,
"reward_change_mean": -0.39023288898169994,
"reward_change_min": -0.588653527200222,
"reward_change_std": 0.22452317085117102,
"reward_std": 0.43598783388733864,
"rewards/accuracy_reward": 0.2708333395421505,
"rewards/cosine_scaled_reward": -0.027950339019298553,
"step": 223
},
{
"clip_fraction": 0.0,
"completion_length": 3317.729217529297,
"epoch": 0.256,
"grad_norm": 0.05309538170695305,
"kl": 2.8050970286130905e-05,
"lambda_div_used": 0.5725216493010521,
"learning_rate": 7.097981330836616e-07,
"loss": 0.069,
"reward": -0.30242439545691013,
"reward_after_mean": -0.30242439545691013,
"reward_after_std": 0.38869454339146614,
"reward_before_mean": -0.000308917835354805,
"reward_before_std": 0.3251040354371071,
"reward_change_max": 0.0,
"reward_change_mean": -0.30211549811065197,
"reward_change_min": -0.4553787522017956,
"reward_change_std": 0.171081081032753,
"reward_std": 0.3886945564299822,
"rewards/accuracy_reward": 0.12500000558793545,
"rewards/cosine_scaled_reward": -0.12530891119968146,
"step": 224
},
{
"clip_fraction": 0.0,
"completion_length": 3037.208335876465,
"epoch": 0.2571428571428571,
"grad_norm": 0.0768936425447464,
"kl": 4.080682992935181e-05,
"lambda_div_used": 0.5907088667154312,
"learning_rate": 7.068574212948169e-07,
"loss": 0.0169,
"reward": -0.28160014655441046,
"reward_after_mean": -0.28160014655441046,
"reward_after_std": 0.48472702503204346,
"reward_before_mean": -0.008234186680056155,
"reward_before_std": 0.41540220472961664,
"reward_change_max": 0.0,
"reward_change_mean": -0.27336596697568893,
"reward_change_min": -0.4061685614287853,
"reward_change_std": 0.15673903841525316,
"reward_std": 0.48472702503204346,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/cosine_scaled_reward": -0.13323417864739895,
"step": 225
},
{
"clip_fraction": 0.0,
"completion_length": 2861.416702270508,
"epoch": 0.2582857142857143,
"grad_norm": 0.07499676197767258,
"kl": 3.580749034881592e-05,
"lambda_div_used": 0.6208535805344582,
"learning_rate": 7.039090644965509e-07,
"loss": 0.0702,
"reward": -0.08291551470756531,
"reward_after_mean": -0.08291551470756531,
"reward_after_std": 0.5764818880707026,
"reward_before_mean": 0.23111886344850063,
"reward_before_std": 0.5593543313443661,
"reward_change_max": 0.0,
"reward_change_mean": -0.3140343725681305,
"reward_change_min": -0.5283008627593517,
"reward_change_std": 0.20683539099991322,
"reward_std": 0.5764818955212831,
"rewards/accuracy_reward": 0.27083334140479565,
"rewards/cosine_scaled_reward": -0.03971448230731767,
"step": 226
},
{
"clip_fraction": 0.0,
"completion_length": 1926.6875305175781,
"epoch": 0.25942857142857145,
"grad_norm": 0.10923109948635101,
"kl": 4.176795482635498e-05,
"lambda_div_used": 0.5834182798862457,
"learning_rate": 7.009532063876148e-07,
"loss": -0.0447,
"reward": -0.2778073139488697,
"reward_after_mean": -0.2778073139488697,
"reward_after_std": 0.4395454227924347,
"reward_before_mean": 0.009000460617244244,
"reward_before_std": 0.3901812704280019,
"reward_change_max": 0.0,
"reward_change_mean": -0.28680778108537197,
"reward_change_min": -0.45677450299263,
"reward_change_std": 0.18111994117498398,
"reward_std": 0.43954543210566044,
"rewards/accuracy_reward": 0.125,
"rewards/cosine_scaled_reward": -0.1159995449706912,
"step": 227
},
{
"clip_fraction": 0.0,
"completion_length": 2502.562545776367,
"epoch": 0.26057142857142856,
"grad_norm": 0.08182154595851898,
"kl": 1.8984079360961914e-05,
"lambda_div_used": 0.6025907471776009,
"learning_rate": 6.979899910323624e-07,
"loss": 0.074,
"reward": 0.03976001590490341,
"reward_after_mean": 0.03976001590490341,
"reward_after_std": 0.5421929359436035,
"reward_before_mean": 0.4711545445024967,
"reward_before_std": 0.4750876808539033,
"reward_change_max": 0.0,
"reward_change_mean": -0.4313945174217224,
"reward_change_min": -0.6774051003158092,
"reward_change_std": 0.2637105621397495,
"reward_std": 0.5421929433941841,
"rewards/accuracy_reward": 0.3541666753590107,
"rewards/cosine_scaled_reward": 0.11698784306645393,
"step": 228
},
{
"clip_fraction": 0.0,
"completion_length": 3196.3125228881836,
"epoch": 0.26171428571428573,
"grad_norm": 0.06398586928844452,
"kl": 4.0434300899505615e-05,
"lambda_div_used": 0.5669709742069244,
"learning_rate": 6.950195628537299e-07,
"loss": 0.0135,
"reward": -0.15348458290100098,
"reward_after_mean": -0.15348458290100098,
"reward_after_std": 0.42672324273735285,
"reward_before_mean": 0.26130594592541456,
"reward_before_std": 0.3047938751988113,
"reward_change_max": 0.0,
"reward_change_mean": -0.41479056514799595,
"reward_change_min": -0.5674791261553764,
"reward_change_std": 0.22689451277256012,
"reward_std": 0.4267232706770301,
"rewards/accuracy_reward": 0.22916666977107525,
"rewards/cosine_scaled_reward": 0.03213928057812154,
"step": 229
},
{
"clip_fraction": 0.0,
"completion_length": 3100.5000228881836,
"epoch": 0.26285714285714284,
"grad_norm": 0.05996527522802353,
"kl": 2.22623348236084e-05,
"lambda_div_used": 0.5811162814497948,
"learning_rate": 6.920420666261961e-07,
"loss": 0.0391,
"reward": -0.36068666726350784,
"reward_after_mean": -0.36068666726350784,
"reward_after_std": 0.43734684213995934,
"reward_before_mean": -0.08804995659738779,
"reward_before_std": 0.37305002473294735,
"reward_change_max": 0.0,
"reward_change_mean": -0.2726367134600878,
"reward_change_min": -0.4353605732321739,
"reward_change_std": 0.16089125256985426,
"reward_std": 0.43734684586524963,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.15054996183607727,
"step": 230
},
{
"clip_fraction": 0.0,
"completion_length": 2678.7083587646484,
"epoch": 0.264,
"grad_norm": 0.06770680844783783,
"kl": 2.5890767574310303e-05,
"lambda_div_used": 0.5817164853215218,
"learning_rate": 6.890576474687263e-07,
"loss": 0.0722,
"reward": -0.13625335786491632,
"reward_after_mean": -0.13625335786491632,
"reward_after_std": 0.46202925965189934,
"reward_before_mean": 0.25235490314662457,
"reward_before_std": 0.37311020120978355,
"reward_change_max": 0.0,
"reward_change_mean": -0.3886082824319601,
"reward_change_min": -0.6004670634865761,
"reward_change_std": 0.2275555245578289,
"reward_std": 0.4620292726904154,
"rewards/accuracy_reward": 0.2708333395421505,
"rewards/cosine_scaled_reward": -0.018478410318493843,
"step": 231
},
{
"clip_fraction": 0.0,
"completion_length": 3258.625030517578,
"epoch": 0.2651428571428571,
"grad_norm": 0.05468503385782242,
"kl": 3.515183925628662e-05,
"lambda_div_used": 0.5800251811742783,
"learning_rate": 6.860664508377001e-07,
"loss": 0.0159,
"reward": -0.3428461756557226,
"reward_after_mean": -0.3428461756557226,
"reward_after_std": 0.4266065489500761,
"reward_before_mean": -0.07336848601698875,
"reward_before_std": 0.36755594704300165,
"reward_change_max": 0.0,
"reward_change_mean": -0.26947769708931446,
"reward_change_min": -0.43084757775068283,
"reward_change_std": 0.1597052849829197,
"reward_std": 0.4266065787523985,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/cosine_scaled_reward": -0.1567018236964941,
"step": 232
},
{
"clip_fraction": 0.0,
"completion_length": 2661.2292251586914,
"epoch": 0.2662857142857143,
"grad_norm": 0.08775883167982101,
"kl": 3.3989548683166504e-05,
"lambda_div_used": 0.6155472174286842,
"learning_rate": 6.83068622519821e-07,
"loss": -0.016,
"reward": -0.22876367531716824,
"reward_after_mean": -0.22876367531716824,
"reward_after_std": 0.5766322333365679,
"reward_before_mean": 0.027781556826084852,
"reward_before_std": 0.5290827043354511,
"reward_change_max": 0.0,
"reward_change_mean": -0.2565452288836241,
"reward_change_min": -0.4581022933125496,
"reward_change_std": 0.16627201065421104,
"reward_std": 0.5766322445124388,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.11805178504437208,
"step": 233
},
{
"clip_fraction": 0.0,
"completion_length": 2694.3125228881836,
"epoch": 0.2674285714285714,
"grad_norm": 0.11633959412574768,
"kl": 3.643333911895752e-05,
"lambda_div_used": 0.5539941042661667,
"learning_rate": 6.800643086250121e-07,
"loss": -0.0056,
"reward": -0.22800572216510773,
"reward_after_mean": -0.22800572216510773,
"reward_after_std": 0.3484720904380083,
"reward_before_mean": 0.16009100899100304,
"reward_before_std": 0.24206165876239538,
"reward_change_max": 0.0,
"reward_change_mean": -0.3880967255681753,
"reward_change_min": -0.5548957660794258,
"reward_change_std": 0.21049270872026682,
"reward_std": 0.34847209975123405,
"rewards/accuracy_reward": 0.2291666716337204,
"rewards/cosine_scaled_reward": -0.06907567009329796,
"step": 234
},
{
"clip_fraction": 0.0,
"completion_length": 2490.4375,
"epoch": 0.26857142857142857,
"grad_norm": 0.10378725826740265,
"kl": 2.8094742447137833e-05,
"lambda_div_used": 0.5782083421945572,
"learning_rate": 6.770536555792944e-07,
"loss": -0.0425,
"reward": 0.01590564101934433,
"reward_after_mean": 0.01590564101934433,
"reward_after_std": 0.5053408965468407,
"reward_before_mean": 0.49857149738818407,
"reward_before_std": 0.3586234971880913,
"reward_change_max": 0.0,
"reward_change_mean": -0.48266585171222687,
"reward_change_min": -0.7199156694114208,
"reward_change_std": 0.2697788691148162,
"reward_std": 0.505340900272131,
"rewards/accuracy_reward": 0.3750000037252903,
"rewards/cosine_scaled_reward": 0.12357146013528109,
"step": 235
},
{
"clip_fraction": 0.0,
"completion_length": 2824.7917098999023,
"epoch": 0.26971428571428574,
"grad_norm": 0.06474865972995758,
"kl": 2.1502375602722168e-05,
"lambda_div_used": 0.5763456672430038,
"learning_rate": 6.740368101176495e-07,
"loss": 0.0011,
"reward": -0.3977004513144493,
"reward_after_mean": -0.3977004513144493,
"reward_after_std": 0.41489073634147644,
"reward_before_mean": -0.1460555698722601,
"reward_before_std": 0.347917802631855,
"reward_change_max": 0.0,
"reward_change_mean": -0.2516448702663183,
"reward_change_min": -0.42327918112277985,
"reward_change_std": 0.14836463797837496,
"reward_std": 0.4148907568305731,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.20855557406321168,
"step": 236
},
{
"clip_fraction": 0.0,
"completion_length": 2764.9583740234375,
"epoch": 0.27085714285714285,
"grad_norm": 0.06224860996007919,
"kl": 2.5756657123565674e-05,
"lambda_div_used": 0.5886275842785835,
"learning_rate": 6.710139192768694e-07,
"loss": -0.0228,
"reward": -0.1175742200575769,
"reward_after_mean": -0.1175742200575769,
"reward_after_std": 0.4582329224795103,
"reward_before_mean": 0.24255024455487728,
"reward_before_std": 0.40142686292529106,
"reward_change_max": 0.0,
"reward_change_mean": -0.36012447625398636,
"reward_change_min": -0.5257687419652939,
"reward_change_std": 0.20567627251148224,
"reward_std": 0.4582329299300909,
"rewards/accuracy_reward": 0.22916667722165585,
"rewards/cosine_scaled_reward": 0.013383567042183131,
"step": 237
},
{
"clip_fraction": 0.0,
"completion_length": 3222.479217529297,
"epoch": 0.272,
"grad_norm": 0.05841728672385216,
"kl": 3.546103835105896e-05,
"lambda_div_used": 0.6281886473298073,
"learning_rate": 6.679851303883891e-07,
"loss": 0.0655,
"reward": 0.044486068189144135,
"reward_after_mean": 0.044486068189144135,
"reward_after_std": 0.6025157757103443,
"reward_before_mean": 0.4036002438515425,
"reward_before_std": 0.6015807576477528,
"reward_change_max": 0.0,
"reward_change_mean": -0.359114158898592,
"reward_change_min": -0.609935961663723,
"reward_change_std": 0.24452881701290607,
"reward_std": 0.6025157924741507,
"rewards/accuracy_reward": 0.3125000074505806,
"rewards/cosine_scaled_reward": 0.09110023453831673,
"step": 238
},
{
"clip_fraction": 0.0,
"completion_length": 1740.1666793823242,
"epoch": 0.27314285714285713,
"grad_norm": 0.09357193857431412,
"kl": 1.8790364265441895e-05,
"lambda_div_used": 0.5962883979082108,
"learning_rate": 6.649505910711058e-07,
"loss": 0.0055,
"reward": 0.07376761082559824,
"reward_after_mean": 0.07376761082559824,
"reward_after_std": 0.5891099888831377,
"reward_before_mean": 0.5742647312581539,
"reward_before_std": 0.44304153323173523,
"reward_change_max": 0.0,
"reward_change_mean": -0.5004971195012331,
"reward_change_min": -0.7436067499220371,
"reward_change_std": 0.2840570341795683,
"reward_std": 0.5891100075095892,
"rewards/accuracy_reward": 0.37500000186264515,
"rewards/cosine_scaled_reward": 0.19926471984945238,
"step": 239
},
{
"clip_fraction": 0.0,
"completion_length": 3087.416702270508,
"epoch": 0.2742857142857143,
"grad_norm": 0.06324295699596405,
"kl": 3.217160701751709e-05,
"lambda_div_used": 0.5339473709464073,
"learning_rate": 6.619104492241847e-07,
"loss": 0.0166,
"reward": -0.5358126908540726,
"reward_after_mean": -0.5358126908540726,
"reward_after_std": 0.23053276538848877,
"reward_before_mean": -0.2753155492246151,
"reward_before_std": 0.1490377252921462,
"reward_change_max": 0.0,
"reward_change_mean": -0.26049717888236046,
"reward_change_min": -0.3783254958689213,
"reward_change_std": 0.1383643699809909,
"reward_std": 0.23053277097642422,
"rewards/accuracy_reward": 0.0,
"rewards/cosine_scaled_reward": -0.2753155454993248,
"step": 240
},
{
"clip_fraction": 0.0,
"completion_length": 3419.625,
"epoch": 0.2754285714285714,
"grad_norm": 0.047396283596754074,
"kl": 3.555417060852051e-05,
"lambda_div_used": 0.5585425272583961,
"learning_rate": 6.588648530198504e-07,
"loss": 0.0027,
"reward": -0.4805446630343795,
"reward_after_mean": -0.4805446630343795,
"reward_after_std": 0.3355217073112726,
"reward_before_mean": -0.23604051489382982,
"reward_before_std": 0.2620125887915492,
"reward_change_max": 0.0,
"reward_change_mean": -0.2445041425526142,
"reward_change_min": -0.37724288925528526,
"reward_change_std": 0.13484715577214956,
"reward_std": 0.3355217222124338,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.2568738413974643,
"step": 241
},
{
"clip_fraction": 0.0,
"completion_length": 2400.8750228881836,
"epoch": 0.2765714285714286,
"grad_norm": 0.08974741399288177,
"kl": 4.5746564865112305e-05,
"lambda_div_used": 0.5887154564261436,
"learning_rate": 6.558139508961654e-07,
"loss": -0.0452,
"reward": -0.2555653927847743,
"reward_after_mean": -0.2555653927847743,
"reward_after_std": 0.47001883387565613,
"reward_before_mean": 0.03157716616988182,
"reward_before_std": 0.4061046461574733,
"reward_change_max": 0.0,
"reward_change_mean": -0.28714255429804325,
"reward_change_min": -0.41172971203923225,
"reward_change_std": 0.16218022629618645,
"reward_std": 0.47001886926591396,
"rewards/accuracy_reward": 0.1458333395421505,
"rewards/cosine_scaled_reward": -0.1142561559099704,
"step": 242
},
{
"clip_fraction": 0.0,
"completion_length": 2828.541679382324,
"epoch": 0.2777142857142857,
"grad_norm": 0.07105688005685806,
"kl": 1.8571503460407257e-05,
"lambda_div_used": 0.5722818151116371,
"learning_rate": 6.527578915497951e-07,
"loss": 0.0039,
"reward": -0.1431608721613884,
"reward_after_mean": -0.1431608721613884,
"reward_after_std": 0.42974008433520794,
"reward_before_mean": 0.2529455106705427,
"reward_before_std": 0.3276430475525558,
"reward_change_max": 0.0,
"reward_change_mean": -0.39610639959573746,
"reward_change_min": -0.5491434335708618,
"reward_change_std": 0.21527612209320068,
"reward_std": 0.42974009923636913,
"rewards/accuracy_reward": 0.25000000558793545,
"rewards/cosine_scaled_reward": 0.0029455198673531413,
"step": 243
},
{
"clip_fraction": 0.0,
"completion_length": 2882.291679382324,
"epoch": 0.27885714285714286,
"grad_norm": 0.0700126439332962,
"kl": 3.098323941230774e-05,
"lambda_div_used": 0.6161807402968407,
"learning_rate": 6.496968239287603e-07,
"loss": 0.0,
"reward": -0.06458889320492744,
"reward_after_mean": -0.06458889320492744,
"reward_after_std": 0.5576508566737175,
"reward_before_mean": 0.2605547234416008,
"reward_before_std": 0.5402053641155362,
"reward_change_max": 0.0,
"reward_change_mean": -0.32514358684420586,
"reward_change_min": -0.5207333639264107,
"reward_change_std": 0.21059911139309406,
"reward_std": 0.5576508603990078,
"rewards/accuracy_reward": 0.27083334140479565,
"rewards/cosine_scaled_reward": -0.01027863984927535,
"step": 244
},
{
"clip_fraction": 0.0,
"completion_length": 2907.4583587646484,
"epoch": 0.28,
"grad_norm": 0.05816182866692543,
"kl": 3.5434961318969727e-05,
"lambda_div_used": 0.5615689232945442,
"learning_rate": 6.466308972251785e-07,
"loss": -0.0133,
"reward": -0.42641448229551315,
"reward_after_mean": -0.42641448229551315,
"reward_after_std": 0.3361923936754465,
"reward_before_mean": -0.15409247018396854,
"reward_before_std": 0.2813150165602565,
"reward_change_max": 0.0,
"reward_change_mean": -0.2723220158368349,
"reward_change_min": -0.445806298404932,
"reward_change_std": 0.1635848032310605,
"reward_std": 0.3361923974007368,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.19575914042070508,
"step": 245
},
{
"clip_fraction": 0.0,
"completion_length": 2680.7083587646484,
"epoch": 0.28114285714285714,
"grad_norm": 0.07198330760002136,
"kl": 1.764507032930851e-05,
"lambda_div_used": 0.5865297466516495,
"learning_rate": 6.435602608679916e-07,
"loss": 0.0347,
"reward": -0.19568787328898907,
"reward_after_mean": -0.19568787328898907,
"reward_after_std": 0.4485197216272354,
"reward_before_mean": 0.129365224391222,
"reward_before_std": 0.393932550214231,
"reward_change_max": 0.0,
"reward_change_mean": -0.32505310885608196,
"reward_change_min": -0.4816659241914749,
"reward_change_std": 0.1897038472816348,
"reward_std": 0.44851974956691265,
"rewards/accuracy_reward": 0.20833334140479565,
"rewards/cosine_scaled_reward": -0.0789681114256382,
"step": 246
},
{
"clip_fraction": 0.0,
"completion_length": 3076.0208435058594,
"epoch": 0.2822857142857143,
"grad_norm": 0.08006418496370316,
"kl": 4.15705144405365e-05,
"lambda_div_used": 0.5735789015889168,
"learning_rate": 6.404850645156841e-07,
"loss": 0.0199,
"reward": -0.3997113136574626,
"reward_after_mean": -0.3997113136574626,
"reward_after_std": 0.41638931445777416,
"reward_before_mean": -0.1426885835826397,
"reward_before_std": 0.3330965582281351,
"reward_change_max": 0.0,
"reward_change_mean": -0.2570227347314358,
"reward_change_min": -0.38052143156528473,
"reward_change_std": 0.13911327440291643,
"reward_std": 0.4163893237709999,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.18435525218956172,
"step": 247
},
{
"clip_fraction": 0.0,
"completion_length": 2145.0000228881836,
"epoch": 0.2834285714285714,
"grad_norm": 0.10052043944597244,
"kl": 3.3289194107055664e-05,
"lambda_div_used": 0.6171368733048439,
"learning_rate": 6.374054580489873e-07,
"loss": 0.0514,
"reward": 0.28496517799794674,
"reward_after_mean": 0.28496517799794674,
"reward_after_std": 0.6654367055743933,
"reward_before_mean": 0.83577667362988,
"reward_before_std": 0.5420532608404756,
"reward_change_max": 0.0,
"reward_change_mean": -0.5508115068078041,
"reward_change_min": -0.7994196638464928,
"reward_change_std": 0.3166588256135583,
"reward_std": 0.6654367428272963,
"rewards/accuracy_reward": 0.5000000093132257,
"rewards/cosine_scaled_reward": 0.33577666338533163,
"step": 248
},
{
"clip_fraction": 0.0,
"completion_length": 2076.937515258789,
"epoch": 0.2845714285714286,
"grad_norm": 0.0908605083823204,
"kl": 2.4942681193351746e-05,
"lambda_div_used": 0.5948201194405556,
"learning_rate": 6.343215915635761e-07,
"loss": 0.0219,
"reward": -0.003894178196787834,
"reward_after_mean": -0.003894178196787834,
"reward_after_std": 0.5242901761084795,
"reward_before_mean": 0.4153097262606025,
"reward_before_std": 0.43534869560971856,
"reward_change_max": 0.0,
"reward_change_mean": -0.41920389235019684,
"reward_change_min": -0.5903440341353416,
"reward_change_std": 0.2394579891115427,
"reward_std": 0.524290194734931,
"rewards/accuracy_reward": 0.33333334140479565,
"rewards/cosine_scaled_reward": 0.08197638019919395,
"step": 249
},
{
"clip_fraction": 0.0,
"completion_length": 2605.0208587646484,
"epoch": 0.2857142857142857,
"grad_norm": 0.09027790278196335,
"kl": 3.804638981819153e-05,
"lambda_div_used": 0.5779460370540619,
"learning_rate": 6.31233615362752e-07,
"loss": -0.0589,
"reward": -0.2859988175332546,
"reward_after_mean": -0.2859988175332546,
"reward_after_std": 0.4080943390727043,
"reward_before_mean": 0.008792944252490997,
"reward_before_std": 0.35337654035538435,
"reward_change_max": 0.0,
"reward_change_mean": -0.2947917561978102,
"reward_change_min": -0.4365835040807724,
"reward_change_std": 0.16840537451207638,
"reward_std": 0.40809434466063976,
"rewards/accuracy_reward": 0.12500000558793545,
"rewards/cosine_scaled_reward": -0.11620705761015415,
"step": 250
},
{
"clip_fraction": 0.0,
"completion_length": 2368.9167137145996,
"epoch": 0.28685714285714287,
"grad_norm": 0.09640161693096161,
"kl": 2.7060508728027344e-05,
"lambda_div_used": 0.5987462773919106,
"learning_rate": 6.281416799501187e-07,
"loss": -0.0435,
"reward": -0.028017258271574974,
"reward_after_mean": -0.028017258271574974,
"reward_after_std": 0.5376799181103706,
"reward_before_mean": 0.38438196340575814,
"reward_before_std": 0.4528377316892147,
"reward_change_max": 0.0,
"reward_change_mean": -0.41239920631051064,
"reward_change_min": -0.6119380593299866,
"reward_change_std": 0.23853347077965736,
"reward_std": 0.5376799292862415,
"rewards/accuracy_reward": 0.33333334140479565,
"rewards/cosine_scaled_reward": 0.05104860384017229,
"step": 251
},
{
"clip_fraction": 0.0,
"completion_length": 2798.5625228881836,
"epoch": 0.288,
"grad_norm": 0.06784425675868988,
"kl": 3.535952419042587e-05,
"lambda_div_used": 0.5580189973115921,
"learning_rate": 6.25045936022246e-07,
"loss": 0.0335,
"reward": -0.2931139934808016,
"reward_after_mean": -0.2931139934808016,
"reward_after_std": 0.38731373474001884,
"reward_before_mean": 0.06481979880481958,
"reward_before_std": 0.2658566879108548,
"reward_change_max": 0.0,
"reward_change_mean": -0.3579337988048792,
"reward_change_min": -0.5358714908361435,
"reward_change_std": 0.19932966493070126,
"reward_std": 0.3873137477785349,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.10184688493609428,
"step": 252
},
{
"clip_fraction": 0.0,
"completion_length": 2946.416679382324,
"epoch": 0.28914285714285715,
"grad_norm": 0.09991417825222015,
"kl": 3.4242868423461914e-05,
"lambda_div_used": 0.6016674339771271,
"learning_rate": 6.219465344613258e-07,
"loss": -0.0465,
"reward": -0.0356330550275743,
"reward_after_mean": -0.0356330550275743,
"reward_after_std": 0.5561915785074234,
"reward_before_mean": 0.353804474696517,
"reward_before_std": 0.4667428769171238,
"reward_change_max": 0.0,
"reward_change_mean": -0.389437522739172,
"reward_change_min": -0.592089481651783,
"reward_change_std": 0.22914788126945496,
"reward_std": 0.5561915840953588,
"rewards/accuracy_reward": 0.31250000558793545,
"rewards/cosine_scaled_reward": 0.041304459096863866,
"step": 253
},
{
"clip_fraction": 0.0,
"completion_length": 2754.3541946411133,
"epoch": 0.29028571428571426,
"grad_norm": 0.08061102032661438,
"kl": 3.886967897415161e-05,
"lambda_div_used": 0.6251527816057205,
"learning_rate": 6.188436263278172e-07,
"loss": 0.0857,
"reward": -0.1664750911295414,
"reward_after_mean": -0.1664750911295414,
"reward_after_std": 0.599596256390214,
"reward_before_mean": 0.1026190984994173,
"reward_before_std": 0.5836509419605136,
"reward_change_max": 0.0,
"reward_change_mean": -0.2690941859036684,
"reward_change_min": -0.49960801005363464,
"reward_change_std": 0.19118957500904799,
"reward_std": 0.5995962955057621,
"rewards/accuracy_reward": 0.18750000186264515,
"rewards/cosine_scaled_reward": -0.08488089527236298,
"step": 254
},
{
"clip_fraction": 0.0,
"completion_length": 3190.083335876465,
"epoch": 0.2914285714285714,
"grad_norm": 0.08434556424617767,
"kl": 2.3322179913520813e-05,
"lambda_div_used": 0.5955284982919693,
"learning_rate": 6.157373628530852e-07,
"loss": -0.0125,
"reward": -0.3585695568472147,
"reward_after_mean": -0.3585695568472147,
"reward_after_std": 0.4996061436831951,
"reward_before_mean": -0.12086338270455599,
"reward_before_std": 0.4362535886466503,
"reward_change_max": 0.0,
"reward_change_mean": -0.2377061638981104,
"reward_change_min": -0.39885040931403637,
"reward_change_std": 0.1406589960679412,
"reward_std": 0.4996061585843563,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.20419671526178718,
"step": 255
},
{
"clip_fraction": 0.0,
"completion_length": 3160.8125610351562,
"epoch": 0.2925714285714286,
"grad_norm": 0.0576457642018795,
"kl": 3.143772482872009e-05,
"lambda_div_used": 0.639025017619133,
"learning_rate": 6.126278954320294e-07,
"loss": 0.0243,
"reward": -0.03833652473986149,
"reward_after_mean": -0.03833652473986149,
"reward_after_std": 0.6670235451310873,
"reward_before_mean": 0.2675688254312263,
"reward_before_std": 0.6473761759698391,
"reward_change_max": 0.0,
"reward_change_mean": -0.30590534023940563,
"reward_change_min": -0.511355496942997,
"reward_change_std": 0.2077637044712901,
"reward_std": 0.6670235693454742,
"rewards/accuracy_reward": 0.2500000037252903,
"rewards/cosine_scaled_reward": 0.01756881456822157,
"step": 256
},
{
"clip_fraction": 0.0,
"completion_length": 3138.5000228881836,
"epoch": 0.2937142857142857,
"grad_norm": 0.05595054104924202,
"kl": 3.0346214771270752e-05,
"lambda_div_used": 0.6474356725811958,
"learning_rate": 6.095153756157051e-07,
"loss": 0.025,
"reward": -0.0769930558744818,
"reward_after_mean": -0.0769930558744818,
"reward_after_std": 0.7138958293944597,
"reward_before_mean": 0.18525638710707426,
"reward_before_std": 0.685269920155406,
"reward_change_max": 0.0,
"reward_change_mean": -0.26224944926798344,
"reward_change_min": -0.4810149297118187,
"reward_change_std": 0.18088674824684858,
"reward_std": 0.7138958312571049,
"rewards/accuracy_reward": 0.2291666716337204,
"rewards/cosine_scaled_reward": -0.04391028080135584,
"step": 257
},
{
"clip_fraction": 0.0,
"completion_length": 3473.187530517578,
"epoch": 0.2948571428571429,
"grad_norm": 0.05202889442443848,
"kl": 3.561750054359436e-05,
"lambda_div_used": 0.6007983982563019,
"learning_rate": 6.06399955103937e-07,
"loss": 0.0058,
"reward": -0.23316247668117285,
"reward_after_mean": -0.23316247668117285,
"reward_after_std": 0.5119184870272875,
"reward_before_mean": 0.04610642418265343,
"reward_before_std": 0.46757086645811796,
"reward_change_max": 0.0,
"reward_change_mean": -0.2792689222842455,
"reward_change_min": -0.4602624364197254,
"reward_change_std": 0.17828952055424452,
"reward_std": 0.5119185000658035,
"rewards/accuracy_reward": 0.14583333395421505,
"rewards/cosine_scaled_reward": -0.09972689533606172,
"step": 258
},
{
"clip_fraction": 0.0,
"completion_length": 2996.7500534057617,
"epoch": 0.296,
"grad_norm": 0.11353152990341187,
"kl": 4.0959566831588745e-05,
"lambda_div_used": 0.605031318962574,
"learning_rate": 6.032817857379256e-07,
"loss": -0.0572,
"reward": -0.24131755530834198,
"reward_after_mean": -0.24131755530834198,
"reward_after_std": 0.5208989772945642,
"reward_before_mean": 0.03226012596860528,
"reward_before_std": 0.4831458665430546,
"reward_change_max": 0.0,
"reward_change_mean": -0.27357766777276993,
"reward_change_min": -0.4753991588950157,
"reward_change_std": 0.1725512887351215,
"reward_std": 0.5208989884704351,
"rewards/accuracy_reward": 0.14583333767950535,
"rewards/cosine_scaled_reward": -0.1135732214897871,
"step": 259
},
{
"clip_fraction": 0.0,
"completion_length": 2184.8542098999023,
"epoch": 0.29714285714285715,
"grad_norm": 0.1352783739566803,
"kl": 2.3871660232543945e-05,
"lambda_div_used": 0.6604775562882423,
"learning_rate": 6.001610194928464e-07,
"loss": 0.0496,
"reward": 0.1305767484009266,
"reward_after_mean": 0.1305767484009266,
"reward_after_std": 0.7162795849144459,
"reward_before_mean": 0.48802967881783843,
"reward_before_std": 0.7462537074461579,
"reward_change_max": 0.0,
"reward_change_mean": -0.3574528992176056,
"reward_change_min": -0.6257632970809937,
"reward_change_std": 0.25755990110337734,
"reward_std": 0.7162796072661877,
"rewards/accuracy_reward": 0.39583334513008595,
"rewards/cosine_scaled_reward": 0.0921963145956397,
"step": 260
},
{
"clip_fraction": 0.0,
"completion_length": 3070.3125228881836,
"epoch": 0.29828571428571427,
"grad_norm": 0.06917808949947357,
"kl": 3.3020973205566406e-05,
"lambda_div_used": 0.5845921337604523,
"learning_rate": 5.97037808470444e-07,
"loss": 0.0106,
"reward": -0.3141371374949813,
"reward_after_mean": -0.3141371374949813,
"reward_after_std": 0.43580205366015434,
"reward_before_mean": -0.03358305338770151,
"reward_before_std": 0.38949463702738285,
"reward_change_max": 0.0,
"reward_change_mean": -0.28055410645902157,
"reward_change_min": -0.44674795493483543,
"reward_change_std": 0.16879158467054367,
"reward_std": 0.4358020592480898,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/cosine_scaled_reward": -0.11691637896001339,
"step": 261
},
{
"clip_fraction": 0.0,
"completion_length": 3258.3125,
"epoch": 0.29942857142857143,
"grad_norm": 0.06239396706223488,
"kl": 4.535168409347534e-05,
"lambda_div_used": 0.5396690741181374,
"learning_rate": 5.939123048916173e-07,
"loss": 0.0153,
"reward": -0.49852147325873375,
"reward_after_mean": -0.49852147325873375,
"reward_after_std": 0.23993146419525146,
"reward_before_mean": -0.21930878423154354,
"reward_before_std": 0.17422470543533564,
"reward_change_max": 0.0,
"reward_change_mean": -0.2792126890271902,
"reward_change_min": -0.4082174263894558,
"reward_change_std": 0.15261581167578697,
"reward_std": 0.2399314697831869,
"rewards/accuracy_reward": 0.0,
"rewards/cosine_scaled_reward": -0.21930878423154354,
"step": 262
},
{
"clip_fraction": 0.0,
"completion_length": 2837.6666717529297,
"epoch": 0.30057142857142854,
"grad_norm": 0.07131204754114151,
"kl": 2.771243453025818e-05,
"lambda_div_used": 0.5753477811813354,
"learning_rate": 5.907846610890011e-07,
"loss": 0.0159,
"reward": -0.32574891671538353,
"reward_after_mean": -0.32574891671538353,
"reward_after_std": 0.40547293052077293,
"reward_before_mean": -0.04349888768047094,
"reward_before_std": 0.3437284992542118,
"reward_change_max": 0.0,
"reward_change_mean": -0.28225001133978367,
"reward_change_min": -0.423517182469368,
"reward_change_std": 0.16225541010499,
"reward_std": 0.4054729398339987,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.14766556210815907,
"step": 263
},
{
"clip_fraction": 0.0,
"completion_length": 2913.812515258789,
"epoch": 0.3017142857142857,
"grad_norm": 0.07276218384504318,
"kl": 3.5703182220458984e-05,
"lambda_div_used": 0.5857644975185394,
"learning_rate": 5.87655029499542e-07,
"loss": 0.039,
"reward": -0.19073306024074554,
"reward_after_mean": -0.19073306024074554,
"reward_after_std": 0.4183583725243807,
"reward_before_mean": 0.13579276762902737,
"reward_before_std": 0.39237749949097633,
"reward_change_max": 0.0,
"reward_change_mean": -0.32652581483125687,
"reward_change_min": -0.512863963842392,
"reward_change_std": 0.20149299688637257,
"reward_std": 0.4183583725243807,
"rewards/accuracy_reward": 0.1875000074505806,
"rewards/cosine_scaled_reward": -0.051707250997424126,
"step": 264
},
{
"clip_fraction": 0.0,
"completion_length": 2092.333351135254,
"epoch": 0.3028571428571429,
"grad_norm": 0.09631343185901642,
"kl": 2.018176019191742e-05,
"lambda_div_used": 0.5927653685212135,
"learning_rate": 5.845235626570683e-07,
"loss": 0.0409,
"reward": -0.05978839658200741,
"reward_after_mean": -0.05978839658200741,
"reward_after_std": 0.5236263573169708,
"reward_before_mean": 0.3295632619410753,
"reward_before_std": 0.4253385625779629,
"reward_change_max": 0.0,
"reward_change_mean": -0.38935166224837303,
"reward_change_min": -0.5855174511671066,
"reward_change_std": 0.22452097665518522,
"reward_std": 0.5236263833940029,
"rewards/accuracy_reward": 0.29166666977107525,
"rewards/cosine_scaled_reward": 0.037896597757935524,
"step": 265
},
{
"clip_fraction": 0.0,
"completion_length": 3210.8541717529297,
"epoch": 0.304,
"grad_norm": 0.05451129376888275,
"kl": 3.138929605484009e-05,
"lambda_div_used": 0.557657316327095,
"learning_rate": 5.813904131848564e-07,
"loss": -0.0024,
"reward": -0.3950183019042015,
"reward_after_mean": -0.3950183019042015,
"reward_after_std": 0.32154187746345997,
"reward_before_mean": -0.12019951082766056,
"reward_before_std": 0.2612606221809983,
"reward_change_max": 0.0,
"reward_change_mean": -0.2748187892138958,
"reward_change_min": -0.42080841958522797,
"reward_change_std": 0.158494733273983,
"reward_std": 0.321541890501976,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.20353284664452076,
"step": 266
},
{
"clip_fraction": 0.0,
"completion_length": 3039.916679382324,
"epoch": 0.30514285714285716,
"grad_norm": 0.06252676248550415,
"kl": 3.5075165214948356e-05,
"lambda_div_used": 0.5528770685195923,
"learning_rate": 5.78255733788191e-07,
"loss": -0.037,
"reward": -0.4460434205830097,
"reward_after_mean": -0.4460434205830097,
"reward_after_std": 0.3107005339115858,
"reward_before_mean": -0.1744655454531312,
"reward_before_std": 0.2377403611317277,
"reward_change_max": 0.0,
"reward_change_mean": -0.27157786674797535,
"reward_change_min": -0.42647456377744675,
"reward_change_std": 0.1541104121133685,
"reward_std": 0.3107005413621664,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.21613221243023872,
"step": 267
},
{
"clip_fraction": 0.0,
"completion_length": 2704.8333587646484,
"epoch": 0.3062857142857143,
"grad_norm": 0.08417963981628418,
"kl": 4.7713518142700195e-05,
"lambda_div_used": 0.6233388632535934,
"learning_rate": 5.751196772469237e-07,
"loss": 0.0822,
"reward": -0.21941478177905083,
"reward_after_mean": -0.21941478177905083,
"reward_after_std": 0.6098091676831245,
"reward_before_mean": 0.03345827816519886,
"reward_before_std": 0.5743174999952316,
"reward_change_max": 0.0,
"reward_change_mean": -0.25287305377423763,
"reward_change_min": -0.45724478363990784,
"reward_change_std": 0.1690685572102666,
"reward_std": 0.6098091900348663,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.11237506754696369,
"step": 268
},
{
"clip_fraction": 0.0,
"completion_length": 3165.9791717529297,
"epoch": 0.30742857142857144,
"grad_norm": 0.06371315568685532,
"kl": 2.181902527809143e-05,
"lambda_div_used": 0.5798910111188889,
"learning_rate": 5.71982396408026e-07,
"loss": 0.0477,
"reward": -0.38025568798184395,
"reward_after_mean": -0.38025568798184395,
"reward_after_std": 0.4126162938773632,
"reward_before_mean": -0.13023450784385204,
"reward_before_std": 0.3629760518670082,
"reward_change_max": 0.0,
"reward_change_mean": -0.2500211838632822,
"reward_change_min": -0.40024495497345924,
"reward_change_std": 0.15232266392558813,
"reward_std": 0.4126163087785244,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/cosine_scaled_reward": -0.2135678417980671,
"step": 269
},
{
"clip_fraction": 0.0,
"completion_length": 3056.875045776367,
"epoch": 0.30857142857142855,
"grad_norm": 0.05920446664094925,
"kl": 2.387911081314087e-05,
"lambda_div_used": 0.6503699943423271,
"learning_rate": 5.688440441781398e-07,
"loss": 0.0,
"reward": -0.004786740057170391,
"reward_after_mean": -0.004786740057170391,
"reward_after_std": 0.6943319551646709,
"reward_before_mean": 0.292231020051986,
"reward_before_std": 0.7036092299968004,
"reward_change_max": 0.0,
"reward_change_mean": -0.2970177587121725,
"reward_change_min": -0.5634518079459667,
"reward_change_std": 0.21872046310454607,
"reward_std": 0.6943319924175739,
"rewards/accuracy_reward": 0.27083333767950535,
"rewards/cosine_scaled_reward": 0.021397670439910144,
"step": 270
},
{
"clip_fraction": 0.0,
"completion_length": 2221.3958702087402,
"epoch": 0.3097142857142857,
"grad_norm": 0.09025074541568756,
"kl": 6.723217666149139e-06,
"lambda_div_used": 0.640851192176342,
"learning_rate": 5.657047735161255e-07,
"loss": 0.0064,
"reward": 0.11755780689418316,
"reward_after_mean": 0.11755780689418316,
"reward_after_std": 0.6356876939535141,
"reward_before_mean": 0.4748641401529312,
"reward_before_std": 0.654548792168498,
"reward_change_max": 0.0,
"reward_change_mean": -0.35730634443461895,
"reward_change_min": -0.5644437614828348,
"reward_change_std": 0.23989163804799318,
"reward_std": 0.6356877088546753,
"rewards/accuracy_reward": 0.39583334885537624,
"rewards/cosine_scaled_reward": 0.07903079688549042,
"step": 271
},
{
"clip_fraction": 0.0,
"completion_length": 2814.5208740234375,
"epoch": 0.31085714285714283,
"grad_norm": 0.07214022427797318,
"kl": 2.553686499595642e-05,
"lambda_div_used": 0.5532237812876701,
"learning_rate": 5.625647374256061e-07,
"loss": -0.023,
"reward": -0.33643968403339386,
"reward_after_mean": -0.33643968403339386,
"reward_after_std": 0.35168860107660294,
"reward_before_mean": 0.01186647079885006,
"reward_before_std": 0.2353415172547102,
"reward_change_max": 0.0,
"reward_change_mean": -0.34830615669488907,
"reward_change_min": -0.5079392194747925,
"reward_change_std": 0.1846030419692397,
"reward_std": 0.351688614115119,
"rewards/accuracy_reward": 0.14583333395421505,
"rewards/cosine_scaled_reward": -0.13396686874330044,
"step": 272
},
{
"clip_fraction": 0.0,
"completion_length": 2673.937515258789,
"epoch": 0.312,
"grad_norm": 0.067450150847435,
"kl": 2.347538247704506e-05,
"lambda_div_used": 0.5976931154727936,
"learning_rate": 5.594240889475106e-07,
"loss": 0.0439,
"reward": 0.060379184782505035,
"reward_after_mean": 0.060379184782505035,
"reward_after_std": 0.5129444599151611,
"reward_before_mean": 0.4946505483239889,
"reward_before_std": 0.44454328902065754,
"reward_change_max": 0.0,
"reward_change_mean": -0.4342713989317417,
"reward_change_min": -0.6668536812067032,
"reward_change_std": 0.25412870943546295,
"reward_std": 0.5129444822669029,
"rewards/accuracy_reward": 0.3750000111758709,
"rewards/cosine_scaled_reward": 0.1196505706757307,
"step": 273
},
{
"clip_fraction": 0.0,
"completion_length": 1866.4375076293945,
"epoch": 0.31314285714285717,
"grad_norm": 0.11823767423629761,
"kl": 2.6108697056770325e-05,
"lambda_div_used": 0.6173145174980164,
"learning_rate": 5.562829811526154e-07,
"loss": -0.0666,
"reward": 0.20245935022830963,
"reward_after_mean": 0.20245935022830963,
"reward_after_std": 0.6078098546713591,
"reward_before_mean": 0.6919816123554483,
"reward_before_std": 0.5468513960950077,
"reward_change_max": 0.0,
"reward_change_mean": -0.4895222559571266,
"reward_change_min": -0.7379350513219833,
"reward_change_std": 0.3024911228567362,
"reward_std": 0.6078098695725203,
"rewards/accuracy_reward": 0.4583333432674408,
"rewards/cosine_scaled_reward": 0.2336482839891687,
"step": 274
},
{
"clip_fraction": 0.0,
"completion_length": 2334.0416870117188,
"epoch": 0.3142857142857143,
"grad_norm": 0.07390157133340836,
"kl": 2.975761890411377e-05,
"lambda_div_used": 0.5953021869063377,
"learning_rate": 5.531415671340826e-07,
"loss": 0.0067,
"reward": -0.02193348854780197,
"reward_after_mean": -0.02193348854780197,
"reward_after_std": 0.5273136273026466,
"reward_before_mean": 0.3926870714276447,
"reward_before_std": 0.4364564120769501,
"reward_change_max": 0.0,
"reward_change_mean": -0.4146205447614193,
"reward_change_min": -0.6028474234044552,
"reward_change_std": 0.23770872876048088,
"reward_std": 0.5273136328905821,
"rewards/accuracy_reward": 0.31250000558793545,
"rewards/cosine_scaled_reward": 0.08018706925213337,
"step": 275
},
{
"clip_fraction": 0.0,
"completion_length": 2735.6458740234375,
"epoch": 0.31542857142857145,
"grad_norm": 0.07891591638326645,
"kl": 3.522634506225586e-05,
"lambda_div_used": 0.6019129753112793,
"learning_rate": 5.5e-07,
"loss": -0.0433,
"reward": 0.016597013920545578,
"reward_after_mean": 0.016597013920545578,
"reward_after_std": 0.5496951006352901,
"reward_before_mean": 0.43198024667799473,
"reward_before_std": 0.4716393407434225,
"reward_change_max": 0.0,
"reward_change_mean": -0.4153832569718361,
"reward_change_min": -0.6486029289662838,
"reward_change_std": 0.25036295782774687,
"reward_std": 0.5496951248496771,
"rewards/accuracy_reward": 0.3333333395421505,
"rewards/cosine_scaled_reward": 0.09864690899848938,
"step": 276
},
{
"clip_fraction": 0.0,
"completion_length": 2435.437530517578,
"epoch": 0.31657142857142856,
"grad_norm": 0.08397295325994492,
"kl": 2.7611851692199707e-05,
"lambda_div_used": 0.6150463595986366,
"learning_rate": 5.468584328659172e-07,
"loss": 0.0536,
"reward": -0.049143560230731964,
"reward_after_mean": -0.049143560230731964,
"reward_after_std": 0.5724204778671265,
"reward_before_mean": 0.29686027206480503,
"reward_before_std": 0.5315626971423626,
"reward_change_max": 0.0,
"reward_change_mean": -0.34600381925702095,
"reward_change_min": -0.5762605480849743,
"reward_change_std": 0.21639508474618196,
"reward_std": 0.5724204815924168,
"rewards/accuracy_reward": 0.27083333767950535,
"rewards/cosine_scaled_reward": 0.02602693811058998,
"step": 277
},
{
"clip_fraction": 0.0,
"completion_length": 2029.708381652832,
"epoch": 0.3177142857142857,
"grad_norm": 0.11247767508029938,
"kl": 3.094971179962158e-05,
"lambda_div_used": 0.6170216798782349,
"learning_rate": 5.437170188473847e-07,
"loss": 0.002,
"reward": 0.056462954729795456,
"reward_after_mean": 0.056462954729795456,
"reward_after_std": 0.6216260213404894,
"reward_before_mean": 0.48691817931830883,
"reward_before_std": 0.5461275167763233,
"reward_change_max": 0.0,
"reward_change_mean": -0.4304552264511585,
"reward_change_min": -0.7186719551682472,
"reward_change_std": 0.27375681325793266,
"reward_std": 0.6216260306537151,
"rewards/accuracy_reward": 0.3541666716337204,
"rewards/cosine_scaled_reward": 0.13275149278342724,
"step": 278
},
{
"clip_fraction": 0.0,
"completion_length": 3158.6875228881836,
"epoch": 0.31885714285714284,
"grad_norm": 0.06945524364709854,
"kl": 3.522634506225586e-05,
"lambda_div_used": 0.5564287528395653,
"learning_rate": 5.405759110524894e-07,
"loss": -0.0272,
"reward": -0.43442236818373203,
"reward_after_mean": -0.43442236818373203,
"reward_after_std": 0.32187592424452305,
"reward_before_mean": -0.1544840056449175,
"reward_before_std": 0.25075172632932663,
"reward_change_max": 0.0,
"reward_change_mean": -0.27993838116526604,
"reward_change_min": -0.41151439025998116,
"reward_change_std": 0.15283891931176186,
"reward_std": 0.32187593914568424,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.17531732900533825,
"step": 279
},
{
"clip_fraction": 0.0,
"completion_length": 2157.833335876465,
"epoch": 0.32,
"grad_norm": 0.14424921572208405,
"kl": 4.202499985694885e-05,
"lambda_div_used": 0.630903884768486,
"learning_rate": 5.37435262574394e-07,
"loss": -0.0339,
"reward": -0.14438428170979023,
"reward_after_mean": -0.14438428170979023,
"reward_after_std": 0.6414528246968985,
"reward_before_mean": 0.1313652544049546,
"reward_before_std": 0.6011563409119844,
"reward_change_max": 0.0,
"reward_change_mean": -0.275749534368515,
"reward_change_min": -0.4570390097796917,
"reward_change_std": 0.1709576854482293,
"reward_std": 0.6414528302848339,
"rewards/accuracy_reward": 0.20833334140479565,
"rewards/cosine_scaled_reward": -0.076968085631961,
"step": 280
},
{
"clip_fraction": 0.0,
"completion_length": 3559.000030517578,
"epoch": 0.3211428571428571,
"grad_norm": 0.0494488850235939,
"kl": 3.0197203159332275e-05,
"lambda_div_used": 0.5301951244473457,
"learning_rate": 5.342952264838747e-07,
"loss": -0.0035,
"reward": -0.5142169296741486,
"reward_after_mean": -0.5142169296741486,
"reward_after_std": 0.22926145792007446,
"reward_before_mean": -0.2300790660083294,
"reward_before_std": 0.1324998252093792,
"reward_change_max": 0.0,
"reward_change_mean": -0.2841378580778837,
"reward_change_min": -0.4100039228796959,
"reward_change_std": 0.14742697216570377,
"reward_std": 0.22926146537065506,
"rewards/accuracy_reward": 0.0,
"rewards/cosine_scaled_reward": -0.2300790660083294,
"step": 281
},
{
"clip_fraction": 0.0,
"completion_length": 2757.7708587646484,
"epoch": 0.3222857142857143,
"grad_norm": 0.07136296480894089,
"kl": 1.5236437320709229e-05,
"lambda_div_used": 0.6041858941316605,
"learning_rate": 5.311559558218603e-07,
"loss": 0.0304,
"reward": 0.016035709530115128,
"reward_after_mean": 0.016035709530115128,
"reward_after_std": 0.5573885068297386,
"reward_before_mean": 0.4244079850614071,
"reward_before_std": 0.48719789227470756,
"reward_change_max": 0.0,
"reward_change_mean": -0.40837226063013077,
"reward_change_min": -0.630510251969099,
"reward_change_std": 0.253003865480423,
"reward_std": 0.5573885291814804,
"rewards/accuracy_reward": 0.3541666716337204,
"rewards/cosine_scaled_reward": 0.07024132460355759,
"step": 282
},
{
"clip_fraction": 0.0,
"completion_length": 2882.5416946411133,
"epoch": 0.32342857142857145,
"grad_norm": 0.05982055515050888,
"kl": 1.1865049600601196e-05,
"lambda_div_used": 0.6191478446125984,
"learning_rate": 5.28017603591974e-07,
"loss": -0.015,
"reward": 0.10000502690672874,
"reward_after_mean": 0.10000502690672874,
"reward_after_std": 0.6287181153893471,
"reward_before_mean": 0.5279214177280664,
"reward_before_std": 0.5531216450035572,
"reward_change_max": 0.0,
"reward_change_mean": -0.4279164057224989,
"reward_change_min": -0.6583010666072369,
"reward_change_std": 0.2602456407621503,
"reward_std": 0.6287181228399277,
"rewards/accuracy_reward": 0.37500000931322575,
"rewards/cosine_scaled_reward": 0.1529214084148407,
"step": 283
},
{
"clip_fraction": 0.0,
"completion_length": 2187.083381652832,
"epoch": 0.32457142857142857,
"grad_norm": 0.20831985771656036,
"kl": 5.378853529691696e-05,
"lambda_div_used": 0.617803268134594,
"learning_rate": 5.248803227530763e-07,
"loss": 0.0301,
"reward": -0.09672576747834682,
"reward_after_mean": -0.09672576747834682,
"reward_after_std": 0.5820088647305965,
"reward_before_mean": 0.21344758570194244,
"reward_before_std": 0.544846129603684,
"reward_change_max": 0.0,
"reward_change_mean": -0.3101733736693859,
"reward_change_min": -0.5270103476941586,
"reward_change_std": 0.20029443874955177,
"reward_std": 0.5820088759064674,
"rewards/accuracy_reward": 0.22916667349636555,
"rewards/cosine_scaled_reward": -0.0157190952450037,
"step": 284
},
{
"clip_fraction": 0.0,
"completion_length": 2571.4583435058594,
"epoch": 0.32571428571428573,
"grad_norm": 0.05750874802470207,
"kl": 2.5823712348937988e-05,
"lambda_div_used": 0.5412982106208801,
"learning_rate": 5.21744266211809e-07,
"loss": 0.0429,
"reward": -0.3610886335372925,
"reward_after_mean": -0.3610886335372925,
"reward_after_std": 0.2963402010500431,
"reward_before_mean": 0.004524916410446167,
"reward_before_std": 0.18141429405659437,
"reward_change_max": 0.0,
"reward_change_mean": -0.36561354249715805,
"reward_change_min": -0.5212092585861683,
"reward_change_std": 0.19485185854136944,
"reward_std": 0.29634021408855915,
"rewards/accuracy_reward": 0.125,
"rewards/cosine_scaled_reward": -0.12047509290277958,
"step": 285
},
{
"clip_fraction": 0.0,
"completion_length": 2607.625030517578,
"epoch": 0.32685714285714285,
"grad_norm": 0.06422320753335953,
"kl": 2.036895602941513e-05,
"lambda_div_used": 0.593349277973175,
"learning_rate": 5.186095868151436e-07,
"loss": 0.0069,
"reward": -0.1434864792972803,
"reward_after_mean": -0.1434864792972803,
"reward_after_std": 0.4716298431158066,
"reward_before_mean": 0.18680993653833866,
"reward_before_std": 0.4280705275014043,
"reward_change_max": 0.0,
"reward_change_mean": -0.3302964009344578,
"reward_change_min": -0.5444469898939133,
"reward_change_std": 0.20489494875073433,
"reward_std": 0.4716298636049032,
"rewards/accuracy_reward": 0.2291666753590107,
"rewards/cosine_scaled_reward": -0.04235674999654293,
"step": 286
},
{
"clip_fraction": 0.0,
"completion_length": 2068.4375343322754,
"epoch": 0.328,
"grad_norm": 0.11217836290597916,
"kl": 3.282725811004639e-05,
"lambda_div_used": 0.576636016368866,
"learning_rate": 5.154764373429315e-07,
"loss": -0.0127,
"reward": -0.015125550329685211,
"reward_after_mean": -0.015125550329685211,
"reward_after_std": 0.44022079929709435,
"reward_before_mean": 0.4338516741991043,
"reward_before_std": 0.349846001714468,
"reward_change_max": 0.0,
"reward_change_mean": -0.4489772208034992,
"reward_change_min": -0.6400703266263008,
"reward_change_std": 0.25500839948654175,
"reward_std": 0.44022080302238464,
"rewards/accuracy_reward": 0.3125000074505806,
"rewards/cosine_scaled_reward": 0.12135165929794312,
"step": 287
},
{
"clip_fraction": 0.0,
"completion_length": 3021.1666870117188,
"epoch": 0.3291428571428571,
"grad_norm": 0.05326732248067856,
"kl": 2.1582163753919303e-05,
"lambda_div_used": 0.5814583152532578,
"learning_rate": 5.123449705004581e-07,
"loss": 0.0215,
"reward": -0.23482287488877773,
"reward_after_mean": -0.23482287488877773,
"reward_after_std": 0.4797380156815052,
"reward_before_mean": 0.10618079453706741,
"reward_before_std": 0.37177785113453865,
"reward_change_max": 0.0,
"reward_change_mean": -0.34100368432700634,
"reward_change_min": -0.5246531553566456,
"reward_change_std": 0.190623770467937,
"reward_std": 0.47973802499473095,
"rewards/accuracy_reward": 0.18750000186264515,
"rewards/cosine_scaled_reward": -0.08131920825690031,
"step": 288
},
{
"clip_fraction": 0.0,
"completion_length": 2210.0208435058594,
"epoch": 0.3302857142857143,
"grad_norm": 0.0918356254696846,
"kl": 3.3445656299591064e-05,
"lambda_div_used": 0.5579611882567406,
"learning_rate": 5.09215338910999e-07,
"loss": 0.0308,
"reward": -0.1890019178390503,
"reward_after_mean": -0.1890019178390503,
"reward_after_std": 0.38851393200457096,
"reward_before_mean": 0.2269407268613577,
"reward_before_std": 0.2580757178366184,
"reward_change_max": 0.0,
"reward_change_mean": -0.4159426633268595,
"reward_change_min": -0.5701811872422695,
"reward_change_std": 0.21641669981181622,
"reward_std": 0.38851393945515156,
"rewards/accuracy_reward": 0.27083333395421505,
"rewards/cosine_scaled_reward": -0.04389259871095419,
"step": 289
},
{
"clip_fraction": 0.0,
"completion_length": 1820.6666870117188,
"epoch": 0.3314285714285714,
"grad_norm": 0.10334110260009766,
"kl": 1.7639249563217163e-05,
"lambda_div_used": 0.649879202246666,
"learning_rate": 5.060876951083828e-07,
"loss": -0.0389,
"reward": 0.04819735325872898,
"reward_after_mean": 0.04819735325872898,
"reward_after_std": 0.752808591350913,
"reward_before_mean": 0.39133079699240625,
"reward_before_std": 0.7000208692625165,
"reward_change_max": 0.0,
"reward_change_mean": -0.3431334514170885,
"reward_change_min": -0.5988858677446842,
"reward_change_std": 0.2255254928022623,
"reward_std": 0.7528086155653,
"rewards/accuracy_reward": 0.3333333358168602,
"rewards/cosine_scaled_reward": 0.05799746699631214,
"step": 290
},
{
"clip_fraction": 0.0,
"completion_length": 2717.166732788086,
"epoch": 0.3325714285714286,
"grad_norm": 0.07149659842252731,
"kl": 9.842216968536377e-06,
"lambda_div_used": 0.6230586618185043,
"learning_rate": 5.02962191529556e-07,
"loss": 0.0634,
"reward": 0.04544975981116295,
"reward_after_mean": 0.04544975981116295,
"reward_after_std": 0.6337935384362936,
"reward_before_mean": 0.4299341347068548,
"reward_before_std": 0.5707205794751644,
"reward_change_max": 0.0,
"reward_change_mean": -0.38448438234627247,
"reward_change_min": -0.6367709413170815,
"reward_change_std": 0.2398422248661518,
"reward_std": 0.6337935607880354,
"rewards/accuracy_reward": 0.33333333767950535,
"rewards/cosine_scaled_reward": 0.09660080214962363,
"step": 291
},
{
"clip_fraction": 0.0,
"completion_length": 3176.979202270508,
"epoch": 0.33371428571428574,
"grad_norm": 0.05150657147169113,
"kl": 2.0240433514118195e-05,
"lambda_div_used": 0.5560602247714996,
"learning_rate": 4.998389805071536e-07,
"loss": 0.062,
"reward": -0.4294139966368675,
"reward_after_mean": -0.4294139966368675,
"reward_after_std": 0.3452510107308626,
"reward_before_mean": -0.15922172274440527,
"reward_before_std": 0.2578093442134559,
"reward_change_max": 0.0,
"reward_change_mean": -0.27019229158759117,
"reward_change_min": -0.42562897875905037,
"reward_change_std": 0.15438843425363302,
"reward_std": 0.3452510181814432,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.20088838692754507,
"step": 292
},
{
"clip_fraction": 0.0,
"completion_length": 2630.9791946411133,
"epoch": 0.33485714285714285,
"grad_norm": 0.05954331159591675,
"kl": 1.2833625078201294e-05,
"lambda_div_used": 0.5555919855833054,
"learning_rate": 4.967182142620745e-07,
"loss": 0.0009,
"reward": -0.19852645695209503,
"reward_after_mean": -0.19852645695209503,
"reward_after_std": 0.3515181578695774,
"reward_before_mean": 0.20652466267347336,
"reward_before_std": 0.24862384609878063,
"reward_change_max": 0.0,
"reward_change_mean": -0.40505112148821354,
"reward_change_min": -0.5747470110654831,
"reward_change_std": 0.22049889154732227,
"reward_std": 0.35151816345751286,
"rewards/accuracy_reward": 0.2291666716337204,
"rewards/cosine_scaled_reward": -0.022642023861408234,
"step": 293
},
{
"clip_fraction": 0.0,
"completion_length": 3005.5,
"epoch": 0.336,
"grad_norm": 0.06993379443883896,
"kl": 2.8274953365325928e-05,
"lambda_div_used": 0.5608418732881546,
"learning_rate": 4.93600044896063e-07,
"loss": -0.0248,
"reward": -0.3667301833629608,
"reward_after_mean": -0.3667301833629608,
"reward_after_std": 0.32350156269967556,
"reward_before_mean": -0.07280110754072666,
"reward_before_std": 0.2737845163792372,
"reward_change_max": 0.0,
"reward_change_mean": -0.29392906464636326,
"reward_change_min": -0.4593726359307766,
"reward_change_std": 0.17168805841356516,
"reward_std": 0.3235015720129013,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.15613443590700626,
"step": 294
},
{
"clip_fraction": 0.0,
"completion_length": 3221.2291717529297,
"epoch": 0.33714285714285713,
"grad_norm": 0.06364311277866364,
"kl": 1.817569136619568e-05,
"lambda_div_used": 0.5979067236185074,
"learning_rate": 4.904846243842949e-07,
"loss": 0.0107,
"reward": -0.259306401014328,
"reward_after_mean": -0.259306401014328,
"reward_after_std": 0.5024331342428923,
"reward_before_mean": 0.018260781886056066,
"reward_before_std": 0.45087322127074003,
"reward_change_max": 0.0,
"reward_change_mean": -0.277567183598876,
"reward_change_min": -0.41658853366971016,
"reward_change_std": 0.1643626783043146,
"reward_std": 0.5024331398308277,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/cosine_scaled_reward": -0.10673921927809715,
"step": 295
},
{
"clip_fraction": 0.0,
"completion_length": 3282.8750610351562,
"epoch": 0.3382857142857143,
"grad_norm": 0.0698896273970604,
"kl": 2.3877248167991638e-05,
"lambda_div_used": 0.5625706240534782,
"learning_rate": 4.873721045679706e-07,
"loss": 0.0079,
"reward": -0.39721857756376266,
"reward_after_mean": -0.39721857756376266,
"reward_after_std": 0.34284412302076817,
"reward_before_mean": -0.1091517936438322,
"reward_before_std": 0.28544116113334894,
"reward_change_max": 0.0,
"reward_change_mean": -0.28806679882109165,
"reward_change_min": -0.4639837518334389,
"reward_change_std": 0.1706175785511732,
"reward_std": 0.3428441286087036,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.15081845596432686,
"step": 296
},
{
"clip_fraction": 0.0,
"completion_length": 3569.8541870117188,
"epoch": 0.3394285714285714,
"grad_norm": 0.04541197791695595,
"kl": 2.245139330625534e-05,
"lambda_div_used": 0.5649159774184227,
"learning_rate": 4.842626371469149e-07,
"loss": 0.0071,
"reward": -0.43140796944499016,
"reward_after_mean": -0.43140796944499016,
"reward_after_std": 0.3433863054960966,
"reward_before_mean": -0.17089181207120419,
"reward_before_std": 0.29400468710809946,
"reward_change_max": 0.0,
"reward_change_mean": -0.26051616482436657,
"reward_change_min": -0.4413977116346359,
"reward_change_std": 0.15714262332767248,
"reward_std": 0.3433863129466772,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.2125584715977311,
"step": 297
},
{
"clip_fraction": 0.0,
"completion_length": 2834.479202270508,
"epoch": 0.3405714285714286,
"grad_norm": 0.07559877634048462,
"kl": 1.9371509552001953e-05,
"lambda_div_used": 0.5950812250375748,
"learning_rate": 4.811563736721829e-07,
"loss": -0.0217,
"reward": -0.13096300419420004,
"reward_after_mean": -0.13096300419420004,
"reward_after_std": 0.505608232691884,
"reward_before_mean": 0.22305661533027887,
"reward_before_std": 0.43375879526138306,
"reward_change_max": 0.0,
"reward_change_mean": -0.354019645601511,
"reward_change_min": -0.5659407489001751,
"reward_change_std": 0.20670694950968027,
"reward_std": 0.5056082457304001,
"rewards/accuracy_reward": 0.29166667349636555,
"rewards/cosine_scaled_reward": -0.0686100497841835,
"step": 298
},
{
"clip_fraction": 0.0,
"completion_length": 3288.5833587646484,
"epoch": 0.3417142857142857,
"grad_norm": 0.05182049795985222,
"kl": 3.013480454683304e-05,
"lambda_div_used": 0.5971293970942497,
"learning_rate": 4.780534655386743e-07,
"loss": -0.0035,
"reward": -0.1649590004235506,
"reward_after_mean": -0.1649590004235506,
"reward_after_std": 0.476910138502717,
"reward_before_mean": 0.15552489459514618,
"reward_before_std": 0.44229450821876526,
"reward_change_max": 0.0,
"reward_change_mean": -0.3204838838428259,
"reward_change_min": -0.5031619034707546,
"reward_change_std": 0.19439208041876554,
"reward_std": 0.47691015154123306,
"rewards/accuracy_reward": 0.20833334140479565,
"rewards/cosine_scaled_reward": -0.052808452397584915,
"step": 299
},
{
"clip_fraction": 0.0,
"completion_length": 3455.3958435058594,
"epoch": 0.34285714285714286,
"grad_norm": 0.059302717447280884,
"kl": 3.039836883544922e-05,
"lambda_div_used": 0.5659952610731125,
"learning_rate": 4.749540639777539e-07,
"loss": 0.0631,
"reward": -0.4138263203203678,
"reward_after_mean": -0.4138263203203678,
"reward_after_std": 0.35950295627117157,
"reward_before_mean": -0.14833886176347733,
"reward_before_std": 0.30170741491019726,
"reward_change_max": 0.0,
"reward_change_mean": -0.26548744924366474,
"reward_change_min": -0.45029690116643906,
"reward_change_std": 0.16043098457157612,
"reward_std": 0.35950295627117157,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.19000552780926228,
"step": 300
},
{
"clip_fraction": 0.0,
"completion_length": 2581.2917404174805,
"epoch": 0.344,
"grad_norm": 0.0829557478427887,
"kl": 1.7702579498291016e-05,
"lambda_div_used": 0.6020706444978714,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.0033,
"reward": -0.2905968725681305,
"reward_after_mean": -0.2905968725681305,
"reward_after_std": 0.522302333265543,
"reward_before_mean": -0.03232934419065714,
"reward_before_std": 0.47291796933859587,
"reward_change_max": 0.0,
"reward_change_mean": -0.2582675274461508,
"reward_change_min": -0.4681916609406471,
"reward_change_std": 0.16694430727511644,
"reward_std": 0.5223023407161236,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.13649601582437754,
"step": 301
},
{
"clip_fraction": 0.0,
"completion_length": 2299.250026702881,
"epoch": 0.34514285714285714,
"grad_norm": 0.09259702265262604,
"kl": 2.473965287208557e-05,
"lambda_div_used": 0.6064160838723183,
"learning_rate": 4.68766384637248e-07,
"loss": 0.0049,
"reward": 0.1023632986471057,
"reward_after_mean": 0.1023632986471057,
"reward_after_std": 0.5881610047072172,
"reward_before_mean": 0.5643375236541033,
"reward_before_std": 0.48729276517406106,
"reward_change_max": 0.0,
"reward_change_mean": -0.46197423338890076,
"reward_change_min": -0.6793596595525742,
"reward_change_std": 0.2643775464966893,
"reward_std": 0.588161014020443,
"rewards/accuracy_reward": 0.39583334513008595,
"rewards/cosine_scaled_reward": 0.16850417526438832,
"step": 302
},
{
"clip_fraction": 0.0,
"completion_length": 2496.604202270508,
"epoch": 0.3462857142857143,
"grad_norm": 0.10342813283205032,
"kl": 2.822279930114746e-05,
"lambda_div_used": 0.6173663139343262,
"learning_rate": 4.656784084364238e-07,
"loss": -0.0508,
"reward": -0.21569269057363272,
"reward_after_mean": -0.21569269057363272,
"reward_after_std": 0.5887450613081455,
"reward_before_mean": 0.048040480352938175,
"reward_before_std": 0.5433634500950575,
"reward_change_max": 0.0,
"reward_change_mean": -0.2637331634759903,
"reward_change_min": -0.4554433934390545,
"reward_change_std": 0.17205023765563965,
"reward_std": 0.5887450724840164,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.09779285965487361,
"step": 303
},
{
"clip_fraction": 0.0,
"completion_length": 2828.375030517578,
"epoch": 0.3474285714285714,
"grad_norm": 0.06273287534713745,
"kl": 3.515370190143585e-05,
"lambda_div_used": 0.5850896239280701,
"learning_rate": 4.6259454195101267e-07,
"loss": 0.0395,
"reward": -0.351226020604372,
"reward_after_mean": -0.351226020604372,
"reward_after_std": 0.4407376032322645,
"reward_before_mean": -0.091004628688097,
"reward_before_std": 0.39247662480920553,
"reward_change_max": 0.0,
"reward_change_mean": -0.2602214030921459,
"reward_change_min": -0.4692201763391495,
"reward_change_std": 0.16711712814867496,
"reward_std": 0.4407376106828451,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.1743379645049572,
"step": 304
},
{
"clip_fraction": 0.0,
"completion_length": 2963.270835876465,
"epoch": 0.3485714285714286,
"grad_norm": 0.07068406045436859,
"kl": 3.521144390106201e-05,
"lambda_div_used": 0.5555498078465462,
"learning_rate": 4.59514935484316e-07,
"loss": 0.0077,
"reward": -0.44899775832891464,
"reward_after_mean": -0.44899775832891464,
"reward_after_std": 0.32930242642760277,
"reward_before_mean": -0.17746215965598822,
"reward_before_std": 0.2488851365633309,
"reward_change_max": 0.0,
"reward_change_mean": -0.27153559774160385,
"reward_change_min": -0.41173194721341133,
"reward_change_std": 0.14773181919008493,
"reward_std": 0.32930243387818336,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.19829548941925168,
"step": 305
},
{
"clip_fraction": 0.0,
"completion_length": 2542.8750228881836,
"epoch": 0.3497142857142857,
"grad_norm": 0.09160702675580978,
"kl": 2.777576446533203e-05,
"lambda_div_used": 0.5710241869091988,
"learning_rate": 4.5643973913200837e-07,
"loss": 0.001,
"reward": -0.28438286297023296,
"reward_after_mean": -0.28438286297023296,
"reward_after_std": 0.45469350554049015,
"reward_before_mean": 0.05048683221684769,
"reward_before_std": 0.3197702756151557,
"reward_change_max": 0.0,
"reward_change_mean": -0.33486970886588097,
"reward_change_min": -0.46306542679667473,
"reward_change_std": 0.1733616916462779,
"reward_std": 0.45469350926578045,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.11617984622716904,
"step": 306
},
{
"clip_fraction": 0.0,
"completion_length": 2358.9167098999023,
"epoch": 0.35085714285714287,
"grad_norm": 0.08308543264865875,
"kl": 2.5270506739616394e-05,
"lambda_div_used": 0.6283661872148514,
"learning_rate": 4.5336910277482155e-07,
"loss": 0.0233,
"reward": -0.023870151489973068,
"reward_after_mean": -0.023870151489973068,
"reward_after_std": 0.5968872811645269,
"reward_before_mean": 0.2986760139465332,
"reward_before_std": 0.602552474476397,
"reward_change_max": 0.0,
"reward_change_mean": -0.3225461672991514,
"reward_change_min": -0.5788566246628761,
"reward_change_std": 0.22817020770162344,
"reward_std": 0.5968873165547848,
"rewards/accuracy_reward": 0.2916666716337204,
"rewards/cosine_scaled_reward": 0.007009351626038551,
"step": 307
},
{
"clip_fraction": 0.0,
"completion_length": 3364.187530517578,
"epoch": 0.352,
"grad_norm": 0.04729427769780159,
"kl": 1.2226402759552002e-05,
"lambda_div_used": 0.5575956255197525,
"learning_rate": 4.503031760712397e-07,
"loss": -0.0147,
"reward": -0.364587739109993,
"reward_after_mean": -0.364587739109993,
"reward_after_std": 0.3172164801508188,
"reward_before_mean": -0.06630371138453484,
"reward_before_std": 0.25998193118721247,
"reward_change_max": 0.0,
"reward_change_mean": -0.29828402772545815,
"reward_change_min": -0.4606154337525368,
"reward_change_std": 0.17100436985492706,
"reward_std": 0.31721648946404457,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.14963705092668533,
"step": 308
},
{
"clip_fraction": 0.0,
"completion_length": 3246.750030517578,
"epoch": 0.35314285714285715,
"grad_norm": 0.054204076528549194,
"kl": 1.4858320355415344e-05,
"lambda_div_used": 0.6364770829677582,
"learning_rate": 4.4724210845020494e-07,
"loss": 0.0936,
"reward": -0.09930397570133209,
"reward_after_mean": -0.09930397570133209,
"reward_after_std": 0.6563027147203684,
"reward_before_mean": 0.18065341375768185,
"reward_before_std": 0.6305609010159969,
"reward_change_max": 0.0,
"reward_change_mean": -0.27995740808546543,
"reward_change_min": -0.5127242244780064,
"reward_change_std": 0.19019456766545773,
"reward_std": 0.6563027296215296,
"rewards/accuracy_reward": 0.2083333358168602,
"rewards/cosine_scaled_reward": -0.027679930441081524,
"step": 309
},
{
"clip_fraction": 0.0,
"completion_length": 2454.750015258789,
"epoch": 0.35428571428571426,
"grad_norm": 0.10188445448875427,
"kl": 3.9263395592570305e-05,
"lambda_div_used": 0.5863905549049377,
"learning_rate": 4.441860491038345e-07,
"loss": -0.0091,
"reward": -0.358197920024395,
"reward_after_mean": -0.358197920024395,
"reward_after_std": 0.4752412661910057,
"reward_before_mean": -0.10783447185531259,
"reward_before_std": 0.39106855262070894,
"reward_change_max": 0.0,
"reward_change_mean": -0.25036344304680824,
"reward_change_min": -0.36833222955465317,
"reward_change_std": 0.13517011515796185,
"reward_std": 0.47524126805365086,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.17033447965513915,
"step": 310
},
{
"clip_fraction": 0.0,
"completion_length": 2549.5000228881836,
"epoch": 0.3554285714285714,
"grad_norm": 0.07070475816726685,
"kl": 1.1476688086986542e-05,
"lambda_div_used": 0.6028245091438293,
"learning_rate": 4.4113514698014953e-07,
"loss": -0.0234,
"reward": -0.07257996033877134,
"reward_after_mean": -0.07257996033877134,
"reward_after_std": 0.5427698846906424,
"reward_before_mean": 0.32260693423449993,
"reward_before_std": 0.4735551681369543,
"reward_change_max": 0.0,
"reward_change_mean": -0.3951868824660778,
"reward_change_min": -0.6274620294570923,
"reward_change_std": 0.2447280865162611,
"reward_std": 0.5427699014544487,
"rewards/accuracy_reward": 0.3125000037252903,
"rewards/cosine_scaled_reward": 0.010106915608048439,
"step": 311
},
{
"clip_fraction": 0.0,
"completion_length": 2062.020851135254,
"epoch": 0.3565714285714286,
"grad_norm": 0.0864594504237175,
"kl": 2.2163614630699158e-05,
"lambda_div_used": 0.5457132831215858,
"learning_rate": 4.3808955077581546e-07,
"loss": 0.0122,
"reward": -0.03406492620706558,
"reward_after_mean": -0.03406492620706558,
"reward_after_std": 0.3851391561329365,
"reward_before_mean": 0.5170513242483139,
"reward_before_std": 0.20092127658426762,
"reward_change_max": 0.0,
"reward_change_mean": -0.5511162541806698,
"reward_change_min": -0.7572819888591766,
"reward_change_std": 0.2855915669351816,
"reward_std": 0.38513917103409767,
"rewards/accuracy_reward": 0.375,
"rewards/cosine_scaled_reward": 0.1420513167977333,
"step": 312
},
{
"clip_fraction": 0.0,
"completion_length": 2939.958351135254,
"epoch": 0.3577142857142857,
"grad_norm": 0.06731264293193817,
"kl": 2.4283304810523987e-05,
"lambda_div_used": 0.5877447426319122,
"learning_rate": 4.350494089288943e-07,
"loss": -0.0439,
"reward": -0.15467195864766836,
"reward_after_mean": -0.15467195864766836,
"reward_after_std": 0.522945849224925,
"reward_before_mean": 0.20540180057287216,
"reward_before_std": 0.40051606576889753,
"reward_change_max": 0.0,
"reward_change_mean": -0.3600737862288952,
"reward_change_min": -0.5124829597771168,
"reward_change_std": 0.19857864920049906,
"reward_std": 0.5229458846151829,
"rewards/accuracy_reward": 0.2291666679084301,
"rewards/cosine_scaled_reward": -0.02376485476270318,
"step": 313
},
{
"clip_fraction": 0.0,
"completion_length": 2541.18754196167,
"epoch": 0.3588571428571429,
"grad_norm": 0.08012691885232925,
"kl": 3.474205732345581e-05,
"lambda_div_used": 0.607881672680378,
"learning_rate": 4.3201486961161093e-07,
"loss": 0.0168,
"reward": -0.20861644856631756,
"reward_after_mean": -0.20861644856631756,
"reward_after_std": 0.5089061111211777,
"reward_before_mean": 0.08002243563532829,
"reward_before_std": 0.49900877848267555,
"reward_change_max": 0.0,
"reward_change_mean": -0.28863888792693615,
"reward_change_min": -0.548176895827055,
"reward_change_std": 0.2009973768144846,
"reward_std": 0.5089061167091131,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/cosine_scaled_reward": -0.08664422971196473,
"step": 314
},
{
"clip_fraction": 0.0,
"completion_length": 3124.3333740234375,
"epoch": 0.36,
"grad_norm": 0.0558871328830719,
"kl": 1.9735191017389297e-05,
"lambda_div_used": 0.5734824016690254,
"learning_rate": 4.2898608072313045e-07,
"loss": 0.0494,
"reward": -0.1729298224672675,
"reward_after_mean": -0.1729298224672675,
"reward_after_std": 0.45045475475490093,
"reward_before_mean": 0.20758753083646297,
"reward_before_std": 0.33522335812449455,
"reward_change_max": 0.0,
"reward_change_mean": -0.3805173486471176,
"reward_change_min": -0.5440769977867603,
"reward_change_std": 0.20876472163945436,
"reward_std": 0.4504547640681267,
"rewards/accuracy_reward": 0.22916666977107525,
"rewards/cosine_scaled_reward": -0.02157914894632995,
"step": 315
},
{
"clip_fraction": 0.0,
"completion_length": 3509.5416870117188,
"epoch": 0.36114285714285715,
"grad_norm": 0.05331319198012352,
"kl": 2.981536090373993e-05,
"lambda_div_used": 0.5893311724066734,
"learning_rate": 4.2596318988235037e-07,
"loss": 0.0135,
"reward": -0.22396450489759445,
"reward_after_mean": -0.22396450489759445,
"reward_after_std": 0.43868801929056644,
"reward_before_mean": 0.07456053979694843,
"reward_before_std": 0.414919788017869,
"reward_change_max": 0.0,
"reward_change_mean": -0.29852502048015594,
"reward_change_min": -0.4823254942893982,
"reward_change_std": 0.1912739286199212,
"reward_std": 0.43868803791701794,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/cosine_scaled_reward": -0.09210613742470741,
"step": 316
},
{
"clip_fraction": 0.0,
"completion_length": 3099.5833435058594,
"epoch": 0.36228571428571427,
"grad_norm": 0.06200157478451729,
"kl": 3.091990947723389e-05,
"lambda_div_used": 0.5825675800442696,
"learning_rate": 4.2294634442070553e-07,
"loss": 0.0437,
"reward": -0.35854507237672806,
"reward_after_mean": -0.35854507237672806,
"reward_after_std": 0.43180895783007145,
"reward_before_mean": -0.102005859836936,
"reward_before_std": 0.3769839182496071,
"reward_change_max": 0.0,
"reward_change_mean": -0.25653921999037266,
"reward_change_min": -0.4170425795018673,
"reward_change_std": 0.15367608424276114,
"reward_std": 0.4318089634180069,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/cosine_scaled_reward": -0.18533920124173164,
"step": 317
},
{
"clip_fraction": 0.0,
"completion_length": 2281.000068664551,
"epoch": 0.36342857142857143,
"grad_norm": 0.0932127833366394,
"kl": 3.602728247642517e-05,
"lambda_div_used": 0.6223095878958702,
"learning_rate": 4.1993569137498776e-07,
"loss": 0.0631,
"reward": -0.07363937655463815,
"reward_after_mean": -0.07363937655463815,
"reward_after_std": 0.5691333152353764,
"reward_before_mean": 0.24711408838629723,
"reward_before_std": 0.5656038168817759,
"reward_change_max": 0.0,
"reward_change_mean": -0.3207534924149513,
"reward_change_min": -0.5814453661441803,
"reward_change_std": 0.2219030074775219,
"reward_std": 0.569133322685957,
"rewards/accuracy_reward": 0.2500000074505806,
"rewards/cosine_scaled_reward": -0.0028859074227511883,
"step": 318
},
{
"clip_fraction": 0.0,
"completion_length": 2854.5625076293945,
"epoch": 0.36457142857142855,
"grad_norm": 0.08494079858064651,
"kl": 2.7507543563842773e-05,
"lambda_div_used": 0.5380512997508049,
"learning_rate": 4.1693137748017915e-07,
"loss": -0.0389,
"reward": -0.4965253435075283,
"reward_after_mean": -0.4965253435075283,
"reward_after_std": 0.2333353590220213,
"reward_before_mean": -0.2120634987950325,
"reward_before_std": 0.16711975168436766,
"reward_change_max": 0.0,
"reward_change_mean": -0.28446184657514095,
"reward_change_min": -0.4124374948441982,
"reward_change_std": 0.15511877462267876,
"reward_std": 0.2333353627473116,
"rewards/accuracy_reward": 0.0,
"rewards/cosine_scaled_reward": -0.2120634950697422,
"step": 319
},
{
"clip_fraction": 0.0,
"completion_length": 1964.8958740234375,
"epoch": 0.3657142857142857,
"grad_norm": 0.09068436175584793,
"kl": 3.33394855260849e-05,
"lambda_div_used": 0.6160280704498291,
"learning_rate": 4.1393354916230005e-07,
"loss": 0.0117,
"reward": -0.19740011962130666,
"reward_after_mean": -0.19740011962130666,
"reward_after_std": 0.5963947810232639,
"reward_before_mean": 0.0693174353800714,
"reward_before_std": 0.5336382519453764,
"reward_change_max": 0.0,
"reward_change_mean": -0.26671755872666836,
"reward_change_min": -0.4143567681312561,
"reward_change_std": 0.15817437414079905,
"reward_std": 0.5963947977870703,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/cosine_scaled_reward": -0.05568256159313023,
"step": 320
},
{
"clip_fraction": 0.0,
"completion_length": 1974.8958778381348,
"epoch": 0.3668571428571429,
"grad_norm": 0.0693729817867279,
"kl": 1.574307680130005e-05,
"lambda_div_used": 0.6290735602378845,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.0387,
"reward": 0.11023577488958836,
"reward_after_mean": 0.11023577488958836,
"reward_after_std": 0.6442763805389404,
"reward_before_mean": 0.5218545235693455,
"reward_before_std": 0.6013733670115471,
"reward_change_max": 0.0,
"reward_change_mean": -0.4116187300533056,
"reward_change_min": -0.6836260408163071,
"reward_change_std": 0.2705372450873256,
"reward_std": 0.6442763898521662,
"rewards/accuracy_reward": 0.3958333395421505,
"rewards/cosine_scaled_reward": 0.12602117005735636,
"step": 321
},
{
"clip_fraction": 0.0,
"completion_length": 2837.833381652832,
"epoch": 0.368,
"grad_norm": 0.10218925029039383,
"kl": 4.9501657485961914e-05,
"lambda_div_used": 0.6059064492583275,
"learning_rate": 4.079579333738039e-07,
"loss": -0.0178,
"reward": -0.2954628551378846,
"reward_after_mean": -0.2954628551378846,
"reward_after_std": 0.5665331184864044,
"reward_before_mean": -0.04816420469433069,
"reward_before_std": 0.48093545995652676,
"reward_change_max": 0.0,
"reward_change_mean": -0.24729863554239273,
"reward_change_min": -0.3516420107334852,
"reward_change_std": 0.13184416200965643,
"reward_std": 0.5665331222116947,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.13149754563346505,
"step": 322
},
{
"clip_fraction": 0.0,
"completion_length": 3015.666717529297,
"epoch": 0.36914285714285716,
"grad_norm": 0.0596209317445755,
"kl": 1.7877668142318726e-05,
"lambda_div_used": 0.5565166473388672,
"learning_rate": 4.0498043714627006e-07,
"loss": 0.0465,
"reward": -0.16177266091108322,
"reward_after_mean": -0.16177266091108322,
"reward_after_std": 0.42436218820512295,
"reward_before_mean": 0.2816486116498709,
"reward_before_std": 0.2578566027805209,
"reward_change_max": 0.0,
"reward_change_mean": -0.4434212874621153,
"reward_change_min": -0.6335049495100975,
"reward_change_std": 0.23841418512165546,
"reward_std": 0.4243622049689293,
"rewards/accuracy_reward": 0.2916666679084301,
"rewards/cosine_scaled_reward": -0.010018057189881802,
"step": 323
},
{
"clip_fraction": 0.0,
"completion_length": 2751.6458435058594,
"epoch": 0.3702857142857143,
"grad_norm": 0.10376841574907303,
"kl": 2.1306797862052917e-05,
"lambda_div_used": 0.5670785158872604,
"learning_rate": 4.020100089676376e-07,
"loss": -0.0038,
"reward": -0.10797288408502936,
"reward_after_mean": -0.10797288408502936,
"reward_after_std": 0.4300071895122528,
"reward_before_mean": 0.3295288155786693,
"reward_before_std": 0.3011532872915268,
"reward_change_max": 0.0,
"reward_change_mean": -0.43750171549618244,
"reward_change_min": -0.6185614429414272,
"reward_change_std": 0.2333526872098446,
"reward_std": 0.43000719882547855,
"rewards/accuracy_reward": 0.25000000558793545,
"rewards/cosine_scaled_reward": 0.07952881557866931,
"step": 324
},
{
"clip_fraction": 0.0,
"completion_length": 2943.8958587646484,
"epoch": 0.37142857142857144,
"grad_norm": 0.07239633053541183,
"kl": 9.275972843170166e-06,
"lambda_div_used": 0.6356078162789345,
"learning_rate": 3.9904679361238526e-07,
"loss": 0.0667,
"reward": -0.050798285752534866,
"reward_after_mean": -0.050798285752534866,
"reward_after_std": 0.6440786644816399,
"reward_before_mean": 0.24087253957986832,
"reward_before_std": 0.6331603992730379,
"reward_change_max": 0.0,
"reward_change_mean": -0.2916708290576935,
"reward_change_min": -0.4932599440217018,
"reward_change_std": 0.20057004038244486,
"reward_std": 0.6440786886960268,
"rewards/accuracy_reward": 0.2708333395421505,
"rewards/cosine_scaled_reward": -0.02996080555021763,
"step": 325
},
{
"clip_fraction": 0.0,
"completion_length": 2375.645851135254,
"epoch": 0.37257142857142855,
"grad_norm": 0.09299691766500473,
"kl": 2.9705464839935303e-05,
"lambda_div_used": 0.5330315083265305,
"learning_rate": 3.9609093550344907e-07,
"loss": -0.0979,
"reward": -0.20862240344285965,
"reward_after_mean": -0.20862240344285965,
"reward_after_std": 0.3296848703175783,
"reward_before_mean": 0.2517512815538794,
"reward_before_std": 0.1450797226279974,
"reward_change_max": 0.0,
"reward_change_mean": -0.460373692214489,
"reward_change_min": -0.6264396719634533,
"reward_change_std": 0.23280893173068762,
"reward_std": 0.3296848740428686,
"rewards/accuracy_reward": 0.25,
"rewards/cosine_scaled_reward": 0.001751287141814828,
"step": 326
},
{
"clip_fraction": 0.0,
"completion_length": 2810.187511444092,
"epoch": 0.3737142857142857,
"grad_norm": 0.06210591271519661,
"kl": 8.400529623031616e-06,
"lambda_div_used": 0.5783005133271217,
"learning_rate": 3.931425787051832e-07,
"loss": -0.0483,
"reward": -0.11793380603194237,
"reward_after_mean": -0.11793380603194237,
"reward_after_std": 0.45353105291724205,
"reward_before_mean": 0.2935123089700937,
"reward_before_std": 0.3566841436550021,
"reward_change_max": 0.0,
"reward_change_mean": -0.41144610941410065,
"reward_change_min": -0.6089312434196472,
"reward_change_std": 0.23675687983632088,
"reward_std": 0.4535310585051775,
"rewards/accuracy_reward": 0.2708333395421505,
"rewards/cosine_scaled_reward": 0.022678975947201252,
"step": 327
},
{
"clip_fraction": 0.0,
"completion_length": 3518.8958435058594,
"epoch": 0.37485714285714283,
"grad_norm": 0.05192619562149048,
"kl": 2.485513687133789e-05,
"lambda_div_used": 0.5715365409851074,
"learning_rate": 3.902018669163384e-07,
"loss": 0.0155,
"reward": -0.4497411046177149,
"reward_after_mean": -0.4497411046177149,
"reward_after_std": 0.40177739411592484,
"reward_before_mean": -0.21567542850971222,
"reward_before_std": 0.32284149527549744,
"reward_change_max": 0.0,
"reward_change_mean": -0.23406569100916386,
"reward_change_min": -0.33814629539847374,
"reward_change_std": 0.12489900179207325,
"reward_std": 0.40177739597857,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.2573420889675617,
"step": 328
},
{
"clip_fraction": 0.0,
"completion_length": 2243.0416946411133,
"epoch": 0.376,
"grad_norm": 0.10950610786676407,
"kl": 2.469681203365326e-05,
"lambda_div_used": 0.6176783889532089,
"learning_rate": 3.872689434630585e-07,
"loss": -0.0562,
"reward": -0.013319691643118858,
"reward_after_mean": -0.013319691643118858,
"reward_after_std": 0.5674807205796242,
"reward_before_mean": 0.3346351385116577,
"reward_before_std": 0.5435493532568216,
"reward_change_max": 0.0,
"reward_change_mean": -0.34795483760535717,
"reward_change_min": -0.5437430925667286,
"reward_change_std": 0.22110824659466743,
"reward_std": 0.5674807410687208,
"rewards/accuracy_reward": 0.27083334140479565,
"rewards/cosine_scaled_reward": 0.06380179291591048,
"step": 329
},
{
"clip_fraction": 0.0,
"completion_length": 2141.479190826416,
"epoch": 0.37714285714285717,
"grad_norm": 0.10051169991493225,
"kl": 4.89354133605957e-05,
"lambda_div_used": 0.5594945177435875,
"learning_rate": 3.843439512918949e-07,
"loss": -0.0064,
"reward": -0.43674849811941385,
"reward_after_mean": -0.43674849811941385,
"reward_after_std": 0.32430145144462585,
"reward_before_mean": -0.17029727809131145,
"reward_before_std": 0.2681279256939888,
"reward_change_max": 0.0,
"reward_change_mean": -0.266451220959425,
"reward_change_min": -0.44791119545698166,
"reward_change_std": 0.1575082140043378,
"reward_std": 0.32430145516991615,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.2119639493757859,
"step": 330
},
{
"clip_fraction": 0.0,
"completion_length": 2564.604202270508,
"epoch": 0.3782857142857143,
"grad_norm": 0.12382425367832184,
"kl": 4.225596785545349e-05,
"lambda_div_used": 0.5574543103575706,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.0413,
"reward": -0.41342686861753464,
"reward_after_mean": -0.41342686861753464,
"reward_after_std": 0.33145094849169254,
"reward_before_mean": -0.13134576752781868,
"reward_before_std": 0.25930391903966665,
"reward_change_max": 0.0,
"reward_change_mean": -0.2820810880511999,
"reward_change_min": -0.4496995583176613,
"reward_change_std": 0.16215670108795166,
"reward_std": 0.33145095966756344,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.17301243357360363,
"step": 331
},
{
"clip_fraction": 0.0,
"completion_length": 2420.6666870117188,
"epoch": 0.37942857142857145,
"grad_norm": 0.07938601076602936,
"kl": 2.1241605281829834e-05,
"lambda_div_used": 0.5416555106639862,
"learning_rate": 3.785183306423767e-07,
"loss": 0.0287,
"reward": -0.21761326864361763,
"reward_after_mean": -0.21761326864361763,
"reward_after_std": 0.34953613951802254,
"reward_before_mean": 0.2352729644626379,
"reward_before_std": 0.18298226408660412,
"reward_change_max": 0.0,
"reward_change_mean": -0.45288625732064247,
"reward_change_min": -0.6215734221041203,
"reward_change_std": 0.2345103258267045,
"reward_std": 0.34953614324331284,
"rewards/accuracy_reward": 0.25,
"rewards/cosine_scaled_reward": -0.014727018773555756,
"step": 332
},
{
"clip_fraction": 0.0,
"completion_length": 2350.6875381469727,
"epoch": 0.38057142857142856,
"grad_norm": 0.08657004684209824,
"kl": 2.1725893020629883e-05,
"lambda_div_used": 0.6729468256235123,
"learning_rate": 3.7561798609655373e-07,
"loss": 0.0369,
"reward": -0.01155824027955532,
"reward_after_mean": -0.01155824027955532,
"reward_after_std": 0.8104779217392206,
"reward_before_mean": 0.23989354074001312,
"reward_before_std": 0.8117707390338182,
"reward_change_max": 0.0,
"reward_change_mean": -0.2514517717063427,
"reward_change_min": -0.5040576457977295,
"reward_change_std": 0.19348772894591093,
"reward_std": 0.810477938503027,
"rewards/accuracy_reward": 0.2708333395421505,
"rewards/cosine_scaled_reward": -0.030939804390072823,
"step": 333
},
{
"clip_fraction": 0.0,
"completion_length": 3491.6041870117188,
"epoch": 0.38171428571428573,
"grad_norm": 0.05208956450223923,
"kl": 1.1973665095865726e-05,
"lambda_div_used": 0.5869207382202148,
"learning_rate": 3.72726140684072e-07,
"loss": -0.009,
"reward": -0.2944636158645153,
"reward_after_mean": -0.2944636158645153,
"reward_after_std": 0.43844909220933914,
"reward_before_mean": -0.013160821050405502,
"reward_before_std": 0.40495526185259223,
"reward_change_max": 0.0,
"reward_change_mean": -0.2813027612864971,
"reward_change_min": -0.4717176593840122,
"reward_change_std": 0.1835445323958993,
"reward_std": 0.4384490940719843,
"rewards/accuracy_reward": 0.125,
"rewards/cosine_scaled_reward": -0.13816083781421185,
"step": 334
},
{
"clip_fraction": 0.0,
"completion_length": 2589.0833892822266,
"epoch": 0.38285714285714284,
"grad_norm": 0.0683097094297409,
"kl": 1.9595026969909668e-05,
"lambda_div_used": 0.6027273386716843,
"learning_rate": 3.6984293534939737e-07,
"loss": -0.0488,
"reward": 0.005996011197566986,
"reward_after_mean": 0.005996011197566986,
"reward_after_std": 0.5422599408775568,
"reward_before_mean": 0.41999348998069763,
"reward_before_std": 0.4701922629028559,
"reward_change_max": 0.0,
"reward_change_mean": -0.41399746760725975,
"reward_change_min": -0.623339332640171,
"reward_change_std": 0.24341293703764677,
"reward_std": 0.5422599650919437,
"rewards/accuracy_reward": 0.33333334140479565,
"rewards/cosine_scaled_reward": 0.08666014112532139,
"step": 335
},
{
"clip_fraction": 0.0,
"completion_length": 3162.1458740234375,
"epoch": 0.384,
"grad_norm": 0.067026786506176,
"kl": 2.0024715922772884e-05,
"lambda_div_used": 0.5950900241732597,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.0138,
"reward": -0.16571878641843796,
"reward_after_mean": -0.16571878641843796,
"reward_after_std": 0.4802239239215851,
"reward_before_mean": 0.14471458829939365,
"reward_before_std": 0.4368621027097106,
"reward_change_max": 0.0,
"reward_change_mean": -0.31043340265750885,
"reward_change_min": -0.45523249730467796,
"reward_change_std": 0.18632046319544315,
"reward_std": 0.48022393323481083,
"rewards/accuracy_reward": 0.20833334140479565,
"rewards/cosine_scaled_reward": -0.06361875729635358,
"step": 336
},
{
"clip_fraction": 0.0,
"completion_length": 3228.916717529297,
"epoch": 0.3851428571428571,
"grad_norm": 0.05528295040130615,
"kl": 1.9825994968414307e-05,
"lambda_div_used": 0.6006625667214394,
"learning_rate": 3.641030065789562e-07,
"loss": 0.0552,
"reward": -0.2433483125641942,
"reward_after_mean": -0.2433483125641942,
"reward_after_std": 0.5013067033141851,
"reward_before_mean": 0.03825543075799942,
"reward_before_std": 0.4624515902251005,
"reward_change_max": 0.0,
"reward_change_mean": -0.28160375356674194,
"reward_change_min": -0.4847262054681778,
"reward_change_std": 0.17836903873831034,
"reward_std": 0.501306714490056,
"rewards/accuracy_reward": 0.14583333767950535,
"rewards/cosine_scaled_reward": -0.10757789574563503,
"step": 337
},
{
"clip_fraction": 0.0,
"completion_length": 2393.645866394043,
"epoch": 0.3862857142857143,
"grad_norm": 0.10292567312717438,
"kl": 3.5159289836883545e-05,
"lambda_div_used": 0.6184682846069336,
"learning_rate": 3.612465628992203e-07,
"loss": 0.1082,
"reward": -0.010080082342028618,
"reward_after_mean": -0.010080082342028618,
"reward_after_std": 0.6298389285802841,
"reward_before_mean": 0.3685674872249365,
"reward_before_std": 0.546540604904294,
"reward_change_max": 0.0,
"reward_change_mean": -0.3786476030945778,
"reward_change_min": -0.5825354494154453,
"reward_change_std": 0.2281006295233965,
"reward_std": 0.6298389509320259,
"rewards/accuracy_reward": 0.3125000037252903,
"rewards/cosine_scaled_reward": 0.05606749979779124,
"step": 338
},
{
"clip_fraction": 0.0,
"completion_length": 3026.9583892822266,
"epoch": 0.38742857142857146,
"grad_norm": 0.0688738226890564,
"kl": 2.4460256099700928e-05,
"lambda_div_used": 0.5621443763375282,
"learning_rate": 3.5839931879571725e-07,
"loss": -0.0178,
"reward": -0.32898029685020447,
"reward_after_mean": -0.32898029685020447,
"reward_after_std": 0.3227359801530838,
"reward_before_mean": -0.01589403674006462,
"reward_before_std": 0.2781273443251848,
"reward_change_max": 0.0,
"reward_change_mean": -0.31308628246188164,
"reward_change_min": -0.48185204714536667,
"reward_change_std": 0.1813461184501648,
"reward_std": 0.3227359913289547,
"rewards/accuracy_reward": 0.1041666716337204,
"rewards/cosine_scaled_reward": -0.12006070092320442,
"step": 339
},
{
"clip_fraction": 0.0,
"completion_length": 2447.937515258789,
"epoch": 0.38857142857142857,
"grad_norm": 0.10290948301553726,
"kl": 2.363821113249287e-05,
"lambda_div_used": 0.5592405423521996,
"learning_rate": 3.555614130391079e-07,
"loss": -0.0046,
"reward": -0.42997122183442116,
"reward_after_mean": -0.42997122183442116,
"reward_after_std": 0.33643546886742115,
"reward_before_mean": -0.1512418081983924,
"reward_before_std": 0.2663228642195463,
"reward_change_max": 0.0,
"reward_change_mean": -0.2787294201552868,
"reward_change_min": -0.4129678010940552,
"reward_change_std": 0.1523351836949587,
"reward_std": 0.3364354781806469,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.17207514215260744,
"step": 340
},
{
"clip_fraction": 0.0,
"completion_length": 2488.9583587646484,
"epoch": 0.38971428571428574,
"grad_norm": 0.061042170971632004,
"kl": 1.909211277961731e-05,
"lambda_div_used": 0.5834595933556557,
"learning_rate": 3.5273298394491515e-07,
"loss": -0.0023,
"reward": 0.009830114664509892,
"reward_after_mean": 0.009830114664509892,
"reward_after_std": 0.5199873205274343,
"reward_before_mean": 0.4715769328176975,
"reward_before_std": 0.38039534725248814,
"reward_change_max": 0.0,
"reward_change_mean": -0.46174679696559906,
"reward_change_min": -0.6357490979135036,
"reward_change_std": 0.2529716519638896,
"reward_std": 0.5199873335659504,
"rewards/accuracy_reward": 0.33333334140479565,
"rewards/cosine_scaled_reward": 0.13824361143633723,
"step": 341
},
{
"clip_fraction": 0.0,
"completion_length": 3020.520854949951,
"epoch": 0.39085714285714285,
"grad_norm": 0.07325860857963562,
"kl": 2.4762004613876343e-05,
"lambda_div_used": 0.5985761880874634,
"learning_rate": 3.4991416936678276e-07,
"loss": 0.0007,
"reward": -0.06264204811304808,
"reward_after_mean": -0.06264204811304808,
"reward_after_std": 0.5387034490704536,
"reward_before_mean": 0.33080895524472,
"reward_before_std": 0.45346040464937687,
"reward_change_max": 0.0,
"reward_change_mean": -0.3934510052204132,
"reward_change_min": -0.5952773988246918,
"reward_change_std": 0.23232300952076912,
"reward_std": 0.5387034583836794,
"rewards/accuracy_reward": 0.29166666977107525,
"rewards/cosine_scaled_reward": 0.03914228640496731,
"step": 342
},
{
"clip_fraction": 0.0,
"completion_length": 3405.1875610351562,
"epoch": 0.392,
"grad_norm": 0.04665439575910568,
"kl": 2.7433037757873535e-05,
"lambda_div_used": 0.6150833070278168,
"learning_rate": 3.471051066897562e-07,
"loss": 0.0509,
"reward": -0.1582264108583331,
"reward_after_mean": -0.1582264108583331,
"reward_after_std": 0.5790915079414845,
"reward_before_mean": 0.13680532574653625,
"reward_before_std": 0.5264813583344221,
"reward_change_max": 0.0,
"reward_change_mean": -0.29503174126148224,
"reward_change_min": -0.4708465002477169,
"reward_change_std": 0.1767220702022314,
"reward_std": 0.5790915191173553,
"rewards/accuracy_reward": 0.1875000074505806,
"rewards/cosine_scaled_reward": -0.050694676116108894,
"step": 343
},
{
"clip_fraction": 0.0,
"completion_length": 2360.875045776367,
"epoch": 0.3931428571428571,
"grad_norm": 0.07271434366703033,
"kl": 1.041218638420105e-05,
"lambda_div_used": 0.5770246461033821,
"learning_rate": 3.4430593282358777e-07,
"loss": -0.0008,
"reward": 0.04454847797751427,
"reward_after_mean": 0.04454847797751427,
"reward_after_std": 0.5018230397254229,
"reward_before_mean": 0.5492894258350134,
"reward_before_std": 0.35145203582942486,
"reward_change_max": 0.0,
"reward_change_mean": -0.5047409404069185,
"reward_change_min": -0.7455123476684093,
"reward_change_std": 0.2790954224765301,
"reward_std": 0.5018230620771646,
"rewards/accuracy_reward": 0.4375,
"rewards/cosine_scaled_reward": 0.1117894072085619,
"step": 344
},
{
"clip_fraction": 0.0,
"completion_length": 3030.0833740234375,
"epoch": 0.3942857142857143,
"grad_norm": 0.0703606829047203,
"kl": 3.1925737857818604e-05,
"lambda_div_used": 0.5804503262042999,
"learning_rate": 3.4151678419606233e-07,
"loss": 0.0646,
"reward": -0.22463034093379974,
"reward_after_mean": -0.22463034093379974,
"reward_after_std": 0.41369511373341084,
"reward_before_mean": 0.09700941108167171,
"reward_before_std": 0.37086532823741436,
"reward_change_max": 0.0,
"reward_change_mean": -0.32163975574076176,
"reward_change_min": -0.5076877251267433,
"reward_change_std": 0.1956562791019678,
"reward_std": 0.4136951379477978,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/cosine_scaled_reward": -0.06965726800262928,
"step": 345
},
{
"clip_fraction": 0.0,
"completion_length": 3186.6458740234375,
"epoch": 0.3954285714285714,
"grad_norm": 0.05292130261659622,
"kl": 1.4215707778930664e-05,
"lambda_div_used": 0.5958857163786888,
"learning_rate": 3.387377967463493e-07,
"loss": 0.0331,
"reward": -0.24454469978809357,
"reward_after_mean": -0.24454469978809357,
"reward_after_std": 0.48920249938964844,
"reward_before_mean": 0.04633236164227128,
"reward_before_std": 0.44041235372424126,
"reward_change_max": 0.0,
"reward_change_mean": -0.29087705351412296,
"reward_change_min": -0.4697858430445194,
"reward_change_std": 0.179568306542933,
"reward_std": 0.4892025087028742,
"rewards/accuracy_reward": 0.16666667349636555,
"rewards/cosine_scaled_reward": -0.12033431546296924,
"step": 346
},
{
"clip_fraction": 0.0,
"completion_length": 3055.4583587646484,
"epoch": 0.3965714285714286,
"grad_norm": 0.07354738563299179,
"kl": 6.0675665736198425e-06,
"lambda_div_used": 0.5582710355520248,
"learning_rate": 3.359691059183761e-07,
"loss": 0.0132,
"reward": -0.42621116526424885,
"reward_after_mean": -0.42621116526424885,
"reward_after_std": 0.3322943150997162,
"reward_before_mean": -0.15516536496579647,
"reward_before_std": 0.2633455842733383,
"reward_change_max": 0.0,
"reward_change_mean": -0.2710457965731621,
"reward_change_min": -0.4307188205420971,
"reward_change_std": 0.15532648842781782,
"reward_std": 0.33229432441294193,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.19683203659951687,
"step": 347
},
{
"clip_fraction": 0.0,
"completion_length": 2852.4583435058594,
"epoch": 0.3977142857142857,
"grad_norm": 0.09605983644723892,
"kl": 3.167241811752319e-05,
"lambda_div_used": 0.5788434967398643,
"learning_rate": 3.3321084665422803e-07,
"loss": -0.0017,
"reward": -0.2087371125817299,
"reward_after_mean": -0.2087371125817299,
"reward_after_std": 0.40125221759080887,
"reward_before_mean": 0.12491290923207998,
"reward_before_std": 0.36025606002658606,
"reward_change_max": 0.0,
"reward_change_mean": -0.3336500097066164,
"reward_change_min": -0.5146553069353104,
"reward_change_std": 0.20004253275692463,
"reward_std": 0.40125224366784096,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/cosine_scaled_reward": -0.04175376985222101,
"step": 348
},
{
"clip_fraction": 0.0,
"completion_length": 2964.875015258789,
"epoch": 0.39885714285714285,
"grad_norm": 0.06869405508041382,
"kl": 1.731887459754944e-05,
"lambda_div_used": 0.5606642514467239,
"learning_rate": 3.3046315338757026e-07,
"loss": 0.0031,
"reward": -0.2807905152440071,
"reward_after_mean": -0.2807905152440071,
"reward_after_std": 0.39816635474562645,
"reward_before_mean": 0.08202904835343361,
"reward_before_std": 0.2763519547879696,
"reward_change_max": 0.0,
"reward_change_mean": -0.36281956918537617,
"reward_change_min": -0.54488330706954,
"reward_change_std": 0.20150058157742023,
"reward_std": 0.39816636219620705,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.0846376121044159,
"step": 349
},
{
"clip_fraction": 0.0,
"completion_length": 2521.9791870117188,
"epoch": 0.4,
"grad_norm": 0.10303740203380585,
"kl": 2.6671215891838074e-05,
"lambda_div_used": 0.6121688261628151,
"learning_rate": 3.2772616003709616e-07,
"loss": -0.0711,
"reward": -0.04820730444043875,
"reward_after_mean": -0.04820730444043875,
"reward_after_std": 0.5604431573301554,
"reward_before_mean": 0.287683189380914,
"reward_before_std": 0.5214535812847316,
"reward_change_max": 0.0,
"reward_change_mean": -0.3358904607594013,
"reward_change_min": -0.5252225995063782,
"reward_change_std": 0.21241459622979164,
"reward_std": 0.5604431666433811,
"rewards/accuracy_reward": 0.27083334140479565,
"rewards/cosine_scaled_reward": 0.016849845182150602,
"step": 350
},
{
"clip_fraction": 0.0,
"completion_length": 3268.250030517578,
"epoch": 0.40114285714285713,
"grad_norm": 0.04833042621612549,
"kl": 2.4568289518356323e-05,
"lambda_div_used": 0.5601532310247421,
"learning_rate": 3.250000000000001e-07,
"loss": 0.0519,
"reward": -0.250777292996645,
"reward_after_mean": -0.250777292996645,
"reward_after_std": 0.38171103224158287,
"reward_before_mean": 0.1175131555646658,
"reward_before_std": 0.2718004435300827,
"reward_change_max": 0.0,
"reward_change_mean": -0.36829047463834286,
"reward_change_min": -0.5422449931502342,
"reward_change_std": 0.20415859669446945,
"reward_std": 0.3817110415548086,
"rewards/accuracy_reward": 0.1875,
"rewards/cosine_scaled_reward": -0.06998682580888271,
"step": 351
},
{
"clip_fraction": 0.0,
"completion_length": 2589.333366394043,
"epoch": 0.4022857142857143,
"grad_norm": 0.06552080065011978,
"kl": 1.905485987663269e-05,
"lambda_div_used": 0.6104472130537033,
"learning_rate": 3.222848061454764e-07,
"loss": 0.0025,
"reward": -0.09347447147592902,
"reward_after_mean": -0.09347447147592902,
"reward_after_std": 0.6156985405832529,
"reward_before_mean": 0.26158976616716245,
"reward_before_std": 0.5061299707740545,
"reward_change_max": 0.0,
"reward_change_mean": -0.3550642393529415,
"reward_change_min": -0.5232127867639065,
"reward_change_std": 0.1957317991182208,
"reward_std": 0.6156985703855753,
"rewards/accuracy_reward": 0.25000000186264515,
"rewards/cosine_scaled_reward": 0.01158975763246417,
"step": 352
},
{
"clip_fraction": 0.0,
"completion_length": 2533.7916946411133,
"epoch": 0.4034285714285714,
"grad_norm": 0.07949981093406677,
"kl": 4.393863491714001e-05,
"lambda_div_used": 0.6204324588179588,
"learning_rate": 3.195807108082429e-07,
"loss": 0.0456,
"reward": -0.12379613053053617,
"reward_after_mean": -0.12379613053053617,
"reward_after_std": 0.6454745382070541,
"reward_before_mean": 0.203694608528167,
"reward_before_std": 0.5591385969892144,
"reward_change_max": 0.0,
"reward_change_mean": -0.327490733936429,
"reward_change_min": -0.5519858598709106,
"reward_change_std": 0.1984806014224887,
"reward_std": 0.6454745400696993,
"rewards/accuracy_reward": 0.2500000037252903,
"rewards/cosine_scaled_reward": -0.0463053984567523,
"step": 353
},
{
"clip_fraction": 0.0,
"completion_length": 1914.562515258789,
"epoch": 0.4045714285714286,
"grad_norm": 0.07853464037179947,
"kl": 2.3346394300460815e-05,
"lambda_div_used": 0.5937526449561119,
"learning_rate": 3.168878457820915e-07,
"loss": -0.0345,
"reward": 0.036306386813521385,
"reward_after_mean": 0.036306386813521385,
"reward_after_std": 0.5187021996825933,
"reward_before_mean": 0.4854283039458096,
"reward_before_std": 0.42800275422632694,
"reward_change_max": 0.0,
"reward_change_mean": -0.4491219110786915,
"reward_change_min": -0.637521255761385,
"reward_change_std": 0.252533134073019,
"reward_std": 0.5187022183090448,
"rewards/accuracy_reward": 0.33333334140479565,
"rewards/cosine_scaled_reward": 0.15209495089948177,
"step": 354
},
{
"clip_fraction": 0.0,
"completion_length": 2335.8958740234375,
"epoch": 0.4057142857142857,
"grad_norm": 0.09039249271154404,
"kl": 2.7535483241081238e-05,
"lambda_div_used": 0.6392721608281136,
"learning_rate": 3.142063423134644e-07,
"loss": 0.0158,
"reward": 0.017912205308675766,
"reward_after_mean": 0.017912205308675766,
"reward_after_std": 0.6705970745533705,
"reward_before_mean": 0.355070261284709,
"reward_before_std": 0.6396346259862185,
"reward_change_max": 0.0,
"reward_change_mean": -0.33715808019042015,
"reward_change_min": -0.5581717267632484,
"reward_change_std": 0.21758130192756653,
"reward_std": 0.6705970987677574,
"rewards/accuracy_reward": 0.354166679084301,
"rewards/cosine_scaled_reward": 0.0009035973343998194,
"step": 355
},
{
"clip_fraction": 0.0,
"completion_length": 2585.4791870117188,
"epoch": 0.40685714285714286,
"grad_norm": 0.06332956254482269,
"kl": 1.6089528799057007e-05,
"lambda_div_used": 0.6472217217087746,
"learning_rate": 3.115363310950578e-07,
"loss": -0.0329,
"reward": -0.00402236171066761,
"reward_after_mean": -0.00402236171066761,
"reward_after_std": 0.6937260664999485,
"reward_before_mean": 0.2978215580806136,
"reward_before_std": 0.6849680617451668,
"reward_change_max": 0.0,
"reward_change_mean": -0.3018439058214426,
"reward_change_min": -0.4991762898862362,
"reward_change_std": 0.2024542335420847,
"reward_std": 0.6937260907143354,
"rewards/accuracy_reward": 0.2916666753590107,
"rewards/cosine_scaled_reward": 0.00615486316382885,
"step": 356
},
{
"clip_fraction": 0.0,
"completion_length": 3264.8125610351562,
"epoch": 0.408,
"grad_norm": 0.057182345539331436,
"kl": 2.91336327791214e-05,
"lambda_div_used": 0.5862653851509094,
"learning_rate": 3.0887794225945143e-07,
"loss": 0.0104,
"reward": -0.31888771802186966,
"reward_after_mean": -0.31888771802186966,
"reward_after_std": 0.43322005309164524,
"reward_before_mean": -0.049007685855031013,
"reward_before_std": 0.39840223640203476,
"reward_change_max": 0.0,
"reward_change_mean": -0.2698800265789032,
"reward_change_min": -0.48054200410842896,
"reward_change_std": 0.1758509548380971,
"reward_std": 0.43322005309164524,
"rewards/accuracy_reward": 0.1041666679084301,
"rewards/cosine_scaled_reward": -0.1531743509694934,
"step": 357
},
{
"clip_fraction": 0.0,
"completion_length": 3050.2500534057617,
"epoch": 0.40914285714285714,
"grad_norm": 0.05330098420381546,
"kl": 2.4370267055928707e-05,
"lambda_div_used": 0.6431510671973228,
"learning_rate": 3.062313053727671e-07,
"loss": 0.003,
"reward": -0.07201657444238663,
"reward_after_mean": -0.07201657444238663,
"reward_after_std": 0.688179362565279,
"reward_before_mean": 0.2027557131368667,
"reward_before_std": 0.6673171781003475,
"reward_change_max": 0.0,
"reward_change_mean": -0.27477228827774525,
"reward_change_min": -0.4944618083536625,
"reward_change_std": 0.18934499006718397,
"reward_std": 0.6881793700158596,
"rewards/accuracy_reward": 0.2291666716337204,
"rewards/cosine_scaled_reward": -0.026410957798361778,
"step": 358
},
{
"clip_fraction": 0.0,
"completion_length": 2671.937530517578,
"epoch": 0.4102857142857143,
"grad_norm": 0.07592163234949112,
"kl": 3.884732723236084e-05,
"lambda_div_used": 0.5795001536607742,
"learning_rate": 3.0359654942835247e-07,
"loss": 0.0464,
"reward": -0.2641681991517544,
"reward_after_mean": -0.2641681991517544,
"reward_after_std": 0.4112328961491585,
"reward_before_mean": 0.03885669261217117,
"reward_before_std": 0.36149344593286514,
"reward_change_max": 0.0,
"reward_change_mean": -0.30302487686276436,
"reward_change_min": -0.4962599165737629,
"reward_change_std": 0.18252001702785492,
"reward_std": 0.4112329035997391,
"rewards/accuracy_reward": 0.1458333395421505,
"rewards/cosine_scaled_reward": -0.10697665251791477,
"step": 359
},
{
"clip_fraction": 0.0,
"completion_length": 2798.937530517578,
"epoch": 0.4114285714285714,
"grad_norm": 0.06665439158678055,
"kl": 3.1637027859687805e-05,
"lambda_div_used": 0.6062901616096497,
"learning_rate": 3.0097380284049523e-07,
"loss": -0.0029,
"reward": -0.0033570118248462677,
"reward_after_mean": -0.0033570118248462677,
"reward_after_std": 0.5593161657452583,
"reward_before_mean": 0.4031880460679531,
"reward_before_std": 0.4958424214273691,
"reward_change_max": 0.0,
"reward_change_mean": -0.40654509887099266,
"reward_change_min": -0.6473257802426815,
"reward_change_std": 0.25588439870625734,
"reward_std": 0.5593161787837744,
"rewards/accuracy_reward": 0.3333333358168602,
"rewards/cosine_scaled_reward": 0.0698547288775444,
"step": 360
},
{
"clip_fraction": 0.0,
"completion_length": 3150.625030517578,
"epoch": 0.4125714285714286,
"grad_norm": 0.06971946358680725,
"kl": 2.9014074243605137e-05,
"lambda_div_used": 0.6219506114721298,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.0048,
"reward": -0.11981392558664083,
"reward_after_mean": -0.11981392558664083,
"reward_after_std": 0.5868565142154694,
"reward_before_mean": 0.17182117886841297,
"reward_before_std": 0.564973471686244,
"reward_change_max": 0.0,
"reward_change_mean": -0.29163510724902153,
"reward_change_min": -0.5129627995193005,
"reward_change_std": 0.19524423126131296,
"reward_std": 0.5868565402925014,
"rewards/accuracy_reward": 0.22916666977107525,
"rewards/cosine_scaled_reward": -0.057345494627952576,
"step": 361
},
{
"clip_fraction": 0.0,
"completion_length": 1471.916706085205,
"epoch": 0.4137142857142857,
"grad_norm": 0.12450961768627167,
"kl": 2.4568289518356323e-05,
"lambda_div_used": 0.571638435125351,
"learning_rate": 2.9576484845877793e-07,
"loss": -0.0303,
"reward": 0.013313630130141973,
"reward_after_mean": 0.013313630130141973,
"reward_after_std": 0.4897063076496124,
"reward_before_mean": 0.505017813295126,
"reward_before_std": 0.32288316125050187,
"reward_change_max": 0.0,
"reward_change_mean": -0.4917041715234518,
"reward_change_min": -0.6615025624632835,
"reward_change_std": 0.25815817900002,
"reward_std": 0.4897063188254833,
"rewards/accuracy_reward": 0.37500000558793545,
"rewards/cosine_scaled_reward": 0.1300177900120616,
"step": 362
},
{
"clip_fraction": 0.0,
"completion_length": 2174.270881652832,
"epoch": 0.41485714285714287,
"grad_norm": 0.08419760316610336,
"kl": 4.357472062110901e-05,
"lambda_div_used": 0.6037088930606842,
"learning_rate": 2.931788945420058e-07,
"loss": 0.0176,
"reward": -0.09517315030097961,
"reward_after_mean": -0.09517315030097961,
"reward_after_std": 0.57970448769629,
"reward_before_mean": 0.282845395617187,
"reward_before_std": 0.4830307289958,
"reward_change_max": 0.0,
"reward_change_mean": -0.37801856361329556,
"reward_change_min": -0.5875305943191051,
"reward_change_std": 0.22446841653436422,
"reward_std": 0.57970448769629,
"rewards/accuracy_reward": 0.25000000186264515,
"rewards/cosine_scaled_reward": 0.03284538397565484,
"step": 363
},
{
"clip_fraction": 0.0,
"completion_length": 2614.9167098999023,
"epoch": 0.416,
"grad_norm": 0.07822614908218384,
"kl": 2.289540134370327e-05,
"lambda_div_used": 0.5602610111236572,
"learning_rate": 2.9060545772359305e-07,
"loss": 0.0079,
"reward": -0.4077296704053879,
"reward_after_mean": -0.4077296704053879,
"reward_after_std": 0.3345623780041933,
"reward_before_mean": -0.1330121699720621,
"reward_before_std": 0.27498441375792027,
"reward_change_max": 0.0,
"reward_change_mean": -0.2747174873948097,
"reward_change_min": -0.4260003827512264,
"reward_change_std": 0.16245513781905174,
"reward_std": 0.3345623817294836,
"rewards/accuracy_reward": 0.0625,
"rewards/cosine_scaled_reward": -0.195512181147933,
"step": 364
},
{
"clip_fraction": 0.0,
"completion_length": 3003.604202270508,
"epoch": 0.41714285714285715,
"grad_norm": 0.04828161001205444,
"kl": 6.87967985868454e-06,
"lambda_div_used": 0.6036554351449013,
"learning_rate": 2.8804466342921987e-07,
"loss": 0.0555,
"reward": -0.31679424038156867,
"reward_after_mean": -0.31679424038156867,
"reward_after_std": 0.5247047282755375,
"reward_before_mean": -0.07733823172748089,
"reward_before_std": 0.4768796032294631,
"reward_change_max": 0.0,
"reward_change_mean": -0.23945600911974907,
"reward_change_min": -0.42952996119856834,
"reward_change_std": 0.15284609980881214,
"reward_std": 0.5247047450393438,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.18150490219704807,
"step": 365
},
{
"clip_fraction": 0.0,
"completion_length": 1893.00004196167,
"epoch": 0.41828571428571426,
"grad_norm": 0.08456390351057053,
"kl": 1.6473233699798584e-05,
"lambda_div_used": 0.6091032102704048,
"learning_rate": 2.854966364683872e-07,
"loss": -0.0195,
"reward": 0.005720527842640877,
"reward_after_mean": 0.005720527842640877,
"reward_after_std": 0.6514943428337574,
"reward_before_mean": 0.43899719044566154,
"reward_before_std": 0.49832448456436396,
"reward_change_max": 0.0,
"reward_change_mean": -0.4332766607403755,
"reward_change_min": -0.6058122254908085,
"reward_change_std": 0.23482245951890945,
"reward_std": 0.6514943763613701,
"rewards/accuracy_reward": 0.39583333767950535,
"rewards/cosine_scaled_reward": 0.043163834139704704,
"step": 366
},
{
"clip_fraction": 0.0,
"completion_length": 2780.937515258789,
"epoch": 0.41942857142857143,
"grad_norm": 0.06994574517011642,
"kl": 2.946704626083374e-05,
"lambda_div_used": 0.6079575195908546,
"learning_rate": 2.829615010283344e-07,
"loss": 0.0087,
"reward": -0.16347008850425482,
"reward_after_mean": -0.16347008850425482,
"reward_after_std": 0.5466162711381912,
"reward_before_mean": 0.1403359491378069,
"reward_before_std": 0.49341694079339504,
"reward_change_max": 0.0,
"reward_change_mean": -0.30380604416131973,
"reward_change_min": -0.4883100688457489,
"reward_change_std": 0.18378268275409937,
"reward_std": 0.5466162823140621,
"rewards/accuracy_reward": 0.1875000074505806,
"rewards/cosine_scaled_reward": -0.0471640374744311,
"step": 367
},
{
"clip_fraction": 0.0,
"completion_length": 2967.041679382324,
"epoch": 0.4205714285714286,
"grad_norm": 0.07414139062166214,
"kl": 4.155188798904419e-05,
"lambda_div_used": 0.5803077146410942,
"learning_rate": 2.8043938066798645e-07,
"loss": 0.0504,
"reward": -0.27402403950691223,
"reward_after_mean": -0.27402403950691223,
"reward_after_std": 0.4059589561074972,
"reward_before_mean": 0.008191026747226715,
"reward_before_std": 0.371637674048543,
"reward_change_max": 0.0,
"reward_change_mean": -0.28221505135297775,
"reward_change_min": -0.44202784821391106,
"reward_change_std": 0.17603088915348053,
"reward_std": 0.40595896542072296,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/cosine_scaled_reward": -0.15847563743591309,
"step": 368
},
{
"clip_fraction": 0.0,
"completion_length": 2863.1667404174805,
"epoch": 0.4217142857142857,
"grad_norm": 0.07171155512332916,
"kl": 2.0891427993774414e-05,
"lambda_div_used": 0.6472674459218979,
"learning_rate": 2.7793039831193133e-07,
"loss": -0.0044,
"reward": -0.04304695222526789,
"reward_after_mean": -0.04304695222526789,
"reward_after_std": 0.687617726624012,
"reward_before_mean": 0.25146727729588747,
"reward_before_std": 0.6888436311855912,
"reward_change_max": 0.0,
"reward_change_mean": -0.29451421834528446,
"reward_change_min": -0.6169496960937977,
"reward_change_std": 0.22693553566932678,
"reward_std": 0.6876177359372377,
"rewards/accuracy_reward": 0.2500000037252903,
"rewards/cosine_scaled_reward": 0.0014672745019197464,
"step": 369
},
{
"clip_fraction": 0.0,
"completion_length": 3261.2291870117188,
"epoch": 0.4228571428571429,
"grad_norm": 0.058094967156648636,
"kl": 2.1554529666900635e-05,
"lambda_div_used": 0.5657742545008659,
"learning_rate": 2.7543467624442956e-07,
"loss": 0.0031,
"reward": -0.2483972143381834,
"reward_after_mean": -0.2483972143381834,
"reward_after_std": 0.4499538466334343,
"reward_before_mean": 0.13386818021535873,
"reward_before_std": 0.29791492875665426,
"reward_change_max": 0.0,
"reward_change_mean": -0.38226539455354214,
"reward_change_min": -0.51585279032588,
"reward_change_std": 0.19617348536849022,
"reward_std": 0.4499538540840149,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.03279848676174879,
"step": 370
},
{
"clip_fraction": 0.0,
"completion_length": 1885.645866394043,
"epoch": 0.424,
"grad_norm": 0.12381042540073395,
"kl": 1.5240773791447282e-05,
"lambda_div_used": 0.5325883999466896,
"learning_rate": 2.729523361034538e-07,
"loss": -0.0304,
"reward": -0.14332129061222076,
"reward_after_mean": -0.14332129061222076,
"reward_after_std": 0.3713335506618023,
"reward_before_mean": 0.3775896169245243,
"reward_before_std": 0.14313361048698425,
"reward_change_max": 0.0,
"reward_change_mean": -0.5209108907729387,
"reward_change_min": -0.6799871250987053,
"reward_change_std": 0.26304140500724316,
"reward_std": 0.3713335543870926,
"rewards/accuracy_reward": 0.375,
"rewards/cosine_scaled_reward": 0.0025896020233631134,
"step": 371
},
{
"clip_fraction": 0.0,
"completion_length": 3061.6250228881836,
"epoch": 0.42514285714285716,
"grad_norm": 0.0565616711974144,
"kl": 1.3771001249551773e-05,
"lambda_div_used": 0.6153379678726196,
"learning_rate": 2.7048349887476037e-07,
"loss": 0.0019,
"reward": 0.08842738252133131,
"reward_after_mean": 0.08842738252133131,
"reward_after_std": 0.6899000462144613,
"reward_before_mean": 0.5513297475408763,
"reward_before_std": 0.5352799613028765,
"reward_change_max": 0.0,
"reward_change_mean": -0.46290238574147224,
"reward_change_min": -0.6905807815492153,
"reward_change_std": 0.26435263454914093,
"reward_std": 0.6899000480771065,
"rewards/accuracy_reward": 0.3958333358168602,
"rewards/cosine_scaled_reward": 0.15549640776589513,
"step": 372
},
{
"clip_fraction": 0.0,
"completion_length": 1999.791690826416,
"epoch": 0.42628571428571427,
"grad_norm": 0.11289634555578232,
"kl": 2.6244670152664185e-05,
"lambda_div_used": 0.5901408270001411,
"learning_rate": 2.6802828488599294e-07,
"loss": -0.0028,
"reward": -0.25084885116666555,
"reward_after_mean": -0.25084885116666555,
"reward_after_std": 0.47954990342259407,
"reward_before_mean": 0.03629123326390982,
"reward_before_std": 0.41134614683687687,
"reward_change_max": 0.0,
"reward_change_mean": -0.28714008443057537,
"reward_change_min": -0.4200817756354809,
"reward_change_std": 0.1626526527106762,
"reward_std": 0.47954992204904556,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/cosine_scaled_reward": -0.08870877418667078,
"step": 373
},
{
"clip_fraction": 0.0,
"completion_length": 2700.791717529297,
"epoch": 0.42742857142857144,
"grad_norm": 0.0688440129160881,
"kl": 2.674013376235962e-05,
"lambda_div_used": 0.6036395579576492,
"learning_rate": 2.655868138008171e-07,
"loss": -0.0525,
"reward": -0.17511339485645294,
"reward_after_mean": -0.17511339485645294,
"reward_after_std": 0.5073369853198528,
"reward_before_mean": 0.13348117470741272,
"reward_before_std": 0.48259105905890465,
"reward_change_max": 0.0,
"reward_change_mean": -0.30859458073973656,
"reward_change_min": -0.5309739634394646,
"reward_change_std": 0.20630291104316711,
"reward_std": 0.5073370076715946,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/cosine_scaled_reward": -0.07485215552151203,
"step": 374
},
{
"clip_fraction": 0.0,
"completion_length": 3033.791702270508,
"epoch": 0.42857142857142855,
"grad_norm": 0.05866050720214844,
"kl": 1.8252991139888763e-05,
"lambda_div_used": 0.6017400398850441,
"learning_rate": 2.631592046130896e-07,
"loss": 0.0477,
"reward": -0.18571221362799406,
"reward_after_mean": -0.18571221362799406,
"reward_after_std": 0.5023255608975887,
"reward_before_mean": 0.11062880232930183,
"reward_before_std": 0.46861019916832447,
"reward_change_max": 0.0,
"reward_change_mean": -0.2963410019874573,
"reward_change_min": -0.45132842287421227,
"reward_change_std": 0.1817708509042859,
"reward_std": 0.5023255832493305,
"rewards/accuracy_reward": 0.20833334140479565,
"rewards/cosine_scaled_reward": -0.097704553976655,
"step": 375
},
{
"clip_fraction": 0.0,
"completion_length": 2246.770851135254,
"epoch": 0.4297142857142857,
"grad_norm": 0.08337263762950897,
"kl": 8.605420589447021e-06,
"lambda_div_used": 0.5853807479143143,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.0128,
"reward": -0.3334618601948023,
"reward_after_mean": -0.3334618601948023,
"reward_after_std": 0.4486316703259945,
"reward_before_mean": -0.05933803477091715,
"reward_before_std": 0.3892780668102205,
"reward_change_max": 0.0,
"reward_change_mean": -0.2741238009184599,
"reward_change_min": -0.46821001917123795,
"reward_change_std": 0.1663337228819728,
"reward_std": 0.44863167591392994,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.163504708558321,
"step": 376
},
{
"clip_fraction": 0.0,
"completion_length": 3488.3541870117188,
"epoch": 0.4308571428571429,
"grad_norm": 0.055966660380363464,
"kl": 2.5756657123565674e-05,
"lambda_div_used": 0.5742340236902237,
"learning_rate": 2.583460445215911e-07,
"loss": -0.0032,
"reward": -0.3738445993512869,
"reward_after_mean": -0.3738445993512869,
"reward_after_std": 0.39967909548431635,
"reward_before_mean": -0.10715527582215145,
"reward_before_std": 0.33878533728420734,
"reward_change_max": 0.0,
"reward_change_mean": -0.26668932288885117,
"reward_change_min": -0.4109174869954586,
"reward_change_std": 0.15686567965894938,
"reward_std": 0.39967911317944527,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/cosine_scaled_reward": -0.19048861227929592,
"step": 377
},
{
"clip_fraction": 0.0,
"completion_length": 2134.3333740234375,
"epoch": 0.432,
"grad_norm": 0.13181807100772858,
"kl": 2.8654932975769043e-05,
"lambda_div_used": 0.6004306748509407,
"learning_rate": 2.5596072820445254e-07,
"loss": -0.0331,
"reward": -0.06989812850952148,
"reward_after_mean": -0.06989812850952148,
"reward_after_std": 0.5459046512842178,
"reward_before_mean": 0.31881395215168595,
"reward_before_std": 0.46263560838997364,
"reward_change_max": 0.0,
"reward_change_mean": -0.3887120857834816,
"reward_change_min": -0.5917952992022038,
"reward_change_std": 0.23005317710340023,
"reward_std": 0.5459046605974436,
"rewards/accuracy_reward": 0.29166666977107525,
"rewards/cosine_scaled_reward": 0.027147281914949417,
"step": 378
},
{
"clip_fraction": 0.0,
"completion_length": 3121.0416870117188,
"epoch": 0.43314285714285716,
"grad_norm": 0.07427296042442322,
"kl": 1.5733763575553894e-05,
"lambda_div_used": 0.5833418369293213,
"learning_rate": 2.5358974294659373e-07,
"loss": -0.0048,
"reward": -0.29101302847266197,
"reward_after_mean": -0.29101302847266197,
"reward_after_std": 0.4290795102715492,
"reward_before_mean": -0.009382706135511398,
"reward_before_std": 0.38597164303064346,
"reward_change_max": 0.0,
"reward_change_mean": -0.2816303465515375,
"reward_change_min": -0.47839244455099106,
"reward_change_std": 0.17911072820425034,
"reward_std": 0.4290795363485813,
"rewards/accuracy_reward": 0.125,
"rewards/cosine_scaled_reward": -0.13438269309699535,
"step": 379
},
{
"clip_fraction": 0.0,
"completion_length": 2823.416702270508,
"epoch": 0.4342857142857143,
"grad_norm": 0.06611248105764389,
"kl": 2.2601336240768433e-05,
"lambda_div_used": 0.6066281795501709,
"learning_rate": 2.512332043064913e-07,
"loss": 0.0533,
"reward": -0.17613293696194887,
"reward_after_mean": -0.17613293696194887,
"reward_after_std": 0.495351817458868,
"reward_before_mean": 0.1346770692616701,
"reward_before_std": 0.48867712169885635,
"reward_change_max": 0.0,
"reward_change_mean": -0.31081000342965126,
"reward_change_min": -0.5689719989895821,
"reward_change_std": 0.21053248085081577,
"reward_std": 0.4953518286347389,
"rewards/accuracy_reward": 0.1875000074505806,
"rewards/cosine_scaled_reward": -0.05282294252538122,
"step": 380
},
{
"clip_fraction": 0.0,
"completion_length": 2983.7916984558105,
"epoch": 0.43542857142857144,
"grad_norm": 0.08519254624843597,
"kl": 3.2504089176654816e-05,
"lambda_div_used": 0.6109358817338943,
"learning_rate": 2.488912271385139e-07,
"loss": 0.0165,
"reward": -0.2218069350346923,
"reward_after_mean": -0.2218069350346923,
"reward_after_std": 0.5642109382897615,
"reward_before_mean": 0.040106164291501045,
"reward_before_std": 0.5099438140168786,
"reward_change_max": 0.0,
"reward_change_mean": -0.261913089081645,
"reward_change_min": -0.45296039059758186,
"reward_change_std": 0.1642473293468356,
"reward_std": 0.5642109606415033,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.10572716826573014,
"step": 381
},
{
"clip_fraction": 0.0,
"completion_length": 2500.437545776367,
"epoch": 0.43657142857142855,
"grad_norm": 0.07325682789087296,
"kl": 3.65767627954483e-05,
"lambda_div_used": 0.5600408837199211,
"learning_rate": 2.465639255873246e-07,
"loss": -0.0096,
"reward": -0.4078812226653099,
"reward_after_mean": -0.4078812226653099,
"reward_after_std": 0.3340430334210396,
"reward_before_mean": -0.13890772312879562,
"reward_before_std": 0.27443209011107683,
"reward_change_max": 0.0,
"reward_change_mean": -0.26897350139915943,
"reward_change_min": -0.42554087564349174,
"reward_change_std": 0.15754834469407797,
"reward_std": 0.3340430427342653,
"rewards/accuracy_reward": 0.0625,
"rewards/cosine_scaled_reward": -0.20140772499144077,
"step": 382
},
{
"clip_fraction": 0.0,
"completion_length": 2824.291717529297,
"epoch": 0.4377142857142857,
"grad_norm": 0.08796168118715286,
"kl": 4.8138201236724854e-05,
"lambda_div_used": 0.5781937688589096,
"learning_rate": 2.4425141308231765e-07,
"loss": -0.0086,
"reward": -0.16291768848896027,
"reward_after_mean": -0.16291768848896027,
"reward_after_std": 0.387674568220973,
"reward_before_mean": 0.1924248207360506,
"reward_before_std": 0.35597400926053524,
"reward_change_max": 0.0,
"reward_change_mean": -0.3553424943238497,
"reward_change_min": -0.5255323797464371,
"reward_change_std": 0.21135967783629894,
"reward_std": 0.3876745719462633,
"rewards/accuracy_reward": 0.2083333432674408,
"rewards/cosine_scaled_reward": -0.01590852066874504,
"step": 383
},
{
"clip_fraction": 0.0,
"completion_length": 2347.6042404174805,
"epoch": 0.43885714285714283,
"grad_norm": 0.09212023764848709,
"kl": 2.508983016014099e-05,
"lambda_div_used": 0.6615950018167496,
"learning_rate": 2.4195380233209006e-07,
"loss": 0.0408,
"reward": 0.27819300815463066,
"reward_after_mean": 0.27819300815463066,
"reward_after_std": 0.7268820777535439,
"reward_before_mean": 0.6785084716975689,
"reward_before_std": 0.7540020374581218,
"reward_change_max": 0.0,
"reward_change_mean": -0.4003154691308737,
"reward_change_min": -0.6902611702680588,
"reward_change_std": 0.283959056250751,
"reward_std": 0.7268821019679308,
"rewards/accuracy_reward": 0.4583333432674408,
"rewards/cosine_scaled_reward": 0.22017512656748295,
"step": 384
},
{
"clip_fraction": 0.0,
"completion_length": 2711.958396911621,
"epoch": 0.44,
"grad_norm": 0.07038652151823044,
"kl": 1.942366361618042e-05,
"lambda_div_used": 0.6165113672614098,
"learning_rate": 2.3967120531894857e-07,
"loss": 0.012,
"reward": -0.25626325886696577,
"reward_after_mean": -0.25626325886696577,
"reward_after_std": 0.5892766248434782,
"reward_before_mean": -0.0074533987790346146,
"reward_before_std": 0.5360177559778094,
"reward_change_max": 0.0,
"reward_change_mean": -0.24880987033247948,
"reward_change_min": -0.44854912906885147,
"reward_change_std": 0.1595793990418315,
"reward_std": 0.5892766322940588,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/cosine_scaled_reward": -0.13245339877903461,
"step": 385
},
{
"clip_fraction": 0.0,
"completion_length": 2889.1042404174805,
"epoch": 0.44114285714285717,
"grad_norm": 0.060970280319452286,
"kl": 1.8231570720672607e-05,
"lambda_div_used": 0.5940364003181458,
"learning_rate": 2.374037332934512e-07,
"loss": -0.0287,
"reward": -0.24576714914292097,
"reward_after_mean": -0.24576714914292097,
"reward_after_std": 0.4883785657584667,
"reward_before_mean": 0.04634229093790054,
"reward_before_std": 0.4279829766601324,
"reward_change_max": 0.0,
"reward_change_mean": -0.29210945032536983,
"reward_change_min": -0.4432514049112797,
"reward_change_std": 0.17009661067277193,
"reward_std": 0.48837856762111187,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/cosine_scaled_reward": -0.07865770626813173,
"step": 386
},
{
"clip_fraction": 0.0,
"completion_length": 2997.3125,
"epoch": 0.4422857142857143,
"grad_norm": 0.07781542837619781,
"kl": 1.3821758329868317e-05,
"lambda_div_used": 0.5551474094390869,
"learning_rate": 2.3515149676898552e-07,
"loss": -0.0529,
"reward": -0.38486793637275696,
"reward_after_mean": -0.38486793637275696,
"reward_after_std": 0.31045267172157764,
"reward_before_mean": -0.10442159324884415,
"reward_before_std": 0.24840335873886943,
"reward_change_max": 0.0,
"reward_change_mean": -0.28044634498655796,
"reward_change_min": -0.4241391494870186,
"reward_change_std": 0.15805031638592482,
"reward_std": 0.31045267917215824,
"rewards/accuracy_reward": 0.1041666716337204,
"rewards/cosine_scaled_reward": -0.20858826488256454,
"step": 387
},
{
"clip_fraction": 0.0,
"completion_length": 2572.2708587646484,
"epoch": 0.44342857142857145,
"grad_norm": 0.06563723832368851,
"kl": 2.9848888516426086e-06,
"lambda_div_used": 0.5865650475025177,
"learning_rate": 2.3291460551638237e-07,
"loss": 0.0407,
"reward": -0.15434425324201584,
"reward_after_mean": -0.15434425324201584,
"reward_after_std": 0.47601727209985256,
"reward_before_mean": 0.20021704956889153,
"reward_before_std": 0.39916726760566235,
"reward_change_max": 0.0,
"reward_change_mean": -0.354561323300004,
"reward_change_min": -0.5467559210956097,
"reward_change_std": 0.21171134896576405,
"reward_std": 0.47601727209985256,
"rewards/accuracy_reward": 0.2708333358168602,
"rewards/cosine_scaled_reward": -0.07061627879738808,
"step": 388
},
{
"clip_fraction": 0.0,
"completion_length": 2650.937545776367,
"epoch": 0.44457142857142856,
"grad_norm": 0.07840488851070404,
"kl": 1.73286534845829e-05,
"lambda_div_used": 0.6045578718185425,
"learning_rate": 2.306931685585657e-07,
"loss": -0.0394,
"reward": -0.26122746989130974,
"reward_after_mean": -0.26122746989130974,
"reward_after_std": 0.5210577324032784,
"reward_before_mean": 0.002998221665620804,
"reward_before_std": 0.486898148432374,
"reward_change_max": 0.0,
"reward_change_mean": -0.26422569528222084,
"reward_change_min": -0.4951511509716511,
"reward_change_std": 0.18348107766360044,
"reward_std": 0.5210577566176653,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.1428351059439592,
"step": 389
},
{
"clip_fraction": 0.0,
"completion_length": 3007.666679382324,
"epoch": 0.44571428571428573,
"grad_norm": 0.07837636768817902,
"kl": 1.6082078218460083e-05,
"lambda_div_used": 0.61600511521101,
"learning_rate": 2.2848729416523859e-07,
"loss": 0.0713,
"reward": -0.18647840432822704,
"reward_after_mean": -0.18647840432822704,
"reward_after_std": 0.5758322961628437,
"reward_before_mean": 0.08954079262912273,
"reward_before_std": 0.5347468825057149,
"reward_change_max": 0.0,
"reward_change_mean": -0.27601918019354343,
"reward_change_min": -0.47189946845173836,
"reward_change_std": 0.17452176753431559,
"reward_std": 0.5758323054760695,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/cosine_scaled_reward": -0.07712590089067817,
"step": 390
},
{
"clip_fraction": 0.0,
"completion_length": 2716.708351135254,
"epoch": 0.44685714285714284,
"grad_norm": 0.09803462773561478,
"kl": 2.5559216737747192e-05,
"lambda_div_used": 0.6362191960215569,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.0166,
"reward": -0.013867436617147177,
"reward_after_mean": -0.013867436617147177,
"reward_after_std": 0.6719868443906307,
"reward_before_mean": 0.30256712157279253,
"reward_before_std": 0.6324740117415786,
"reward_change_max": 0.0,
"reward_change_mean": -0.3164345696568489,
"reward_change_min": -0.5261501334607601,
"reward_change_std": 0.20623697619885206,
"reward_std": 0.6719868592917919,
"rewards/accuracy_reward": 0.2708333432674408,
"rewards/cosine_scaled_reward": 0.03173377411440015,
"step": 391
},
{
"clip_fraction": 0.0,
"completion_length": 2130.9791984558105,
"epoch": 0.448,
"grad_norm": 0.11540649086236954,
"kl": 1.8610619008541107e-05,
"lambda_div_used": 0.6027960926294327,
"learning_rate": 2.2412266235313973e-07,
"loss": -0.0919,
"reward": -0.16835473664104939,
"reward_after_mean": -0.16835473664104939,
"reward_after_std": 0.5122340489178896,
"reward_before_mean": 0.1393747702240944,
"reward_before_std": 0.4794600326567888,
"reward_change_max": 0.0,
"reward_change_mean": -0.3077295087277889,
"reward_change_min": -0.4898943528532982,
"reward_change_std": 0.1954572731629014,
"reward_std": 0.5122340768575668,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/cosine_scaled_reward": -0.027291906299069524,
"step": 392
},
{
"clip_fraction": 0.0,
"completion_length": 2649.562530517578,
"epoch": 0.4491428571428571,
"grad_norm": 0.07226411253213882,
"kl": 2.648681402206421e-05,
"lambda_div_used": 0.5953627824783325,
"learning_rate": 2.2196411766036487e-07,
"loss": -0.067,
"reward": -0.20140517689287663,
"reward_after_mean": -0.20140517689287663,
"reward_after_std": 0.4766168761998415,
"reward_before_mean": 0.09705937840044498,
"reward_before_std": 0.4375506564974785,
"reward_change_max": 0.0,
"reward_change_mean": -0.2984645199030638,
"reward_change_min": -0.47758448868989944,
"reward_change_std": 0.18242334388196468,
"reward_std": 0.47661688551306725,
"rewards/accuracy_reward": 0.18750000558793545,
"rewards/cosine_scaled_reward": -0.09044064581394196,
"step": 393
},
{
"clip_fraction": 0.0,
"completion_length": 3104.1458740234375,
"epoch": 0.4502857142857143,
"grad_norm": 0.07446404546499252,
"kl": 2.555176615715027e-05,
"lambda_div_used": 0.5502220839262009,
"learning_rate": 2.1982156097370557e-07,
"loss": -0.1089,
"reward": -0.46874738670885563,
"reward_after_mean": -0.46874738670885563,
"reward_after_std": 0.30342659167945385,
"reward_before_mean": -0.20392105542123318,
"reward_before_std": 0.22309745661914349,
"reward_change_max": 0.0,
"reward_change_mean": -0.2648263294249773,
"reward_change_min": -0.3818593733012676,
"reward_change_std": 0.14172559697180986,
"reward_std": 0.30342659167945385,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.22475438751280308,
"step": 394
},
{
"clip_fraction": 0.0,
"completion_length": 2284.6458435058594,
"epoch": 0.4514285714285714,
"grad_norm": 0.12902304530143738,
"kl": 4.9501657485961914e-05,
"lambda_div_used": 0.5602920204401016,
"learning_rate": 2.1769509671835223e-07,
"loss": -0.1675,
"reward": -0.3165151756256819,
"reward_after_mean": -0.3165151756256819,
"reward_after_std": 0.3793674483895302,
"reward_before_mean": 0.0216768067330122,
"reward_before_std": 0.27139727398753166,
"reward_change_max": 0.0,
"reward_change_mean": -0.3381919823586941,
"reward_change_min": -0.48955149203538895,
"reward_change_std": 0.17990652937442064,
"reward_std": 0.3793674521148205,
"rewards/accuracy_reward": 0.14583333395421505,
"rewards/cosine_scaled_reward": -0.12415653187781572,
"step": 395
},
{
"clip_fraction": 0.0,
"completion_length": 3048.1458587646484,
"epoch": 0.45257142857142857,
"grad_norm": 0.054839495569467545,
"kl": 7.04331323504448e-05,
"lambda_div_used": 0.6417915895581245,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.0192,
"reward": -0.09411877207458019,
"reward_after_mean": -0.09411877207458019,
"reward_after_std": 0.6742421016097069,
"reward_before_mean": 0.1871709941624431,
"reward_before_std": 0.659186695702374,
"reward_change_max": 0.0,
"reward_change_mean": -0.2812897562980652,
"reward_change_min": -0.5664796940982342,
"reward_change_std": 0.20405743923038244,
"reward_std": 0.6742421071976423,
"rewards/accuracy_reward": 0.20833333767950535,
"rewards/cosine_scaled_reward": -0.021162351593375206,
"step": 396
},
{
"clip_fraction": 0.0,
"completion_length": 2982.750030517578,
"epoch": 0.45371428571428574,
"grad_norm": 0.05798732861876488,
"kl": 2.0850449800491333e-05,
"lambda_div_used": 0.5765915215015411,
"learning_rate": 2.134908592756607e-07,
"loss": 0.0391,
"reward": -0.36006474308669567,
"reward_after_mean": -0.36006474308669567,
"reward_after_std": 0.40458301082253456,
"reward_before_mean": -0.08499279711395502,
"reward_before_std": 0.3492864612489939,
"reward_change_max": 0.0,
"reward_change_mean": -0.2750719413161278,
"reward_change_min": -0.4323217496275902,
"reward_change_std": 0.16262990981340408,
"reward_std": 0.40458302199840546,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/cosine_scaled_reward": -0.16832613572478294,
"step": 397
},
{
"clip_fraction": 0.0,
"completion_length": 2822.5209045410156,
"epoch": 0.45485714285714285,
"grad_norm": 0.06360434740781784,
"kl": 2.2810418158769608e-05,
"lambda_div_used": 0.6480221897363663,
"learning_rate": 2.1141329099692406e-07,
"loss": 0.0465,
"reward": -0.09229415841400623,
"reward_after_mean": -0.09229415841400623,
"reward_after_std": 0.7079674638807774,
"reward_before_mean": 0.1752314588520676,
"reward_before_std": 0.6917341919615865,
"reward_change_max": 0.0,
"reward_change_mean": -0.26752561889588833,
"reward_change_min": -0.5181693434715271,
"reward_change_std": 0.19102454278618097,
"reward_std": 0.7079674787819386,
"rewards/accuracy_reward": 0.2083333358168602,
"rewards/cosine_scaled_reward": -0.03310186788439751,
"step": 398
},
{
"clip_fraction": 0.0,
"completion_length": 2191.583351135254,
"epoch": 0.456,
"grad_norm": 0.07585066556930542,
"kl": 3.449246287345886e-05,
"lambda_div_used": 0.6071069464087486,
"learning_rate": 2.0935222495670968e-07,
"loss": -0.0428,
"reward": -0.06968086212873459,
"reward_after_mean": -0.06968086212873459,
"reward_after_std": 0.5623702071607113,
"reward_before_mean": 0.29000685061328113,
"reward_before_std": 0.4939764440059662,
"reward_change_max": 0.0,
"reward_change_mean": -0.35968772508203983,
"reward_change_min": -0.5781176537275314,
"reward_change_std": 0.2176226656883955,
"reward_std": 0.5623702295124531,
"rewards/accuracy_reward": 0.27083333767950535,
"rewards/cosine_scaled_reward": 0.01917351223528385,
"step": 399
},
{
"clip_fraction": 0.0,
"completion_length": 1814.4375305175781,
"epoch": 0.45714285714285713,
"grad_norm": 0.09906096011400223,
"kl": 3.718771040439606e-05,
"lambda_div_used": 0.615766242146492,
"learning_rate": 2.0730776160846853e-07,
"loss": -0.0203,
"reward": 0.023047026246786118,
"reward_after_mean": 0.023047026246786118,
"reward_after_std": 0.6236102003604174,
"reward_before_mean": 0.43256790889427066,
"reward_before_std": 0.5322064086794853,
"reward_change_max": 0.0,
"reward_change_mean": -0.40952087566256523,
"reward_change_min": -0.6220400035381317,
"reward_change_std": 0.2354581467807293,
"reward_std": 0.62361023388803,
"rewards/accuracy_reward": 0.3125000074505806,
"rewards/cosine_scaled_reward": 0.12006791215389967,
"step": 400
},
{
"clip_fraction": 0.0,
"completion_length": 3091.7916870117188,
"epoch": 0.4582857142857143,
"grad_norm": 0.05475914105772972,
"kl": 3.5446137189865112e-06,
"lambda_div_used": 0.562153548002243,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.0508,
"reward": -0.29909564182162285,
"reward_after_mean": -0.29909564182162285,
"reward_after_std": 0.3837758805602789,
"reward_before_mean": 0.04575100168585777,
"reward_before_std": 0.28164876997470856,
"reward_change_max": 0.0,
"reward_change_mean": -0.3448466807603836,
"reward_change_min": -0.5300594680011272,
"reward_change_std": 0.19381076097488403,
"reward_std": 0.38377588987350464,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.12091565364971757,
"step": 401
},
{
"clip_fraction": 0.0,
"completion_length": 2440.1458740234375,
"epoch": 0.4594285714285714,
"grad_norm": 0.11770489811897278,
"kl": 1.8846243619918823e-05,
"lambda_div_used": 0.5605455562472343,
"learning_rate": 2.032690407508949e-07,
"loss": -0.0261,
"reward": -0.2383374497294426,
"reward_after_mean": -0.2383374497294426,
"reward_after_std": 0.3794025480747223,
"reward_before_mean": 0.14294641837477684,
"reward_before_std": 0.27562023140490055,
"reward_change_max": 0.0,
"reward_change_mean": -0.38128384202718735,
"reward_change_min": -0.5627436973154545,
"reward_change_std": 0.2136234436184168,
"reward_std": 0.3794025518000126,
"rewards/accuracy_reward": 0.1875,
"rewards/cosine_scaled_reward": -0.044553620740771294,
"step": 402
},
{
"clip_fraction": 0.0,
"completion_length": 1985.6875343322754,
"epoch": 0.4605714285714286,
"grad_norm": 0.10363903641700745,
"kl": 2.3838132619857788e-05,
"lambda_div_used": 0.5793934315443039,
"learning_rate": 2.0127498008311922e-07,
"loss": 0.003,
"reward": -0.17537187691777945,
"reward_after_mean": -0.17537187691777945,
"reward_after_std": 0.46652381494641304,
"reward_before_mean": 0.20323466695845127,
"reward_before_std": 0.3647587588056922,
"reward_change_max": 0.0,
"reward_change_mean": -0.37860656157135963,
"reward_change_min": -0.5506669841706753,
"reward_change_std": 0.21334033645689487,
"reward_std": 0.4665238317102194,
"rewards/accuracy_reward": 0.22916666977107525,
"rewards/cosine_scaled_reward": -0.025931986048817635,
"step": 403
},
{
"clip_fraction": 0.0,
"completion_length": 2137.375,
"epoch": 0.4617142857142857,
"grad_norm": 0.10717064887285233,
"kl": 3.7202611565589905e-05,
"lambda_div_used": 0.5993036478757858,
"learning_rate": 1.9929791578083655e-07,
"loss": -0.0043,
"reward": -0.053446926176548004,
"reward_after_mean": -0.053446926176548004,
"reward_after_std": 0.4612566214054823,
"reward_before_mean": 0.3198554217815399,
"reward_before_std": 0.4542539082467556,
"reward_change_max": 0.0,
"reward_change_mean": -0.3733023554086685,
"reward_change_min": -0.5774630047380924,
"reward_change_std": 0.2339334823191166,
"reward_std": 0.4612566400319338,
"rewards/accuracy_reward": 0.2708333432674408,
"rewards/cosine_scaled_reward": 0.04902207478880882,
"step": 404
},
{
"clip_fraction": 0.0,
"completion_length": 2319.5000228881836,
"epoch": 0.46285714285714286,
"grad_norm": 0.13891565799713135,
"kl": 2.9403716325759888e-05,
"lambda_div_used": 0.6237343773245811,
"learning_rate": 1.9733794420337213e-07,
"loss": -0.0602,
"reward": 0.1285000964999199,
"reward_after_mean": 0.1285000964999199,
"reward_after_std": 0.7067493386566639,
"reward_before_mean": 0.5776062086224556,
"reward_before_std": 0.5746421907097101,
"reward_change_max": 0.0,
"reward_change_mean": -0.449106115847826,
"reward_change_min": -0.6593505516648293,
"reward_change_std": 0.263070298358798,
"reward_std": 0.7067493461072445,
"rewards/accuracy_reward": 0.4166666679084301,
"rewards/cosine_scaled_reward": 0.1609395444393158,
"step": 405
},
{
"clip_fraction": 0.0,
"completion_length": 2442.1458892822266,
"epoch": 0.464,
"grad_norm": 0.07689624279737473,
"kl": 8.527189493179321e-06,
"lambda_div_used": 0.6336105018854141,
"learning_rate": 1.9539516087697517e-07,
"loss": -0.0,
"reward": -0.11170937749557197,
"reward_after_mean": -0.11170937749557197,
"reward_after_std": 0.6453428398817778,
"reward_before_mean": 0.17541324836201966,
"reward_before_std": 0.6142212487757206,
"reward_change_max": 0.0,
"reward_change_mean": -0.2871226370334625,
"reward_change_min": -0.5107162520289421,
"reward_change_std": 0.19029081240296364,
"reward_std": 0.645342854782939,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/cosine_scaled_reward": -0.032920089550316334,
"step": 406
},
{
"clip_fraction": 0.0,
"completion_length": 2516.6041946411133,
"epoch": 0.46514285714285714,
"grad_norm": 0.09361904859542847,
"kl": 2.995133399963379e-05,
"lambda_div_used": 0.5926980003714561,
"learning_rate": 1.934696604901642e-07,
"loss": 0.011,
"reward": -0.16482560662552714,
"reward_after_mean": -0.16482560662552714,
"reward_after_std": 0.5236028637737036,
"reward_before_mean": 0.18611273169517517,
"reward_before_std": 0.4258856289088726,
"reward_change_max": 0.0,
"reward_change_mean": -0.35093834809958935,
"reward_change_min": -0.5517234578728676,
"reward_change_std": 0.20658957865089178,
"reward_std": 0.5236028656363487,
"rewards/accuracy_reward": 0.25000000186264515,
"rewards/cosine_scaled_reward": -0.06388726737350225,
"step": 407
},
{
"clip_fraction": 0.0,
"completion_length": 2709.4791984558105,
"epoch": 0.4662857142857143,
"grad_norm": 0.09306048601865768,
"kl": 3.203703090548515e-05,
"lambda_div_used": 0.5583930537104607,
"learning_rate": 1.915615368891117e-07,
"loss": -0.111,
"reward": -0.17684245621785522,
"reward_after_mean": -0.17684245621785522,
"reward_after_std": 0.42921141162514687,
"reward_before_mean": 0.2615569829940796,
"reward_before_std": 0.26226984336972237,
"reward_change_max": 0.0,
"reward_change_mean": -0.43839943781495094,
"reward_change_min": -0.6072936318814754,
"reward_change_std": 0.22694199439138174,
"reward_std": 0.42921141907572746,
"rewards/accuracy_reward": 0.27083333395421505,
"rewards/cosine_scaled_reward": -0.009276359807699919,
"step": 408
},
{
"clip_fraction": 0.0,
"completion_length": 3529.3541870117188,
"epoch": 0.4674285714285714,
"grad_norm": 0.048929836601018906,
"kl": 7.6089054346084595e-06,
"lambda_div_used": 0.6116252392530441,
"learning_rate": 1.8967088307307e-07,
"loss": 0.001,
"reward": -0.17062923312187195,
"reward_after_mean": -0.17062923312187195,
"reward_after_std": 0.5682655684649944,
"reward_before_mean": 0.12847105879336596,
"reward_before_std": 0.5097848381847143,
"reward_change_max": 0.0,
"reward_change_mean": -0.2991002984344959,
"reward_change_min": -0.43997345492243767,
"reward_change_std": 0.17078326642513275,
"reward_std": 0.5682655889540911,
"rewards/accuracy_reward": 0.16666667349636555,
"rewards/cosine_scaled_reward": -0.038195611676201224,
"step": 409
},
{
"clip_fraction": 0.0,
"completion_length": 2435.625045776367,
"epoch": 0.4685714285714286,
"grad_norm": 0.08304905891418457,
"kl": 1.5234574675559998e-05,
"lambda_div_used": 0.6268336623907089,
"learning_rate": 1.8779779118983867e-07,
"loss": 0.0499,
"reward": -0.03433734131976962,
"reward_after_mean": -0.03433734131976962,
"reward_after_std": 0.6416803412139416,
"reward_before_mean": 0.2960522407665849,
"reward_before_std": 0.5826444877311587,
"reward_change_max": 0.0,
"reward_change_mean": -0.3303895927965641,
"reward_change_min": -0.4872638136148453,
"reward_change_std": 0.19508948456496,
"reward_std": 0.6416803747415543,
"rewards/accuracy_reward": 0.25000000931322575,
"rewards/cosine_scaled_reward": 0.04605222793179564,
"step": 410
},
{
"clip_fraction": 0.0,
"completion_length": 3163.1041717529297,
"epoch": 0.4697142857142857,
"grad_norm": 0.06846357136964798,
"kl": 1.529604196548462e-05,
"lambda_div_used": 0.5874563306570053,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.0214,
"reward": -0.22584301605820656,
"reward_after_mean": -0.22584301605820656,
"reward_after_std": 0.42311797849833965,
"reward_before_mean": 0.0774743240326643,
"reward_before_std": 0.4020446836948395,
"reward_change_max": 0.0,
"reward_change_mean": -0.3033173345029354,
"reward_change_min": -0.48705917969346046,
"reward_change_std": 0.1920422399416566,
"reward_std": 0.42311798594892025,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/cosine_scaled_reward": -0.08919236063957214,
"step": 411
},
{
"clip_fraction": 0.0,
"completion_length": 2888.125015258789,
"epoch": 0.47085714285714286,
"grad_norm": 0.06694348156452179,
"kl": 2.148747444152832e-05,
"lambda_div_used": 0.6005539819598198,
"learning_rate": 1.8410465752883758e-07,
"loss": 0.049,
"reward": -0.21726901549845934,
"reward_after_mean": -0.21726901549845934,
"reward_after_std": 0.48868509009480476,
"reward_before_mean": 0.08159955404698849,
"reward_before_std": 0.463420107960701,
"reward_change_max": 0.0,
"reward_change_mean": -0.2988685816526413,
"reward_change_min": -0.554147582501173,
"reward_change_std": 0.19932363275438547,
"reward_std": 0.48868510872125626,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.06423377990722656,
"step": 412
},
{
"clip_fraction": 0.0,
"completion_length": 2530.2708702087402,
"epoch": 0.472,
"grad_norm": 0.08418448269367218,
"kl": 2.208724617958069e-05,
"lambda_div_used": 0.6184473037719727,
"learning_rate": 1.822847957491922e-07,
"loss": 0.0462,
"reward": 0.05549921467900276,
"reward_after_mean": 0.05549921467900276,
"reward_after_std": 0.6010984163731337,
"reward_before_mean": 0.4671566132456064,
"reward_before_std": 0.546179112046957,
"reward_change_max": 0.0,
"reward_change_mean": -0.4116574004292488,
"reward_change_min": -0.6464647352695465,
"reward_change_std": 0.253952544182539,
"reward_std": 0.6010984499007463,
"rewards/accuracy_reward": 0.37500000931322575,
"rewards/cosine_scaled_reward": 0.09215660532936454,
"step": 413
},
{
"clip_fraction": 0.0,
"completion_length": 2948.562530517578,
"epoch": 0.47314285714285714,
"grad_norm": 0.06618716567754745,
"kl": 1.0458752512931824e-05,
"lambda_div_used": 0.6192215830087662,
"learning_rate": 1.804828558898332e-07,
"loss": -0.0483,
"reward": -0.21263186633586884,
"reward_after_mean": -0.21263186633586884,
"reward_after_std": 0.5998369809240103,
"reward_before_mean": 0.04798411298543215,
"reward_before_std": 0.555210480466485,
"reward_change_max": 0.0,
"reward_change_mean": -0.2606159746646881,
"reward_change_min": -0.4564817361533642,
"reward_change_std": 0.16899613942950964,
"reward_std": 0.5998369976878166,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.0978492172434926,
"step": 414
},
{
"clip_fraction": 0.0,
"completion_length": 3234.8125,
"epoch": 0.4742857142857143,
"grad_norm": 0.06085793301463127,
"kl": 2.8954818844795227e-05,
"lambda_div_used": 0.5570018589496613,
"learning_rate": 1.7869892577476722e-07,
"loss": 0.0073,
"reward": -0.46066740388050675,
"reward_after_mean": -0.46066740388050675,
"reward_after_std": 0.3419057931751013,
"reward_before_mean": -0.2106443401426077,
"reward_before_std": 0.2546057654544711,
"reward_change_max": 0.0,
"reward_change_mean": -0.2500230632722378,
"reward_change_min": -0.36453111097216606,
"reward_change_std": 0.1324605904519558,
"reward_std": 0.3419058118015528,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.23147768154740334,
"step": 415
},
{
"clip_fraction": 0.0,
"completion_length": 1748.9583892822266,
"epoch": 0.4754285714285714,
"grad_norm": 0.09120064228773117,
"kl": 8.981674909591675e-06,
"lambda_div_used": 0.6067373231053352,
"learning_rate": 1.7693309235023127e-07,
"loss": -0.0771,
"reward": -0.13937147613614798,
"reward_after_mean": -0.13937147613614798,
"reward_after_std": 0.6240962240844965,
"reward_before_mean": 0.20647221896797419,
"reward_before_std": 0.48884215706493706,
"reward_change_max": 0.0,
"reward_change_mean": -0.34584368020296097,
"reward_change_min": -0.5141387544572353,
"reward_change_std": 0.18731264490634203,
"reward_std": 0.6240962333977222,
"rewards/accuracy_reward": 0.2500000037252903,
"rewards/cosine_scaled_reward": -0.04352780181216076,
"step": 416
},
{
"clip_fraction": 0.0,
"completion_length": 3307.250015258789,
"epoch": 0.4765714285714286,
"grad_norm": 0.05766940861940384,
"kl": 3.1495699658989906e-05,
"lambda_div_used": 0.5544050261378288,
"learning_rate": 1.7518544168045524e-07,
"loss": -0.0019,
"reward": -0.4980150870978832,
"reward_after_mean": -0.4980150870978832,
"reward_after_std": 0.3123048096895218,
"reward_before_mean": -0.2511206082999706,
"reward_before_std": 0.24326619878411293,
"reward_change_max": 0.0,
"reward_change_mean": -0.24689447320997715,
"reward_change_min": -0.3710709176957607,
"reward_change_std": 0.13346257898956537,
"reward_std": 0.3123048171401024,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.2719539441168308,
"step": 417
},
{
"clip_fraction": 0.0,
"completion_length": 2171.625015258789,
"epoch": 0.4777142857142857,
"grad_norm": 0.0854194164276123,
"kl": 2.8558075428009033e-05,
"lambda_div_used": 0.6033797562122345,
"learning_rate": 1.7345605894346726e-07,
"loss": 0.0073,
"reward": -0.0949536501429975,
"reward_after_mean": -0.0949536501429975,
"reward_after_std": 0.540143633261323,
"reward_before_mean": 0.25413690507411957,
"reward_before_std": 0.4725718079134822,
"reward_change_max": 0.0,
"reward_change_mean": -0.34909053333103657,
"reward_change_min": -0.5411153584718704,
"reward_change_std": 0.2063078135251999,
"reward_std": 0.5401436407119036,
"rewards/accuracy_reward": 0.29166667349636555,
"rewards/cosine_scaled_reward": -0.03752976981922984,
"step": 418
},
{
"clip_fraction": 0.0,
"completion_length": 2464.3750228881836,
"epoch": 0.47885714285714287,
"grad_norm": 0.08315848559141159,
"kl": 3.0182301998138428e-05,
"lambda_div_used": 0.5937136337161064,
"learning_rate": 1.7174502842694212e-07,
"loss": 0.03,
"reward": -0.27206650376319885,
"reward_after_mean": -0.27206650376319885,
"reward_after_std": 0.49108788557350636,
"reward_before_mean": 0.0038498505018651485,
"reward_before_std": 0.4298001816496253,
"reward_change_max": 0.0,
"reward_change_mean": -0.27591635659337044,
"reward_change_min": -0.4215012192726135,
"reward_change_std": 0.16396540496498346,
"reward_std": 0.49108790047466755,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/cosine_scaled_reward": -0.12115013878792524,
"step": 419
},
{
"clip_fraction": 0.0,
"completion_length": 1972.7083625793457,
"epoch": 0.48,
"grad_norm": 0.11145550012588501,
"kl": 3.493949770927429e-05,
"lambda_div_used": 0.5797194987535477,
"learning_rate": 1.7005243352409333e-07,
"loss": -0.0607,
"reward": -0.1652970388531685,
"reward_after_mean": -0.1652970388531685,
"reward_after_std": 0.4477591011673212,
"reward_before_mean": 0.18980162939988077,
"reward_before_std": 0.36517443507909775,
"reward_change_max": 0.0,
"reward_change_mean": -0.35509867407381535,
"reward_change_min": -0.5067496970295906,
"reward_change_std": 0.19984195847064257,
"reward_std": 0.4477591197937727,
"rewards/accuracy_reward": 0.22916666977107525,
"rewards/cosine_scaled_reward": -0.03936504106968641,
"step": 420
},
{
"clip_fraction": 0.0,
"completion_length": 3191.937515258789,
"epoch": 0.48114285714285715,
"grad_norm": 0.07170204073190689,
"kl": 3.11434268951416e-05,
"lambda_div_used": 0.5753046199679375,
"learning_rate": 1.6837835672960831e-07,
"loss": 0.0088,
"reward": -0.38796948781237006,
"reward_after_mean": -0.38796948781237006,
"reward_after_std": 0.4225973077118397,
"reward_before_mean": -0.12759371474385262,
"reward_before_std": 0.3405588921159506,
"reward_change_max": 0.0,
"reward_change_mean": -0.2603757604956627,
"reward_change_min": -0.37436668202281,
"reward_change_std": 0.13996374886482954,
"reward_std": 0.4225973132997751,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.16926039289683104,
"step": 421
},
{
"clip_fraction": 0.0,
"completion_length": 2776.437545776367,
"epoch": 0.48228571428571426,
"grad_norm": 0.06681375205516815,
"kl": 1.9788742065429688e-05,
"lambda_div_used": 0.5904401987791061,
"learning_rate": 1.6672287963562852e-07,
"loss": 0.0293,
"reward": -0.25396816432476044,
"reward_after_mean": -0.25396816432476044,
"reward_after_std": 0.4787884410470724,
"reward_before_mean": 0.03486193250864744,
"reward_before_std": 0.4093271289020777,
"reward_change_max": 0.0,
"reward_change_mean": -0.2888300847262144,
"reward_change_min": -0.43693453073501587,
"reward_change_std": 0.16317449882626534,
"reward_std": 0.4787884559482336,
"rewards/accuracy_reward": 0.1458333395421505,
"rewards/cosine_scaled_reward": -0.11097141215577722,
"step": 422
},
{
"clip_fraction": 0.0,
"completion_length": 2945.9583740234375,
"epoch": 0.48342857142857143,
"grad_norm": 0.06767871230840683,
"kl": 2.205371856689453e-05,
"lambda_div_used": 0.611551009118557,
"learning_rate": 1.6508608292777203e-07,
"loss": 0.0153,
"reward": -0.16482537053525448,
"reward_after_mean": -0.16482537053525448,
"reward_after_std": 0.5363675616681576,
"reward_before_mean": 0.12604539189487696,
"reward_before_std": 0.518328445032239,
"reward_change_max": 0.0,
"reward_change_mean": -0.2908707782626152,
"reward_change_min": -0.4775175042450428,
"reward_change_std": 0.18889948446303606,
"reward_std": 0.5363675802946091,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/cosine_scaled_reward": -0.040621266700327396,
"step": 423
},
{
"clip_fraction": 0.0,
"completion_length": 3215.0208740234375,
"epoch": 0.4845714285714286,
"grad_norm": 0.061716482043266296,
"kl": 1.1460855603218079e-05,
"lambda_div_used": 0.5937408357858658,
"learning_rate": 1.6346804638120098e-07,
"loss": 0.0327,
"reward": -0.3415182586759329,
"reward_after_mean": -0.3415182586759329,
"reward_after_std": 0.48476750776171684,
"reward_before_mean": -0.08947536488994956,
"reward_before_std": 0.4272408355027437,
"reward_change_max": 0.0,
"reward_change_mean": -0.2520429063588381,
"reward_change_min": -0.42084335163235664,
"reward_change_std": 0.15120856929570436,
"reward_std": 0.4847675133496523,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.17280870315153152,
"step": 424
},
{
"clip_fraction": 0.0,
"completion_length": 2154.9375610351562,
"epoch": 0.4857142857142857,
"grad_norm": 0.09371069073677063,
"kl": 3.830809146165848e-05,
"lambda_div_used": 0.6315138638019562,
"learning_rate": 1.6186884885673413e-07,
"loss": 0.0058,
"reward": 0.34828917868435383,
"reward_after_mean": 0.34828917868435383,
"reward_after_std": 0.7491090279072523,
"reward_before_mean": 0.91605463065207,
"reward_before_std": 0.6105400957167149,
"reward_change_max": 0.0,
"reward_change_mean": -0.5677654705941677,
"reward_change_min": -0.9185826852917671,
"reward_change_std": 0.34253316558897495,
"reward_std": 0.7491090428084135,
"rewards/accuracy_reward": 0.5833333358168602,
"rewards/cosine_scaled_reward": 0.3327212668955326,
"step": 425
},
{
"clip_fraction": 0.0,
"completion_length": 2205.3125190734863,
"epoch": 0.4868571428571429,
"grad_norm": 0.09298980236053467,
"kl": 2.3175030946731567e-05,
"lambda_div_used": 0.6215192526578903,
"learning_rate": 1.6028856829700258e-07,
"loss": -0.002,
"reward": -0.029936233535408974,
"reward_after_mean": -0.029936233535408974,
"reward_after_std": 0.5604026541113853,
"reward_before_mean": 0.3139693345874548,
"reward_before_std": 0.5635973755270243,
"reward_change_max": 0.0,
"reward_change_mean": -0.34390556812286377,
"reward_change_min": -0.5907084755599499,
"reward_change_std": 0.23498705588281155,
"reward_std": 0.5604026671499014,
"rewards/accuracy_reward": 0.3125000074505806,
"rewards/cosine_scaled_reward": 0.0014693308621644974,
"step": 426
},
{
"clip_fraction": 0.0,
"completion_length": 3295.7708740234375,
"epoch": 0.488,
"grad_norm": 0.0548052154481411,
"kl": 2.4527311325073242e-05,
"lambda_div_used": 0.5990054532885551,
"learning_rate": 1.5872728172265146e-07,
"loss": 0.0088,
"reward": -0.2071160115301609,
"reward_after_mean": -0.2071160115301609,
"reward_after_std": 0.49672279693186283,
"reward_before_mean": 0.08893106225878,
"reward_before_std": 0.454012256115675,
"reward_change_max": 0.0,
"reward_change_mean": -0.2960470784455538,
"reward_change_min": -0.4723533205688,
"reward_change_std": 0.18442231137305498,
"reward_std": 0.4967228155583143,
"rewards/accuracy_reward": 0.18750000558793545,
"rewards/cosine_scaled_reward": -0.09856893587857485,
"step": 427
},
{
"clip_fraction": 0.0,
"completion_length": 2302.520881652832,
"epoch": 0.48914285714285716,
"grad_norm": 0.11483462899923325,
"kl": 0.000337366946041584,
"lambda_div_used": 0.5547136962413788,
"learning_rate": 1.5718506522858572e-07,
"loss": 0.0311,
"reward": -0.42663199454545975,
"reward_after_mean": -0.42663199454545975,
"reward_after_std": 0.29241302236914635,
"reward_before_mean": -0.1547260768711567,
"reward_before_std": 0.24498768709599972,
"reward_change_max": 0.0,
"reward_change_mean": -0.27190591767430305,
"reward_change_min": -0.42438486963510513,
"reward_change_std": 0.1579811777919531,
"reward_std": 0.2924130354076624,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.2380594089627266,
"step": 428
},
{
"clip_fraction": 0.0,
"completion_length": 2431.520881652832,
"epoch": 0.49028571428571427,
"grad_norm": 0.0878632664680481,
"kl": 2.4685636162757874e-05,
"lambda_div_used": 0.6025609895586967,
"learning_rate": 1.5566199398026147e-07,
"loss": 0.0378,
"reward": -0.057155030546709895,
"reward_after_mean": -0.057155030546709895,
"reward_after_std": 0.5567535478621721,
"reward_before_mean": 0.3208682704716921,
"reward_before_std": 0.47341752983629704,
"reward_change_max": 0.0,
"reward_change_mean": -0.3780232574790716,
"reward_change_min": -0.5763550102710724,
"reward_change_std": 0.22364642471075058,
"reward_std": 0.5567535553127527,
"rewards/accuracy_reward": 0.29166666977107525,
"rewards/cosine_scaled_reward": 0.029201554832980037,
"step": 429
},
{
"clip_fraction": 0.0,
"completion_length": 2577.3125610351562,
"epoch": 0.49142857142857144,
"grad_norm": 0.08103640377521515,
"kl": 1.4763325452804565e-05,
"lambda_div_used": 0.6012379750609398,
"learning_rate": 1.5415814221002265e-07,
"loss": -0.0049,
"reward": 0.031602535396814346,
"reward_after_mean": 0.031602535396814346,
"reward_after_std": 0.5440461356192827,
"reward_before_mean": 0.4564003311097622,
"reward_before_std": 0.46839726250618696,
"reward_change_max": 0.0,
"reward_change_mean": -0.42479780688881874,
"reward_change_min": -0.6648109555244446,
"reward_change_std": 0.2579617351293564,
"reward_std": 0.5440461542457342,
"rewards/accuracy_reward": 0.3541666753590107,
"rewards/cosine_scaled_reward": 0.1022336557507515,
"step": 430
},
{
"clip_fraction": 0.0,
"completion_length": 2690.6458740234375,
"epoch": 0.49257142857142855,
"grad_norm": 0.06538030505180359,
"kl": 2.8170645236968994e-05,
"lambda_div_used": 0.5918548330664635,
"learning_rate": 1.5267358321348285e-07,
"loss": -0.0328,
"reward": -0.24143310775980353,
"reward_after_mean": -0.24143310775980353,
"reward_after_std": 0.481786971911788,
"reward_before_mean": 0.05766227189451456,
"reward_before_std": 0.41715206764638424,
"reward_change_max": 0.0,
"reward_change_mean": -0.29909538850188255,
"reward_change_min": -0.4450513869524002,
"reward_change_std": 0.1692206682637334,
"reward_std": 0.4817869979888201,
"rewards/accuracy_reward": 0.1458333395421505,
"rewards/cosine_scaled_reward": -0.08817105047637597,
"step": 431
},
{
"clip_fraction": 0.0,
"completion_length": 2843.750045776367,
"epoch": 0.4937142857142857,
"grad_norm": 0.05780716612935066,
"kl": 3.104284405708313e-05,
"lambda_div_used": 0.576793298125267,
"learning_rate": 1.5120838934595337e-07,
"loss": 0.0053,
"reward": -0.42155745439231396,
"reward_after_mean": -0.42155745439231396,
"reward_after_std": 0.4230448566377163,
"reward_before_mean": -0.19034346495755017,
"reward_before_std": 0.35024640895426273,
"reward_change_max": 0.0,
"reward_change_mean": -0.23121398873627186,
"reward_change_min": -0.3715337961912155,
"reward_change_std": 0.1338915005326271,
"reward_std": 0.4230448678135872,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.2528434665873647,
"step": 432
},
{
"clip_fraction": 0.0,
"completion_length": 2992.062530517578,
"epoch": 0.4948571428571429,
"grad_norm": 0.06562553346157074,
"kl": 3.0972063541412354e-05,
"lambda_div_used": 0.6079665347933769,
"learning_rate": 1.4976263201891613e-07,
"loss": 0.1201,
"reward": -0.26266857516020536,
"reward_after_mean": -0.26266857516020536,
"reward_after_std": 0.5306479204446077,
"reward_before_mean": -0.0102414321154356,
"reward_before_std": 0.5030816271901131,
"reward_change_max": 0.0,
"reward_change_mean": -0.25242713652551174,
"reward_change_min": -0.4955142140388489,
"reward_change_std": 0.1781205264851451,
"reward_std": 0.5306479260325432,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.15607476886361837,
"step": 433
},
{
"clip_fraction": 0.0,
"completion_length": 2915.625030517578,
"epoch": 0.496,
"grad_norm": 0.06934478878974915,
"kl": 3.8422178477048874e-05,
"lambda_div_used": 0.5604980885982513,
"learning_rate": 1.483363816965435e-07,
"loss": -0.004,
"reward": -0.4280230412259698,
"reward_after_mean": -0.4280230412259698,
"reward_after_std": 0.32627478428184986,
"reward_before_mean": -0.15262745507061481,
"reward_before_std": 0.2731653768569231,
"reward_change_max": 0.0,
"reward_change_mean": -0.27539557591080666,
"reward_change_min": -0.44822341948747635,
"reward_change_std": 0.16339552495628595,
"reward_std": 0.32627478800714016,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.1942941201850772,
"step": 434
},
{
"clip_fraction": 0.0,
"completion_length": 2320.750026702881,
"epoch": 0.49714285714285716,
"grad_norm": 0.13675029575824738,
"kl": 4.8140063881874084e-05,
"lambda_div_used": 0.5811168253421783,
"learning_rate": 1.469297078922642e-07,
"loss": -0.0374,
"reward": -0.17623315937817097,
"reward_after_mean": -0.17623315937817097,
"reward_after_std": 0.483237961307168,
"reward_before_mean": 0.19585114251822233,
"reward_before_std": 0.3759047882631421,
"reward_change_max": 0.0,
"reward_change_mean": -0.37208431400358677,
"reward_change_min": -0.5468379594385624,
"reward_change_std": 0.20962903555482626,
"reward_std": 0.4832379762083292,
"rewards/accuracy_reward": 0.20833333395421505,
"rewards/cosine_scaled_reward": -0.01248218398541212,
"step": 435
},
{
"clip_fraction": 0.0,
"completion_length": 2316.3125381469727,
"epoch": 0.4982857142857143,
"grad_norm": 0.09783437103033066,
"kl": 2.839416265487671e-05,
"lambda_div_used": 0.6289880573749542,
"learning_rate": 1.4554267916537495e-07,
"loss": 0.0826,
"reward": 0.12041758373379707,
"reward_after_mean": 0.12041758373379707,
"reward_after_std": 0.6426001656800508,
"reward_before_mean": 0.5348608233034611,
"reward_before_std": 0.6021685730665922,
"reward_change_max": 0.0,
"reward_change_mean": -0.41444322280585766,
"reward_change_min": -0.6993351243436337,
"reward_change_std": 0.271469890139997,
"reward_std": 0.6426001694053411,
"rewards/accuracy_reward": 0.3958333395421505,
"rewards/cosine_scaled_reward": 0.13902746886014938,
"step": 436
},
{
"clip_fraction": 0.0,
"completion_length": 2954.0416717529297,
"epoch": 0.49942857142857144,
"grad_norm": 0.05496148392558098,
"kl": 0.00023943744599819183,
"lambda_div_used": 0.5626013651490211,
"learning_rate": 1.4417536311769885e-07,
"loss": 0.05,
"reward": -0.42826351523399353,
"reward_after_mean": -0.42826351523399353,
"reward_after_std": 0.33650987036526203,
"reward_before_mean": -0.16868688352406025,
"reward_before_std": 0.2832178361713886,
"reward_change_max": 0.0,
"reward_change_mean": -0.25957662612199783,
"reward_change_min": -0.42033713683485985,
"reward_change_std": 0.1560918828472495,
"reward_std": 0.3365098759531975,
"rewards/accuracy_reward": 0.0625,
"rewards/cosine_scaled_reward": -0.2311868779361248,
"step": 437
},
{
"clip_fraction": 0.0,
"completion_length": 3007.500015258789,
"epoch": 0.5005714285714286,
"grad_norm": 0.05682806298136711,
"kl": 4.358415026217699e-06,
"lambda_div_used": 0.6039799600839615,
"learning_rate": 1.4282782639029128e-07,
"loss": 0.0289,
"reward": -0.29300207551568747,
"reward_after_mean": -0.29300207551568747,
"reward_after_std": 0.5204536523669958,
"reward_before_mean": -0.03094739466905594,
"reward_before_std": 0.4829921592026949,
"reward_change_max": 0.0,
"reward_change_mean": -0.2620546855032444,
"reward_change_min": -0.5145607963204384,
"reward_change_std": 0.18371508549898863,
"reward_std": 0.5204536579549313,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/cosine_scaled_reward": -0.15594739385414869,
"step": 438
},
{
"clip_fraction": 0.0,
"completion_length": 2249.5000076293945,
"epoch": 0.5017142857142857,
"grad_norm": 0.0828532725572586,
"kl": 3.217160701751709e-05,
"lambda_div_used": 0.5708463862538338,
"learning_rate": 1.4150013466019114e-07,
"loss": 0.021,
"reward": -0.40206424333155155,
"reward_after_mean": -0.40206424333155155,
"reward_after_std": 0.40689974650740623,
"reward_before_mean": -0.14063972979784012,
"reward_before_std": 0.32079045102000237,
"reward_change_max": 0.0,
"reward_change_mean": -0.261424507945776,
"reward_change_min": -0.38026969507336617,
"reward_change_std": 0.14067976083606482,
"reward_std": 0.406899768859148,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.1823064020718448,
"step": 439
},
{
"clip_fraction": 0.0,
"completion_length": 2964.000011444092,
"epoch": 0.5028571428571429,
"grad_norm": 0.07350306212902069,
"kl": 2.7701258659362793e-05,
"lambda_div_used": 0.5365751013159752,
"learning_rate": 1.4019235263722034e-07,
"loss": 0.0201,
"reward": -0.5367953963577747,
"reward_after_mean": -0.5367953963577747,
"reward_after_std": 0.231884878128767,
"reward_before_mean": -0.28081973269581795,
"reward_before_std": 0.16059848852455616,
"reward_change_max": 0.0,
"reward_change_mean": -0.25597566179931164,
"reward_change_min": -0.38940757140517235,
"reward_change_std": 0.13862022012472153,
"reward_std": 0.2318848893046379,
"rewards/accuracy_reward": 0.0,
"rewards/cosine_scaled_reward": -0.28081973642110825,
"step": 440
},
{
"clip_fraction": 0.0,
"completion_length": 3007.770896911621,
"epoch": 0.504,
"grad_norm": 0.07498018443584442,
"kl": 2.944841980934143e-05,
"lambda_div_used": 0.6011143997311592,
"learning_rate": 1.3890454406082956e-07,
"loss": -0.0116,
"reward": -0.13889071717858315,
"reward_after_mean": -0.13889071717858315,
"reward_after_std": 0.4864419028162956,
"reward_before_mean": 0.17084867507219315,
"reward_before_std": 0.4689741334877908,
"reward_change_max": 0.0,
"reward_change_mean": -0.3097394183278084,
"reward_change_min": -0.514279019087553,
"reward_change_std": 0.20097953081130981,
"reward_std": 0.48644191212952137,
"rewards/accuracy_reward": 0.2291666753590107,
"rewards/cosine_scaled_reward": -0.05831799004226923,
"step": 441
},
{
"clip_fraction": 0.0,
"completion_length": 2837.4583435058594,
"epoch": 0.5051428571428571,
"grad_norm": 0.0673719272017479,
"kl": 3.160163760185242e-05,
"lambda_div_used": 0.5689455196261406,
"learning_rate": 1.3763677169699217e-07,
"loss": 0.0084,
"reward": -0.31079866108484566,
"reward_after_mean": -0.31079866108484566,
"reward_after_std": 0.3982803635299206,
"reward_before_mean": -0.010625829687342048,
"reward_before_std": 0.31661996035836637,
"reward_change_max": 0.0,
"reward_change_mean": -0.30017283000051975,
"reward_change_min": -0.4261031448841095,
"reward_change_std": 0.16800945159047842,
"reward_std": 0.3982803765684366,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.1147925069089979,
"step": 442
},
{
"clip_fraction": 0.0,
"completion_length": 3194.270854949951,
"epoch": 0.5062857142857143,
"grad_norm": 0.05984271690249443,
"kl": 1.745857298374176e-05,
"lambda_div_used": 0.5607094466686249,
"learning_rate": 1.3638909733514452e-07,
"loss": 0.0053,
"reward": -0.38279488682746887,
"reward_after_mean": -0.38279488682746887,
"reward_after_std": 0.33906700275838375,
"reward_before_mean": -0.09817820321768522,
"reward_before_std": 0.2764374865218997,
"reward_change_max": 0.0,
"reward_change_mean": -0.2846166864037514,
"reward_change_min": -0.4438001811504364,
"reward_change_std": 0.16594033408910036,
"reward_std": 0.3390670083463192,
"rewards/accuracy_reward": 0.0625,
"rewards/cosine_scaled_reward": -0.16067820694297552,
"step": 443
},
{
"clip_fraction": 0.0,
"completion_length": 2902.2291717529297,
"epoch": 0.5074285714285715,
"grad_norm": 0.0638304203748703,
"kl": 1.5033408999443054e-05,
"lambda_div_used": 0.5774018093943596,
"learning_rate": 1.351615817851748e-07,
"loss": -0.0355,
"reward": -0.3352237604558468,
"reward_after_mean": -0.3352237604558468,
"reward_after_std": 0.4039857666939497,
"reward_before_mean": -0.05285666696727276,
"reward_before_std": 0.35096561443060637,
"reward_change_max": 0.0,
"reward_change_mean": -0.2823671065270901,
"reward_change_min": -0.42594215646386147,
"reward_change_std": 0.16656992863863707,
"reward_std": 0.40398577041924,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.15702332742512226,
"step": 444
},
{
"clip_fraction": 0.0,
"completion_length": 3006.916717529297,
"epoch": 0.5085714285714286,
"grad_norm": 0.06397882103919983,
"kl": 2.842303365468979e-05,
"lambda_div_used": 0.6194909662008286,
"learning_rate": 1.3395428487445914e-07,
"loss": 0.0715,
"reward": -0.05263347551226616,
"reward_after_mean": -0.05263347551226616,
"reward_after_std": 0.5679135732352734,
"reward_before_mean": 0.2764017879962921,
"reward_before_std": 0.5510673765093088,
"reward_change_max": 0.0,
"reward_change_mean": -0.3290352523326874,
"reward_change_min": -0.5197134874761105,
"reward_change_std": 0.21186745446175337,
"reward_std": 0.5679135844111443,
"rewards/accuracy_reward": 0.27083334140479565,
"rewards/cosine_scaled_reward": 0.0055684298276901245,
"step": 445
},
{
"clip_fraction": 0.0,
"completion_length": 3028.8333892822266,
"epoch": 0.5097142857142857,
"grad_norm": 0.06140498071908951,
"kl": 3.2648444175720215e-05,
"lambda_div_used": 0.606703408062458,
"learning_rate": 1.3276726544494571e-07,
"loss": -0.0241,
"reward": -0.2073356769979,
"reward_after_mean": -0.2073356769979,
"reward_after_std": 0.5240648984909058,
"reward_before_mean": 0.0723939798772335,
"reward_before_std": 0.4952498711645603,
"reward_change_max": 0.0,
"reward_change_mean": -0.27972964756190777,
"reward_change_min": -0.5379137881100178,
"reward_change_std": 0.1909602265805006,
"reward_std": 0.5240649078041315,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.07343936339020729,
"step": 446
},
{
"clip_fraction": 0.0,
"completion_length": 2571.166721343994,
"epoch": 0.5108571428571429,
"grad_norm": 0.08778514713048935,
"kl": 9.072478860616684e-06,
"lambda_div_used": 0.5970501974225044,
"learning_rate": 1.316005813502869e-07,
"loss": 0.0018,
"reward": -0.19514422863721848,
"reward_after_mean": -0.19514422863721848,
"reward_after_std": 0.4898754768073559,
"reward_before_mean": 0.10967571474611759,
"reward_before_std": 0.4461768325418234,
"reward_change_max": 0.0,
"reward_change_mean": -0.3048199340701103,
"reward_change_min": -0.48957687243819237,
"reward_change_std": 0.18721044715493917,
"reward_std": 0.4898754861205816,
"rewards/accuracy_reward": 0.18750000558793545,
"rewards/cosine_scaled_reward": -0.07782429456710815,
"step": 447
},
{
"clip_fraction": 0.0,
"completion_length": 2039.6042022705078,
"epoch": 0.512,
"grad_norm": 0.12122303992509842,
"kl": 3.530830144882202e-05,
"lambda_div_used": 0.5591797679662704,
"learning_rate": 1.3045428945301953e-07,
"loss": 0.0057,
"reward": -0.3564775697886944,
"reward_after_mean": -0.3564775697886944,
"reward_after_std": 0.33722602762281895,
"reward_before_mean": -0.06660962291061878,
"reward_before_std": 0.27086107339709997,
"reward_change_max": 0.0,
"reward_change_mean": -0.28986795619130135,
"reward_change_min": -0.4345347508788109,
"reward_change_std": 0.1656003799289465,
"reward_std": 0.33722603134810925,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.14994295500218868,
"step": 448
},
{
"clip_fraction": 0.0,
"completion_length": 2680.0208435058594,
"epoch": 0.5131428571428571,
"grad_norm": 0.07745695859193802,
"kl": 3.194063901901245e-05,
"lambda_div_used": 0.5827131420373917,
"learning_rate": 1.2932844562179352e-07,
"loss": -0.0247,
"reward": -0.278532937169075,
"reward_after_mean": -0.278532937169075,
"reward_after_std": 0.44566163793206215,
"reward_before_mean": 0.01611769199371338,
"reward_before_std": 0.3775772461667657,
"reward_change_max": 0.0,
"reward_change_mean": -0.29465061984956264,
"reward_change_min": -0.4545559212565422,
"reward_change_std": 0.17489958554506302,
"reward_std": 0.44566163793206215,
"rewards/accuracy_reward": 0.18750000558793545,
"rewards/cosine_scaled_reward": -0.17138232477009296,
"step": 449
},
{
"clip_fraction": 0.0,
"completion_length": 2405.0833702087402,
"epoch": 0.5142857142857142,
"grad_norm": 0.07359456270933151,
"kl": 4.271417856216431e-05,
"lambda_div_used": 0.5313882529735565,
"learning_rate": 1.2822310472864885e-07,
"loss": 0.0047,
"reward": -0.40455422177910805,
"reward_after_mean": -0.40455422177910805,
"reward_after_std": 0.27055008336901665,
"reward_before_mean": -0.05470774322748184,
"reward_before_std": 0.13779542688280344,
"reward_change_max": 0.0,
"reward_change_mean": -0.34984651021659374,
"reward_change_min": -0.482446551322937,
"reward_change_std": 0.18013106007128954,
"reward_std": 0.2705500964075327,
"rewards/accuracy_reward": 0.125,
"rewards/cosine_scaled_reward": -0.17970772832632065,
"step": 450
},
{
"clip_fraction": 0.0,
"completion_length": 2726.7916870117188,
"epoch": 0.5154285714285715,
"grad_norm": 0.10456772893667221,
"kl": 5.123019218444824e-05,
"lambda_div_used": 0.5659012496471405,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.0437,
"reward": -0.233762688934803,
"reward_after_mean": -0.233762688934803,
"reward_after_std": 0.3937326893210411,
"reward_before_mean": 0.14449233608320355,
"reward_before_std": 0.30359079129993916,
"reward_change_max": 0.0,
"reward_change_mean": -0.3782550562173128,
"reward_change_min": -0.5748258531093597,
"reward_change_std": 0.21745321340858936,
"reward_std": 0.3937326930463314,
"rewards/accuracy_reward": 0.1875,
"rewards/cosine_scaled_reward": -0.0430076620541513,
"step": 451
},
{
"clip_fraction": 0.0,
"completion_length": 3173.812530517578,
"epoch": 0.5165714285714286,
"grad_norm": 0.05382152646780014,
"kl": 1.621060073375702e-05,
"lambda_div_used": 0.628274716436863,
"learning_rate": 1.260741462457165e-07,
"loss": 0.0152,
"reward": 0.08829959481954575,
"reward_after_mean": 0.08829959481954575,
"reward_after_std": 0.6491915434598923,
"reward_before_mean": 0.49067158019170165,
"reward_before_std": 0.5948440972715616,
"reward_change_max": 0.0,
"reward_change_mean": -0.4023719858378172,
"reward_change_min": -0.6429549157619476,
"reward_change_std": 0.2526876190677285,
"reward_std": 0.6491915658116341,
"rewards/accuracy_reward": 0.37500000558793545,
"rewards/cosine_scaled_reward": 0.11567158252000809,
"step": 452
},
{
"clip_fraction": 0.0,
"completion_length": 2601.1666870117188,
"epoch": 0.5177142857142857,
"grad_norm": 0.10710335522890091,
"kl": 4.57763671875e-05,
"lambda_div_used": 0.5769856572151184,
"learning_rate": 1.2503063339313356e-07,
"loss": -0.0233,
"reward": -0.4120060931891203,
"reward_after_mean": -0.4120060931891203,
"reward_after_std": 0.4247955661267042,
"reward_before_mean": -0.17395742796361446,
"reward_before_std": 0.3489131908863783,
"reward_change_max": 0.0,
"reward_change_mean": -0.23804865032434464,
"reward_change_min": -0.3744778670370579,
"reward_change_std": 0.12989939749240875,
"reward_std": 0.4247955847531557,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.21562409959733486,
"step": 453
},
{
"clip_fraction": 0.0,
"completion_length": 2794.8958740234375,
"epoch": 0.5188571428571429,
"grad_norm": 0.059399981051683426,
"kl": 1.829490065574646e-05,
"lambda_div_used": 0.604412779211998,
"learning_rate": 1.2400783294793668e-07,
"loss": 0.0084,
"reward": -0.23108407109975815,
"reward_after_mean": -0.23108407109975815,
"reward_after_std": 0.5207228269428015,
"reward_before_mean": 0.05918463226407766,
"reward_before_std": 0.4853199487552047,
"reward_change_max": 0.0,
"reward_change_mean": -0.29026869870722294,
"reward_change_min": -0.5276229903101921,
"reward_change_std": 0.1960765514522791,
"reward_std": 0.5207228306680918,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.08664871653309092,
"step": 454
},
{
"clip_fraction": 0.0,
"completion_length": 3120.0416717529297,
"epoch": 0.52,
"grad_norm": 0.07823975384235382,
"kl": 1.1070631444454193e-05,
"lambda_div_used": 0.5381916239857674,
"learning_rate": 1.2300579475997657e-07,
"loss": 0.0035,
"reward": -0.5376988351345062,
"reward_after_mean": -0.5376988351345062,
"reward_after_std": 0.2399882897734642,
"reward_before_mean": -0.28268107399344444,
"reward_before_std": 0.16770172398537397,
"reward_change_max": 0.0,
"reward_change_mean": -0.25501775927841663,
"reward_change_min": -0.38403724879026413,
"reward_change_std": 0.1382020702585578,
"reward_std": 0.23998829536139965,
"rewards/accuracy_reward": 0.0,
"rewards/cosine_scaled_reward": -0.28268107026815414,
"step": 455
},
{
"clip_fraction": 0.0,
"completion_length": 3183.125,
"epoch": 0.5211428571428571,
"grad_norm": 0.07441714406013489,
"kl": 1.7248094081878662e-05,
"lambda_div_used": 0.535825714468956,
"learning_rate": 1.220245676671809e-07,
"loss": -0.0061,
"reward": -0.5242688432335854,
"reward_after_mean": -0.5242688432335854,
"reward_after_std": 0.2355129700154066,
"reward_before_mean": -0.2603513076901436,
"reward_before_std": 0.15733365854248405,
"reward_change_max": 0.0,
"reward_change_mean": -0.2639175299555063,
"reward_change_min": -0.3936190530657768,
"reward_change_std": 0.14126356784254313,
"reward_std": 0.23551297932863235,
"rewards/accuracy_reward": 0.0,
"rewards/cosine_scaled_reward": -0.2603513114154339,
"step": 456
},
{
"clip_fraction": 0.0,
"completion_length": 3090.9166717529297,
"epoch": 0.5222857142857142,
"grad_norm": 0.06740739941596985,
"kl": 1.6771256923675537e-05,
"lambda_div_used": 0.5661213397979736,
"learning_rate": 1.2106419949317388e-07,
"loss": 0.0183,
"reward": -0.2666686773300171,
"reward_after_mean": -0.2666686773300171,
"reward_after_std": 0.3630063198506832,
"reward_before_mean": 0.06939902156591415,
"reward_before_std": 0.3003841144964099,
"reward_change_max": 0.0,
"reward_change_mean": -0.3360677044838667,
"reward_change_min": -0.5109980814158916,
"reward_change_std": 0.19570716377347708,
"reward_std": 0.36300632916390896,
"rewards/accuracy_reward": 0.1458333395421505,
"rewards/cosine_scaled_reward": -0.0764343123883009,
"step": 457
},
{
"clip_fraction": 0.0,
"completion_length": 2337.0416717529297,
"epoch": 0.5234285714285715,
"grad_norm": 0.10038017481565475,
"kl": 1.5580561012029648e-05,
"lambda_div_used": 0.587464913725853,
"learning_rate": 1.2012473704494537e-07,
"loss": 0.0337,
"reward": -0.2921891398727894,
"reward_after_mean": -0.2921891398727894,
"reward_after_std": 0.46863692067563534,
"reward_before_mean": -0.017078701872378588,
"reward_before_std": 0.3978575337678194,
"reward_change_max": 0.0,
"reward_change_mean": -0.27511043660342693,
"reward_change_min": -0.4165792725980282,
"reward_change_std": 0.15801922790706158,
"reward_std": 0.4686369299888611,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/cosine_scaled_reward": -0.14207870024256408,
"step": 458
},
{
"clip_fraction": 0.0,
"completion_length": 1897.9791870117188,
"epoch": 0.5245714285714286,
"grad_norm": 0.10566994547843933,
"kl": 3.1791627407073975e-05,
"lambda_div_used": 0.6483638733625412,
"learning_rate": 1.1920622611056974e-07,
"loss": -0.0327,
"reward": 0.04763873480260372,
"reward_after_mean": 0.04763873480260372,
"reward_after_std": 0.6893943976610899,
"reward_before_mean": 0.37104589492082596,
"reward_before_std": 0.6905025038868189,
"reward_change_max": 0.0,
"reward_change_mean": -0.3234071359038353,
"reward_change_min": -0.573150984942913,
"reward_change_std": 0.22597133833914995,
"reward_std": 0.6893944274634123,
"rewards/accuracy_reward": 0.29166667349636555,
"rewards/cosine_scaled_reward": 0.07937921397387981,
"step": 459
},
{
"clip_fraction": 0.0,
"completion_length": 3152.9583587646484,
"epoch": 0.5257142857142857,
"grad_norm": 0.0553775392472744,
"kl": 1.6976147890090942e-05,
"lambda_div_used": 0.6212490126490593,
"learning_rate": 1.1830871145697412e-07,
"loss": -0.0403,
"reward": -0.2310712798498571,
"reward_after_mean": -0.2310712798498571,
"reward_after_std": 0.5967046339064837,
"reward_before_mean": 0.018506756518036127,
"reward_before_std": 0.5584379080682993,
"reward_change_max": 0.0,
"reward_change_mean": -0.24957803264260292,
"reward_change_min": -0.45073715783655643,
"reward_change_std": 0.16405375488102436,
"reward_std": 0.5967046469449997,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.1273265864001587,
"step": 460
},
{
"clip_fraction": 0.0,
"completion_length": 2965.625015258789,
"epoch": 0.5268571428571428,
"grad_norm": 0.0683179497718811,
"kl": 3.898143768310547e-05,
"lambda_div_used": 0.5838751494884491,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.0201,
"reward": -0.36440329253673553,
"reward_after_mean": -0.36440329253673553,
"reward_after_std": 0.44049660488963127,
"reward_before_mean": -0.10854836623184383,
"reward_before_std": 0.3863721750676632,
"reward_change_max": 0.0,
"reward_change_mean": -0.25585492700338364,
"reward_change_min": -0.41783962957561016,
"reward_change_std": 0.15460436698049307,
"reward_std": 0.44049660861492157,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/cosine_scaled_reward": -0.19188169576227665,
"step": 461
},
{
"clip_fraction": 0.0,
"completion_length": 2874.3125076293945,
"epoch": 0.528,
"grad_norm": 0.07832667976617813,
"kl": 2.7470290660858154e-05,
"lambda_div_used": 0.5419389680027962,
"learning_rate": 1.1657684494105386e-07,
"loss": -0.0381,
"reward": -0.5123195722699165,
"reward_after_mean": -0.5123195722699165,
"reward_after_std": 0.2568210382014513,
"reward_before_mean": -0.2502432279288769,
"reward_before_std": 0.1842196974903345,
"reward_change_max": 0.0,
"reward_change_mean": -0.2620763499289751,
"reward_change_min": -0.4005816727876663,
"reward_change_std": 0.14368234388530254,
"reward_std": 0.2568210456520319,
"rewards/accuracy_reward": 0.0,
"rewards/cosine_scaled_reward": -0.2502432242035866,
"step": 462
},
{
"clip_fraction": 0.0,
"completion_length": 2986.8333740234375,
"epoch": 0.5291428571428571,
"grad_norm": 0.053950026631355286,
"kl": 8.771196007728577e-06,
"lambda_div_used": 0.622698076069355,
"learning_rate": 1.1574257748745986e-07,
"loss": 0.0106,
"reward": -0.2041953857988119,
"reward_after_mean": -0.2041953857988119,
"reward_after_std": 0.6069091446697712,
"reward_before_mean": 0.05050618201494217,
"reward_before_std": 0.5684067364782095,
"reward_change_max": 0.0,
"reward_change_mean": -0.25470156222581863,
"reward_change_min": -0.4568561315536499,
"reward_change_std": 0.16765617858618498,
"reward_std": 0.6069091446697712,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.09532715613022447,
"step": 463
},
{
"clip_fraction": 0.0,
"completion_length": 1749.0416717529297,
"epoch": 0.5302857142857142,
"grad_norm": 0.11516103893518448,
"kl": 3.466010093688965e-05,
"lambda_div_used": 0.5341480076313019,
"learning_rate": 1.1492947512799328e-07,
"loss": -0.0088,
"reward": -0.040389321744441986,
"reward_after_mean": -0.040389321744441986,
"reward_after_std": 0.37696985714137554,
"reward_before_mean": 0.5386080192402005,
"reward_before_std": 0.14989514648914337,
"reward_change_max": 0.0,
"reward_change_mean": -0.5789973307400942,
"reward_change_min": -0.7674467526376247,
"reward_change_std": 0.2917799688875675,
"reward_std": 0.37696986459195614,
"rewards/accuracy_reward": 0.375,
"rewards/cosine_scaled_reward": 0.16360801365226507,
"step": 464
},
{
"clip_fraction": 0.0,
"completion_length": 3036.854179382324,
"epoch": 0.5314285714285715,
"grad_norm": 0.08283663541078568,
"kl": 2.4955719709396362e-05,
"lambda_div_used": 0.572467640042305,
"learning_rate": 1.1413757749211602e-07,
"loss": 0.0373,
"reward": -0.40219551732297987,
"reward_after_mean": -0.40219551732297987,
"reward_after_std": 0.40797613374888897,
"reward_before_mean": -0.1455877646803856,
"reward_before_std": 0.32708599977195263,
"reward_change_max": 0.0,
"reward_change_mean": -0.256607748568058,
"reward_change_min": -0.37336331233382225,
"reward_change_std": 0.1377662243321538,
"reward_std": 0.40797614119946957,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.18725443072617054,
"step": 465
},
{
"clip_fraction": 0.0,
"completion_length": 3122.9583435058594,
"epoch": 0.5325714285714286,
"grad_norm": 0.07085831463336945,
"kl": 2.0582228899002075e-05,
"lambda_div_used": 0.5319224968552589,
"learning_rate": 1.1336692317580158e-07,
"loss": -0.0563,
"reward": -0.36368466913700104,
"reward_after_mean": -0.36368466913700104,
"reward_after_std": 0.27606455981731415,
"reward_before_mean": 0.005439521744847298,
"reward_before_std": 0.14014938473701477,
"reward_change_max": 0.0,
"reward_change_mean": -0.36912417598068714,
"reward_change_min": -0.5061142668128014,
"reward_change_std": 0.1899967910721898,
"reward_std": 0.2760645691305399,
"rewards/accuracy_reward": 0.125,
"rewards/cosine_scaled_reward": -0.11956048291176558,
"step": 466
},
{
"clip_fraction": 0.0,
"completion_length": 3113.2708854675293,
"epoch": 0.5337142857142857,
"grad_norm": 0.07966674119234085,
"kl": 3.802310675382614e-05,
"lambda_div_used": 0.5612561926245689,
"learning_rate": 1.1261754973965422e-07,
"loss": -0.0067,
"reward": -0.3137255348265171,
"reward_after_mean": -0.3137255348265171,
"reward_after_std": 0.39207486622035503,
"reward_before_mean": 0.03320633992552757,
"reward_before_std": 0.2764817178249359,
"reward_change_max": 0.0,
"reward_change_mean": -0.3469318952411413,
"reward_change_min": -0.4828169047832489,
"reward_change_std": 0.18068420328199863,
"reward_std": 0.3920748811215162,
"rewards/accuracy_reward": 0.14583333395421505,
"rewards/cosine_scaled_reward": -0.11262698657810688,
"step": 467
},
{
"clip_fraction": 0.0,
"completion_length": 2898.770854949951,
"epoch": 0.5348571428571428,
"grad_norm": 0.07577298581600189,
"kl": 4.957616329193115e-05,
"lambda_div_used": 0.5363663136959076,
"learning_rate": 1.1188949370707787e-07,
"loss": 0.0338,
"reward": -0.5352834053337574,
"reward_after_mean": -0.5352834053337574,
"reward_after_std": 0.22454985231161118,
"reward_before_mean": -0.2775777019560337,
"reward_before_std": 0.15978037798777223,
"reward_change_max": 0.0,
"reward_change_mean": -0.2577057033777237,
"reward_change_min": -0.3789172098040581,
"reward_change_std": 0.14186344109475613,
"reward_std": 0.22454985417425632,
"rewards/accuracy_reward": 0.0,
"rewards/cosine_scaled_reward": -0.27757770381867886,
"step": 468
},
{
"clip_fraction": 0.0,
"completion_length": 2854.8750228881836,
"epoch": 0.536,
"grad_norm": 0.09182782471179962,
"kl": 2.0341947674751282e-05,
"lambda_div_used": 0.5550094842910767,
"learning_rate": 1.1118279056249653e-07,
"loss": 0.0301,
"reward": -0.35969678312540054,
"reward_after_mean": -0.35969678312540054,
"reward_after_std": 0.30523936823010445,
"reward_before_mean": -0.04356633126735687,
"reward_before_std": 0.2476841462776065,
"reward_change_max": 0.0,
"reward_change_mean": -0.31613045185804367,
"reward_change_min": -0.45744797959923744,
"reward_change_std": 0.17962745483964682,
"reward_std": 0.3052393738180399,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.12689966335892677,
"step": 469
},
{
"clip_fraction": 0.0,
"completion_length": 3333.562530517578,
"epoch": 0.5371428571428571,
"grad_norm": 0.05610484257340431,
"kl": 2.9811635613441467e-05,
"lambda_div_used": 0.5911690816283226,
"learning_rate": 1.1049747474962444e-07,
"loss": 0.0307,
"reward": -0.37843912467360497,
"reward_after_mean": -0.37843912467360497,
"reward_after_std": 0.49228920973837376,
"reward_before_mean": -0.14763083557772916,
"reward_before_std": 0.4128831513226032,
"reward_change_max": 0.0,
"reward_change_mean": -0.230808287858963,
"reward_change_min": -0.3457215949892998,
"reward_change_std": 0.12627399526536465,
"reward_std": 0.49228921718895435,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.21013083728030324,
"step": 470
},
{
"clip_fraction": 0.0,
"completion_length": 3032.312515258789,
"epoch": 0.5382857142857143,
"grad_norm": 0.053547170013189316,
"kl": 2.0138919353485107e-05,
"lambda_div_used": 0.5520770102739334,
"learning_rate": 1.0983357966978745e-07,
"loss": -0.0036,
"reward": -0.32707202807068825,
"reward_after_mean": -0.32707202807068825,
"reward_after_std": 0.30774626694619656,
"reward_before_mean": 0.006061417981982231,
"reward_before_std": 0.23228682670742273,
"reward_change_max": 0.0,
"reward_change_mean": -0.3331334535032511,
"reward_change_min": -0.47613633051514626,
"reward_change_std": 0.181988756172359,
"reward_std": 0.3077462762594223,
"rewards/accuracy_reward": 0.1041666716337204,
"rewards/cosine_scaled_reward": -0.09810524806380272,
"step": 471
},
{
"clip_fraction": 0.0,
"completion_length": 3115.8333435058594,
"epoch": 0.5394285714285715,
"grad_norm": 0.05183921754360199,
"kl": 1.757591962814331e-05,
"lambda_div_used": 0.6081600487232208,
"learning_rate": 1.0919113768029517e-07,
"loss": 0.0248,
"reward": -0.22974545462056994,
"reward_after_mean": -0.22974545462056994,
"reward_after_std": 0.5618578754365444,
"reward_before_mean": 0.03380160592496395,
"reward_before_std": 0.4960998175665736,
"reward_change_max": 0.0,
"reward_change_mean": -0.2635470647364855,
"reward_change_min": -0.4026007801294327,
"reward_change_std": 0.1525269951671362,
"reward_std": 0.5618579015135765,
"rewards/accuracy_reward": 0.14583333767950535,
"rewards/cosine_scaled_reward": -0.1120317269815132,
"step": 472
},
{
"clip_fraction": 0.0,
"completion_length": 3389.7291870117188,
"epoch": 0.5405714285714286,
"grad_norm": 0.05399390682578087,
"kl": 2.3730099201202393e-05,
"lambda_div_used": 0.6023471653461456,
"learning_rate": 1.0857018009286381e-07,
"loss": -0.0081,
"reward": -0.16527403378859162,
"reward_after_mean": -0.16527403378859162,
"reward_after_std": 0.4859522972255945,
"reward_before_mean": 0.15660450607538223,
"reward_before_std": 0.469901567324996,
"reward_change_max": 0.0,
"reward_change_mean": -0.3218785412609577,
"reward_change_min": -0.5562809407711029,
"reward_change_std": 0.21282578073441982,
"reward_std": 0.48595230281352997,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/cosine_scaled_reward": -0.051728841848671436,
"step": 473
},
{
"clip_fraction": 0.0,
"completion_length": 2609.3541946411133,
"epoch": 0.5417142857142857,
"grad_norm": 0.07730484753847122,
"kl": 1.3811513781547546e-05,
"lambda_div_used": 0.5951687097549438,
"learning_rate": 1.0797073717209013e-07,
"loss": 0.085,
"reward": 0.0976928174495697,
"reward_after_mean": 0.0976928174495697,
"reward_after_std": 0.5153173375874758,
"reward_before_mean": 0.5706379320472479,
"reward_before_std": 0.4373670984059572,
"reward_change_max": 0.0,
"reward_change_mean": -0.47294510155916214,
"reward_change_min": -0.6950589865446091,
"reward_change_std": 0.27640796452760696,
"reward_std": 0.515317365527153,
"rewards/accuracy_reward": 0.3958333432674408,
"rewards/cosine_scaled_reward": 0.1748045664280653,
"step": 474
},
{
"clip_fraction": 0.0,
"completion_length": 2361.020851135254,
"epoch": 0.5428571428571428,
"grad_norm": 0.08214546740055084,
"kl": 9.525567293167114e-06,
"lambda_div_used": 0.6207183972001076,
"learning_rate": 1.0739283813397639e-07,
"loss": 0.0204,
"reward": 0.0007177861407399178,
"reward_after_mean": 0.0007177861407399178,
"reward_after_std": 0.5817896965891123,
"reward_before_mean": 0.34289272502064705,
"reward_before_std": 0.5596933793276548,
"reward_change_max": 0.0,
"reward_change_mean": -0.3421749472618103,
"reward_change_min": -0.5431834943592548,
"reward_change_std": 0.21874003671109676,
"reward_std": 0.581789730116725,
"rewards/accuracy_reward": 0.29166667722165585,
"rewards/cosine_scaled_reward": 0.05122605012729764,
"step": 475
},
{
"clip_fraction": 0.0,
"completion_length": 2583.8333740234375,
"epoch": 0.544,
"grad_norm": 0.08052952587604523,
"kl": 3.0007213354110718e-05,
"lambda_div_used": 0.6784488782286644,
"learning_rate": 1.068365111445064e-07,
"loss": 0.0154,
"reward": -0.031517775263637304,
"reward_after_mean": -0.031517775263637304,
"reward_after_std": 0.8641770519316196,
"reward_before_mean": 0.2038349723443389,
"reward_before_std": 0.8286011293530464,
"reward_change_max": 0.0,
"reward_change_mean": -0.23535274900496006,
"reward_change_min": -0.41804002970457077,
"reward_change_std": 0.16118939872831106,
"reward_std": 0.8641770891845226,
"rewards/accuracy_reward": 0.22916666977107525,
"rewards/cosine_scaled_reward": -0.025331700686365366,
"step": 476
},
{
"clip_fraction": 0.0,
"completion_length": 2308.291732788086,
"epoch": 0.5451428571428572,
"grad_norm": 0.06839341670274734,
"kl": 8.73953104019165e-06,
"lambda_div_used": 0.6096622571349144,
"learning_rate": 1.063017833182728e-07,
"loss": 0.1111,
"reward": 0.19456686079502106,
"reward_after_mean": 0.19456686079502106,
"reward_after_std": 0.5847878716886044,
"reward_before_mean": 0.678333050571382,
"reward_before_std": 0.5006472393870354,
"reward_change_max": 0.0,
"reward_change_mean": -0.48376619443297386,
"reward_change_min": -0.6835142783820629,
"reward_change_std": 0.2774948738515377,
"reward_std": 0.5847878959029913,
"rewards/accuracy_reward": 0.45833334885537624,
"rewards/cosine_scaled_reward": 0.21999971382319927,
"step": 477
},
{
"clip_fraction": 0.0,
"completion_length": 3169.3125610351562,
"epoch": 0.5462857142857143,
"grad_norm": 0.052864328026771545,
"kl": 6.990041583776474e-06,
"lambda_div_used": 0.5758631080389023,
"learning_rate": 1.0578868071715544e-07,
"loss": 0.0349,
"reward": -0.23655124567449093,
"reward_after_mean": -0.23655124567449093,
"reward_after_std": 0.4532997701317072,
"reward_before_mean": 0.11144567281007767,
"reward_before_std": 0.3459607223048806,
"reward_change_max": 0.0,
"reward_change_mean": -0.3479969333857298,
"reward_change_min": -0.5187661461532116,
"reward_change_std": 0.19546091184020042,
"reward_std": 0.4532997887581587,
"rewards/accuracy_reward": 0.20833333395421505,
"rewards/cosine_scaled_reward": -0.09688764810562134,
"step": 478
},
{
"clip_fraction": 0.0,
"completion_length": 2825.916702270508,
"epoch": 0.5474285714285714,
"grad_norm": 0.08355361968278885,
"kl": 3.739655949175358e-05,
"lambda_div_used": 0.5902442187070847,
"learning_rate": 1.0529722834905125e-07,
"loss": 0.033,
"reward": -0.37585191056132317,
"reward_after_mean": -0.37585191056132317,
"reward_after_std": 0.45466959848999977,
"reward_before_mean": -0.12638765759766102,
"reward_before_std": 0.4168459586799145,
"reward_change_max": 0.0,
"reward_change_mean": -0.2494642548263073,
"reward_change_min": -0.4229847304522991,
"reward_change_std": 0.15689124166965485,
"reward_std": 0.4546696189790964,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.18888765573501587,
"step": 479
},
{
"clip_fraction": 0.0,
"completion_length": 2415.895866394043,
"epoch": 0.5485714285714286,
"grad_norm": 0.08123134076595306,
"kl": 1.4044344425201416e-05,
"lambda_div_used": 0.5890126600861549,
"learning_rate": 1.0482745016665526e-07,
"loss": 0.0739,
"reward": -0.36289478465914726,
"reward_after_mean": -0.36289478465914726,
"reward_after_std": 0.4815117195248604,
"reward_before_mean": -0.11086839716881514,
"reward_before_std": 0.4026770405471325,
"reward_change_max": 0.0,
"reward_change_mean": -0.25202639400959015,
"reward_change_min": -0.37619682028889656,
"reward_change_std": 0.13623447716236115,
"reward_std": 0.4815117232501507,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.17336839414201677,
"step": 480
},
{
"clip_fraction": 0.0,
"completion_length": 3057.625030517578,
"epoch": 0.5497142857142857,
"grad_norm": 0.05632660165429115,
"kl": 2.8233975172042847e-05,
"lambda_div_used": 0.5841230228543282,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0198,
"reward": -0.386552257463336,
"reward_after_mean": -0.386552257463336,
"reward_after_std": 0.4468122199177742,
"reward_before_mean": -0.14256569184362888,
"reward_before_std": 0.38717135414481163,
"reward_change_max": 0.0,
"reward_change_mean": -0.24398657120764256,
"reward_change_min": -0.42186837270855904,
"reward_change_std": 0.14736179821193218,
"reward_std": 0.4468122236430645,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.20506569370627403,
"step": 481
},
{
"clip_fraction": 0.0,
"completion_length": 2824.5833587646484,
"epoch": 0.5508571428571428,
"grad_norm": 0.0865793228149414,
"kl": 2.4417764507234097e-05,
"lambda_div_used": 0.622517004609108,
"learning_rate": 1.0395300688680625e-07,
"loss": -0.0313,
"reward": -0.05876433290541172,
"reward_after_mean": -0.05876433290541172,
"reward_after_std": 0.5989817604422569,
"reward_before_mean": 0.259432727470994,
"reward_before_std": 0.5737235806882381,
"reward_change_max": 0.0,
"reward_change_mean": -0.31819707341492176,
"reward_change_min": -0.5269695781171322,
"reward_change_std": 0.21138717606663704,
"reward_std": 0.5989817790687084,
"rewards/accuracy_reward": 0.22916667349636555,
"rewards/cosine_scaled_reward": 0.030266055837273598,
"step": 482
},
{
"clip_fraction": 0.0,
"completion_length": 2869.250015258789,
"epoch": 0.552,
"grad_norm": 0.08511485904455185,
"kl": 3.474205732345581e-05,
"lambda_div_used": 0.6013401597738266,
"learning_rate": 1.0354838440848501e-07,
"loss": -0.0837,
"reward": -0.2791392467916012,
"reward_after_mean": -0.2791392467916012,
"reward_after_std": 0.5272991992533207,
"reward_before_mean": -0.02371177263557911,
"reward_before_std": 0.4689871799200773,
"reward_change_max": 0.0,
"reward_change_mean": -0.2554274704307318,
"reward_change_min": -0.40494704991579056,
"reward_change_std": 0.15485912468284369,
"reward_std": 0.5272992141544819,
"rewards/accuracy_reward": 0.1041666679084301,
"rewards/cosine_scaled_reward": -0.1278784405440092,
"step": 483
},
{
"clip_fraction": 0.0,
"completion_length": 2591.7500228881836,
"epoch": 0.5531428571428572,
"grad_norm": 0.08745326101779938,
"kl": 3.6368146538734436e-07,
"lambda_div_used": 0.6466837078332901,
"learning_rate": 1.0316552135205837e-07,
"loss": 0.0082,
"reward": -0.04187892563641071,
"reward_after_mean": -0.04187892563641071,
"reward_after_std": 0.6843612100929022,
"reward_before_mean": 0.242530676885508,
"reward_before_std": 0.6844684220850468,
"reward_change_max": 0.0,
"reward_change_mean": -0.284409599378705,
"reward_change_min": -0.49204646795988083,
"reward_change_std": 0.1993796620517969,
"reward_std": 0.6843612212687731,
"rewards/accuracy_reward": 0.2708333395421505,
"rewards/cosine_scaled_reward": -0.0283026653341949,
"step": 484
},
{
"clip_fraction": 0.0,
"completion_length": 2046.2500381469727,
"epoch": 0.5542857142857143,
"grad_norm": 0.09076947718858719,
"kl": 2.752244472503662e-05,
"lambda_div_used": 0.5719796344637871,
"learning_rate": 1.0280443637773163e-07,
"loss": 0.0035,
"reward": -0.40644849208183587,
"reward_after_mean": -0.40644849208183587,
"reward_after_std": 0.398155614733696,
"reward_before_mean": -0.14403727410535794,
"reward_before_std": 0.3282261691056192,
"reward_change_max": 0.0,
"reward_change_mean": -0.2624112106859684,
"reward_change_min": -0.43003450334072113,
"reward_change_std": 0.15388297475874424,
"reward_std": 0.39815562404692173,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.20653727487660944,
"step": 485
},
{
"clip_fraction": 0.0,
"completion_length": 1967.7916946411133,
"epoch": 0.5554285714285714,
"grad_norm": 0.09958811849355698,
"kl": 3.91155481338501e-05,
"lambda_div_used": 0.5703976079821587,
"learning_rate": 1.0246514708427701e-07,
"loss": 0.0966,
"reward": -0.26609380822628736,
"reward_after_mean": -0.26609380822628736,
"reward_after_std": 0.4386676363646984,
"reward_before_mean": 0.07946969009935856,
"reward_before_std": 0.31633214373141527,
"reward_change_max": 0.0,
"reward_change_mean": -0.34556350111961365,
"reward_change_min": -0.48576217144727707,
"reward_change_std": 0.1828850321471691,
"reward_std": 0.4386676475405693,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.08719698037020862,
"step": 486
},
{
"clip_fraction": 0.0,
"completion_length": 1967.0625228881836,
"epoch": 0.5565714285714286,
"grad_norm": 0.10718576610088348,
"kl": 3.7085264921188354e-06,
"lambda_div_used": 0.6087248548865318,
"learning_rate": 1.0214767000817596e-07,
"loss": 0.0461,
"reward": 0.08780635055154562,
"reward_after_mean": 0.08780635055154562,
"reward_after_std": 0.6550032701343298,
"reward_before_mean": 0.5625860100844875,
"reward_before_std": 0.5011934200301766,
"reward_change_max": 0.0,
"reward_change_mean": -0.47477963753044605,
"reward_change_min": -0.7034505233168602,
"reward_change_std": 0.26853484753519297,
"reward_std": 0.6550032943487167,
"rewards/accuracy_reward": 0.3958333358168602,
"rewards/cosine_scaled_reward": 0.16675265738740563,
"step": 487
},
{
"clip_fraction": 0.0,
"completion_length": 2356.8125228881836,
"epoch": 0.5577142857142857,
"grad_norm": 0.08583448082208633,
"kl": 2.1670013666152954e-05,
"lambda_div_used": 0.5598724335432053,
"learning_rate": 1.0185202062281336e-07,
"loss": -0.0446,
"reward": -0.39712974801659584,
"reward_after_mean": -0.39712974801659584,
"reward_after_std": 0.3282298669219017,
"reward_before_mean": -0.11880321707576513,
"reward_before_std": 0.2731999019160867,
"reward_change_max": 0.0,
"reward_change_mean": -0.27832652255892754,
"reward_change_min": -0.42709672823548317,
"reward_change_std": 0.16301770228892565,
"reward_std": 0.3282298743724823,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.20213654916733503,
"step": 488
},
{
"clip_fraction": 0.0,
"completion_length": 3039.2916870117188,
"epoch": 0.5588571428571428,
"grad_norm": 0.08493036031723022,
"kl": 4.955753684043884e-05,
"lambda_div_used": 0.5549413114786148,
"learning_rate": 1.0157821333772304e-07,
"loss": -0.0127,
"reward": -0.4395306259393692,
"reward_after_mean": -0.4395306259393692,
"reward_after_std": 0.30519232526421547,
"reward_before_mean": -0.16155868768692017,
"reward_before_std": 0.24803727120161057,
"reward_change_max": 0.0,
"reward_change_mean": -0.27797193080186844,
"reward_change_min": -0.44194458797574043,
"reward_change_std": 0.1623495165258646,
"reward_std": 0.3051923308521509,
"rewards/accuracy_reward": 0.0625,
"rewards/cosine_scaled_reward": -0.22405867651104927,
"step": 489
},
{
"clip_fraction": 0.0,
"completion_length": 2838.0833892822266,
"epoch": 0.56,
"grad_norm": 0.06069381162524223,
"kl": 1.2032687664031982e-05,
"lambda_div_used": 0.583185076713562,
"learning_rate": 1.013262614978859e-07,
"loss": 0.0139,
"reward": -0.1338807214051485,
"reward_after_mean": -0.1338807214051485,
"reward_after_std": 0.4649778436869383,
"reward_before_mean": 0.25663699954748154,
"reward_before_std": 0.3789667785167694,
"reward_change_max": 0.0,
"reward_change_mean": -0.39051773957908154,
"reward_change_min": -0.6341840960085392,
"reward_change_std": 0.2300328817218542,
"reward_std": 0.46497784554958344,
"rewards/accuracy_reward": 0.2708333395421505,
"rewards/cosine_scaled_reward": -0.014196328818798065,
"step": 490
},
{
"clip_fraction": 0.0,
"completion_length": 2593.7291870117188,
"epoch": 0.5611428571428572,
"grad_norm": 0.07340901345014572,
"kl": 3.531062975525856e-05,
"lambda_div_used": 0.5965912714600563,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.0268,
"reward": 0.01746355928480625,
"reward_after_mean": 0.01746355928480625,
"reward_after_std": 0.5940692499279976,
"reward_before_mean": 0.47233812790364027,
"reward_before_std": 0.4421461224555969,
"reward_change_max": 0.0,
"reward_change_mean": -0.4548745583742857,
"reward_change_min": -0.6400704979896545,
"reward_change_std": 0.24487750884145498,
"reward_std": 0.5940692741423845,
"rewards/accuracy_reward": 0.3541666679084301,
"rewards/cosine_scaled_reward": 0.11817142926156521,
"step": 491
},
{
"clip_fraction": 0.0,
"completion_length": 2583.5416679382324,
"epoch": 0.5622857142857143,
"grad_norm": 0.08282249420881271,
"kl": 2.4866312742233276e-05,
"lambda_div_used": 0.5367531701922417,
"learning_rate": 1.0088797220727779e-07,
"loss": 0.0376,
"reward": -0.3908469006419182,
"reward_after_mean": -0.3908469006419182,
"reward_after_std": 0.27869052439928055,
"reward_before_mean": -0.03699151985347271,
"reward_before_std": 0.16138391755521297,
"reward_change_max": 0.0,
"reward_change_mean": -0.3538553789258003,
"reward_change_min": -0.5079905577003956,
"reward_change_std": 0.18725439626723528,
"reward_std": 0.27869053184986115,
"rewards/accuracy_reward": 0.125,
"rewards/cosine_scaled_reward": -0.1619915273040533,
"step": 492
},
{
"clip_fraction": 0.0,
"completion_length": 2176.8333740234375,
"epoch": 0.5634285714285714,
"grad_norm": 0.09936831891536713,
"kl": 6.021931767463684e-06,
"lambda_div_used": 0.596942737698555,
"learning_rate": 1.0070165611810855e-07,
"loss": 0.0246,
"reward": -0.037120603024959564,
"reward_after_mean": -0.037120603024959564,
"reward_after_std": 0.46812310442328453,
"reward_before_mean": 0.34137603268027306,
"reward_before_std": 0.44823691714555025,
"reward_change_max": 0.0,
"reward_change_mean": -0.3784966245293617,
"reward_change_min": -0.5661123469471931,
"reward_change_std": 0.234741548076272,
"reward_std": 0.4681231305003166,
"rewards/accuracy_reward": 0.2708333432674408,
"rewards/cosine_scaled_reward": 0.07054269965738058,
"step": 493
},
{
"clip_fraction": 0.0,
"completion_length": 2352.583427429199,
"epoch": 0.5645714285714286,
"grad_norm": 0.07574566453695297,
"kl": 1.6648322343826294e-05,
"lambda_div_used": 0.6796721071004868,
"learning_rate": 1.005372381963547e-07,
"loss": 0.092,
"reward": 0.10963849350810051,
"reward_after_mean": 0.10963849350810051,
"reward_after_std": 0.8239130303263664,
"reward_before_mean": 0.4074633736163378,
"reward_before_std": 0.848267612978816,
"reward_change_max": 0.0,
"reward_change_mean": -0.2978248745203018,
"reward_change_min": -0.6511921100318432,
"reward_change_std": 0.2445714958012104,
"reward_std": 0.8239130582660437,
"rewards/accuracy_reward": 0.3125000037252903,
"rewards/cosine_scaled_reward": 0.09496336756274104,
"step": 494
},
{
"clip_fraction": 0.0,
"completion_length": 3203.8959045410156,
"epoch": 0.5657142857142857,
"grad_norm": 0.057527996599674225,
"kl": 2.3846514523029327e-05,
"lambda_div_used": 0.5994853675365448,
"learning_rate": 1.0039472645551372e-07,
"loss": 0.0637,
"reward": -0.21300244145095348,
"reward_after_mean": -0.21300244145095348,
"reward_after_std": 0.48377062380313873,
"reward_before_mean": 0.08397414721548557,
"reward_before_std": 0.4531490486115217,
"reward_change_max": 0.0,
"reward_change_mean": -0.29697658494114876,
"reward_change_min": -0.49157723411917686,
"reward_change_std": 0.1875515878200531,
"reward_std": 0.4837706368416548,
"rewards/accuracy_reward": 0.16666667349636555,
"rewards/cosine_scaled_reward": -0.08269252139143646,
"step": 495
},
{
"clip_fraction": 0.0,
"completion_length": 1895.9375457763672,
"epoch": 0.5668571428571428,
"grad_norm": 0.10791665315628052,
"kl": 1.890142448246479e-05,
"lambda_div_used": 0.6198414713144302,
"learning_rate": 1.002741278414069e-07,
"loss": 0.0183,
"reward": 0.09977892541792244,
"reward_after_mean": 0.09977892541792244,
"reward_after_std": 0.6343465056270361,
"reward_before_mean": 0.5255357641726732,
"reward_before_std": 0.555769513361156,
"reward_change_max": 0.0,
"reward_change_mean": -0.4257568307220936,
"reward_change_min": -0.6588771641254425,
"reward_change_std": 0.25742017664015293,
"reward_std": 0.634346516802907,
"rewards/accuracy_reward": 0.35416667349636555,
"rewards/cosine_scaled_reward": 0.17136909160763025,
"step": 496
},
{
"clip_fraction": 0.0,
"completion_length": 2577.645851135254,
"epoch": 0.568,
"grad_norm": 0.06887742131948471,
"kl": 2.493336796760559e-05,
"lambda_div_used": 0.6232517957687378,
"learning_rate": 1.0017544823184055e-07,
"loss": 0.0143,
"reward": 0.039356768131256104,
"reward_after_mean": 0.039356768131256104,
"reward_after_std": 0.5658366903662682,
"reward_before_mean": 0.401943476870656,
"reward_before_std": 0.5692849718034267,
"reward_change_max": 0.0,
"reward_change_mean": -0.362586697563529,
"reward_change_min": -0.6212925836443901,
"reward_change_std": 0.24119753576815128,
"reward_std": 0.5658366959542036,
"rewards/accuracy_reward": 0.3333333469927311,
"rewards/cosine_scaled_reward": 0.06861014291644096,
"step": 497
},
{
"clip_fraction": 0.0,
"completion_length": 2857.2916946411133,
"epoch": 0.5691428571428572,
"grad_norm": 0.06479962170124054,
"kl": 9.963754564523697e-06,
"lambda_div_used": 0.6236685812473297,
"learning_rate": 1.0009869243631952e-07,
"loss": 0.0008,
"reward": -0.18094617873430252,
"reward_after_mean": -0.18094617873430252,
"reward_after_std": 0.6155845355242491,
"reward_before_mean": 0.07468715589493513,
"reward_before_std": 0.5768492119386792,
"reward_change_max": 0.0,
"reward_change_mean": -0.2556333262473345,
"reward_change_min": -0.4431779384613037,
"reward_change_std": 0.16846577636897564,
"reward_std": 0.6155845616012812,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.0919795110821724,
"step": 498
},
{
"clip_fraction": 0.0,
"completion_length": 2707.791702270508,
"epoch": 0.5702857142857143,
"grad_norm": 0.06317302584648132,
"kl": 1.5532365068793297e-05,
"lambda_div_used": 0.6250721588730812,
"learning_rate": 1.000438641958131e-07,
"loss": -0.0298,
"reward": -0.12677906453609467,
"reward_after_mean": -0.12677906453609467,
"reward_after_std": 0.6044176463037729,
"reward_before_mean": 0.1698420336470008,
"reward_before_std": 0.5829241154715419,
"reward_change_max": 0.0,
"reward_change_mean": -0.2966211009770632,
"reward_change_min": -0.5905468240380287,
"reward_change_std": 0.2108100038021803,
"reward_std": 0.6044176481664181,
"rewards/accuracy_reward": 0.2083333358168602,
"rewards/cosine_scaled_reward": -0.03849130589514971,
"step": 499
},
{
"clip_fraction": 0.0,
"completion_length": 2980.9167098999023,
"epoch": 0.5714285714285714,
"grad_norm": 0.0621151477098465,
"kl": 1.033581793308258e-05,
"lambda_div_used": 0.6023986041545868,
"learning_rate": 1.0001096618257236e-07,
"loss": 0.0314,
"reward": -0.26747864981007297,
"reward_after_mean": -0.26747864981007297,
"reward_after_std": 0.5251844674348831,
"reward_before_mean": -0.0025623496621847153,
"reward_before_std": 0.4696931503713131,
"reward_change_max": 0.0,
"reward_change_mean": -0.26491627655923367,
"reward_change_min": -0.43205036222934723,
"reward_change_std": 0.15884541906416416,
"reward_std": 0.5251844730228186,
"rewards/accuracy_reward": 0.1041666679084301,
"rewards/cosine_scaled_reward": -0.10672901570796967,
"step": 500
},
{
"epoch": 0.5714285714285714,
"step": 500,
"total_flos": 0.0,
"train_loss": 0.005314271912367986,
"train_runtime": 106074.3369,
"train_samples_per_second": 0.226,
"train_steps_per_second": 0.005
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}