OpenRS-GRPO / trainer_state.json
arthurwangheng's picture
Model save
73ec496 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5714285714285714,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 1644.166748046875,
"epoch": 0.001142857142857143,
"grad_norm": 0.20607953518495117,
"kl": 0.0,
"learning_rate": 2e-08,
"loss": 0.0022,
"reward": -0.1127668060362339,
"reward_std": 0.20213491283357143,
"rewards/cosine_scaled_reward": -0.18138340720906854,
"rewards/format_reward": 0.25,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 1656.791748046875,
"epoch": 0.002285714285714286,
"grad_norm": 0.31679714617652144,
"kl": 0.0,
"learning_rate": 4e-08,
"loss": 0.0623,
"reward": -0.05582176148891449,
"reward_std": 0.6275629922747612,
"rewards/cosine_scaled_reward": -0.19457754865288734,
"rewards/format_reward": 0.3333333432674408,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 1606.7500610351562,
"epoch": 0.0034285714285714284,
"grad_norm": 0.2789602147805501,
"kl": 3.388524055480957e-05,
"learning_rate": 6e-08,
"loss": 0.0376,
"reward": -0.2583192214369774,
"reward_std": 0.2636854462325573,
"rewards/cosine_scaled_reward": -0.222909614443779,
"rewards/format_reward": 0.1875000074505806,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 1690.6250610351562,
"epoch": 0.004571428571428572,
"grad_norm": 0.27232938747073254,
"kl": 4.017353057861328e-05,
"learning_rate": 8e-08,
"loss": 0.0159,
"reward": -0.40017254278063774,
"reward_std": 0.17111004143953323,
"rewards/cosine_scaled_reward": -0.3146696165204048,
"rewards/format_reward": 0.2291666716337204,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 1618.3541870117188,
"epoch": 0.005714285714285714,
"grad_norm": 0.2939867481096334,
"kl": 2.8431415557861328e-05,
"learning_rate": 1e-07,
"loss": 0.0576,
"reward": 0.13743871822953224,
"reward_std": 0.7271581590175629,
"rewards/cosine_scaled_reward": -0.12919731251895428,
"rewards/format_reward": 0.3958333395421505,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 1629.4791870117188,
"epoch": 0.006857142857142857,
"grad_norm": 0.248871735331751,
"kl": 3.477931022644043e-05,
"learning_rate": 1.2e-07,
"loss": -0.0029,
"reward": -0.029103130102157593,
"reward_std": 0.5708433166146278,
"rewards/cosine_scaled_reward": -0.1708015874028206,
"rewards/format_reward": 0.3125000037252903,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 1490.6458740234375,
"epoch": 0.008,
"grad_norm": 0.22790937530079167,
"kl": 3.007054328918457e-05,
"learning_rate": 1.4e-07,
"loss": 0.0903,
"reward": 0.12145921215415001,
"reward_std": 0.5416159555315971,
"rewards/cosine_scaled_reward": -0.10593708232045174,
"rewards/format_reward": 0.33333334140479565,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 1683.5000305175781,
"epoch": 0.009142857142857144,
"grad_norm": 0.20752077742039396,
"kl": 4.646182060241699e-05,
"learning_rate": 1.6e-07,
"loss": 0.0277,
"reward": -0.23692437633872032,
"reward_std": 0.4620281979441643,
"rewards/cosine_scaled_reward": -0.2747122012078762,
"rewards/format_reward": 0.31250000558793545,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 1719.2292175292969,
"epoch": 0.010285714285714285,
"grad_norm": 0.2983323511333683,
"kl": 4.1991472244262695e-05,
"learning_rate": 1.8e-07,
"loss": 0.0511,
"reward": -0.31221747025847435,
"reward_std": 0.21310735493898392,
"rewards/cosine_scaled_reward": -0.24985874257981777,
"rewards/format_reward": 0.1875000074505806,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 1477.2083740234375,
"epoch": 0.011428571428571429,
"grad_norm": 0.23645082786220448,
"kl": 3.116577863693237e-05,
"learning_rate": 2e-07,
"loss": 0.0495,
"reward": 0.37697479128837585,
"reward_std": 0.44906593672931194,
"rewards/cosine_scaled_reward": -0.05109592713415623,
"rewards/format_reward": 0.4791666716337204,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 1508.8958587646484,
"epoch": 0.012571428571428572,
"grad_norm": 0.339825377520832,
"kl": 2.8848648071289062e-05,
"learning_rate": 2.1999999999999998e-07,
"loss": 0.0535,
"reward": -0.13005081936717033,
"reward_std": 0.6173823103308678,
"rewards/cosine_scaled_reward": -0.2525254301726818,
"rewards/format_reward": 0.37500000558793545,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 1631.1041870117188,
"epoch": 0.013714285714285714,
"grad_norm": 0.20658630326267732,
"kl": 3.084540367126465e-05,
"learning_rate": 2.4e-07,
"loss": 0.0635,
"reward": 0.03064786270260811,
"reward_std": 0.4376446008682251,
"rewards/cosine_scaled_reward": -0.1513427309691906,
"rewards/format_reward": 0.33333334140479565,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 1422.604232788086,
"epoch": 0.014857142857142857,
"grad_norm": 0.23614097630983502,
"kl": 2.527981996536255e-05,
"learning_rate": 2.6e-07,
"loss": -0.0306,
"reward": 0.4512472003698349,
"reward_std": 0.40983884781599045,
"rewards/cosine_scaled_reward": -0.02437640482094139,
"rewards/format_reward": 0.5,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 1652.3542175292969,
"epoch": 0.016,
"grad_norm": 0.2206408502680819,
"kl": 3.93986701965332e-05,
"learning_rate": 2.8e-07,
"loss": 0.0059,
"reward": -0.2542928569018841,
"reward_std": 0.17246506363153458,
"rewards/cosine_scaled_reward": -0.26256311126053333,
"rewards/format_reward": 0.2708333395421505,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 1679.229248046875,
"epoch": 0.017142857142857144,
"grad_norm": 0.2314183406404789,
"kl": 4.3898820877075195e-05,
"learning_rate": 3e-07,
"loss": 0.0053,
"reward": -0.258657343685627,
"reward_std": 0.23606499657034874,
"rewards/cosine_scaled_reward": -0.1918286692816764,
"rewards/format_reward": 0.125,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 1396.7917175292969,
"epoch": 0.018285714285714287,
"grad_norm": 0.25436941656143647,
"kl": 2.3171305656433105e-05,
"learning_rate": 3.2e-07,
"loss": 0.1053,
"reward": 0.20216324925422668,
"reward_std": 0.4999893419444561,
"rewards/cosine_scaled_reward": -0.13850171491503716,
"rewards/format_reward": 0.4791666716337204,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 1719.416748046875,
"epoch": 0.019428571428571427,
"grad_norm": 0.23312894299622924,
"kl": 4.0084123611450195e-05,
"learning_rate": 3.4000000000000003e-07,
"loss": -0.0007,
"reward": -0.41149570792913437,
"reward_std": 0.13166083209216595,
"rewards/cosine_scaled_reward": -0.26824783720076084,
"rewards/format_reward": 0.125,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 1686.0833740234375,
"epoch": 0.02057142857142857,
"grad_norm": 0.24676487462788851,
"kl": 4.7713518142700195e-05,
"learning_rate": 3.6e-07,
"loss": 0.0814,
"reward": -0.32610235549509525,
"reward_std": 0.23402154073119164,
"rewards/cosine_scaled_reward": -0.25680116564035416,
"rewards/format_reward": 0.18750000186264515,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 1773.6458740234375,
"epoch": 0.021714285714285714,
"grad_norm": 0.21561964662639843,
"kl": 2.1457672119140625e-05,
"learning_rate": 3.7999999999999996e-07,
"loss": 0.0164,
"reward": -0.5961569249629974,
"reward_std": 0.1714775264263153,
"rewards/cosine_scaled_reward": -0.3501618057489395,
"rewards/format_reward": 0.10416666977107525,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 1529.3125610351562,
"epoch": 0.022857142857142857,
"grad_norm": 0.251130340260543,
"kl": 3.24249267578125e-05,
"learning_rate": 4e-07,
"loss": 0.0293,
"reward": -0.048260755836963654,
"reward_std": 0.34835576079785824,
"rewards/cosine_scaled_reward": -0.20121371746063232,
"rewards/format_reward": 0.35416667722165585,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 1494.6250305175781,
"epoch": 0.024,
"grad_norm": 0.3018968569179871,
"kl": 2.6673078536987305e-05,
"learning_rate": 4.1999999999999995e-07,
"loss": 0.0278,
"reward": 0.021329142153263092,
"reward_std": 0.45257429778575897,
"rewards/cosine_scaled_reward": -0.15600210055708885,
"rewards/format_reward": 0.3333333358168602,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 1778.5625610351562,
"epoch": 0.025142857142857144,
"grad_norm": 0.29253387654098556,
"kl": 3.1888484954833984e-05,
"learning_rate": 4.3999999999999997e-07,
"loss": 0.0494,
"reward": -0.5034094974398613,
"reward_std": 0.3080843798816204,
"rewards/cosine_scaled_reward": -0.29337141662836075,
"rewards/format_reward": 0.08333333395421505,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 1762.8958740234375,
"epoch": 0.026285714285714287,
"grad_norm": 0.21053978305274443,
"kl": 4.506111145019531e-05,
"learning_rate": 4.6e-07,
"loss": 0.0144,
"reward": -0.028878159821033478,
"reward_std": 0.5564102046191692,
"rewards/cosine_scaled_reward": -0.10818908177316189,
"rewards/format_reward": 0.1875000074505806,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 1352.5625305175781,
"epoch": 0.027428571428571427,
"grad_norm": 0.20202450012624545,
"kl": 1.6548670828342438e-05,
"learning_rate": 4.8e-07,
"loss": 0.0005,
"reward": 0.6555859744548798,
"reward_std": 0.47822858951985836,
"rewards/cosine_scaled_reward": 0.06737629324197769,
"rewards/format_reward": 0.520833333954215,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 1597.1875610351562,
"epoch": 0.02857142857142857,
"grad_norm": 0.4327230812041704,
"kl": 3.0606985092163086e-05,
"learning_rate": 5e-07,
"loss": 0.0701,
"reward": 0.05484675616025925,
"reward_std": 0.6329891942441463,
"rewards/cosine_scaled_reward": -0.11840994283556938,
"rewards/format_reward": 0.29166667722165585,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 1647.916748046875,
"epoch": 0.029714285714285714,
"grad_norm": 0.21123992049117873,
"kl": 2.2917985916137695e-05,
"learning_rate": 5.2e-07,
"loss": 0.031,
"reward": -0.24321994185447693,
"reward_std": 0.12097731977701187,
"rewards/cosine_scaled_reward": -0.18410997837781906,
"rewards/format_reward": 0.125,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 1638.8958740234375,
"epoch": 0.030857142857142857,
"grad_norm": 0.21745088219923464,
"kl": 3.2067298889160156e-05,
"learning_rate": 5.4e-07,
"loss": -0.0097,
"reward": -0.3657397888600826,
"reward_std": 0.24539830163121223,
"rewards/cosine_scaled_reward": -0.2974532376974821,
"rewards/format_reward": 0.2291666716337204,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 1711.2709045410156,
"epoch": 0.032,
"grad_norm": 0.2552233664551883,
"kl": 2.8468668460845947e-05,
"learning_rate": 5.6e-07,
"loss": 0.0256,
"reward": -0.38710537925362587,
"reward_std": 0.2530311979353428,
"rewards/cosine_scaled_reward": -0.2768860347568989,
"rewards/format_reward": 0.1666666716337204,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 1713.8125610351562,
"epoch": 0.03314285714285714,
"grad_norm": 0.202249350617508,
"kl": 2.86102294921875e-05,
"learning_rate": 5.8e-07,
"loss": 0.0135,
"reward": -0.1931730881333351,
"reward_std": 0.5632064789533615,
"rewards/cosine_scaled_reward": -0.20075321290642023,
"rewards/format_reward": 0.2083333358168602,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 1732.291748046875,
"epoch": 0.03428571428571429,
"grad_norm": 0.23328556356102392,
"kl": 2.165883779525757e-05,
"learning_rate": 6e-07,
"loss": 0.0564,
"reward": -0.3746844604611397,
"reward_std": 0.34011659026145935,
"rewards/cosine_scaled_reward": -0.24984224140644073,
"rewards/format_reward": 0.12500000186264515,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 1445.3125305175781,
"epoch": 0.03542857142857143,
"grad_norm": 0.30643607095324277,
"kl": 3.966689109802246e-05,
"learning_rate": 6.2e-07,
"loss": 0.0923,
"reward": -0.09436208941042423,
"reward_std": 0.3265727870166302,
"rewards/cosine_scaled_reward": -0.21384770551230758,
"rewards/format_reward": 0.33333333395421505,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 1810.7917175292969,
"epoch": 0.036571428571428574,
"grad_norm": 0.20484433233713875,
"kl": 2.8021633625030518e-05,
"learning_rate": 6.4e-07,
"loss": 0.0202,
"reward": -0.5034667998552322,
"reward_std": 0.15860500000417233,
"rewards/cosine_scaled_reward": -0.2621500678360462,
"rewards/format_reward": 0.02083333395421505,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 1750.9584045410156,
"epoch": 0.037714285714285714,
"grad_norm": 0.2027434434467969,
"kl": 2.5600194931030273e-05,
"learning_rate": 6.6e-07,
"loss": -0.0171,
"reward": -0.25296103954315186,
"reward_std": 0.4817052260041237,
"rewards/cosine_scaled_reward": -0.2514805067330599,
"rewards/format_reward": 0.25000000558793545,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 1634.8333740234375,
"epoch": 0.038857142857142854,
"grad_norm": 0.23764579059557195,
"kl": 2.331659197807312e-05,
"learning_rate": 6.800000000000001e-07,
"loss": 0.0003,
"reward": -0.3657361939549446,
"reward_std": 0.2039697989821434,
"rewards/cosine_scaled_reward": -0.25578476674854755,
"rewards/format_reward": 0.14583333395421505,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 1691.1875610351562,
"epoch": 0.04,
"grad_norm": 0.2390715088796384,
"kl": 1.8522143363952637e-05,
"learning_rate": 7e-07,
"loss": 0.0579,
"reward": -0.1916074175387621,
"reward_std": 0.40257398039102554,
"rewards/cosine_scaled_reward": -0.23122038505971432,
"rewards/format_reward": 0.27083334885537624,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 1526.2292175292969,
"epoch": 0.04114285714285714,
"grad_norm": 0.2361249356185026,
"kl": 3.781914710998535e-05,
"learning_rate": 7.2e-07,
"loss": 0.0401,
"reward": 0.35939645767211914,
"reward_std": 0.39011720940470695,
"rewards/cosine_scaled_reward": -0.01821846514940262,
"rewards/format_reward": 0.3958333395421505,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 1645.7708740234375,
"epoch": 0.04228571428571429,
"grad_norm": 0.26864783041008133,
"kl": 3.820657730102539e-05,
"learning_rate": 7.4e-07,
"loss": 0.0746,
"reward": -0.2870800420641899,
"reward_std": 0.46812814101576805,
"rewards/cosine_scaled_reward": -0.25812335684895515,
"rewards/format_reward": 0.2291666679084301,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 1722.5000610351562,
"epoch": 0.04342857142857143,
"grad_norm": 0.27664066975056834,
"kl": 5.131959915161133e-05,
"learning_rate": 7.599999999999999e-07,
"loss": 0.0586,
"reward": -0.15014038234949112,
"reward_std": 0.4126087427139282,
"rewards/cosine_scaled_reward": -0.2000702191144228,
"rewards/format_reward": 0.2500000074505806,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 1678.7083740234375,
"epoch": 0.044571428571428574,
"grad_norm": 0.3003829192682386,
"kl": 4.968792200088501e-05,
"learning_rate": 7.799999999999999e-07,
"loss": 0.097,
"reward": -0.21257384680211544,
"reward_std": 0.48539142310619354,
"rewards/cosine_scaled_reward": -0.2312869280576706,
"rewards/format_reward": 0.2500000111758709,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 1690.8958740234375,
"epoch": 0.045714285714285714,
"grad_norm": 0.20909108511646457,
"kl": 5.0902366638183594e-05,
"learning_rate": 8e-07,
"loss": 0.0436,
"reward": -0.5045258924365044,
"reward_std": 0.2920587807893753,
"rewards/cosine_scaled_reward": -0.3564296290278435,
"rewards/format_reward": 0.2083333358168602,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 1806.3334045410156,
"epoch": 0.046857142857142854,
"grad_norm": 0.2168555566166619,
"kl": 3.137439489364624e-05,
"learning_rate": 8.199999999999999e-07,
"loss": -0.0012,
"reward": 0.04771171510219574,
"reward_std": 0.33250839821994305,
"rewards/cosine_scaled_reward": -0.06989414617419243,
"rewards/format_reward": 0.18750000186264515,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 1300.6250457763672,
"epoch": 0.048,
"grad_norm": 0.40542845209419376,
"kl": 0.000291675329208374,
"learning_rate": 8.399999999999999e-07,
"loss": 0.0768,
"reward": 0.27488730661571026,
"reward_std": 0.45710677094757557,
"rewards/cosine_scaled_reward": -0.1646396858850494,
"rewards/format_reward": 0.6041666716337204,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 1705.8750610351562,
"epoch": 0.04914285714285714,
"grad_norm": 0.21842925663095267,
"kl": 3.538280725479126e-05,
"learning_rate": 8.599999999999999e-07,
"loss": 0.0308,
"reward": -0.2755163535475731,
"reward_std": 0.3637393806129694,
"rewards/cosine_scaled_reward": -0.2210915139876306,
"rewards/format_reward": 0.1666666679084301,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 1665.0625305175781,
"epoch": 0.05028571428571429,
"grad_norm": 0.26271417694787236,
"kl": 0.00046503543853759766,
"learning_rate": 8.799999999999999e-07,
"loss": 0.073,
"reward": -0.12092901021242142,
"reward_std": 0.5556337833404541,
"rewards/cosine_scaled_reward": -0.17504783952608705,
"rewards/format_reward": 0.2291666679084301,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 1733.2084045410156,
"epoch": 0.05142857142857143,
"grad_norm": 0.21285192669515357,
"kl": 5.0537288188934326e-05,
"learning_rate": 9e-07,
"loss": 0.0423,
"reward": -0.05799056589603424,
"reward_std": 0.4342048391699791,
"rewards/cosine_scaled_reward": -0.14357861876487732,
"rewards/format_reward": 0.22916666977107525,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 1640.0834045410156,
"epoch": 0.052571428571428575,
"grad_norm": 0.2622293688477209,
"kl": 0.00013068318367004395,
"learning_rate": 9.2e-07,
"loss": 0.0317,
"reward": -0.005384169518947601,
"reward_std": 0.3068407401442528,
"rewards/cosine_scaled_reward": -0.1068587563931942,
"rewards/format_reward": 0.20833333395421505,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 1498.8333892822266,
"epoch": 0.053714285714285714,
"grad_norm": 0.274608905827555,
"kl": 0.0001885145902633667,
"learning_rate": 9.399999999999999e-07,
"loss": 0.049,
"reward": -0.002073638141155243,
"reward_std": 0.4514222964644432,
"rewards/cosine_scaled_reward": -0.17812015302479267,
"rewards/format_reward": 0.3541666753590107,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 1610.4792175292969,
"epoch": 0.054857142857142854,
"grad_norm": 0.24771930467103717,
"kl": 0.00015616416931152344,
"learning_rate": 9.6e-07,
"loss": 0.0334,
"reward": -0.22091616783291101,
"reward_std": 0.33334225323051214,
"rewards/cosine_scaled_reward": -0.21462474018335342,
"rewards/format_reward": 0.20833334140479565,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 1341.1458740234375,
"epoch": 0.056,
"grad_norm": 0.3710205417665813,
"kl": 0.00029793381690979004,
"learning_rate": 9.8e-07,
"loss": 0.0862,
"reward": 0.40674951672554016,
"reward_std": 0.5115297809243202,
"rewards/cosine_scaled_reward": -0.025791920721530914,
"rewards/format_reward": 0.45833333395421505,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 1335.1667175292969,
"epoch": 0.05714285714285714,
"grad_norm": 0.3034272517231627,
"kl": 0.0005925297737121582,
"learning_rate": 1e-06,
"loss": 0.1036,
"reward": 0.36978277564048767,
"reward_std": 0.4990865057334304,
"rewards/cosine_scaled_reward": -0.033858626149594784,
"rewards/format_reward": 0.43750002048909664,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 1686.8959045410156,
"epoch": 0.05828571428571429,
"grad_norm": 0.3009121706411098,
"kl": 0.00032591819763183594,
"learning_rate": 9.999890338174275e-07,
"loss": 0.0864,
"reward": -0.20582207757979631,
"reward_std": 0.5198994930833578,
"rewards/cosine_scaled_reward": -0.19666103832423687,
"rewards/format_reward": 0.1875000111758709,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 1718.2291870117188,
"epoch": 0.05942857142857143,
"grad_norm": 0.21311754620957382,
"kl": 0.0005127787590026855,
"learning_rate": 9.999561358041868e-07,
"loss": 0.0262,
"reward": -0.39756081253290176,
"reward_std": 0.34694093093276024,
"rewards/cosine_scaled_reward": -0.2716970667243004,
"rewards/format_reward": 0.1458333358168602,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 1611.8334045410156,
"epoch": 0.060571428571428575,
"grad_norm": 0.22683388578373892,
"kl": 0.0005531832575798035,
"learning_rate": 9.999013075636804e-07,
"loss": 0.068,
"reward": -0.13391486555337906,
"reward_std": 0.27848392724990845,
"rewards/cosine_scaled_reward": -0.22320742718875408,
"rewards/format_reward": 0.31250000186264515,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 1442.0834045410156,
"epoch": 0.061714285714285715,
"grad_norm": 0.24769106962876689,
"kl": 0.0002713203430175781,
"learning_rate": 9.998245517681593e-07,
"loss": 0.0911,
"reward": -0.11875106766819954,
"reward_std": 0.1542784534394741,
"rewards/cosine_scaled_reward": -0.2572922073304653,
"rewards/format_reward": 0.3958333432674408,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 1688.4167175292969,
"epoch": 0.06285714285714286,
"grad_norm": 0.22851815885942953,
"kl": 0.0001881718635559082,
"learning_rate": 9.997258721585931e-07,
"loss": 0.0068,
"reward": -0.3640219047665596,
"reward_std": 0.2585913948714733,
"rewards/cosine_scaled_reward": -0.2965943031013012,
"rewards/format_reward": 0.2291666753590107,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 1569.4166870117188,
"epoch": 0.064,
"grad_norm": 0.2466081306910316,
"kl": 0.0021448135375976562,
"learning_rate": 9.996052735444862e-07,
"loss": 0.096,
"reward": -0.4589140391908586,
"reward_std": 0.4320836700499058,
"rewards/cosine_scaled_reward": -0.3440403640270233,
"rewards/format_reward": 0.2291666679084301,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 1629.979248046875,
"epoch": 0.06514285714285714,
"grad_norm": 0.22573731739546327,
"kl": 0.0010238885879516602,
"learning_rate": 9.994627618036452e-07,
"loss": 0.0592,
"reward": -0.3061641752719879,
"reward_std": 0.5002065226435661,
"rewards/cosine_scaled_reward": -0.26766542345285416,
"rewards/format_reward": 0.2291666716337204,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 1660.4792175292969,
"epoch": 0.06628571428571428,
"grad_norm": 0.22190381637143303,
"kl": 0.0011049509048461914,
"learning_rate": 9.992983438818915e-07,
"loss": 0.022,
"reward": -0.32173825055360794,
"reward_std": 0.27725364826619625,
"rewards/cosine_scaled_reward": -0.2754524536430836,
"rewards/format_reward": 0.2291666679084301,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 1690.0417175292969,
"epoch": 0.06742857142857143,
"grad_norm": 0.21914617585966853,
"kl": 0.0010164976119995117,
"learning_rate": 9.991120277927223e-07,
"loss": 0.0444,
"reward": -0.021609768271446228,
"reward_std": 0.3677750062197447,
"rewards/cosine_scaled_reward": -0.135804895311594,
"rewards/format_reward": 0.25000000558793545,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 1581.6875305175781,
"epoch": 0.06857142857142857,
"grad_norm": 0.4016735260144472,
"kl": 0.01423954963684082,
"learning_rate": 9.989038226169207e-07,
"loss": 0.0192,
"reward": 0.11502109467983246,
"reward_std": 0.29630398005247116,
"rewards/cosine_scaled_reward": -0.057072801515460014,
"rewards/format_reward": 0.2291666716337204,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 1475.5833740234375,
"epoch": 0.06971428571428571,
"grad_norm": 0.24285848407581584,
"kl": 0.0003628730773925781,
"learning_rate": 9.98673738502114e-07,
"loss": 0.0731,
"reward": 0.5937481597065926,
"reward_std": 0.6881431620568037,
"rewards/cosine_scaled_reward": 0.046874068677425385,
"rewards/format_reward": 0.5000000149011612,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 1805.7500610351562,
"epoch": 0.07085714285714285,
"grad_norm": 0.19714468440546948,
"kl": 0.0005519390106201172,
"learning_rate": 9.98421786662277e-07,
"loss": 0.0172,
"reward": -0.4636555463075638,
"reward_std": 0.3160466430708766,
"rewards/cosine_scaled_reward": -0.2734944522380829,
"rewards/format_reward": 0.08333333395421505,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 1329.2917175292969,
"epoch": 0.072,
"grad_norm": 0.28510447078335305,
"kl": 0.004929542541503906,
"learning_rate": 9.981479793771866e-07,
"loss": 0.1079,
"reward": 0.30475724674761295,
"reward_std": 0.4675188772380352,
"rewards/cosine_scaled_reward": -0.0976213626563549,
"rewards/format_reward": 0.5000000149011612,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 1636.5208740234375,
"epoch": 0.07314285714285715,
"grad_norm": 0.20815660806735267,
"kl": 0.0003604888916015625,
"learning_rate": 9.97852329991824e-07,
"loss": 0.0625,
"reward": 0.29327625688165426,
"reward_std": 0.5610844530165195,
"rewards/cosine_scaled_reward": -0.03044520819094032,
"rewards/format_reward": 0.354166679084301,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 1559.5000305175781,
"epoch": 0.07428571428571429,
"grad_norm": 0.24172417943995111,
"kl": 0.001363515853881836,
"learning_rate": 9.975348529157229e-07,
"loss": 0.0995,
"reward": 0.1283707581460476,
"reward_std": 0.7667413726449013,
"rewards/cosine_scaled_reward": -0.13373128045350313,
"rewards/format_reward": 0.3958333507180214,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 1729.6667175292969,
"epoch": 0.07542857142857143,
"grad_norm": 0.20090852438136195,
"kl": 0.00067138671875,
"learning_rate": 9.971955636222684e-07,
"loss": 0.0209,
"reward": -0.39017004892230034,
"reward_std": 0.32542612217366695,
"rewards/cosine_scaled_reward": -0.3200850263237953,
"rewards/format_reward": 0.2500000149011612,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 1648.7292175292969,
"epoch": 0.07657142857142857,
"grad_norm": 0.18795555019652113,
"kl": 0.0007681846618652344,
"learning_rate": 9.968344786479415e-07,
"loss": 0.0342,
"reward": -0.1792638599872589,
"reward_std": 0.3578680492937565,
"rewards/cosine_scaled_reward": -0.20421527326107025,
"rewards/format_reward": 0.2291666679084301,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 1388.5625610351562,
"epoch": 0.07771428571428571,
"grad_norm": 0.3904259482407812,
"kl": 0.00202178955078125,
"learning_rate": 9.964516155915151e-07,
"loss": 0.0637,
"reward": 0.16577239707112312,
"reward_std": 0.3421984985470772,
"rewards/cosine_scaled_reward": -0.09419714100658894,
"rewards/format_reward": 0.3541666716337204,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 1507.8333740234375,
"epoch": 0.07885714285714286,
"grad_norm": 0.2361059164440503,
"kl": 0.0008258819580078125,
"learning_rate": 9.960469931131936e-07,
"loss": 0.0613,
"reward": 0.17160904966294765,
"reward_std": 0.38275655917823315,
"rewards/cosine_scaled_reward": -0.10169548355042934,
"rewards/format_reward": 0.37500000558793545,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 1690.3750305175781,
"epoch": 0.08,
"grad_norm": 0.19302606573391104,
"kl": 0.002358675003051758,
"learning_rate": 9.956206309337066e-07,
"loss": 0.105,
"reward": -0.1555338129401207,
"reward_std": 0.37855083122849464,
"rewards/cosine_scaled_reward": -0.20276692137122154,
"rewards/format_reward": 0.25000000186264515,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 1441.729232788086,
"epoch": 0.08114285714285714,
"grad_norm": 0.331702227116139,
"kl": 0.0023870468139648438,
"learning_rate": 9.951725498333448e-07,
"loss": 0.1388,
"reward": -0.2453744667582214,
"reward_std": 0.15839526243507862,
"rewards/cosine_scaled_reward": -0.3101872429251671,
"rewards/format_reward": 0.3750000149011612,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 1497.3959045410156,
"epoch": 0.08228571428571428,
"grad_norm": 0.33894190686830156,
"kl": 0.0017808079719543457,
"learning_rate": 9.947027716509488e-07,
"loss": 0.0553,
"reward": 0.09824148565530777,
"reward_std": 0.1729265321046114,
"rewards/cosine_scaled_reward": -0.08629592880606651,
"rewards/format_reward": 0.2708333358168602,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 1444.7708892822266,
"epoch": 0.08342857142857144,
"grad_norm": 0.9254159035231885,
"kl": 0.039752960205078125,
"learning_rate": 9.942113192828444e-07,
"loss": 0.1025,
"reward": 0.47389062121510506,
"reward_std": 0.7162522077560425,
"rewards/cosine_scaled_reward": -0.05472135776653886,
"rewards/format_reward": 0.5833333507180214,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 1484.7083740234375,
"epoch": 0.08457142857142858,
"grad_norm": 0.2164345231616129,
"kl": 0.0021944046020507812,
"learning_rate": 9.93698216681727e-07,
"loss": 0.0129,
"reward": -0.06718481332063675,
"reward_std": 0.16878989525139332,
"rewards/cosine_scaled_reward": -0.22109240666031837,
"rewards/format_reward": 0.3750000037252903,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 1526.0417175292969,
"epoch": 0.08571428571428572,
"grad_norm": 0.3075410122107456,
"kl": 0.00359344482421875,
"learning_rate": 9.931634888554935e-07,
"loss": 0.0753,
"reward": 0.17093585059046745,
"reward_std": 0.4688509330153465,
"rewards/cosine_scaled_reward": -0.08119874075055122,
"rewards/format_reward": 0.33333334513008595,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 1640.4583740234375,
"epoch": 0.08685714285714285,
"grad_norm": 0.20492660661291412,
"kl": 0.00046312808990478516,
"learning_rate": 9.926071618660237e-07,
"loss": 0.0184,
"reward": 0.029385031666606665,
"reward_std": 0.6126945875585079,
"rewards/cosine_scaled_reward": -0.151974156498909,
"rewards/format_reward": 0.33333333395421505,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 1674.5625610351562,
"epoch": 0.088,
"grad_norm": 0.21980728108796918,
"kl": 0.0009822845458984375,
"learning_rate": 9.9202926282791e-07,
"loss": -0.0002,
"reward": -0.18806731700897217,
"reward_std": 0.12730432488024235,
"rewards/cosine_scaled_reward": -0.15653366968035698,
"rewards/format_reward": 0.125,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 1518.0625610351562,
"epoch": 0.08914285714285715,
"grad_norm": 0.242785552217566,
"kl": 0.0009822845458984375,
"learning_rate": 9.91429819907136e-07,
"loss": 0.0619,
"reward": 0.13657424598932266,
"reward_std": 0.4360465779900551,
"rewards/cosine_scaled_reward": -0.10879619419574738,
"rewards/format_reward": 0.35416666977107525,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 1575.4792175292969,
"epoch": 0.09028571428571429,
"grad_norm": 0.24080955526978698,
"kl": 0.0005426406860351562,
"learning_rate": 9.908088623197048e-07,
"loss": 0.0519,
"reward": 0.016203314065933228,
"reward_std": 0.6479124575853348,
"rewards/cosine_scaled_reward": -0.1585650178603828,
"rewards/format_reward": 0.3333333395421505,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 1733.9167175292969,
"epoch": 0.09142857142857143,
"grad_norm": 0.2186002750502081,
"kl": 0.0005044937133789062,
"learning_rate": 9.901664203302124e-07,
"loss": 0.031,
"reward": -0.5251612327992916,
"reward_std": 0.40141166001558304,
"rewards/cosine_scaled_reward": -0.33549728989601135,
"rewards/format_reward": 0.1458333358168602,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 1728.479248046875,
"epoch": 0.09257142857142857,
"grad_norm": 0.21399417944679958,
"kl": 0.0009112358093261719,
"learning_rate": 9.895025252503755e-07,
"loss": 0.0374,
"reward": -0.19506264757364988,
"reward_std": 0.48094464652240276,
"rewards/cosine_scaled_reward": -0.1912813438102603,
"rewards/format_reward": 0.18750000558793545,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 1601.9792175292969,
"epoch": 0.09371428571428571,
"grad_norm": 0.2450961734236274,
"kl": 0.0009531974792480469,
"learning_rate": 9.888172094375033e-07,
"loss": 0.077,
"reward": -0.1917775571346283,
"reward_std": 0.5255400985479355,
"rewards/cosine_scaled_reward": -0.25213877484202385,
"rewards/format_reward": 0.31250001303851604,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 1748.1875610351562,
"epoch": 0.09485714285714286,
"grad_norm": 0.22448680749018862,
"kl": 0.0004420280456542969,
"learning_rate": 9.881105062929221e-07,
"loss": 0.0159,
"reward": -0.43924427404999733,
"reward_std": 0.2609596960246563,
"rewards/cosine_scaled_reward": -0.27170546911656857,
"rewards/format_reward": 0.10416666977107525,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 1515.7708435058594,
"epoch": 0.096,
"grad_norm": 0.2231038243696207,
"kl": 0.0006551742553710938,
"learning_rate": 9.873824502603459e-07,
"loss": 0.0246,
"reward": 0.36620646342635155,
"reward_std": 0.884237602353096,
"rewards/cosine_scaled_reward": -0.06689677853137255,
"rewards/format_reward": 0.5000000074505806,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 1701.2083740234375,
"epoch": 0.09714285714285714,
"grad_norm": 0.20906161676384463,
"kl": 0.000804901123046875,
"learning_rate": 9.866330768241983e-07,
"loss": 0.0555,
"reward": -0.39954638853669167,
"reward_std": 0.31576116755604744,
"rewards/cosine_scaled_reward": -0.2726898640394211,
"rewards/format_reward": 0.14583333395421505,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 1610.9792175292969,
"epoch": 0.09828571428571428,
"grad_norm": 0.22100681278056383,
"kl": 0.0009822845458984375,
"learning_rate": 9.85862422507884e-07,
"loss": 0.0444,
"reward": -0.24343110900372267,
"reward_std": 0.2885846998542547,
"rewards/cosine_scaled_reward": -0.30921556800603867,
"rewards/format_reward": 0.375,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 1695.354248046875,
"epoch": 0.09942857142857142,
"grad_norm": 0.24683069440334848,
"kl": 0.0029506683349609375,
"learning_rate": 9.850705248720068e-07,
"loss": 0.0377,
"reward": -0.09222975745797157,
"reward_std": 0.24668438732624054,
"rewards/cosine_scaled_reward": -0.1502815391868353,
"rewards/format_reward": 0.2083333432674408,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 1594.9167175292969,
"epoch": 0.10057142857142858,
"grad_norm": 0.27215086328931853,
"kl": 0.0016989707946777344,
"learning_rate": 9.8425742251254e-07,
"loss": 0.1075,
"reward": 0.18186672404408455,
"reward_std": 0.9013341814279556,
"rewards/cosine_scaled_reward": -0.07573332265019417,
"rewards/format_reward": 0.3333333432674408,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 1738.7292175292969,
"epoch": 0.10171428571428572,
"grad_norm": 0.1946900134085172,
"kl": 0.000820159912109375,
"learning_rate": 9.83423155058946e-07,
"loss": 0.0331,
"reward": -0.28752805292606354,
"reward_std": 0.4243736080825329,
"rewards/cosine_scaled_reward": -0.22709737345576286,
"rewards/format_reward": 0.16666667722165585,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 1572.2916870117188,
"epoch": 0.10285714285714286,
"grad_norm": 0.20694868118264276,
"kl": 0.0007328987121582031,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0753,
"reward": -0.08595774043351412,
"reward_std": 0.5348180644214153,
"rewards/cosine_scaled_reward": -0.18881220323964953,
"rewards/format_reward": 0.2916666679084301,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 1601.5625610351562,
"epoch": 0.104,
"grad_norm": 0.20840771038907893,
"kl": 0.0007948875427246094,
"learning_rate": 9.816912885430258e-07,
"loss": 0.0808,
"reward": -0.015035435557365417,
"reward_std": 0.14022575318813324,
"rewards/cosine_scaled_reward": -0.1429343856871128,
"rewards/format_reward": 0.2708333432674408,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 1498.2292175292969,
"epoch": 0.10514285714285715,
"grad_norm": 0.20771988001872319,
"kl": 0.0009174346923828125,
"learning_rate": 9.807937738894303e-07,
"loss": 0.0994,
"reward": 0.07728531863540411,
"reward_std": 0.508693166077137,
"rewards/cosine_scaled_reward": -0.1384406816214323,
"rewards/format_reward": 0.35416666977107525,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 1347.8125305175781,
"epoch": 0.10628571428571429,
"grad_norm": 0.27527082284418775,
"kl": 0.0021848678588867188,
"learning_rate": 9.798752629550546e-07,
"loss": 0.0296,
"reward": 0.30088429898023605,
"reward_std": 0.5643313899636269,
"rewards/cosine_scaled_reward": -0.10997452400624752,
"rewards/format_reward": 0.5208333432674408,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 1733.2500610351562,
"epoch": 0.10742857142857143,
"grad_norm": 0.23935442867120157,
"kl": 0.0012607574462890625,
"learning_rate": 9.78935800506826e-07,
"loss": 0.021,
"reward": -0.34041892923414707,
"reward_std": 0.2469240017235279,
"rewards/cosine_scaled_reward": -0.26395946741104126,
"rewards/format_reward": 0.18750000186264515,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 1738.5625610351562,
"epoch": 0.10857142857142857,
"grad_norm": 0.21273217079983556,
"kl": 0.0006814002990722656,
"learning_rate": 9.779754323328192e-07,
"loss": -0.0093,
"reward": -0.5389137789607048,
"reward_std": 0.17841140553355217,
"rewards/cosine_scaled_reward": -0.3423735648393631,
"rewards/format_reward": 0.14583333395421505,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 1433.9375610351562,
"epoch": 0.10971428571428571,
"grad_norm": 0.30691056711732384,
"kl": 0.002574920654296875,
"learning_rate": 9.769942052400235e-07,
"loss": 0.137,
"reward": 0.296867486089468,
"reward_std": 0.3943296894431114,
"rewards/cosine_scaled_reward": -0.04948292672634125,
"rewards/format_reward": 0.3958333432674408,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 1567.2500610351562,
"epoch": 0.11085714285714286,
"grad_norm": 0.25051085956589897,
"kl": 0.0013968944549560547,
"learning_rate": 9.759921670520634e-07,
"loss": 0.0267,
"reward": -0.15386457741260529,
"reward_std": 0.37108149379491806,
"rewards/cosine_scaled_reward": -0.21234895661473274,
"rewards/format_reward": 0.2708333432674408,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 1406.0208740234375,
"epoch": 0.112,
"grad_norm": 0.366560785041491,
"kl": 0.0012578964233398438,
"learning_rate": 9.749693666068663e-07,
"loss": 0.099,
"reward": 0.3372333124279976,
"reward_std": 0.3852754198014736,
"rewards/cosine_scaled_reward": -0.12305000983178616,
"rewards/format_reward": 0.5833333507180214,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 1598.7917175292969,
"epoch": 0.11314285714285714,
"grad_norm": 0.2584279138871096,
"kl": 0.0010881423950195312,
"learning_rate": 9.739258537542835e-07,
"loss": 0.0536,
"reward": 0.1023973822593689,
"reward_std": 0.4502338841557503,
"rewards/cosine_scaled_reward": -0.1258846465498209,
"rewards/format_reward": 0.3541666828095913,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 1557.479248046875,
"epoch": 0.11428571428571428,
"grad_norm": 0.23713752727518134,
"kl": 0.0009851455688476562,
"learning_rate": 9.728616793536587e-07,
"loss": 0.0694,
"reward": -0.15063253417611122,
"reward_std": 0.3854830376803875,
"rewards/cosine_scaled_reward": -0.23156626150012016,
"rewards/format_reward": 0.3125000111758709,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 1387.4583435058594,
"epoch": 0.11542857142857142,
"grad_norm": 0.32157411791816565,
"kl": 0.001094818115234375,
"learning_rate": 9.717768952713511e-07,
"loss": 0.1116,
"reward": 0.07011325657367706,
"reward_std": 0.3243808038532734,
"rewards/cosine_scaled_reward": -0.19411004893481731,
"rewards/format_reward": 0.4583333395421505,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 1449.3750610351562,
"epoch": 0.11657142857142858,
"grad_norm": 0.2168599934302549,
"kl": 0.0015411376953125,
"learning_rate": 9.706715543782064e-07,
"loss": 0.0577,
"reward": -0.21096567437052727,
"reward_std": 0.29599858447909355,
"rewards/cosine_scaled_reward": -0.3138161562383175,
"rewards/format_reward": 0.4166666865348816,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 1715.166748046875,
"epoch": 0.11771428571428572,
"grad_norm": 0.21920178674297372,
"kl": 0.0015869140625,
"learning_rate": 9.695457105469804e-07,
"loss": 0.0667,
"reward": -0.18699942529201508,
"reward_std": 0.5092732682824135,
"rewards/cosine_scaled_reward": -0.22891639173030853,
"rewards/format_reward": 0.2708333395421505,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 1304.4583435058594,
"epoch": 0.11885714285714286,
"grad_norm": 0.22942484314958453,
"kl": 0.0013804435729980469,
"learning_rate": 9.683994186497132e-07,
"loss": 0.0839,
"reward": 0.5173723250627518,
"reward_std": 0.5176322646439075,
"rewards/cosine_scaled_reward": -0.001730518415570259,
"rewards/format_reward": 0.5208333358168602,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 1364.8333740234375,
"epoch": 0.12,
"grad_norm": 0.25403433256650454,
"kl": 0.0016727447509765625,
"learning_rate": 9.672327345550543e-07,
"loss": 0.1156,
"reward": 0.28816052433103323,
"reward_std": 0.240465197712183,
"rewards/cosine_scaled_reward": -0.1267530769109726,
"rewards/format_reward": 0.541666679084301,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 1570.6667175292969,
"epoch": 0.12114285714285715,
"grad_norm": 0.2462172191203138,
"kl": 0.0020122528076171875,
"learning_rate": 9.66045715125541e-07,
"loss": 0.0866,
"reward": 0.34020555624738336,
"reward_std": 0.7328735627233982,
"rewards/cosine_scaled_reward": -0.038230573292821646,
"rewards/format_reward": 0.41666666977107525,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 1243.4583740234375,
"epoch": 0.12228571428571429,
"grad_norm": 0.22392855280151888,
"kl": 0.001399993896484375,
"learning_rate": 9.648384182148252e-07,
"loss": 0.0861,
"reward": 0.19801579043269157,
"reward_std": 0.4772573560476303,
"rewards/cosine_scaled_reward": -0.18224211037158966,
"rewards/format_reward": 0.5625000149011612,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 1376.5625610351562,
"epoch": 0.12342857142857143,
"grad_norm": 0.2328882803373465,
"kl": 0.0032482147216796875,
"learning_rate": 9.636109026648554e-07,
"loss": 0.0636,
"reward": 0.6495321169495583,
"reward_std": 0.5899618566036224,
"rewards/cosine_scaled_reward": 0.06434935945435427,
"rewards/format_reward": 0.5208333488553762,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 1368.0625305175781,
"epoch": 0.12457142857142857,
"grad_norm": 0.3696050391986309,
"kl": 0.0028667449951171875,
"learning_rate": 9.623632283030077e-07,
"loss": 0.1246,
"reward": -0.031360091641545296,
"reward_std": 0.4002140313386917,
"rewards/cosine_scaled_reward": -0.2656800393015146,
"rewards/format_reward": 0.5,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 1444.6666870117188,
"epoch": 0.12571428571428572,
"grad_norm": 0.35213532577859125,
"kl": 0.0029430389404296875,
"learning_rate": 9.610954559391704e-07,
"loss": 0.1339,
"reward": 0.6942434869706631,
"reward_std": 0.9198908805847168,
"rewards/cosine_scaled_reward": 0.06587174534797668,
"rewards/format_reward": 0.5625000149011612,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 1072.7708587646484,
"epoch": 0.12685714285714286,
"grad_norm": 0.2985726423715741,
"kl": 0.001979827880859375,
"learning_rate": 9.598076473627796e-07,
"loss": 0.0476,
"reward": 0.7408694333862513,
"reward_std": 0.7333548963069916,
"rewards/cosine_scaled_reward": -0.004565277136862278,
"rewards/format_reward": 0.7500000149011612,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 1633.4167175292969,
"epoch": 0.128,
"grad_norm": 0.22471101395696397,
"kl": 0.00258636474609375,
"learning_rate": 9.58499865339809e-07,
"loss": 0.0346,
"reward": -0.05079384706914425,
"reward_std": 0.4366183038800955,
"rewards/cosine_scaled_reward": -0.2337302602827549,
"rewards/format_reward": 0.4166666865348816,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 1319.7291870117188,
"epoch": 0.12914285714285714,
"grad_norm": 0.27063696127291986,
"kl": 0.0033721923828125,
"learning_rate": 9.571721736097088e-07,
"loss": 0.0833,
"reward": 0.6321319434791803,
"reward_std": 0.5336715504527092,
"rewards/cosine_scaled_reward": -0.006850697100162506,
"rewards/format_reward": 0.6458333507180214,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 1052.3958892822266,
"epoch": 0.13028571428571428,
"grad_norm": 0.250125198289797,
"kl": 0.0016460418701171875,
"learning_rate": 9.55824636882301e-07,
"loss": 0.0768,
"reward": 0.653087726328522,
"reward_std": 0.35864404030144215,
"rewards/cosine_scaled_reward": -0.017206139862537384,
"rewards/format_reward": 0.6875000149011612,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 1440.2083740234375,
"epoch": 0.13142857142857142,
"grad_norm": 0.29266585256345196,
"kl": 0.0030155181884765625,
"learning_rate": 9.54457320834625e-07,
"loss": 0.0755,
"reward": 0.21958831325173378,
"reward_std": 0.704796127974987,
"rewards/cosine_scaled_reward": -0.16103917988948524,
"rewards/format_reward": 0.5416666865348816,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 1472.5416870117188,
"epoch": 0.13257142857142856,
"grad_norm": 0.26433038131357134,
"kl": 0.003147125244140625,
"learning_rate": 9.530702921077358e-07,
"loss": 0.073,
"reward": 0.018861573189496994,
"reward_std": 0.3587416708469391,
"rewards/cosine_scaled_reward": -0.18848587945103645,
"rewards/format_reward": 0.39583334140479565,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 1545.9375305175781,
"epoch": 0.1337142857142857,
"grad_norm": 0.21836493727001577,
"kl": 0.002864837646484375,
"learning_rate": 9.516636183034564e-07,
"loss": 0.1366,
"reward": -0.32600877061486244,
"reward_std": 0.43822694569826126,
"rewards/cosine_scaled_reward": -0.35050439089536667,
"rewards/format_reward": 0.3750000074505806,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 1334.1250305175781,
"epoch": 0.13485714285714287,
"grad_norm": 0.24394321780710398,
"kl": 0.0026493072509765625,
"learning_rate": 9.502373679810839e-07,
"loss": 0.035,
"reward": 0.457018606364727,
"reward_std": 0.5285698734223843,
"rewards/cosine_scaled_reward": -0.09440736100077629,
"rewards/format_reward": 0.645833358168602,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 1220.3333740234375,
"epoch": 0.136,
"grad_norm": 0.28272459137828676,
"kl": 0.0042877197265625,
"learning_rate": 9.487916106540465e-07,
"loss": 0.0804,
"reward": 0.3442453145980835,
"reward_std": 0.564174473285675,
"rewards/cosine_scaled_reward": -0.12996070086956024,
"rewards/format_reward": 0.6041666716337204,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 1528.6042175292969,
"epoch": 0.13714285714285715,
"grad_norm": 0.2668307726658885,
"kl": 0.00232696533203125,
"learning_rate": 9.473264167865171e-07,
"loss": 0.1032,
"reward": -0.03986197151243687,
"reward_std": 0.37811761628836393,
"rewards/cosine_scaled_reward": -0.2282643192447722,
"rewards/format_reward": 0.4166666716337204,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 1584.8125610351562,
"epoch": 0.1382857142857143,
"grad_norm": 0.22786468100552407,
"kl": 0.002208709716796875,
"learning_rate": 9.458418577899774e-07,
"loss": 0.0046,
"reward": 0.16309459879994392,
"reward_std": 0.2453223168849945,
"rewards/cosine_scaled_reward": -0.1372026912868023,
"rewards/format_reward": 0.4375000074505806,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 1462.6875610351562,
"epoch": 0.13942857142857143,
"grad_norm": 0.28816738889821486,
"kl": 0.00514984130859375,
"learning_rate": 9.443380060197385e-07,
"loss": 0.0974,
"reward": -0.12114270869642496,
"reward_std": 0.2534109205007553,
"rewards/cosine_scaled_reward": -0.2689046934247017,
"rewards/format_reward": 0.41666666977107525,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 1375.7708740234375,
"epoch": 0.14057142857142857,
"grad_norm": 0.32101258824146217,
"kl": 0.0041351318359375,
"learning_rate": 9.428149347714143e-07,
"loss": 0.1284,
"reward": 0.18988706171512604,
"reward_std": 0.8535008877515793,
"rewards/cosine_scaled_reward": -0.17588980495929718,
"rewards/format_reward": 0.541666679084301,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 1374.7292175292969,
"epoch": 0.1417142857142857,
"grad_norm": 0.2425349865258595,
"kl": 0.00324249267578125,
"learning_rate": 9.412727182773486e-07,
"loss": 0.0382,
"reward": 0.07038946449756622,
"reward_std": 0.49846766516566277,
"rewards/cosine_scaled_reward": -0.15230527985841036,
"rewards/format_reward": 0.375,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 1664.8541870117188,
"epoch": 0.14285714285714285,
"grad_norm": 0.2457250240943947,
"kl": 0.0025730133056640625,
"learning_rate": 9.397114317029974e-07,
"loss": 0.0291,
"reward": 0.004289238480851054,
"reward_std": 0.32331261597573757,
"rewards/cosine_scaled_reward": -0.15410537272691727,
"rewards/format_reward": 0.31250000186264515,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 1422.3750305175781,
"epoch": 0.144,
"grad_norm": 0.32285843347583837,
"kl": 0.005126953125,
"learning_rate": 9.381311511432658e-07,
"loss": 0.0961,
"reward": 0.19516459852457047,
"reward_std": 0.6147220581769943,
"rewards/cosine_scaled_reward": -0.162834367249161,
"rewards/format_reward": 0.5208333507180214,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 1370.7708435058594,
"epoch": 0.14514285714285713,
"grad_norm": 0.24341515642410516,
"kl": 0.0030078887939453125,
"learning_rate": 9.36531953618799e-07,
"loss": 0.0726,
"reward": -0.08839717879891396,
"reward_std": 0.4017263073474169,
"rewards/cosine_scaled_reward": -0.2941986061632633,
"rewards/format_reward": 0.5000000149011612,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 1219.9167175292969,
"epoch": 0.1462857142857143,
"grad_norm": 0.2623858416818109,
"kl": 0.004070281982421875,
"learning_rate": 9.34913917072228e-07,
"loss": 0.0537,
"reward": 0.43044765666127205,
"reward_std": 0.49690980464220047,
"rewards/cosine_scaled_reward": -0.15977618098258972,
"rewards/format_reward": 0.7500000149011612,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 1109.1875305175781,
"epoch": 0.14742857142857144,
"grad_norm": 0.2829401049059584,
"kl": 0.00757598876953125,
"learning_rate": 9.332771203643714e-07,
"loss": 0.0692,
"reward": 0.6423492059111595,
"reward_std": 0.4438105970621109,
"rewards/cosine_scaled_reward": -0.03299206681549549,
"rewards/format_reward": 0.7083333358168602,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 1495.8333740234375,
"epoch": 0.14857142857142858,
"grad_norm": 0.23104014201895975,
"kl": 0.00299835205078125,
"learning_rate": 9.316216432703916e-07,
"loss": 0.0064,
"reward": -0.09923176001757383,
"reward_std": 0.43960002437233925,
"rewards/cosine_scaled_reward": -0.29961589351296425,
"rewards/format_reward": 0.5000000149011612,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 1543.2500305175781,
"epoch": 0.14971428571428572,
"grad_norm": 0.22132261730116032,
"kl": 0.0033931732177734375,
"learning_rate": 9.299475664759068e-07,
"loss": 0.1051,
"reward": -0.012558471411466599,
"reward_std": 0.5053001046180725,
"rewards/cosine_scaled_reward": -0.24586258456110954,
"rewards/format_reward": 0.47916667722165585,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 1477.0625,
"epoch": 0.15085714285714286,
"grad_norm": 0.2442588816427236,
"kl": 0.004528045654296875,
"learning_rate": 9.282549715730579e-07,
"loss": 0.0768,
"reward": -0.11025669425725937,
"reward_std": 0.18197684548795223,
"rewards/cosine_scaled_reward": -0.284295029938221,
"rewards/format_reward": 0.4583333432674408,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 1563.3958740234375,
"epoch": 0.152,
"grad_norm": 0.21023108591248665,
"kl": 0.00415802001953125,
"learning_rate": 9.265439410565328e-07,
"loss": 0.0672,
"reward": 0.13176406361162663,
"reward_std": 0.5022407323122025,
"rewards/cosine_scaled_reward": -0.18411797285079956,
"rewards/format_reward": 0.5000000149011612,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 1049.1042022705078,
"epoch": 0.15314285714285714,
"grad_norm": 0.3838039161390532,
"kl": 0.00562286376953125,
"learning_rate": 9.248145583195447e-07,
"loss": 0.1973,
"reward": 0.4749515192816034,
"reward_std": 0.3580738380551338,
"rewards/cosine_scaled_reward": -0.15835759788751602,
"rewards/format_reward": 0.7916666865348816,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 1351.5416870117188,
"epoch": 0.15428571428571428,
"grad_norm": 0.34500799880157473,
"kl": 0.00400543212890625,
"learning_rate": 9.230669076497687e-07,
"loss": 0.143,
"reward": 0.2647483544424176,
"reward_std": 0.5427017770707607,
"rewards/cosine_scaled_reward": -0.11762583442032337,
"rewards/format_reward": 0.5000000204890966,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 1168.0625305175781,
"epoch": 0.15542857142857142,
"grad_norm": 0.31218899888892226,
"kl": 0.004955291748046875,
"learning_rate": 9.213010742252327e-07,
"loss": 0.0562,
"reward": 0.3584494572132826,
"reward_std": 0.5529016815125942,
"rewards/cosine_scaled_reward": -0.17494194395840168,
"rewards/format_reward": 0.7083333432674408,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 1282.0416870117188,
"epoch": 0.15657142857142858,
"grad_norm": 0.2721613126225875,
"kl": 0.007869720458984375,
"learning_rate": 9.195171441101668e-07,
"loss": 0.1358,
"reward": 0.2924184873700142,
"reward_std": 0.5777250528335571,
"rewards/cosine_scaled_reward": -0.16629073955118656,
"rewards/format_reward": 0.6250000055879354,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 1041.8958740234375,
"epoch": 0.15771428571428572,
"grad_norm": 0.30890354701331296,
"kl": 0.00521087646484375,
"learning_rate": 9.177152042508077e-07,
"loss": 0.0338,
"reward": 0.860385000705719,
"reward_std": 0.8024220168590546,
"rewards/cosine_scaled_reward": 0.023942476138472557,
"rewards/format_reward": 0.8125000149011612,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 1192.8125457763672,
"epoch": 0.15885714285714286,
"grad_norm": 0.2622844914783918,
"kl": 0.00412750244140625,
"learning_rate": 9.158953424711624e-07,
"loss": 0.0825,
"reward": 0.5425689108669758,
"reward_std": 0.5253265127539635,
"rewards/cosine_scaled_reward": -0.12454888969659805,
"rewards/format_reward": 0.7916666865348816,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 1283.4375305175781,
"epoch": 0.16,
"grad_norm": 0.24897560413424463,
"kl": 0.004375457763671875,
"learning_rate": 9.140576474687263e-07,
"loss": 0.1075,
"reward": 0.3927510902285576,
"reward_std": 0.43108681961894035,
"rewards/cosine_scaled_reward": -0.1577911265194416,
"rewards/format_reward": 0.7083333507180214,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 1262.7916717529297,
"epoch": 0.16114285714285714,
"grad_norm": 0.3772239136734691,
"kl": 0.005344390869140625,
"learning_rate": 9.122022088101613e-07,
"loss": 0.1713,
"reward": 0.37745123356580734,
"reward_std": 0.5623941943049431,
"rewards/cosine_scaled_reward": -0.13419108092784882,
"rewards/format_reward": 0.6458333432674408,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 1246.6042022705078,
"epoch": 0.16228571428571428,
"grad_norm": 0.28965855619789826,
"kl": 0.0045166015625,
"learning_rate": 9.103291169269299e-07,
"loss": 0.0725,
"reward": 0.5083264335989952,
"reward_std": 0.5853047892451286,
"rewards/cosine_scaled_reward": -0.047920111566782,
"rewards/format_reward": 0.6041666828095913,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 1448.9375610351562,
"epoch": 0.16342857142857142,
"grad_norm": 0.2549900151123108,
"kl": 0.006290435791015625,
"learning_rate": 9.084384631108882e-07,
"loss": 0.1142,
"reward": 0.13985165720805526,
"reward_std": 0.2659877985715866,
"rewards/cosine_scaled_reward": -0.20090750604867935,
"rewards/format_reward": 0.5416666865348816,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 1203.2083740234375,
"epoch": 0.16457142857142856,
"grad_norm": 0.2224971436424363,
"kl": 0.005550384521484375,
"learning_rate": 9.065303395098358e-07,
"loss": 0.085,
"reward": 0.5334329381585121,
"reward_std": 0.5584629252552986,
"rewards/cosine_scaled_reward": -0.10828354395925999,
"rewards/format_reward": 0.75,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 1086.9792175292969,
"epoch": 0.1657142857142857,
"grad_norm": 0.3813865927902241,
"kl": 0.0063629150390625,
"learning_rate": 9.046048391230247e-07,
"loss": 0.1879,
"reward": 0.2875216994434595,
"reward_std": 0.5303685888648033,
"rewards/cosine_scaled_reward": -0.23123916238546371,
"rewards/format_reward": 0.7500000149011612,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 1285.5625610351562,
"epoch": 0.16685714285714287,
"grad_norm": 0.2972743546494519,
"kl": 0.0059814453125,
"learning_rate": 9.026620557966279e-07,
"loss": 0.0594,
"reward": 0.2565866466611624,
"reward_std": 0.46598899737000465,
"rewards/cosine_scaled_reward": -0.25712333619594574,
"rewards/format_reward": 0.770833358168602,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 896.6250305175781,
"epoch": 0.168,
"grad_norm": 0.32647401434979056,
"kl": 0.006877899169921875,
"learning_rate": 9.007020842191634e-07,
"loss": -0.0011,
"reward": 1.0985181145370007,
"reward_std": 0.5338096916675568,
"rewards/cosine_scaled_reward": 0.05967570189386606,
"rewards/format_reward": 0.9791666716337204,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 1160.9583435058594,
"epoch": 0.16914285714285715,
"grad_norm": 0.2816274273158885,
"kl": 0.00585174560546875,
"learning_rate": 8.987250199168808e-07,
"loss": 0.0442,
"reward": 0.18387611024081707,
"reward_std": 0.2959946282207966,
"rewards/cosine_scaled_reward": -0.3143119588494301,
"rewards/format_reward": 0.8125000149011612,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 1223.9791870117188,
"epoch": 0.1702857142857143,
"grad_norm": 0.2823488259457116,
"kl": 0.00612640380859375,
"learning_rate": 8.967309592491052e-07,
"loss": 0.0654,
"reward": 0.47756416723132133,
"reward_std": 0.7413289695978165,
"rewards/cosine_scaled_reward": -0.11538459919393063,
"rewards/format_reward": 0.7083333432674408,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 986.6042022705078,
"epoch": 0.17142857142857143,
"grad_norm": 0.318064277562745,
"kl": 0.0080413818359375,
"learning_rate": 8.9471999940354e-07,
"loss": 0.1332,
"reward": 0.401881605386734,
"reward_std": 0.6674076840281487,
"rewards/cosine_scaled_reward": -0.20530920289456844,
"rewards/format_reward": 0.8125000298023224,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 1111.4375610351562,
"epoch": 0.17257142857142857,
"grad_norm": 0.23750964874516883,
"kl": 0.005405426025390625,
"learning_rate": 8.926922383915315e-07,
"loss": 0.0547,
"reward": 0.42157261446118355,
"reward_std": 0.2637167125940323,
"rewards/cosine_scaled_reward": -0.14338038116693497,
"rewards/format_reward": 0.7083333432674408,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 1390.2292175292969,
"epoch": 0.1737142857142857,
"grad_norm": 0.3108688575018839,
"kl": 0.008697509765625,
"learning_rate": 8.906477750432903e-07,
"loss": 0.1077,
"reward": 0.11867762915790081,
"reward_std": 0.5801703371107578,
"rewards/cosine_scaled_reward": -0.2739945203065872,
"rewards/format_reward": 0.6666666716337204,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 1189.4792022705078,
"epoch": 0.17485714285714285,
"grad_norm": 0.22859697466435477,
"kl": 0.006011962890625,
"learning_rate": 8.88586709003076e-07,
"loss": 0.0402,
"reward": 0.46854234486818314,
"reward_std": 0.5257667489349842,
"rewards/cosine_scaled_reward": -0.07822884852066636,
"rewards/format_reward": 0.6250000149011612,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 1227.7292175292969,
"epoch": 0.176,
"grad_norm": 0.23458511838935112,
"kl": 0.0063323974609375,
"learning_rate": 8.865091407243394e-07,
"loss": 0.129,
"reward": 0.7308447554241866,
"reward_std": 0.4724605940282345,
"rewards/cosine_scaled_reward": 0.011255700141191483,
"rewards/format_reward": 0.7083333432674408,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 1320.6666870117188,
"epoch": 0.17714285714285713,
"grad_norm": 0.29059316598505575,
"kl": 0.007198333740234375,
"learning_rate": 8.844151714648274e-07,
"loss": 0.1327,
"reward": -0.1417454145848751,
"reward_std": 0.3702365458011627,
"rewards/cosine_scaled_reward": -0.3521227166056633,
"rewards/format_reward": 0.5625000055879354,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 1116.000015258789,
"epoch": 0.1782857142857143,
"grad_norm": 0.37926874198201693,
"kl": 0.00821685791015625,
"learning_rate": 8.823049032816478e-07,
"loss": 0.2189,
"reward": 0.15536441165022552,
"reward_std": 0.2769140414893627,
"rewards/cosine_scaled_reward": -0.2869011387228966,
"rewards/format_reward": 0.729166679084301,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 1049.5000305175781,
"epoch": 0.17942857142857144,
"grad_norm": 0.3728612044799926,
"kl": 0.02156829833984375,
"learning_rate": 8.801784390262943e-07,
"loss": 0.0389,
"reward": 0.7612650550436229,
"reward_std": 0.31401624344289303,
"rewards/cosine_scaled_reward": 0.005632489919662476,
"rewards/format_reward": 0.75,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 1386.0833740234375,
"epoch": 0.18057142857142858,
"grad_norm": 0.3889317879381384,
"kl": 0.00850677490234375,
"learning_rate": 8.780358823396352e-07,
"loss": 0.1308,
"reward": 0.06261628679931164,
"reward_std": 0.3530626520514488,
"rewards/cosine_scaled_reward": -0.3124418593943119,
"rewards/format_reward": 0.6875000298023224,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 1129.7708740234375,
"epoch": 0.18171428571428572,
"grad_norm": 0.3220977926530576,
"kl": 0.00748443603515625,
"learning_rate": 8.758773376468604e-07,
"loss": 0.1262,
"reward": 0.6195714063942432,
"reward_std": 0.6696993261575699,
"rewards/cosine_scaled_reward": -0.08604763355106115,
"rewards/format_reward": 0.7916666865348816,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 1160.4167175292969,
"epoch": 0.18285714285714286,
"grad_norm": 0.2845576646765754,
"kl": 0.0069122314453125,
"learning_rate": 8.737029101523929e-07,
"loss": 0.1064,
"reward": 0.6454856535419822,
"reward_std": 0.8377318382263184,
"rewards/cosine_scaled_reward": -0.08350718393921852,
"rewards/format_reward": 0.8125000149011612,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 1429.4792175292969,
"epoch": 0.184,
"grad_norm": 0.25219633802451885,
"kl": 0.00858306884765625,
"learning_rate": 8.715127058347614e-07,
"loss": 0.0965,
"reward": 0.009432412683963776,
"reward_std": 0.3042390923947096,
"rewards/cosine_scaled_reward": -0.27653381787240505,
"rewards/format_reward": 0.5625000149011612,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 1226.7917175292969,
"epoch": 0.18514285714285714,
"grad_norm": 0.26347106975524837,
"kl": 0.0080108642578125,
"learning_rate": 8.693068314414344e-07,
"loss": 0.077,
"reward": 0.24512136541306973,
"reward_std": 0.43705228716135025,
"rewards/cosine_scaled_reward": -0.29410600662231445,
"rewards/format_reward": 0.8333333432674408,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 926.1250305175781,
"epoch": 0.18628571428571428,
"grad_norm": 0.41739039022115654,
"kl": 0.0112457275390625,
"learning_rate": 8.670853944836176e-07,
"loss": 0.1827,
"reward": 0.7628292813897133,
"reward_std": 0.8151352852582932,
"rewards/cosine_scaled_reward": -0.04566871002316475,
"rewards/format_reward": 0.8541666716337204,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 946.125,
"epoch": 0.18742857142857142,
"grad_norm": 0.36967841595429546,
"kl": 0.01031494140625,
"learning_rate": 8.648485032310144e-07,
"loss": 0.1834,
"reward": 0.6057916302233934,
"reward_std": 0.48515384271740913,
"rewards/cosine_scaled_reward": -0.10335419327020645,
"rewards/format_reward": 0.8125000298023224,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 1174.4792175292969,
"epoch": 0.18857142857142858,
"grad_norm": 0.2708332867444507,
"kl": 0.00788116455078125,
"learning_rate": 8.625962667065487e-07,
"loss": 0.0394,
"reward": 0.32264771312475204,
"reward_std": 0.4833778813481331,
"rewards/cosine_scaled_reward": -0.24492615275084972,
"rewards/format_reward": 0.8125000149011612,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 907.0833740234375,
"epoch": 0.18971428571428572,
"grad_norm": 0.3261002575687217,
"kl": 0.00717926025390625,
"learning_rate": 8.603287946810513e-07,
"loss": 0.1283,
"reward": 0.6173169314861298,
"reward_std": 0.2740478292107582,
"rewards/cosine_scaled_reward": -0.1705082282423973,
"rewards/format_reward": 0.9583333432674408,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 1270.0417175292969,
"epoch": 0.19085714285714286,
"grad_norm": 0.3129560409827591,
"kl": 0.00971221923828125,
"learning_rate": 8.580461976679099e-07,
"loss": 0.1093,
"reward": 0.3518100567162037,
"reward_std": 0.5595069229602814,
"rewards/cosine_scaled_reward": -0.1470116525888443,
"rewards/format_reward": 0.645833358168602,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 1077.1041870117188,
"epoch": 0.192,
"grad_norm": 0.26915582005747507,
"kl": 0.0078125,
"learning_rate": 8.557485869176825e-07,
"loss": 0.1617,
"reward": 0.2642595246434212,
"reward_std": 0.46994560211896896,
"rewards/cosine_scaled_reward": -0.26370356790721416,
"rewards/format_reward": 0.7916666865348816,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 1156.2500305175781,
"epoch": 0.19314285714285714,
"grad_norm": 0.35785552736378773,
"kl": 0.0098724365234375,
"learning_rate": 8.534360744126753e-07,
"loss": 0.0922,
"reward": 0.77548947930336,
"reward_std": 0.7726699560880661,
"rewards/cosine_scaled_reward": 0.0023280568420886993,
"rewards/format_reward": 0.770833358168602,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 1073.7292175292969,
"epoch": 0.19428571428571428,
"grad_norm": 0.32755955253118335,
"kl": 0.0117034912109375,
"learning_rate": 8.511087728614862e-07,
"loss": 0.0752,
"reward": 0.19202834740281105,
"reward_std": 0.3850276917219162,
"rewards/cosine_scaled_reward": -0.3206525072455406,
"rewards/format_reward": 0.8333333432674408,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 918.8333511352539,
"epoch": 0.19542857142857142,
"grad_norm": 0.3616914993674,
"kl": 0.00833892822265625,
"learning_rate": 8.487667956935087e-07,
"loss": 0.0904,
"reward": 0.5478162653744221,
"reward_std": 0.6629246398806572,
"rewards/cosine_scaled_reward": -0.1948418878018856,
"rewards/format_reward": 0.9375,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 1036.1458740234375,
"epoch": 0.19657142857142856,
"grad_norm": 0.3354400822116869,
"kl": 0.0130157470703125,
"learning_rate": 8.464102570534061e-07,
"loss": 0.0669,
"reward": 0.7608658275566995,
"reward_std": 0.6014236621558666,
"rewards/cosine_scaled_reward": -0.04665040969848633,
"rewards/format_reward": 0.8541666865348816,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 1106.4583740234375,
"epoch": 0.1977142857142857,
"grad_norm": 0.3236947350770136,
"kl": 0.0121307373046875,
"learning_rate": 8.440392717955475e-07,
"loss": 0.093,
"reward": 0.7088564559817314,
"reward_std": 0.4235651511698961,
"rewards/cosine_scaled_reward": -0.010155089199543,
"rewards/format_reward": 0.7291666865348816,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 1109.5000305175781,
"epoch": 0.19885714285714284,
"grad_norm": 0.37244639543702895,
"kl": 0.015838623046875,
"learning_rate": 8.416539554784089e-07,
"loss": 0.1098,
"reward": 0.17886048182845116,
"reward_std": 0.35543810576200485,
"rewards/cosine_scaled_reward": -0.30640310421586037,
"rewards/format_reward": 0.7916666865348816,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 972.9167022705078,
"epoch": 0.2,
"grad_norm": 0.6554774460546362,
"kl": 0.0153045654296875,
"learning_rate": 8.392544243589427e-07,
"loss": 0.2068,
"reward": 0.607050247490406,
"reward_std": 0.4396999180316925,
"rewards/cosine_scaled_reward": -0.14439154416322708,
"rewards/format_reward": 0.895833358168602,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 998.2291870117188,
"epoch": 0.20114285714285715,
"grad_norm": 0.28748166515655293,
"kl": 0.0133819580078125,
"learning_rate": 8.368407953869103e-07,
"loss": 0.0371,
"reward": 0.486224377527833,
"reward_std": 0.6124172061681747,
"rewards/cosine_scaled_reward": -0.17355448007583618,
"rewards/format_reward": 0.8333333432674408,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 916.2291870117188,
"epoch": 0.2022857142857143,
"grad_norm": 0.4438177799902679,
"kl": 0.0131378173828125,
"learning_rate": 8.344131861991828e-07,
"loss": 0.1487,
"reward": 0.8074519336223602,
"reward_std": 0.4988584369421005,
"rewards/cosine_scaled_reward": -0.05460738018155098,
"rewards/format_reward": 0.9166666865348816,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 822.8333740234375,
"epoch": 0.20342857142857143,
"grad_norm": 0.5173286289503403,
"kl": 0.0179443359375,
"learning_rate": 8.319717151140072e-07,
"loss": 0.1961,
"reward": 1.0362385213375092,
"reward_std": 0.5397170335054398,
"rewards/cosine_scaled_reward": 0.13270257785916328,
"rewards/format_reward": 0.770833358168602,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 959.9792022705078,
"epoch": 0.20457142857142857,
"grad_norm": 0.369107073779179,
"kl": 0.016815185546875,
"learning_rate": 8.295165011252396e-07,
"loss": 0.1417,
"reward": 0.6556574255228043,
"reward_std": 0.4815560430288315,
"rewards/cosine_scaled_reward": -0.10967130470089614,
"rewards/format_reward": 0.8750000149011612,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 1162.7292022705078,
"epoch": 0.2057142857142857,
"grad_norm": 0.5036563993456736,
"kl": 0.01904296875,
"learning_rate": 8.270476638965461e-07,
"loss": 0.0949,
"reward": 0.2779462654143572,
"reward_std": 0.4615231901407242,
"rewards/cosine_scaled_reward": -0.24644354078918695,
"rewards/format_reward": 0.7708333432674408,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 1078.8333435058594,
"epoch": 0.20685714285714285,
"grad_norm": 0.4317948665990577,
"kl": 0.0135955810546875,
"learning_rate": 8.245653237555705e-07,
"loss": 0.1473,
"reward": 0.6264736168086529,
"reward_std": 0.5298948585987091,
"rewards/cosine_scaled_reward": -0.10342983156442642,
"rewards/format_reward": 0.8333333432674408,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 1065.8125305175781,
"epoch": 0.208,
"grad_norm": 0.5168299485262725,
"kl": 0.02105712890625,
"learning_rate": 8.220696016880687e-07,
"loss": 0.1884,
"reward": 0.3882112614810467,
"reward_std": 0.5859006345272064,
"rewards/cosine_scaled_reward": -0.2017277143895626,
"rewards/format_reward": 0.7916666865348816,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 1069.4583740234375,
"epoch": 0.20914285714285713,
"grad_norm": 0.5024855038579699,
"kl": 0.0205078125,
"learning_rate": 8.195606193320136e-07,
"loss": 0.1323,
"reward": 0.24412129819393158,
"reward_std": 0.47408775985240936,
"rewards/cosine_scaled_reward": -0.2529393620789051,
"rewards/format_reward": 0.7500000149011612,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 936.4792022705078,
"epoch": 0.2102857142857143,
"grad_norm": 0.4981488833418968,
"kl": 0.017730712890625,
"learning_rate": 8.170384989716657e-07,
"loss": 0.137,
"reward": 0.6930912919342518,
"reward_std": 0.5617035925388336,
"rewards/cosine_scaled_reward": -0.03887102263979614,
"rewards/format_reward": 0.7708333432674408,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 1030.4166717529297,
"epoch": 0.21142857142857144,
"grad_norm": 0.4904295101939947,
"kl": 0.0301513671875,
"learning_rate": 8.145033635316128e-07,
"loss": 0.1667,
"reward": 0.07037857547402382,
"reward_std": 0.27715054154396057,
"rewards/cosine_scaled_reward": -0.33981072157621384,
"rewards/format_reward": 0.7500000298023224,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 1297.3750457763672,
"epoch": 0.21257142857142858,
"grad_norm": 0.359329704495533,
"kl": 0.02447509765625,
"learning_rate": 8.119553365707802e-07,
"loss": 0.0722,
"reward": 0.27740756422281265,
"reward_std": 0.35020239651203156,
"rewards/cosine_scaled_reward": -0.20504622161388397,
"rewards/format_reward": 0.6875000149011612,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 1278.2291870117188,
"epoch": 0.21371428571428572,
"grad_norm": 0.6229091446373484,
"kl": 0.037841796875,
"learning_rate": 8.093945422764069e-07,
"loss": 0.159,
"reward": 0.6862413678318262,
"reward_std": 0.806188240647316,
"rewards/cosine_scaled_reward": -0.011045984923839569,
"rewards/format_reward": 0.7083333507180214,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 828.7083587646484,
"epoch": 0.21485714285714286,
"grad_norm": 0.8396211982213951,
"kl": 0.029296875,
"learning_rate": 8.068211054579943e-07,
"loss": 0.1705,
"reward": 0.5941705331206322,
"reward_std": 0.6708386167883873,
"rewards/cosine_scaled_reward": -0.12999806739389896,
"rewards/format_reward": 0.8541666865348816,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 1126.9583740234375,
"epoch": 0.216,
"grad_norm": 1.0692586435721545,
"kl": 0.05059814453125,
"learning_rate": 8.04235151541222e-07,
"loss": 0.2306,
"reward": 0.3716874085366726,
"reward_std": 0.6852569133043289,
"rewards/cosine_scaled_reward": -0.17873962549492717,
"rewards/format_reward": 0.7291666939854622,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 1441.7292175292969,
"epoch": 0.21714285714285714,
"grad_norm": 0.4556901372243305,
"kl": 0.0775146484375,
"learning_rate": 8.01636806561836e-07,
"loss": 0.0641,
"reward": -0.02832420915365219,
"reward_std": 0.41898399591445923,
"rewards/cosine_scaled_reward": -0.21207877062261105,
"rewards/format_reward": 0.3958333432674408,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 1079.3541870117188,
"epoch": 0.21828571428571428,
"grad_norm": 0.7752155732582218,
"kl": 0.05340576171875,
"learning_rate": 7.990261971595048e-07,
"loss": 0.1862,
"reward": 0.4970630258321762,
"reward_std": 0.6355597376823425,
"rewards/cosine_scaled_reward": -0.1264684833586216,
"rewards/format_reward": 0.7500000149011612,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 1067.5417175292969,
"epoch": 0.21942857142857142,
"grad_norm": 0.9433921479755671,
"kl": 0.0628662109375,
"learning_rate": 7.964034505716476e-07,
"loss": 0.1345,
"reward": 0.34896004013717175,
"reward_std": 0.44530968368053436,
"rewards/cosine_scaled_reward": -0.19010332133620977,
"rewards/format_reward": 0.7291666865348816,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 1153.1458892822266,
"epoch": 0.22057142857142858,
"grad_norm": 0.557299045473737,
"kl": 0.07440185546875,
"learning_rate": 7.93768694627233e-07,
"loss": 0.0623,
"reward": 0.3937496952712536,
"reward_std": 0.4528709352016449,
"rewards/cosine_scaled_reward": -0.14687515422701836,
"rewards/format_reward": 0.6875000149011612,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 772.7708435058594,
"epoch": 0.22171428571428572,
"grad_norm": 1.0195572615380695,
"kl": 0.03753662109375,
"learning_rate": 7.911220577405484e-07,
"loss": 0.1994,
"reward": 1.379511073231697,
"reward_std": 0.604660227894783,
"rewards/cosine_scaled_reward": 0.23142218962311745,
"rewards/format_reward": 0.9166666865348816,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 1043.2708740234375,
"epoch": 0.22285714285714286,
"grad_norm": 0.9603645520119819,
"kl": 0.057830810546875,
"learning_rate": 7.884636689049422e-07,
"loss": 0.101,
"reward": 0.9527463093400002,
"reward_std": 0.651703879237175,
"rewards/cosine_scaled_reward": 0.12220647558569908,
"rewards/format_reward": 0.7083333432674408,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 1171.8125305175781,
"epoch": 0.224,
"grad_norm": 1.0759043540199384,
"kl": 0.094970703125,
"learning_rate": 7.857936576865356e-07,
"loss": 0.0986,
"reward": 0.22757766395807266,
"reward_std": 0.5421559736132622,
"rewards/cosine_scaled_reward": -0.14662783965468407,
"rewards/format_reward": 0.5208333488553762,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 1254.6458892822266,
"epoch": 0.22514285714285714,
"grad_norm": 1.2281398548522355,
"kl": 0.1163330078125,
"learning_rate": 7.831121542179086e-07,
"loss": 0.2334,
"reward": 0.1120694987475872,
"reward_std": 0.406834427267313,
"rewards/cosine_scaled_reward": -0.21479860320687294,
"rewards/format_reward": 0.541666679084301,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 1551.9792175292969,
"epoch": 0.22628571428571428,
"grad_norm": 1.2807709220712407,
"kl": 0.1573486328125,
"learning_rate": 7.804192891917571e-07,
"loss": 0.1642,
"reward": 0.1520095318555832,
"reward_std": 0.5469059012830257,
"rewards/cosine_scaled_reward": -0.16357857827097178,
"rewards/format_reward": 0.4791666716337204,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 1243.7500457763672,
"epoch": 0.22742857142857142,
"grad_norm": 1.2387930807523095,
"kl": 0.1546630859375,
"learning_rate": 7.777151938545235e-07,
"loss": 0.0664,
"reward": 0.5908387266099453,
"reward_std": 0.44286736100912094,
"rewards/cosine_scaled_reward": 0.014169345609843731,
"rewards/format_reward": 0.5625000149011612,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 998.3542022705078,
"epoch": 0.22857142857142856,
"grad_norm": 1.6258231243608119,
"kl": 0.146240234375,
"learning_rate": 7.75e-07,
"loss": 0.223,
"reward": 0.9689896870404482,
"reward_std": 0.6490836925804615,
"rewards/cosine_scaled_reward": 0.10949480719864368,
"rewards/format_reward": 0.7500000298023224,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 1197.5208587646484,
"epoch": 0.2297142857142857,
"grad_norm": 1.2117522808382983,
"kl": 0.15203857421875,
"learning_rate": 7.72273839962904e-07,
"loss": 0.1108,
"reward": 0.29535099118947983,
"reward_std": 0.6659888252615929,
"rewards/cosine_scaled_reward": -0.18565785279497504,
"rewards/format_reward": 0.6666666865348816,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 1035.2292022705078,
"epoch": 0.23085714285714284,
"grad_norm": 2.430024645446878,
"kl": 0.1729736328125,
"learning_rate": 7.695368466124296e-07,
"loss": 0.1341,
"reward": 0.4362456016242504,
"reward_std": 0.665816992521286,
"rewards/cosine_scaled_reward": -0.13604386523365974,
"rewards/format_reward": 0.7083333432674408,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 1285.2083587646484,
"epoch": 0.232,
"grad_norm": 3.5252314114631926,
"kl": 0.2603759765625,
"learning_rate": 7.667891533457718e-07,
"loss": 0.2005,
"reward": 0.48519248701632023,
"reward_std": 0.612464651465416,
"rewards/cosine_scaled_reward": -0.08032042533159256,
"rewards/format_reward": 0.6458333432674408,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 835.0833587646484,
"epoch": 0.23314285714285715,
"grad_norm": 1.8381824332135883,
"kl": 0.1859130859375,
"learning_rate": 7.640308940816239e-07,
"loss": 0.053,
"reward": 1.2399137616157532,
"reward_std": 0.6745168194174767,
"rewards/cosine_scaled_reward": 0.2241235449910164,
"rewards/format_reward": 0.7916666865348816,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 1421.9583435058594,
"epoch": 0.2342857142857143,
"grad_norm": 1.7567396533005133,
"kl": 0.362548828125,
"learning_rate": 7.612622032536507e-07,
"loss": 0.1051,
"reward": 0.3085259608924389,
"reward_std": 0.6349210105836391,
"rewards/cosine_scaled_reward": -0.08532036282122135,
"rewards/format_reward": 0.4791666865348816,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 1228.7083892822266,
"epoch": 0.23542857142857143,
"grad_norm": 2.3389392066981562,
"kl": 0.30615234375,
"learning_rate": 7.584832158039378e-07,
"loss": 0.0693,
"reward": 0.18148453161120415,
"reward_std": 0.5284193530678749,
"rewards/cosine_scaled_reward": -0.24259107932448387,
"rewards/format_reward": 0.6666666716337204,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 1070.4792022705078,
"epoch": 0.23657142857142857,
"grad_norm": 3.543320557594463,
"kl": 0.26416015625,
"learning_rate": 7.556940671764124e-07,
"loss": 0.1883,
"reward": 0.542645301669836,
"reward_std": 0.5379708558320999,
"rewards/cosine_scaled_reward": -0.12451068125665188,
"rewards/format_reward": 0.7916666716337204,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 1317.2708740234375,
"epoch": 0.2377142857142857,
"grad_norm": 1.9754558032385148,
"kl": 0.6748046875,
"learning_rate": 7.528948933102438e-07,
"loss": 0.1365,
"reward": 0.09549727046396583,
"reward_std": 0.3623932749032974,
"rewards/cosine_scaled_reward": -0.2126680426299572,
"rewards/format_reward": 0.5208333395421505,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 919.9375457763672,
"epoch": 0.23885714285714285,
"grad_norm": 3.305425458945869,
"kl": 0.474609375,
"learning_rate": 7.500858306332172e-07,
"loss": 0.0593,
"reward": 0.7489641904830933,
"reward_std": 0.4507276937365532,
"rewards/cosine_scaled_reward": 0.030732073821127415,
"rewards/format_reward": 0.6875000149011612,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 908.8958435058594,
"epoch": 0.24,
"grad_norm": 3.7173678494051496,
"kl": 0.403564453125,
"learning_rate": 7.472670160550848e-07,
"loss": 0.1606,
"reward": 0.7559212893247604,
"reward_std": 0.5382421165704727,
"rewards/cosine_scaled_reward": -0.007456040009856224,
"rewards/format_reward": 0.7708333432674408,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 1479.0416870117188,
"epoch": 0.24114285714285713,
"grad_norm": 39.96198653082631,
"kl": 2.5693359375,
"learning_rate": 7.444385869608921e-07,
"loss": 0.2707,
"reward": 0.03475058265030384,
"reward_std": 0.3246455695480108,
"rewards/cosine_scaled_reward": -0.18054138123989105,
"rewards/format_reward": 0.39583333395421505,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 1051.0208587646484,
"epoch": 0.2422857142857143,
"grad_norm": 3.2393755765757777,
"kl": 0.53466796875,
"learning_rate": 7.416006812042827e-07,
"loss": 0.0958,
"reward": 0.6123923324048519,
"reward_std": 0.5387515500187874,
"rewards/cosine_scaled_reward": -0.04797050543129444,
"rewards/format_reward": 0.708333358168602,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 1301.3750610351562,
"epoch": 0.24342857142857144,
"grad_norm": 2.65733014184082,
"kl": 0.755859375,
"learning_rate": 7.387534371007797e-07,
"loss": 0.1374,
"reward": 0.1711240354925394,
"reward_std": 0.42111407220363617,
"rewards/cosine_scaled_reward": -0.16443797945976257,
"rewards/format_reward": 0.5000000149011612,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 1203.6875610351562,
"epoch": 0.24457142857142858,
"grad_norm": 2.501952170306742,
"kl": 0.50732421875,
"learning_rate": 7.358969934210438e-07,
"loss": 0.1105,
"reward": 0.22278533224016428,
"reward_std": 0.434869222342968,
"rewards/cosine_scaled_reward": -0.22194067016243935,
"rewards/format_reward": 0.6666667014360428,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 1249.6667022705078,
"epoch": 0.24571428571428572,
"grad_norm": 4.086485386572322,
"kl": 0.880859375,
"learning_rate": 7.330314893841101e-07,
"loss": 0.0173,
"reward": 0.3316160347312689,
"reward_std": 0.5279753059148788,
"rewards/cosine_scaled_reward": -0.14669198356568813,
"rewards/format_reward": 0.6250000223517418,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 1369.1666870117188,
"epoch": 0.24685714285714286,
"grad_norm": 3.328918162087878,
"kl": 0.773193359375,
"learning_rate": 7.301570646506027e-07,
"loss": 0.1402,
"reward": 0.2145287273451686,
"reward_std": 0.5796768814325333,
"rewards/cosine_scaled_reward": -0.16356897167861462,
"rewards/format_reward": 0.5416666865348816,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 1269.8542175292969,
"epoch": 0.248,
"grad_norm": 2.8333189883709515,
"kl": 0.75927734375,
"learning_rate": 7.27273859315928e-07,
"loss": -0.0115,
"reward": 0.5310591869056225,
"reward_std": 0.4825605973601341,
"rewards/cosine_scaled_reward": -0.057387083768844604,
"rewards/format_reward": 0.645833358168602,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 1252.2708740234375,
"epoch": 0.24914285714285714,
"grad_norm": 4.762778702423241,
"kl": 0.74072265625,
"learning_rate": 7.243820139034464e-07,
"loss": 0.1477,
"reward": 0.5015929639339447,
"reward_std": 0.3994259871542454,
"rewards/cosine_scaled_reward": -0.07212021434679627,
"rewards/format_reward": 0.6458333507180214,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 1017.5416870117188,
"epoch": 0.2502857142857143,
"grad_norm": 4.164501369060878,
"kl": 1.07958984375,
"learning_rate": 7.214816693576234e-07,
"loss": 0.1337,
"reward": 0.767455330118537,
"reward_std": 0.5030167028307915,
"rewards/cosine_scaled_reward": 0.07122766599059105,
"rewards/format_reward": 0.6250000223517418,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 1053.1875457763672,
"epoch": 0.25142857142857145,
"grad_norm": 3.588996799420188,
"kl": 0.71142578125,
"learning_rate": 7.185729670371604e-07,
"loss": 0.1866,
"reward": 0.48609594255685806,
"reward_std": 0.617650680243969,
"rewards/cosine_scaled_reward": -0.11111870361492038,
"rewards/format_reward": 0.7083333432674408,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 1244.687515258789,
"epoch": 0.25257142857142856,
"grad_norm": 2.946733537468475,
"kl": 1.330078125,
"learning_rate": 7.156560487081051e-07,
"loss": 0.1268,
"reward": 0.4570632018148899,
"reward_std": 0.36856189370155334,
"rewards/cosine_scaled_reward": -0.0006350576877593994,
"rewards/format_reward": 0.4583333395421505,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 1376.9167175292969,
"epoch": 0.2537142857142857,
"grad_norm": 3.53418042013775,
"kl": 1.1337890625,
"learning_rate": 7.127310565369415e-07,
"loss": 0.2362,
"reward": 0.1362705221399665,
"reward_std": 0.3934030085802078,
"rewards/cosine_scaled_reward": -0.19228141009807587,
"rewards/format_reward": 0.5208333358168602,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 1156.750015258789,
"epoch": 0.25485714285714284,
"grad_norm": 35.23833796360462,
"kl": 2.369140625,
"learning_rate": 7.097981330836616e-07,
"loss": 0.1765,
"reward": 0.6305188983678818,
"reward_std": 0.5979669764637947,
"rewards/cosine_scaled_reward": 0.023592765908688307,
"rewards/format_reward": 0.5833333507180214,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 1251.3125305175781,
"epoch": 0.256,
"grad_norm": 3.4418620220945138,
"kl": 1.376953125,
"learning_rate": 7.068574212948169e-07,
"loss": 0.1723,
"reward": 0.5104624545201659,
"reward_std": 0.25178899243474007,
"rewards/cosine_scaled_reward": -0.06768545880913734,
"rewards/format_reward": 0.6458333432674408,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 924.3541870117188,
"epoch": 0.2571428571428571,
"grad_norm": 6.348797231777103,
"kl": 0.9375,
"learning_rate": 7.039090644965509e-07,
"loss": 0.1337,
"reward": 0.7791457176208496,
"reward_std": 0.7603946030139923,
"rewards/cosine_scaled_reward": 0.07707285927608609,
"rewards/format_reward": 0.6250000223517418,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 1052.5000305175781,
"epoch": 0.2582857142857143,
"grad_norm": 3.9386080018485288,
"kl": 1.52734375,
"learning_rate": 7.009532063876148e-07,
"loss": 0.2459,
"reward": 0.46499455720186234,
"reward_std": 0.6090477257966995,
"rewards/cosine_scaled_reward": -0.09041939489543438,
"rewards/format_reward": 0.6458333432674408,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 892.1875152587891,
"epoch": 0.25942857142857145,
"grad_norm": 3.4313724086317445,
"kl": 1.125,
"learning_rate": 6.979899910323624e-07,
"loss": 0.1959,
"reward": 0.5925753712654114,
"reward_std": 0.8098603934049606,
"rewards/cosine_scaled_reward": -0.04746231180615723,
"rewards/format_reward": 0.6875000298023224,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 1091.6458740234375,
"epoch": 0.26057142857142856,
"grad_norm": 4.447647427267497,
"kl": 1.66015625,
"learning_rate": 6.950195628537299e-07,
"loss": 0.1179,
"reward": 0.24639339372515678,
"reward_std": 0.48318010196089745,
"rewards/cosine_scaled_reward": -0.17888664733618498,
"rewards/format_reward": 0.6041666865348816,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 809.2916870117188,
"epoch": 0.26171428571428573,
"grad_norm": 8.169532609480521,
"kl": 2.046875,
"learning_rate": 6.920420666261961e-07,
"loss": 0.3082,
"reward": 0.5617873594164848,
"reward_std": 0.7489510700106621,
"rewards/cosine_scaled_reward": -0.07327299565076828,
"rewards/format_reward": 0.7083333432674408,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 1110.3542175292969,
"epoch": 0.26285714285714284,
"grad_norm": 2.921180843223507,
"kl": 2.2265625,
"learning_rate": 6.890576474687263e-07,
"loss": 0.1487,
"reward": 0.4394577872008085,
"reward_std": 0.4748491495847702,
"rewards/cosine_scaled_reward": -0.05110444873571396,
"rewards/format_reward": 0.5416666977107525,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 1026.3542175292969,
"epoch": 0.264,
"grad_norm": 2.544177744090501,
"kl": 1.572265625,
"learning_rate": 6.860664508377001e-07,
"loss": 0.1564,
"reward": 0.2407762985676527,
"reward_std": 0.5902754589915276,
"rewards/cosine_scaled_reward": -0.20252852141857147,
"rewards/format_reward": 0.645833358168602,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 1030.8958587646484,
"epoch": 0.2651428571428571,
"grad_norm": 3.5304119337525526,
"kl": 1.529296875,
"learning_rate": 6.83068622519821e-07,
"loss": 0.1109,
"reward": 0.42541009094566107,
"reward_std": 0.6807678937911987,
"rewards/cosine_scaled_reward": -0.11021162755787373,
"rewards/format_reward": 0.6458333507180214,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 1073.3333435058594,
"epoch": 0.2662857142857143,
"grad_norm": 3.0267711493511382,
"kl": 1.1796875,
"learning_rate": 6.800643086250121e-07,
"loss": 0.2702,
"reward": 0.42545080557465553,
"reward_std": 0.48426005244255066,
"rewards/cosine_scaled_reward": -0.15185793861746788,
"rewards/format_reward": 0.7291666716337204,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 1166.9791870117188,
"epoch": 0.2674285714285714,
"grad_norm": 2.956369605796136,
"kl": 1.1279296875,
"learning_rate": 6.770536555792944e-07,
"loss": 0.1076,
"reward": 0.3714570254087448,
"reward_std": 0.650765061378479,
"rewards/cosine_scaled_reward": -0.13718816195614636,
"rewards/format_reward": 0.645833358168602,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 1054.6667175292969,
"epoch": 0.26857142857142857,
"grad_norm": 4.47554265499188,
"kl": 1.21484375,
"learning_rate": 6.740368101176495e-07,
"loss": 0.2849,
"reward": 0.6623743935488164,
"reward_std": 0.7155829221010208,
"rewards/cosine_scaled_reward": -0.012562822550535202,
"rewards/format_reward": 0.6875000223517418,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 1096.1875457763672,
"epoch": 0.26971428571428574,
"grad_norm": 4.925975683565178,
"kl": 1.3408203125,
"learning_rate": 6.710139192768694e-07,
"loss": 0.2351,
"reward": 0.26786297000944614,
"reward_std": 0.5117842257022858,
"rewards/cosine_scaled_reward": -0.2202351950109005,
"rewards/format_reward": 0.708333358168602,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 983.2291870117188,
"epoch": 0.27085714285714285,
"grad_norm": 2.226077510557553,
"kl": 0.77294921875,
"learning_rate": 6.679851303883891e-07,
"loss": 0.1527,
"reward": 0.5171467587351799,
"reward_std": 0.5790724456310272,
"rewards/cosine_scaled_reward": -0.10600997135043144,
"rewards/format_reward": 0.7291666865348816,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 1015.9167175292969,
"epoch": 0.272,
"grad_norm": 2.746018994596942,
"kl": 1.0703125,
"learning_rate": 6.649505910711058e-07,
"loss": 0.1685,
"reward": 0.4093864783644676,
"reward_std": 0.5853541940450668,
"rewards/cosine_scaled_reward": -0.1911400929093361,
"rewards/format_reward": 0.7916667014360428,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 1138.8542022705078,
"epoch": 0.27314285714285713,
"grad_norm": 2.366422791383297,
"kl": 1.3916015625,
"learning_rate": 6.619104492241847e-07,
"loss": 0.1319,
"reward": 0.03224743437021971,
"reward_std": 0.40017952769994736,
"rewards/cosine_scaled_reward": -0.2963762879371643,
"rewards/format_reward": 0.6250000298023224,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 916.8125305175781,
"epoch": 0.2742857142857143,
"grad_norm": 1.7577643969871468,
"kl": 1.291015625,
"learning_rate": 6.588648530198504e-07,
"loss": 0.13,
"reward": 0.8863477371633053,
"reward_std": 0.6274040639400482,
"rewards/cosine_scaled_reward": 0.10984052997082472,
"rewards/format_reward": 0.6666666865348816,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 891.0417022705078,
"epoch": 0.2754285714285714,
"grad_norm": 2.841473966918375,
"kl": 1.0361328125,
"learning_rate": 6.558139508961654e-07,
"loss": 0.1554,
"reward": 0.48904264718294144,
"reward_std": 0.669127531349659,
"rewards/cosine_scaled_reward": -0.16172868385910988,
"rewards/format_reward": 0.8125000149011612,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 900.8541870117188,
"epoch": 0.2765714285714286,
"grad_norm": 4.202915193648642,
"kl": 0.96337890625,
"learning_rate": 6.527578915497951e-07,
"loss": 0.1132,
"reward": 0.6491687893867493,
"reward_std": 0.6397206410765648,
"rewards/cosine_scaled_reward": -0.08166561461985111,
"rewards/format_reward": 0.8125000149011612,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 871.3542022705078,
"epoch": 0.2777142857142857,
"grad_norm": 4.013401867872089,
"kl": 1.2275390625,
"learning_rate": 6.496968239287603e-07,
"loss": 0.0343,
"reward": 0.6437305957078934,
"reward_std": 0.566775843501091,
"rewards/cosine_scaled_reward": -0.06355137238278985,
"rewards/format_reward": 0.7708333432674408,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 1051.8541870117188,
"epoch": 0.27885714285714286,
"grad_norm": 2.0640323982742346,
"kl": 1.2119140625,
"learning_rate": 6.466308972251785e-07,
"loss": 0.1283,
"reward": 0.6993502229452133,
"reward_std": 0.8381707072257996,
"rewards/cosine_scaled_reward": -0.04615823458880186,
"rewards/format_reward": 0.7916667014360428,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 926.8958587646484,
"epoch": 0.28,
"grad_norm": 2.3095581027269456,
"kl": 1.2373046875,
"learning_rate": 6.435602608679916e-07,
"loss": 0.1728,
"reward": 0.5032865107059479,
"reward_std": 0.4741464629769325,
"rewards/cosine_scaled_reward": -0.15460674837231636,
"rewards/format_reward": 0.8125000298023224,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 948.7292022705078,
"epoch": 0.28114285714285714,
"grad_norm": 2.2705966167509697,
"kl": 1.0166015625,
"learning_rate": 6.404850645156841e-07,
"loss": 0.0879,
"reward": 0.5439350083470345,
"reward_std": 0.6458217911422253,
"rewards/cosine_scaled_reward": -0.11344920098781586,
"rewards/format_reward": 0.770833358168602,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 766.6250152587891,
"epoch": 0.2822857142857143,
"grad_norm": 4.218176679768865,
"kl": 1.375,
"learning_rate": 6.374054580489873e-07,
"loss": 0.1529,
"reward": 0.7583817802369595,
"reward_std": 0.9407426938414574,
"rewards/cosine_scaled_reward": 0.02502422034740448,
"rewards/format_reward": 0.7083333432674408,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 1149.2708435058594,
"epoch": 0.2834285714285714,
"grad_norm": 2.966316254338991,
"kl": 1.69921875,
"learning_rate": 6.343215915635761e-07,
"loss": 0.1307,
"reward": 0.37028552405536175,
"reward_std": 0.35450038872659206,
"rewards/cosine_scaled_reward": -0.15860724076628685,
"rewards/format_reward": 0.6875000298023224,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 1236.3750305175781,
"epoch": 0.2845714285714286,
"grad_norm": 2.8644099570080126,
"kl": 1.646484375,
"learning_rate": 6.31233615362752e-07,
"loss": 0.142,
"reward": 0.3449726775288582,
"reward_std": 0.7856429815292358,
"rewards/cosine_scaled_reward": -0.09834698960185051,
"rewards/format_reward": 0.5416666865348816,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 977.7500305175781,
"epoch": 0.2857142857142857,
"grad_norm": 1.9099821609277308,
"kl": 0.921875,
"learning_rate": 6.281416799501187e-07,
"loss": 0.0404,
"reward": 0.6945669716224074,
"reward_std": 0.822948083281517,
"rewards/cosine_scaled_reward": -0.048549871891736984,
"rewards/format_reward": 0.7916666716337204,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 1265.9583740234375,
"epoch": 0.28685714285714287,
"grad_norm": 2.751476452748249,
"kl": 1.216796875,
"learning_rate": 6.2504593602e-07,
"loss": 0.1111,
"reward": 0.12667130306363106,
"reward_std": 0.7467320710420609,
"rewards/cosine_scaled_reward": -0.17624769732356071,
"rewards/format_reward": 0.4791666865348816,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 1031.0833740234375,
"epoch": 0.288,
"grad_norm": 3.701835452468544,
"kl": 1.033203125,
"learning_rate": 6.219465344613258e-07,
"loss": 0.2332,
"reward": 0.3126375643769279,
"reward_std": 0.748970627784729,
"rewards/cosine_scaled_reward": -0.09368122089654207,
"rewards/format_reward": 0.5000000074505806,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 889.5833587646484,
"epoch": 0.28914285714285715,
"grad_norm": 5.141640270028422,
"kl": 1.69921875,
"learning_rate": 6.188436263278172e-07,
"loss": -0.1188,
"reward": 0.23392239259555936,
"reward_std": 0.8090809062123299,
"rewards/cosine_scaled_reward": -0.11220548488199711,
"rewards/format_reward": 0.4583333432674408,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 912.5208587646484,
"epoch": 0.29028571428571426,
"grad_norm": 3.5136083178201183,
"kl": 1.1953125,
"learning_rate": 6.157373628530852e-07,
"loss": 0.1793,
"reward": 0.7197382766753435,
"reward_std": 0.9268201515078545,
"rewards/cosine_scaled_reward": 0.057785794138908386,
"rewards/format_reward": 0.6041666865348816,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 1043.3333740234375,
"epoch": 0.2914285714285714,
"grad_norm": 2.8576463073310023,
"kl": 1.361328125,
"learning_rate": 6.126278954320294e-07,
"loss": 0.1618,
"reward": 0.21097473427653313,
"reward_std": 0.8950171619653702,
"rewards/cosine_scaled_reward": -0.08201263658702374,
"rewards/format_reward": 0.3750000074505806,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 1132.9166870117188,
"epoch": 0.2925714285714286,
"grad_norm": 2.6390372016890877,
"kl": 0.9296875,
"learning_rate": 6.095153756157051e-07,
"loss": 0.1517,
"reward": 0.3409617803990841,
"reward_std": 0.7687749713659286,
"rewards/cosine_scaled_reward": -0.142019122838974,
"rewards/format_reward": 0.6250000149011612,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 1172.1458740234375,
"epoch": 0.2937142857142857,
"grad_norm": 1.7999790033387904,
"kl": 0.8994140625,
"learning_rate": 6.06399955103937e-07,
"loss": 0.0345,
"reward": 0.24714069813489914,
"reward_std": 0.526521310210228,
"rewards/cosine_scaled_reward": -0.20976299978792667,
"rewards/format_reward": 0.6666667014360428,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 1025.2292022705078,
"epoch": 0.2948571428571429,
"grad_norm": 3.7817000702854284,
"kl": 0.9970703125,
"learning_rate": 6.032817857379256e-07,
"loss": 0.0254,
"reward": 0.371606208384037,
"reward_std": 0.8782027065753937,
"rewards/cosine_scaled_reward": -0.10586357489228249,
"rewards/format_reward": 0.583333358168602,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 1027.1041870117188,
"epoch": 0.296,
"grad_norm": 2.2007546083055627,
"kl": 1.23828125,
"learning_rate": 6.001610194928464e-07,
"loss": 0.1329,
"reward": 0.2863161154091358,
"reward_std": 0.6974881812930107,
"rewards/cosine_scaled_reward": -0.16934195160865784,
"rewards/format_reward": 0.6250000149011612,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 1060.1666870117188,
"epoch": 0.29714285714285715,
"grad_norm": 2.0712856185453226,
"kl": 1.314453125,
"learning_rate": 5.97037808470444e-07,
"loss": -0.0031,
"reward": 0.05191618762910366,
"reward_std": 0.5254812240600586,
"rewards/cosine_scaled_reward": -0.1927919089794159,
"rewards/format_reward": 0.4375000074505806,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 800.5625152587891,
"epoch": 0.29828571428571427,
"grad_norm": 3.953323642394609,
"kl": 1.18359375,
"learning_rate": 5.939123048916173e-07,
"loss": 0.1926,
"reward": 0.16135332686826587,
"reward_std": 0.6497361660003662,
"rewards/cosine_scaled_reward": -0.21099001914262772,
"rewards/format_reward": 0.5833333432674408,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 906.3542022705078,
"epoch": 0.29942857142857143,
"grad_norm": 6.975231366994329,
"kl": 1.1025390625,
"learning_rate": 5.907846610890011e-07,
"loss": 0.2163,
"reward": 0.13131073210388422,
"reward_std": 0.5159479975700378,
"rewards/cosine_scaled_reward": -0.1739279804751277,
"rewards/format_reward": 0.47916667722165585,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 981.3958587646484,
"epoch": 0.30057142857142854,
"grad_norm": 3.6462739135853304,
"kl": 0.93359375,
"learning_rate": 5.87655029499542e-07,
"loss": 0.2144,
"reward": 0.2528093755245209,
"reward_std": 0.6878427565097809,
"rewards/cosine_scaled_reward": -0.19651199039071798,
"rewards/format_reward": 0.645833358168602,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 1036.4792022705078,
"epoch": 0.3017142857142857,
"grad_norm": 2.4186369761638797,
"kl": 1.11328125,
"learning_rate": 5.845235626570683e-07,
"loss": 0.0094,
"reward": 0.34765794809209183,
"reward_std": 0.7917995601892471,
"rewards/cosine_scaled_reward": -0.10742103308439255,
"rewards/format_reward": 0.5625000149011612,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 988.1458587646484,
"epoch": 0.3028571428571429,
"grad_norm": 3.8358402184782845,
"kl": 1.125,
"learning_rate": 5.813904131848564e-07,
"loss": 0.1412,
"reward": 0.22985844686627388,
"reward_std": 0.4855259954929352,
"rewards/cosine_scaled_reward": -0.17673744820058346,
"rewards/format_reward": 0.5833333358168602,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 853.1042022705078,
"epoch": 0.304,
"grad_norm": 3.155418565951925,
"kl": 1.138671875,
"learning_rate": 5.78255733788191e-07,
"loss": -0.0981,
"reward": 0.23544084653258324,
"reward_std": 0.5617225617170334,
"rewards/cosine_scaled_reward": -0.18436292186379433,
"rewards/format_reward": 0.6041666865348816,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 1040.7291870117188,
"epoch": 0.30514285714285716,
"grad_norm": 4.49377424287265,
"kl": 1.8671875,
"learning_rate": 5.751196772469237e-07,
"loss": 0.3133,
"reward": 0.019660448655486107,
"reward_std": 0.5969599932432175,
"rewards/cosine_scaled_reward": -0.14641978219151497,
"rewards/format_reward": 0.3125000149011612,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 1120.3334045410156,
"epoch": 0.3062857142857143,
"grad_norm": 2.9296163486934588,
"kl": 1.455078125,
"learning_rate": 5.71982396408026e-07,
"loss": 0.0891,
"reward": 0.019381534308195114,
"reward_std": 0.6385679095983505,
"rewards/cosine_scaled_reward": -0.188225906342268,
"rewards/format_reward": 0.3958333432674408,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 940.0833587646484,
"epoch": 0.30742857142857144,
"grad_norm": 3.99474649335861,
"kl": 1.58203125,
"learning_rate": 5.688440441781398e-07,
"loss": 0.2037,
"reward": 0.21233398653566837,
"reward_std": 0.5940781682729721,
"rewards/cosine_scaled_reward": -0.17508301883935928,
"rewards/format_reward": 0.5625000149011612,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 820.8541870117188,
"epoch": 0.30857142857142855,
"grad_norm": 3.64920081986899,
"kl": 1.548828125,
"learning_rate": 5.657047735161255e-07,
"loss": 0.187,
"reward": 0.287849310785532,
"reward_std": 0.7942548245191574,
"rewards/cosine_scaled_reward": -0.16857536626048386,
"rewards/format_reward": 0.6250000298023224,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 918.2708435058594,
"epoch": 0.3097142857142857,
"grad_norm": 4.142397150940974,
"kl": 1.3642578125,
"learning_rate": 5.625647374256061e-07,
"loss": -0.0034,
"reward": 0.21712711825966835,
"reward_std": 0.7582554370164871,
"rewards/cosine_scaled_reward": -0.1726864455267787,
"rewards/format_reward": 0.5625000223517418,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 1067.3125305175781,
"epoch": 0.31085714285714283,
"grad_norm": 5.568481701496752,
"kl": 1.576171875,
"learning_rate": 5.594240889475106e-07,
"loss": 0.2629,
"reward": 0.07018839695956558,
"reward_std": 0.6307368651032448,
"rewards/cosine_scaled_reward": -0.17323914170265198,
"rewards/format_reward": 0.4166666753590107,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 1032.7916870117188,
"epoch": 0.312,
"grad_norm": 2.7380334201594207,
"kl": 1.763671875,
"learning_rate": 5.562829811526154e-07,
"loss": 0.1532,
"reward": 0.1198783004656434,
"reward_std": 0.5959479659795761,
"rewards/cosine_scaled_reward": -0.15881085954606533,
"rewards/format_reward": 0.4375000149011612,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 1005.0000305175781,
"epoch": 0.31314285714285717,
"grad_norm": 3.288058849096818,
"kl": 1.3232421875,
"learning_rate": 5.531415671340826e-07,
"loss": 0.0679,
"reward": 0.33828355744481087,
"reward_std": 0.7625949904322624,
"rewards/cosine_scaled_reward": -0.1329415813088417,
"rewards/format_reward": 0.6041666716337204,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 1209.2083892822266,
"epoch": 0.3142857142857143,
"grad_norm": 3.384369498507843,
"kl": 1.3759765625,
"learning_rate": 5.5e-07,
"loss": 0.1487,
"reward": 0.2773652821779251,
"reward_std": 0.7781829237937927,
"rewards/cosine_scaled_reward": -0.09048402030020952,
"rewards/format_reward": 0.45833334885537624,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 871.2500152587891,
"epoch": 0.31542857142857145,
"grad_norm": 3.6001944034052666,
"kl": 1.2470703125,
"learning_rate": 5.468584328659172e-07,
"loss": 0.2545,
"reward": 0.4259207919239998,
"reward_std": 0.7986200153827667,
"rewards/cosine_scaled_reward": -0.1099562719464302,
"rewards/format_reward": 0.6458333432674408,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 970.1458740234375,
"epoch": 0.31657142857142856,
"grad_norm": 5.098242367200561,
"kl": 1.9375,
"learning_rate": 5.437170188473847e-07,
"loss": 0.0347,
"reward": 0.1577397957444191,
"reward_std": 0.8665766268968582,
"rewards/cosine_scaled_reward": -0.16071344492956996,
"rewards/format_reward": 0.479166679084301,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 1138.0417175292969,
"epoch": 0.3177142857142857,
"grad_norm": 4.893358334263393,
"kl": 1.51953125,
"learning_rate": 5.405759110524894e-07,
"loss": 0.2335,
"reward": 0.2129652127623558,
"reward_std": 0.8123987764120102,
"rewards/cosine_scaled_reward": -0.1122674010694027,
"rewards/format_reward": 0.4375000149011612,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 1055.5416870117188,
"epoch": 0.31885714285714284,
"grad_norm": 11.325087114885777,
"kl": 1.70703125,
"learning_rate": 5.37435262574394e-07,
"loss": 0.1758,
"reward": 0.2276703668758273,
"reward_std": 0.7087787315249443,
"rewards/cosine_scaled_reward": -0.14658149890601635,
"rewards/format_reward": 0.520833358168602,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 1201.5625305175781,
"epoch": 0.32,
"grad_norm": 4.499791162135755,
"kl": 1.3359375,
"learning_rate": 5.342952264838747e-07,
"loss": 0.199,
"reward": 0.4334499780088663,
"reward_std": 0.8222155347466469,
"rewards/cosine_scaled_reward": -0.0853583601419814,
"rewards/format_reward": 0.6041666716337204,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 1183.5833740234375,
"epoch": 0.3211428571428571,
"grad_norm": 3.6400895329844336,
"kl": 1.931640625,
"learning_rate": 5.311559558218603e-07,
"loss": 0.0286,
"reward": -0.14555206894874573,
"reward_std": 0.4930955022573471,
"rewards/cosine_scaled_reward": -0.2081927042454481,
"rewards/format_reward": 0.2708333395421505,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 1227.6875305175781,
"epoch": 0.3222857142857143,
"grad_norm": 3.351330372342759,
"kl": 1.51953125,
"learning_rate": 5.28017603591974e-07,
"loss": 0.1735,
"reward": 0.08991836942732334,
"reward_std": 0.7664570957422256,
"rewards/cosine_scaled_reward": -0.1946241520345211,
"rewards/format_reward": 0.479166679084301,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 1011.8958435058594,
"epoch": 0.32342857142857145,
"grad_norm": 3.607306150140324,
"kl": 1.52734375,
"learning_rate": 5.248803227530763e-07,
"loss": 0.1756,
"reward": -0.16347728297114372,
"reward_std": 0.6131603866815567,
"rewards/cosine_scaled_reward": -0.269238643348217,
"rewards/format_reward": 0.3750000074505806,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 1238.1875305175781,
"epoch": 0.32457142857142857,
"grad_norm": 3.700854838943554,
"kl": 1.3291015625,
"learning_rate": 5.21744266211809e-07,
"loss": 0.0644,
"reward": 0.19410160928964615,
"reward_std": 0.6351519152522087,
"rewards/cosine_scaled_reward": -0.16336587071418762,
"rewards/format_reward": 0.520833358168602,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 1302.5833435058594,
"epoch": 0.32571428571428573,
"grad_norm": 5.590443825333452,
"kl": 1.396484375,
"learning_rate": 5.186095868151436e-07,
"loss": 0.1172,
"reward": 0.0053066437467350625,
"reward_std": 0.6190855652093887,
"rewards/cosine_scaled_reward": -0.1952633447945118,
"rewards/format_reward": 0.3958333507180214,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 1408.7708587646484,
"epoch": 0.32685714285714285,
"grad_norm": 5820.413747461295,
"kl": 44.6220703125,
"learning_rate": 5.154764373429315e-07,
"loss": 2.1366,
"reward": 0.321873364970088,
"reward_std": 0.7274122461676598,
"rewards/cosine_scaled_reward": -0.06822998262941837,
"rewards/format_reward": 0.45833334885537624,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 1293.6875305175781,
"epoch": 0.328,
"grad_norm": 10688.293773017389,
"kl": 90.048828125,
"learning_rate": 5.123449705004581e-07,
"loss": 3.6012,
"reward": 0.22728685289621353,
"reward_std": 0.6926668882369995,
"rewards/cosine_scaled_reward": -0.10510657541453838,
"rewards/format_reward": 0.4375000074505806,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 1143.1042175292969,
"epoch": 0.3291428571428571,
"grad_norm": 69995.08344409091,
"kl": 821.830078125,
"learning_rate": 5.09215338910999e-07,
"loss": 50.9221,
"reward": 0.3029659762978554,
"reward_std": 0.8068300932645798,
"rewards/cosine_scaled_reward": -0.04643368790857494,
"rewards/format_reward": 0.3958333432674408,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 1325.7084045410156,
"epoch": 0.3302857142857143,
"grad_norm": 62.300695111663714,
"kl": 1.5146484375,
"learning_rate": 5.060876951083828e-07,
"loss": 0.1171,
"reward": 0.10640177875757217,
"reward_std": 0.6392035633325577,
"rewards/cosine_scaled_reward": -0.08221577852964401,
"rewards/format_reward": 0.2708333358168602,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 1066.375015258789,
"epoch": 0.3314285714285714,
"grad_norm": 3.0451709688438138,
"kl": 0.85791015625,
"learning_rate": 5.02962191529556e-07,
"loss": 0.0875,
"reward": 0.4837397076189518,
"reward_std": 0.6303973346948624,
"rewards/cosine_scaled_reward": -0.008130142465233803,
"rewards/format_reward": 0.5000000074505806,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 1176.2292175292969,
"epoch": 0.3325714285714286,
"grad_norm": 6.431194933370891,
"kl": 1.169921875,
"learning_rate": 4.998389805071536e-07,
"loss": 0.0944,
"reward": 0.004224353935569525,
"reward_std": 0.7458223477005959,
"rewards/cosine_scaled_reward": -0.17497116327285767,
"rewards/format_reward": 0.354166679084301,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 1279.1875610351562,
"epoch": 0.33371428571428574,
"grad_norm": 11.784461019524304,
"kl": 1.0419921875,
"learning_rate": 4.967182142620745e-07,
"loss": 0.0752,
"reward": -0.019843921065330505,
"reward_std": 0.5733096897602081,
"rewards/cosine_scaled_reward": -0.21825530380010605,
"rewards/format_reward": 0.4166666753590107,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 1270.3750305175781,
"epoch": 0.33485714285714285,
"grad_norm": 12451.222306718704,
"kl": 56.82421875,
"learning_rate": 4.93600044896063e-07,
"loss": 2.6089,
"reward": -0.0518635269254446,
"reward_std": 0.4941852539777756,
"rewards/cosine_scaled_reward": -0.22384843230247498,
"rewards/format_reward": 0.3958333507180214,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 1304.8750457763672,
"epoch": 0.336,
"grad_norm": 354145.9079404987,
"kl": 3584.8046875,
"learning_rate": 4.904846243842949e-07,
"loss": 283.5748,
"reward": 0.06046904996037483,
"reward_std": 0.7505204379558563,
"rewards/cosine_scaled_reward": -0.13643214339390397,
"rewards/format_reward": 0.3333333358168602,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 1317.5625610351562,
"epoch": 0.33714285714285713,
"grad_norm": 5.242464203702877,
"kl": 1.0029296875,
"learning_rate": 4.873721045679706e-07,
"loss": 0.1195,
"reward": 0.005757967010140419,
"reward_std": 0.6009484976530075,
"rewards/cosine_scaled_reward": -0.12212102208286524,
"rewards/format_reward": 0.2500000111758709,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 1103.0625610351562,
"epoch": 0.3382857142857143,
"grad_norm": 4.2430557491796055,
"kl": 0.8115234375,
"learning_rate": 4.842626371469149e-07,
"loss": 0.0632,
"reward": 0.0580328986980021,
"reward_std": 0.6936925277113914,
"rewards/cosine_scaled_reward": -0.15848355647176504,
"rewards/format_reward": 0.3750000149011612,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 1432.3333435058594,
"epoch": 0.3394285714285714,
"grad_norm": 2.9908283966206457,
"kl": 0.7646484375,
"learning_rate": 4.811563736721829e-07,
"loss": 0.1022,
"reward": -0.011708778678439558,
"reward_std": 0.5683621913194656,
"rewards/cosine_scaled_reward": -0.12043773010373116,
"rewards/format_reward": 0.2291666716337204,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 1451.8750305175781,
"epoch": 0.3405714285714286,
"grad_norm": 4.214445887739457,
"kl": 0.673828125,
"learning_rate": 4.780534655386743e-07,
"loss": -0.0113,
"reward": -0.12220606487244368,
"reward_std": 0.5942584052681923,
"rewards/cosine_scaled_reward": -0.18610304035246372,
"rewards/format_reward": 0.2500000074505806,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 1379.4791870117188,
"epoch": 0.3417142857142857,
"grad_norm": 4.524572878515851,
"kl": 0.5302734375,
"learning_rate": 4.749540639777539e-07,
"loss": -0.0319,
"reward": -0.08997016213834286,
"reward_std": 0.6837709844112396,
"rewards/cosine_scaled_reward": -0.1804017536342144,
"rewards/format_reward": 0.27083334140479565,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 1242.2083740234375,
"epoch": 0.34285714285714286,
"grad_norm": 22.44129435449986,
"kl": 0.6015625,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.047,
"reward": 0.4733648784458637,
"reward_std": 0.6498839557170868,
"rewards/cosine_scaled_reward": -0.013317572651430964,
"rewards/format_reward": 0.5000000111758709,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 1358.5000610351562,
"epoch": 0.344,
"grad_norm": 5.451894779313779,
"kl": 0.55419921875,
"learning_rate": 4.68766384637248e-07,
"loss": 0.0201,
"reward": 0.012628388591110706,
"reward_std": 0.6598528623580933,
"rewards/cosine_scaled_reward": -0.11868580989539623,
"rewards/format_reward": 0.2500000037252903,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 1208.0000610351562,
"epoch": 0.34514285714285714,
"grad_norm": 2.502133066720727,
"kl": 0.5078125,
"learning_rate": 4.656784084364238e-07,
"loss": 0.0976,
"reward": 0.01287244912236929,
"reward_std": 0.6720428466796875,
"rewards/cosine_scaled_reward": -0.14981378242373466,
"rewards/format_reward": 0.3125000111758709,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 1372.2917175292969,
"epoch": 0.3462857142857143,
"grad_norm": 9.527527408809727,
"kl": 0.591796875,
"learning_rate": 4.6259454195101267e-07,
"loss": -0.0351,
"reward": -0.0026968184392899275,
"reward_std": 0.7502148300409317,
"rewards/cosine_scaled_reward": -0.1784317558631301,
"rewards/format_reward": 0.3541666716337204,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 1228.1458435058594,
"epoch": 0.3474285714285714,
"grad_norm": 5.5176774561091655,
"kl": 0.4345703125,
"learning_rate": 4.59514935484316e-07,
"loss": 0.1598,
"reward": 0.39222877379506826,
"reward_std": 0.840458020567894,
"rewards/cosine_scaled_reward": -0.03305228240787983,
"rewards/format_reward": 0.4583333358168602,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 1448.8125305175781,
"epoch": 0.3485714285714286,
"grad_norm": 7.801875434214254,
"kl": 0.3525390625,
"learning_rate": 4.5643973913200837e-07,
"loss": 0.0808,
"reward": 0.005279352888464928,
"reward_std": 0.6858643740415573,
"rewards/cosine_scaled_reward": -0.1536103216931224,
"rewards/format_reward": 0.3125000074505806,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 1402.6666870117188,
"epoch": 0.3497142857142857,
"grad_norm": 3.566822202421308,
"kl": 0.29638671875,
"learning_rate": 4.5336910277482155e-07,
"loss": 0.0791,
"reward": 0.18335522711277008,
"reward_std": 0.6350644528865814,
"rewards/cosine_scaled_reward": -0.13748905574902892,
"rewards/format_reward": 0.4583333432674408,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 1421.8541870117188,
"epoch": 0.35085714285714287,
"grad_norm": 1.9532542741070622,
"kl": 0.289794921875,
"learning_rate": 4.503031760712397e-07,
"loss": 0.0514,
"reward": 0.2609965428709984,
"reward_std": 0.7012953609228134,
"rewards/cosine_scaled_reward": -0.06741839554160833,
"rewards/format_reward": 0.39583334885537624,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 1331.2500305175781,
"epoch": 0.352,
"grad_norm": 2.135773174322825,
"kl": 0.26416015625,
"learning_rate": 4.4724210845020494e-07,
"loss": 0.1508,
"reward": 0.21997906267642975,
"reward_std": 0.6842755973339081,
"rewards/cosine_scaled_reward": -0.1191771375015378,
"rewards/format_reward": 0.4583333358168602,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 1291.1666870117188,
"epoch": 0.35314285714285715,
"grad_norm": 3.030174625800062,
"kl": 0.323486328125,
"learning_rate": 4.441860491038345e-07,
"loss": 0.1012,
"reward": -0.060309079475700855,
"reward_std": 0.48270438611507416,
"rewards/cosine_scaled_reward": -0.16557121649384499,
"rewards/format_reward": 0.2708333469927311,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 1296.8125457763672,
"epoch": 0.35428571428571426,
"grad_norm": 3.288974321286699,
"kl": 0.30712890625,
"learning_rate": 4.4113514698014953e-07,
"loss": 0.1053,
"reward": 0.3812438789755106,
"reward_std": 0.6454566046595573,
"rewards/cosine_scaled_reward": 0.0031219255179166794,
"rewards/format_reward": 0.37500000558793545,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 1582.4167175292969,
"epoch": 0.3554285714285714,
"grad_norm": 11.037201589242047,
"kl": 0.3916015625,
"learning_rate": 4.3808955077581546e-07,
"loss": 0.0554,
"reward": 0.011564895510673523,
"reward_std": 0.5866778641939163,
"rewards/cosine_scaled_reward": -0.12963422574102879,
"rewards/format_reward": 0.27083333767950535,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 1511.1458740234375,
"epoch": 0.3565714285714286,
"grad_norm": 541.360267852673,
"kl": 2.48046875,
"learning_rate": 4.350494089288943e-07,
"loss": 0.1743,
"reward": 0.09507806971669197,
"reward_std": 0.7126565277576447,
"rewards/cosine_scaled_reward": -0.12954430282115936,
"rewards/format_reward": 0.3541666716337204,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 1310.4792022705078,
"epoch": 0.3577142857142857,
"grad_norm": 1.6060292822301743,
"kl": 0.2235107421875,
"learning_rate": 4.3201486961161093e-07,
"loss": 0.0119,
"reward": 0.19681214727461338,
"reward_std": 0.5347588732838631,
"rewards/cosine_scaled_reward": -0.14117726124823093,
"rewards/format_reward": 0.4791666902601719,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 1412.7708740234375,
"epoch": 0.3588571428571429,
"grad_norm": 0.9234012789427545,
"kl": 0.2705078125,
"learning_rate": 4.2898608072313045e-07,
"loss": 0.0522,
"reward": 0.1253851738292724,
"reward_std": 0.5503663271665573,
"rewards/cosine_scaled_reward": -0.1352240853011608,
"rewards/format_reward": 0.3958333469927311,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 1183.7916870117188,
"epoch": 0.36,
"grad_norm": 1.7837131712349448,
"kl": 0.248779296875,
"learning_rate": 4.2596318988235037e-07,
"loss": 0.1102,
"reward": 0.06632774323225021,
"reward_std": 0.8003478944301605,
"rewards/cosine_scaled_reward": -0.14391947723925114,
"rewards/format_reward": 0.354166679084301,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 1321.1667022705078,
"epoch": 0.36114285714285715,
"grad_norm": 3.8904561936208473,
"kl": 0.311279296875,
"learning_rate": 4.2294634442070553e-07,
"loss": 0.0684,
"reward": -0.12211128510534763,
"reward_std": 0.3644377589225769,
"rewards/cosine_scaled_reward": -0.19647231698036194,
"rewards/format_reward": 0.27083333767950535,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 1472.5208435058594,
"epoch": 0.36228571428571427,
"grad_norm": 0.6761305622628668,
"kl": 0.2392578125,
"learning_rate": 4.1993569137498776e-07,
"loss": 0.0051,
"reward": 0.07694595551583916,
"reward_std": 0.698570191860199,
"rewards/cosine_scaled_reward": -0.08652702532708645,
"rewards/format_reward": 0.25000000186264515,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 1446.3959045410156,
"epoch": 0.36342857142857143,
"grad_norm": 1.610083766620256,
"kl": 0.23046875,
"learning_rate": 4.1693137748017915e-07,
"loss": 0.1272,
"reward": 0.22593690548092127,
"reward_std": 0.7007799595594406,
"rewards/cosine_scaled_reward": -0.11619820445775986,
"rewards/format_reward": 0.45833336375653744,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 1419.7500610351562,
"epoch": 0.36457142857142855,
"grad_norm": 1.3177147357732026,
"kl": 0.3505859375,
"learning_rate": 4.1393354916230005e-07,
"loss": 0.0908,
"reward": 0.05421498417854309,
"reward_std": 0.6087209582328796,
"rewards/cosine_scaled_reward": -0.10830917488783598,
"rewards/format_reward": 0.27083334140479565,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 1459.2917175292969,
"epoch": 0.3657142857142857,
"grad_norm": 2.383045046585821,
"kl": 0.177001953125,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.143,
"reward": 0.23994141444563866,
"reward_std": 0.7169264256954193,
"rewards/cosine_scaled_reward": -0.08836262859404087,
"rewards/format_reward": 0.4166666753590107,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 1525.8542175292969,
"epoch": 0.3668571428571429,
"grad_norm": 1.4014039132566267,
"kl": 0.327880859375,
"learning_rate": 4.079579333738039e-07,
"loss": 0.0636,
"reward": 0.07618786534294486,
"reward_std": 0.6110149621963501,
"rewards/cosine_scaled_reward": -0.17023939825594425,
"rewards/format_reward": 0.416666679084301,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 1505.1042022705078,
"epoch": 0.368,
"grad_norm": 0.9016635108285753,
"kl": 0.18017578125,
"learning_rate": 4.0498043714627006e-07,
"loss": 0.0766,
"reward": 0.1637781597673893,
"reward_std": 0.6868859454989433,
"rewards/cosine_scaled_reward": -0.13686091732233763,
"rewards/format_reward": 0.4375000186264515,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 1378.7500305175781,
"epoch": 0.36914285714285716,
"grad_norm": 1.1982814454055981,
"kl": 0.39306640625,
"learning_rate": 4.020100089676376e-07,
"loss": 0.0913,
"reward": 0.17529202857986093,
"reward_std": 0.6956184059381485,
"rewards/cosine_scaled_reward": -0.14152065757662058,
"rewards/format_reward": 0.4583333432674408,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 1414.5208740234375,
"epoch": 0.3702857142857143,
"grad_norm": 5.168812943695912,
"kl": 0.2706298828125,
"learning_rate": 3.9904679361238526e-07,
"loss": 0.0758,
"reward": -0.05163134215399623,
"reward_std": 0.573038712143898,
"rewards/cosine_scaled_reward": -0.2133156731724739,
"rewards/format_reward": 0.37500000558793545,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 1026.2292175292969,
"epoch": 0.37142857142857144,
"grad_norm": 2.717389747197644,
"kl": 0.2042236328125,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.0446,
"reward": 0.35916636511683464,
"reward_std": 0.7165441811084747,
"rewards/cosine_scaled_reward": -0.11208349000662565,
"rewards/format_reward": 0.583333358168602,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 1296.2083435058594,
"epoch": 0.37257142857142855,
"grad_norm": 0.9706132798560072,
"kl": 0.1436767578125,
"learning_rate": 3.931425787051832e-07,
"loss": 0.0264,
"reward": 0.03931037150323391,
"reward_std": 0.5944674462080002,
"rewards/cosine_scaled_reward": -0.24076148495078087,
"rewards/format_reward": 0.5208333395421505,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 1119.3958740234375,
"epoch": 0.3737142857142857,
"grad_norm": 7.20904098295775,
"kl": 0.34619140625,
"learning_rate": 3.902018669163384e-07,
"loss": 0.0023,
"reward": 0.5026027010753751,
"reward_std": 0.4505321756005287,
"rewards/cosine_scaled_reward": 0.011718038469552994,
"rewards/format_reward": 0.4791666716337204,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 1340.9791717529297,
"epoch": 0.37485714285714283,
"grad_norm": 1.2860908020915138,
"kl": 0.416259765625,
"learning_rate": 3.872689434630585e-07,
"loss": 0.1449,
"reward": 0.15127216652035713,
"reward_std": 0.6304197087883949,
"rewards/cosine_scaled_reward": -0.15353058651089668,
"rewards/format_reward": 0.4583333507180214,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 1571.3750610351562,
"epoch": 0.376,
"grad_norm": 0.8293118478307562,
"kl": 0.242431640625,
"learning_rate": 3.843439512918949e-07,
"loss": 0.0229,
"reward": 0.09288652800023556,
"reward_std": 0.5842361897230148,
"rewards/cosine_scaled_reward": -0.15147340297698975,
"rewards/format_reward": 0.3958333432674408,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 1407.9583740234375,
"epoch": 0.37714285714285717,
"grad_norm": 1.189781094856149,
"kl": 0.1077880859375,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.0681,
"reward": -0.09090141206979752,
"reward_std": 0.5390855148434639,
"rewards/cosine_scaled_reward": -0.21211737021803856,
"rewards/format_reward": 0.3333333395421505,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 1255.6875457763672,
"epoch": 0.3782857142857143,
"grad_norm": 1.046472107288498,
"kl": 0.10308837890625,
"learning_rate": 3.785183306423767e-07,
"loss": 0.0811,
"reward": -0.12841611605836079,
"reward_std": 0.39798876643180847,
"rewards/cosine_scaled_reward": -0.3350413963198662,
"rewards/format_reward": 0.5416666865348816,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 1201.8958740234375,
"epoch": 0.37942857142857145,
"grad_norm": 1.123018980255247,
"kl": 0.117584228515625,
"learning_rate": 3.7561798609655373e-07,
"loss": 0.072,
"reward": 0.499036006629467,
"reward_std": 0.6711834743618965,
"rewards/cosine_scaled_reward": -0.03173201950266957,
"rewards/format_reward": 0.5625000074505806,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 1133.3333587646484,
"epoch": 0.38057142857142856,
"grad_norm": 2.177638459571002,
"kl": 0.14453125,
"learning_rate": 3.72726140684072e-07,
"loss": 0.1488,
"reward": 0.03351620538160205,
"reward_std": 0.4431127682328224,
"rewards/cosine_scaled_reward": -0.27490856871008873,
"rewards/format_reward": 0.5833333432674408,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 1252.5833587646484,
"epoch": 0.38171428571428573,
"grad_norm": 1.6680786188797292,
"kl": 2.4984130859375,
"learning_rate": 3.6984293534939737e-07,
"loss": 0.1246,
"reward": -0.1514057070016861,
"reward_std": 0.5695896856486797,
"rewards/cosine_scaled_reward": -0.26320285350084305,
"rewards/format_reward": 0.3750000074505806,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 1232.0000610351562,
"epoch": 0.38285714285714284,
"grad_norm": 1.828125714274309,
"kl": 0.07843017578125,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.1105,
"reward": 0.07522661844268441,
"reward_std": 0.5525132827460766,
"rewards/cosine_scaled_reward": -0.21238669380545616,
"rewards/format_reward": 0.5000000149011612,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 1240.4375305175781,
"epoch": 0.384,
"grad_norm": 3.2255921262965432,
"kl": 0.19232177734375,
"learning_rate": 3.641030065789562e-07,
"loss": 0.2104,
"reward": -0.07903135940432549,
"reward_std": 0.4235813617706299,
"rewards/cosine_scaled_reward": -0.3103490248322487,
"rewards/format_reward": 0.5416666865348816,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 1136.2500457763672,
"epoch": 0.3851428571428571,
"grad_norm": 2.1359050155328076,
"kl": 0.298095703125,
"learning_rate": 3.612465628992203e-07,
"loss": 0.1271,
"reward": 0.29203586652874947,
"reward_std": 0.6221929639577866,
"rewards/cosine_scaled_reward": -0.14564874302595854,
"rewards/format_reward": 0.583333358168602,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 1304.4792175292969,
"epoch": 0.3862857142857143,
"grad_norm": 1.42801024449987,
"kl": 0.2041015625,
"learning_rate": 3.5839931879571725e-07,
"loss": 0.0306,
"reward": -0.07640792615711689,
"reward_std": 0.29374565184116364,
"rewards/cosine_scaled_reward": -0.30903729796409607,
"rewards/format_reward": 0.5416666716337204,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 1483.5625305175781,
"epoch": 0.38742857142857146,
"grad_norm": 4.530770296915891,
"kl": 0.2216796875,
"learning_rate": 3.555614130391079e-07,
"loss": 0.0756,
"reward": -0.22593690641224384,
"reward_std": 0.42642898857593536,
"rewards/cosine_scaled_reward": -0.31088512018322945,
"rewards/format_reward": 0.39583334513008595,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 1301.6875305175781,
"epoch": 0.38857142857142857,
"grad_norm": 32.229056752997074,
"kl": 0.72998046875,
"learning_rate": 3.5273298394491515e-07,
"loss": 0.0451,
"reward": 0.1187155619263649,
"reward_std": 0.6100866496562958,
"rewards/cosine_scaled_reward": -0.16980887576937675,
"rewards/format_reward": 0.4583333432674408,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 1222.5000610351562,
"epoch": 0.38971428571428574,
"grad_norm": 31.15024931066955,
"kl": 1.814453125,
"learning_rate": 3.4991416936678276e-07,
"loss": 0.0053,
"reward": 0.4647822715342045,
"reward_std": 0.8535723686218262,
"rewards/cosine_scaled_reward": 0.013641122728586197,
"rewards/format_reward": 0.4375000149011612,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 1550.9583740234375,
"epoch": 0.39085714285714285,
"grad_norm": 5.073035047796139,
"kl": 0.40185546875,
"learning_rate": 3.471051066897562e-07,
"loss": 0.1274,
"reward": -0.049222253262996674,
"reward_std": 0.6296448782086372,
"rewards/cosine_scaled_reward": -0.1704444605857134,
"rewards/format_reward": 0.29166667349636555,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 1254.1458740234375,
"epoch": 0.392,
"grad_norm": 2.9987047793682247,
"kl": 0.191650390625,
"learning_rate": 3.4430593282358777e-07,
"loss": 0.132,
"reward": 0.4507103096693754,
"reward_std": 0.46682045608758926,
"rewards/cosine_scaled_reward": -0.11839485540986061,
"rewards/format_reward": 0.6875000298023224,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 1259.0833740234375,
"epoch": 0.3931428571428571,
"grad_norm": 11.834773130920754,
"kl": 0.765625,
"learning_rate": 3.4151678419606233e-07,
"loss": 0.1692,
"reward": 0.04102582670748234,
"reward_std": 0.6375212371349335,
"rewards/cosine_scaled_reward": -0.16698708944022655,
"rewards/format_reward": 0.3750000111758709,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 948.8333587646484,
"epoch": 0.3942857142857143,
"grad_norm": 4.082579051373274,
"kl": 0.11126708984375,
"learning_rate": 3.387377967463493e-07,
"loss": 0.1531,
"reward": 0.32552773877978325,
"reward_std": 0.5937002822756767,
"rewards/cosine_scaled_reward": -0.18098615854978561,
"rewards/format_reward": 0.6875000149011612,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 1348.5208740234375,
"epoch": 0.3954285714285714,
"grad_norm": 4.16581520032074,
"kl": 0.233154296875,
"learning_rate": 3.359691059183761e-07,
"loss": 0.0891,
"reward": -0.024696938693523407,
"reward_std": 0.6840994879603386,
"rewards/cosine_scaled_reward": -0.2310984805226326,
"rewards/format_reward": 0.4375000074505806,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 1152.9583435058594,
"epoch": 0.3965714285714286,
"grad_norm": 6.491892036842968,
"kl": 0.688232421875,
"learning_rate": 3.3321084665422803e-07,
"loss": 0.1813,
"reward": 0.7761995047330856,
"reward_std": 0.9014021009206772,
"rewards/cosine_scaled_reward": 0.08601640490815043,
"rewards/format_reward": 0.6041666865348816,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 1119.6875305175781,
"epoch": 0.3977142857142857,
"grad_norm": 6.465035426418669,
"kl": 0.2274169921875,
"learning_rate": 3.3046315338757026e-07,
"loss": 0.3084,
"reward": 0.1041297996416688,
"reward_std": 0.5661944150924683,
"rewards/cosine_scaled_reward": -0.2187684327363968,
"rewards/format_reward": 0.541666679084301,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 1280.5000305175781,
"epoch": 0.39885714285714285,
"grad_norm": 5.965340713566614,
"kl": 0.0919189453125,
"learning_rate": 3.2772616003709616e-07,
"loss": 0.264,
"reward": 0.5343287643045187,
"reward_std": 1.0619665831327438,
"rewards/cosine_scaled_reward": -0.024502300075255334,
"rewards/format_reward": 0.5833333432674408,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 1420.2291870117188,
"epoch": 0.4,
"grad_norm": 2.925238124886515,
"kl": 0.185791015625,
"learning_rate": 3.250000000000001e-07,
"loss": 0.1961,
"reward": 0.12700789980590343,
"reward_std": 0.8331074118614197,
"rewards/cosine_scaled_reward": -0.1656627282500267,
"rewards/format_reward": 0.4583333432674408,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 1083.6042175292969,
"epoch": 0.40114285714285713,
"grad_norm": 3.606767246674259,
"kl": 0.18115234375,
"learning_rate": 3.222848061454764e-07,
"loss": -0.0154,
"reward": 0.25727599672973156,
"reward_std": 0.6387183666229248,
"rewards/cosine_scaled_reward": -0.18386201839894056,
"rewards/format_reward": 0.6250000074505806,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 1439.1250610351562,
"epoch": 0.4022857142857143,
"grad_norm": 1.929818758425276,
"kl": 0.1397705078125,
"learning_rate": 3.195807108082429e-07,
"loss": 0.1728,
"reward": -0.14825151395052671,
"reward_std": 0.5558790042996407,
"rewards/cosine_scaled_reward": -0.2824591100215912,
"rewards/format_reward": 0.4166666716337204,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 1523.3542175292969,
"epoch": 0.4034285714285714,
"grad_norm": 1.501137402879622,
"kl": 0.15606689453125,
"learning_rate": 3.168878457820915e-07,
"loss": 0.1054,
"reward": -0.2005203291773796,
"reward_std": 0.5384240373969078,
"rewards/cosine_scaled_reward": -0.2565101645886898,
"rewards/format_reward": 0.31250000558793545,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 1457.8125305175781,
"epoch": 0.4045714285714286,
"grad_norm": 1.7447813143906967,
"kl": 0.19287109375,
"learning_rate": 3.142063423134644e-07,
"loss": 0.0946,
"reward": -0.07205517496913671,
"reward_std": 0.5912996232509613,
"rewards/cosine_scaled_reward": -0.27561092376708984,
"rewards/format_reward": 0.4791666865348816,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 935.1041793823242,
"epoch": 0.4057142857142857,
"grad_norm": 5.735907828017728,
"kl": 0.416259765625,
"learning_rate": 3.115363310950578e-07,
"loss": 0.2126,
"reward": 0.6018264503218234,
"reward_std": 0.43670547753572464,
"rewards/cosine_scaled_reward": -0.04283679276704788,
"rewards/format_reward": 0.6875000149011612,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 1400.4375610351562,
"epoch": 0.40685714285714286,
"grad_norm": 4.2513620245343855,
"kl": 0.2110595703125,
"learning_rate": 3.0887794225945143e-07,
"loss": 0.0986,
"reward": 0.07107849605381489,
"reward_std": 0.6532387360930443,
"rewards/cosine_scaled_reward": -0.22487742826342583,
"rewards/format_reward": 0.5208333432674408,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 1214.1250305175781,
"epoch": 0.408,
"grad_norm": 1.8135177203210504,
"kl": 0.1314697265625,
"learning_rate": 3.062313053727671e-07,
"loss": 0.1426,
"reward": 0.03724817745387554,
"reward_std": 0.5181447230279446,
"rewards/cosine_scaled_reward": -0.2730425810441375,
"rewards/format_reward": 0.5833333507180214,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 1284.7500305175781,
"epoch": 0.40914285714285714,
"grad_norm": 3.565695417018542,
"kl": 0.18597412109375,
"learning_rate": 3.0359654942835247e-07,
"loss": 0.1245,
"reward": 0.04130622744560242,
"reward_std": 0.7205251231789589,
"rewards/cosine_scaled_reward": -0.19809689931571484,
"rewards/format_reward": 0.4375000111758709,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 1343.2083740234375,
"epoch": 0.4102857142857143,
"grad_norm": 3.2057830256260917,
"kl": 0.14324951171875,
"learning_rate": 3.0097380284049523e-07,
"loss": -0.0078,
"reward": 0.1697351299226284,
"reward_std": 0.3564612567424774,
"rewards/cosine_scaled_reward": -0.13388244062662125,
"rewards/format_reward": 0.4375000111758709,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 1433.2291870117188,
"epoch": 0.4114285714285714,
"grad_norm": 1.6762255456245136,
"kl": 0.1739501953125,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.1781,
"reward": 0.21988008171319962,
"reward_std": 0.7903619408607483,
"rewards/cosine_scaled_reward": -0.10880996193736792,
"rewards/format_reward": 0.4375000223517418,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 1309.6250457763672,
"epoch": 0.4125714285714286,
"grad_norm": 0.9826821036841882,
"kl": 0.135498046875,
"learning_rate": 2.9576484845877793e-07,
"loss": 0.0186,
"reward": 0.33486853912472725,
"reward_std": 0.500580433756113,
"rewards/cosine_scaled_reward": -0.11381572997197509,
"rewards/format_reward": 0.5625000149011612,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 1173.5625305175781,
"epoch": 0.4137142857142857,
"grad_norm": 4.226570835684713,
"kl": 0.0926513671875,
"learning_rate": 2.931788945420058e-07,
"loss": 0.18,
"reward": 0.15393588319420815,
"reward_std": 0.5774414390325546,
"rewards/cosine_scaled_reward": -0.20428206771612167,
"rewards/format_reward": 0.5625000149011612,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 1250.0416870117188,
"epoch": 0.41485714285714287,
"grad_norm": 1.978862188088671,
"kl": 0.09033203125,
"learning_rate": 2.9060545772359305e-07,
"loss": 0.1327,
"reward": 0.2741839215159416,
"reward_std": 0.6551093906164169,
"rewards/cosine_scaled_reward": -0.17540805786848068,
"rewards/format_reward": 0.6250000149011612,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 1354.9583892822266,
"epoch": 0.416,
"grad_norm": 1.500971160749094,
"kl": 0.1431884765625,
"learning_rate": 2.8804466342921987e-07,
"loss": 0.1556,
"reward": 0.09914333745837212,
"reward_std": 0.5969183072447777,
"rewards/cosine_scaled_reward": -0.17959501221776009,
"rewards/format_reward": 0.4583333432674408,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 1443.8958435058594,
"epoch": 0.41714285714285715,
"grad_norm": 1.5336203716893533,
"kl": 0.14166259765625,
"learning_rate": 2.854966364683872e-07,
"loss": 0.0794,
"reward": 0.08230920624919236,
"reward_std": 0.7491874545812607,
"rewards/cosine_scaled_reward": -0.18801206350326538,
"rewards/format_reward": 0.4583333358168602,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 1280.0833740234375,
"epoch": 0.41828571428571426,
"grad_norm": 5.917103922817008,
"kl": 0.1395263671875,
"learning_rate": 2.829615010283344e-07,
"loss": 0.2201,
"reward": 0.30844624526798725,
"reward_std": 0.6032929718494415,
"rewards/cosine_scaled_reward": -0.11661022901535034,
"rewards/format_reward": 0.5416666865348816,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 1071.7083587646484,
"epoch": 0.41942857142857143,
"grad_norm": 6.764653159306351,
"kl": 1.21533203125,
"learning_rate": 2.8043938066798645e-07,
"loss": 0.2217,
"reward": 0.5629880558699369,
"reward_std": 0.7271402254700661,
"rewards/cosine_scaled_reward": -0.02058931067585945,
"rewards/format_reward": 0.6041666716337204,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 1272.2708892822266,
"epoch": 0.4205714285714286,
"grad_norm": 3.868751461553908,
"kl": 0.376953125,
"learning_rate": 2.7793039831193133e-07,
"loss": 0.0282,
"reward": 0.2414314430207014,
"reward_std": 0.783539354801178,
"rewards/cosine_scaled_reward": -0.13970092684030533,
"rewards/format_reward": 0.5208333544433117,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 1453.8333740234375,
"epoch": 0.4217142857142857,
"grad_norm": 1.6191974125060598,
"kl": 0.29150390625,
"learning_rate": 2.7543467624442956e-07,
"loss": 0.172,
"reward": 0.11266430467367172,
"reward_std": 0.7149153798818588,
"rewards/cosine_scaled_reward": -0.1415845244191587,
"rewards/format_reward": 0.39583334140479565,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 1463.3959045410156,
"epoch": 0.4228571428571429,
"grad_norm": 4.101308083609096,
"kl": 0.56884765625,
"learning_rate": 2.729523361034538e-07,
"loss": 0.2149,
"reward": -0.2552230432629585,
"reward_std": 0.5415500551462173,
"rewards/cosine_scaled_reward": -0.26302820444107056,
"rewards/format_reward": 0.27083334140479565,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 1240.8542175292969,
"epoch": 0.424,
"grad_norm": 3.8927886605185447,
"kl": 0.30340576171875,
"learning_rate": 2.7048349887476037e-07,
"loss": 0.1602,
"reward": 0.1614240426570177,
"reward_std": 0.5875495374202728,
"rewards/cosine_scaled_reward": -0.15887131541967392,
"rewards/format_reward": 0.479166679084301,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 1177.3958740234375,
"epoch": 0.42514285714285716,
"grad_norm": 3.066569475752354,
"kl": 0.1824951171875,
"learning_rate": 2.6802828488599294e-07,
"loss": 0.1059,
"reward": 0.2956250160932541,
"reward_std": 0.6594211757183075,
"rewards/cosine_scaled_reward": -0.12302083522081375,
"rewards/format_reward": 0.5416666939854622,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 1163.4375610351562,
"epoch": 0.42628571428571427,
"grad_norm": 5.09566585578463,
"kl": 0.2724609375,
"learning_rate": 2.655868138008171e-07,
"loss": 0.1544,
"reward": 0.07318597589619458,
"reward_std": 0.7096846550703049,
"rewards/cosine_scaled_reward": -0.2759070098400116,
"rewards/format_reward": 0.6250000149011612,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 1246.7500457763672,
"epoch": 0.42742857142857144,
"grad_norm": 32.203352857308325,
"kl": 0.839111328125,
"learning_rate": 2.631592046130896e-07,
"loss": 0.1927,
"reward": 0.08969515189528465,
"reward_std": 0.6610818058252335,
"rewards/cosine_scaled_reward": -0.22598576080054045,
"rewards/format_reward": 0.5416666865348816,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 1280.7708740234375,
"epoch": 0.42857142857142855,
"grad_norm": 63.335567619096544,
"kl": 0.94873046875,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.2184,
"reward": 0.18546735402196646,
"reward_std": 0.9102050960063934,
"rewards/cosine_scaled_reward": -0.17809965554624796,
"rewards/format_reward": 0.5416667014360428,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 1256.6250305175781,
"epoch": 0.4297142857142857,
"grad_norm": 3.6519960558396716,
"kl": 0.310302734375,
"learning_rate": 2.583460445215911e-07,
"loss": 0.1114,
"reward": 0.1940733604133129,
"reward_std": 0.5819907337427139,
"rewards/cosine_scaled_reward": -0.1946299858391285,
"rewards/format_reward": 0.583333358168602,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 1285.6458740234375,
"epoch": 0.4308571428571429,
"grad_norm": 5.319529708040252,
"kl": 0.3394775390625,
"learning_rate": 2.5596072820445254e-07,
"loss": 0.0359,
"reward": 0.25018906872719526,
"reward_std": 0.8042758777737617,
"rewards/cosine_scaled_reward": -0.13532213680446148,
"rewards/format_reward": 0.5208333432674408,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 1374.5416870117188,
"epoch": 0.432,
"grad_norm": 20.461016176615068,
"kl": 0.70166015625,
"learning_rate": 2.5358974294659373e-07,
"loss": 0.2346,
"reward": -0.005498896003700793,
"reward_std": 0.5357099026441574,
"rewards/cosine_scaled_reward": -0.22149945423007011,
"rewards/format_reward": 0.4375000074505806,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 1435.1875,
"epoch": 0.43314285714285716,
"grad_norm": 2.391831824846237,
"kl": 0.292236328125,
"learning_rate": 2.512332043064913e-07,
"loss": 0.1982,
"reward": 0.012932289391756058,
"reward_std": 0.799980454146862,
"rewards/cosine_scaled_reward": -0.20186719112098217,
"rewards/format_reward": 0.4166666865348816,
"step": 379
},
{
"clip_ratio": 0.0,
"completion_length": 1369.5833740234375,
"epoch": 0.4342857142857143,
"grad_norm": 2.2747857355280208,
"kl": 0.1715087890625,
"learning_rate": 2.488912271385139e-07,
"loss": 0.1725,
"reward": -0.22791396314278245,
"reward_std": 0.4170580878853798,
"rewards/cosine_scaled_reward": -0.3431236445903778,
"rewards/format_reward": 0.4583333544433117,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 1187.6458587646484,
"epoch": 0.43542857142857144,
"grad_norm": 2.7553958817382593,
"kl": 0.16162109375,
"learning_rate": 2.465639255873246e-07,
"loss": 0.1247,
"reward": 0.19117721682414412,
"reward_std": 0.46048377081751823,
"rewards/cosine_scaled_reward": -0.23774472624063492,
"rewards/format_reward": 0.6666667014360428,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 1300.3125305175781,
"epoch": 0.43657142857142855,
"grad_norm": 2.0362039263750082,
"kl": 0.1822509765625,
"learning_rate": 2.4425141308231765e-07,
"loss": 0.1158,
"reward": 0.2739548869431019,
"reward_std": 0.603746622800827,
"rewards/cosine_scaled_reward": -0.09218922536820173,
"rewards/format_reward": 0.4583333432674408,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 1263.7917175292969,
"epoch": 0.4377142857142857,
"grad_norm": 7.617696331239462,
"kl": 0.2333984375,
"learning_rate": 2.4195380233209006e-07,
"loss": 0.1076,
"reward": 0.12070683389902115,
"reward_std": 0.38592402543872595,
"rewards/cosine_scaled_reward": -0.18964658118784428,
"rewards/format_reward": 0.5,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 1170.3750457763672,
"epoch": 0.43885714285714283,
"grad_norm": 3.2601623912372233,
"kl": 0.2103271484375,
"learning_rate": 2.3967120531894857e-07,
"loss": 0.1471,
"reward": -0.021999074146151543,
"reward_std": 0.34355130419135094,
"rewards/cosine_scaled_reward": -0.31308288127183914,
"rewards/format_reward": 0.6041666865348816,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 1233.7083740234375,
"epoch": 0.44,
"grad_norm": 2.1916650637468025,
"kl": 0.16259765625,
"learning_rate": 2.374037332934512e-07,
"loss": 0.0922,
"reward": 0.054161038249731064,
"reward_std": 0.7442760765552521,
"rewards/cosine_scaled_reward": -0.2541694864630699,
"rewards/format_reward": 0.5625000149011612,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 870.3333740234375,
"epoch": 0.44114285714285717,
"grad_norm": 1.4860604247340325,
"kl": 0.0894775390625,
"learning_rate": 2.3515149676898552e-07,
"loss": 0.1158,
"reward": 0.28954136464744806,
"reward_std": 0.5479708462953568,
"rewards/cosine_scaled_reward": -0.240646006539464,
"rewards/format_reward": 0.770833358168602,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 1227.2291870117188,
"epoch": 0.4422857142857143,
"grad_norm": 1.687755076974517,
"kl": 0.2470703125,
"learning_rate": 2.3291460551638237e-07,
"loss": 0.151,
"reward": -0.0012904666364192963,
"reward_std": 0.4440325200557709,
"rewards/cosine_scaled_reward": -0.2714785784482956,
"rewards/format_reward": 0.5416666865348816,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 1033.6042022705078,
"epoch": 0.44342857142857145,
"grad_norm": 2.5341444420596884,
"kl": 0.164947509765625,
"learning_rate": 2.306931685585657e-07,
"loss": 0.0897,
"reward": 0.4180222749710083,
"reward_std": 0.754804901778698,
"rewards/cosine_scaled_reward": -0.14515553694218397,
"rewards/format_reward": 0.7083333432674408,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 1356.8333740234375,
"epoch": 0.44457142857142856,
"grad_norm": 3.704344948231344,
"kl": 0.372314453125,
"learning_rate": 2.2848729416523859e-07,
"loss": 0.102,
"reward": 0.2806839719414711,
"reward_std": 0.6125510483980179,
"rewards/cosine_scaled_reward": -0.07840801030397415,
"rewards/format_reward": 0.4375000074505806,
"step": 389
},
{
"clip_ratio": 0.0,
"completion_length": 1090.2708740234375,
"epoch": 0.44571428571428573,
"grad_norm": 14.470921296685399,
"kl": 0.47216796875,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.2654,
"reward": 0.07703178748488426,
"reward_std": 0.5665107443928719,
"rewards/cosine_scaled_reward": -0.26356743834912777,
"rewards/format_reward": 0.6041666939854622,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 1098.7500305175781,
"epoch": 0.44685714285714284,
"grad_norm": 2.4001916122615157,
"kl": 0.26904296875,
"learning_rate": 2.2412266235313973e-07,
"loss": 0.1304,
"reward": 0.2017030455172062,
"reward_std": 0.5325312875211239,
"rewards/cosine_scaled_reward": -0.20123182306997478,
"rewards/format_reward": 0.6041666865348816,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 1404.0625610351562,
"epoch": 0.448,
"grad_norm": 12.93850484473414,
"kl": 0.662109375,
"learning_rate": 2.2196411766036487e-07,
"loss": 0.0663,
"reward": 0.39279897045344114,
"reward_std": 0.9181084930896759,
"rewards/cosine_scaled_reward": -0.04318385384976864,
"rewards/format_reward": 0.4791666939854622,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 1046.0417175292969,
"epoch": 0.4491428571428571,
"grad_norm": 3.1910943036863695,
"kl": 0.2236328125,
"learning_rate": 2.1982156097370557e-07,
"loss": 0.094,
"reward": 0.1259294361807406,
"reward_std": 0.620373547077179,
"rewards/cosine_scaled_reward": -0.23911861330270767,
"rewards/format_reward": 0.6041666865348816,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 898.3333511352539,
"epoch": 0.4502857142857143,
"grad_norm": 4.93057169428389,
"kl": 0.25714111328125,
"learning_rate": 2.1769509671835223e-07,
"loss": 0.2665,
"reward": 0.2223543766885996,
"reward_std": 0.4368506968021393,
"rewards/cosine_scaled_reward": -0.23257281631231308,
"rewards/format_reward": 0.6875000149011612,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 1140.0000457763672,
"epoch": 0.4514285714285714,
"grad_norm": 2.3738396662205945,
"kl": 0.35986328125,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.3105,
"reward": 0.10918148793280125,
"reward_std": 0.5202281884849072,
"rewards/cosine_scaled_reward": -0.21624258160591125,
"rewards/format_reward": 0.5416666865348816,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 1151.3958740234375,
"epoch": 0.45257142857142857,
"grad_norm": 2.5367764499763257,
"kl": 0.3154296875,
"learning_rate": 2.134908592756607e-07,
"loss": 0.1917,
"reward": 0.17909681051969528,
"reward_std": 0.7349686250090599,
"rewards/cosine_scaled_reward": -0.2021182719618082,
"rewards/format_reward": 0.5833333432674408,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 1223.8958740234375,
"epoch": 0.45371428571428574,
"grad_norm": 3.0861426217577645,
"kl": 0.38720703125,
"learning_rate": 2.1141329099692406e-07,
"loss": 0.2308,
"reward": 0.6319128852337599,
"reward_std": 0.8242618143558502,
"rewards/cosine_scaled_reward": 0.04512310400605202,
"rewards/format_reward": 0.541666679084301,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 1218.8541870117188,
"epoch": 0.45485714285714285,
"grad_norm": 18.365837770437405,
"kl": 0.6829833984375,
"learning_rate": 2.0935222495670968e-07,
"loss": 0.189,
"reward": 0.27588833356276155,
"reward_std": 0.8127910792827606,
"rewards/cosine_scaled_reward": -0.19538918882608414,
"rewards/format_reward": 0.6666666939854622,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 1264.1666870117188,
"epoch": 0.456,
"grad_norm": 3.8049582826738373,
"kl": 0.47314453125,
"learning_rate": 2.0730776160846853e-07,
"loss": 0.1823,
"reward": 0.055698491632938385,
"reward_std": 0.49411067366600037,
"rewards/cosine_scaled_reward": -0.21173409838229418,
"rewards/format_reward": 0.4791666828095913,
"step": 399
},
{
"clip_ratio": 0.0,
"completion_length": 1013.3750457763672,
"epoch": 0.45714285714285713,
"grad_norm": 7.251771375036044,
"kl": 0.4078369140625,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.175,
"reward": 0.2562308683991432,
"reward_std": 0.2563706263899803,
"rewards/cosine_scaled_reward": -0.2260512337088585,
"rewards/format_reward": 0.7083333488553762,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 952.7291870117188,
"epoch": 0.4582857142857143,
"grad_norm": 8.82258461767532,
"kl": 0.45355224609375,
"learning_rate": 2.032690407508949e-07,
"loss": 0.1529,
"reward": 0.4902263447875157,
"reward_std": 0.5446355119347572,
"rewards/cosine_scaled_reward": -0.11947017908096313,
"rewards/format_reward": 0.729166679084301,
"step": 401
},
{
"clip_ratio": 0.0,
"completion_length": 1302.8542175292969,
"epoch": 0.4594285714285714,
"grad_norm": 9.144934630730456,
"kl": 0.51953125,
"learning_rate": 2.0127498008311922e-07,
"loss": 0.1489,
"reward": 0.0001004636287689209,
"reward_std": 0.5631029531359673,
"rewards/cosine_scaled_reward": -0.28119976818561554,
"rewards/format_reward": 0.5625000223517418,
"step": 402
},
{
"clip_ratio": 0.0,
"completion_length": 1430.5833740234375,
"epoch": 0.4605714285714286,
"grad_norm": 1.9477748820622875,
"kl": 0.3515625,
"learning_rate": 1.9929791578083655e-07,
"loss": 0.2408,
"reward": -0.06413780152797699,
"reward_std": 0.7934899777173996,
"rewards/cosine_scaled_reward": -0.2195689007639885,
"rewards/format_reward": 0.3750000037252903,
"step": 403
},
{
"clip_ratio": 0.0,
"completion_length": 1262.8333587646484,
"epoch": 0.4617142857142857,
"grad_norm": 3.330875199108497,
"kl": 0.19140625,
"learning_rate": 1.9733794420337213e-07,
"loss": 0.1344,
"reward": 0.1329102972522378,
"reward_std": 0.5511343032121658,
"rewards/cosine_scaled_reward": -0.25646152906119823,
"rewards/format_reward": 0.645833358168602,
"step": 404
},
{
"clip_ratio": 0.0,
"completion_length": 1030.2916870117188,
"epoch": 0.46285714285714286,
"grad_norm": 7.2006501333137996,
"kl": 0.147216796875,
"learning_rate": 1.9539516087697517e-07,
"loss": 0.1614,
"reward": 0.41257511638104916,
"reward_std": 0.4603617787361145,
"rewards/cosine_scaled_reward": -0.14787913113832474,
"rewards/format_reward": 0.7083333432674408,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 1085.1666870117188,
"epoch": 0.464,
"grad_norm": 1.7990135612572722,
"kl": 0.25146484375,
"learning_rate": 1.934696604901642e-07,
"loss": 0.1199,
"reward": -0.0262349434196949,
"reward_std": 0.4924147129058838,
"rewards/cosine_scaled_reward": -0.2839508093893528,
"rewards/format_reward": 0.5416666828095913,
"step": 406
},
{
"clip_ratio": 0.0,
"completion_length": 953.4375,
"epoch": 0.46514285714285714,
"grad_norm": 2.205384781012483,
"kl": 0.16357421875,
"learning_rate": 1.915615368891117e-07,
"loss": 0.0901,
"reward": 0.5169772207736969,
"reward_std": 0.28926569409668446,
"rewards/cosine_scaled_reward": -0.0748447310179472,
"rewards/format_reward": 0.6666666828095913,
"step": 407
},
{
"clip_ratio": 0.0,
"completion_length": 914.9583587646484,
"epoch": 0.4662857142857143,
"grad_norm": 2.497956913876472,
"kl": 0.27496337890625,
"learning_rate": 1.8967088307307e-07,
"loss": 0.1155,
"reward": 0.3262156348209828,
"reward_std": 0.6255160942673683,
"rewards/cosine_scaled_reward": -0.13897553086280823,
"rewards/format_reward": 0.6041666716337204,
"step": 408
},
{
"clip_ratio": 0.0,
"completion_length": 1040.8333435058594,
"epoch": 0.4674285714285714,
"grad_norm": 8.69885706641019,
"kl": 0.2950439453125,
"learning_rate": 1.8779779118983867e-07,
"loss": 0.1446,
"reward": 0.45548180863261223,
"reward_std": 0.683892697095871,
"rewards/cosine_scaled_reward": -0.1472591133788228,
"rewards/format_reward": 0.7500000149011612,
"step": 409
},
{
"clip_ratio": 0.0,
"completion_length": 1113.8750305175781,
"epoch": 0.4685714285714286,
"grad_norm": 2.827095364325863,
"kl": 0.17364501953125,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.1365,
"reward": -0.055647075176239014,
"reward_std": 0.5701718181371689,
"rewards/cosine_scaled_reward": -0.3299068883061409,
"rewards/format_reward": 0.6041666939854622,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 1180.7500305175781,
"epoch": 0.4697142857142857,
"grad_norm": 6.312691045251246,
"kl": 0.2171630859375,
"learning_rate": 1.8410465752883758e-07,
"loss": 0.26,
"reward": -0.027378916274756193,
"reward_std": 0.5135050415992737,
"rewards/cosine_scaled_reward": -0.33660613000392914,
"rewards/format_reward": 0.6458333432674408,
"step": 411
},
{
"clip_ratio": 0.0,
"completion_length": 936.0625152587891,
"epoch": 0.47085714285714286,
"grad_norm": 8.466457070247934,
"kl": 0.207763671875,
"learning_rate": 1.822847957491922e-07,
"loss": 0.2152,
"reward": 0.2903781367931515,
"reward_std": 0.6151079386472702,
"rewards/cosine_scaled_reward": -0.24022759683430195,
"rewards/format_reward": 0.770833358168602,
"step": 412
},
{
"clip_ratio": 0.0,
"completion_length": 1127.9375610351562,
"epoch": 0.472,
"grad_norm": 7.005816452720984,
"kl": 0.23388671875,
"learning_rate": 1.804828558898332e-07,
"loss": 0.2359,
"reward": -0.05256163072772324,
"reward_std": 0.5086416229605675,
"rewards/cosine_scaled_reward": -0.30753082782030106,
"rewards/format_reward": 0.5625000223517418,
"step": 413
},
{
"clip_ratio": 0.0,
"completion_length": 1254.2292175292969,
"epoch": 0.47314285714285714,
"grad_norm": 3.1930529627345146,
"kl": 0.30908203125,
"learning_rate": 1.7869892577476722e-07,
"loss": 0.091,
"reward": 0.27630291134119034,
"reward_std": 0.601336345076561,
"rewards/cosine_scaled_reward": -0.12226520664989948,
"rewards/format_reward": 0.520833358168602,
"step": 414
},
{
"clip_ratio": 0.0,
"completion_length": 1198.7083740234375,
"epoch": 0.4742857142857143,
"grad_norm": 1.9203274121236615,
"kl": 0.27783203125,
"learning_rate": 1.7693309235023127e-07,
"loss": 0.1839,
"reward": 0.15045135095715523,
"reward_std": 0.8359555453062057,
"rewards/cosine_scaled_reward": -0.21644099615514278,
"rewards/format_reward": 0.5833333507180214,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 1303.2708435058594,
"epoch": 0.4754285714285714,
"grad_norm": 5.219130595783076,
"kl": 0.288330078125,
"learning_rate": 1.7518544168045524e-07,
"loss": 0.2384,
"reward": 0.06198018416762352,
"reward_std": 0.7209452688694,
"rewards/cosine_scaled_reward": -0.2502599246799946,
"rewards/format_reward": 0.5625000149011612,
"step": 416
},
{
"clip_ratio": 0.0,
"completion_length": 1104.3958740234375,
"epoch": 0.4765714285714286,
"grad_norm": 343.4311543801194,
"kl": 3.455078125,
"learning_rate": 1.7345605894346726e-07,
"loss": 0.3667,
"reward": 0.25671100057661533,
"reward_std": 0.5841851308941841,
"rewards/cosine_scaled_reward": -0.19456118065863848,
"rewards/format_reward": 0.645833358168602,
"step": 417
},
{
"clip_ratio": 0.0,
"completion_length": 1071.9375305175781,
"epoch": 0.4777142857142857,
"grad_norm": 3.5739561302927703,
"kl": 0.18438720703125,
"learning_rate": 1.7174502842694212e-07,
"loss": 0.0318,
"reward": 0.18263494968414307,
"reward_std": 0.688008576631546,
"rewards/cosine_scaled_reward": -0.25243253633379936,
"rewards/format_reward": 0.6875000149011612,
"step": 418
},
{
"clip_ratio": 0.0,
"completion_length": 1059.2500457763672,
"epoch": 0.47885714285714287,
"grad_norm": 42.82614000306872,
"kl": 14.88720703125,
"learning_rate": 1.7005243352409333e-07,
"loss": 0.182,
"reward": 0.10820261249318719,
"reward_std": 0.658612459897995,
"rewards/cosine_scaled_reward": -0.24798204004764557,
"rewards/format_reward": 0.6041666716337204,
"step": 419
},
{
"clip_ratio": 0.0,
"completion_length": 1066.7917022705078,
"epoch": 0.48,
"grad_norm": 7.563689623131912,
"kl": 0.54296875,
"learning_rate": 1.6837835672960831e-07,
"loss": 0.1366,
"reward": 0.24830662203021348,
"reward_std": 0.6641267538070679,
"rewards/cosine_scaled_reward": -0.19876337423920631,
"rewards/format_reward": 0.6458333432674408,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 1140.8958740234375,
"epoch": 0.48114285714285715,
"grad_norm": 5.102712434876203,
"kl": 0.455322265625,
"learning_rate": 1.6672287963562852e-07,
"loss": 0.238,
"reward": 0.22175164567306638,
"reward_std": 0.48806294053792953,
"rewards/cosine_scaled_reward": -0.19120752811431885,
"rewards/format_reward": 0.6041666865348816,
"step": 421
},
{
"clip_ratio": 0.0,
"completion_length": 1181.3333587646484,
"epoch": 0.48228571428571426,
"grad_norm": 11.187728016893017,
"kl": 0.7470703125,
"learning_rate": 1.6508608292777203e-07,
"loss": 0.2428,
"reward": 0.016264647245407104,
"reward_std": 0.7520715892314911,
"rewards/cosine_scaled_reward": -0.27311767637729645,
"rewards/format_reward": 0.5625000149011612,
"step": 422
},
{
"clip_ratio": 0.0,
"completion_length": 1151.5625305175781,
"epoch": 0.48342857142857143,
"grad_norm": 36.484656907353894,
"kl": 1.12109375,
"learning_rate": 1.6346804638120098e-07,
"loss": 0.225,
"reward": 0.166658578440547,
"reward_std": 0.5137820392847061,
"rewards/cosine_scaled_reward": -0.20833738893270493,
"rewards/format_reward": 0.5833333432674408,
"step": 423
},
{
"clip_ratio": 0.0,
"completion_length": 1222.4167022705078,
"epoch": 0.4845714285714286,
"grad_norm": 5.314021913144739,
"kl": 0.468994140625,
"learning_rate": 1.6186884885673413e-07,
"loss": 0.0791,
"reward": -0.053052062867209315,
"reward_std": 0.5032695159316063,
"rewards/cosine_scaled_reward": -0.349442720413208,
"rewards/format_reward": 0.6458333507180214,
"step": 424
},
{
"clip_ratio": 0.0,
"completion_length": 1303.0417022705078,
"epoch": 0.4857142857142857,
"grad_norm": 15.439357915372184,
"kl": 0.76171875,
"learning_rate": 1.6028856829700258e-07,
"loss": 0.1567,
"reward": 0.06288054899778217,
"reward_std": 0.8221424967050552,
"rewards/cosine_scaled_reward": -0.24980972707271576,
"rewards/format_reward": 0.5625000111758709,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 1363.2292175292969,
"epoch": 0.4868571428571429,
"grad_norm": 16.190560753791,
"kl": 0.64306640625,
"learning_rate": 1.5872728172265146e-07,
"loss": 0.2057,
"reward": 0.0070614293217659,
"reward_std": 0.8801029026508331,
"rewards/cosine_scaled_reward": -0.18396929651498795,
"rewards/format_reward": 0.3750000111758709,
"step": 426
},
{
"clip_ratio": 0.0,
"completion_length": 1062.6250457763672,
"epoch": 0.488,
"grad_norm": 5.208018104302035,
"kl": 0.289306640625,
"learning_rate": 1.5718506522858572e-07,
"loss": 0.2392,
"reward": 0.1040960568934679,
"reward_std": 0.7021225243806839,
"rewards/cosine_scaled_reward": -0.21878531202673912,
"rewards/format_reward": 0.5416666865348816,
"step": 427
},
{
"clip_ratio": 0.0,
"completion_length": 1168.6250457763672,
"epoch": 0.48914285714285716,
"grad_norm": 1.7936384513629215,
"kl": 0.194366455078125,
"learning_rate": 1.5566199398026147e-07,
"loss": 0.1231,
"reward": 0.1094297245144844,
"reward_std": 0.5426923930644989,
"rewards/cosine_scaled_reward": -0.247368473559618,
"rewards/format_reward": 0.6041666716337204,
"step": 428
},
{
"clip_ratio": 0.0,
"completion_length": 1089.6875305175781,
"epoch": 0.49028571428571427,
"grad_norm": 3.242866515089598,
"kl": 0.18408203125,
"learning_rate": 1.5415814221002265e-07,
"loss": 0.1081,
"reward": 0.4839252680540085,
"reward_std": 0.5947171896696091,
"rewards/cosine_scaled_reward": -0.03928736597299576,
"rewards/format_reward": 0.5625000260770321,
"step": 429
},
{
"clip_ratio": 0.0,
"completion_length": 1313.5833587646484,
"epoch": 0.49142857142857144,
"grad_norm": 1.478054069262014,
"kl": 0.21612548828125,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.1273,
"reward": 0.15572084113955498,
"reward_std": 0.5618212074041367,
"rewards/cosine_scaled_reward": -0.18255625164601952,
"rewards/format_reward": 0.5208333358168602,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 975.4791870117188,
"epoch": 0.49257142857142855,
"grad_norm": 3.541465585724065,
"kl": 0.1605224609375,
"learning_rate": 1.5120838934595337e-07,
"loss": 0.1148,
"reward": 0.420807933434844,
"reward_std": 0.890654593706131,
"rewards/cosine_scaled_reward": -0.11251270584762096,
"rewards/format_reward": 0.6458333507180214,
"step": 431
},
{
"clip_ratio": 0.0,
"completion_length": 1181.062515258789,
"epoch": 0.4937142857142857,
"grad_norm": 3.350973639300781,
"kl": 0.155517578125,
"learning_rate": 1.4976263201891613e-07,
"loss": 0.1027,
"reward": 0.032605723943561316,
"reward_std": 0.5731803774833679,
"rewards/cosine_scaled_reward": -0.2753637991845608,
"rewards/format_reward": 0.5833333507180214,
"step": 432
},
{
"clip_ratio": 0.0,
"completion_length": 1207.0417175292969,
"epoch": 0.4948571428571429,
"grad_norm": 4.990349151202906,
"kl": 0.185546875,
"learning_rate": 1.483363816965435e-07,
"loss": 0.1393,
"reward": 0.08886189805343747,
"reward_std": 0.4594448246061802,
"rewards/cosine_scaled_reward": -0.23681906727142632,
"rewards/format_reward": 0.5625000298023224,
"step": 433
},
{
"clip_ratio": 0.0,
"completion_length": 859.1250457763672,
"epoch": 0.496,
"grad_norm": 1.9877951359345267,
"kl": 0.17950439453125,
"learning_rate": 1.469297078922642e-07,
"loss": 0.0512,
"reward": 1.2721150815486908,
"reward_std": 0.6770742386579514,
"rewards/cosine_scaled_reward": 0.20897419564425945,
"rewards/format_reward": 0.8541666716337204,
"step": 434
},
{
"clip_ratio": 0.0,
"completion_length": 1266.375015258789,
"epoch": 0.49714285714285716,
"grad_norm": 1.8972601097369153,
"kl": 0.2255859375,
"learning_rate": 1.4554267916537495e-07,
"loss": 0.1234,
"reward": 0.10697830189019442,
"reward_std": 0.531020175665617,
"rewards/cosine_scaled_reward": -0.22776086255908012,
"rewards/format_reward": 0.5625000149011612,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 1315.5625305175781,
"epoch": 0.4982857142857143,
"grad_norm": 2.490164316553904,
"kl": 0.2635498046875,
"learning_rate": 1.4417536311769885e-07,
"loss": 0.1196,
"reward": -0.10972822457551956,
"reward_std": 0.5596715956926346,
"rewards/cosine_scaled_reward": -0.2840307876467705,
"rewards/format_reward": 0.4583333395421505,
"step": 436
},
{
"clip_ratio": 0.0,
"completion_length": 1060.5208587646484,
"epoch": 0.49942857142857144,
"grad_norm": 1.9387158266765225,
"kl": 0.294189453125,
"learning_rate": 1.4282782639029128e-07,
"loss": 0.0174,
"reward": 0.52107123285532,
"reward_std": 0.5726887807250023,
"rewards/cosine_scaled_reward": -0.05196441989392042,
"rewards/format_reward": 0.6250000298023224,
"step": 437
},
{
"clip_ratio": 0.0,
"completion_length": 921.3125457763672,
"epoch": 0.5005714285714286,
"grad_norm": 8.654811309244227,
"kl": 0.2535400390625,
"learning_rate": 1.4150013466019114e-07,
"loss": 0.1354,
"reward": 0.20009983237832785,
"reward_std": 0.6868909299373627,
"rewards/cosine_scaled_reward": -0.20203341665910557,
"rewards/format_reward": 0.6041666865348816,
"step": 438
},
{
"clip_ratio": 0.0,
"completion_length": 1083.0833740234375,
"epoch": 0.5017142857142857,
"grad_norm": 5.3889889905872375,
"kl": 0.40216064453125,
"learning_rate": 1.4019235263722034e-07,
"loss": 0.2461,
"reward": 0.11843711510300636,
"reward_std": 0.5985070914030075,
"rewards/cosine_scaled_reward": -0.2636981066316366,
"rewards/format_reward": 0.6458333432674408,
"step": 439
},
{
"clip_ratio": 0.0,
"completion_length": 1088.8750305175781,
"epoch": 0.5028571428571429,
"grad_norm": 4.149099334589977,
"kl": 0.30615234375,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.033,
"reward": 0.11394692957401276,
"reward_std": 0.6579174622893333,
"rewards/cosine_scaled_reward": -0.24510987009853125,
"rewards/format_reward": 0.6041666865348816,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 984.3125610351562,
"epoch": 0.504,
"grad_norm": 16.782102815445953,
"kl": 0.367919921875,
"learning_rate": 1.3763677169699217e-07,
"loss": 0.0977,
"reward": 0.3985663428902626,
"reward_std": 0.42315196245908737,
"rewards/cosine_scaled_reward": -0.134050190448761,
"rewards/format_reward": 0.6666666828095913,
"step": 441
},
{
"clip_ratio": 0.0,
"completion_length": 988.4583587646484,
"epoch": 0.5051428571428571,
"grad_norm": 4.106390753028162,
"kl": 0.31378173828125,
"learning_rate": 1.3638909733514452e-07,
"loss": 0.0056,
"reward": 0.13055693171918392,
"reward_std": 0.48535653203725815,
"rewards/cosine_scaled_reward": -0.27847154438495636,
"rewards/format_reward": 0.6875000223517418,
"step": 442
},
{
"clip_ratio": 0.0,
"completion_length": 1004.3750152587891,
"epoch": 0.5062857142857143,
"grad_norm": 8.08171445493757,
"kl": 0.16259765625,
"learning_rate": 1.351615817851748e-07,
"loss": 0.2301,
"reward": 0.30977149307727814,
"reward_std": 0.6895428746938705,
"rewards/cosine_scaled_reward": -0.18886426091194153,
"rewards/format_reward": 0.6875000298023224,
"step": 443
},
{
"clip_ratio": 0.0,
"completion_length": 1087.9375305175781,
"epoch": 0.5074285714285715,
"grad_norm": 3.849891000062917,
"kl": 0.1669921875,
"learning_rate": 1.3395428487445914e-07,
"loss": 0.0975,
"reward": 0.4580417312681675,
"reward_std": 0.640699241310358,
"rewards/cosine_scaled_reward": -0.14597914181649685,
"rewards/format_reward": 0.7500000298023224,
"step": 444
},
{
"clip_ratio": 0.0,
"completion_length": 1224.2500457763672,
"epoch": 0.5085714285714286,
"grad_norm": 23.59411548899569,
"kl": 0.82958984375,
"learning_rate": 1.3276726544494571e-07,
"loss": 0.2131,
"reward": 0.051861570216715336,
"reward_std": 0.6278680041432381,
"rewards/cosine_scaled_reward": -0.2553192190825939,
"rewards/format_reward": 0.5625,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 1067.9791870117188,
"epoch": 0.5097142857142857,
"grad_norm": 3.656936199785778,
"kl": 0.24365234375,
"learning_rate": 1.316005813502869e-07,
"loss": 0.0234,
"reward": 0.36385649256408215,
"reward_std": 0.7834623008966446,
"rewards/cosine_scaled_reward": -0.1722384188324213,
"rewards/format_reward": 0.7083333432674408,
"step": 446
},
{
"clip_ratio": 0.0,
"completion_length": 1174.7083740234375,
"epoch": 0.5108571428571429,
"grad_norm": 2.1759216078948036,
"kl": 0.3828125,
"learning_rate": 1.3045428945301953e-07,
"loss": 0.2194,
"reward": 0.23982627410441637,
"reward_std": 0.5332969650626183,
"rewards/cosine_scaled_reward": -0.21342020854353905,
"rewards/format_reward": 0.6666666865348816,
"step": 447
},
{
"clip_ratio": 0.0,
"completion_length": 1180.2083740234375,
"epoch": 0.512,
"grad_norm": 4.2366039265569135,
"kl": 0.31494140625,
"learning_rate": 1.2932844562179352e-07,
"loss": 0.1963,
"reward": 0.3762773834168911,
"reward_std": 0.6801744475960732,
"rewards/cosine_scaled_reward": -0.14519466273486614,
"rewards/format_reward": 0.666666679084301,
"step": 448
},
{
"clip_ratio": 0.0,
"completion_length": 994.3958740234375,
"epoch": 0.5131428571428571,
"grad_norm": 4.146583173336839,
"kl": 0.148681640625,
"learning_rate": 1.2822310472864885e-07,
"loss": 0.1922,
"reward": 0.36078188568353653,
"reward_std": 0.737194113433361,
"rewards/cosine_scaled_reward": -0.15294241392984986,
"rewards/format_reward": 0.6666667014360428,
"step": 449
},
{
"clip_ratio": 0.0,
"completion_length": 851.8958587646484,
"epoch": 0.5142857142857142,
"grad_norm": 51.8228987068238,
"kl": 0.534942626953125,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.1269,
"reward": 0.5865043960511684,
"reward_std": 0.4706997238099575,
"rewards/cosine_scaled_reward": -0.10258114710450172,
"rewards/format_reward": 0.7916666865348816,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 1079.7916870117188,
"epoch": 0.5154285714285715,
"grad_norm": 6.392599999015184,
"kl": 0.2735595703125,
"learning_rate": 1.260741462457165e-07,
"loss": 0.2626,
"reward": 0.22280075028538704,
"reward_std": 0.6088056340813637,
"rewards/cosine_scaled_reward": -0.18026629835367203,
"rewards/format_reward": 0.5833333395421505,
"step": 451
},
{
"clip_ratio": 0.0,
"completion_length": 1046.8542022705078,
"epoch": 0.5165714285714286,
"grad_norm": 8.715599338320725,
"kl": 0.152099609375,
"learning_rate": 1.2503063339313356e-07,
"loss": 0.2189,
"reward": -0.0023173224180936813,
"reward_std": 0.5100973732769489,
"rewards/cosine_scaled_reward": -0.3136586770415306,
"rewards/format_reward": 0.6250000298023224,
"step": 452
},
{
"clip_ratio": 0.0,
"completion_length": 1271.9375305175781,
"epoch": 0.5177142857142857,
"grad_norm": 2.4908553038859917,
"kl": 0.40869140625,
"learning_rate": 1.2400783294793668e-07,
"loss": 0.1805,
"reward": 0.027155719697475433,
"reward_std": 0.5863115191459656,
"rewards/cosine_scaled_reward": -0.23642215505242348,
"rewards/format_reward": 0.5000000149011612,
"step": 453
},
{
"clip_ratio": 0.0,
"completion_length": 1104.6667175292969,
"epoch": 0.5188571428571429,
"grad_norm": 42.815024473876115,
"kl": 2.04296875,
"learning_rate": 1.2300579475997657e-07,
"loss": 0.039,
"reward": 0.2878073714673519,
"reward_std": 0.6589629650115967,
"rewards/cosine_scaled_reward": -0.13734631799161434,
"rewards/format_reward": 0.5625000223517418,
"step": 454
},
{
"clip_ratio": 0.0,
"completion_length": 1183.250015258789,
"epoch": 0.52,
"grad_norm": 2.6108705721314824,
"kl": 0.306640625,
"learning_rate": 1.220245676671809e-07,
"loss": 0.1199,
"reward": 0.3790533752180636,
"reward_std": 0.4862861856818199,
"rewards/cosine_scaled_reward": -0.13338997215032578,
"rewards/format_reward": 0.645833358168602,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 1168.4375305175781,
"epoch": 0.5211428571428571,
"grad_norm": 25.701940158931713,
"kl": 0.477294921875,
"learning_rate": 1.2106419949317388e-07,
"loss": 0.1681,
"reward": 0.2994745699688792,
"reward_std": 0.7066301554441452,
"rewards/cosine_scaled_reward": -0.15234605269506574,
"rewards/format_reward": 0.6041666865348816,
"step": 456
},
{
"clip_ratio": 0.0,
"completion_length": 1007.4167022705078,
"epoch": 0.5222857142857142,
"grad_norm": 8.570796851314613,
"kl": 0.186279296875,
"learning_rate": 1.2012473704494537e-07,
"loss": 0.3132,
"reward": 0.400404367595911,
"reward_std": 0.5747000873088837,
"rewards/cosine_scaled_reward": -0.15396450087428093,
"rewards/format_reward": 0.708333358168602,
"step": 457
},
{
"clip_ratio": 0.0,
"completion_length": 1272.8958435058594,
"epoch": 0.5234285714285715,
"grad_norm": 4.345232068890798,
"kl": 0.48974609375,
"learning_rate": 1.1920622611056974e-07,
"loss": 0.2715,
"reward": 0.2525772713124752,
"reward_std": 0.8047986179590225,
"rewards/cosine_scaled_reward": -0.12371136248111725,
"rewards/format_reward": 0.5000000298023224,
"step": 458
},
{
"clip_ratio": 0.0,
"completion_length": 1063.625015258789,
"epoch": 0.5245714285714286,
"grad_norm": 5.1821320020363615,
"kl": 0.15771484375,
"learning_rate": 1.1830871145697412e-07,
"loss": 0.1275,
"reward": 0.02093285135924816,
"reward_std": 0.42146630585193634,
"rewards/cosine_scaled_reward": -0.3124502506107092,
"rewards/format_reward": 0.645833358168602,
"step": 459
},
{
"clip_ratio": 0.0,
"completion_length": 1044.2708435058594,
"epoch": 0.5257142857142857,
"grad_norm": 3.3010395921201514,
"kl": 0.267333984375,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.1046,
"reward": 0.2678923445455439,
"reward_std": 0.896328404545784,
"rewards/cosine_scaled_reward": -0.12647049874067307,
"rewards/format_reward": 0.5208333656191826,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 1053.4583587646484,
"epoch": 0.5268571428571428,
"grad_norm": 1.478075619341315,
"kl": 0.21600341796875,
"learning_rate": 1.1657684494105386e-07,
"loss": 0.0555,
"reward": 0.3391416594386101,
"reward_std": 0.9088789522647858,
"rewards/cosine_scaled_reward": -0.20542917400598526,
"rewards/format_reward": 0.7500000223517418,
"step": 461
},
{
"clip_ratio": 0.0,
"completion_length": 1103.2083740234375,
"epoch": 0.528,
"grad_norm": 9530.342121672113,
"kl": 28.46978759765625,
"learning_rate": 1.1574257748745986e-07,
"loss": 1.3293,
"reward": 0.14297988126054406,
"reward_std": 0.5064843520522118,
"rewards/cosine_scaled_reward": -0.25142673472873867,
"rewards/format_reward": 0.645833358168602,
"step": 462
},
{
"clip_ratio": 0.0,
"completion_length": 1043.3333587646484,
"epoch": 0.5291428571428571,
"grad_norm": 4.739243744092973,
"kl": 0.39892578125,
"learning_rate": 1.1492947512799328e-07,
"loss": 0.2493,
"reward": 0.6755956448614597,
"reward_std": 0.4871959462761879,
"rewards/cosine_scaled_reward": 0.025297801941633224,
"rewards/format_reward": 0.6250000298023224,
"step": 463
},
{
"clip_ratio": 0.0,
"completion_length": 992.2916717529297,
"epoch": 0.5302857142857142,
"grad_norm": 122.21833055898026,
"kl": 1.33843994140625,
"learning_rate": 1.1413757749211602e-07,
"loss": 0.2572,
"reward": 0.29958341596648097,
"reward_std": 0.8296171501278877,
"rewards/cosine_scaled_reward": -0.20437496528029442,
"rewards/format_reward": 0.7083333432674408,
"step": 464
},
{
"clip_ratio": 0.0,
"completion_length": 1240.5417175292969,
"epoch": 0.5314285714285715,
"grad_norm": 6.845067293294205,
"kl": 0.55908203125,
"learning_rate": 1.1336692317580158e-07,
"loss": 0.1986,
"reward": 0.072305912617594,
"reward_std": 0.4831778481602669,
"rewards/cosine_scaled_reward": -0.2138470560312271,
"rewards/format_reward": 0.5000000186264515,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 1192.9167175292969,
"epoch": 0.5325714285714286,
"grad_norm": 9.93163371597492,
"kl": 0.49163818359375,
"learning_rate": 1.1261754973965422e-07,
"loss": 0.0928,
"reward": 0.04001780319958925,
"reward_std": 0.44342009350657463,
"rewards/cosine_scaled_reward": -0.28207441698759794,
"rewards/format_reward": 0.6041666865348816,
"step": 466
},
{
"clip_ratio": 0.0,
"completion_length": 1341.1667175292969,
"epoch": 0.5337142857142857,
"grad_norm": 19.835786495839272,
"kl": 0.8251953125,
"learning_rate": 1.1188949370707787e-07,
"loss": 0.2635,
"reward": 0.1306269969791174,
"reward_std": 0.6591696962714195,
"rewards/cosine_scaled_reward": -0.21593650616705418,
"rewards/format_reward": 0.5625000298023224,
"step": 467
},
{
"clip_ratio": 0.0,
"completion_length": 1138.7708740234375,
"epoch": 0.5348571428571428,
"grad_norm": 13.935934940776233,
"kl": 0.576904296875,
"learning_rate": 1.1118279056249653e-07,
"loss": 0.0976,
"reward": 0.37931894324719906,
"reward_std": 0.5462356135249138,
"rewards/cosine_scaled_reward": -0.10200719349086285,
"rewards/format_reward": 0.5833333432674408,
"step": 468
},
{
"clip_ratio": 0.0,
"completion_length": 1028.6041870117188,
"epoch": 0.536,
"grad_norm": 2.6967193006473216,
"kl": 0.26171875,
"learning_rate": 1.1049747474962444e-07,
"loss": 0.0529,
"reward": 0.40807172656059265,
"reward_std": 0.6494475156068802,
"rewards/cosine_scaled_reward": -0.13971414044499397,
"rewards/format_reward": 0.6875000149011612,
"step": 469
},
{
"clip_ratio": 0.0,
"completion_length": 848.5833587646484,
"epoch": 0.5371428571428571,
"grad_norm": 1.7855628531087904,
"kl": 0.1328125,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.0217,
"reward": 0.6918718162924051,
"reward_std": 0.5211210399866104,
"rewards/cosine_scaled_reward": -0.0811474658548832,
"rewards/format_reward": 0.8541666865348816,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 1070.7500457763672,
"epoch": 0.5382857142857143,
"grad_norm": 6.038242137859596,
"kl": 0.169921875,
"learning_rate": 1.0919113768029517e-07,
"loss": 0.2149,
"reward": 0.06172482669353485,
"reward_std": 0.5211478099226952,
"rewards/cosine_scaled_reward": -0.29205426201224327,
"rewards/format_reward": 0.6458333432674408,
"step": 471
},
{
"clip_ratio": 0.0,
"completion_length": 842.7083435058594,
"epoch": 0.5394285714285715,
"grad_norm": 12.109988243714355,
"kl": 0.2894287109375,
"learning_rate": 1.0857018009286381e-07,
"loss": -0.0496,
"reward": 0.46792223304510117,
"reward_std": 0.54752978682518,
"rewards/cosine_scaled_reward": -0.17228887975215912,
"rewards/format_reward": 0.8125000149011612,
"step": 472
},
{
"clip_ratio": 0.0,
"completion_length": 1181.1875305175781,
"epoch": 0.5405714285714286,
"grad_norm": 6.409193674543087,
"kl": 0.3604736328125,
"learning_rate": 1.0797073717209013e-07,
"loss": 0.0613,
"reward": -0.049041745252907276,
"reward_std": 0.5112807080149651,
"rewards/cosine_scaled_reward": -0.2641042061150074,
"rewards/format_reward": 0.4791666865348816,
"step": 473
},
{
"clip_ratio": 0.0,
"completion_length": 990.3750305175781,
"epoch": 0.5417142857142857,
"grad_norm": 9.264374683069418,
"kl": 0.11328125,
"learning_rate": 1.0739283813397639e-07,
"loss": 0.1628,
"reward": 0.32282854616642,
"reward_std": 0.7814144194126129,
"rewards/cosine_scaled_reward": -0.20316907577216625,
"rewards/format_reward": 0.7291667014360428,
"step": 474
},
{
"clip_ratio": 0.0,
"completion_length": 962.4792022705078,
"epoch": 0.5428571428571428,
"grad_norm": 4.242696196218054,
"kl": 0.12603759765625,
"learning_rate": 1.068365111445064e-07,
"loss": 0.1483,
"reward": 0.08424473810009658,
"reward_std": 0.48827143758535385,
"rewards/cosine_scaled_reward": -0.28079431876540184,
"rewards/format_reward": 0.6458333544433117,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 1048.8958587646484,
"epoch": 0.544,
"grad_norm": 18.108937894089827,
"kl": 0.4520263671875,
"learning_rate": 1.063017833182728e-07,
"loss": 0.1426,
"reward": 0.3813807927072048,
"reward_std": 0.6394810080528259,
"rewards/cosine_scaled_reward": -0.05930961295962334,
"rewards/format_reward": 0.5000000223517418,
"step": 476
},
{
"clip_ratio": 0.0,
"completion_length": 723.4791870117188,
"epoch": 0.5451428571428572,
"grad_norm": 5.68071346914076,
"kl": 0.156005859375,
"learning_rate": 1.0578868071715544e-07,
"loss": 0.0836,
"reward": 0.7284884303808212,
"reward_std": 0.6032212525606155,
"rewards/cosine_scaled_reward": -0.08367248624563217,
"rewards/format_reward": 0.8958333432674408,
"step": 477
},
{
"clip_ratio": 0.0,
"completion_length": 1013.3750305175781,
"epoch": 0.5462857142857143,
"grad_norm": 4.283725447191352,
"kl": 0.0955657958984375,
"learning_rate": 1.0529722834905125e-07,
"loss": 0.1397,
"reward": 0.49316432885825634,
"reward_std": 0.45135799795389175,
"rewards/cosine_scaled_reward": -0.12841782718896866,
"rewards/format_reward": 0.7500000149011612,
"step": 478
},
{
"clip_ratio": 0.0,
"completion_length": 998.3125,
"epoch": 0.5474285714285714,
"grad_norm": 1.2914537281090146,
"kl": 0.15277099609375,
"learning_rate": 1.0482745016665526e-07,
"loss": 0.0674,
"reward": 0.31127920374274254,
"reward_std": 0.6323697119951248,
"rewards/cosine_scaled_reward": -0.16727706603705883,
"rewards/format_reward": 0.6458333432674408,
"step": 479
},
{
"clip_ratio": 0.0,
"completion_length": 984.8750152587891,
"epoch": 0.5485714285714286,
"grad_norm": 1.5465556570908303,
"kl": 0.077606201171875,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0463,
"reward": 0.5994082670658827,
"reward_std": 0.37920307368040085,
"rewards/cosine_scaled_reward": -0.1169625474140048,
"rewards/format_reward": 0.8333333432674408,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 1145.3541870117188,
"epoch": 0.5497142857142857,
"grad_norm": 2.3612933288431535,
"kl": 0.150390625,
"learning_rate": 1.0395300688680625e-07,
"loss": 0.0852,
"reward": 0.18339010886847973,
"reward_std": 0.6312093585729599,
"rewards/cosine_scaled_reward": -0.1999716181308031,
"rewards/format_reward": 0.5833333432674408,
"step": 481
},
{
"clip_ratio": 0.0,
"completion_length": 1099.2291870117188,
"epoch": 0.5508571428571428,
"grad_norm": 3.3478046762143303,
"kl": 0.1209716796875,
"learning_rate": 1.0354838440848501e-07,
"loss": 0.1351,
"reward": 0.4303822033107281,
"reward_std": 0.5441673323512077,
"rewards/cosine_scaled_reward": -0.10772557370364666,
"rewards/format_reward": 0.6458333488553762,
"step": 482
},
{
"clip_ratio": 0.0,
"completion_length": 972.8958587646484,
"epoch": 0.552,
"grad_norm": 2.070698520552659,
"kl": 0.1766357421875,
"learning_rate": 1.0316552135205837e-07,
"loss": 0.1898,
"reward": 0.3021550700068474,
"reward_std": 0.6595650911331177,
"rewards/cosine_scaled_reward": -0.244755819439888,
"rewards/format_reward": 0.7916666865348816,
"step": 483
},
{
"clip_ratio": 0.0,
"completion_length": 966.4375,
"epoch": 0.5531428571428572,
"grad_norm": 7.4718244138049785,
"kl": 0.11669921875,
"learning_rate": 1.0280443637773163e-07,
"loss": 0.1535,
"reward": 0.5982861579395831,
"reward_std": 0.72054024040699,
"rewards/cosine_scaled_reward": -0.034190285950899124,
"rewards/format_reward": 0.6666666716337204,
"step": 484
},
{
"clip_ratio": 0.0,
"completion_length": 850.2292022705078,
"epoch": 0.5542857142857143,
"grad_norm": 2.6747580946696172,
"kl": 0.171142578125,
"learning_rate": 1.0246514708427701e-07,
"loss": 0.0845,
"reward": 0.440962532768026,
"reward_std": 0.4621984176337719,
"rewards/cosine_scaled_reward": -0.1545187532901764,
"rewards/format_reward": 0.7500000298023224,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 1098.4791870117188,
"epoch": 0.5554285714285714,
"grad_norm": 5.009044167350138,
"kl": 0.1492919921875,
"learning_rate": 1.0214767000817596e-07,
"loss": 0.1657,
"reward": 0.31744778295978904,
"reward_std": 0.8680954575538635,
"rewards/cosine_scaled_reward": -0.15377611527219415,
"rewards/format_reward": 0.6250000149011612,
"step": 486
},
{
"clip_ratio": 0.0,
"completion_length": 1250.9167022705078,
"epoch": 0.5565714285714286,
"grad_norm": 2.168704280565103,
"kl": 0.3330078125,
"learning_rate": 1.0185202062281336e-07,
"loss": 0.1075,
"reward": 0.09373046457767487,
"reward_std": 0.7844668254256248,
"rewards/cosine_scaled_reward": -0.18230143561959267,
"rewards/format_reward": 0.4583333432674408,
"step": 487
},
{
"clip_ratio": 0.0,
"completion_length": 1182.0417175292969,
"epoch": 0.5577142857142857,
"grad_norm": 68.49302633471272,
"kl": 1.04931640625,
"learning_rate": 1.0157821333772304e-07,
"loss": 0.2607,
"reward": 0.0013678865507245064,
"reward_std": 0.5483251512050629,
"rewards/cosine_scaled_reward": -0.2805660478770733,
"rewards/format_reward": 0.5625000149011612,
"step": 488
},
{
"clip_ratio": 0.0,
"completion_length": 898.8958740234375,
"epoch": 0.5588571428571428,
"grad_norm": 3.3267908174810983,
"kl": 0.22412109375,
"learning_rate": 1.013262614978859e-07,
"loss": 0.1155,
"reward": 0.9380166502669454,
"reward_std": 0.38279012218117714,
"rewards/cosine_scaled_reward": 0.10442498326301575,
"rewards/format_reward": 0.7291666716337204,
"step": 489
},
{
"clip_ratio": 0.0,
"completion_length": 1022.5417175292969,
"epoch": 0.56,
"grad_norm": 1.2964210055945644,
"kl": 0.142425537109375,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.1375,
"reward": 0.1352614858187735,
"reward_std": 0.5779989808797836,
"rewards/cosine_scaled_reward": -0.29695259779691696,
"rewards/format_reward": 0.7291666716337204,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 1071.2708740234375,
"epoch": 0.5611428571428572,
"grad_norm": 5.198263303721915,
"kl": 0.270751953125,
"learning_rate": 1.0088797220727779e-07,
"loss": 0.1398,
"reward": 0.360213914886117,
"reward_std": 0.5864584296941757,
"rewards/cosine_scaled_reward": -0.12197639048099518,
"rewards/format_reward": 0.6041666716337204,
"step": 491
},
{
"clip_ratio": 0.0,
"completion_length": 1146.0208740234375,
"epoch": 0.5622857142857143,
"grad_norm": 25.32884427185481,
"kl": 0.860107421875,
"learning_rate": 1.0070165611810855e-07,
"loss": 0.279,
"reward": 0.3603329248726368,
"reward_std": 0.4203804060816765,
"rewards/cosine_scaled_reward": -0.11150021478533745,
"rewards/format_reward": 0.583333358168602,
"step": 492
},
{
"clip_ratio": 0.0,
"completion_length": 1079.6250457763672,
"epoch": 0.5634285714285714,
"grad_norm": 5.39013483275012,
"kl": 0.4027099609375,
"learning_rate": 1.005372381963547e-07,
"loss": 0.2018,
"reward": 0.24866360798478127,
"reward_std": 0.6557547599077225,
"rewards/cosine_scaled_reward": -0.21941821463406086,
"rewards/format_reward": 0.6875000298023224,
"step": 493
},
{
"clip_ratio": 0.0,
"completion_length": 1020.0417022705078,
"epoch": 0.5645714285714286,
"grad_norm": 39.118419014119006,
"kl": 1.0677490234375,
"learning_rate": 1.0039472645551372e-07,
"loss": 0.2737,
"reward": 0.027191074565052986,
"reward_std": 0.4351058676838875,
"rewards/cosine_scaled_reward": -0.3301544785499573,
"rewards/format_reward": 0.6875000149011612,
"step": 494
},
{
"clip_ratio": 0.0,
"completion_length": 1194.8958740234375,
"epoch": 0.5657142857142857,
"grad_norm": 5.938670898030579,
"kl": 0.630859375,
"learning_rate": 1.002741278414069e-07,
"loss": 0.2055,
"reward": 0.374758190009743,
"reward_std": 0.6815578863024712,
"rewards/cosine_scaled_reward": -0.09387091733515263,
"rewards/format_reward": 0.5625000298023224,
"step": 495
},
{
"clip_ratio": 0.0,
"completion_length": 1120.3542022705078,
"epoch": 0.5668571428571428,
"grad_norm": 16.235625518016562,
"kl": 0.50927734375,
"learning_rate": 1.0017544823184055e-07,
"loss": 0.297,
"reward": 0.40772235160693526,
"reward_std": 0.8966069668531418,
"rewards/cosine_scaled_reward": -0.09822217002511024,
"rewards/format_reward": 0.6041666865348816,
"step": 496
},
{
"clip_ratio": 0.0,
"completion_length": 827.4791870117188,
"epoch": 0.568,
"grad_norm": 6.892460021170429,
"kl": 6.8536376953125,
"learning_rate": 1.0009869243631952e-07,
"loss": 0.2026,
"reward": 0.8302161321043968,
"reward_std": 0.560060553252697,
"rewards/cosine_scaled_reward": 0.06094140186905861,
"rewards/format_reward": 0.708333358168602,
"step": 497
},
{
"clip_ratio": 0.0,
"completion_length": 1077.1250305175781,
"epoch": 0.5691428571428572,
"grad_norm": 10.255398655040155,
"kl": 0.54931640625,
"learning_rate": 1.000438641958131e-07,
"loss": 0.2299,
"reward": 0.033572545275092125,
"reward_std": 0.4632219597697258,
"rewards/cosine_scaled_reward": -0.30613040924072266,
"rewards/format_reward": 0.645833358168602,
"step": 498
},
{
"clip_ratio": 0.0,
"completion_length": 1336.7083435058594,
"epoch": 0.5702857142857143,
"grad_norm": 27.33692044026225,
"kl": 0.934326171875,
"learning_rate": 1.0001096618257236e-07,
"loss": 0.1642,
"reward": -0.13300850987434387,
"reward_std": 0.6832303777337074,
"rewards/cosine_scaled_reward": -0.28525424748659134,
"rewards/format_reward": 0.4375000149011612,
"step": 499
},
{
"clip_ratio": 0.0,
"completion_length": 1017.9375305175781,
"epoch": 0.5714285714285714,
"grad_norm": 2.4162040111238334,
"kl": 0.2301025390625,
"learning_rate": 1e-07,
"loss": 0.1131,
"reward": 0.13043908029794693,
"reward_std": 0.5788910314440727,
"rewards/cosine_scaled_reward": -0.29936380684375763,
"rewards/format_reward": 0.729166679084301,
"step": 500
},
{
"epoch": 0.5714285714285714,
"step": 500,
"total_flos": 0.0,
"train_loss": 0.7532739232839085,
"train_runtime": 13678.504,
"train_samples_per_second": 1.755,
"train_steps_per_second": 0.037
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}