OpenRS-GRPO / trainer_state.json

Model save

73ec496 verified 12 months ago

230 kB

	{
	"best_metric": null,
	"best_model_checkpoint": null,
	"epoch": 0.5714285714285714,
	"eval_steps": 500,
	"global_step": 500,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"clip_ratio": 0.0,
	"completion_length": 1644.166748046875,
	"epoch": 0.001142857142857143,
	"grad_norm": 0.20607953518495117,
	"kl": 0.0,
	"learning_rate": 2e-08,
	"loss": 0.0022,
	"reward": -0.1127668060362339,
	"reward_std": 0.20213491283357143,
	"rewards/cosine_scaled_reward": -0.18138340720906854,
	"rewards/format_reward": 0.25,
	"step": 1
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1656.791748046875,
	"epoch": 0.002285714285714286,
	"grad_norm": 0.31679714617652144,
	"kl": 0.0,
	"learning_rate": 4e-08,
	"loss": 0.0623,
	"reward": -0.05582176148891449,
	"reward_std": 0.6275629922747612,
	"rewards/cosine_scaled_reward": -0.19457754865288734,
	"rewards/format_reward": 0.3333333432674408,
	"step": 2
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1606.7500610351562,
	"epoch": 0.0034285714285714284,
	"grad_norm": 0.2789602147805501,
	"kl": 3.388524055480957e-05,
	"learning_rate": 6e-08,
	"loss": 0.0376,
	"reward": -0.2583192214369774,
	"reward_std": 0.2636854462325573,
	"rewards/cosine_scaled_reward": -0.222909614443779,
	"rewards/format_reward": 0.1875000074505806,
	"step": 3
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1690.6250610351562,
	"epoch": 0.004571428571428572,
	"grad_norm": 0.27232938747073254,
	"kl": 4.017353057861328e-05,
	"learning_rate": 8e-08,
	"loss": 0.0159,
	"reward": -0.40017254278063774,
	"reward_std": 0.17111004143953323,
	"rewards/cosine_scaled_reward": -0.3146696165204048,
	"rewards/format_reward": 0.2291666716337204,
	"step": 4
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1618.3541870117188,
	"epoch": 0.005714285714285714,
	"grad_norm": 0.2939867481096334,
	"kl": 2.8431415557861328e-05,
	"learning_rate": 1e-07,
	"loss": 0.0576,
	"reward": 0.13743871822953224,
	"reward_std": 0.7271581590175629,
	"rewards/cosine_scaled_reward": -0.12919731251895428,
	"rewards/format_reward": 0.3958333395421505,
	"step": 5
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1629.4791870117188,
	"epoch": 0.006857142857142857,
	"grad_norm": 0.248871735331751,
	"kl": 3.477931022644043e-05,
	"learning_rate": 1.2e-07,
	"loss": -0.0029,
	"reward": -0.029103130102157593,
	"reward_std": 0.5708433166146278,
	"rewards/cosine_scaled_reward": -0.1708015874028206,
	"rewards/format_reward": 0.3125000037252903,
	"step": 6
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1490.6458740234375,
	"epoch": 0.008,
	"grad_norm": 0.22790937530079167,
	"kl": 3.007054328918457e-05,
	"learning_rate": 1.4e-07,
	"loss": 0.0903,
	"reward": 0.12145921215415001,
	"reward_std": 0.5416159555315971,
	"rewards/cosine_scaled_reward": -0.10593708232045174,
	"rewards/format_reward": 0.33333334140479565,
	"step": 7
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1683.5000305175781,
	"epoch": 0.009142857142857144,
	"grad_norm": 0.20752077742039396,
	"kl": 4.646182060241699e-05,
	"learning_rate": 1.6e-07,
	"loss": 0.0277,
	"reward": -0.23692437633872032,
	"reward_std": 0.4620281979441643,
	"rewards/cosine_scaled_reward": -0.2747122012078762,
	"rewards/format_reward": 0.31250000558793545,
	"step": 8
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1719.2292175292969,
	"epoch": 0.010285714285714285,
	"grad_norm": 0.2983323511333683,
	"kl": 4.1991472244262695e-05,
	"learning_rate": 1.8e-07,
	"loss": 0.0511,
	"reward": -0.31221747025847435,
	"reward_std": 0.21310735493898392,
	"rewards/cosine_scaled_reward": -0.24985874257981777,
	"rewards/format_reward": 0.1875000074505806,
	"step": 9
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1477.2083740234375,
	"epoch": 0.011428571428571429,
	"grad_norm": 0.23645082786220448,
	"kl": 3.116577863693237e-05,
	"learning_rate": 2e-07,
	"loss": 0.0495,
	"reward": 0.37697479128837585,
	"reward_std": 0.44906593672931194,
	"rewards/cosine_scaled_reward": -0.05109592713415623,
	"rewards/format_reward": 0.4791666716337204,
	"step": 10
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1508.8958587646484,
	"epoch": 0.012571428571428572,
	"grad_norm": 0.339825377520832,
	"kl": 2.8848648071289062e-05,
	"learning_rate": 2.1999999999999998e-07,
	"loss": 0.0535,
	"reward": -0.13005081936717033,
	"reward_std": 0.6173823103308678,
	"rewards/cosine_scaled_reward": -0.2525254301726818,
	"rewards/format_reward": 0.37500000558793545,
	"step": 11
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1631.1041870117188,
	"epoch": 0.013714285714285714,
	"grad_norm": 0.20658630326267732,
	"kl": 3.084540367126465e-05,
	"learning_rate": 2.4e-07,
	"loss": 0.0635,
	"reward": 0.03064786270260811,
	"reward_std": 0.4376446008682251,
	"rewards/cosine_scaled_reward": -0.1513427309691906,
	"rewards/format_reward": 0.33333334140479565,
	"step": 12
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1422.604232788086,
	"epoch": 0.014857142857142857,
	"grad_norm": 0.23614097630983502,
	"kl": 2.527981996536255e-05,
	"learning_rate": 2.6e-07,
	"loss": -0.0306,
	"reward": 0.4512472003698349,
	"reward_std": 0.40983884781599045,
	"rewards/cosine_scaled_reward": -0.02437640482094139,
	"rewards/format_reward": 0.5,
	"step": 13
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1652.3542175292969,
	"epoch": 0.016,
	"grad_norm": 0.2206408502680819,
	"kl": 3.93986701965332e-05,
	"learning_rate": 2.8e-07,
	"loss": 0.0059,
	"reward": -0.2542928569018841,
	"reward_std": 0.17246506363153458,
	"rewards/cosine_scaled_reward": -0.26256311126053333,
	"rewards/format_reward": 0.2708333395421505,
	"step": 14
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1679.229248046875,
	"epoch": 0.017142857142857144,
	"grad_norm": 0.2314183406404789,
	"kl": 4.3898820877075195e-05,
	"learning_rate": 3e-07,
	"loss": 0.0053,
	"reward": -0.258657343685627,
	"reward_std": 0.23606499657034874,
	"rewards/cosine_scaled_reward": -0.1918286692816764,
	"rewards/format_reward": 0.125,
	"step": 15
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1396.7917175292969,
	"epoch": 0.018285714285714287,
	"grad_norm": 0.25436941656143647,
	"kl": 2.3171305656433105e-05,
	"learning_rate": 3.2e-07,
	"loss": 0.1053,
	"reward": 0.20216324925422668,
	"reward_std": 0.4999893419444561,
	"rewards/cosine_scaled_reward": -0.13850171491503716,
	"rewards/format_reward": 0.4791666716337204,
	"step": 16
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1719.416748046875,
	"epoch": 0.019428571428571427,
	"grad_norm": 0.23312894299622924,
	"kl": 4.0084123611450195e-05,
	"learning_rate": 3.4000000000000003e-07,
	"loss": -0.0007,
	"reward": -0.41149570792913437,
	"reward_std": 0.13166083209216595,
	"rewards/cosine_scaled_reward": -0.26824783720076084,
	"rewards/format_reward": 0.125,
	"step": 17
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1686.0833740234375,
	"epoch": 0.02057142857142857,
	"grad_norm": 0.24676487462788851,
	"kl": 4.7713518142700195e-05,
	"learning_rate": 3.6e-07,
	"loss": 0.0814,
	"reward": -0.32610235549509525,
	"reward_std": 0.23402154073119164,
	"rewards/cosine_scaled_reward": -0.25680116564035416,
	"rewards/format_reward": 0.18750000186264515,
	"step": 18
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1773.6458740234375,
	"epoch": 0.021714285714285714,
	"grad_norm": 0.21561964662639843,
	"kl": 2.1457672119140625e-05,
	"learning_rate": 3.7999999999999996e-07,
	"loss": 0.0164,
	"reward": -0.5961569249629974,
	"reward_std": 0.1714775264263153,
	"rewards/cosine_scaled_reward": -0.3501618057489395,
	"rewards/format_reward": 0.10416666977107525,
	"step": 19
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1529.3125610351562,
	"epoch": 0.022857142857142857,
	"grad_norm": 0.251130340260543,
	"kl": 3.24249267578125e-05,
	"learning_rate": 4e-07,
	"loss": 0.0293,
	"reward": -0.048260755836963654,
	"reward_std": 0.34835576079785824,
	"rewards/cosine_scaled_reward": -0.20121371746063232,
	"rewards/format_reward": 0.35416667722165585,
	"step": 20
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1494.6250305175781,
	"epoch": 0.024,
	"grad_norm": 0.3018968569179871,
	"kl": 2.6673078536987305e-05,
	"learning_rate": 4.1999999999999995e-07,
	"loss": 0.0278,
	"reward": 0.021329142153263092,
	"reward_std": 0.45257429778575897,
	"rewards/cosine_scaled_reward": -0.15600210055708885,
	"rewards/format_reward": 0.3333333358168602,
	"step": 21
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1778.5625610351562,
	"epoch": 0.025142857142857144,
	"grad_norm": 0.29253387654098556,
	"kl": 3.1888484954833984e-05,
	"learning_rate": 4.3999999999999997e-07,
	"loss": 0.0494,
	"reward": -0.5034094974398613,
	"reward_std": 0.3080843798816204,
	"rewards/cosine_scaled_reward": -0.29337141662836075,
	"rewards/format_reward": 0.08333333395421505,
	"step": 22
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1762.8958740234375,
	"epoch": 0.026285714285714287,
	"grad_norm": 0.21053978305274443,
	"kl": 4.506111145019531e-05,
	"learning_rate": 4.6e-07,
	"loss": 0.0144,
	"reward": -0.028878159821033478,
	"reward_std": 0.5564102046191692,
	"rewards/cosine_scaled_reward": -0.10818908177316189,
	"rewards/format_reward": 0.1875000074505806,
	"step": 23
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1352.5625305175781,
	"epoch": 0.027428571428571427,
	"grad_norm": 0.20202450012624545,
	"kl": 1.6548670828342438e-05,
	"learning_rate": 4.8e-07,
	"loss": 0.0005,
	"reward": 0.6555859744548798,
	"reward_std": 0.47822858951985836,
	"rewards/cosine_scaled_reward": 0.06737629324197769,
	"rewards/format_reward": 0.520833333954215,
	"step": 24
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1597.1875610351562,
	"epoch": 0.02857142857142857,
	"grad_norm": 0.4327230812041704,
	"kl": 3.0606985092163086e-05,
	"learning_rate": 5e-07,
	"loss": 0.0701,
	"reward": 0.05484675616025925,
	"reward_std": 0.6329891942441463,
	"rewards/cosine_scaled_reward": -0.11840994283556938,
	"rewards/format_reward": 0.29166667722165585,
	"step": 25
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1647.916748046875,
	"epoch": 0.029714285714285714,
	"grad_norm": 0.21123992049117873,
	"kl": 2.2917985916137695e-05,
	"learning_rate": 5.2e-07,
	"loss": 0.031,
	"reward": -0.24321994185447693,
	"reward_std": 0.12097731977701187,
	"rewards/cosine_scaled_reward": -0.18410997837781906,
	"rewards/format_reward": 0.125,
	"step": 26
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1638.8958740234375,
	"epoch": 0.030857142857142857,
	"grad_norm": 0.21745088219923464,
	"kl": 3.2067298889160156e-05,
	"learning_rate": 5.4e-07,
	"loss": -0.0097,
	"reward": -0.3657397888600826,
	"reward_std": 0.24539830163121223,
	"rewards/cosine_scaled_reward": -0.2974532376974821,
	"rewards/format_reward": 0.2291666716337204,
	"step": 27
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1711.2709045410156,
	"epoch": 0.032,
	"grad_norm": 0.2552233664551883,
	"kl": 2.8468668460845947e-05,
	"learning_rate": 5.6e-07,
	"loss": 0.0256,
	"reward": -0.38710537925362587,
	"reward_std": 0.2530311979353428,
	"rewards/cosine_scaled_reward": -0.2768860347568989,
	"rewards/format_reward": 0.1666666716337204,
	"step": 28
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1713.8125610351562,
	"epoch": 0.03314285714285714,
	"grad_norm": 0.202249350617508,
	"kl": 2.86102294921875e-05,
	"learning_rate": 5.8e-07,
	"loss": 0.0135,
	"reward": -0.1931730881333351,
	"reward_std": 0.5632064789533615,
	"rewards/cosine_scaled_reward": -0.20075321290642023,
	"rewards/format_reward": 0.2083333358168602,
	"step": 29
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1732.291748046875,
	"epoch": 0.03428571428571429,
	"grad_norm": 0.23328556356102392,
	"kl": 2.165883779525757e-05,
	"learning_rate": 6e-07,
	"loss": 0.0564,
	"reward": -0.3746844604611397,
	"reward_std": 0.34011659026145935,
	"rewards/cosine_scaled_reward": -0.24984224140644073,
	"rewards/format_reward": 0.12500000186264515,
	"step": 30
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1445.3125305175781,
	"epoch": 0.03542857142857143,
	"grad_norm": 0.30643607095324277,
	"kl": 3.966689109802246e-05,
	"learning_rate": 6.2e-07,
	"loss": 0.0923,
	"reward": -0.09436208941042423,
	"reward_std": 0.3265727870166302,
	"rewards/cosine_scaled_reward": -0.21384770551230758,
	"rewards/format_reward": 0.33333333395421505,
	"step": 31
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1810.7917175292969,
	"epoch": 0.036571428571428574,
	"grad_norm": 0.20484433233713875,
	"kl": 2.8021633625030518e-05,
	"learning_rate": 6.4e-07,
	"loss": 0.0202,
	"reward": -0.5034667998552322,
	"reward_std": 0.15860500000417233,
	"rewards/cosine_scaled_reward": -0.2621500678360462,
	"rewards/format_reward": 0.02083333395421505,
	"step": 32
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1750.9584045410156,
	"epoch": 0.037714285714285714,
	"grad_norm": 0.2027434434467969,
	"kl": 2.5600194931030273e-05,
	"learning_rate": 6.6e-07,
	"loss": -0.0171,
	"reward": -0.25296103954315186,
	"reward_std": 0.4817052260041237,
	"rewards/cosine_scaled_reward": -0.2514805067330599,
	"rewards/format_reward": 0.25000000558793545,
	"step": 33
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1634.8333740234375,
	"epoch": 0.038857142857142854,
	"grad_norm": 0.23764579059557195,
	"kl": 2.331659197807312e-05,
	"learning_rate": 6.800000000000001e-07,
	"loss": 0.0003,
	"reward": -0.3657361939549446,
	"reward_std": 0.2039697989821434,
	"rewards/cosine_scaled_reward": -0.25578476674854755,
	"rewards/format_reward": 0.14583333395421505,
	"step": 34
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1691.1875610351562,
	"epoch": 0.04,
	"grad_norm": 0.2390715088796384,
	"kl": 1.8522143363952637e-05,
	"learning_rate": 7e-07,
	"loss": 0.0579,
	"reward": -0.1916074175387621,
	"reward_std": 0.40257398039102554,
	"rewards/cosine_scaled_reward": -0.23122038505971432,
	"rewards/format_reward": 0.27083334885537624,
	"step": 35
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1526.2292175292969,
	"epoch": 0.04114285714285714,
	"grad_norm": 0.2361249356185026,
	"kl": 3.781914710998535e-05,
	"learning_rate": 7.2e-07,
	"loss": 0.0401,
	"reward": 0.35939645767211914,
	"reward_std": 0.39011720940470695,
	"rewards/cosine_scaled_reward": -0.01821846514940262,
	"rewards/format_reward": 0.3958333395421505,
	"step": 36
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1645.7708740234375,
	"epoch": 0.04228571428571429,
	"grad_norm": 0.26864783041008133,
	"kl": 3.820657730102539e-05,
	"learning_rate": 7.4e-07,
	"loss": 0.0746,
	"reward": -0.2870800420641899,
	"reward_std": 0.46812814101576805,
	"rewards/cosine_scaled_reward": -0.25812335684895515,
	"rewards/format_reward": 0.2291666679084301,
	"step": 37
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1722.5000610351562,
	"epoch": 0.04342857142857143,
	"grad_norm": 0.27664066975056834,
	"kl": 5.131959915161133e-05,
	"learning_rate": 7.599999999999999e-07,
	"loss": 0.0586,
	"reward": -0.15014038234949112,
	"reward_std": 0.4126087427139282,
	"rewards/cosine_scaled_reward": -0.2000702191144228,
	"rewards/format_reward": 0.2500000074505806,
	"step": 38
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1678.7083740234375,
	"epoch": 0.044571428571428574,
	"grad_norm": 0.3003829192682386,
	"kl": 4.968792200088501e-05,
	"learning_rate": 7.799999999999999e-07,
	"loss": 0.097,
	"reward": -0.21257384680211544,
	"reward_std": 0.48539142310619354,
	"rewards/cosine_scaled_reward": -0.2312869280576706,
	"rewards/format_reward": 0.2500000111758709,
	"step": 39
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1690.8958740234375,
	"epoch": 0.045714285714285714,
	"grad_norm": 0.20909108511646457,
	"kl": 5.0902366638183594e-05,
	"learning_rate": 8e-07,
	"loss": 0.0436,
	"reward": -0.5045258924365044,
	"reward_std": 0.2920587807893753,
	"rewards/cosine_scaled_reward": -0.3564296290278435,
	"rewards/format_reward": 0.2083333358168602,
	"step": 40
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1806.3334045410156,
	"epoch": 0.046857142857142854,
	"grad_norm": 0.2168555566166619,
	"kl": 3.137439489364624e-05,
	"learning_rate": 8.199999999999999e-07,
	"loss": -0.0012,
	"reward": 0.04771171510219574,
	"reward_std": 0.33250839821994305,
	"rewards/cosine_scaled_reward": -0.06989414617419243,
	"rewards/format_reward": 0.18750000186264515,
	"step": 41
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1300.6250457763672,
	"epoch": 0.048,
	"grad_norm": 0.40542845209419376,
	"kl": 0.000291675329208374,
	"learning_rate": 8.399999999999999e-07,
	"loss": 0.0768,
	"reward": 0.27488730661571026,
	"reward_std": 0.45710677094757557,
	"rewards/cosine_scaled_reward": -0.1646396858850494,
	"rewards/format_reward": 0.6041666716337204,
	"step": 42
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1705.8750610351562,
	"epoch": 0.04914285714285714,
	"grad_norm": 0.21842925663095267,
	"kl": 3.538280725479126e-05,
	"learning_rate": 8.599999999999999e-07,
	"loss": 0.0308,
	"reward": -0.2755163535475731,
	"reward_std": 0.3637393806129694,
	"rewards/cosine_scaled_reward": -0.2210915139876306,
	"rewards/format_reward": 0.1666666679084301,
	"step": 43
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1665.0625305175781,
	"epoch": 0.05028571428571429,
	"grad_norm": 0.26271417694787236,
	"kl": 0.00046503543853759766,
	"learning_rate": 8.799999999999999e-07,
	"loss": 0.073,
	"reward": -0.12092901021242142,
	"reward_std": 0.5556337833404541,
	"rewards/cosine_scaled_reward": -0.17504783952608705,
	"rewards/format_reward": 0.2291666679084301,
	"step": 44
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1733.2084045410156,
	"epoch": 0.05142857142857143,
	"grad_norm": 0.21285192669515357,
	"kl": 5.0537288188934326e-05,
	"learning_rate": 9e-07,
	"loss": 0.0423,
	"reward": -0.05799056589603424,
	"reward_std": 0.4342048391699791,
	"rewards/cosine_scaled_reward": -0.14357861876487732,
	"rewards/format_reward": 0.22916666977107525,
	"step": 45
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1640.0834045410156,
	"epoch": 0.052571428571428575,
	"grad_norm": 0.2622293688477209,
	"kl": 0.00013068318367004395,
	"learning_rate": 9.2e-07,
	"loss": 0.0317,
	"reward": -0.005384169518947601,
	"reward_std": 0.3068407401442528,
	"rewards/cosine_scaled_reward": -0.1068587563931942,
	"rewards/format_reward": 0.20833333395421505,
	"step": 46
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1498.8333892822266,
	"epoch": 0.053714285714285714,
	"grad_norm": 0.274608905827555,
	"kl": 0.0001885145902633667,
	"learning_rate": 9.399999999999999e-07,
	"loss": 0.049,
	"reward": -0.002073638141155243,
	"reward_std": 0.4514222964644432,
	"rewards/cosine_scaled_reward": -0.17812015302479267,
	"rewards/format_reward": 0.3541666753590107,
	"step": 47
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1610.4792175292969,
	"epoch": 0.054857142857142854,
	"grad_norm": 0.24771930467103717,
	"kl": 0.00015616416931152344,
	"learning_rate": 9.6e-07,
	"loss": 0.0334,
	"reward": -0.22091616783291101,
	"reward_std": 0.33334225323051214,
	"rewards/cosine_scaled_reward": -0.21462474018335342,
	"rewards/format_reward": 0.20833334140479565,
	"step": 48
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1341.1458740234375,
	"epoch": 0.056,
	"grad_norm": 0.3710205417665813,
	"kl": 0.00029793381690979004,
	"learning_rate": 9.8e-07,
	"loss": 0.0862,
	"reward": 0.40674951672554016,
	"reward_std": 0.5115297809243202,
	"rewards/cosine_scaled_reward": -0.025791920721530914,
	"rewards/format_reward": 0.45833333395421505,
	"step": 49
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1335.1667175292969,
	"epoch": 0.05714285714285714,
	"grad_norm": 0.3034272517231627,
	"kl": 0.0005925297737121582,
	"learning_rate": 1e-06,
	"loss": 0.1036,
	"reward": 0.36978277564048767,
	"reward_std": 0.4990865057334304,
	"rewards/cosine_scaled_reward": -0.033858626149594784,
	"rewards/format_reward": 0.43750002048909664,
	"step": 50
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1686.8959045410156,
	"epoch": 0.05828571428571429,
	"grad_norm": 0.3009121706411098,
	"kl": 0.00032591819763183594,
	"learning_rate": 9.999890338174275e-07,
	"loss": 0.0864,
	"reward": -0.20582207757979631,
	"reward_std": 0.5198994930833578,
	"rewards/cosine_scaled_reward": -0.19666103832423687,
	"rewards/format_reward": 0.1875000111758709,
	"step": 51
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1718.2291870117188,
	"epoch": 0.05942857142857143,
	"grad_norm": 0.21311754620957382,
	"kl": 0.0005127787590026855,
	"learning_rate": 9.999561358041868e-07,
	"loss": 0.0262,
	"reward": -0.39756081253290176,
	"reward_std": 0.34694093093276024,
	"rewards/cosine_scaled_reward": -0.2716970667243004,
	"rewards/format_reward": 0.1458333358168602,
	"step": 52
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1611.8334045410156,
	"epoch": 0.060571428571428575,
	"grad_norm": 0.22683388578373892,
	"kl": 0.0005531832575798035,
	"learning_rate": 9.999013075636804e-07,
	"loss": 0.068,
	"reward": -0.13391486555337906,
	"reward_std": 0.27848392724990845,
	"rewards/cosine_scaled_reward": -0.22320742718875408,
	"rewards/format_reward": 0.31250000186264515,
	"step": 53
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1442.0834045410156,
	"epoch": 0.061714285714285715,
	"grad_norm": 0.24769106962876689,
	"kl": 0.0002713203430175781,
	"learning_rate": 9.998245517681593e-07,
	"loss": 0.0911,
	"reward": -0.11875106766819954,
	"reward_std": 0.1542784534394741,
	"rewards/cosine_scaled_reward": -0.2572922073304653,
	"rewards/format_reward": 0.3958333432674408,
	"step": 54
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1688.4167175292969,
	"epoch": 0.06285714285714286,
	"grad_norm": 0.22851815885942953,
	"kl": 0.0001881718635559082,
	"learning_rate": 9.997258721585931e-07,
	"loss": 0.0068,
	"reward": -0.3640219047665596,
	"reward_std": 0.2585913948714733,
	"rewards/cosine_scaled_reward": -0.2965943031013012,
	"rewards/format_reward": 0.2291666753590107,
	"step": 55
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1569.4166870117188,
	"epoch": 0.064,
	"grad_norm": 0.2466081306910316,
	"kl": 0.0021448135375976562,
	"learning_rate": 9.996052735444862e-07,
	"loss": 0.096,
	"reward": -0.4589140391908586,
	"reward_std": 0.4320836700499058,
	"rewards/cosine_scaled_reward": -0.3440403640270233,
	"rewards/format_reward": 0.2291666679084301,
	"step": 56
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1629.979248046875,
	"epoch": 0.06514285714285714,
	"grad_norm": 0.22573731739546327,
	"kl": 0.0010238885879516602,
	"learning_rate": 9.994627618036452e-07,
	"loss": 0.0592,
	"reward": -0.3061641752719879,
	"reward_std": 0.5002065226435661,
	"rewards/cosine_scaled_reward": -0.26766542345285416,
	"rewards/format_reward": 0.2291666716337204,
	"step": 57
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1660.4792175292969,
	"epoch": 0.06628571428571428,
	"grad_norm": 0.22190381637143303,
	"kl": 0.0011049509048461914,
	"learning_rate": 9.992983438818915e-07,
	"loss": 0.022,
	"reward": -0.32173825055360794,
	"reward_std": 0.27725364826619625,
	"rewards/cosine_scaled_reward": -0.2754524536430836,
	"rewards/format_reward": 0.2291666679084301,
	"step": 58
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1690.0417175292969,
	"epoch": 0.06742857142857143,
	"grad_norm": 0.21914617585966853,
	"kl": 0.0010164976119995117,
	"learning_rate": 9.991120277927223e-07,
	"loss": 0.0444,
	"reward": -0.021609768271446228,
	"reward_std": 0.3677750062197447,
	"rewards/cosine_scaled_reward": -0.135804895311594,
	"rewards/format_reward": 0.25000000558793545,
	"step": 59
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1581.6875305175781,
	"epoch": 0.06857142857142857,
	"grad_norm": 0.4016735260144472,
	"kl": 0.01423954963684082,
	"learning_rate": 9.989038226169207e-07,
	"loss": 0.0192,
	"reward": 0.11502109467983246,
	"reward_std": 0.29630398005247116,
	"rewards/cosine_scaled_reward": -0.057072801515460014,
	"rewards/format_reward": 0.2291666716337204,
	"step": 60
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1475.5833740234375,
	"epoch": 0.06971428571428571,
	"grad_norm": 0.24285848407581584,
	"kl": 0.0003628730773925781,
	"learning_rate": 9.98673738502114e-07,
	"loss": 0.0731,
	"reward": 0.5937481597065926,
	"reward_std": 0.6881431620568037,
	"rewards/cosine_scaled_reward": 0.046874068677425385,
	"rewards/format_reward": 0.5000000149011612,
	"step": 61
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1805.7500610351562,
	"epoch": 0.07085714285714285,
	"grad_norm": 0.19714468440546948,
	"kl": 0.0005519390106201172,
	"learning_rate": 9.98421786662277e-07,
	"loss": 0.0172,
	"reward": -0.4636555463075638,
	"reward_std": 0.3160466430708766,
	"rewards/cosine_scaled_reward": -0.2734944522380829,
	"rewards/format_reward": 0.08333333395421505,
	"step": 62
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1329.2917175292969,
	"epoch": 0.072,
	"grad_norm": 0.28510447078335305,
	"kl": 0.004929542541503906,
	"learning_rate": 9.981479793771866e-07,
	"loss": 0.1079,
	"reward": 0.30475724674761295,
	"reward_std": 0.4675188772380352,
	"rewards/cosine_scaled_reward": -0.0976213626563549,
	"rewards/format_reward": 0.5000000149011612,
	"step": 63
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1636.5208740234375,
	"epoch": 0.07314285714285715,
	"grad_norm": 0.20815660806735267,
	"kl": 0.0003604888916015625,
	"learning_rate": 9.97852329991824e-07,
	"loss": 0.0625,
	"reward": 0.29327625688165426,
	"reward_std": 0.5610844530165195,
	"rewards/cosine_scaled_reward": -0.03044520819094032,
	"rewards/format_reward": 0.354166679084301,
	"step": 64
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1559.5000305175781,
	"epoch": 0.07428571428571429,
	"grad_norm": 0.24172417943995111,
	"kl": 0.001363515853881836,
	"learning_rate": 9.975348529157229e-07,
	"loss": 0.0995,
	"reward": 0.1283707581460476,
	"reward_std": 0.7667413726449013,
	"rewards/cosine_scaled_reward": -0.13373128045350313,
	"rewards/format_reward": 0.3958333507180214,
	"step": 65
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1729.6667175292969,
	"epoch": 0.07542857142857143,
	"grad_norm": 0.20090852438136195,
	"kl": 0.00067138671875,
	"learning_rate": 9.971955636222684e-07,
	"loss": 0.0209,
	"reward": -0.39017004892230034,
	"reward_std": 0.32542612217366695,
	"rewards/cosine_scaled_reward": -0.3200850263237953,
	"rewards/format_reward": 0.2500000149011612,
	"step": 66
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1648.7292175292969,
	"epoch": 0.07657142857142857,
	"grad_norm": 0.18795555019652113,
	"kl": 0.0007681846618652344,
	"learning_rate": 9.968344786479415e-07,
	"loss": 0.0342,
	"reward": -0.1792638599872589,
	"reward_std": 0.3578680492937565,
	"rewards/cosine_scaled_reward": -0.20421527326107025,
	"rewards/format_reward": 0.2291666679084301,
	"step": 67
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1388.5625610351562,
	"epoch": 0.07771428571428571,
	"grad_norm": 0.3904259482407812,
	"kl": 0.00202178955078125,
	"learning_rate": 9.964516155915151e-07,
	"loss": 0.0637,
	"reward": 0.16577239707112312,
	"reward_std": 0.3421984985470772,
	"rewards/cosine_scaled_reward": -0.09419714100658894,
	"rewards/format_reward": 0.3541666716337204,
	"step": 68
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1507.8333740234375,
	"epoch": 0.07885714285714286,
	"grad_norm": 0.2361059164440503,
	"kl": 0.0008258819580078125,
	"learning_rate": 9.960469931131936e-07,
	"loss": 0.0613,
	"reward": 0.17160904966294765,
	"reward_std": 0.38275655917823315,
	"rewards/cosine_scaled_reward": -0.10169548355042934,
	"rewards/format_reward": 0.37500000558793545,
	"step": 69
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1690.3750305175781,
	"epoch": 0.08,
	"grad_norm": 0.19302606573391104,
	"kl": 0.002358675003051758,
	"learning_rate": 9.956206309337066e-07,
	"loss": 0.105,
	"reward": -0.1555338129401207,
	"reward_std": 0.37855083122849464,
	"rewards/cosine_scaled_reward": -0.20276692137122154,
	"rewards/format_reward": 0.25000000186264515,
	"step": 70
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1441.729232788086,
	"epoch": 0.08114285714285714,
	"grad_norm": 0.331702227116139,
	"kl": 0.0023870468139648438,
	"learning_rate": 9.951725498333448e-07,
	"loss": 0.1388,
	"reward": -0.2453744667582214,
	"reward_std": 0.15839526243507862,
	"rewards/cosine_scaled_reward": -0.3101872429251671,
	"rewards/format_reward": 0.3750000149011612,
	"step": 71
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1497.3959045410156,
	"epoch": 0.08228571428571428,
	"grad_norm": 0.33894190686830156,
	"kl": 0.0017808079719543457,
	"learning_rate": 9.947027716509488e-07,
	"loss": 0.0553,
	"reward": 0.09824148565530777,
	"reward_std": 0.1729265321046114,
	"rewards/cosine_scaled_reward": -0.08629592880606651,
	"rewards/format_reward": 0.2708333358168602,
	"step": 72
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1444.7708892822266,
	"epoch": 0.08342857142857144,
	"grad_norm": 0.9254159035231885,
	"kl": 0.039752960205078125,
	"learning_rate": 9.942113192828444e-07,
	"loss": 0.1025,
	"reward": 0.47389062121510506,
	"reward_std": 0.7162522077560425,
	"rewards/cosine_scaled_reward": -0.05472135776653886,
	"rewards/format_reward": 0.5833333507180214,
	"step": 73
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1484.7083740234375,
	"epoch": 0.08457142857142858,
	"grad_norm": 0.2164345231616129,
	"kl": 0.0021944046020507812,
	"learning_rate": 9.93698216681727e-07,
	"loss": 0.0129,
	"reward": -0.06718481332063675,
	"reward_std": 0.16878989525139332,
	"rewards/cosine_scaled_reward": -0.22109240666031837,
	"rewards/format_reward": 0.3750000037252903,
	"step": 74
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1526.0417175292969,
	"epoch": 0.08571428571428572,
	"grad_norm": 0.3075410122107456,
	"kl": 0.00359344482421875,
	"learning_rate": 9.931634888554935e-07,
	"loss": 0.0753,
	"reward": 0.17093585059046745,
	"reward_std": 0.4688509330153465,
	"rewards/cosine_scaled_reward": -0.08119874075055122,
	"rewards/format_reward": 0.33333334513008595,
	"step": 75
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1640.4583740234375,
	"epoch": 0.08685714285714285,
	"grad_norm": 0.20492660661291412,
	"kl": 0.00046312808990478516,
	"learning_rate": 9.926071618660237e-07,
	"loss": 0.0184,
	"reward": 0.029385031666606665,
	"reward_std": 0.6126945875585079,
	"rewards/cosine_scaled_reward": -0.151974156498909,
	"rewards/format_reward": 0.33333333395421505,
	"step": 76
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1674.5625610351562,
	"epoch": 0.088,
	"grad_norm": 0.21980728108796918,
	"kl": 0.0009822845458984375,
	"learning_rate": 9.9202926282791e-07,
	"loss": -0.0002,
	"reward": -0.18806731700897217,
	"reward_std": 0.12730432488024235,
	"rewards/cosine_scaled_reward": -0.15653366968035698,
	"rewards/format_reward": 0.125,
	"step": 77
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1518.0625610351562,
	"epoch": 0.08914285714285715,
	"grad_norm": 0.242785552217566,
	"kl": 0.0009822845458984375,
	"learning_rate": 9.91429819907136e-07,
	"loss": 0.0619,
	"reward": 0.13657424598932266,
	"reward_std": 0.4360465779900551,
	"rewards/cosine_scaled_reward": -0.10879619419574738,
	"rewards/format_reward": 0.35416666977107525,
	"step": 78
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1575.4792175292969,
	"epoch": 0.09028571428571429,
	"grad_norm": 0.24080955526978698,
	"kl": 0.0005426406860351562,
	"learning_rate": 9.908088623197048e-07,
	"loss": 0.0519,
	"reward": 0.016203314065933228,
	"reward_std": 0.6479124575853348,
	"rewards/cosine_scaled_reward": -0.1585650178603828,
	"rewards/format_reward": 0.3333333395421505,
	"step": 79
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1733.9167175292969,
	"epoch": 0.09142857142857143,
	"grad_norm": 0.2186002750502081,
	"kl": 0.0005044937133789062,
	"learning_rate": 9.901664203302124e-07,
	"loss": 0.031,
	"reward": -0.5251612327992916,
	"reward_std": 0.40141166001558304,
	"rewards/cosine_scaled_reward": -0.33549728989601135,
	"rewards/format_reward": 0.1458333358168602,
	"step": 80
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1728.479248046875,
	"epoch": 0.09257142857142857,
	"grad_norm": 0.21399417944679958,
	"kl": 0.0009112358093261719,
	"learning_rate": 9.895025252503755e-07,
	"loss": 0.0374,
	"reward": -0.19506264757364988,
	"reward_std": 0.48094464652240276,
	"rewards/cosine_scaled_reward": -0.1912813438102603,
	"rewards/format_reward": 0.18750000558793545,
	"step": 81
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1601.9792175292969,
	"epoch": 0.09371428571428571,
	"grad_norm": 0.2450961734236274,
	"kl": 0.0009531974792480469,
	"learning_rate": 9.888172094375033e-07,
	"loss": 0.077,
	"reward": -0.1917775571346283,
	"reward_std": 0.5255400985479355,
	"rewards/cosine_scaled_reward": -0.25213877484202385,
	"rewards/format_reward": 0.31250001303851604,
	"step": 82
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1748.1875610351562,
	"epoch": 0.09485714285714286,
	"grad_norm": 0.22448680749018862,
	"kl": 0.0004420280456542969,
	"learning_rate": 9.881105062929221e-07,
	"loss": 0.0159,
	"reward": -0.43924427404999733,
	"reward_std": 0.2609596960246563,
	"rewards/cosine_scaled_reward": -0.27170546911656857,
	"rewards/format_reward": 0.10416666977107525,
	"step": 83
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1515.7708435058594,
	"epoch": 0.096,
	"grad_norm": 0.2231038243696207,
	"kl": 0.0006551742553710938,
	"learning_rate": 9.873824502603459e-07,
	"loss": 0.0246,
	"reward": 0.36620646342635155,
	"reward_std": 0.884237602353096,
	"rewards/cosine_scaled_reward": -0.06689677853137255,
	"rewards/format_reward": 0.5000000074505806,
	"step": 84
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1701.2083740234375,
	"epoch": 0.09714285714285714,
	"grad_norm": 0.20906161676384463,
	"kl": 0.000804901123046875,
	"learning_rate": 9.866330768241983e-07,
	"loss": 0.0555,
	"reward": -0.39954638853669167,
	"reward_std": 0.31576116755604744,
	"rewards/cosine_scaled_reward": -0.2726898640394211,
	"rewards/format_reward": 0.14583333395421505,
	"step": 85
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1610.9792175292969,
	"epoch": 0.09828571428571428,
	"grad_norm": 0.22100681278056383,
	"kl": 0.0009822845458984375,
	"learning_rate": 9.85862422507884e-07,
	"loss": 0.0444,
	"reward": -0.24343110900372267,
	"reward_std": 0.2885846998542547,
	"rewards/cosine_scaled_reward": -0.30921556800603867,
	"rewards/format_reward": 0.375,
	"step": 86
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1695.354248046875,
	"epoch": 0.09942857142857142,
	"grad_norm": 0.24683069440334848,
	"kl": 0.0029506683349609375,
	"learning_rate": 9.850705248720068e-07,
	"loss": 0.0377,
	"reward": -0.09222975745797157,
	"reward_std": 0.24668438732624054,
	"rewards/cosine_scaled_reward": -0.1502815391868353,
	"rewards/format_reward": 0.2083333432674408,
	"step": 87
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1594.9167175292969,
	"epoch": 0.10057142857142858,
	"grad_norm": 0.27215086328931853,
	"kl": 0.0016989707946777344,
	"learning_rate": 9.8425742251254e-07,
	"loss": 0.1075,
	"reward": 0.18186672404408455,
	"reward_std": 0.9013341814279556,
	"rewards/cosine_scaled_reward": -0.07573332265019417,
	"rewards/format_reward": 0.3333333432674408,
	"step": 88
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1738.7292175292969,
	"epoch": 0.10171428571428572,
	"grad_norm": 0.1946900134085172,
	"kl": 0.000820159912109375,
	"learning_rate": 9.83423155058946e-07,
	"loss": 0.0331,
	"reward": -0.28752805292606354,
	"reward_std": 0.4243736080825329,
	"rewards/cosine_scaled_reward": -0.22709737345576286,
	"rewards/format_reward": 0.16666667722165585,
	"step": 89
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1572.2916870117188,
	"epoch": 0.10285714285714286,
	"grad_norm": 0.20694868118264276,
	"kl": 0.0007328987121582031,
	"learning_rate": 9.825677631722435e-07,
	"loss": 0.0753,
	"reward": -0.08595774043351412,
	"reward_std": 0.5348180644214153,
	"rewards/cosine_scaled_reward": -0.18881220323964953,
	"rewards/format_reward": 0.2916666679084301,
	"step": 90
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1601.5625610351562,
	"epoch": 0.104,
	"grad_norm": 0.20840771038907893,
	"kl": 0.0007948875427246094,
	"learning_rate": 9.816912885430258e-07,
	"loss": 0.0808,
	"reward": -0.015035435557365417,
	"reward_std": 0.14022575318813324,
	"rewards/cosine_scaled_reward": -0.1429343856871128,
	"rewards/format_reward": 0.2708333432674408,
	"step": 91
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1498.2292175292969,
	"epoch": 0.10514285714285715,
	"grad_norm": 0.20771988001872319,
	"kl": 0.0009174346923828125,
	"learning_rate": 9.807937738894303e-07,
	"loss": 0.0994,
	"reward": 0.07728531863540411,
	"reward_std": 0.508693166077137,
	"rewards/cosine_scaled_reward": -0.1384406816214323,
	"rewards/format_reward": 0.35416666977107525,
	"step": 92
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1347.8125305175781,
	"epoch": 0.10628571428571429,
	"grad_norm": 0.27527082284418775,
	"kl": 0.0021848678588867188,
	"learning_rate": 9.798752629550546e-07,
	"loss": 0.0296,
	"reward": 0.30088429898023605,
	"reward_std": 0.5643313899636269,
	"rewards/cosine_scaled_reward": -0.10997452400624752,
	"rewards/format_reward": 0.5208333432674408,
	"step": 93
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1733.2500610351562,
	"epoch": 0.10742857142857143,
	"grad_norm": 0.23935442867120157,
	"kl": 0.0012607574462890625,
	"learning_rate": 9.78935800506826e-07,
	"loss": 0.021,
	"reward": -0.34041892923414707,
	"reward_std": 0.2469240017235279,
	"rewards/cosine_scaled_reward": -0.26395946741104126,
	"rewards/format_reward": 0.18750000186264515,
	"step": 94
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1738.5625610351562,
	"epoch": 0.10857142857142857,
	"grad_norm": 0.21273217079983556,
	"kl": 0.0006814002990722656,
	"learning_rate": 9.779754323328192e-07,
	"loss": -0.0093,
	"reward": -0.5389137789607048,
	"reward_std": 0.17841140553355217,
	"rewards/cosine_scaled_reward": -0.3423735648393631,
	"rewards/format_reward": 0.14583333395421505,
	"step": 95
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1433.9375610351562,
	"epoch": 0.10971428571428571,
	"grad_norm": 0.30691056711732384,
	"kl": 0.002574920654296875,
	"learning_rate": 9.769942052400235e-07,
	"loss": 0.137,
	"reward": 0.296867486089468,
	"reward_std": 0.3943296894431114,
	"rewards/cosine_scaled_reward": -0.04948292672634125,
	"rewards/format_reward": 0.3958333432674408,
	"step": 96
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1567.2500610351562,
	"epoch": 0.11085714285714286,
	"grad_norm": 0.25051085956589897,
	"kl": 0.0013968944549560547,
	"learning_rate": 9.759921670520634e-07,
	"loss": 0.0267,
	"reward": -0.15386457741260529,
	"reward_std": 0.37108149379491806,
	"rewards/cosine_scaled_reward": -0.21234895661473274,
	"rewards/format_reward": 0.2708333432674408,
	"step": 97
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1406.0208740234375,
	"epoch": 0.112,
	"grad_norm": 0.366560785041491,
	"kl": 0.0012578964233398438,
	"learning_rate": 9.749693666068663e-07,
	"loss": 0.099,
	"reward": 0.3372333124279976,
	"reward_std": 0.3852754198014736,
	"rewards/cosine_scaled_reward": -0.12305000983178616,
	"rewards/format_reward": 0.5833333507180214,
	"step": 98
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1598.7917175292969,
	"epoch": 0.11314285714285714,
	"grad_norm": 0.2584279138871096,
	"kl": 0.0010881423950195312,
	"learning_rate": 9.739258537542835e-07,
	"loss": 0.0536,
	"reward": 0.1023973822593689,
	"reward_std": 0.4502338841557503,
	"rewards/cosine_scaled_reward": -0.1258846465498209,
	"rewards/format_reward": 0.3541666828095913,
	"step": 99
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1557.479248046875,
	"epoch": 0.11428571428571428,
	"grad_norm": 0.23713752727518134,
	"kl": 0.0009851455688476562,
	"learning_rate": 9.728616793536587e-07,
	"loss": 0.0694,
	"reward": -0.15063253417611122,
	"reward_std": 0.3854830376803875,
	"rewards/cosine_scaled_reward": -0.23156626150012016,
	"rewards/format_reward": 0.3125000111758709,
	"step": 100
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1387.4583435058594,
	"epoch": 0.11542857142857142,
	"grad_norm": 0.32157411791816565,
	"kl": 0.001094818115234375,
	"learning_rate": 9.717768952713511e-07,
	"loss": 0.1116,
	"reward": 0.07011325657367706,
	"reward_std": 0.3243808038532734,
	"rewards/cosine_scaled_reward": -0.19411004893481731,
	"rewards/format_reward": 0.4583333395421505,
	"step": 101
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1449.3750610351562,
	"epoch": 0.11657142857142858,
	"grad_norm": 0.2168599934302549,
	"kl": 0.0015411376953125,
	"learning_rate": 9.706715543782064e-07,
	"loss": 0.0577,
	"reward": -0.21096567437052727,
	"reward_std": 0.29599858447909355,
	"rewards/cosine_scaled_reward": -0.3138161562383175,
	"rewards/format_reward": 0.4166666865348816,
	"step": 102
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1715.166748046875,
	"epoch": 0.11771428571428572,
	"grad_norm": 0.21920178674297372,
	"kl": 0.0015869140625,
	"learning_rate": 9.695457105469804e-07,
	"loss": 0.0667,
	"reward": -0.18699942529201508,
	"reward_std": 0.5092732682824135,
	"rewards/cosine_scaled_reward": -0.22891639173030853,
	"rewards/format_reward": 0.2708333395421505,
	"step": 103
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1304.4583435058594,
	"epoch": 0.11885714285714286,
	"grad_norm": 0.22942484314958453,
	"kl": 0.0013804435729980469,
	"learning_rate": 9.683994186497132e-07,
	"loss": 0.0839,
	"reward": 0.5173723250627518,
	"reward_std": 0.5176322646439075,
	"rewards/cosine_scaled_reward": -0.001730518415570259,
	"rewards/format_reward": 0.5208333358168602,
	"step": 104
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1364.8333740234375,
	"epoch": 0.12,
	"grad_norm": 0.25403433256650454,
	"kl": 0.0016727447509765625,
	"learning_rate": 9.672327345550543e-07,
	"loss": 0.1156,
	"reward": 0.28816052433103323,
	"reward_std": 0.240465197712183,
	"rewards/cosine_scaled_reward": -0.1267530769109726,
	"rewards/format_reward": 0.541666679084301,
	"step": 105
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1570.6667175292969,
	"epoch": 0.12114285714285715,
	"grad_norm": 0.2462172191203138,
	"kl": 0.0020122528076171875,
	"learning_rate": 9.66045715125541e-07,
	"loss": 0.0866,
	"reward": 0.34020555624738336,
	"reward_std": 0.7328735627233982,
	"rewards/cosine_scaled_reward": -0.038230573292821646,
	"rewards/format_reward": 0.41666666977107525,
	"step": 106
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1243.4583740234375,
	"epoch": 0.12228571428571429,
	"grad_norm": 0.22392855280151888,
	"kl": 0.001399993896484375,
	"learning_rate": 9.648384182148252e-07,
	"loss": 0.0861,
	"reward": 0.19801579043269157,
	"reward_std": 0.4772573560476303,
	"rewards/cosine_scaled_reward": -0.18224211037158966,
	"rewards/format_reward": 0.5625000149011612,
	"step": 107
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1376.5625610351562,
	"epoch": 0.12342857142857143,
	"grad_norm": 0.2328882803373465,
	"kl": 0.0032482147216796875,
	"learning_rate": 9.636109026648554e-07,
	"loss": 0.0636,
	"reward": 0.6495321169495583,
	"reward_std": 0.5899618566036224,
	"rewards/cosine_scaled_reward": 0.06434935945435427,
	"rewards/format_reward": 0.5208333488553762,
	"step": 108
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1368.0625305175781,
	"epoch": 0.12457142857142857,
	"grad_norm": 0.3696050391986309,
	"kl": 0.0028667449951171875,
	"learning_rate": 9.623632283030077e-07,
	"loss": 0.1246,
	"reward": -0.031360091641545296,
	"reward_std": 0.4002140313386917,
	"rewards/cosine_scaled_reward": -0.2656800393015146,
	"rewards/format_reward": 0.5,
	"step": 109
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1444.6666870117188,
	"epoch": 0.12571428571428572,
	"grad_norm": 0.35213532577859125,
	"kl": 0.0029430389404296875,
	"learning_rate": 9.610954559391704e-07,
	"loss": 0.1339,
	"reward": 0.6942434869706631,
	"reward_std": 0.9198908805847168,
	"rewards/cosine_scaled_reward": 0.06587174534797668,
	"rewards/format_reward": 0.5625000149011612,
	"step": 110
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1072.7708587646484,
	"epoch": 0.12685714285714286,
	"grad_norm": 0.2985726423715741,
	"kl": 0.001979827880859375,
	"learning_rate": 9.598076473627796e-07,
	"loss": 0.0476,
	"reward": 0.7408694333862513,
	"reward_std": 0.7333548963069916,
	"rewards/cosine_scaled_reward": -0.004565277136862278,
	"rewards/format_reward": 0.7500000149011612,
	"step": 111
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1633.4167175292969,
	"epoch": 0.128,
	"grad_norm": 0.22471101395696397,
	"kl": 0.00258636474609375,
	"learning_rate": 9.58499865339809e-07,
	"loss": 0.0346,
	"reward": -0.05079384706914425,
	"reward_std": 0.4366183038800955,
	"rewards/cosine_scaled_reward": -0.2337302602827549,
	"rewards/format_reward": 0.4166666865348816,
	"step": 112
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1319.7291870117188,
	"epoch": 0.12914285714285714,
	"grad_norm": 0.27063696127291986,
	"kl": 0.0033721923828125,
	"learning_rate": 9.571721736097088e-07,
	"loss": 0.0833,
	"reward": 0.6321319434791803,
	"reward_std": 0.5336715504527092,
	"rewards/cosine_scaled_reward": -0.006850697100162506,
	"rewards/format_reward": 0.6458333507180214,
	"step": 113
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1052.3958892822266,
	"epoch": 0.13028571428571428,
	"grad_norm": 0.250125198289797,
	"kl": 0.0016460418701171875,
	"learning_rate": 9.55824636882301e-07,
	"loss": 0.0768,
	"reward": 0.653087726328522,
	"reward_std": 0.35864404030144215,
	"rewards/cosine_scaled_reward": -0.017206139862537384,
	"rewards/format_reward": 0.6875000149011612,
	"step": 114
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1440.2083740234375,
	"epoch": 0.13142857142857142,
	"grad_norm": 0.29266585256345196,
	"kl": 0.0030155181884765625,
	"learning_rate": 9.54457320834625e-07,
	"loss": 0.0755,
	"reward": 0.21958831325173378,
	"reward_std": 0.704796127974987,
	"rewards/cosine_scaled_reward": -0.16103917988948524,
	"rewards/format_reward": 0.5416666865348816,
	"step": 115
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1472.5416870117188,
	"epoch": 0.13257142857142856,
	"grad_norm": 0.26433038131357134,
	"kl": 0.003147125244140625,
	"learning_rate": 9.530702921077358e-07,
	"loss": 0.073,
	"reward": 0.018861573189496994,
	"reward_std": 0.3587416708469391,
	"rewards/cosine_scaled_reward": -0.18848587945103645,
	"rewards/format_reward": 0.39583334140479565,
	"step": 116
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1545.9375305175781,
	"epoch": 0.1337142857142857,
	"grad_norm": 0.21836493727001577,
	"kl": 0.002864837646484375,
	"learning_rate": 9.516636183034564e-07,
	"loss": 0.1366,
	"reward": -0.32600877061486244,
	"reward_std": 0.43822694569826126,
	"rewards/cosine_scaled_reward": -0.35050439089536667,
	"rewards/format_reward": 0.3750000074505806,
	"step": 117
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1334.1250305175781,
	"epoch": 0.13485714285714287,
	"grad_norm": 0.24394321780710398,
	"kl": 0.0026493072509765625,
	"learning_rate": 9.502373679810839e-07,
	"loss": 0.035,
	"reward": 0.457018606364727,
	"reward_std": 0.5285698734223843,
	"rewards/cosine_scaled_reward": -0.09440736100077629,
	"rewards/format_reward": 0.645833358168602,
	"step": 118
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1220.3333740234375,
	"epoch": 0.136,
	"grad_norm": 0.28272459137828676,
	"kl": 0.0042877197265625,
	"learning_rate": 9.487916106540465e-07,
	"loss": 0.0804,
	"reward": 0.3442453145980835,
	"reward_std": 0.564174473285675,
	"rewards/cosine_scaled_reward": -0.12996070086956024,
	"rewards/format_reward": 0.6041666716337204,
	"step": 119
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1528.6042175292969,
	"epoch": 0.13714285714285715,
	"grad_norm": 0.2668307726658885,
	"kl": 0.00232696533203125,
	"learning_rate": 9.473264167865171e-07,
	"loss": 0.1032,
	"reward": -0.03986197151243687,
	"reward_std": 0.37811761628836393,
	"rewards/cosine_scaled_reward": -0.2282643192447722,
	"rewards/format_reward": 0.4166666716337204,
	"step": 120
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1584.8125610351562,
	"epoch": 0.1382857142857143,
	"grad_norm": 0.22786468100552407,
	"kl": 0.002208709716796875,
	"learning_rate": 9.458418577899774e-07,
	"loss": 0.0046,
	"reward": 0.16309459879994392,
	"reward_std": 0.2453223168849945,
	"rewards/cosine_scaled_reward": -0.1372026912868023,
	"rewards/format_reward": 0.4375000074505806,
	"step": 121
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1462.6875610351562,
	"epoch": 0.13942857142857143,
	"grad_norm": 0.28816738889821486,
	"kl": 0.00514984130859375,
	"learning_rate": 9.443380060197385e-07,
	"loss": 0.0974,
	"reward": -0.12114270869642496,
	"reward_std": 0.2534109205007553,
	"rewards/cosine_scaled_reward": -0.2689046934247017,
	"rewards/format_reward": 0.41666666977107525,
	"step": 122
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1375.7708740234375,
	"epoch": 0.14057142857142857,
	"grad_norm": 0.32101258824146217,
	"kl": 0.0041351318359375,
	"learning_rate": 9.428149347714143e-07,
	"loss": 0.1284,
	"reward": 0.18988706171512604,
	"reward_std": 0.8535008877515793,
	"rewards/cosine_scaled_reward": -0.17588980495929718,
	"rewards/format_reward": 0.541666679084301,
	"step": 123
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1374.7292175292969,
	"epoch": 0.1417142857142857,
	"grad_norm": 0.2425349865258595,
	"kl": 0.00324249267578125,
	"learning_rate": 9.412727182773486e-07,
	"loss": 0.0382,
	"reward": 0.07038946449756622,
	"reward_std": 0.49846766516566277,
	"rewards/cosine_scaled_reward": -0.15230527985841036,
	"rewards/format_reward": 0.375,
	"step": 124
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1664.8541870117188,
	"epoch": 0.14285714285714285,
	"grad_norm": 0.2457250240943947,
	"kl": 0.0025730133056640625,
	"learning_rate": 9.397114317029974e-07,
	"loss": 0.0291,
	"reward": 0.004289238480851054,
	"reward_std": 0.32331261597573757,
	"rewards/cosine_scaled_reward": -0.15410537272691727,
	"rewards/format_reward": 0.31250000186264515,
	"step": 125
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1422.3750305175781,
	"epoch": 0.144,
	"grad_norm": 0.32285843347583837,
	"kl": 0.005126953125,
	"learning_rate": 9.381311511432658e-07,
	"loss": 0.0961,
	"reward": 0.19516459852457047,
	"reward_std": 0.6147220581769943,
	"rewards/cosine_scaled_reward": -0.162834367249161,
	"rewards/format_reward": 0.5208333507180214,
	"step": 126
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1370.7708435058594,
	"epoch": 0.14514285714285713,
	"grad_norm": 0.24341515642410516,
	"kl": 0.0030078887939453125,
	"learning_rate": 9.36531953618799e-07,
	"loss": 0.0726,
	"reward": -0.08839717879891396,
	"reward_std": 0.4017263073474169,
	"rewards/cosine_scaled_reward": -0.2941986061632633,
	"rewards/format_reward": 0.5000000149011612,
	"step": 127
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1219.9167175292969,
	"epoch": 0.1462857142857143,
	"grad_norm": 0.2623858416818109,
	"kl": 0.004070281982421875,
	"learning_rate": 9.34913917072228e-07,
	"loss": 0.0537,
	"reward": 0.43044765666127205,
	"reward_std": 0.49690980464220047,
	"rewards/cosine_scaled_reward": -0.15977618098258972,
	"rewards/format_reward": 0.7500000149011612,
	"step": 128
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1109.1875305175781,
	"epoch": 0.14742857142857144,
	"grad_norm": 0.2829401049059584,
	"kl": 0.00757598876953125,
	"learning_rate": 9.332771203643714e-07,
	"loss": 0.0692,
	"reward": 0.6423492059111595,
	"reward_std": 0.4438105970621109,
	"rewards/cosine_scaled_reward": -0.03299206681549549,
	"rewards/format_reward": 0.7083333358168602,
	"step": 129
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1495.8333740234375,
	"epoch": 0.14857142857142858,
	"grad_norm": 0.23104014201895975,
	"kl": 0.00299835205078125,
	"learning_rate": 9.316216432703916e-07,
	"loss": 0.0064,
	"reward": -0.09923176001757383,
	"reward_std": 0.43960002437233925,
	"rewards/cosine_scaled_reward": -0.29961589351296425,
	"rewards/format_reward": 0.5000000149011612,
	"step": 130
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1543.2500305175781,
	"epoch": 0.14971428571428572,
	"grad_norm": 0.22132261730116032,
	"kl": 0.0033931732177734375,
	"learning_rate": 9.299475664759068e-07,
	"loss": 0.1051,
	"reward": -0.012558471411466599,
	"reward_std": 0.5053001046180725,
	"rewards/cosine_scaled_reward": -0.24586258456110954,
	"rewards/format_reward": 0.47916667722165585,
	"step": 131
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1477.0625,
	"epoch": 0.15085714285714286,
	"grad_norm": 0.2442588816427236,
	"kl": 0.004528045654296875,
	"learning_rate": 9.282549715730579e-07,
	"loss": 0.0768,
	"reward": -0.11025669425725937,
	"reward_std": 0.18197684548795223,
	"rewards/cosine_scaled_reward": -0.284295029938221,
	"rewards/format_reward": 0.4583333432674408,
	"step": 132
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1563.3958740234375,
	"epoch": 0.152,
	"grad_norm": 0.21023108591248665,
	"kl": 0.00415802001953125,
	"learning_rate": 9.265439410565328e-07,
	"loss": 0.0672,
	"reward": 0.13176406361162663,
	"reward_std": 0.5022407323122025,
	"rewards/cosine_scaled_reward": -0.18411797285079956,
	"rewards/format_reward": 0.5000000149011612,
	"step": 133
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1049.1042022705078,
	"epoch": 0.15314285714285714,
	"grad_norm": 0.3838039161390532,
	"kl": 0.00562286376953125,
	"learning_rate": 9.248145583195447e-07,
	"loss": 0.1973,
	"reward": 0.4749515192816034,
	"reward_std": 0.3580738380551338,
	"rewards/cosine_scaled_reward": -0.15835759788751602,
	"rewards/format_reward": 0.7916666865348816,
	"step": 134
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1351.5416870117188,
	"epoch": 0.15428571428571428,
	"grad_norm": 0.34500799880157473,
	"kl": 0.00400543212890625,
	"learning_rate": 9.230669076497687e-07,
	"loss": 0.143,
	"reward": 0.2647483544424176,
	"reward_std": 0.5427017770707607,
	"rewards/cosine_scaled_reward": -0.11762583442032337,
	"rewards/format_reward": 0.5000000204890966,
	"step": 135
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1168.0625305175781,
	"epoch": 0.15542857142857142,
	"grad_norm": 0.31218899888892226,
	"kl": 0.004955291748046875,
	"learning_rate": 9.213010742252327e-07,
	"loss": 0.0562,
	"reward": 0.3584494572132826,
	"reward_std": 0.5529016815125942,
	"rewards/cosine_scaled_reward": -0.17494194395840168,
	"rewards/format_reward": 0.7083333432674408,
	"step": 136
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1282.0416870117188,
	"epoch": 0.15657142857142858,
	"grad_norm": 0.2721613126225875,
	"kl": 0.007869720458984375,
	"learning_rate": 9.195171441101668e-07,
	"loss": 0.1358,
	"reward": 0.2924184873700142,
	"reward_std": 0.5777250528335571,
	"rewards/cosine_scaled_reward": -0.16629073955118656,
	"rewards/format_reward": 0.6250000055879354,
	"step": 137
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1041.8958740234375,
	"epoch": 0.15771428571428572,
	"grad_norm": 0.30890354701331296,
	"kl": 0.00521087646484375,
	"learning_rate": 9.177152042508077e-07,
	"loss": 0.0338,
	"reward": 0.860385000705719,
	"reward_std": 0.8024220168590546,
	"rewards/cosine_scaled_reward": 0.023942476138472557,
	"rewards/format_reward": 0.8125000149011612,
	"step": 138
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1192.8125457763672,
	"epoch": 0.15885714285714286,
	"grad_norm": 0.2622844914783918,
	"kl": 0.00412750244140625,
	"learning_rate": 9.158953424711624e-07,
	"loss": 0.0825,
	"reward": 0.5425689108669758,
	"reward_std": 0.5253265127539635,
	"rewards/cosine_scaled_reward": -0.12454888969659805,
	"rewards/format_reward": 0.7916666865348816,
	"step": 139
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1283.4375305175781,
	"epoch": 0.16,
	"grad_norm": 0.24897560413424463,
	"kl": 0.004375457763671875,
	"learning_rate": 9.140576474687263e-07,
	"loss": 0.1075,
	"reward": 0.3927510902285576,
	"reward_std": 0.43108681961894035,
	"rewards/cosine_scaled_reward": -0.1577911265194416,
	"rewards/format_reward": 0.7083333507180214,
	"step": 140
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1262.7916717529297,
	"epoch": 0.16114285714285714,
	"grad_norm": 0.3772239136734691,
	"kl": 0.005344390869140625,
	"learning_rate": 9.122022088101613e-07,
	"loss": 0.1713,
	"reward": 0.37745123356580734,
	"reward_std": 0.5623941943049431,
	"rewards/cosine_scaled_reward": -0.13419108092784882,
	"rewards/format_reward": 0.6458333432674408,
	"step": 141
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1246.6042022705078,
	"epoch": 0.16228571428571428,
	"grad_norm": 0.28965855619789826,
	"kl": 0.0045166015625,
	"learning_rate": 9.103291169269299e-07,
	"loss": 0.0725,
	"reward": 0.5083264335989952,
	"reward_std": 0.5853047892451286,
	"rewards/cosine_scaled_reward": -0.047920111566782,
	"rewards/format_reward": 0.6041666828095913,
	"step": 142
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1448.9375610351562,
	"epoch": 0.16342857142857142,
	"grad_norm": 0.2549900151123108,
	"kl": 0.006290435791015625,
	"learning_rate": 9.084384631108882e-07,
	"loss": 0.1142,
	"reward": 0.13985165720805526,
	"reward_std": 0.2659877985715866,
	"rewards/cosine_scaled_reward": -0.20090750604867935,
	"rewards/format_reward": 0.5416666865348816,
	"step": 143
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1203.2083740234375,
	"epoch": 0.16457142857142856,
	"grad_norm": 0.2224971436424363,
	"kl": 0.005550384521484375,
	"learning_rate": 9.065303395098358e-07,
	"loss": 0.085,
	"reward": 0.5334329381585121,
	"reward_std": 0.5584629252552986,
	"rewards/cosine_scaled_reward": -0.10828354395925999,
	"rewards/format_reward": 0.75,
	"step": 144
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1086.9792175292969,
	"epoch": 0.1657142857142857,
	"grad_norm": 0.3813865927902241,
	"kl": 0.0063629150390625,
	"learning_rate": 9.046048391230247e-07,
	"loss": 0.1879,
	"reward": 0.2875216994434595,
	"reward_std": 0.5303685888648033,
	"rewards/cosine_scaled_reward": -0.23123916238546371,
	"rewards/format_reward": 0.7500000149011612,
	"step": 145
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1285.5625610351562,
	"epoch": 0.16685714285714287,
	"grad_norm": 0.2972743546494519,
	"kl": 0.0059814453125,
	"learning_rate": 9.026620557966279e-07,
	"loss": 0.0594,
	"reward": 0.2565866466611624,
	"reward_std": 0.46598899737000465,
	"rewards/cosine_scaled_reward": -0.25712333619594574,
	"rewards/format_reward": 0.770833358168602,
	"step": 146
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 896.6250305175781,
	"epoch": 0.168,
	"grad_norm": 0.32647401434979056,
	"kl": 0.006877899169921875,
	"learning_rate": 9.007020842191634e-07,
	"loss": -0.0011,
	"reward": 1.0985181145370007,
	"reward_std": 0.5338096916675568,
	"rewards/cosine_scaled_reward": 0.05967570189386606,
	"rewards/format_reward": 0.9791666716337204,
	"step": 147
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1160.9583435058594,
	"epoch": 0.16914285714285715,
	"grad_norm": 0.2816274273158885,
	"kl": 0.00585174560546875,
	"learning_rate": 8.987250199168808e-07,
	"loss": 0.0442,
	"reward": 0.18387611024081707,
	"reward_std": 0.2959946282207966,
	"rewards/cosine_scaled_reward": -0.3143119588494301,
	"rewards/format_reward": 0.8125000149011612,
	"step": 148
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1223.9791870117188,
	"epoch": 0.1702857142857143,
	"grad_norm": 0.2823488259457116,
	"kl": 0.00612640380859375,
	"learning_rate": 8.967309592491052e-07,
	"loss": 0.0654,
	"reward": 0.47756416723132133,
	"reward_std": 0.7413289695978165,
	"rewards/cosine_scaled_reward": -0.11538459919393063,
	"rewards/format_reward": 0.7083333432674408,
	"step": 149
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 986.6042022705078,
	"epoch": 0.17142857142857143,
	"grad_norm": 0.318064277562745,
	"kl": 0.0080413818359375,
	"learning_rate": 8.9471999940354e-07,
	"loss": 0.1332,
	"reward": 0.401881605386734,
	"reward_std": 0.6674076840281487,
	"rewards/cosine_scaled_reward": -0.20530920289456844,
	"rewards/format_reward": 0.8125000298023224,
	"step": 150
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1111.4375610351562,
	"epoch": 0.17257142857142857,
	"grad_norm": 0.23750964874516883,
	"kl": 0.005405426025390625,
	"learning_rate": 8.926922383915315e-07,
	"loss": 0.0547,
	"reward": 0.42157261446118355,
	"reward_std": 0.2637167125940323,
	"rewards/cosine_scaled_reward": -0.14338038116693497,
	"rewards/format_reward": 0.7083333432674408,
	"step": 151
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1390.2292175292969,
	"epoch": 0.1737142857142857,
	"grad_norm": 0.3108688575018839,
	"kl": 0.008697509765625,
	"learning_rate": 8.906477750432903e-07,
	"loss": 0.1077,
	"reward": 0.11867762915790081,
	"reward_std": 0.5801703371107578,
	"rewards/cosine_scaled_reward": -0.2739945203065872,
	"rewards/format_reward": 0.6666666716337204,
	"step": 152
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1189.4792022705078,
	"epoch": 0.17485714285714285,
	"grad_norm": 0.22859697466435477,
	"kl": 0.006011962890625,
	"learning_rate": 8.88586709003076e-07,
	"loss": 0.0402,
	"reward": 0.46854234486818314,
	"reward_std": 0.5257667489349842,
	"rewards/cosine_scaled_reward": -0.07822884852066636,
	"rewards/format_reward": 0.6250000149011612,
	"step": 153
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1227.7292175292969,
	"epoch": 0.176,
	"grad_norm": 0.23458511838935112,
	"kl": 0.0063323974609375,
	"learning_rate": 8.865091407243394e-07,
	"loss": 0.129,
	"reward": 0.7308447554241866,
	"reward_std": 0.4724605940282345,
	"rewards/cosine_scaled_reward": 0.011255700141191483,
	"rewards/format_reward": 0.7083333432674408,
	"step": 154
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1320.6666870117188,
	"epoch": 0.17714285714285713,
	"grad_norm": 0.29059316598505575,
	"kl": 0.007198333740234375,
	"learning_rate": 8.844151714648274e-07,
	"loss": 0.1327,
	"reward": -0.1417454145848751,
	"reward_std": 0.3702365458011627,
	"rewards/cosine_scaled_reward": -0.3521227166056633,
	"rewards/format_reward": 0.5625000055879354,
	"step": 155
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1116.000015258789,
	"epoch": 0.1782857142857143,
	"grad_norm": 0.37926874198201693,
	"kl": 0.00821685791015625,
	"learning_rate": 8.823049032816478e-07,
	"loss": 0.2189,
	"reward": 0.15536441165022552,
	"reward_std": 0.2769140414893627,
	"rewards/cosine_scaled_reward": -0.2869011387228966,
	"rewards/format_reward": 0.729166679084301,
	"step": 156
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1049.5000305175781,
	"epoch": 0.17942857142857144,
	"grad_norm": 0.3728612044799926,
	"kl": 0.02156829833984375,
	"learning_rate": 8.801784390262943e-07,
	"loss": 0.0389,
	"reward": 0.7612650550436229,
	"reward_std": 0.31401624344289303,
	"rewards/cosine_scaled_reward": 0.005632489919662476,
	"rewards/format_reward": 0.75,
	"step": 157
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1386.0833740234375,
	"epoch": 0.18057142857142858,
	"grad_norm": 0.3889317879381384,
	"kl": 0.00850677490234375,
	"learning_rate": 8.780358823396352e-07,
	"loss": 0.1308,
	"reward": 0.06261628679931164,
	"reward_std": 0.3530626520514488,
	"rewards/cosine_scaled_reward": -0.3124418593943119,
	"rewards/format_reward": 0.6875000298023224,
	"step": 158
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1129.7708740234375,
	"epoch": 0.18171428571428572,
	"grad_norm": 0.3220977926530576,
	"kl": 0.00748443603515625,
	"learning_rate": 8.758773376468604e-07,
	"loss": 0.1262,
	"reward": 0.6195714063942432,
	"reward_std": 0.6696993261575699,
	"rewards/cosine_scaled_reward": -0.08604763355106115,
	"rewards/format_reward": 0.7916666865348816,
	"step": 159
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1160.4167175292969,
	"epoch": 0.18285714285714286,
	"grad_norm": 0.2845576646765754,
	"kl": 0.0069122314453125,
	"learning_rate": 8.737029101523929e-07,
	"loss": 0.1064,
	"reward": 0.6454856535419822,
	"reward_std": 0.8377318382263184,
	"rewards/cosine_scaled_reward": -0.08350718393921852,
	"rewards/format_reward": 0.8125000149011612,
	"step": 160
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1429.4792175292969,
	"epoch": 0.184,
	"grad_norm": 0.25219633802451885,
	"kl": 0.00858306884765625,
	"learning_rate": 8.715127058347614e-07,
	"loss": 0.0965,
	"reward": 0.009432412683963776,
	"reward_std": 0.3042390923947096,
	"rewards/cosine_scaled_reward": -0.27653381787240505,
	"rewards/format_reward": 0.5625000149011612,
	"step": 161
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1226.7917175292969,
	"epoch": 0.18514285714285714,
	"grad_norm": 0.26347106975524837,
	"kl": 0.0080108642578125,
	"learning_rate": 8.693068314414344e-07,
	"loss": 0.077,
	"reward": 0.24512136541306973,
	"reward_std": 0.43705228716135025,
	"rewards/cosine_scaled_reward": -0.29410600662231445,
	"rewards/format_reward": 0.8333333432674408,
	"step": 162
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 926.1250305175781,
	"epoch": 0.18628571428571428,
	"grad_norm": 0.41739039022115654,
	"kl": 0.0112457275390625,
	"learning_rate": 8.670853944836176e-07,
	"loss": 0.1827,
	"reward": 0.7628292813897133,
	"reward_std": 0.8151352852582932,
	"rewards/cosine_scaled_reward": -0.04566871002316475,
	"rewards/format_reward": 0.8541666716337204,
	"step": 163
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 946.125,
	"epoch": 0.18742857142857142,
	"grad_norm": 0.36967841595429546,
	"kl": 0.01031494140625,
	"learning_rate": 8.648485032310144e-07,
	"loss": 0.1834,
	"reward": 0.6057916302233934,
	"reward_std": 0.48515384271740913,
	"rewards/cosine_scaled_reward": -0.10335419327020645,
	"rewards/format_reward": 0.8125000298023224,
	"step": 164
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1174.4792175292969,
	"epoch": 0.18857142857142858,
	"grad_norm": 0.2708332867444507,
	"kl": 0.00788116455078125,
	"learning_rate": 8.625962667065487e-07,
	"loss": 0.0394,
	"reward": 0.32264771312475204,
	"reward_std": 0.4833778813481331,
	"rewards/cosine_scaled_reward": -0.24492615275084972,
	"rewards/format_reward": 0.8125000149011612,
	"step": 165
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 907.0833740234375,
	"epoch": 0.18971428571428572,
	"grad_norm": 0.3261002575687217,
	"kl": 0.00717926025390625,
	"learning_rate": 8.603287946810513e-07,
	"loss": 0.1283,
	"reward": 0.6173169314861298,
	"reward_std": 0.2740478292107582,
	"rewards/cosine_scaled_reward": -0.1705082282423973,
	"rewards/format_reward": 0.9583333432674408,
	"step": 166
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1270.0417175292969,
	"epoch": 0.19085714285714286,
	"grad_norm": 0.3129560409827591,
	"kl": 0.00971221923828125,
	"learning_rate": 8.580461976679099e-07,
	"loss": 0.1093,
	"reward": 0.3518100567162037,
	"reward_std": 0.5595069229602814,
	"rewards/cosine_scaled_reward": -0.1470116525888443,
	"rewards/format_reward": 0.645833358168602,
	"step": 167
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1077.1041870117188,
	"epoch": 0.192,
	"grad_norm": 0.26915582005747507,
	"kl": 0.0078125,
	"learning_rate": 8.557485869176825e-07,
	"loss": 0.1617,
	"reward": 0.2642595246434212,
	"reward_std": 0.46994560211896896,
	"rewards/cosine_scaled_reward": -0.26370356790721416,
	"rewards/format_reward": 0.7916666865348816,
	"step": 168
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1156.2500305175781,
	"epoch": 0.19314285714285714,
	"grad_norm": 0.35785552736378773,
	"kl": 0.0098724365234375,
	"learning_rate": 8.534360744126753e-07,
	"loss": 0.0922,
	"reward": 0.77548947930336,
	"reward_std": 0.7726699560880661,
	"rewards/cosine_scaled_reward": 0.0023280568420886993,
	"rewards/format_reward": 0.770833358168602,
	"step": 169
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1073.7292175292969,
	"epoch": 0.19428571428571428,
	"grad_norm": 0.32755955253118335,
	"kl": 0.0117034912109375,
	"learning_rate": 8.511087728614862e-07,
	"loss": 0.0752,
	"reward": 0.19202834740281105,
	"reward_std": 0.3850276917219162,
	"rewards/cosine_scaled_reward": -0.3206525072455406,
	"rewards/format_reward": 0.8333333432674408,
	"step": 170
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 918.8333511352539,
	"epoch": 0.19542857142857142,
	"grad_norm": 0.3616914993674,
	"kl": 0.00833892822265625,
	"learning_rate": 8.487667956935087e-07,
	"loss": 0.0904,
	"reward": 0.5478162653744221,
	"reward_std": 0.6629246398806572,
	"rewards/cosine_scaled_reward": -0.1948418878018856,
	"rewards/format_reward": 0.9375,
	"step": 171
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1036.1458740234375,
	"epoch": 0.19657142857142856,
	"grad_norm": 0.3354400822116869,
	"kl": 0.0130157470703125,
	"learning_rate": 8.464102570534061e-07,
	"loss": 0.0669,
	"reward": 0.7608658275566995,
	"reward_std": 0.6014236621558666,
	"rewards/cosine_scaled_reward": -0.04665040969848633,
	"rewards/format_reward": 0.8541666865348816,
	"step": 172
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1106.4583740234375,
	"epoch": 0.1977142857142857,
	"grad_norm": 0.3236947350770136,
	"kl": 0.0121307373046875,
	"learning_rate": 8.440392717955475e-07,
	"loss": 0.093,
	"reward": 0.7088564559817314,
	"reward_std": 0.4235651511698961,
	"rewards/cosine_scaled_reward": -0.010155089199543,
	"rewards/format_reward": 0.7291666865348816,
	"step": 173
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1109.5000305175781,
	"epoch": 0.19885714285714284,
	"grad_norm": 0.37244639543702895,
	"kl": 0.015838623046875,
	"learning_rate": 8.416539554784089e-07,
	"loss": 0.1098,
	"reward": 0.17886048182845116,
	"reward_std": 0.35543810576200485,
	"rewards/cosine_scaled_reward": -0.30640310421586037,
	"rewards/format_reward": 0.7916666865348816,
	"step": 174
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 972.9167022705078,
	"epoch": 0.2,
	"grad_norm": 0.6554774460546362,
	"kl": 0.0153045654296875,
	"learning_rate": 8.392544243589427e-07,
	"loss": 0.2068,
	"reward": 0.607050247490406,
	"reward_std": 0.4396999180316925,
	"rewards/cosine_scaled_reward": -0.14439154416322708,
	"rewards/format_reward": 0.895833358168602,
	"step": 175
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 998.2291870117188,
	"epoch": 0.20114285714285715,
	"grad_norm": 0.28748166515655293,
	"kl": 0.0133819580078125,
	"learning_rate": 8.368407953869103e-07,
	"loss": 0.0371,
	"reward": 0.486224377527833,
	"reward_std": 0.6124172061681747,
	"rewards/cosine_scaled_reward": -0.17355448007583618,
	"rewards/format_reward": 0.8333333432674408,
	"step": 176
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 916.2291870117188,
	"epoch": 0.2022857142857143,
	"grad_norm": 0.4438177799902679,
	"kl": 0.0131378173828125,
	"learning_rate": 8.344131861991828e-07,
	"loss": 0.1487,
	"reward": 0.8074519336223602,
	"reward_std": 0.4988584369421005,
	"rewards/cosine_scaled_reward": -0.05460738018155098,
	"rewards/format_reward": 0.9166666865348816,
	"step": 177
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 822.8333740234375,
	"epoch": 0.20342857142857143,
	"grad_norm": 0.5173286289503403,
	"kl": 0.0179443359375,
	"learning_rate": 8.319717151140072e-07,
	"loss": 0.1961,
	"reward": 1.0362385213375092,
	"reward_std": 0.5397170335054398,
	"rewards/cosine_scaled_reward": 0.13270257785916328,
	"rewards/format_reward": 0.770833358168602,
	"step": 178
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 959.9792022705078,
	"epoch": 0.20457142857142857,
	"grad_norm": 0.369107073779179,
	"kl": 0.016815185546875,
	"learning_rate": 8.295165011252396e-07,
	"loss": 0.1417,
	"reward": 0.6556574255228043,
	"reward_std": 0.4815560430288315,
	"rewards/cosine_scaled_reward": -0.10967130470089614,
	"rewards/format_reward": 0.8750000149011612,
	"step": 179
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1162.7292022705078,
	"epoch": 0.2057142857142857,
	"grad_norm": 0.5036563993456736,
	"kl": 0.01904296875,
	"learning_rate": 8.270476638965461e-07,
	"loss": 0.0949,
	"reward": 0.2779462654143572,
	"reward_std": 0.4615231901407242,
	"rewards/cosine_scaled_reward": -0.24644354078918695,
	"rewards/format_reward": 0.7708333432674408,
	"step": 180
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1078.8333435058594,
	"epoch": 0.20685714285714285,
	"grad_norm": 0.4317948665990577,
	"kl": 0.0135955810546875,
	"learning_rate": 8.245653237555705e-07,
	"loss": 0.1473,
	"reward": 0.6264736168086529,
	"reward_std": 0.5298948585987091,
	"rewards/cosine_scaled_reward": -0.10342983156442642,
	"rewards/format_reward": 0.8333333432674408,
	"step": 181
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1065.8125305175781,
	"epoch": 0.208,
	"grad_norm": 0.5168299485262725,
	"kl": 0.02105712890625,
	"learning_rate": 8.220696016880687e-07,
	"loss": 0.1884,
	"reward": 0.3882112614810467,
	"reward_std": 0.5859006345272064,
	"rewards/cosine_scaled_reward": -0.2017277143895626,
	"rewards/format_reward": 0.7916666865348816,
	"step": 182
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1069.4583740234375,
	"epoch": 0.20914285714285713,
	"grad_norm": 0.5024855038579699,
	"kl": 0.0205078125,
	"learning_rate": 8.195606193320136e-07,
	"loss": 0.1323,
	"reward": 0.24412129819393158,
	"reward_std": 0.47408775985240936,
	"rewards/cosine_scaled_reward": -0.2529393620789051,
	"rewards/format_reward": 0.7500000149011612,
	"step": 183
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 936.4792022705078,
	"epoch": 0.2102857142857143,
	"grad_norm": 0.4981488833418968,
	"kl": 0.017730712890625,
	"learning_rate": 8.170384989716657e-07,
	"loss": 0.137,
	"reward": 0.6930912919342518,
	"reward_std": 0.5617035925388336,
	"rewards/cosine_scaled_reward": -0.03887102263979614,
	"rewards/format_reward": 0.7708333432674408,
	"step": 184
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1030.4166717529297,
	"epoch": 0.21142857142857144,
	"grad_norm": 0.4904295101939947,
	"kl": 0.0301513671875,
	"learning_rate": 8.145033635316128e-07,
	"loss": 0.1667,
	"reward": 0.07037857547402382,
	"reward_std": 0.27715054154396057,
	"rewards/cosine_scaled_reward": -0.33981072157621384,
	"rewards/format_reward": 0.7500000298023224,
	"step": 185
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1297.3750457763672,
	"epoch": 0.21257142857142858,
	"grad_norm": 0.359329704495533,
	"kl": 0.02447509765625,
	"learning_rate": 8.119553365707802e-07,
	"loss": 0.0722,
	"reward": 0.27740756422281265,
	"reward_std": 0.35020239651203156,
	"rewards/cosine_scaled_reward": -0.20504622161388397,
	"rewards/format_reward": 0.6875000149011612,
	"step": 186
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1278.2291870117188,
	"epoch": 0.21371428571428572,
	"grad_norm": 0.6229091446373484,
	"kl": 0.037841796875,
	"learning_rate": 8.093945422764069e-07,
	"loss": 0.159,
	"reward": 0.6862413678318262,
	"reward_std": 0.806188240647316,
	"rewards/cosine_scaled_reward": -0.011045984923839569,
	"rewards/format_reward": 0.7083333507180214,
	"step": 187
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 828.7083587646484,
	"epoch": 0.21485714285714286,
	"grad_norm": 0.8396211982213951,
	"kl": 0.029296875,
	"learning_rate": 8.068211054579943e-07,
	"loss": 0.1705,
	"reward": 0.5941705331206322,
	"reward_std": 0.6708386167883873,
	"rewards/cosine_scaled_reward": -0.12999806739389896,
	"rewards/format_reward": 0.8541666865348816,
	"step": 188
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1126.9583740234375,
	"epoch": 0.216,
	"grad_norm": 1.0692586435721545,
	"kl": 0.05059814453125,
	"learning_rate": 8.04235151541222e-07,
	"loss": 0.2306,
	"reward": 0.3716874085366726,
	"reward_std": 0.6852569133043289,
	"rewards/cosine_scaled_reward": -0.17873962549492717,
	"rewards/format_reward": 0.7291666939854622,
	"step": 189
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1441.7292175292969,
	"epoch": 0.21714285714285714,
	"grad_norm": 0.4556901372243305,
	"kl": 0.0775146484375,
	"learning_rate": 8.01636806561836e-07,
	"loss": 0.0641,
	"reward": -0.02832420915365219,
	"reward_std": 0.41898399591445923,
	"rewards/cosine_scaled_reward": -0.21207877062261105,
	"rewards/format_reward": 0.3958333432674408,
	"step": 190
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1079.3541870117188,
	"epoch": 0.21828571428571428,
	"grad_norm": 0.7752155732582218,
	"kl": 0.05340576171875,
	"learning_rate": 7.990261971595048e-07,
	"loss": 0.1862,
	"reward": 0.4970630258321762,
	"reward_std": 0.6355597376823425,
	"rewards/cosine_scaled_reward": -0.1264684833586216,
	"rewards/format_reward": 0.7500000149011612,
	"step": 191
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1067.5417175292969,
	"epoch": 0.21942857142857142,
	"grad_norm": 0.9433921479755671,
	"kl": 0.0628662109375,
	"learning_rate": 7.964034505716476e-07,
	"loss": 0.1345,
	"reward": 0.34896004013717175,
	"reward_std": 0.44530968368053436,
	"rewards/cosine_scaled_reward": -0.19010332133620977,
	"rewards/format_reward": 0.7291666865348816,
	"step": 192
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1153.1458892822266,
	"epoch": 0.22057142857142858,
	"grad_norm": 0.557299045473737,
	"kl": 0.07440185546875,
	"learning_rate": 7.93768694627233e-07,
	"loss": 0.0623,
	"reward": 0.3937496952712536,
	"reward_std": 0.4528709352016449,
	"rewards/cosine_scaled_reward": -0.14687515422701836,
	"rewards/format_reward": 0.6875000149011612,
	"step": 193
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 772.7708435058594,
	"epoch": 0.22171428571428572,
	"grad_norm": 1.0195572615380695,
	"kl": 0.03753662109375,
	"learning_rate": 7.911220577405484e-07,
	"loss": 0.1994,
	"reward": 1.379511073231697,
	"reward_std": 0.604660227894783,
	"rewards/cosine_scaled_reward": 0.23142218962311745,
	"rewards/format_reward": 0.9166666865348816,
	"step": 194
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1043.2708740234375,
	"epoch": 0.22285714285714286,
	"grad_norm": 0.9603645520119819,
	"kl": 0.057830810546875,
	"learning_rate": 7.884636689049422e-07,
	"loss": 0.101,
	"reward": 0.9527463093400002,
	"reward_std": 0.651703879237175,
	"rewards/cosine_scaled_reward": 0.12220647558569908,
	"rewards/format_reward": 0.7083333432674408,
	"step": 195
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1171.8125305175781,
	"epoch": 0.224,
	"grad_norm": 1.0759043540199384,
	"kl": 0.094970703125,
	"learning_rate": 7.857936576865356e-07,
	"loss": 0.0986,
	"reward": 0.22757766395807266,
	"reward_std": 0.5421559736132622,
	"rewards/cosine_scaled_reward": -0.14662783965468407,
	"rewards/format_reward": 0.5208333488553762,
	"step": 196
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1254.6458892822266,
	"epoch": 0.22514285714285714,
	"grad_norm": 1.2281398548522355,
	"kl": 0.1163330078125,
	"learning_rate": 7.831121542179086e-07,
	"loss": 0.2334,
	"reward": 0.1120694987475872,
	"reward_std": 0.406834427267313,
	"rewards/cosine_scaled_reward": -0.21479860320687294,
	"rewards/format_reward": 0.541666679084301,
	"step": 197
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1551.9792175292969,
	"epoch": 0.22628571428571428,
	"grad_norm": 1.2807709220712407,
	"kl": 0.1573486328125,
	"learning_rate": 7.804192891917571e-07,
	"loss": 0.1642,
	"reward": 0.1520095318555832,
	"reward_std": 0.5469059012830257,
	"rewards/cosine_scaled_reward": -0.16357857827097178,
	"rewards/format_reward": 0.4791666716337204,
	"step": 198
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1243.7500457763672,
	"epoch": 0.22742857142857142,
	"grad_norm": 1.2387930807523095,
	"kl": 0.1546630859375,
	"learning_rate": 7.777151938545235e-07,
	"loss": 0.0664,
	"reward": 0.5908387266099453,
	"reward_std": 0.44286736100912094,
	"rewards/cosine_scaled_reward": 0.014169345609843731,
	"rewards/format_reward": 0.5625000149011612,
	"step": 199
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 998.3542022705078,
	"epoch": 0.22857142857142856,
	"grad_norm": 1.6258231243608119,
	"kl": 0.146240234375,
	"learning_rate": 7.75e-07,
	"loss": 0.223,
	"reward": 0.9689896870404482,
	"reward_std": 0.6490836925804615,
	"rewards/cosine_scaled_reward": 0.10949480719864368,
	"rewards/format_reward": 0.7500000298023224,
	"step": 200
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1197.5208587646484,
	"epoch": 0.2297142857142857,
	"grad_norm": 1.2117522808382983,
	"kl": 0.15203857421875,
	"learning_rate": 7.72273839962904e-07,
	"loss": 0.1108,
	"reward": 0.29535099118947983,
	"reward_std": 0.6659888252615929,
	"rewards/cosine_scaled_reward": -0.18565785279497504,
	"rewards/format_reward": 0.6666666865348816,
	"step": 201
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1035.2292022705078,
	"epoch": 0.23085714285714284,
	"grad_norm": 2.430024645446878,
	"kl": 0.1729736328125,
	"learning_rate": 7.695368466124296e-07,
	"loss": 0.1341,
	"reward": 0.4362456016242504,
	"reward_std": 0.665816992521286,
	"rewards/cosine_scaled_reward": -0.13604386523365974,
	"rewards/format_reward": 0.7083333432674408,
	"step": 202
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1285.2083587646484,
	"epoch": 0.232,
	"grad_norm": 3.5252314114631926,
	"kl": 0.2603759765625,
	"learning_rate": 7.667891533457718e-07,
	"loss": 0.2005,
	"reward": 0.48519248701632023,
	"reward_std": 0.612464651465416,
	"rewards/cosine_scaled_reward": -0.08032042533159256,
	"rewards/format_reward": 0.6458333432674408,
	"step": 203
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 835.0833587646484,
	"epoch": 0.23314285714285715,
	"grad_norm": 1.8381824332135883,
	"kl": 0.1859130859375,
	"learning_rate": 7.640308940816239e-07,
	"loss": 0.053,
	"reward": 1.2399137616157532,
	"reward_std": 0.6745168194174767,
	"rewards/cosine_scaled_reward": 0.2241235449910164,
	"rewards/format_reward": 0.7916666865348816,
	"step": 204
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1421.9583435058594,
	"epoch": 0.2342857142857143,
	"grad_norm": 1.7567396533005133,
	"kl": 0.362548828125,
	"learning_rate": 7.612622032536507e-07,
	"loss": 0.1051,
	"reward": 0.3085259608924389,
	"reward_std": 0.6349210105836391,
	"rewards/cosine_scaled_reward": -0.08532036282122135,
	"rewards/format_reward": 0.4791666865348816,
	"step": 205
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1228.7083892822266,
	"epoch": 0.23542857142857143,
	"grad_norm": 2.3389392066981562,
	"kl": 0.30615234375,
	"learning_rate": 7.584832158039378e-07,
	"loss": 0.0693,
	"reward": 0.18148453161120415,
	"reward_std": 0.5284193530678749,
	"rewards/cosine_scaled_reward": -0.24259107932448387,
	"rewards/format_reward": 0.6666666716337204,
	"step": 206
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1070.4792022705078,
	"epoch": 0.23657142857142857,
	"grad_norm": 3.543320557594463,
	"kl": 0.26416015625,
	"learning_rate": 7.556940671764124e-07,
	"loss": 0.1883,
	"reward": 0.542645301669836,
	"reward_std": 0.5379708558320999,
	"rewards/cosine_scaled_reward": -0.12451068125665188,
	"rewards/format_reward": 0.7916666716337204,
	"step": 207
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1317.2708740234375,
	"epoch": 0.2377142857142857,
	"grad_norm": 1.9754558032385148,
	"kl": 0.6748046875,
	"learning_rate": 7.528948933102438e-07,
	"loss": 0.1365,
	"reward": 0.09549727046396583,
	"reward_std": 0.3623932749032974,
	"rewards/cosine_scaled_reward": -0.2126680426299572,
	"rewards/format_reward": 0.5208333395421505,
	"step": 208
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 919.9375457763672,
	"epoch": 0.23885714285714285,
	"grad_norm": 3.305425458945869,
	"kl": 0.474609375,
	"learning_rate": 7.500858306332172e-07,
	"loss": 0.0593,
	"reward": 0.7489641904830933,
	"reward_std": 0.4507276937365532,
	"rewards/cosine_scaled_reward": 0.030732073821127415,
	"rewards/format_reward": 0.6875000149011612,
	"step": 209
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 908.8958435058594,
	"epoch": 0.24,
	"grad_norm": 3.7173678494051496,
	"kl": 0.403564453125,
	"learning_rate": 7.472670160550848e-07,
	"loss": 0.1606,
	"reward": 0.7559212893247604,
	"reward_std": 0.5382421165704727,
	"rewards/cosine_scaled_reward": -0.007456040009856224,
	"rewards/format_reward": 0.7708333432674408,
	"step": 210
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1479.0416870117188,
	"epoch": 0.24114285714285713,
	"grad_norm": 39.96198653082631,
	"kl": 2.5693359375,
	"learning_rate": 7.444385869608921e-07,
	"loss": 0.2707,
	"reward": 0.03475058265030384,
	"reward_std": 0.3246455695480108,
	"rewards/cosine_scaled_reward": -0.18054138123989105,
	"rewards/format_reward": 0.39583333395421505,
	"step": 211
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1051.0208587646484,
	"epoch": 0.2422857142857143,
	"grad_norm": 3.2393755765757777,
	"kl": 0.53466796875,
	"learning_rate": 7.416006812042827e-07,
	"loss": 0.0958,
	"reward": 0.6123923324048519,
	"reward_std": 0.5387515500187874,
	"rewards/cosine_scaled_reward": -0.04797050543129444,
	"rewards/format_reward": 0.708333358168602,
	"step": 212
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1301.3750610351562,
	"epoch": 0.24342857142857144,
	"grad_norm": 2.65733014184082,
	"kl": 0.755859375,
	"learning_rate": 7.387534371007797e-07,
	"loss": 0.1374,
	"reward": 0.1711240354925394,
	"reward_std": 0.42111407220363617,
	"rewards/cosine_scaled_reward": -0.16443797945976257,
	"rewards/format_reward": 0.5000000149011612,
	"step": 213
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1203.6875610351562,
	"epoch": 0.24457142857142858,
	"grad_norm": 2.501952170306742,
	"kl": 0.50732421875,
	"learning_rate": 7.358969934210438e-07,
	"loss": 0.1105,
	"reward": 0.22278533224016428,
	"reward_std": 0.434869222342968,
	"rewards/cosine_scaled_reward": -0.22194067016243935,
	"rewards/format_reward": 0.6666667014360428,
	"step": 214
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1249.6667022705078,
	"epoch": 0.24571428571428572,
	"grad_norm": 4.086485386572322,
	"kl": 0.880859375,
	"learning_rate": 7.330314893841101e-07,
	"loss": 0.0173,
	"reward": 0.3316160347312689,
	"reward_std": 0.5279753059148788,
	"rewards/cosine_scaled_reward": -0.14669198356568813,
	"rewards/format_reward": 0.6250000223517418,
	"step": 215
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1369.1666870117188,
	"epoch": 0.24685714285714286,
	"grad_norm": 3.328918162087878,
	"kl": 0.773193359375,
	"learning_rate": 7.301570646506027e-07,
	"loss": 0.1402,
	"reward": 0.2145287273451686,
	"reward_std": 0.5796768814325333,
	"rewards/cosine_scaled_reward": -0.16356897167861462,
	"rewards/format_reward": 0.5416666865348816,
	"step": 216
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1269.8542175292969,
	"epoch": 0.248,
	"grad_norm": 2.8333189883709515,
	"kl": 0.75927734375,
	"learning_rate": 7.27273859315928e-07,
	"loss": -0.0115,
	"reward": 0.5310591869056225,
	"reward_std": 0.4825605973601341,
	"rewards/cosine_scaled_reward": -0.057387083768844604,
	"rewards/format_reward": 0.645833358168602,
	"step": 217
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1252.2708740234375,
	"epoch": 0.24914285714285714,
	"grad_norm": 4.762778702423241,
	"kl": 0.74072265625,
	"learning_rate": 7.243820139034464e-07,
	"loss": 0.1477,
	"reward": 0.5015929639339447,
	"reward_std": 0.3994259871542454,
	"rewards/cosine_scaled_reward": -0.07212021434679627,
	"rewards/format_reward": 0.6458333507180214,
	"step": 218
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1017.5416870117188,
	"epoch": 0.2502857142857143,
	"grad_norm": 4.164501369060878,
	"kl": 1.07958984375,
	"learning_rate": 7.214816693576234e-07,
	"loss": 0.1337,
	"reward": 0.767455330118537,
	"reward_std": 0.5030167028307915,
	"rewards/cosine_scaled_reward": 0.07122766599059105,
	"rewards/format_reward": 0.6250000223517418,
	"step": 219
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1053.1875457763672,
	"epoch": 0.25142857142857145,
	"grad_norm": 3.588996799420188,
	"kl": 0.71142578125,
	"learning_rate": 7.185729670371604e-07,
	"loss": 0.1866,
	"reward": 0.48609594255685806,
	"reward_std": 0.617650680243969,
	"rewards/cosine_scaled_reward": -0.11111870361492038,
	"rewards/format_reward": 0.7083333432674408,
	"step": 220
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1244.687515258789,
	"epoch": 0.25257142857142856,
	"grad_norm": 2.946733537468475,
	"kl": 1.330078125,
	"learning_rate": 7.156560487081051e-07,
	"loss": 0.1268,
	"reward": 0.4570632018148899,
	"reward_std": 0.36856189370155334,
	"rewards/cosine_scaled_reward": -0.0006350576877593994,
	"rewards/format_reward": 0.4583333395421505,
	"step": 221
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1376.9167175292969,
	"epoch": 0.2537142857142857,
	"grad_norm": 3.53418042013775,
	"kl": 1.1337890625,
	"learning_rate": 7.127310565369415e-07,
	"loss": 0.2362,
	"reward": 0.1362705221399665,
	"reward_std": 0.3934030085802078,
	"rewards/cosine_scaled_reward": -0.19228141009807587,
	"rewards/format_reward": 0.5208333358168602,
	"step": 222
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1156.750015258789,
	"epoch": 0.25485714285714284,
	"grad_norm": 35.23833796360462,
	"kl": 2.369140625,
	"learning_rate": 7.097981330836616e-07,
	"loss": 0.1765,
	"reward": 0.6305188983678818,
	"reward_std": 0.5979669764637947,
	"rewards/cosine_scaled_reward": 0.023592765908688307,
	"rewards/format_reward": 0.5833333507180214,
	"step": 223
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1251.3125305175781,
	"epoch": 0.256,
	"grad_norm": 3.4418620220945138,
	"kl": 1.376953125,
	"learning_rate": 7.068574212948169e-07,
	"loss": 0.1723,
	"reward": 0.5104624545201659,
	"reward_std": 0.25178899243474007,
	"rewards/cosine_scaled_reward": -0.06768545880913734,
	"rewards/format_reward": 0.6458333432674408,
	"step": 224
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 924.3541870117188,
	"epoch": 0.2571428571428571,
	"grad_norm": 6.348797231777103,
	"kl": 0.9375,
	"learning_rate": 7.039090644965509e-07,
	"loss": 0.1337,
	"reward": 0.7791457176208496,
	"reward_std": 0.7603946030139923,
	"rewards/cosine_scaled_reward": 0.07707285927608609,
	"rewards/format_reward": 0.6250000223517418,
	"step": 225
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1052.5000305175781,
	"epoch": 0.2582857142857143,
	"grad_norm": 3.9386080018485288,
	"kl": 1.52734375,
	"learning_rate": 7.009532063876148e-07,
	"loss": 0.2459,
	"reward": 0.46499455720186234,
	"reward_std": 0.6090477257966995,
	"rewards/cosine_scaled_reward": -0.09041939489543438,
	"rewards/format_reward": 0.6458333432674408,
	"step": 226
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 892.1875152587891,
	"epoch": 0.25942857142857145,
	"grad_norm": 3.4313724086317445,
	"kl": 1.125,
	"learning_rate": 6.979899910323624e-07,
	"loss": 0.1959,
	"reward": 0.5925753712654114,
	"reward_std": 0.8098603934049606,
	"rewards/cosine_scaled_reward": -0.04746231180615723,
	"rewards/format_reward": 0.6875000298023224,
	"step": 227
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1091.6458740234375,
	"epoch": 0.26057142857142856,
	"grad_norm": 4.447647427267497,
	"kl": 1.66015625,
	"learning_rate": 6.950195628537299e-07,
	"loss": 0.1179,
	"reward": 0.24639339372515678,
	"reward_std": 0.48318010196089745,
	"rewards/cosine_scaled_reward": -0.17888664733618498,
	"rewards/format_reward": 0.6041666865348816,
	"step": 228
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 809.2916870117188,
	"epoch": 0.26171428571428573,
	"grad_norm": 8.169532609480521,
	"kl": 2.046875,
	"learning_rate": 6.920420666261961e-07,
	"loss": 0.3082,
	"reward": 0.5617873594164848,
	"reward_std": 0.7489510700106621,
	"rewards/cosine_scaled_reward": -0.07327299565076828,
	"rewards/format_reward": 0.7083333432674408,
	"step": 229
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1110.3542175292969,
	"epoch": 0.26285714285714284,
	"grad_norm": 2.921180843223507,
	"kl": 2.2265625,
	"learning_rate": 6.890576474687263e-07,
	"loss": 0.1487,
	"reward": 0.4394577872008085,
	"reward_std": 0.4748491495847702,
	"rewards/cosine_scaled_reward": -0.05110444873571396,
	"rewards/format_reward": 0.5416666977107525,
	"step": 230
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1026.3542175292969,
	"epoch": 0.264,
	"grad_norm": 2.544177744090501,
	"kl": 1.572265625,
	"learning_rate": 6.860664508377001e-07,
	"loss": 0.1564,
	"reward": 0.2407762985676527,
	"reward_std": 0.5902754589915276,
	"rewards/cosine_scaled_reward": -0.20252852141857147,
	"rewards/format_reward": 0.645833358168602,
	"step": 231
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1030.8958587646484,
	"epoch": 0.2651428571428571,
	"grad_norm": 3.5304119337525526,
	"kl": 1.529296875,
	"learning_rate": 6.83068622519821e-07,
	"loss": 0.1109,
	"reward": 0.42541009094566107,
	"reward_std": 0.6807678937911987,
	"rewards/cosine_scaled_reward": -0.11021162755787373,
	"rewards/format_reward": 0.6458333507180214,
	"step": 232
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1073.3333435058594,
	"epoch": 0.2662857142857143,
	"grad_norm": 3.0267711493511382,
	"kl": 1.1796875,
	"learning_rate": 6.800643086250121e-07,
	"loss": 0.2702,
	"reward": 0.42545080557465553,
	"reward_std": 0.48426005244255066,
	"rewards/cosine_scaled_reward": -0.15185793861746788,
	"rewards/format_reward": 0.7291666716337204,
	"step": 233
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1166.9791870117188,
	"epoch": 0.2674285714285714,
	"grad_norm": 2.956369605796136,
	"kl": 1.1279296875,
	"learning_rate": 6.770536555792944e-07,
	"loss": 0.1076,
	"reward": 0.3714570254087448,
	"reward_std": 0.650765061378479,
	"rewards/cosine_scaled_reward": -0.13718816195614636,
	"rewards/format_reward": 0.645833358168602,
	"step": 234
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1054.6667175292969,
	"epoch": 0.26857142857142857,
	"grad_norm": 4.47554265499188,
	"kl": 1.21484375,
	"learning_rate": 6.740368101176495e-07,
	"loss": 0.2849,
	"reward": 0.6623743935488164,
	"reward_std": 0.7155829221010208,
	"rewards/cosine_scaled_reward": -0.012562822550535202,
	"rewards/format_reward": 0.6875000223517418,
	"step": 235
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1096.1875457763672,
	"epoch": 0.26971428571428574,
	"grad_norm": 4.925975683565178,
	"kl": 1.3408203125,
	"learning_rate": 6.710139192768694e-07,
	"loss": 0.2351,
	"reward": 0.26786297000944614,
	"reward_std": 0.5117842257022858,
	"rewards/cosine_scaled_reward": -0.2202351950109005,
	"rewards/format_reward": 0.708333358168602,
	"step": 236
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 983.2291870117188,
	"epoch": 0.27085714285714285,
	"grad_norm": 2.226077510557553,
	"kl": 0.77294921875,
	"learning_rate": 6.679851303883891e-07,
	"loss": 0.1527,
	"reward": 0.5171467587351799,
	"reward_std": 0.5790724456310272,
	"rewards/cosine_scaled_reward": -0.10600997135043144,
	"rewards/format_reward": 0.7291666865348816,
	"step": 237
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1015.9167175292969,
	"epoch": 0.272,
	"grad_norm": 2.746018994596942,
	"kl": 1.0703125,
	"learning_rate": 6.649505910711058e-07,
	"loss": 0.1685,
	"reward": 0.4093864783644676,
	"reward_std": 0.5853541940450668,
	"rewards/cosine_scaled_reward": -0.1911400929093361,
	"rewards/format_reward": 0.7916667014360428,
	"step": 238
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1138.8542022705078,
	"epoch": 0.27314285714285713,
	"grad_norm": 2.366422791383297,
	"kl": 1.3916015625,
	"learning_rate": 6.619104492241847e-07,
	"loss": 0.1319,
	"reward": 0.03224743437021971,
	"reward_std": 0.40017952769994736,
	"rewards/cosine_scaled_reward": -0.2963762879371643,
	"rewards/format_reward": 0.6250000298023224,
	"step": 239
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 916.8125305175781,
	"epoch": 0.2742857142857143,
	"grad_norm": 1.7577643969871468,
	"kl": 1.291015625,
	"learning_rate": 6.588648530198504e-07,
	"loss": 0.13,
	"reward": 0.8863477371633053,
	"reward_std": 0.6274040639400482,
	"rewards/cosine_scaled_reward": 0.10984052997082472,
	"rewards/format_reward": 0.6666666865348816,
	"step": 240
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 891.0417022705078,
	"epoch": 0.2754285714285714,
	"grad_norm": 2.841473966918375,
	"kl": 1.0361328125,
	"learning_rate": 6.558139508961654e-07,
	"loss": 0.1554,
	"reward": 0.48904264718294144,
	"reward_std": 0.669127531349659,
	"rewards/cosine_scaled_reward": -0.16172868385910988,
	"rewards/format_reward": 0.8125000149011612,
	"step": 241
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 900.8541870117188,
	"epoch": 0.2765714285714286,
	"grad_norm": 4.202915193648642,
	"kl": 0.96337890625,
	"learning_rate": 6.527578915497951e-07,
	"loss": 0.1132,
	"reward": 0.6491687893867493,
	"reward_std": 0.6397206410765648,
	"rewards/cosine_scaled_reward": -0.08166561461985111,
	"rewards/format_reward": 0.8125000149011612,
	"step": 242
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 871.3542022705078,
	"epoch": 0.2777142857142857,
	"grad_norm": 4.013401867872089,
	"kl": 1.2275390625,
	"learning_rate": 6.496968239287603e-07,
	"loss": 0.0343,
	"reward": 0.6437305957078934,
	"reward_std": 0.566775843501091,
	"rewards/cosine_scaled_reward": -0.06355137238278985,
	"rewards/format_reward": 0.7708333432674408,
	"step": 243
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1051.8541870117188,
	"epoch": 0.27885714285714286,
	"grad_norm": 2.0640323982742346,
	"kl": 1.2119140625,
	"learning_rate": 6.466308972251785e-07,
	"loss": 0.1283,
	"reward": 0.6993502229452133,
	"reward_std": 0.8381707072257996,
	"rewards/cosine_scaled_reward": -0.04615823458880186,
	"rewards/format_reward": 0.7916667014360428,
	"step": 244
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 926.8958587646484,
	"epoch": 0.28,
	"grad_norm": 2.3095581027269456,
	"kl": 1.2373046875,
	"learning_rate": 6.435602608679916e-07,
	"loss": 0.1728,
	"reward": 0.5032865107059479,
	"reward_std": 0.4741464629769325,
	"rewards/cosine_scaled_reward": -0.15460674837231636,
	"rewards/format_reward": 0.8125000298023224,
	"step": 245
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 948.7292022705078,
	"epoch": 0.28114285714285714,
	"grad_norm": 2.2705966167509697,
	"kl": 1.0166015625,
	"learning_rate": 6.404850645156841e-07,
	"loss": 0.0879,
	"reward": 0.5439350083470345,
	"reward_std": 0.6458217911422253,
	"rewards/cosine_scaled_reward": -0.11344920098781586,
	"rewards/format_reward": 0.770833358168602,
	"step": 246
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 766.6250152587891,
	"epoch": 0.2822857142857143,
	"grad_norm": 4.218176679768865,
	"kl": 1.375,
	"learning_rate": 6.374054580489873e-07,
	"loss": 0.1529,
	"reward": 0.7583817802369595,
	"reward_std": 0.9407426938414574,
	"rewards/cosine_scaled_reward": 0.02502422034740448,
	"rewards/format_reward": 0.7083333432674408,
	"step": 247
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1149.2708435058594,
	"epoch": 0.2834285714285714,
	"grad_norm": 2.966316254338991,
	"kl": 1.69921875,
	"learning_rate": 6.343215915635761e-07,
	"loss": 0.1307,
	"reward": 0.37028552405536175,
	"reward_std": 0.35450038872659206,
	"rewards/cosine_scaled_reward": -0.15860724076628685,
	"rewards/format_reward": 0.6875000298023224,
	"step": 248
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1236.3750305175781,
	"epoch": 0.2845714285714286,
	"grad_norm": 2.8644099570080126,
	"kl": 1.646484375,
	"learning_rate": 6.31233615362752e-07,
	"loss": 0.142,
	"reward": 0.3449726775288582,
	"reward_std": 0.7856429815292358,
	"rewards/cosine_scaled_reward": -0.09834698960185051,
	"rewards/format_reward": 0.5416666865348816,
	"step": 249
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 977.7500305175781,
	"epoch": 0.2857142857142857,
	"grad_norm": 1.9099821609277308,
	"kl": 0.921875,
	"learning_rate": 6.281416799501187e-07,
	"loss": 0.0404,
	"reward": 0.6945669716224074,
	"reward_std": 0.822948083281517,
	"rewards/cosine_scaled_reward": -0.048549871891736984,
	"rewards/format_reward": 0.7916666716337204,
	"step": 250
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1265.9583740234375,
	"epoch": 0.28685714285714287,
	"grad_norm": 2.751476452748249,
	"kl": 1.216796875,
	"learning_rate": 6.250459360222461e-07,
	"loss": 0.1111,
	"reward": 0.12667130306363106,
	"reward_std": 0.7467320710420609,
	"rewards/cosine_scaled_reward": -0.17624769732356071,
	"rewards/format_reward": 0.4791666865348816,
	"step": 251
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1031.0833740234375,
	"epoch": 0.288,
	"grad_norm": 3.701835452468544,
	"kl": 1.033203125,
	"learning_rate": 6.219465344613258e-07,
	"loss": 0.2332,
	"reward": 0.3126375643769279,
	"reward_std": 0.748970627784729,
	"rewards/cosine_scaled_reward": -0.09368122089654207,
	"rewards/format_reward": 0.5000000074505806,
	"step": 252
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 889.5833587646484,
	"epoch": 0.28914285714285715,
	"grad_norm": 5.141640270028422,
	"kl": 1.69921875,
	"learning_rate": 6.188436263278172e-07,
	"loss": -0.1188,
	"reward": 0.23392239259555936,
	"reward_std": 0.8090809062123299,
	"rewards/cosine_scaled_reward": -0.11220548488199711,
	"rewards/format_reward": 0.4583333432674408,
	"step": 253
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 912.5208587646484,
	"epoch": 0.29028571428571426,
	"grad_norm": 3.5136083178201183,
	"kl": 1.1953125,
	"learning_rate": 6.157373628530852e-07,
	"loss": 0.1793,
	"reward": 0.7197382766753435,
	"reward_std": 0.9268201515078545,
	"rewards/cosine_scaled_reward": 0.057785794138908386,
	"rewards/format_reward": 0.6041666865348816,
	"step": 254
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1043.3333740234375,
	"epoch": 0.2914285714285714,
	"grad_norm": 2.8576463073310023,
	"kl": 1.361328125,
	"learning_rate": 6.126278954320294e-07,
	"loss": 0.1618,
	"reward": 0.21097473427653313,
	"reward_std": 0.8950171619653702,
	"rewards/cosine_scaled_reward": -0.08201263658702374,
	"rewards/format_reward": 0.3750000074505806,
	"step": 255
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1132.9166870117188,
	"epoch": 0.2925714285714286,
	"grad_norm": 2.6390372016890877,
	"kl": 0.9296875,
	"learning_rate": 6.095153756157051e-07,
	"loss": 0.1517,
	"reward": 0.3409617803990841,
	"reward_std": 0.7687749713659286,
	"rewards/cosine_scaled_reward": -0.142019122838974,
	"rewards/format_reward": 0.6250000149011612,
	"step": 256
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1172.1458740234375,
	"epoch": 0.2937142857142857,
	"grad_norm": 1.7999790033387904,
	"kl": 0.8994140625,
	"learning_rate": 6.06399955103937e-07,
	"loss": 0.0345,
	"reward": 0.24714069813489914,
	"reward_std": 0.526521310210228,
	"rewards/cosine_scaled_reward": -0.20976299978792667,
	"rewards/format_reward": 0.6666667014360428,
	"step": 257
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1025.2292022705078,
	"epoch": 0.2948571428571429,
	"grad_norm": 3.7817000702854284,
	"kl": 0.9970703125,
	"learning_rate": 6.032817857379256e-07,
	"loss": 0.0254,
	"reward": 0.371606208384037,
	"reward_std": 0.8782027065753937,
	"rewards/cosine_scaled_reward": -0.10586357489228249,
	"rewards/format_reward": 0.583333358168602,
	"step": 258
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1027.1041870117188,
	"epoch": 0.296,
	"grad_norm": 2.2007546083055627,
	"kl": 1.23828125,
	"learning_rate": 6.001610194928464e-07,
	"loss": 0.1329,
	"reward": 0.2863161154091358,
	"reward_std": 0.6974881812930107,
	"rewards/cosine_scaled_reward": -0.16934195160865784,
	"rewards/format_reward": 0.6250000149011612,
	"step": 259
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1060.1666870117188,
	"epoch": 0.29714285714285715,
	"grad_norm": 2.0712856185453226,
	"kl": 1.314453125,
	"learning_rate": 5.97037808470444e-07,
	"loss": -0.0031,
	"reward": 0.05191618762910366,
	"reward_std": 0.5254812240600586,
	"rewards/cosine_scaled_reward": -0.1927919089794159,
	"rewards/format_reward": 0.4375000074505806,
	"step": 260
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 800.5625152587891,
	"epoch": 0.29828571428571427,
	"grad_norm": 3.953323642394609,
	"kl": 1.18359375,
	"learning_rate": 5.939123048916173e-07,
	"loss": 0.1926,
	"reward": 0.16135332686826587,
	"reward_std": 0.6497361660003662,
	"rewards/cosine_scaled_reward": -0.21099001914262772,
	"rewards/format_reward": 0.5833333432674408,
	"step": 261
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 906.3542022705078,
	"epoch": 0.29942857142857143,
	"grad_norm": 6.975231366994329,
	"kl": 1.1025390625,
	"learning_rate": 5.907846610890011e-07,
	"loss": 0.2163,
	"reward": 0.13131073210388422,
	"reward_std": 0.5159479975700378,
	"rewards/cosine_scaled_reward": -0.1739279804751277,
	"rewards/format_reward": 0.47916667722165585,
	"step": 262
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 981.3958587646484,
	"epoch": 0.30057142857142854,
	"grad_norm": 3.6462739135853304,
	"kl": 0.93359375,
	"learning_rate": 5.87655029499542e-07,
	"loss": 0.2144,
	"reward": 0.2528093755245209,
	"reward_std": 0.6878427565097809,
	"rewards/cosine_scaled_reward": -0.19651199039071798,
	"rewards/format_reward": 0.645833358168602,
	"step": 263
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1036.4792022705078,
	"epoch": 0.3017142857142857,
	"grad_norm": 2.4186369761638797,
	"kl": 1.11328125,
	"learning_rate": 5.845235626570683e-07,
	"loss": 0.0094,
	"reward": 0.34765794809209183,
	"reward_std": 0.7917995601892471,
	"rewards/cosine_scaled_reward": -0.10742103308439255,
	"rewards/format_reward": 0.5625000149011612,
	"step": 264
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 988.1458587646484,
	"epoch": 0.3028571428571429,
	"grad_norm": 3.8358402184782845,
	"kl": 1.125,
	"learning_rate": 5.813904131848564e-07,
	"loss": 0.1412,
	"reward": 0.22985844686627388,
	"reward_std": 0.4855259954929352,
	"rewards/cosine_scaled_reward": -0.17673744820058346,
	"rewards/format_reward": 0.5833333358168602,
	"step": 265
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 853.1042022705078,
	"epoch": 0.304,
	"grad_norm": 3.155418565951925,
	"kl": 1.138671875,
	"learning_rate": 5.78255733788191e-07,
	"loss": -0.0981,
	"reward": 0.23544084653258324,
	"reward_std": 0.5617225617170334,
	"rewards/cosine_scaled_reward": -0.18436292186379433,
	"rewards/format_reward": 0.6041666865348816,
	"step": 266
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1040.7291870117188,
	"epoch": 0.30514285714285716,
	"grad_norm": 4.49377424287265,
	"kl": 1.8671875,
	"learning_rate": 5.751196772469237e-07,
	"loss": 0.3133,
	"reward": 0.019660448655486107,
	"reward_std": 0.5969599932432175,
	"rewards/cosine_scaled_reward": -0.14641978219151497,
	"rewards/format_reward": 0.3125000149011612,
	"step": 267
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1120.3334045410156,
	"epoch": 0.3062857142857143,
	"grad_norm": 2.9296163486934588,
	"kl": 1.455078125,
	"learning_rate": 5.71982396408026e-07,
	"loss": 0.0891,
	"reward": 0.019381534308195114,
	"reward_std": 0.6385679095983505,
	"rewards/cosine_scaled_reward": -0.188225906342268,
	"rewards/format_reward": 0.3958333432674408,
	"step": 268
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 940.0833587646484,
	"epoch": 0.30742857142857144,
	"grad_norm": 3.99474649335861,
	"kl": 1.58203125,
	"learning_rate": 5.688440441781398e-07,
	"loss": 0.2037,
	"reward": 0.21233398653566837,
	"reward_std": 0.5940781682729721,
	"rewards/cosine_scaled_reward": -0.17508301883935928,
	"rewards/format_reward": 0.5625000149011612,
	"step": 269
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 820.8541870117188,
	"epoch": 0.30857142857142855,
	"grad_norm": 3.64920081986899,
	"kl": 1.548828125,
	"learning_rate": 5.657047735161255e-07,
	"loss": 0.187,
	"reward": 0.287849310785532,
	"reward_std": 0.7942548245191574,
	"rewards/cosine_scaled_reward": -0.16857536626048386,
	"rewards/format_reward": 0.6250000298023224,
	"step": 270
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 918.2708435058594,
	"epoch": 0.3097142857142857,
	"grad_norm": 4.142397150940974,
	"kl": 1.3642578125,
	"learning_rate": 5.625647374256061e-07,
	"loss": -0.0034,
	"reward": 0.21712711825966835,
	"reward_std": 0.7582554370164871,
	"rewards/cosine_scaled_reward": -0.1726864455267787,
	"rewards/format_reward": 0.5625000223517418,
	"step": 271
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1067.3125305175781,
	"epoch": 0.31085714285714283,
	"grad_norm": 5.568481701496752,
	"kl": 1.576171875,
	"learning_rate": 5.594240889475106e-07,
	"loss": 0.2629,
	"reward": 0.07018839695956558,
	"reward_std": 0.6307368651032448,
	"rewards/cosine_scaled_reward": -0.17323914170265198,
	"rewards/format_reward": 0.4166666753590107,
	"step": 272
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1032.7916870117188,
	"epoch": 0.312,
	"grad_norm": 2.7380334201594207,
	"kl": 1.763671875,
	"learning_rate": 5.562829811526154e-07,
	"loss": 0.1532,
	"reward": 0.1198783004656434,
	"reward_std": 0.5959479659795761,
	"rewards/cosine_scaled_reward": -0.15881085954606533,
	"rewards/format_reward": 0.4375000149011612,
	"step": 273
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1005.0000305175781,
	"epoch": 0.31314285714285717,
	"grad_norm": 3.288058849096818,
	"kl": 1.3232421875,
	"learning_rate": 5.531415671340826e-07,
	"loss": 0.0679,
	"reward": 0.33828355744481087,
	"reward_std": 0.7625949904322624,
	"rewards/cosine_scaled_reward": -0.1329415813088417,
	"rewards/format_reward": 0.6041666716337204,
	"step": 274
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1209.2083892822266,
	"epoch": 0.3142857142857143,
	"grad_norm": 3.384369498507843,
	"kl": 1.3759765625,
	"learning_rate": 5.5e-07,
	"loss": 0.1487,
	"reward": 0.2773652821779251,
	"reward_std": 0.7781829237937927,
	"rewards/cosine_scaled_reward": -0.09048402030020952,
	"rewards/format_reward": 0.45833334885537624,
	"step": 275
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 871.2500152587891,
	"epoch": 0.31542857142857145,
	"grad_norm": 3.6001944034052666,
	"kl": 1.2470703125,
	"learning_rate": 5.468584328659172e-07,
	"loss": 0.2545,
	"reward": 0.4259207919239998,
	"reward_std": 0.7986200153827667,
	"rewards/cosine_scaled_reward": -0.1099562719464302,
	"rewards/format_reward": 0.6458333432674408,
	"step": 276
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 970.1458740234375,
	"epoch": 0.31657142857142856,
	"grad_norm": 5.098242367200561,
	"kl": 1.9375,
	"learning_rate": 5.437170188473847e-07,
	"loss": 0.0347,
	"reward": 0.1577397957444191,
	"reward_std": 0.8665766268968582,
	"rewards/cosine_scaled_reward": -0.16071344492956996,
	"rewards/format_reward": 0.479166679084301,
	"step": 277
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1138.0417175292969,
	"epoch": 0.3177142857142857,
	"grad_norm": 4.893358334263393,
	"kl": 1.51953125,
	"learning_rate": 5.405759110524894e-07,
	"loss": 0.2335,
	"reward": 0.2129652127623558,
	"reward_std": 0.8123987764120102,
	"rewards/cosine_scaled_reward": -0.1122674010694027,
	"rewards/format_reward": 0.4375000149011612,
	"step": 278
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1055.5416870117188,
	"epoch": 0.31885714285714284,
	"grad_norm": 11.325087114885777,
	"kl": 1.70703125,
	"learning_rate": 5.37435262574394e-07,
	"loss": 0.1758,
	"reward": 0.2276703668758273,
	"reward_std": 0.7087787315249443,
	"rewards/cosine_scaled_reward": -0.14658149890601635,
	"rewards/format_reward": 0.520833358168602,
	"step": 279
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1201.5625305175781,
	"epoch": 0.32,
	"grad_norm": 4.499791162135755,
	"kl": 1.3359375,
	"learning_rate": 5.342952264838747e-07,
	"loss": 0.199,
	"reward": 0.4334499780088663,
	"reward_std": 0.8222155347466469,
	"rewards/cosine_scaled_reward": -0.0853583601419814,
	"rewards/format_reward": 0.6041666716337204,
	"step": 280
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1183.5833740234375,
	"epoch": 0.3211428571428571,
	"grad_norm": 3.6400895329844336,
	"kl": 1.931640625,
	"learning_rate": 5.311559558218603e-07,
	"loss": 0.0286,
	"reward": -0.14555206894874573,
	"reward_std": 0.4930955022573471,
	"rewards/cosine_scaled_reward": -0.2081927042454481,
	"rewards/format_reward": 0.2708333395421505,
	"step": 281
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1227.6875305175781,
	"epoch": 0.3222857142857143,
	"grad_norm": 3.351330372342759,
	"kl": 1.51953125,
	"learning_rate": 5.28017603591974e-07,
	"loss": 0.1735,
	"reward": 0.08991836942732334,
	"reward_std": 0.7664570957422256,
	"rewards/cosine_scaled_reward": -0.1946241520345211,
	"rewards/format_reward": 0.479166679084301,
	"step": 282
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1011.8958435058594,
	"epoch": 0.32342857142857145,
	"grad_norm": 3.607306150140324,
	"kl": 1.52734375,
	"learning_rate": 5.248803227530763e-07,
	"loss": 0.1756,
	"reward": -0.16347728297114372,
	"reward_std": 0.6131603866815567,
	"rewards/cosine_scaled_reward": -0.269238643348217,
	"rewards/format_reward": 0.3750000074505806,
	"step": 283
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1238.1875305175781,
	"epoch": 0.32457142857142857,
	"grad_norm": 3.700854838943554,
	"kl": 1.3291015625,
	"learning_rate": 5.21744266211809e-07,
	"loss": 0.0644,
	"reward": 0.19410160928964615,
	"reward_std": 0.6351519152522087,
	"rewards/cosine_scaled_reward": -0.16336587071418762,
	"rewards/format_reward": 0.520833358168602,
	"step": 284
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1302.5833435058594,
	"epoch": 0.32571428571428573,
	"grad_norm": 5.590443825333452,
	"kl": 1.396484375,
	"learning_rate": 5.186095868151436e-07,
	"loss": 0.1172,
	"reward": 0.0053066437467350625,
	"reward_std": 0.6190855652093887,
	"rewards/cosine_scaled_reward": -0.1952633447945118,
	"rewards/format_reward": 0.3958333507180214,
	"step": 285
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1408.7708587646484,
	"epoch": 0.32685714285714285,
	"grad_norm": 5820.413747461295,
	"kl": 44.6220703125,
	"learning_rate": 5.154764373429315e-07,
	"loss": 2.1366,
	"reward": 0.321873364970088,
	"reward_std": 0.7274122461676598,
	"rewards/cosine_scaled_reward": -0.06822998262941837,
	"rewards/format_reward": 0.45833334885537624,
	"step": 286
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1293.6875305175781,
	"epoch": 0.328,
	"grad_norm": 10688.293773017389,
	"kl": 90.048828125,
	"learning_rate": 5.123449705004581e-07,
	"loss": 3.6012,
	"reward": 0.22728685289621353,
	"reward_std": 0.6926668882369995,
	"rewards/cosine_scaled_reward": -0.10510657541453838,
	"rewards/format_reward": 0.4375000074505806,
	"step": 287
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1143.1042175292969,
	"epoch": 0.3291428571428571,
	"grad_norm": 69995.08344409091,
	"kl": 821.830078125,
	"learning_rate": 5.09215338910999e-07,
	"loss": 50.9221,
	"reward": 0.3029659762978554,
	"reward_std": 0.8068300932645798,
	"rewards/cosine_scaled_reward": -0.04643368790857494,
	"rewards/format_reward": 0.3958333432674408,
	"step": 288
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1325.7084045410156,
	"epoch": 0.3302857142857143,
	"grad_norm": 62.300695111663714,
	"kl": 1.5146484375,
	"learning_rate": 5.060876951083828e-07,
	"loss": 0.1171,
	"reward": 0.10640177875757217,
	"reward_std": 0.6392035633325577,
	"rewards/cosine_scaled_reward": -0.08221577852964401,
	"rewards/format_reward": 0.2708333358168602,
	"step": 289
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1066.375015258789,
	"epoch": 0.3314285714285714,
	"grad_norm": 3.0451709688438138,
	"kl": 0.85791015625,
	"learning_rate": 5.02962191529556e-07,
	"loss": 0.0875,
	"reward": 0.4837397076189518,
	"reward_std": 0.6303973346948624,
	"rewards/cosine_scaled_reward": -0.008130142465233803,
	"rewards/format_reward": 0.5000000074505806,
	"step": 290
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1176.2292175292969,
	"epoch": 0.3325714285714286,
	"grad_norm": 6.431194933370891,
	"kl": 1.169921875,
	"learning_rate": 4.998389805071536e-07,
	"loss": 0.0944,
	"reward": 0.004224353935569525,
	"reward_std": 0.7458223477005959,
	"rewards/cosine_scaled_reward": -0.17497116327285767,
	"rewards/format_reward": 0.354166679084301,
	"step": 291
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1279.1875610351562,
	"epoch": 0.33371428571428574,
	"grad_norm": 11.784461019524304,
	"kl": 1.0419921875,
	"learning_rate": 4.967182142620745e-07,
	"loss": 0.0752,
	"reward": -0.019843921065330505,
	"reward_std": 0.5733096897602081,
	"rewards/cosine_scaled_reward": -0.21825530380010605,
	"rewards/format_reward": 0.4166666753590107,
	"step": 292
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1270.3750305175781,
	"epoch": 0.33485714285714285,
	"grad_norm": 12451.222306718704,
	"kl": 56.82421875,
	"learning_rate": 4.93600044896063e-07,
	"loss": 2.6089,
	"reward": -0.0518635269254446,
	"reward_std": 0.4941852539777756,
	"rewards/cosine_scaled_reward": -0.22384843230247498,
	"rewards/format_reward": 0.3958333507180214,
	"step": 293
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1304.8750457763672,
	"epoch": 0.336,
	"grad_norm": 354145.9079404987,
	"kl": 3584.8046875,
	"learning_rate": 4.904846243842949e-07,
	"loss": 283.5748,
	"reward": 0.06046904996037483,
	"reward_std": 0.7505204379558563,
	"rewards/cosine_scaled_reward": -0.13643214339390397,
	"rewards/format_reward": 0.3333333358168602,
	"step": 294
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1317.5625610351562,
	"epoch": 0.33714285714285713,
	"grad_norm": 5.242464203702877,
	"kl": 1.0029296875,
	"learning_rate": 4.873721045679706e-07,
	"loss": 0.1195,
	"reward": 0.005757967010140419,
	"reward_std": 0.6009484976530075,
	"rewards/cosine_scaled_reward": -0.12212102208286524,
	"rewards/format_reward": 0.2500000111758709,
	"step": 295
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1103.0625610351562,
	"epoch": 0.3382857142857143,
	"grad_norm": 4.2430557491796055,
	"kl": 0.8115234375,
	"learning_rate": 4.842626371469149e-07,
	"loss": 0.0632,
	"reward": 0.0580328986980021,
	"reward_std": 0.6936925277113914,
	"rewards/cosine_scaled_reward": -0.15848355647176504,
	"rewards/format_reward": 0.3750000149011612,
	"step": 296
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1432.3333435058594,
	"epoch": 0.3394285714285714,
	"grad_norm": 2.9908283966206457,
	"kl": 0.7646484375,
	"learning_rate": 4.811563736721829e-07,
	"loss": 0.1022,
	"reward": -0.011708778678439558,
	"reward_std": 0.5683621913194656,
	"rewards/cosine_scaled_reward": -0.12043773010373116,
	"rewards/format_reward": 0.2291666716337204,
	"step": 297
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1451.8750305175781,
	"epoch": 0.3405714285714286,
	"grad_norm": 4.214445887739457,
	"kl": 0.673828125,
	"learning_rate": 4.780534655386743e-07,
	"loss": -0.0113,
	"reward": -0.12220606487244368,
	"reward_std": 0.5942584052681923,
	"rewards/cosine_scaled_reward": -0.18610304035246372,
	"rewards/format_reward": 0.2500000074505806,
	"step": 298
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1379.4791870117188,
	"epoch": 0.3417142857142857,
	"grad_norm": 4.524572878515851,
	"kl": 0.5302734375,
	"learning_rate": 4.749540639777539e-07,
	"loss": -0.0319,
	"reward": -0.08997016213834286,
	"reward_std": 0.6837709844112396,
	"rewards/cosine_scaled_reward": -0.1804017536342144,
	"rewards/format_reward": 0.27083334140479565,
	"step": 299
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1242.2083740234375,
	"epoch": 0.34285714285714286,
	"grad_norm": 22.44129435449986,
	"kl": 0.6015625,
	"learning_rate": 4.7185832004988133e-07,
	"loss": 0.047,
	"reward": 0.4733648784458637,
	"reward_std": 0.6498839557170868,
	"rewards/cosine_scaled_reward": -0.013317572651430964,
	"rewards/format_reward": 0.5000000111758709,
	"step": 300
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1358.5000610351562,
	"epoch": 0.344,
	"grad_norm": 5.451894779313779,
	"kl": 0.55419921875,
	"learning_rate": 4.68766384637248e-07,
	"loss": 0.0201,
	"reward": 0.012628388591110706,
	"reward_std": 0.6598528623580933,
	"rewards/cosine_scaled_reward": -0.11868580989539623,
	"rewards/format_reward": 0.2500000037252903,
	"step": 301
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1208.0000610351562,
	"epoch": 0.34514285714285714,
	"grad_norm": 2.502133066720727,
	"kl": 0.5078125,
	"learning_rate": 4.656784084364238e-07,
	"loss": 0.0976,
	"reward": 0.01287244912236929,
	"reward_std": 0.6720428466796875,
	"rewards/cosine_scaled_reward": -0.14981378242373466,
	"rewards/format_reward": 0.3125000111758709,
	"step": 302
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1372.2917175292969,
	"epoch": 0.3462857142857143,
	"grad_norm": 9.527527408809727,
	"kl": 0.591796875,
	"learning_rate": 4.6259454195101267e-07,
	"loss": -0.0351,
	"reward": -0.0026968184392899275,
	"reward_std": 0.7502148300409317,
	"rewards/cosine_scaled_reward": -0.1784317558631301,
	"rewards/format_reward": 0.3541666716337204,
	"step": 303
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1228.1458435058594,
	"epoch": 0.3474285714285714,
	"grad_norm": 5.5176774561091655,
	"kl": 0.4345703125,
	"learning_rate": 4.59514935484316e-07,
	"loss": 0.1598,
	"reward": 0.39222877379506826,
	"reward_std": 0.840458020567894,
	"rewards/cosine_scaled_reward": -0.03305228240787983,
	"rewards/format_reward": 0.4583333358168602,
	"step": 304
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1448.8125305175781,
	"epoch": 0.3485714285714286,
	"grad_norm": 7.801875434214254,
	"kl": 0.3525390625,
	"learning_rate": 4.5643973913200837e-07,
	"loss": 0.0808,
	"reward": 0.005279352888464928,
	"reward_std": 0.6858643740415573,
	"rewards/cosine_scaled_reward": -0.1536103216931224,
	"rewards/format_reward": 0.3125000074505806,
	"step": 305
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1402.6666870117188,
	"epoch": 0.3497142857142857,
	"grad_norm": 3.566822202421308,
	"kl": 0.29638671875,
	"learning_rate": 4.5336910277482155e-07,
	"loss": 0.0791,
	"reward": 0.18335522711277008,
	"reward_std": 0.6350644528865814,
	"rewards/cosine_scaled_reward": -0.13748905574902892,
	"rewards/format_reward": 0.4583333432674408,
	"step": 306
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1421.8541870117188,
	"epoch": 0.35085714285714287,
	"grad_norm": 1.9532542741070622,
	"kl": 0.289794921875,
	"learning_rate": 4.503031760712397e-07,
	"loss": 0.0514,
	"reward": 0.2609965428709984,
	"reward_std": 0.7012953609228134,
	"rewards/cosine_scaled_reward": -0.06741839554160833,
	"rewards/format_reward": 0.39583334885537624,
	"step": 307
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1331.2500305175781,
	"epoch": 0.352,
	"grad_norm": 2.135773174322825,
	"kl": 0.26416015625,
	"learning_rate": 4.4724210845020494e-07,
	"loss": 0.1508,
	"reward": 0.21997906267642975,
	"reward_std": 0.6842755973339081,
	"rewards/cosine_scaled_reward": -0.1191771375015378,
	"rewards/format_reward": 0.4583333358168602,
	"step": 308
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1291.1666870117188,
	"epoch": 0.35314285714285715,
	"grad_norm": 3.030174625800062,
	"kl": 0.323486328125,
	"learning_rate": 4.441860491038345e-07,
	"loss": 0.1012,
	"reward": -0.060309079475700855,
	"reward_std": 0.48270438611507416,
	"rewards/cosine_scaled_reward": -0.16557121649384499,
	"rewards/format_reward": 0.2708333469927311,
	"step": 309
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1296.8125457763672,
	"epoch": 0.35428571428571426,
	"grad_norm": 3.288974321286699,
	"kl": 0.30712890625,
	"learning_rate": 4.4113514698014953e-07,
	"loss": 0.1053,
	"reward": 0.3812438789755106,
	"reward_std": 0.6454566046595573,
	"rewards/cosine_scaled_reward": 0.0031219255179166794,
	"rewards/format_reward": 0.37500000558793545,
	"step": 310
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1582.4167175292969,
	"epoch": 0.3554285714285714,
	"grad_norm": 11.037201589242047,
	"kl": 0.3916015625,
	"learning_rate": 4.3808955077581546e-07,
	"loss": 0.0554,
	"reward": 0.011564895510673523,
	"reward_std": 0.5866778641939163,
	"rewards/cosine_scaled_reward": -0.12963422574102879,
	"rewards/format_reward": 0.27083333767950535,
	"step": 311
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1511.1458740234375,
	"epoch": 0.3565714285714286,
	"grad_norm": 541.360267852673,
	"kl": 2.48046875,
	"learning_rate": 4.350494089288943e-07,
	"loss": 0.1743,
	"reward": 0.09507806971669197,
	"reward_std": 0.7126565277576447,
	"rewards/cosine_scaled_reward": -0.12954430282115936,
	"rewards/format_reward": 0.3541666716337204,
	"step": 312
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1310.4792022705078,
	"epoch": 0.3577142857142857,
	"grad_norm": 1.6060292822301743,
	"kl": 0.2235107421875,
	"learning_rate": 4.3201486961161093e-07,
	"loss": 0.0119,
	"reward": 0.19681214727461338,
	"reward_std": 0.5347588732838631,
	"rewards/cosine_scaled_reward": -0.14117726124823093,
	"rewards/format_reward": 0.4791666902601719,
	"step": 313
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1412.7708740234375,
	"epoch": 0.3588571428571429,
	"grad_norm": 0.9234012789427545,
	"kl": 0.2705078125,
	"learning_rate": 4.2898608072313045e-07,
	"loss": 0.0522,
	"reward": 0.1253851738292724,
	"reward_std": 0.5503663271665573,
	"rewards/cosine_scaled_reward": -0.1352240853011608,
	"rewards/format_reward": 0.3958333469927311,
	"step": 314
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1183.7916870117188,
	"epoch": 0.36,
	"grad_norm": 1.7837131712349448,
	"kl": 0.248779296875,
	"learning_rate": 4.2596318988235037e-07,
	"loss": 0.1102,
	"reward": 0.06632774323225021,
	"reward_std": 0.8003478944301605,
	"rewards/cosine_scaled_reward": -0.14391947723925114,
	"rewards/format_reward": 0.354166679084301,
	"step": 315
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1321.1667022705078,
	"epoch": 0.36114285714285715,
	"grad_norm": 3.8904561936208473,
	"kl": 0.311279296875,
	"learning_rate": 4.2294634442070553e-07,
	"loss": 0.0684,
	"reward": -0.12211128510534763,
	"reward_std": 0.3644377589225769,
	"rewards/cosine_scaled_reward": -0.19647231698036194,
	"rewards/format_reward": 0.27083333767950535,
	"step": 316
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1472.5208435058594,
	"epoch": 0.36228571428571427,
	"grad_norm": 0.6761305622628668,
	"kl": 0.2392578125,
	"learning_rate": 4.1993569137498776e-07,
	"loss": 0.0051,
	"reward": 0.07694595551583916,
	"reward_std": 0.698570191860199,
	"rewards/cosine_scaled_reward": -0.08652702532708645,
	"rewards/format_reward": 0.25000000186264515,
	"step": 317
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1446.3959045410156,
	"epoch": 0.36342857142857143,
	"grad_norm": 1.610083766620256,
	"kl": 0.23046875,
	"learning_rate": 4.1693137748017915e-07,
	"loss": 0.1272,
	"reward": 0.22593690548092127,
	"reward_std": 0.7007799595594406,
	"rewards/cosine_scaled_reward": -0.11619820445775986,
	"rewards/format_reward": 0.45833336375653744,
	"step": 318
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1419.7500610351562,
	"epoch": 0.36457142857142855,
	"grad_norm": 1.3177147357732026,
	"kl": 0.3505859375,
	"learning_rate": 4.1393354916230005e-07,
	"loss": 0.0908,
	"reward": 0.05421498417854309,
	"reward_std": 0.6087209582328796,
	"rewards/cosine_scaled_reward": -0.10830917488783598,
	"rewards/format_reward": 0.27083334140479565,
	"step": 319
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1459.2917175292969,
	"epoch": 0.3657142857142857,
	"grad_norm": 2.383045046585821,
	"kl": 0.177001953125,
	"learning_rate": 4.1094235253127374e-07,
	"loss": 0.143,
	"reward": 0.23994141444563866,
	"reward_std": 0.7169264256954193,
	"rewards/cosine_scaled_reward": -0.08836262859404087,
	"rewards/format_reward": 0.4166666753590107,
	"step": 320
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1525.8542175292969,
	"epoch": 0.3668571428571429,
	"grad_norm": 1.4014039132566267,
	"kl": 0.327880859375,
	"learning_rate": 4.079579333738039e-07,
	"loss": 0.0636,
	"reward": 0.07618786534294486,
	"reward_std": 0.6110149621963501,
	"rewards/cosine_scaled_reward": -0.17023939825594425,
	"rewards/format_reward": 0.416666679084301,
	"step": 321
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1505.1042022705078,
	"epoch": 0.368,
	"grad_norm": 0.9016635108285753,
	"kl": 0.18017578125,
	"learning_rate": 4.0498043714627006e-07,
	"loss": 0.0766,
	"reward": 0.1637781597673893,
	"reward_std": 0.6868859454989433,
	"rewards/cosine_scaled_reward": -0.13686091732233763,
	"rewards/format_reward": 0.4375000186264515,
	"step": 322
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1378.7500305175781,
	"epoch": 0.36914285714285716,
	"grad_norm": 1.1982814454055981,
	"kl": 0.39306640625,
	"learning_rate": 4.020100089676376e-07,
	"loss": 0.0913,
	"reward": 0.17529202857986093,
	"reward_std": 0.6956184059381485,
	"rewards/cosine_scaled_reward": -0.14152065757662058,
	"rewards/format_reward": 0.4583333432674408,
	"step": 323
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1414.5208740234375,
	"epoch": 0.3702857142857143,
	"grad_norm": 5.168812943695912,
	"kl": 0.2706298828125,
	"learning_rate": 3.9904679361238526e-07,
	"loss": 0.0758,
	"reward": -0.05163134215399623,
	"reward_std": 0.573038712143898,
	"rewards/cosine_scaled_reward": -0.2133156731724739,
	"rewards/format_reward": 0.37500000558793545,
	"step": 324
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1026.2292175292969,
	"epoch": 0.37142857142857144,
	"grad_norm": 2.717389747197644,
	"kl": 0.2042236328125,
	"learning_rate": 3.9609093550344907e-07,
	"loss": 0.0446,
	"reward": 0.35916636511683464,
	"reward_std": 0.7165441811084747,
	"rewards/cosine_scaled_reward": -0.11208349000662565,
	"rewards/format_reward": 0.583333358168602,
	"step": 325
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1296.2083435058594,
	"epoch": 0.37257142857142855,
	"grad_norm": 0.9706132798560072,
	"kl": 0.1436767578125,
	"learning_rate": 3.931425787051832e-07,
	"loss": 0.0264,
	"reward": 0.03931037150323391,
	"reward_std": 0.5944674462080002,
	"rewards/cosine_scaled_reward": -0.24076148495078087,
	"rewards/format_reward": 0.5208333395421505,
	"step": 326
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1119.3958740234375,
	"epoch": 0.3737142857142857,
	"grad_norm": 7.20904098295775,
	"kl": 0.34619140625,
	"learning_rate": 3.902018669163384e-07,
	"loss": 0.0023,
	"reward": 0.5026027010753751,
	"reward_std": 0.4505321756005287,
	"rewards/cosine_scaled_reward": 0.011718038469552994,
	"rewards/format_reward": 0.4791666716337204,
	"step": 327
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1340.9791717529297,
	"epoch": 0.37485714285714283,
	"grad_norm": 1.2860908020915138,
	"kl": 0.416259765625,
	"learning_rate": 3.872689434630585e-07,
	"loss": 0.1449,
	"reward": 0.15127216652035713,
	"reward_std": 0.6304197087883949,
	"rewards/cosine_scaled_reward": -0.15353058651089668,
	"rewards/format_reward": 0.4583333507180214,
	"step": 328
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1571.3750610351562,
	"epoch": 0.376,
	"grad_norm": 0.8293118478307562,
	"kl": 0.242431640625,
	"learning_rate": 3.843439512918949e-07,
	"loss": 0.0229,
	"reward": 0.09288652800023556,
	"reward_std": 0.5842361897230148,
	"rewards/cosine_scaled_reward": -0.15147340297698975,
	"rewards/format_reward": 0.3958333432674408,
	"step": 329
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1407.9583740234375,
	"epoch": 0.37714285714285717,
	"grad_norm": 1.189781094856149,
	"kl": 0.1077880859375,
	"learning_rate": 3.8142703296283953e-07,
	"loss": 0.0681,
	"reward": -0.09090141206979752,
	"reward_std": 0.5390855148434639,
	"rewards/cosine_scaled_reward": -0.21211737021803856,
	"rewards/format_reward": 0.3333333395421505,
	"step": 330
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1255.6875457763672,
	"epoch": 0.3782857142857143,
	"grad_norm": 1.046472107288498,
	"kl": 0.10308837890625,
	"learning_rate": 3.785183306423767e-07,
	"loss": 0.0811,
	"reward": -0.12841611605836079,
	"reward_std": 0.39798876643180847,
	"rewards/cosine_scaled_reward": -0.3350413963198662,
	"rewards/format_reward": 0.5416666865348816,
	"step": 331
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1201.8958740234375,
	"epoch": 0.37942857142857145,
	"grad_norm": 1.123018980255247,
	"kl": 0.117584228515625,
	"learning_rate": 3.7561798609655373e-07,
	"loss": 0.072,
	"reward": 0.499036006629467,
	"reward_std": 0.6711834743618965,
	"rewards/cosine_scaled_reward": -0.03173201950266957,
	"rewards/format_reward": 0.5625000074505806,
	"step": 332
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1133.3333587646484,
	"epoch": 0.38057142857142856,
	"grad_norm": 2.177638459571002,
	"kl": 0.14453125,
	"learning_rate": 3.72726140684072e-07,
	"loss": 0.1488,
	"reward": 0.03351620538160205,
	"reward_std": 0.4431127682328224,
	"rewards/cosine_scaled_reward": -0.27490856871008873,
	"rewards/format_reward": 0.5833333432674408,
	"step": 333
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1252.5833587646484,
	"epoch": 0.38171428571428573,
	"grad_norm": 1.6680786188797292,
	"kl": 2.4984130859375,
	"learning_rate": 3.6984293534939737e-07,
	"loss": 0.1246,
	"reward": -0.1514057070016861,
	"reward_std": 0.5695896856486797,
	"rewards/cosine_scaled_reward": -0.26320285350084305,
	"rewards/format_reward": 0.3750000074505806,
	"step": 334
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1232.0000610351562,
	"epoch": 0.38285714285714284,
	"grad_norm": 1.828125714274309,
	"kl": 0.07843017578125,
	"learning_rate": 3.6696851061588994e-07,
	"loss": 0.1105,
	"reward": 0.07522661844268441,
	"reward_std": 0.5525132827460766,
	"rewards/cosine_scaled_reward": -0.21238669380545616,
	"rewards/format_reward": 0.5000000149011612,
	"step": 335
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1240.4375305175781,
	"epoch": 0.384,
	"grad_norm": 3.2255921262965432,
	"kl": 0.19232177734375,
	"learning_rate": 3.641030065789562e-07,
	"loss": 0.2104,
	"reward": -0.07903135940432549,
	"reward_std": 0.4235813617706299,
	"rewards/cosine_scaled_reward": -0.3103490248322487,
	"rewards/format_reward": 0.5416666865348816,
	"step": 336
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1136.2500457763672,
	"epoch": 0.3851428571428571,
	"grad_norm": 2.1359050155328076,
	"kl": 0.298095703125,
	"learning_rate": 3.612465628992203e-07,
	"loss": 0.1271,
	"reward": 0.29203586652874947,
	"reward_std": 0.6221929639577866,
	"rewards/cosine_scaled_reward": -0.14564874302595854,
	"rewards/format_reward": 0.583333358168602,
	"step": 337
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1304.4792175292969,
	"epoch": 0.3862857142857143,
	"grad_norm": 1.42801024449987,
	"kl": 0.2041015625,
	"learning_rate": 3.5839931879571725e-07,
	"loss": 0.0306,
	"reward": -0.07640792615711689,
	"reward_std": 0.29374565184116364,
	"rewards/cosine_scaled_reward": -0.30903729796409607,
	"rewards/format_reward": 0.5416666716337204,
	"step": 338
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1483.5625305175781,
	"epoch": 0.38742857142857146,
	"grad_norm": 4.530770296915891,
	"kl": 0.2216796875,
	"learning_rate": 3.555614130391079e-07,
	"loss": 0.0756,
	"reward": -0.22593690641224384,
	"reward_std": 0.42642898857593536,
	"rewards/cosine_scaled_reward": -0.31088512018322945,
	"rewards/format_reward": 0.39583334513008595,
	"step": 339
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1301.6875305175781,
	"epoch": 0.38857142857142857,
	"grad_norm": 32.229056752997074,
	"kl": 0.72998046875,
	"learning_rate": 3.5273298394491515e-07,
	"loss": 0.0451,
	"reward": 0.1187155619263649,
	"reward_std": 0.6100866496562958,
	"rewards/cosine_scaled_reward": -0.16980887576937675,
	"rewards/format_reward": 0.4583333432674408,
	"step": 340
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1222.5000610351562,
	"epoch": 0.38971428571428574,
	"grad_norm": 31.15024931066955,
	"kl": 1.814453125,
	"learning_rate": 3.4991416936678276e-07,
	"loss": 0.0053,
	"reward": 0.4647822715342045,
	"reward_std": 0.8535723686218262,
	"rewards/cosine_scaled_reward": 0.013641122728586197,
	"rewards/format_reward": 0.4375000149011612,
	"step": 341
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1550.9583740234375,
	"epoch": 0.39085714285714285,
	"grad_norm": 5.073035047796139,
	"kl": 0.40185546875,
	"learning_rate": 3.471051066897562e-07,
	"loss": 0.1274,
	"reward": -0.049222253262996674,
	"reward_std": 0.6296448782086372,
	"rewards/cosine_scaled_reward": -0.1704444605857134,
	"rewards/format_reward": 0.29166667349636555,
	"step": 342
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1254.1458740234375,
	"epoch": 0.392,
	"grad_norm": 2.9987047793682247,
	"kl": 0.191650390625,
	"learning_rate": 3.4430593282358777e-07,
	"loss": 0.132,
	"reward": 0.4507103096693754,
	"reward_std": 0.46682045608758926,
	"rewards/cosine_scaled_reward": -0.11839485540986061,
	"rewards/format_reward": 0.6875000298023224,
	"step": 343
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1259.0833740234375,
	"epoch": 0.3931428571428571,
	"grad_norm": 11.834773130920754,
	"kl": 0.765625,
	"learning_rate": 3.4151678419606233e-07,
	"loss": 0.1692,
	"reward": 0.04102582670748234,
	"reward_std": 0.6375212371349335,
	"rewards/cosine_scaled_reward": -0.16698708944022655,
	"rewards/format_reward": 0.3750000111758709,
	"step": 344
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 948.8333587646484,
	"epoch": 0.3942857142857143,
	"grad_norm": 4.082579051373274,
	"kl": 0.11126708984375,
	"learning_rate": 3.387377967463493e-07,
	"loss": 0.1531,
	"reward": 0.32552773877978325,
	"reward_std": 0.5937002822756767,
	"rewards/cosine_scaled_reward": -0.18098615854978561,
	"rewards/format_reward": 0.6875000149011612,
	"step": 345
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1348.5208740234375,
	"epoch": 0.3954285714285714,
	"grad_norm": 4.16581520032074,
	"kl": 0.233154296875,
	"learning_rate": 3.359691059183761e-07,
	"loss": 0.0891,
	"reward": -0.024696938693523407,
	"reward_std": 0.6840994879603386,
	"rewards/cosine_scaled_reward": -0.2310984805226326,
	"rewards/format_reward": 0.4375000074505806,
	"step": 346
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1152.9583435058594,
	"epoch": 0.3965714285714286,
	"grad_norm": 6.491892036842968,
	"kl": 0.688232421875,
	"learning_rate": 3.3321084665422803e-07,
	"loss": 0.1813,
	"reward": 0.7761995047330856,
	"reward_std": 0.9014021009206772,
	"rewards/cosine_scaled_reward": 0.08601640490815043,
	"rewards/format_reward": 0.6041666865348816,
	"step": 347
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1119.6875305175781,
	"epoch": 0.3977142857142857,
	"grad_norm": 6.465035426418669,
	"kl": 0.2274169921875,
	"learning_rate": 3.3046315338757026e-07,
	"loss": 0.3084,
	"reward": 0.1041297996416688,
	"reward_std": 0.5661944150924683,
	"rewards/cosine_scaled_reward": -0.2187684327363968,
	"rewards/format_reward": 0.541666679084301,
	"step": 348
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1280.5000305175781,
	"epoch": 0.39885714285714285,
	"grad_norm": 5.965340713566614,
	"kl": 0.0919189453125,
	"learning_rate": 3.2772616003709616e-07,
	"loss": 0.264,
	"reward": 0.5343287643045187,
	"reward_std": 1.0619665831327438,
	"rewards/cosine_scaled_reward": -0.024502300075255334,
	"rewards/format_reward": 0.5833333432674408,
	"step": 349
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1420.2291870117188,
	"epoch": 0.4,
	"grad_norm": 2.925238124886515,
	"kl": 0.185791015625,
	"learning_rate": 3.250000000000001e-07,
	"loss": 0.1961,
	"reward": 0.12700789980590343,
	"reward_std": 0.8331074118614197,
	"rewards/cosine_scaled_reward": -0.1656627282500267,
	"rewards/format_reward": 0.4583333432674408,
	"step": 350
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1083.6042175292969,
	"epoch": 0.40114285714285713,
	"grad_norm": 3.606767246674259,
	"kl": 0.18115234375,
	"learning_rate": 3.222848061454764e-07,
	"loss": -0.0154,
	"reward": 0.25727599672973156,
	"reward_std": 0.6387183666229248,
	"rewards/cosine_scaled_reward": -0.18386201839894056,
	"rewards/format_reward": 0.6250000074505806,
	"step": 351
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1439.1250610351562,
	"epoch": 0.4022857142857143,
	"grad_norm": 1.929818758425276,
	"kl": 0.1397705078125,
	"learning_rate": 3.195807108082429e-07,
	"loss": 0.1728,
	"reward": -0.14825151395052671,
	"reward_std": 0.5558790042996407,
	"rewards/cosine_scaled_reward": -0.2824591100215912,
	"rewards/format_reward": 0.4166666716337204,
	"step": 352
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1523.3542175292969,
	"epoch": 0.4034285714285714,
	"grad_norm": 1.501137402879622,
	"kl": 0.15606689453125,
	"learning_rate": 3.168878457820915e-07,
	"loss": 0.1054,
	"reward": -0.2005203291773796,
	"reward_std": 0.5384240373969078,
	"rewards/cosine_scaled_reward": -0.2565101645886898,
	"rewards/format_reward": 0.31250000558793545,
	"step": 353
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1457.8125305175781,
	"epoch": 0.4045714285714286,
	"grad_norm": 1.7447813143906967,
	"kl": 0.19287109375,
	"learning_rate": 3.142063423134644e-07,
	"loss": 0.0946,
	"reward": -0.07205517496913671,
	"reward_std": 0.5912996232509613,
	"rewards/cosine_scaled_reward": -0.27561092376708984,
	"rewards/format_reward": 0.4791666865348816,
	"step": 354
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 935.1041793823242,
	"epoch": 0.4057142857142857,
	"grad_norm": 5.735907828017728,
	"kl": 0.416259765625,
	"learning_rate": 3.115363310950578e-07,
	"loss": 0.2126,
	"reward": 0.6018264503218234,
	"reward_std": 0.43670547753572464,
	"rewards/cosine_scaled_reward": -0.04283679276704788,
	"rewards/format_reward": 0.6875000149011612,
	"step": 355
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1400.4375610351562,
	"epoch": 0.40685714285714286,
	"grad_norm": 4.2513620245343855,
	"kl": 0.2110595703125,
	"learning_rate": 3.0887794225945143e-07,
	"loss": 0.0986,
	"reward": 0.07107849605381489,
	"reward_std": 0.6532387360930443,
	"rewards/cosine_scaled_reward": -0.22487742826342583,
	"rewards/format_reward": 0.5208333432674408,
	"step": 356
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1214.1250305175781,
	"epoch": 0.408,
	"grad_norm": 1.8135177203210504,
	"kl": 0.1314697265625,
	"learning_rate": 3.062313053727671e-07,
	"loss": 0.1426,
	"reward": 0.03724817745387554,
	"reward_std": 0.5181447230279446,
	"rewards/cosine_scaled_reward": -0.2730425810441375,
	"rewards/format_reward": 0.5833333507180214,
	"step": 357
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1284.7500305175781,
	"epoch": 0.40914285714285714,
	"grad_norm": 3.565695417018542,
	"kl": 0.18597412109375,
	"learning_rate": 3.0359654942835247e-07,
	"loss": 0.1245,
	"reward": 0.04130622744560242,
	"reward_std": 0.7205251231789589,
	"rewards/cosine_scaled_reward": -0.19809689931571484,
	"rewards/format_reward": 0.4375000111758709,
	"step": 358
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1343.2083740234375,
	"epoch": 0.4102857142857143,
	"grad_norm": 3.2057830256260917,
	"kl": 0.14324951171875,
	"learning_rate": 3.0097380284049523e-07,
	"loss": -0.0078,
	"reward": 0.1697351299226284,
	"reward_std": 0.3564612567424774,
	"rewards/cosine_scaled_reward": -0.13388244062662125,
	"rewards/format_reward": 0.4375000111758709,
	"step": 359
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1433.2291870117188,
	"epoch": 0.4114285714285714,
	"grad_norm": 1.6762255456245136,
	"kl": 0.1739501953125,
	"learning_rate": 2.9836319343816397e-07,
	"loss": 0.1781,
	"reward": 0.21988008171319962,
	"reward_std": 0.7903619408607483,
	"rewards/cosine_scaled_reward": -0.10880996193736792,
	"rewards/format_reward": 0.4375000223517418,
	"step": 360
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1309.6250457763672,
	"epoch": 0.4125714285714286,
	"grad_norm": 0.9826821036841882,
	"kl": 0.135498046875,
	"learning_rate": 2.9576484845877793e-07,
	"loss": 0.0186,
	"reward": 0.33486853912472725,
	"reward_std": 0.500580433756113,
	"rewards/cosine_scaled_reward": -0.11381572997197509,
	"rewards/format_reward": 0.5625000149011612,
	"step": 361
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1173.5625305175781,
	"epoch": 0.4137142857142857,
	"grad_norm": 4.226570835684713,
	"kl": 0.0926513671875,
	"learning_rate": 2.931788945420058e-07,
	"loss": 0.18,
	"reward": 0.15393588319420815,
	"reward_std": 0.5774414390325546,
	"rewards/cosine_scaled_reward": -0.20428206771612167,
	"rewards/format_reward": 0.5625000149011612,
	"step": 362
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1250.0416870117188,
	"epoch": 0.41485714285714287,
	"grad_norm": 1.978862188088671,
	"kl": 0.09033203125,
	"learning_rate": 2.9060545772359305e-07,
	"loss": 0.1327,
	"reward": 0.2741839215159416,
	"reward_std": 0.6551093906164169,
	"rewards/cosine_scaled_reward": -0.17540805786848068,
	"rewards/format_reward": 0.6250000149011612,
	"step": 363
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1354.9583892822266,
	"epoch": 0.416,
	"grad_norm": 1.500971160749094,
	"kl": 0.1431884765625,
	"learning_rate": 2.8804466342921987e-07,
	"loss": 0.1556,
	"reward": 0.09914333745837212,
	"reward_std": 0.5969183072447777,
	"rewards/cosine_scaled_reward": -0.17959501221776009,
	"rewards/format_reward": 0.4583333432674408,
	"step": 364
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1443.8958435058594,
	"epoch": 0.41714285714285715,
	"grad_norm": 1.5336203716893533,
	"kl": 0.14166259765625,
	"learning_rate": 2.854966364683872e-07,
	"loss": 0.0794,
	"reward": 0.08230920624919236,
	"reward_std": 0.7491874545812607,
	"rewards/cosine_scaled_reward": -0.18801206350326538,
	"rewards/format_reward": 0.4583333358168602,
	"step": 365
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1280.0833740234375,
	"epoch": 0.41828571428571426,
	"grad_norm": 5.917103922817008,
	"kl": 0.1395263671875,
	"learning_rate": 2.829615010283344e-07,
	"loss": 0.2201,
	"reward": 0.30844624526798725,
	"reward_std": 0.6032929718494415,
	"rewards/cosine_scaled_reward": -0.11661022901535034,
	"rewards/format_reward": 0.5416666865348816,
	"step": 366
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1071.7083587646484,
	"epoch": 0.41942857142857143,
	"grad_norm": 6.764653159306351,
	"kl": 1.21533203125,
	"learning_rate": 2.8043938066798645e-07,
	"loss": 0.2217,
	"reward": 0.5629880558699369,
	"reward_std": 0.7271402254700661,
	"rewards/cosine_scaled_reward": -0.02058931067585945,
	"rewards/format_reward": 0.6041666716337204,
	"step": 367
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1272.2708892822266,
	"epoch": 0.4205714285714286,
	"grad_norm": 3.868751461553908,
	"kl": 0.376953125,
	"learning_rate": 2.7793039831193133e-07,
	"loss": 0.0282,
	"reward": 0.2414314430207014,
	"reward_std": 0.783539354801178,
	"rewards/cosine_scaled_reward": -0.13970092684030533,
	"rewards/format_reward": 0.5208333544433117,
	"step": 368
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1453.8333740234375,
	"epoch": 0.4217142857142857,
	"grad_norm": 1.6191974125060598,
	"kl": 0.29150390625,
	"learning_rate": 2.7543467624442956e-07,
	"loss": 0.172,
	"reward": 0.11266430467367172,
	"reward_std": 0.7149153798818588,
	"rewards/cosine_scaled_reward": -0.1415845244191587,
	"rewards/format_reward": 0.39583334140479565,
	"step": 369
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1463.3959045410156,
	"epoch": 0.4228571428571429,
	"grad_norm": 4.101308083609096,
	"kl": 0.56884765625,
	"learning_rate": 2.729523361034538e-07,
	"loss": 0.2149,
	"reward": -0.2552230432629585,
	"reward_std": 0.5415500551462173,
	"rewards/cosine_scaled_reward": -0.26302820444107056,
	"rewards/format_reward": 0.27083334140479565,
	"step": 370
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1240.8542175292969,
	"epoch": 0.424,
	"grad_norm": 3.8927886605185447,
	"kl": 0.30340576171875,
	"learning_rate": 2.7048349887476037e-07,
	"loss": 0.1602,
	"reward": 0.1614240426570177,
	"reward_std": 0.5875495374202728,
	"rewards/cosine_scaled_reward": -0.15887131541967392,
	"rewards/format_reward": 0.479166679084301,
	"step": 371
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1177.3958740234375,
	"epoch": 0.42514285714285716,
	"grad_norm": 3.066569475752354,
	"kl": 0.1824951171875,
	"learning_rate": 2.6802828488599294e-07,
	"loss": 0.1059,
	"reward": 0.2956250160932541,
	"reward_std": 0.6594211757183075,
	"rewards/cosine_scaled_reward": -0.12302083522081375,
	"rewards/format_reward": 0.5416666939854622,
	"step": 372
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1163.4375610351562,
	"epoch": 0.42628571428571427,
	"grad_norm": 5.09566585578463,
	"kl": 0.2724609375,
	"learning_rate": 2.655868138008171e-07,
	"loss": 0.1544,
	"reward": 0.07318597589619458,
	"reward_std": 0.7096846550703049,
	"rewards/cosine_scaled_reward": -0.2759070098400116,
	"rewards/format_reward": 0.6250000149011612,
	"step": 373
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1246.7500457763672,
	"epoch": 0.42742857142857144,
	"grad_norm": 32.203352857308325,
	"kl": 0.839111328125,
	"learning_rate": 2.631592046130896e-07,
	"loss": 0.1927,
	"reward": 0.08969515189528465,
	"reward_std": 0.6610818058252335,
	"rewards/cosine_scaled_reward": -0.22598576080054045,
	"rewards/format_reward": 0.5416666865348816,
	"step": 374
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1280.7708740234375,
	"epoch": 0.42857142857142855,
	"grad_norm": 63.335567619096544,
	"kl": 0.94873046875,
	"learning_rate": 2.6074557564105724e-07,
	"loss": 0.2184,
	"reward": 0.18546735402196646,
	"reward_std": 0.9102050960063934,
	"rewards/cosine_scaled_reward": -0.17809965554624796,
	"rewards/format_reward": 0.5416667014360428,
	"step": 375
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1256.6250305175781,
	"epoch": 0.4297142857142857,
	"grad_norm": 3.6519960558396716,
	"kl": 0.310302734375,
	"learning_rate": 2.583460445215911e-07,
	"loss": 0.1114,
	"reward": 0.1940733604133129,
	"reward_std": 0.5819907337427139,
	"rewards/cosine_scaled_reward": -0.1946299858391285,
	"rewards/format_reward": 0.583333358168602,
	"step": 376
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1285.6458740234375,
	"epoch": 0.4308571428571429,
	"grad_norm": 5.319529708040252,
	"kl": 0.3394775390625,
	"learning_rate": 2.5596072820445254e-07,
	"loss": 0.0359,
	"reward": 0.25018906872719526,
	"reward_std": 0.8042758777737617,
	"rewards/cosine_scaled_reward": -0.13532213680446148,
	"rewards/format_reward": 0.5208333432674408,
	"step": 377
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1374.5416870117188,
	"epoch": 0.432,
	"grad_norm": 20.461016176615068,
	"kl": 0.70166015625,
	"learning_rate": 2.5358974294659373e-07,
	"loss": 0.2346,
	"reward": -0.005498896003700793,
	"reward_std": 0.5357099026441574,
	"rewards/cosine_scaled_reward": -0.22149945423007011,
	"rewards/format_reward": 0.4375000074505806,
	"step": 378
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1435.1875,
	"epoch": 0.43314285714285716,
	"grad_norm": 2.391831824846237,
	"kl": 0.292236328125,
	"learning_rate": 2.512332043064913e-07,
	"loss": 0.1982,
	"reward": 0.012932289391756058,
	"reward_std": 0.799980454146862,
	"rewards/cosine_scaled_reward": -0.20186719112098217,
	"rewards/format_reward": 0.4166666865348816,
	"step": 379
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1369.5833740234375,
	"epoch": 0.4342857142857143,
	"grad_norm": 2.2747857355280208,
	"kl": 0.1715087890625,
	"learning_rate": 2.488912271385139e-07,
	"loss": 0.1725,
	"reward": -0.22791396314278245,
	"reward_std": 0.4170580878853798,
	"rewards/cosine_scaled_reward": -0.3431236445903778,
	"rewards/format_reward": 0.4583333544433117,
	"step": 380
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1187.6458587646484,
	"epoch": 0.43542857142857144,
	"grad_norm": 2.7553958817382593,
	"kl": 0.16162109375,
	"learning_rate": 2.465639255873246e-07,
	"loss": 0.1247,
	"reward": 0.19117721682414412,
	"reward_std": 0.46048377081751823,
	"rewards/cosine_scaled_reward": -0.23774472624063492,
	"rewards/format_reward": 0.6666667014360428,
	"step": 381
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1300.3125305175781,
	"epoch": 0.43657142857142855,
	"grad_norm": 2.0362039263750082,
	"kl": 0.1822509765625,
	"learning_rate": 2.4425141308231765e-07,
	"loss": 0.1158,
	"reward": 0.2739548869431019,
	"reward_std": 0.603746622800827,
	"rewards/cosine_scaled_reward": -0.09218922536820173,
	"rewards/format_reward": 0.4583333432674408,
	"step": 382
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1263.7917175292969,
	"epoch": 0.4377142857142857,
	"grad_norm": 7.617696331239462,
	"kl": 0.2333984375,
	"learning_rate": 2.4195380233209006e-07,
	"loss": 0.1076,
	"reward": 0.12070683389902115,
	"reward_std": 0.38592402543872595,
	"rewards/cosine_scaled_reward": -0.18964658118784428,
	"rewards/format_reward": 0.5,
	"step": 383
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1170.3750457763672,
	"epoch": 0.43885714285714283,
	"grad_norm": 3.2601623912372233,
	"kl": 0.2103271484375,
	"learning_rate": 2.3967120531894857e-07,
	"loss": 0.1471,
	"reward": -0.021999074146151543,
	"reward_std": 0.34355130419135094,
	"rewards/cosine_scaled_reward": -0.31308288127183914,
	"rewards/format_reward": 0.6041666865348816,
	"step": 384
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1233.7083740234375,
	"epoch": 0.44,
	"grad_norm": 2.1916650637468025,
	"kl": 0.16259765625,
	"learning_rate": 2.374037332934512e-07,
	"loss": 0.0922,
	"reward": 0.054161038249731064,
	"reward_std": 0.7442760765552521,
	"rewards/cosine_scaled_reward": -0.2541694864630699,
	"rewards/format_reward": 0.5625000149011612,
	"step": 385
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 870.3333740234375,
	"epoch": 0.44114285714285717,
	"grad_norm": 1.4860604247340325,
	"kl": 0.0894775390625,
	"learning_rate": 2.3515149676898552e-07,
	"loss": 0.1158,
	"reward": 0.28954136464744806,
	"reward_std": 0.5479708462953568,
	"rewards/cosine_scaled_reward": -0.240646006539464,
	"rewards/format_reward": 0.770833358168602,
	"step": 386
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1227.2291870117188,
	"epoch": 0.4422857142857143,
	"grad_norm": 1.687755076974517,
	"kl": 0.2470703125,
	"learning_rate": 2.3291460551638237e-07,
	"loss": 0.151,
	"reward": -0.0012904666364192963,
	"reward_std": 0.4440325200557709,
	"rewards/cosine_scaled_reward": -0.2714785784482956,
	"rewards/format_reward": 0.5416666865348816,
	"step": 387
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1033.6042022705078,
	"epoch": 0.44342857142857145,
	"grad_norm": 2.5341444420596884,
	"kl": 0.164947509765625,
	"learning_rate": 2.306931685585657e-07,
	"loss": 0.0897,
	"reward": 0.4180222749710083,
	"reward_std": 0.754804901778698,
	"rewards/cosine_scaled_reward": -0.14515553694218397,
	"rewards/format_reward": 0.7083333432674408,
	"step": 388
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1356.8333740234375,
	"epoch": 0.44457142857142856,
	"grad_norm": 3.704344948231344,
	"kl": 0.372314453125,
	"learning_rate": 2.2848729416523859e-07,
	"loss": 0.102,
	"reward": 0.2806839719414711,
	"reward_std": 0.6125510483980179,
	"rewards/cosine_scaled_reward": -0.07840801030397415,
	"rewards/format_reward": 0.4375000074505806,
	"step": 389
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1090.2708740234375,
	"epoch": 0.44571428571428573,
	"grad_norm": 14.470921296685399,
	"kl": 0.47216796875,
	"learning_rate": 2.2629708984760706e-07,
	"loss": 0.2654,
	"reward": 0.07703178748488426,
	"reward_std": 0.5665107443928719,
	"rewards/cosine_scaled_reward": -0.26356743834912777,
	"rewards/format_reward": 0.6041666939854622,
	"step": 390
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1098.7500305175781,
	"epoch": 0.44685714285714284,
	"grad_norm": 2.4001916122615157,
	"kl": 0.26904296875,
	"learning_rate": 2.2412266235313973e-07,
	"loss": 0.1304,
	"reward": 0.2017030455172062,
	"reward_std": 0.5325312875211239,
	"rewards/cosine_scaled_reward": -0.20123182306997478,
	"rewards/format_reward": 0.6041666865348816,
	"step": 391
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1404.0625610351562,
	"epoch": 0.448,
	"grad_norm": 12.93850484473414,
	"kl": 0.662109375,
	"learning_rate": 2.2196411766036487e-07,
	"loss": 0.0663,
	"reward": 0.39279897045344114,
	"reward_std": 0.9181084930896759,
	"rewards/cosine_scaled_reward": -0.04318385384976864,
	"rewards/format_reward": 0.4791666939854622,
	"step": 392
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1046.0417175292969,
	"epoch": 0.4491428571428571,
	"grad_norm": 3.1910943036863695,
	"kl": 0.2236328125,
	"learning_rate": 2.1982156097370557e-07,
	"loss": 0.094,
	"reward": 0.1259294361807406,
	"reward_std": 0.620373547077179,
	"rewards/cosine_scaled_reward": -0.23911861330270767,
	"rewards/format_reward": 0.6041666865348816,
	"step": 393
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 898.3333511352539,
	"epoch": 0.4502857142857143,
	"grad_norm": 4.93057169428389,
	"kl": 0.25714111328125,
	"learning_rate": 2.1769509671835223e-07,
	"loss": 0.2665,
	"reward": 0.2223543766885996,
	"reward_std": 0.4368506968021393,
	"rewards/cosine_scaled_reward": -0.23257281631231308,
	"rewards/format_reward": 0.6875000149011612,
	"step": 394
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1140.0000457763672,
	"epoch": 0.4514285714285714,
	"grad_norm": 2.3738396662205945,
	"kl": 0.35986328125,
	"learning_rate": 2.1558482853517253e-07,
	"loss": 0.3105,
	"reward": 0.10918148793280125,
	"reward_std": 0.5202281884849072,
	"rewards/cosine_scaled_reward": -0.21624258160591125,
	"rewards/format_reward": 0.5416666865348816,
	"step": 395
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1151.3958740234375,
	"epoch": 0.45257142857142857,
	"grad_norm": 2.5367764499763257,
	"kl": 0.3154296875,
	"learning_rate": 2.134908592756607e-07,
	"loss": 0.1917,
	"reward": 0.17909681051969528,
	"reward_std": 0.7349686250090599,
	"rewards/cosine_scaled_reward": -0.2021182719618082,
	"rewards/format_reward": 0.5833333432674408,
	"step": 396
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1223.8958740234375,
	"epoch": 0.45371428571428574,
	"grad_norm": 3.0861426217577645,
	"kl": 0.38720703125,
	"learning_rate": 2.1141329099692406e-07,
	"loss": 0.2308,
	"reward": 0.6319128852337599,
	"reward_std": 0.8242618143558502,
	"rewards/cosine_scaled_reward": 0.04512310400605202,
	"rewards/format_reward": 0.541666679084301,
	"step": 397
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1218.8541870117188,
	"epoch": 0.45485714285714285,
	"grad_norm": 18.365837770437405,
	"kl": 0.6829833984375,
	"learning_rate": 2.0935222495670968e-07,
	"loss": 0.189,
	"reward": 0.27588833356276155,
	"reward_std": 0.8127910792827606,
	"rewards/cosine_scaled_reward": -0.19538918882608414,
	"rewards/format_reward": 0.6666666939854622,
	"step": 398
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1264.1666870117188,
	"epoch": 0.456,
	"grad_norm": 3.8049582826738373,
	"kl": 0.47314453125,
	"learning_rate": 2.0730776160846853e-07,
	"loss": 0.1823,
	"reward": 0.055698491632938385,
	"reward_std": 0.49411067366600037,
	"rewards/cosine_scaled_reward": -0.21173409838229418,
	"rewards/format_reward": 0.4791666828095913,
	"step": 399
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1013.3750457763672,
	"epoch": 0.45714285714285713,
	"grad_norm": 7.251771375036044,
	"kl": 0.4078369140625,
	"learning_rate": 2.0528000059645995e-07,
	"loss": 0.175,
	"reward": 0.2562308683991432,
	"reward_std": 0.2563706263899803,
	"rewards/cosine_scaled_reward": -0.2260512337088585,
	"rewards/format_reward": 0.7083333488553762,
	"step": 400
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 952.7291870117188,
	"epoch": 0.4582857142857143,
	"grad_norm": 8.82258461767532,
	"kl": 0.45355224609375,
	"learning_rate": 2.032690407508949e-07,
	"loss": 0.1529,
	"reward": 0.4902263447875157,
	"reward_std": 0.5446355119347572,
	"rewards/cosine_scaled_reward": -0.11947017908096313,
	"rewards/format_reward": 0.729166679084301,
	"step": 401
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1302.8542175292969,
	"epoch": 0.4594285714285714,
	"grad_norm": 9.144934630730456,
	"kl": 0.51953125,
	"learning_rate": 2.0127498008311922e-07,
	"loss": 0.1489,
	"reward": 0.0001004636287689209,
	"reward_std": 0.5631029531359673,
	"rewards/cosine_scaled_reward": -0.28119976818561554,
	"rewards/format_reward": 0.5625000223517418,
	"step": 402
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1430.5833740234375,
	"epoch": 0.4605714285714286,
	"grad_norm": 1.9477748820622875,
	"kl": 0.3515625,
	"learning_rate": 1.9929791578083655e-07,
	"loss": 0.2408,
	"reward": -0.06413780152797699,
	"reward_std": 0.7934899777173996,
	"rewards/cosine_scaled_reward": -0.2195689007639885,
	"rewards/format_reward": 0.3750000037252903,
	"step": 403
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1262.8333587646484,
	"epoch": 0.4617142857142857,
	"grad_norm": 3.330875199108497,
	"kl": 0.19140625,
	"learning_rate": 1.9733794420337213e-07,
	"loss": 0.1344,
	"reward": 0.1329102972522378,
	"reward_std": 0.5511343032121658,
	"rewards/cosine_scaled_reward": -0.25646152906119823,
	"rewards/format_reward": 0.645833358168602,
	"step": 404
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1030.2916870117188,
	"epoch": 0.46285714285714286,
	"grad_norm": 7.2006501333137996,
	"kl": 0.147216796875,
	"learning_rate": 1.9539516087697517e-07,
	"loss": 0.1614,
	"reward": 0.41257511638104916,
	"reward_std": 0.4603617787361145,
	"rewards/cosine_scaled_reward": -0.14787913113832474,
	"rewards/format_reward": 0.7083333432674408,
	"step": 405
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1085.1666870117188,
	"epoch": 0.464,
	"grad_norm": 1.7990135612572722,
	"kl": 0.25146484375,
	"learning_rate": 1.934696604901642e-07,
	"loss": 0.1199,
	"reward": -0.0262349434196949,
	"reward_std": 0.4924147129058838,
	"rewards/cosine_scaled_reward": -0.2839508093893528,
	"rewards/format_reward": 0.5416666828095913,
	"step": 406
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 953.4375,
	"epoch": 0.46514285714285714,
	"grad_norm": 2.205384781012483,
	"kl": 0.16357421875,
	"learning_rate": 1.915615368891117e-07,
	"loss": 0.0901,
	"reward": 0.5169772207736969,
	"reward_std": 0.28926569409668446,
	"rewards/cosine_scaled_reward": -0.0748447310179472,
	"rewards/format_reward": 0.6666666828095913,
	"step": 407
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 914.9583587646484,
	"epoch": 0.4662857142857143,
	"grad_norm": 2.497956913876472,
	"kl": 0.27496337890625,
	"learning_rate": 1.8967088307307e-07,
	"loss": 0.1155,
	"reward": 0.3262156348209828,
	"reward_std": 0.6255160942673683,
	"rewards/cosine_scaled_reward": -0.13897553086280823,
	"rewards/format_reward": 0.6041666716337204,
	"step": 408
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1040.8333435058594,
	"epoch": 0.4674285714285714,
	"grad_norm": 8.69885706641019,
	"kl": 0.2950439453125,
	"learning_rate": 1.8779779118983867e-07,
	"loss": 0.1446,
	"reward": 0.45548180863261223,
	"reward_std": 0.683892697095871,
	"rewards/cosine_scaled_reward": -0.1472591133788228,
	"rewards/format_reward": 0.7500000149011612,
	"step": 409
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1113.8750305175781,
	"epoch": 0.4685714285714286,
	"grad_norm": 2.827095364325863,
	"kl": 0.17364501953125,
	"learning_rate": 1.8594235253127372e-07,
	"loss": 0.1365,
	"reward": -0.055647075176239014,
	"reward_std": 0.5701718181371689,
	"rewards/cosine_scaled_reward": -0.3299068883061409,
	"rewards/format_reward": 0.6041666939854622,
	"step": 410
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1180.7500305175781,
	"epoch": 0.4697142857142857,
	"grad_norm": 6.312691045251246,
	"kl": 0.2171630859375,
	"learning_rate": 1.8410465752883758e-07,
	"loss": 0.26,
	"reward": -0.027378916274756193,
	"reward_std": 0.5135050415992737,
	"rewards/cosine_scaled_reward": -0.33660613000392914,
	"rewards/format_reward": 0.6458333432674408,
	"step": 411
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 936.0625152587891,
	"epoch": 0.47085714285714286,
	"grad_norm": 8.466457070247934,
	"kl": 0.207763671875,
	"learning_rate": 1.822847957491922e-07,
	"loss": 0.2152,
	"reward": 0.2903781367931515,
	"reward_std": 0.6151079386472702,
	"rewards/cosine_scaled_reward": -0.24022759683430195,
	"rewards/format_reward": 0.770833358168602,
	"step": 412
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1127.9375610351562,
	"epoch": 0.472,
	"grad_norm": 7.005816452720984,
	"kl": 0.23388671875,
	"learning_rate": 1.804828558898332e-07,
	"loss": 0.2359,
	"reward": -0.05256163072772324,
	"reward_std": 0.5086416229605675,
	"rewards/cosine_scaled_reward": -0.30753082782030106,
	"rewards/format_reward": 0.5625000223517418,
	"step": 413
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1254.2292175292969,
	"epoch": 0.47314285714285714,
	"grad_norm": 3.1930529627345146,
	"kl": 0.30908203125,
	"learning_rate": 1.7869892577476722e-07,
	"loss": 0.091,
	"reward": 0.27630291134119034,
	"reward_std": 0.601336345076561,
	"rewards/cosine_scaled_reward": -0.12226520664989948,
	"rewards/format_reward": 0.520833358168602,
	"step": 414
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1198.7083740234375,
	"epoch": 0.4742857142857143,
	"grad_norm": 1.9203274121236615,
	"kl": 0.27783203125,
	"learning_rate": 1.7693309235023127e-07,
	"loss": 0.1839,
	"reward": 0.15045135095715523,
	"reward_std": 0.8359555453062057,
	"rewards/cosine_scaled_reward": -0.21644099615514278,
	"rewards/format_reward": 0.5833333507180214,
	"step": 415
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1303.2708435058594,
	"epoch": 0.4754285714285714,
	"grad_norm": 5.219130595783076,
	"kl": 0.288330078125,
	"learning_rate": 1.7518544168045524e-07,
	"loss": 0.2384,
	"reward": 0.06198018416762352,
	"reward_std": 0.7209452688694,
	"rewards/cosine_scaled_reward": -0.2502599246799946,
	"rewards/format_reward": 0.5625000149011612,
	"step": 416
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1104.3958740234375,
	"epoch": 0.4765714285714286,
	"grad_norm": 343.4311543801194,
	"kl": 3.455078125,
	"learning_rate": 1.7345605894346726e-07,
	"loss": 0.3667,
	"reward": 0.25671100057661533,
	"reward_std": 0.5841851308941841,
	"rewards/cosine_scaled_reward": -0.19456118065863848,
	"rewards/format_reward": 0.645833358168602,
	"step": 417
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1071.9375305175781,
	"epoch": 0.4777142857142857,
	"grad_norm": 3.5739561302927703,
	"kl": 0.18438720703125,
	"learning_rate": 1.7174502842694212e-07,
	"loss": 0.0318,
	"reward": 0.18263494968414307,
	"reward_std": 0.688008576631546,
	"rewards/cosine_scaled_reward": -0.25243253633379936,
	"rewards/format_reward": 0.6875000149011612,
	"step": 418
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1059.2500457763672,
	"epoch": 0.47885714285714287,
	"grad_norm": 42.82614000306872,
	"kl": 14.88720703125,
	"learning_rate": 1.7005243352409333e-07,
	"loss": 0.182,
	"reward": 0.10820261249318719,
	"reward_std": 0.658612459897995,
	"rewards/cosine_scaled_reward": -0.24798204004764557,
	"rewards/format_reward": 0.6041666716337204,
	"step": 419
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1066.7917022705078,
	"epoch": 0.48,
	"grad_norm": 7.563689623131912,
	"kl": 0.54296875,
	"learning_rate": 1.6837835672960831e-07,
	"loss": 0.1366,
	"reward": 0.24830662203021348,
	"reward_std": 0.6641267538070679,
	"rewards/cosine_scaled_reward": -0.19876337423920631,
	"rewards/format_reward": 0.6458333432674408,
	"step": 420
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1140.8958740234375,
	"epoch": 0.48114285714285715,
	"grad_norm": 5.102712434876203,
	"kl": 0.455322265625,
	"learning_rate": 1.6672287963562852e-07,
	"loss": 0.238,
	"reward": 0.22175164567306638,
	"reward_std": 0.48806294053792953,
	"rewards/cosine_scaled_reward": -0.19120752811431885,
	"rewards/format_reward": 0.6041666865348816,
	"step": 421
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1181.3333587646484,
	"epoch": 0.48228571428571426,
	"grad_norm": 11.187728016893017,
	"kl": 0.7470703125,
	"learning_rate": 1.6508608292777203e-07,
	"loss": 0.2428,
	"reward": 0.016264647245407104,
	"reward_std": 0.7520715892314911,
	"rewards/cosine_scaled_reward": -0.27311767637729645,
	"rewards/format_reward": 0.5625000149011612,
	"step": 422
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1151.5625305175781,
	"epoch": 0.48342857142857143,
	"grad_norm": 36.484656907353894,
	"kl": 1.12109375,
	"learning_rate": 1.6346804638120098e-07,
	"loss": 0.225,
	"reward": 0.166658578440547,
	"reward_std": 0.5137820392847061,
	"rewards/cosine_scaled_reward": -0.20833738893270493,
	"rewards/format_reward": 0.5833333432674408,
	"step": 423
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1222.4167022705078,
	"epoch": 0.4845714285714286,
	"grad_norm": 5.314021913144739,
	"kl": 0.468994140625,
	"learning_rate": 1.6186884885673413e-07,
	"loss": 0.0791,
	"reward": -0.053052062867209315,
	"reward_std": 0.5032695159316063,
	"rewards/cosine_scaled_reward": -0.349442720413208,
	"rewards/format_reward": 0.6458333507180214,
	"step": 424
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1303.0417022705078,
	"epoch": 0.4857142857142857,
	"grad_norm": 15.439357915372184,
	"kl": 0.76171875,
	"learning_rate": 1.6028856829700258e-07,
	"loss": 0.1567,
	"reward": 0.06288054899778217,
	"reward_std": 0.8221424967050552,
	"rewards/cosine_scaled_reward": -0.24980972707271576,
	"rewards/format_reward": 0.5625000111758709,
	"step": 425
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1363.2292175292969,
	"epoch": 0.4868571428571429,
	"grad_norm": 16.190560753791,
	"kl": 0.64306640625,
	"learning_rate": 1.5872728172265146e-07,
	"loss": 0.2057,
	"reward": 0.0070614293217659,
	"reward_std": 0.8801029026508331,
	"rewards/cosine_scaled_reward": -0.18396929651498795,
	"rewards/format_reward": 0.3750000111758709,
	"step": 426
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1062.6250457763672,
	"epoch": 0.488,
	"grad_norm": 5.208018104302035,
	"kl": 0.289306640625,
	"learning_rate": 1.5718506522858572e-07,
	"loss": 0.2392,
	"reward": 0.1040960568934679,
	"reward_std": 0.7021225243806839,
	"rewards/cosine_scaled_reward": -0.21878531202673912,
	"rewards/format_reward": 0.5416666865348816,
	"step": 427
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1168.6250457763672,
	"epoch": 0.48914285714285716,
	"grad_norm": 1.7936384513629215,
	"kl": 0.194366455078125,
	"learning_rate": 1.5566199398026147e-07,
	"loss": 0.1231,
	"reward": 0.1094297245144844,
	"reward_std": 0.5426923930644989,
	"rewards/cosine_scaled_reward": -0.247368473559618,
	"rewards/format_reward": 0.6041666716337204,
	"step": 428
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1089.6875305175781,
	"epoch": 0.49028571428571427,
	"grad_norm": 3.242866515089598,
	"kl": 0.18408203125,
	"learning_rate": 1.5415814221002265e-07,
	"loss": 0.1081,
	"reward": 0.4839252680540085,
	"reward_std": 0.5947171896696091,
	"rewards/cosine_scaled_reward": -0.03928736597299576,
	"rewards/format_reward": 0.5625000260770321,
	"step": 429
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1313.5833587646484,
	"epoch": 0.49142857142857144,
	"grad_norm": 1.478054069262014,
	"kl": 0.21612548828125,
	"learning_rate": 1.5267358321348285e-07,
	"loss": 0.1273,
	"reward": 0.15572084113955498,
	"reward_std": 0.5618212074041367,
	"rewards/cosine_scaled_reward": -0.18255625164601952,
	"rewards/format_reward": 0.5208333358168602,
	"step": 430
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 975.4791870117188,
	"epoch": 0.49257142857142855,
	"grad_norm": 3.541465585724065,
	"kl": 0.1605224609375,
	"learning_rate": 1.5120838934595337e-07,
	"loss": 0.1148,
	"reward": 0.420807933434844,
	"reward_std": 0.890654593706131,
	"rewards/cosine_scaled_reward": -0.11251270584762096,
	"rewards/format_reward": 0.6458333507180214,
	"step": 431
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1181.062515258789,
	"epoch": 0.4937142857142857,
	"grad_norm": 3.350973639300781,
	"kl": 0.155517578125,
	"learning_rate": 1.4976263201891613e-07,
	"loss": 0.1027,
	"reward": 0.032605723943561316,
	"reward_std": 0.5731803774833679,
	"rewards/cosine_scaled_reward": -0.2753637991845608,
	"rewards/format_reward": 0.5833333507180214,
	"step": 432
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1207.0417175292969,
	"epoch": 0.4948571428571429,
	"grad_norm": 4.990349151202906,
	"kl": 0.185546875,
	"learning_rate": 1.483363816965435e-07,
	"loss": 0.1393,
	"reward": 0.08886189805343747,
	"reward_std": 0.4594448246061802,
	"rewards/cosine_scaled_reward": -0.23681906727142632,
	"rewards/format_reward": 0.5625000298023224,
	"step": 433
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 859.1250457763672,
	"epoch": 0.496,
	"grad_norm": 1.9877951359345267,
	"kl": 0.17950439453125,
	"learning_rate": 1.469297078922642e-07,
	"loss": 0.0512,
	"reward": 1.2721150815486908,
	"reward_std": 0.6770742386579514,
	"rewards/cosine_scaled_reward": 0.20897419564425945,
	"rewards/format_reward": 0.8541666716337204,
	"step": 434
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1266.375015258789,
	"epoch": 0.49714285714285716,
	"grad_norm": 1.8972601097369153,
	"kl": 0.2255859375,
	"learning_rate": 1.4554267916537495e-07,
	"loss": 0.1234,
	"reward": 0.10697830189019442,
	"reward_std": 0.531020175665617,
	"rewards/cosine_scaled_reward": -0.22776086255908012,
	"rewards/format_reward": 0.5625000149011612,
	"step": 435
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1315.5625305175781,
	"epoch": 0.4982857142857143,
	"grad_norm": 2.490164316553904,
	"kl": 0.2635498046875,
	"learning_rate": 1.4417536311769885e-07,
	"loss": 0.1196,
	"reward": -0.10972822457551956,
	"reward_std": 0.5596715956926346,
	"rewards/cosine_scaled_reward": -0.2840307876467705,
	"rewards/format_reward": 0.4583333395421505,
	"step": 436
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1060.5208587646484,
	"epoch": 0.49942857142857144,
	"grad_norm": 1.9387158266765225,
	"kl": 0.294189453125,
	"learning_rate": 1.4282782639029128e-07,
	"loss": 0.0174,
	"reward": 0.52107123285532,
	"reward_std": 0.5726887807250023,
	"rewards/cosine_scaled_reward": -0.05196441989392042,
	"rewards/format_reward": 0.6250000298023224,
	"step": 437
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 921.3125457763672,
	"epoch": 0.5005714285714286,
	"grad_norm": 8.654811309244227,
	"kl": 0.2535400390625,
	"learning_rate": 1.4150013466019114e-07,
	"loss": 0.1354,
	"reward": 0.20009983237832785,
	"reward_std": 0.6868909299373627,
	"rewards/cosine_scaled_reward": -0.20203341665910557,
	"rewards/format_reward": 0.6041666865348816,
	"step": 438
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1083.0833740234375,
	"epoch": 0.5017142857142857,
	"grad_norm": 5.3889889905872375,
	"kl": 0.40216064453125,
	"learning_rate": 1.4019235263722034e-07,
	"loss": 0.2461,
	"reward": 0.11843711510300636,
	"reward_std": 0.5985070914030075,
	"rewards/cosine_scaled_reward": -0.2636981066316366,
	"rewards/format_reward": 0.6458333432674408,
	"step": 439
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1088.8750305175781,
	"epoch": 0.5028571428571429,
	"grad_norm": 4.149099334589977,
	"kl": 0.30615234375,
	"learning_rate": 1.3890454406082956e-07,
	"loss": 0.033,
	"reward": 0.11394692957401276,
	"reward_std": 0.6579174622893333,
	"rewards/cosine_scaled_reward": -0.24510987009853125,
	"rewards/format_reward": 0.6041666865348816,
	"step": 440
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 984.3125610351562,
	"epoch": 0.504,
	"grad_norm": 16.782102815445953,
	"kl": 0.367919921875,
	"learning_rate": 1.3763677169699217e-07,
	"loss": 0.0977,
	"reward": 0.3985663428902626,
	"reward_std": 0.42315196245908737,
	"rewards/cosine_scaled_reward": -0.134050190448761,
	"rewards/format_reward": 0.6666666828095913,
	"step": 441
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 988.4583587646484,
	"epoch": 0.5051428571428571,
	"grad_norm": 4.106390753028162,
	"kl": 0.31378173828125,
	"learning_rate": 1.3638909733514452e-07,
	"loss": 0.0056,
	"reward": 0.13055693171918392,
	"reward_std": 0.48535653203725815,
	"rewards/cosine_scaled_reward": -0.27847154438495636,
	"rewards/format_reward": 0.6875000223517418,
	"step": 442
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1004.3750152587891,
	"epoch": 0.5062857142857143,
	"grad_norm": 8.08171445493757,
	"kl": 0.16259765625,
	"learning_rate": 1.351615817851748e-07,
	"loss": 0.2301,
	"reward": 0.30977149307727814,
	"reward_std": 0.6895428746938705,
	"rewards/cosine_scaled_reward": -0.18886426091194153,
	"rewards/format_reward": 0.6875000298023224,
	"step": 443
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1087.9375305175781,
	"epoch": 0.5074285714285715,
	"grad_norm": 3.849891000062917,
	"kl": 0.1669921875,
	"learning_rate": 1.3395428487445914e-07,
	"loss": 0.0975,
	"reward": 0.4580417312681675,
	"reward_std": 0.640699241310358,
	"rewards/cosine_scaled_reward": -0.14597914181649685,
	"rewards/format_reward": 0.7500000298023224,
	"step": 444
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1224.2500457763672,
	"epoch": 0.5085714285714286,
	"grad_norm": 23.59411548899569,
	"kl": 0.82958984375,
	"learning_rate": 1.3276726544494571e-07,
	"loss": 0.2131,
	"reward": 0.051861570216715336,
	"reward_std": 0.6278680041432381,
	"rewards/cosine_scaled_reward": -0.2553192190825939,
	"rewards/format_reward": 0.5625,
	"step": 445
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1067.9791870117188,
	"epoch": 0.5097142857142857,
	"grad_norm": 3.656936199785778,
	"kl": 0.24365234375,
	"learning_rate": 1.316005813502869e-07,
	"loss": 0.0234,
	"reward": 0.36385649256408215,
	"reward_std": 0.7834623008966446,
	"rewards/cosine_scaled_reward": -0.1722384188324213,
	"rewards/format_reward": 0.7083333432674408,
	"step": 446
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1174.7083740234375,
	"epoch": 0.5108571428571429,
	"grad_norm": 2.1759216078948036,
	"kl": 0.3828125,
	"learning_rate": 1.3045428945301953e-07,
	"loss": 0.2194,
	"reward": 0.23982627410441637,
	"reward_std": 0.5332969650626183,
	"rewards/cosine_scaled_reward": -0.21342020854353905,
	"rewards/format_reward": 0.6666666865348816,
	"step": 447
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1180.2083740234375,
	"epoch": 0.512,
	"grad_norm": 4.2366039265569135,
	"kl": 0.31494140625,
	"learning_rate": 1.2932844562179352e-07,
	"loss": 0.1963,
	"reward": 0.3762773834168911,
	"reward_std": 0.6801744475960732,
	"rewards/cosine_scaled_reward": -0.14519466273486614,
	"rewards/format_reward": 0.666666679084301,
	"step": 448
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 994.3958740234375,
	"epoch": 0.5131428571428571,
	"grad_norm": 4.146583173336839,
	"kl": 0.148681640625,
	"learning_rate": 1.2822310472864885e-07,
	"loss": 0.1922,
	"reward": 0.36078188568353653,
	"reward_std": 0.737194113433361,
	"rewards/cosine_scaled_reward": -0.15294241392984986,
	"rewards/format_reward": 0.6666667014360428,
	"step": 449
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 851.8958587646484,
	"epoch": 0.5142857142857142,
	"grad_norm": 51.8228987068238,
	"kl": 0.534942626953125,
	"learning_rate": 1.2713832064634125e-07,
	"loss": 0.1269,
	"reward": 0.5865043960511684,
	"reward_std": 0.4706997238099575,
	"rewards/cosine_scaled_reward": -0.10258114710450172,
	"rewards/format_reward": 0.7916666865348816,
	"step": 450
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1079.7916870117188,
	"epoch": 0.5154285714285715,
	"grad_norm": 6.392599999015184,
	"kl": 0.2735595703125,
	"learning_rate": 1.260741462457165e-07,
	"loss": 0.2626,
	"reward": 0.22280075028538704,
	"reward_std": 0.6088056340813637,
	"rewards/cosine_scaled_reward": -0.18026629835367203,
	"rewards/format_reward": 0.5833333395421505,
	"step": 451
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1046.8542022705078,
	"epoch": 0.5165714285714286,
	"grad_norm": 8.715599338320725,
	"kl": 0.152099609375,
	"learning_rate": 1.2503063339313356e-07,
	"loss": 0.2189,
	"reward": -0.0023173224180936813,
	"reward_std": 0.5100973732769489,
	"rewards/cosine_scaled_reward": -0.3136586770415306,
	"rewards/format_reward": 0.6250000298023224,
	"step": 452
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1271.9375305175781,
	"epoch": 0.5177142857142857,
	"grad_norm": 2.4908553038859917,
	"kl": 0.40869140625,
	"learning_rate": 1.2400783294793668e-07,
	"loss": 0.1805,
	"reward": 0.027155719697475433,
	"reward_std": 0.5863115191459656,
	"rewards/cosine_scaled_reward": -0.23642215505242348,
	"rewards/format_reward": 0.5000000149011612,
	"step": 453
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1104.6667175292969,
	"epoch": 0.5188571428571429,
	"grad_norm": 42.815024473876115,
	"kl": 2.04296875,
	"learning_rate": 1.2300579475997657e-07,
	"loss": 0.039,
	"reward": 0.2878073714673519,
	"reward_std": 0.6589629650115967,
	"rewards/cosine_scaled_reward": -0.13734631799161434,
	"rewards/format_reward": 0.5625000223517418,
	"step": 454
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1183.250015258789,
	"epoch": 0.52,
	"grad_norm": 2.6108705721314824,
	"kl": 0.306640625,
	"learning_rate": 1.220245676671809e-07,
	"loss": 0.1199,
	"reward": 0.3790533752180636,
	"reward_std": 0.4862861856818199,
	"rewards/cosine_scaled_reward": -0.13338997215032578,
	"rewards/format_reward": 0.645833358168602,
	"step": 455
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1168.4375305175781,
	"epoch": 0.5211428571428571,
	"grad_norm": 25.701940158931713,
	"kl": 0.477294921875,
	"learning_rate": 1.2106419949317388e-07,
	"loss": 0.1681,
	"reward": 0.2994745699688792,
	"reward_std": 0.7066301554441452,
	"rewards/cosine_scaled_reward": -0.15234605269506574,
	"rewards/format_reward": 0.6041666865348816,
	"step": 456
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1007.4167022705078,
	"epoch": 0.5222857142857142,
	"grad_norm": 8.570796851314613,
	"kl": 0.186279296875,
	"learning_rate": 1.2012473704494537e-07,
	"loss": 0.3132,
	"reward": 0.400404367595911,
	"reward_std": 0.5747000873088837,
	"rewards/cosine_scaled_reward": -0.15396450087428093,
	"rewards/format_reward": 0.708333358168602,
	"step": 457
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1272.8958435058594,
	"epoch": 0.5234285714285715,
	"grad_norm": 4.345232068890798,
	"kl": 0.48974609375,
	"learning_rate": 1.1920622611056974e-07,
	"loss": 0.2715,
	"reward": 0.2525772713124752,
	"reward_std": 0.8047986179590225,
	"rewards/cosine_scaled_reward": -0.12371136248111725,
	"rewards/format_reward": 0.5000000298023224,
	"step": 458
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1063.625015258789,
	"epoch": 0.5245714285714286,
	"grad_norm": 5.1821320020363615,
	"kl": 0.15771484375,
	"learning_rate": 1.1830871145697412e-07,
	"loss": 0.1275,
	"reward": 0.02093285135924816,
	"reward_std": 0.42146630585193634,
	"rewards/cosine_scaled_reward": -0.3124502506107092,
	"rewards/format_reward": 0.645833358168602,
	"step": 459
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1044.2708435058594,
	"epoch": 0.5257142857142857,
	"grad_norm": 3.3010395921201514,
	"kl": 0.267333984375,
	"learning_rate": 1.1743223682775649e-07,
	"loss": 0.1046,
	"reward": 0.2678923445455439,
	"reward_std": 0.896328404545784,
	"rewards/cosine_scaled_reward": -0.12647049874067307,
	"rewards/format_reward": 0.5208333656191826,
	"step": 460
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1053.4583587646484,
	"epoch": 0.5268571428571428,
	"grad_norm": 1.478075619341315,
	"kl": 0.21600341796875,
	"learning_rate": 1.1657684494105386e-07,
	"loss": 0.0555,
	"reward": 0.3391416594386101,
	"reward_std": 0.9088789522647858,
	"rewards/cosine_scaled_reward": -0.20542917400598526,
	"rewards/format_reward": 0.7500000223517418,
	"step": 461
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1103.2083740234375,
	"epoch": 0.528,
	"grad_norm": 9530.342121672113,
	"kl": 28.46978759765625,
	"learning_rate": 1.1574257748745986e-07,
	"loss": 1.3293,
	"reward": 0.14297988126054406,
	"reward_std": 0.5064843520522118,
	"rewards/cosine_scaled_reward": -0.25142673472873867,
	"rewards/format_reward": 0.645833358168602,
	"step": 462
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1043.3333587646484,
	"epoch": 0.5291428571428571,
	"grad_norm": 4.739243744092973,
	"kl": 0.39892578125,
	"learning_rate": 1.1492947512799328e-07,
	"loss": 0.2493,
	"reward": 0.6755956448614597,
	"reward_std": 0.4871959462761879,
	"rewards/cosine_scaled_reward": 0.025297801941633224,
	"rewards/format_reward": 0.6250000298023224,
	"step": 463
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 992.2916717529297,
	"epoch": 0.5302857142857142,
	"grad_norm": 122.21833055898026,
	"kl": 1.33843994140625,
	"learning_rate": 1.1413757749211602e-07,
	"loss": 0.2572,
	"reward": 0.29958341596648097,
	"reward_std": 0.8296171501278877,
	"rewards/cosine_scaled_reward": -0.20437496528029442,
	"rewards/format_reward": 0.7083333432674408,
	"step": 464
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1240.5417175292969,
	"epoch": 0.5314285714285715,
	"grad_norm": 6.845067293294205,
	"kl": 0.55908203125,
	"learning_rate": 1.1336692317580158e-07,
	"loss": 0.1986,
	"reward": 0.072305912617594,
	"reward_std": 0.4831778481602669,
	"rewards/cosine_scaled_reward": -0.2138470560312271,
	"rewards/format_reward": 0.5000000186264515,
	"step": 465
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1192.9167175292969,
	"epoch": 0.5325714285714286,
	"grad_norm": 9.93163371597492,
	"kl": 0.49163818359375,
	"learning_rate": 1.1261754973965422e-07,
	"loss": 0.0928,
	"reward": 0.04001780319958925,
	"reward_std": 0.44342009350657463,
	"rewards/cosine_scaled_reward": -0.28207441698759794,
	"rewards/format_reward": 0.6041666865348816,
	"step": 466
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1341.1667175292969,
	"epoch": 0.5337142857142857,
	"grad_norm": 19.835786495839272,
	"kl": 0.8251953125,
	"learning_rate": 1.1188949370707787e-07,
	"loss": 0.2635,
	"reward": 0.1306269969791174,
	"reward_std": 0.6591696962714195,
	"rewards/cosine_scaled_reward": -0.21593650616705418,
	"rewards/format_reward": 0.5625000298023224,
	"step": 467
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1138.7708740234375,
	"epoch": 0.5348571428571428,
	"grad_norm": 13.935934940776233,
	"kl": 0.576904296875,
	"learning_rate": 1.1118279056249653e-07,
	"loss": 0.0976,
	"reward": 0.37931894324719906,
	"reward_std": 0.5462356135249138,
	"rewards/cosine_scaled_reward": -0.10200719349086285,
	"rewards/format_reward": 0.5833333432674408,
	"step": 468
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1028.6041870117188,
	"epoch": 0.536,
	"grad_norm": 2.6967193006473216,
	"kl": 0.26171875,
	"learning_rate": 1.1049747474962444e-07,
	"loss": 0.0529,
	"reward": 0.40807172656059265,
	"reward_std": 0.6494475156068802,
	"rewards/cosine_scaled_reward": -0.13971414044499397,
	"rewards/format_reward": 0.6875000149011612,
	"step": 469
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 848.5833587646484,
	"epoch": 0.5371428571428571,
	"grad_norm": 1.7855628531087904,
	"kl": 0.1328125,
	"learning_rate": 1.0983357966978745e-07,
	"loss": 0.0217,
	"reward": 0.6918718162924051,
	"reward_std": 0.5211210399866104,
	"rewards/cosine_scaled_reward": -0.0811474658548832,
	"rewards/format_reward": 0.8541666865348816,
	"step": 470
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1070.7500457763672,
	"epoch": 0.5382857142857143,
	"grad_norm": 6.038242137859596,
	"kl": 0.169921875,
	"learning_rate": 1.0919113768029517e-07,
	"loss": 0.2149,
	"reward": 0.06172482669353485,
	"reward_std": 0.5211478099226952,
	"rewards/cosine_scaled_reward": -0.29205426201224327,
	"rewards/format_reward": 0.6458333432674408,
	"step": 471
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 842.7083435058594,
	"epoch": 0.5394285714285715,
	"grad_norm": 12.109988243714355,
	"kl": 0.2894287109375,
	"learning_rate": 1.0857018009286381e-07,
	"loss": -0.0496,
	"reward": 0.46792223304510117,
	"reward_std": 0.54752978682518,
	"rewards/cosine_scaled_reward": -0.17228887975215912,
	"rewards/format_reward": 0.8125000149011612,
	"step": 472
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1181.1875305175781,
	"epoch": 0.5405714285714286,
	"grad_norm": 6.409193674543087,
	"kl": 0.3604736328125,
	"learning_rate": 1.0797073717209013e-07,
	"loss": 0.0613,
	"reward": -0.049041745252907276,
	"reward_std": 0.5112807080149651,
	"rewards/cosine_scaled_reward": -0.2641042061150074,
	"rewards/format_reward": 0.4791666865348816,
	"step": 473
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 990.3750305175781,
	"epoch": 0.5417142857142857,
	"grad_norm": 9.264374683069418,
	"kl": 0.11328125,
	"learning_rate": 1.0739283813397639e-07,
	"loss": 0.1628,
	"reward": 0.32282854616642,
	"reward_std": 0.7814144194126129,
	"rewards/cosine_scaled_reward": -0.20316907577216625,
	"rewards/format_reward": 0.7291667014360428,
	"step": 474
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 962.4792022705078,
	"epoch": 0.5428571428571428,
	"grad_norm": 4.242696196218054,
	"kl": 0.12603759765625,
	"learning_rate": 1.068365111445064e-07,
	"loss": 0.1483,
	"reward": 0.08424473810009658,
	"reward_std": 0.48827143758535385,
	"rewards/cosine_scaled_reward": -0.28079431876540184,
	"rewards/format_reward": 0.6458333544433117,
	"step": 475
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1048.8958587646484,
	"epoch": 0.544,
	"grad_norm": 18.108937894089827,
	"kl": 0.4520263671875,
	"learning_rate": 1.063017833182728e-07,
	"loss": 0.1426,
	"reward": 0.3813807927072048,
	"reward_std": 0.6394810080528259,
	"rewards/cosine_scaled_reward": -0.05930961295962334,
	"rewards/format_reward": 0.5000000223517418,
	"step": 476
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 723.4791870117188,
	"epoch": 0.5451428571428572,
	"grad_norm": 5.68071346914076,
	"kl": 0.156005859375,
	"learning_rate": 1.0578868071715544e-07,
	"loss": 0.0836,
	"reward": 0.7284884303808212,
	"reward_std": 0.6032212525606155,
	"rewards/cosine_scaled_reward": -0.08367248624563217,
	"rewards/format_reward": 0.8958333432674408,
	"step": 477
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1013.3750305175781,
	"epoch": 0.5462857142857143,
	"grad_norm": 4.283725447191352,
	"kl": 0.0955657958984375,
	"learning_rate": 1.0529722834905125e-07,
	"loss": 0.1397,
	"reward": 0.49316432885825634,
	"reward_std": 0.45135799795389175,
	"rewards/cosine_scaled_reward": -0.12841782718896866,
	"rewards/format_reward": 0.7500000149011612,
	"step": 478
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 998.3125,
	"epoch": 0.5474285714285714,
	"grad_norm": 1.2914537281090146,
	"kl": 0.15277099609375,
	"learning_rate": 1.0482745016665526e-07,
	"loss": 0.0674,
	"reward": 0.31127920374274254,
	"reward_std": 0.6323697119951248,
	"rewards/cosine_scaled_reward": -0.16727706603705883,
	"rewards/format_reward": 0.6458333432674408,
	"step": 479
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 984.8750152587891,
	"epoch": 0.5485714285714286,
	"grad_norm": 1.5465556570908303,
	"kl": 0.077606201171875,
	"learning_rate": 1.0437936906629334e-07,
	"loss": 0.0463,
	"reward": 0.5994082670658827,
	"reward_std": 0.37920307368040085,
	"rewards/cosine_scaled_reward": -0.1169625474140048,
	"rewards/format_reward": 0.8333333432674408,
	"step": 480
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1145.3541870117188,
	"epoch": 0.5497142857142857,
	"grad_norm": 2.3612933288431535,
	"kl": 0.150390625,
	"learning_rate": 1.0395300688680625e-07,
	"loss": 0.0852,
	"reward": 0.18339010886847973,
	"reward_std": 0.6312093585729599,
	"rewards/cosine_scaled_reward": -0.1999716181308031,
	"rewards/format_reward": 0.5833333432674408,
	"step": 481
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1099.2291870117188,
	"epoch": 0.5508571428571428,
	"grad_norm": 3.3478046762143303,
	"kl": 0.1209716796875,
	"learning_rate": 1.0354838440848501e-07,
	"loss": 0.1351,
	"reward": 0.4303822033107281,
	"reward_std": 0.5441673323512077,
	"rewards/cosine_scaled_reward": -0.10772557370364666,
	"rewards/format_reward": 0.6458333488553762,
	"step": 482
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 972.8958587646484,
	"epoch": 0.552,
	"grad_norm": 2.070698520552659,
	"kl": 0.1766357421875,
	"learning_rate": 1.0316552135205837e-07,
	"loss": 0.1898,
	"reward": 0.3021550700068474,
	"reward_std": 0.6595650911331177,
	"rewards/cosine_scaled_reward": -0.244755819439888,
	"rewards/format_reward": 0.7916666865348816,
	"step": 483
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 966.4375,
	"epoch": 0.5531428571428572,
	"grad_norm": 7.4718244138049785,
	"kl": 0.11669921875,
	"learning_rate": 1.0280443637773163e-07,
	"loss": 0.1535,
	"reward": 0.5982861579395831,
	"reward_std": 0.72054024040699,
	"rewards/cosine_scaled_reward": -0.034190285950899124,
	"rewards/format_reward": 0.6666666716337204,
	"step": 484
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 850.2292022705078,
	"epoch": 0.5542857142857143,
	"grad_norm": 2.6747580946696172,
	"kl": 0.171142578125,
	"learning_rate": 1.0246514708427701e-07,
	"loss": 0.0845,
	"reward": 0.440962532768026,
	"reward_std": 0.4621984176337719,
	"rewards/cosine_scaled_reward": -0.1545187532901764,
	"rewards/format_reward": 0.7500000298023224,
	"step": 485
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1098.4791870117188,
	"epoch": 0.5554285714285714,
	"grad_norm": 5.009044167350138,
	"kl": 0.1492919921875,
	"learning_rate": 1.0214767000817596e-07,
	"loss": 0.1657,
	"reward": 0.31744778295978904,
	"reward_std": 0.8680954575538635,
	"rewards/cosine_scaled_reward": -0.15377611527219415,
	"rewards/format_reward": 0.6250000149011612,
	"step": 486
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1250.9167022705078,
	"epoch": 0.5565714285714286,
	"grad_norm": 2.168704280565103,
	"kl": 0.3330078125,
	"learning_rate": 1.0185202062281336e-07,
	"loss": 0.1075,
	"reward": 0.09373046457767487,
	"reward_std": 0.7844668254256248,
	"rewards/cosine_scaled_reward": -0.18230143561959267,
	"rewards/format_reward": 0.4583333432674408,
	"step": 487
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1182.0417175292969,
	"epoch": 0.5577142857142857,
	"grad_norm": 68.49302633471272,
	"kl": 1.04931640625,
	"learning_rate": 1.0157821333772304e-07,
	"loss": 0.2607,
	"reward": 0.0013678865507245064,
	"reward_std": 0.5483251512050629,
	"rewards/cosine_scaled_reward": -0.2805660478770733,
	"rewards/format_reward": 0.5625000149011612,
	"step": 488
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 898.8958740234375,
	"epoch": 0.5588571428571428,
	"grad_norm": 3.3267908174810983,
	"kl": 0.22412109375,
	"learning_rate": 1.013262614978859e-07,
	"loss": 0.1155,
	"reward": 0.9380166502669454,
	"reward_std": 0.38279012218117714,
	"rewards/cosine_scaled_reward": 0.10442498326301575,
	"rewards/format_reward": 0.7291666716337204,
	"step": 489
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1022.5417175292969,
	"epoch": 0.56,
	"grad_norm": 1.2964210055945644,
	"kl": 0.142425537109375,
	"learning_rate": 1.0109617738307911e-07,
	"loss": 0.1375,
	"reward": 0.1352614858187735,
	"reward_std": 0.5779989808797836,
	"rewards/cosine_scaled_reward": -0.29695259779691696,
	"rewards/format_reward": 0.7291666716337204,
	"step": 490
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1071.2708740234375,
	"epoch": 0.5611428571428572,
	"grad_norm": 5.198263303721915,
	"kl": 0.270751953125,
	"learning_rate": 1.0088797220727779e-07,
	"loss": 0.1398,
	"reward": 0.360213914886117,
	"reward_std": 0.5864584296941757,
	"rewards/cosine_scaled_reward": -0.12197639048099518,
	"rewards/format_reward": 0.6041666716337204,
	"step": 491
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1146.0208740234375,
	"epoch": 0.5622857142857143,
	"grad_norm": 25.32884427185481,
	"kl": 0.860107421875,
	"learning_rate": 1.0070165611810855e-07,
	"loss": 0.279,
	"reward": 0.3603329248726368,
	"reward_std": 0.4203804060816765,
	"rewards/cosine_scaled_reward": -0.11150021478533745,
	"rewards/format_reward": 0.583333358168602,
	"step": 492
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1079.6250457763672,
	"epoch": 0.5634285714285714,
	"grad_norm": 5.39013483275012,
	"kl": 0.4027099609375,
	"learning_rate": 1.005372381963547e-07,
	"loss": 0.2018,
	"reward": 0.24866360798478127,
	"reward_std": 0.6557547599077225,
	"rewards/cosine_scaled_reward": -0.21941821463406086,
	"rewards/format_reward": 0.6875000298023224,
	"step": 493
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1020.0417022705078,
	"epoch": 0.5645714285714286,
	"grad_norm": 39.118419014119006,
	"kl": 1.0677490234375,
	"learning_rate": 1.0039472645551372e-07,
	"loss": 0.2737,
	"reward": 0.027191074565052986,
	"reward_std": 0.4351058676838875,
	"rewards/cosine_scaled_reward": -0.3301544785499573,
	"rewards/format_reward": 0.6875000149011612,
	"step": 494
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1194.8958740234375,
	"epoch": 0.5657142857142857,
	"grad_norm": 5.938670898030579,
	"kl": 0.630859375,
	"learning_rate": 1.002741278414069e-07,
	"loss": 0.2055,
	"reward": 0.374758190009743,
	"reward_std": 0.6815578863024712,
	"rewards/cosine_scaled_reward": -0.09387091733515263,
	"rewards/format_reward": 0.5625000298023224,
	"step": 495
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1120.3542022705078,
	"epoch": 0.5668571428571428,
	"grad_norm": 16.235625518016562,
	"kl": 0.50927734375,
	"learning_rate": 1.0017544823184055e-07,
	"loss": 0.297,
	"reward": 0.40772235160693526,
	"reward_std": 0.8966069668531418,
	"rewards/cosine_scaled_reward": -0.09822217002511024,
	"rewards/format_reward": 0.6041666865348816,
	"step": 496
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 827.4791870117188,
	"epoch": 0.568,
	"grad_norm": 6.892460021170429,
	"kl": 6.8536376953125,
	"learning_rate": 1.0009869243631952e-07,
	"loss": 0.2026,
	"reward": 0.8302161321043968,
	"reward_std": 0.560060553252697,
	"rewards/cosine_scaled_reward": 0.06094140186905861,
	"rewards/format_reward": 0.708333358168602,
	"step": 497
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1077.1250305175781,
	"epoch": 0.5691428571428572,
	"grad_norm": 10.255398655040155,
	"kl": 0.54931640625,
	"learning_rate": 1.000438641958131e-07,
	"loss": 0.2299,
	"reward": 0.033572545275092125,
	"reward_std": 0.4632219597697258,
	"rewards/cosine_scaled_reward": -0.30613040924072266,
	"rewards/format_reward": 0.645833358168602,
	"step": 498
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1336.7083435058594,
	"epoch": 0.5702857142857143,
	"grad_norm": 27.33692044026225,
	"kl": 0.934326171875,
	"learning_rate": 1.0001096618257236e-07,
	"loss": 0.1642,
	"reward": -0.13300850987434387,
	"reward_std": 0.6832303777337074,
	"rewards/cosine_scaled_reward": -0.28525424748659134,
	"rewards/format_reward": 0.4375000149011612,
	"step": 499
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1017.9375305175781,
	"epoch": 0.5714285714285714,
	"grad_norm": 2.4162040111238334,
	"kl": 0.2301025390625,
	"learning_rate": 1e-07,
	"loss": 0.1131,
	"reward": 0.13043908029794693,
	"reward_std": 0.5788910314440727,
	"rewards/cosine_scaled_reward": -0.29936380684375763,
	"rewards/format_reward": 0.729166679084301,
	"step": 500
	},
	{
	"epoch": 0.5714285714285714,
	"step": 500,
	"total_flos": 0.0,
	"train_loss": 0.7532739232839085,
	"train_runtime": 13678.504,
	"train_samples_per_second": 1.755,
	"train_steps_per_second": 0.037
	}
	],
	"logging_steps": 1,
	"max_steps": 500,
	"num_input_tokens_seen": 0,
	"num_train_epochs": 1,
	"save_steps": 50,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": true
	},
	"attributes": {}
	}
	},
	"total_flos": 0.0,
	"train_batch_size": 4,
	"trial_name": null,
	"trial_params": null
	}